Module:headword: Difference between revisions

(36 intermediate revisions by 2 users not shown)

Line 2:

-- Named constants for all modules used, to make it easier to swap out sandbox versions.

local en_utilities_module = "Module:en-utilities"

local gender_and_number_module = "Module:getn"

local headword_data_module = "Module:headword/data"

local headword_page_module = "Module:headword/page"

local links_module = "Module:links"

local load_module = "Module:load"

local pages_module = "Module:pages"

local palindromes_module = "Module:palindromes"

local ~~qualifier_module~~ = "Module:qualifier"

local pron_qualifier_module = "Module:pron qualifier"

local scripts_module = "Module:scripts"

local scripts_data_module = "Module:scripts/data"

Line 15:

Line 18:

local table_module = "Module:table"

local utilities_module = "Module:utilities"

~~local m_str_utils = require(string_utilities_module)~~

local concat = table.concat

local ~~encode_entities~~ = ~~m_str_utils~~.~~encode_entities~~

local dump = mw.dumpObject

local insert = table.insert

local ipairs = ipairs

local max = math.max

local new_title = mw.title.new

local pairs = pairs

local ~~pattern_escape~~ = ~~m_str_utils.pattern_escape~~

local require = require

~~local rgmatch = mw.ustring.gmatch~~

~~local rsubn = mw.ustring.gsub~~

~~local rfind = mw.ustring.find~~

~~local ulen = m_str_utils.len~~

~~local rmatch = mw.ustring.match~~

local toNFC = mw.ustring.toNFC

local toNFD = mw.ustring.toNFD

local type = type

local ufind = mw.ustring.find

local ugmatch = mw.ustring.gmatch

local ugsub = mw.ustring.gsub

local umatch = mw.ustring.match

--[==[

Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]

-- Lazy loader: the first call fetches `encode_entities` from the module named by
-- `string_utilities_module`, rebinds this local to it, and forwards the arguments.
local function encode_entities(...)
	local target = require(string_utilities_module).encode_entities
	encode_entities = target
	return target(...)
end

-- Lazy loader: the first call fetches `extend` from `table_module`, rebinds this
-- local to it, and forwards the arguments.
local function extend(...)
	local target = require(table_module).extend
	extend = target
	return target(...)
end

-- Lazy loader: the first call fetches `findBestScriptWithoutLang` from `scripts_module`,
-- rebinds this local to it, and forwards the arguments.
local function find_best_script_without_lang(...)
	local target = require(scripts_module).findBestScriptWithoutLang
	find_best_script_without_lang = target
	return target(...)
end

-- Lazy loader: the first call fetches `format_categories` from `utilities_module`,
-- rebinds this local to it, and forwards the arguments.
local function format_categories(...)
	local target = require(utilities_module).format_categories
	format_categories = target
	return target(...)
end

-- Lazy loader: the first call fetches `format_genders` from `gender_and_number_module`,
-- rebinds this local to it, and forwards the arguments.
local function format_genders(...)
	local target = require(gender_and_number_module).format_genders
	format_genders = target
	return target(...)
end

-- Lazy loader: the first call fetches `format_qualifiers` from `pron_qualifier_module`,
-- rebinds this local to it, and forwards the arguments.
local function format_pron_qualifiers(...)
	local target = require(pron_qualifier_module).format_qualifiers
	format_pron_qualifiers = target
	return target(...)
end

-- Lazy loader: the first call fetches `full_link` from `links_module`, rebinds this
-- local to it, and forwards the arguments.
local function full_link(...)
	local target = require(links_module).full_link
	full_link = target
	return target(...)
end

-- Lazy loader: the first call fetches `get_current_L2` from `pages_module`, rebinds
-- this local to it, and forwards the arguments.
local function get_current_L2(...)
	local target = require(pages_module).get_current_L2
	get_current_L2 = target
	return target(...)
end

-- Lazy loader: the first call fetches `get_link_page` from `links_module`, rebinds
-- this local to it, and forwards the arguments.
local function get_link_page(...)
	local target = require(links_module).get_link_page
	get_link_page = target
	return target(...)
end

-- Lazy loader: the first call fetches `getByCode` from `scripts_module`, rebinds this
-- local to it, and forwards the arguments.
local function get_script(...)
	local target = require(scripts_module).getByCode
	get_script = target
	return target(...)
end

-- Lazy loader: the first call fetches `is_palindrome` from `palindromes_module`,
-- rebinds this local to it, and forwards the arguments.
local function is_palindrome(...)
	local target = require(palindromes_module).is_palindrome
	is_palindrome = target
	return target(...)
end

-- Lazy loader: the first call fetches `language_link` from `links_module`, rebinds
-- this local to it, and forwards the arguments.
local function language_link(...)
	local target = require(links_module).language_link
	language_link = target
	return target(...)
end

-- Lazy loader: the first call fetches `load_data` from `load_module`, rebinds this
-- local to it, and forwards the arguments.
local function load_data(...)
	local target = require(load_module).load_data
	load_data = target
	return target(...)
end

-- Lazy loader: the first call fetches `pattern_escape` from the module named by
-- `string_utilities_module`, rebinds this local to it, and forwards the arguments.
local function pattern_escape(...)
	local target = require(string_utilities_module).pattern_escape
	pattern_escape = target
	return target(...)
end

-- Lazy loader: the first call fetches `pluralize` from `en_utilities_module`, rebinds
-- this local to it, and forwards the arguments.
local function pluralize(...)
	local target = require(en_utilities_module).pluralize
	pluralize = target
	return target(...)
end

-- Lazy loader: the first call fetches `process_page` from `headword_page_module`,
-- rebinds this local to it, and forwards the arguments.
local function process_page(...)
	local target = require(headword_page_module).process_page
	process_page = target
	return target(...)
end

-- Lazy loader: the first call fetches `remove_links` from `links_module`, rebinds
-- this local to it, and forwards the arguments.
local function remove_links(...)
	local target = require(links_module).remove_links
	remove_links = target
	return target(...)
end

-- Lazy loader: the first call fetches `shallowCopy` from `table_module`, rebinds this
-- local to it, and forwards the arguments.
local function shallow_copy(...)
	local target = require(table_module).shallowCopy
	shallow_copy = target
	return target(...)
end

-- Lazy loader: the first call fetches `tag_text` from the module named by
-- `script_utilities_module`, rebinds this local to it, and forwards the arguments.
local function tag_text(...)
	local target = require(script_utilities_module).tag_text
	tag_text = target
	return target(...)
end

-- Lazy loader: the first call fetches `tag_transcription` from the module named by
-- `script_utilities_module`, rebinds this local to it, and forwards the arguments.
local function tag_transcription(...)
	local target = require(script_utilities_module).tag_transcription
	tag_transcription = target
	return target(...)
end

-- Lazy loader: the first call fetches `tag_translit` from the module named by
-- `script_utilities_module`, rebinds this local to it, and forwards the arguments.
local function tag_translit(...)
	local target = require(script_utilities_module).tag_translit
	tag_translit = target
	return target(...)
end

-- Lazy loader: the first call fetches `trim` from the module named by
-- `string_utilities_module`, rebinds this local to it, and forwards the arguments.
local function trim(...)
	local target = require(string_utilities_module).trim
	trim = target
	return target(...)
end

-- Lazy loader: the first call fetches `len` from the module named by
-- `string_utilities_module`, rebinds this local to it, and forwards the arguments.
local function ulen(...)
	local target = require(string_utilities_module).len
	ulen = target
	return target(...)
end

--[==[

Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]

local m_data

-- Loads the data from `headword_data_module` via load_data(), stores it in the
-- file-level `m_data` variable, and returns it. Intended to be used as
-- `m_data or get_data()`, so that the load happens only while `m_data` is unset.
local function get_data()
	local loaded = load_data(headword_data_module)
	m_data = loaded
	return loaded
end

local ~~m_data~~ = ~~mw.loadData~~(~~headword_data_module~~)

local script_data

-- Loads the data from `scripts_data_module` via load_data(), stores it in the
-- file-level `script_data` variable, and returns it. Intended to be used as
-- `script_data or get_script_data()`.
local function get_script_data()
	local loaded = load_data(scripts_data_module)
	script_data = loaded
	return loaded
end

local ~~isLemma = m_data.lemmas~~

local script_utilities_data

local ~~isNonLemma~~ = ~~m_data.nonlemmas~~

-- Loads the data from the module named by `script_utilities_data_module` via
-- load_data(), stores it in the file-level `script_utilities_data` variable, and
-- returns it. Intended to be used as
-- `script_utilities_data or get_script_utilities_data()`, so the load happens only
-- while the variable is still unset.
--
-- The old-revision statements `local notranslit = m_data.notranslit` and
-- `local toBeTagged = m_data.toBeTagged` that were interleaved into this body have
-- been dropped: they were struck out in the diff and are not part of this function.
local function get_script_utilities_data()
	script_utilities_data = load_data(script_utilities_data_module)
	return script_utilities_data
end

-- If set to true, categories always appear, even in non-mainspace pages

local test_force_categories = false

~~-- Version of rsubn() that discards all but the first return value.~~

~~local function rsub(term, foo, bar)~~

~~return (rsubn(term, foo, bar))~~

~~end~~

local function text_in_script(text, script_code)

local sc = ~~require(scripts_module).getByCode~~(script_code)

local sc = get_script(script_code)

if not sc then

error("Internal error: Bad script code " .. script_code)

Line 56:

Line 184:

local out

if characters then

text = ~~rsub~~(text, "%W", "")

text = ugsub(text, "%W", "")

out = ~~rfind~~(text, "[" .. characters .. "]")

out = ufind(text, "[" .. characters .. "]")

end

Line 71:

Line 199:

--[[ List of punctuation or spacing characters that are found inside of words.

Used to exclude characters from the regex above. ]]

local wordPunc = "-־׳״'.·*’་•:᠊"

local wordPunc = "-#%%&@־׳״'.·*’་•:᠊"

local notWordPunc = "[^" .. wordPunc .. "]+"

-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references or

-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, labels, references

-- customized separator: `part` is the object specifying the term, which should optionally contain:

-- or customized separator: `part` is the object specifying the term (and `lang` the language of the term), which should

-- * left qualifiers in `q`, an array of strings ~~(or `qualifiers` for compatibility purposes)~~;

-- optionally contain:

-- * left qualifiers in `q`, an array of strings;

-- * right qualifiers in `qq`, an array of strings;

-- * left labels in `l`, an array of strings;

-- * right labels in `ll`, an array of strings;

-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`

-- (formatted reference text) and optionally `name` and/or `group`;

-- * a separator in `separator`, defaulting to " or " if this is not the first term (j > 1), otherwise "".

-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.

~~local function format_term_with_qualifiers_and_refs(part, formatted, j)~~

~~local left_qualifiers, right_qualifiers~~

~~local reftext~~

~~left_qualifiers~~ = part~~.q and #part.q > 0 and part.q~~

local function format_term_with_qualifiers_and_refs(lang, part, formatted, j)

if ~~left_qualifiers~~ then

local function part_non_empty(field)

~~left_qualifiers~~ = ~~require~~(~~qualifier_module).format_qualifier~~(~~left_qualifiers) .~~. " "

local list = part[field]

if not list then

return nil

end

if type(list) ~= "table" then

error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))

end

return list[1]

end

~~right_qualifiers = part.qq and #part.qq > 0 and part.qq~~

if part_non_empty("q") or part_non_empty("qq") or part_non_empty("l") or

if ~~right_qualifiers then~~

part_non_empty("ll") or part_non_empty("refs") then

~~right_qualifiers =~~ " " ~~.. require~~(~~qualifier_module~~)~~.format_qualifier~~(~~right_qualifiers~~)

formatted = format_pron_qualifiers {

~~end~~

lang = lang,

~~if part.refs and #part.refs > 0 then~~

text = formatted,

~~local refs = {}~~

q = part.q,

~~for _, ref in ipairs~~(~~part.refs~~) do

qq = part.qq,

~~if type~~(~~ref) ~=~~ "~~table~~" then

l = part.l,

~~ref~~ = {~~text = ref}~~

ll = part.ll,

~~end~~

refs = part.refs,

~~local refargs~~

}

~~if ref~~.~~name or ref.group then~~

~~refargs~~ = ~~{name = ref~~.~~name~~, ~~group~~ = ~~ref~~.~~group}~~

~~end~~

~~insert(~~refs~~, mw~~.~~getCurrentFrame():extensionTag("ref", ref.text~~, ~~refargs))~~

~~end~~

~~reftext = concat(refs)~~

end

local separator = part.separator or j > 1 and " or " -- use "" to request no separator

~~if left_qualifiers then~~

~~formatted = left_qualifiers .. formatted~~

~~end~~

~~if reftext then~~

~~formatted = formatted .. reftext~~

~~end~~

~~if right_qualifiers then~~

~~formatted = formatted .. right_qualifiers~~

~~end~~

if separator then

formatted = separator .. formatted

Line 132:

Line 252:

--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]

function export.head_is_multiword(head)

for possibleWordBreak in ~~rgmatch~~(head, spacingPunctuation) do

for possibleWordBreak in ugmatch(head, spacingPunctuation) do

if ~~rmatch~~(possibleWordBreak, notWordPunc) then

if umatch(possibleWordBreak, notWordPunc) then

return true

end

Line 141:

Line 261:

end

do
	-- Callback for the ugsub() call over `spacingPunctuation` matches below: wraps
	-- every run matching `notWordPunc` in "\2…\1", i.e. it closes the current link
	-- marker before the run and reopens one after it. The outer parentheses discard
	-- ugsub's second return value (the substitution count).
	local function workaround_to_exclude_chars(s)
		return (ugsub(s, notWordPunc, "\2%1\1"))
	end

	--[==[Add links to a multiword head.

	`head` is the headword text. "\1" and "\2" are used as temporary stand-ins for "[[" and "]]" while the links
	are built. If `default` is truthy, any ":" or "#" found inside a marked span is backslash-escaped (a backslash
	already preceding one is doubled first). Returns the head with the markers converted to real link brackets and
	empty links removed.]==]
	function export.add_multiword_links(head, default)
		head = "\1" .. ugsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
		if default then
			head = head
				:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
				:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
		end

		--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
		head = encode_entities(head, "[]", true, true)

		--[=[
		use this when workaround is no longer needed:
		head = "[[" .. ugsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"

		Remove any empty links, which could have been created above
		at the beginning or end of the string.
		]=]
		-- The outer parentheses keep only the resulting string, not the gsub count.
		return (head
			:gsub("\1\2", "")
			:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
	end
end

-- Returns a truthy value if the page with the given full raw pagename should not be
-- categorized: either it is under "Appendix:Gestures/", or it is an
-- "Unsupported titles/" page whose name contains no backtick (a descriptive name).
-- Returns a falsy value otherwise.
--
-- The stale single-condition `return` from the previous revision has been removed:
-- a `return` followed by further statements is a syntax error in Lua, and it would
-- have made the "Unsupported titles/" check unreachable.
local function non_categorizable(full_raw_pagename)
	return full_raw_pagename:find("^Appendix:Gestures/") or
		-- Unsupported titles with descriptive names.
		(full_raw_pagename:find("^Unsupported titles/") and not full_raw_pagename:find("`"))
end

-- Wraps an already-formatted head in its language/script tag and then attaches the
-- head's qualifiers, labels, references and separator.
-- `data` supplies `lang` and `id`; `head` supplies `sc` and is passed on as the part
-- object; `formatted` is the display text; `j` is the index of the head. `data.id` is
-- only passed to tag_text() for the first head (j == 1).
local function tag_text_and_add_quals_and_refs(data, head, formatted, j)
	-- Only the first head carries the id.
	local id = nil
	if j == 1 and data.id then
		id = data.id
	end
	-- Add language and script wrapper.
	local tagged = tag_text(formatted, data.lang, head.sc, "head", nil, id)
	-- Add qualifiers, labels, references and separator.
	return format_term_with_qualifiers_and_refs(data.lang, head, tagged, j)
end

-- Format a headword with transliterations.

local function format_headword(data)

~~local m_scriptutils = require(script_utilities_module)~~

-- Are there non-empty transliterations?

local has_translits = false

Line 190:

Line 317:

local unique_head_parts = {}

local has_multiple_heads = #data.heads ~~> 1~~

local has_multiple_heads = not not data.heads[2]

for j, head in ipairs(data.heads) do

Line 204:

Line 331:

-- Apply processing to the headword, for formatting links and such.

if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then

formatted = ~~require(links_module).~~language_link{term = head.term, lang = data.lang}

formatted = language_link{term = head.term, lang = data.lang}

else

formatted = data.lang:makeDisplayText(head.term, head.sc, true)

end

local ~~function~~ tag_text_and_add_quals_and_refs(~~head, formatted, j)~~

local head_part = tag_text_and_add_quals_and_refs(data, head, formatted, j)

~~-- Add language and script wrapper.~~

~~formatted = m_scriptutils.tag_text(formatted,~~ data~~.lang~~, ~~head.sc, "head", nil, j == 1 and data.id or nil)~~

~~-- Add qualifiers, references and separator.~~

~~return format_term_with_qualifiers_and_refs(head, formatted, j)~~

~~end~~

~~local head_part = tag_text_and_add_quals_and_refs(~~head, formatted, j)

insert(head_parts, head_part)

Line 227:

Line 346:

unique_head_part = head_part

else

unique_head_part = tag_text_and_add_quals_and_refs(head, formatted, 1)

unique_head_part = tag_text_and_add_quals_and_refs(data, head, formatted, 1)

end

unique_head_parts[unique_head_part] = true

Line 255:

Line 374:

local this_parts = {}

if head.tr then

insert(this_parts, ~~m_scriptutils.~~tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))

insert(this_parts, tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))

if head.ts then

insert(this_parts, " ")

Line 261:

Line 380:

end

if head.ts then

insert(this_parts, "/" .~~. m_scriptutils~~.tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")

insert(this_parts, "/" .. tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")

end

insert(translit_parts, concat(this_parts))

Line 270:

Line 389:

local langname = data.lang:getCanonicalName()

local transliteration_page = ~~mw.title.new~~(langname .. " transliteration~~", "Wiktionary~~")

local transliteration_page = new_title(langname .. " transliteration")

local saw_translit_page = false

if transliteration_page and transliteration_page.exists then

translits_formatted = " [[~~Wiktionary:~~" .. langname .. " transliteration|•]]" .. translits_formatted

translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted

saw_translit_page = true

end

Line 281:

Line 400:

if not saw_translit_page and data.lang:hasType("etymology-only") then

langname = data.lang:getFullName()

transliteration_page = ~~mw.title.new~~(langname .. " transliteration~~", "Wiktionary~~")

transliteration_page = new_title(langname .. " transliteration")

if transliteration_page and transliteration_page.exists then

translits_formatted = " [[~~Wiktionary:~~" .. langname .. " transliteration|•]]" .. translits_formatted

translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted

end

Line 304:

Line 423:

local function ~~format_genders~~(data)

local function format_headword_genders(data)

local retval = ""

if data.genders and #data.genders ~~> 0~~ then

if data.genders and data.genders[1] then

if data.gloss then

retval = ","

end

local pos_for_cat

if not data.nogendercat ~~and not~~ m_data.no_gender_cat[data.lang:getCode()] ~~and~~

if not data.nogendercat then

~~not m_data.~~no_gender_cat[data.lang:getFullCode()] then

local no_gender_cat = (m_data or get_data()).no_gender_cat

~~local pos_category~~ = data.pos_category:gsub("^reconstructed ", "")

if not (no_gender_cat[data.lang:getCode()] or no_gender_cat[data.lang:getFullCode()]) then

~~pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]~~

pos_for_cat = (m_data or get_data()).pos_for_gender_number_cat[data.pos_category:gsub("^reconstructed ", "")]

end

local text, cats = ~~require(gender_and_number_module).~~format_genders(data.genders, data.lang, pos_for_cat)

local text, cats = format_genders(data.genders, data.lang, pos_for_cat)

~~for _, cat in ipairs(~~cats~~) do~~

if cats then

~~insert~~(data.categories, ~~cat~~)

extend(data.categories, cats)

end

retval = retval .. " " .. text

Line 325:

Line 445:

end

-- Forward reference

local format_inflections

local function format_inflection_parts(data, parts)

~~local any_part_translit = false~~

for j, part in ipairs(parts) do

if type(part) ~= "table" then

Line 338:

Line 458:

if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then

error("The face `" .. face .. "` " .. (

~~mw.loadData~~(~~script_utilities_data_module~~).faces[face] and

(script_utilities_data or get_script_utilities_data()).faces[face] and

"should not be used for non-headword terms on the headword line." or

"is invalid."

Line 347:

Line 467:

-- right into the 'data' table to disable inflection links of the entire headword

-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin

local nolinkinfl = data.nolinkinfl

local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl

local formatted

Line 361:

Line 481:

-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it

-- to be enabled in languages with more complex scripts (e.g. Arabic).

local tr = part.~~translit~~ or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)

--

if ~~tr ~= "-"~~ then

-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the

~~any_part_translit = true~~

-- nested inflections structure?

local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil

-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.

if part.translit then

error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")

end

~~formatted~~ = ~~require~~(~~links_module~~).full_link(

if part.transcription then

error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")

end

local postprocess_annotations

if part.inflections then

postprocess_annotations = function(infldata)

insert(infldata.annotations, format_inflections(data, part.inflections))

end

formatted = full_link(

{

term = not nolinkinfl and part.term or nil,

Line 371:

Line 505:

lang = part.lang or data.lang,

sc = part.sc or parts.sc or nil,

gloss = part.gloss,

pos = part.pos,

lit = part.lit,

id = part.id,

genders = part.genders,

tr = tr,

ts = part.~~transcription~~,

ts = part.ts,

accel = partaccel or parts.accel,

postprocess_annotations = postprocess_annotations,

},

face

Line 381:

Line 519:

end

parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j)

parts[j] = format_term_with_qualifiers_and_refs(part.lang or data.lang, part,

formatted, j)

end

local parts_output

if #parts ~~> 0~~ then

if parts[1] then

parts_output = (parts.label and " " or "") .. concat(parts)

elseif parts.request then

Line 396:

Line 535:

local parts_label = parts.label and ("" .. parts.label .. "") or ""

return parts_label .. parts_output, ~~any_part_translit~~

return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)

end

-- Format the inflections following the headword or nested after a given inflection.
-- `data` is the headword data object and `inflections` the array of inflection
-- specs to format. Each entry of `inflections` is overwritten in place with the
-- result of format_inflection_parts(), and the entries are then joined with ", ".
-- Returns "" when `inflections` is nil or has no first element.
-- Assigned to the forward-declared local `format_inflections` so that
-- format_inflection_parts() can call it for nested inflections.
format_inflections = function(data, inflections)
	if inflections and inflections[1] then
		-- Format each inflection individually.
		for key, infl in ipairs(inflections) do
			inflections[key] = format_inflection_parts(data, infl)
		end
		return concat(inflections, ", ")
	else
		return ""
	end
end

-- Format the top-level inflections following the headword. Currently this just adds parens around the
-- formatted comma-separated inflections in `data.inflections`.
-- Returns "" unchanged when format_inflections() produces no text, otherwise the
-- text wrapped as " (…)" with a leading space.
local function format_top_level_inflections(data)
	local result = format_inflections(data, data.inflections)
	if result == "" then
		return result
	end
	return " (" .. result .. ")"
end

--[==[
Returns the plural form of `pos`, a raw part of speech input, which could be singular or
plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
"kanji").

The lookup order is: an entry in the `irregular_plurals` table of the headword data; otherwise `pos` itself if it
already ends in "s"; otherwise the result of `pluralize(pos)`.
]==]
function export.pluralize_pos(pos)
	-- Load the headword data lazily; `m_data` is only set once get_data() has run.
	return (m_data or get_data()).irregular_plurals[pos] or
		pos:sub(-1) == "s" and pos or
		-- Make the plural form of the part of speech
		pluralize(pos)
end

--[==[

-- Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil

Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil

-- if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).

if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).

-- If you have a POS in its singular form, call export.pluralize_pos() above to pluralize it

If you have a POS in its singular form, call {export.pluralize_pos()} above to pluralize it

-- in a smart fashion that knows when to add "-s" and when to add "-es", and also takes

in a smart fashion that knows when to add "-s" and when to add "-es", and also takes

-- into account any irregular plurals.~~]==]~~

into account any irregular plurals.

--

-- If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess

If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess

-- based on whether it ends in " forms"; otherwise, return nil.]==]

based on whether it ends in " forms"; otherwise, return nil.

]==]

function export.pos_lemma_or_nonlemma(plpos, best_guess)

local m_headword_data = m_data or get_data()

local isLemma = m_headword_data.lemmas

-- Is it a lemma category?

if isLemma[plpos] then

Line 450:

Line 600:

end

-- Is it a nonlemma category?

local isNonLemma = m_headword_data.nonlemmas

if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then

return "non-lemma form"

Line 463:

Line 614:

end

--[==[
Convert a part of speech as given in 2= of {{tl|head}} to its canonical form. A known POS alias is expanded;
failing that, a trailing 'f' marks a non-lemma form alias, so the 'f' is removed, the remainder is expanded if it
is itself an alias, and " forms" is appended. The result is then passed through {export.pluralize_pos()}.
]==]
function export.canonicalize_pos(pos)
	-- FIXME: Temporary code to throw an error for alias 'pre' (= preposition) that will go away.
	-- 'pref' deliberately does not throw, as it's an alias for "prefix".
	if pos == "pre" then
		error("POS 'pre' for 'preposition' no longer allowed as it's too ambiguous; use 'prep'")
	-- Likewise for pro = pronoun.
	elseif pos == "pro" or pos == "prof" then
		error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
	end
	local aliases = (m_data or get_data()).pos_aliases
	local expanded = aliases[pos]
	if expanded then
		pos = expanded
	elseif pos:sub(-1) == "f" then
		local stem = pos:sub(1, -2)
		pos = (aliases[stem] or stem) .. " forms"
	end
	return export.pluralize_pos(pos)
end

-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a

Line 475:

Line 649:

local typ = type(data[element])

if typ ~= "table" then

error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))

error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))

end

for k, v in pairs(data[element]) do

if k ~= "maxindex" then

if type(k) ~= "number" then

error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))

error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))

end

if k > maxind then

Line 487:

Line 661:

if v then

if type(v) ~= "string" then

error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))

error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))

end

if not allow_blank_string and v == "" then

error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))

error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))

end

Line 519:

Line 693:

-- that.

if tbl == true then

~~if page.raw_defaultsort ~= sortkey then~~

~~insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")~~

~~end~~

return

end

Line 531:

Line 702:

different = true

end

~~end~~

~~if redundant then~~

~~insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")~~

~~end~~

~~if different then~~

~~insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")~~

end

return sortkey

end

function export.maintenance_cats(page, lang, lang_cats, page_cats)

~~for _~~, ~~cat in ipairs(~~page.cats) do

extend(page_cats, page.cats)

~~insert(page_cats, cat)~~

~~end~~

lang = lang:getFull() -- since we are just generating categories

local canonical = lang:getCanonicalName()

Line 550:

Line 713:

if tbl then

sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)

~~insert(lang_cats, canonical .. " entries with topic categories using raw markup")~~

end

tbl = page.wikitext_langname_cat[canonical]

if tbl then

handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)

~~insert(lang_cats, canonical .. " entries with language name categories using raw markup")~~

~~end~~

~~if require(utilities_module).get_current_L2() ~= canonical then~~

~~insert(lang_cats, canonical .. " entries with incorrect language header")~~

end

Line 570:

Line 728:

]==]

function export.full_headword(data)

~~local remove_links = require(links_module).remove_links~~

~~local format_categories = require(utilities_module).format_categories~~

-- Prevent data from being destructively modified.

local data = ~~require(table_module).shallowcopy~~(data)

local data = shallow_copy(data)

------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------

if data.getCanonicalName then

error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")

error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")

end

if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then

error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")

error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")

end

if data.id and type(data.id) ~= "string" then

error("The id in the data table should be a string.")

error("Internal error: The id in the data table should be a string.")

end

Line 593:

Line 748:

local langcode = data.lang:getCode()

local full_langcode = ~~langcode~~

local full_langcode = data.lang:getFullCode()

local langname = data.lang:getCanonicalName()

local full_langname = ~~langname~~

local full_langname = data.lang:getFullName()

local raw_pagename~~, page~~ = data.pagename

local raw_pagename = data.pagename

if raw_pagename and raw_pagename ~= ~~m_data~~.pagename then -- for testing, doc pages, etc.

local page

page ~~= require~~(~~headword_page_module~~).process_page(raw_pagename)

local m_headword_data = m_data or get_data()

if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.

-- data.pagename is often set on documentation and test pages through the pagename= parameter of various

-- templates, to emulate running on that page. Having a large number of such test templates on a single

-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,

-- we don't really need to do that and can function fine without fetching and parsing the contents of a

-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser

-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to

-- be found and where actual content does not live (User, Template, Module).

local actual_namespace = m_headword_data.page.namespace

local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or

actual_namespace == "Module"

page = process_page(raw_pagename, no_fetch_content)

else

page = ~~m_data~~.page

page = m_headword_data.page

end

~~-- Check the~~ namespace ~~against the language type.~~

local namespace = page.namespace

if page.namespace ~~== "" then~~

~~if data.lang:hasType("reconstructed") then~~

~~error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")~~

~~elseif data.lang:hasType("appendix-constructed") then~~

~~error("Entries in " .. langname .. " must be placed in the Appendix: namespace")~~

~~end~~

------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------

Line 618:

Line 778:

-- new-style

if data.translits or data.transcriptions then

error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")

error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")

end

else

-- convert old-style `heads`, `translits` and `transcriptions` to new-style

local maxind = ~~math.~~max(

local maxind = max(

init_and_find_maximum_index(data, "heads"~~, true~~),

init_and_find_maximum_index(data, "heads"),

init_and_find_maximum_index(data, "translits", true),

init_and_find_maximum_index(data, "transcriptions", true)

Line 643:

Line 803:

------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------

init_and_find_maximum_index(data, "categories"~~, true~~)

-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]

init_and_find_maximum_index(data, "whole_page_categories"~~, true~~)

if data.altform then

data.noposcat = true

end

init_and_find_maximum_index(data, "categories")

init_and_find_maximum_index(data, "whole_page_categories")

local pos_category_already_present = false

if #data.categories ~~> 0~~ then

if data.categories[1] then

local escaped_langname = pattern_escape(full_langname)

local matches_lang_pattern = "^" .. escaped_langname .. " "

Line 660:

Line 825:

if not data.pos_category then

error("`data.pos_category` not specified and could not be inferred from the categories given in "

error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "

.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "

.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "

Line 680:

Line 845:

local postype = export.pos_lemma_or_nonlemma(data.pos_category)

if not data.noposcat then

if postype == "lemma" then

postype = data.lang:getMainCategoryName()

end

insert(data.categories, 1, full_langname .. " " .. postype .. "s")

end

insert(data.categories, 1, "Contionary")

-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]

if data.altform then

insert(data.categories, 1, full_langname .. " alternative forms")

end

------------ 5. Create a default headword, and add links to multiword page names. ------------

-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be

-- reconstructed.

local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")

local lang_reconstructed = data.lang:hasType("reconstructed")

if is_anti_asterisk then

if not lang_reconstructed then

error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")

end

lang_reconstructed = false

end

-- Determine if term is reconstructed

local is_reconstructed = ~~page.~~namespace == "Reconstruction" or ~~data.lang:hasType("reconstructed")~~

local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed

-- Create a default headword based on the pagename, which is determined in

Line 693:

Line 881:

-- Add links to multi-word page names when appropriate

if not data.nolinkhead ~~and not m_data~~.no_multiword_links[langcode] ~~and not m_data.no_multiword_links~~[full_langcode]

if not (is_reconstructed or data.nolinkhead) then

~~and not is_reconstructed~~ and export.head_is_multiword(default_head) then

local no_links = m_headword_data.no_multiword_links

default_head = export.add_multiword_links(default_head, true)

if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then

default_head = export.add_multiword_links(default_head, true)

end

Line 702:

Line 892:

end

------------ 6. Fill in missing values in `data.heads`. ------------

------------ 6. Check the namespace against the language type. ------------

if namespace == "" then

if lang_reconstructed then

error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")

elseif data.lang:hasType("appendix-constructed") then

error("Entries in " .. langname .. " must be placed in the Appendix: namespace")

end

elseif namespace == "Citations" or namespace == "Thesaurus" then

error("Headword templates should not be used in the " .. namespace .. ": namespace.")

end

------------ 7. Fill in missing values in `data.heads`. ------------

-- True if any script among the headword scripts has spaces in it.

Line 711:

Line 913:

for _, head in ipairs(data.heads) do

------ 6a. If missing head, replace with default head.

------ 7a. If missing head, replace with default head.

if not head.term then

head.term = default_head

elseif head.term == default_head then

has_redundant_head_param = true

elseif is_anti_asterisk and head.term == "!!" then

-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.

head.term = "!!" .. default_head

elseif head.term:find("^[!?]$") then

-- If explicit head= just consists of ! or ?, add it to the end of the default head.

head.term = default_head .. head.term

end

head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term

if is_reconstructed then

local head_term = head.term

if head_term:find("%[%[") then

head_term = ~~require(links_module).~~remove_links(head_term)

head_term = remove_links(head_term)

end

if head_term:sub(1, 1) ~= "*" then

Line 728:

Line 937:

end

------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,

------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,

------ otherwise fall back to the overall script if given. If neither given, autodetect the script.

local auto_sc = ~~require("Module:scripts").getByCode(~~data.lang:~~getScriptCodes~~(head.term))

local auto_sc = data.lang:findBestScript(head.term)

if (

auto_sc:getCode() == "None" and

~~require(scripts_module).findBestScriptWithoutLang~~(head.term):getCode() ~= "None"

find_best_script_without_lang(head.term):getCode() ~= "None"

) then

insert(data.categories, full_langname .. " terms in nonstandard scripts")

Line 743:

Line 952:

if not head.sc then -- Overall script code given.

head.sc = data.sc

~~end~~

~~-- Track uses of sc parameter.~~

~~if head.sc:getCode() == auto_sc:getCode() then~~

~~insert(data.categories, full_langname .. " terms with redundant script codes")~~

~~else~~

~~insert(data.categories, full_langname .. " terms with non-redundant manual script codes")~~

end

Line 762:

Line 965:

any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()

------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given

------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given

------ (provided automatic translit is available, e.g. not in Persian or Hebrew).

-- Make transliterations

head.tr_manual = nil

-- Try to generate a transliteration if necessary

if head.tr == "-" then

head.tr = nil

~~elseif~~ not notranslit[langcode] ~~and not~~ notranslit[full_langcode] and head.sc:isTransliterated() then

else

head.tr_manual = not not head.tr

local notranslit = m_headword_data.notranslit

if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then

head.tr_manual = not not head.tr

local text = head.~~term~~

local text = head.term_no_initial_bang_bang

if not data.lang:link_tr(head.sc) then

text = remove_links(text)

end

local automated_tr, tr_categories

automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)

if automated_tr or head.tr_fail then

local manual_tr = head.tr

~~if manual_tr then~~

if not manual_tr then

if ~~(remove_links(~~manual_tr~~) == remove_links(automated_tr)) and (not head.tr_fail)~~ then

head.tr = automated_tr

~~insert(data.categories, full_langname .. " terms with redundant transliterations")~~

extend(data.categories, tr_categories)

~~elseif not~~ head.~~tr_fail then~~

~~insert~~(data.categories, ~~full_langname .. " terms with non-redundant manual transliterations"~~)

end

if not ~~manual_tr~~ then

-- There is still no transliteration?

head.tr = ~~automated_tr~~

-- Add the entry to a cleanup category.

for _, category ~~in ipairs(tr_categories) do~~

if not head.tr then

insert(data.categories, ~~category~~)

head.tr = "transliteration needed"

~~end~~

-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.

-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].

insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")

else

-- Otherwise, trim it.

head.tr = trim(head.tr)

end

~~end~~

~~-- There is still no transliteration?~~

~~-- Add the entry to a cleanup category.~~

~~if not head.tr then~~

~~head.tr = "transliteration needed"~~

~~-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.~~

~~-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].~~

~~insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")~~

~~else~~

~~-- Otherwise, trim it.~~

~~head.tr = mw.text.trim(head.tr)~~

end

Line 816:

Line 1,013:

-- Link to the transliteration entry for languages that require this.

if head.tr and data.lang:link_tr(head.sc) then

head.tr = ~~require(links_module).~~full_link {

head.tr = full_link{

term = head.tr,

lang = data.lang,

sc = ~~require(scripts_module).getByCode~~("Latn"),

sc = get_script("Latn"),

tr = "-"

}

Line 825:

Line 1,022:

end

------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------

------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------

-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.

Line 837:

Line 1,034:

local dt_script = data.heads[1].sc

local dt_script_code = dt_script:getCode()

local page_non_ascii = ~~page.~~namespace == "" and not page.pagename:find("^[%z\1-\127]+$")

local page_non_ascii = namespace == "" and not page.pagename:find("^[%z\1-\127]+$")

local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "")

if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then

display_title = 'Unsupported titles/' .. page.unsupported_titles[unsupported_pagename] .. ''

elseif page_non_ascii and toBeTagged[dt_script_code]

elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]

or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))

or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then

Line 848:

Line 1,045:

elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then

display_title = '' .. page.full_raw_pagename .. ''

elseif ~~page.~~namespace == "Reconstruction" then

elseif namespace == "Reconstruction" then

local matched

display_title, matched = ~~rsubn~~(

display_title, matched = ugsub(

page.full_raw_pagename,

"^(Reconstruction:[^/]+/)(.+)$",

function(before, term)

return before ..

return before .. tag_text(term, data.lang, dt_script)

~~require(script_utilities_module)~~.tag_text(

term,

data.lang,

dt_script

)

end

)

Line 865:

Line 1,057:

display_title = nil

end

-- FIXME: Generalize this.

-- If the current language uses ur-Arab (for Urdu, etc.), ku-Arab (Central Kurdish) or pa-Arab

-- (Shahmukhi, for Punjabi) and there's more than one language on the page, don't set the display title

-- because these three scripts display in Nastaliq and we don't want this for terms that also exist in other

-- languages that don't display in Nastaliq (e.g. Arabic or Persian) to display in Nastaliq. Because the word

-- "Urdu" occurs near the end of the alphabet, Urdu fonts tend to override the fonts of other languages.

-- FIXME: This is checking for more than one language on the page but instead needs to check if there are any

-- languages using scripts other than the ones just mentioned.

if (dt_script_code == "ur-Arab" or dt_script_code == "ku-Arab" or dt_script_code == "pa-Arab") and page.L2_list.n > 1 then

display_title = nil

end

Line 874:

Line 1,077:

end

------------ 8. Insert additional categories. ------------

------------ 9. Insert additional categories. ------------

if ~~has_redundant_head_param~~ then

-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".

if not data.~~no_redundant_head_cat then~~

if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then

~~insert~~(~~data~~.~~categories, full_langname~~ .. " ~~terms with redundant head parameter"~~)

local no_multiword_cat = m_headword_data.no_multiword_cat

~~end~~

if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then

~~end~~

-- Check for spaces or hyphens, but exclude prefixes and suffixes.

-- Use the pagename, not the head= value, because the latter may have extra

-- junk in it, e.g. superscripted text that throws off the algorithm.

local no_hyphen = m_headword_data.hyphen_not_multiword_sep

-- Exclude hyphens if the data module states that they should for this language.

local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."

local is_multiword = umatch(page.pagename, checkpattern)

~~-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".~~

if is_multiword and not non_categorizable(page.full_raw_pagename) then

if ~~not data.nomultiwordcat and any_script_has_spaces~~ and ~~postype == "lemma" and~~

insert(data.categories, full_langname .. " multiword terms")

not ~~m_data~~.~~no_multiword_cat[langcode] and not m_data.no_multiword_cat[full_langcode]~~ then

elseif not is_multiword then

~~-- Check for spaces or hyphens, but exclude prefixes and suffixes~~.

local long_word_threshold = m_headword_data.long_word_thresholds[langcode]

~~-- Use the pagename~~, ~~not the head= value, because the latter may have extra~~

if long_word_threshold and ulen(page.pagename) >= long_word_threshold then

~~-- junk in it, e.g~~. ~~superscripted text that throws off the algorithm~~.

insert(data.categories, "Long " .. full_langname .. " words")

~~local checkpattern =~~ "~~.[%s%-፡].~~"

end

~~if m_data.hyphen_not_multiword_sep[langcode] or m_data.hyphen_not_multiword_sep[full_langcode]~~ then

end

~~-- Exclude hyphens if the data module states that they should for this language~~

~~checkpattern~~ = ".[~~%s፡~~]."

~~end~~

if ~~rmatch~~(page.pagename~~, checkpattern) and not non_categorizable(page.full_raw_pagename~~) then

insert(data.categories, full_langname .. " ~~multiword terms~~")

end

Line 906:

Line 1,110:

-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.

if ~~page.~~namespace ~= "Reconstruction" then

if namespace ~= "Reconstruction" then

-- Map from languages to a string containing the characters to ignore when considering whether a term has

-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic

Line 956:

Line 1,160:

local ch_to_ignore = characters_to_ignore[full_langcode]

if ch_to_ignore then

canon_pagename = ~~rsub~~(canon_pagename, "[" .. ch_to_ignore .. "]", "")

canon_pagename = ugsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")

end

~~local script_data = mw.loadData(scripts_data_module)~~

while true do

if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then

Line 965:

Line 1,168:

-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]

num_loops = num_loops + 1

local pagename_script = ~~require(scripts_module).findBestScriptWithoutLang~~(canon_pagename, "None only as last resort")

local pagename_script = find_best_script_without_lang(canon_pagename, "None only as last resort")

local script_chars = pagename_script.characters

if not script_chars then

Line 973:

Line 1,176:

local script_code = pagename_script:getCode()

local replaced

canon_pagename, replaced = ~~rsubn~~(canon_pagename, "[" .. script_chars .. "]", "")

canon_pagename, replaced = ugsub(canon_pagename, "[" .. script_chars .. "]", "")

if replaced and script_code ~= "Zmth" and script_data[script_code] and

if (

script_data[script_code].character_category ~= false then

replaced and

script_code ~= "Zmth" and

(script_data or get_script_data())[script_code] and

script_data[script_code].character_category ~= false

) then

script_code = script_code:gsub("^.-%-", "")

if not seen_scripts[script_code] then

Line 988:

Line 1,195:

end

-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.

local standard = data.lang:getStandardCharacters()

Line 1,032:

Line 1,239:

return ""

end

local sc_standard = ~~rsub~~(sc_standard, page.comb_chars.combined_double, explode)

local sc_standard = ugsub(sc_standard, page.comb_chars.combined_double, explode)

sc_standard = ~~rsub~~(sc_standard,page.comb_chars.combined_single, explode)

sc_standard = ugsub(sc_standard,page.comb_chars.combined_single, explode)

:gsub(".[\128-\191]*", explode)

local num_cat_inserted

Line 1,043:

Line 1,250:

num_cat_inserted = true

end

elseif ~~rfind~~(char, page.emoji_pattern) then

elseif ufind(char, page.emoji_pattern) then

insert(data.categories, full_langname .. " terms spelled with emoji")

else

Line 1,057:

Line 1,264:

-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.

sc_standard = toNFD(sc_standard)

for diacritic in ~~rgmatch~~(page.decompose_pagename, page.comb_chars.diacritics_single) do

for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do

if not ~~rmatch~~(sc_standard, diacritic) then

if not umatch(sc_standard, diacritic) then

insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic)

end

for diacritic in ~~rgmatch~~(page.decompose_pagename, page.comb_chars.diacritics_double) do

for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do

if not ~~rmatch~~(sc_standard, diacritic) then

if not umatch(sc_standard, diacritic) then

insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic .. "◌")

end

Line 1,071:

Line 1,278:

-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).

elseif ulen(page.pagename) ~= 1 then

for character in ~~rgmatch~~(page.pagename, "([^" .. standard .. "])") do

for character in ugmatch(page.pagename, "([^" .. standard .. "])") do

local upper = char_category(character)

if not ~~rmatch~~(upper, "[" .. standard .. "]") then

if not umatch(upper, "[" .. standard .. "]") then

character = upper

end

Line 1,080:

Line 1,287:

end

if data.heads[1].sc:isSystem("alphabet") then

local pagename, i = page.pagename:ulower(), 2

while ~~rmatch~~(pagename, "(%a)" .. ("%1"):rep(i)) do

while umatch(pagename, "(%a)" .. ("%1"):rep(i)) do

i = i + 1

insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")

end

-- Categorise for palindromes

if not data.nopalindromecat and ~~page.~~namespace ~= "Reconstruction" and ulen(page.pagename) > 2

if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(page.pagename) > 2

-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of

-- multiple scripts?

and ~~require(palindromes_module).~~is_palindrome(page.pagename, data.lang, data.heads[1].sc) then

and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then

insert(data.categories, full_langname .. " palindromes")

end

if ~~page~~.~~namespace ==~~ "" ~~and not~~ data.lang:~~hasType~~("~~reconstructed~~") then

if data.affix then

~~local m_links = require~~(~~links_module~~)

for _, aff in ipairs(data.affix) do

if mw.ustring.match(aff, "^%-[^-]*%-$") then

table.insert(data.categories, data.lang:getCanonicalName() .. " words interfixed with " .. aff)

elseif mw.ustring.match(aff, "%-%s%-") then

table.insert(data.categories, data.lang:getCanonicalName() .. " words circumfixed with " .. aff)

elseif mw.ustring.match(aff, "%-$") then

table.insert(data.categories, data.lang:getCanonicalName() .. " words prefixed with " .. aff)

elseif mw.ustring.match(aff, "^%-") then

table.insert(data.categories, data.lang:getCanonicalName() .. " words suffixed with " .. aff)

end

-- Add to various maintenance categories.

export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)

------------ 9. Format and return headwords, genders, inflections and categories. ------------

------------ 10. Format and return headwords, genders, inflections and categories. ------------

-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),

Line 1,110:

Line 1,327:

local text = '' ..

format_headword(data) ..

~~format_genders~~(data) ..

format_headword_genders(data) ..

~~format_inflections~~(data) .. ''

format_top_level_inflections(data) .. ''

-- Language-specific categories.

@@ Line 2: / Line 2: @@
 -- Named constants for all modules used, to make it easier to swap out sandbox versions.
+local en_utilities_module = "Module:en-utilities"
 local gender_and_number_module = "Module:getn"
 local headword_data_module = "Module:headword/data"
 local headword_page_module = "Module:headword/page"
 local links_module = "Module:links"
+local load_module = "Module:load"
+local pages_module = "Module:pages"
 local palindromes_module = "Module:palindromes"
-local qualifier_module = "Module:qualifier"
+local pron_qualifier_module = "Module:pron qualifier"
 local scripts_module = "Module:scripts"
 local scripts_data_module = "Module:scripts/data"
@@ Line 15: / Line 18: @@
 local table_module = "Module:table"
 local utilities_module = "Module:utilities"
-local m_str_utils = require(string_utilities_module)
 local concat = table.concat
-local encode_entities = m_str_utils.encode_entities
+local dump = mw.dumpObject
 local insert = table.insert
 local ipairs = ipairs
+local max = math.max
+local new_title = mw.title.new
 local pairs = pairs
-local pattern_escape = m_str_utils.pattern_escape
+local require = require
-local rgmatch = mw.ustring.gmatch
-local rsubn = mw.ustring.gsub
-local rfind = mw.ustring.find
-local ulen = m_str_utils.len
-local rmatch = mw.ustring.match
 local toNFC = mw.ustring.toNFC
 local toNFD = mw.ustring.toNFD
+local type = type
+local ufind = mw.ustring.find
+local ugmatch = mw.ustring.gmatch
+local ugsub = mw.ustring.gsub
+local umatch = mw.ustring.match
+--[==[
+Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
+	local function encode_entities(...)
+		encode_entities = require(string_utilities_module).encode_entities
+		return encode_entities(...)
+	end
+	local function extend(...)
+		extend = require(table_module).extend
+		return extend(...)
+	end
+	local function find_best_script_without_lang(...)
+		find_best_script_without_lang = require(scripts_module).findBestScriptWithoutLang
+		return find_best_script_without_lang(...)
+	end
+	local function format_categories(...)
+		format_categories = require(utilities_module).format_categories
+		return format_categories(...)
+	end
+	local function format_genders(...)
+		format_genders = require(gender_and_number_module).format_genders
+		return format_genders(...)
+	end
+	local function format_pron_qualifiers(...)
+		format_pron_qualifiers = require(pron_qualifier_module).format_qualifiers
+		return format_pron_qualifiers(...)
+	end
+	local function full_link(...)
+		full_link = require(links_module).full_link
+		return full_link(...)
+	end
+	local function get_current_L2(...)
+		get_current_L2 = require(pages_module).get_current_L2
+		return get_current_L2(...)
+	end
+	local function get_link_page(...)
+		get_link_page = require(links_module).get_link_page
+		return get_link_page(...)
+	end
+	local function get_script(...)
+		get_script = require(scripts_module).getByCode
+		return get_script(...)
+	end
+	local function is_palindrome(...)
+		is_palindrome = require(palindromes_module).is_palindrome
+		return is_palindrome(...)
+	end
+	local function language_link(...)
+		language_link = require(links_module).language_link
+		return language_link(...)
+	end
+	local function load_data(...)
+		load_data = require(load_module).load_data
+		return load_data(...)
+	end
+	local function pattern_escape(...)
+		pattern_escape = require(string_utilities_module).pattern_escape
+		return pattern_escape(...)
+	end
+	local function pluralize(...)
+		pluralize = require(en_utilities_module).pluralize
+		return pluralize(...)
+	end
+	local function process_page(...)
+		process_page = require(headword_page_module).process_page
+		return process_page(...)
+	end
+	local function remove_links(...)
+		remove_links = require(links_module).remove_links
+		return remove_links(...)
+	end
+	local function shallow_copy(...)
+		shallow_copy = require(table_module).shallowCopy
+		return shallow_copy(...)
+	end
+	local function tag_text(...)
+		tag_text = require(script_utilities_module).tag_text
+		return tag_text(...)
+	end
+	local function tag_transcription(...)
+		tag_transcription = require(script_utilities_module).tag_transcription
+		return tag_transcription(...)
+	end
+	local function tag_translit(...)
+		tag_translit = require(script_utilities_module).tag_translit
+		return tag_translit(...)
+	end
+	local function trim(...)
+		trim = require(string_utilities_module).trim
+		return trim(...)
+	end
+	local function ulen(...)
+		ulen = require(string_utilities_module).len
+		return ulen(...)
+	end
+--[==[
+Loaders for objects, which load data (or some other object) into some variable, which can then be accessed as "foo or get_foo()", where the function get_foo sets the object to "foo" and then returns it. This ensures they are only loaded when needed, and avoids the need to check for the existence of the object each time, since once "foo" has been set, "get_foo" will not be called again.]==]
+	local m_data
+	local function get_data()
+		m_data = load_data(headword_data_module)
+		return m_data
+	end
-local m_data = mw.loadData(headword_data_module)
+	local script_data
+	local function get_script_data()
+		script_data = load_data(scripts_data_module)
+		return script_data
+	end
-local isLemma = m_data.lemmas
+	local script_utilities_data
-local isNonLemma = m_data.nonlemmas
+	local function get_script_utilities_data()
-local notranslit = m_data.notranslit
+		script_utilities_data = load_data(script_utilities_data_module)
-local toBeTagged = m_data.toBeTagged
+		return script_utilities_data
+	end
 -- If set to true, categories always appear, even in non-mainspace pages
 local test_force_categories = false
--- Version of rsubn() that discards all but the first return value.
-local function rsub(term, foo, bar)
-	return (rsubn(term, foo, bar))
-end
 local function text_in_script(text, script_code)
-	local sc = require(scripts_module).getByCode(script_code)
+	local sc = get_script(script_code)
 	if not sc then
 		error("Internal error: Bad script code " .. script_code)
@@ Line 56: / Line 184: @@
 	local out
 	if characters then
-		text = rsub(text, "%W", "")
+		text = ugsub(text, "%W", "")
-		out = rfind(text, "[" .. characters .. "]")
+		out = ufind(text, "[" .. characters .. "]")
 	end
@@ Line 71: / Line 199: @@
 --[[ List of punctuation or spacing characters that are found inside of words.
 	 Used to exclude characters from the regex above. ]]
-local wordPunc = "-־׳״'.·*’་•:᠊"
+local wordPunc = "-#%%&@־׳״'.·*’་•:᠊"
 local notWordPunc = "[^" .. wordPunc .. "]+"
--- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references or
+-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, labels, references
--- customized separator: `part` is the object specifying the term, which should optionally contain:
+-- or customized separator: `part` is the object specifying the term (and `lang` the language of the term), which should
--- * left qualifiers in `q`, an array of strings (or `qualifiers` for compatibility purposes);
+-- optionally contain:
+-- * left qualifiers in `q`, an array of strings;
 -- * right qualifiers in `qq`, an array of strings;
+-- * left labels in `l`, an array of strings;
+-- * right labels in `ll`, an array of strings;
 -- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
 --   (formatted reference text) and optionally `name` and/or `group`;
 -- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "".
 -- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
-local function format_term_with_qualifiers_and_refs(part, formatted, j)
-	local left_qualifiers, right_qualifiers
-	local reftext
-	left_qualifiers = part.q and #part.q > 0 and part.q
+local function format_term_with_qualifiers_and_refs(lang, part, formatted, j)
-	if left_qualifiers then
+	local function part_non_empty(field)
-		left_qualifiers = require(qualifier_module).format_qualifier(left_qualifiers) .. " "
+		local list = part[field]
+		if not list then
+			return nil
+		end
+		if type(list) ~= "table" then
+			error(("Internal error: Wrong type for `part.%s`=%s, should be \"table\""):format(field, dump(list)))
+		end
+		return list[1]
 	end
-	right_qualifiers = part.qq and #part.qq > 0 and part.qq
+	if part_non_empty("q") or part_non_empty("qq") or part_non_empty("l") or
-	if right_qualifiers then
+		part_non_empty("ll") or part_non_empty("refs") then
-		right_qualifiers = " " .. require(qualifier_module).format_qualifier(right_qualifiers)
+		formatted = format_pron_qualifiers {
-	end
+			lang = lang,
-	if part.refs and #part.refs > 0 then
+			text = formatted,
-		local refs = {}
+			q = part.q,
-		for _, ref in ipairs(part.refs) do
+			qq = part.qq,
-			if type(ref) ~= "table" then
+			l = part.l,
-				ref = {text = ref}
+			ll = part.ll,
-			end
+			refs = part.refs,
-			local refargs
+		}
-			if ref.name or ref.group then
-				refargs = {name = ref.name, group = ref.group}
-			end
-			insert(refs, mw.getCurrentFrame():extensionTag("ref", ref.text, refargs))
-		end
-		reftext = concat(refs)
 	end
 	local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator
-	if left_qualifiers then
-		formatted = left_qualifiers .. formatted
-	end
-	if reftext then
-		formatted = formatted .. reftext
-	end
-	if right_qualifiers then
-		formatted = formatted .. right_qualifiers
-	end
 	if separator then
 		formatted = separator .. formatted
@@ Line 132: / Line 252: @@
 --[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
 function export.head_is_multiword(head)
-	for possibleWordBreak in rgmatch(head, spacingPunctuation) do
+	for possibleWordBreak in ugmatch(head, spacingPunctuation) do
-		if rmatch(possibleWordBreak, notWordPunc) then
+		if umatch(possibleWordBreak, notWordPunc) then
 			return true
 		end
@@ Line 141: / Line 261: @@
 end
+do
+	-- Throughout this block "\1" and "\2" are temporary stand-ins for "[[" and "]]"; the final gsub of
+	-- add_multiword_links swaps them for the real brackets.
---[==[Add links to a multiword head.]==]
-function export.add_multiword_links(head, default)
+	-- Replacement callback applied to each match of `spacingPunctuation`: every run matching `notWordPunc` is
+	-- wrapped as "\2" .. run .. "\1", i.e. the link before it is closed and a new one opened after it. The outer
+	-- parentheses discard ugsub's second return value.
 	local function workaround_to_exclude_chars(s)
-		return rsub(s, notWordPunc, "\2%1\1")
+		return (ugsub(s, notWordPunc, "\2%1\1"))
 	end
-	head = "\1" .. rsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
+	--[==[Add links to a multiword head.]==]
-	if default then
+	function export.add_multiword_links(head, default)
-		head = head
+		head = "\1" .. ugsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "\2"
-			:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
+		if default then
-			:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
+			head = head
-	end
+				:gsub("(\1[^\2]*)\\([:#][^\2]*\2)", "%1\\\\%2")
+				:gsub("(\1[^\2]*)([:#][^\2]*\2)", "%1\\%2")
+		end
-	--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
+		--Escape any remaining square brackets to stop them breaking links (e.g. "[citation needed]").
-	head = encode_entities(head, "[]", true, true)
+		head = encode_entities(head, "[]", true, true)
-	--[=[
+		--[=[
-	use this when workaround is no longer needed:
+		use this when workaround is no longer needed:
-	head = "[[" .. rsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
+		head = "[[" .. ugsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
-	Remove any empty links, which could have been created above
+		Remove any empty links, which could have been created above
-	at the beginning or end of the string.
+		at the beginning or end of the string.
-	]=]
+		]=]
+		-- Drop empty "\1\2" pairs (empty links), then turn the remaining placeholders into "[[" and "]]". The outer
+		-- parentheses make the function return only the resulting string.
-	return (head
+		return (head
-		:gsub("\1\2", "")
+			:gsub("\1\2", "")
-		:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
+			:gsub("[\1\2]", {["\1"] = "[[", ["\2"] = "]]"}))
+	end
 end
 local function non_categorizable(full_raw_pagename)
+	-- Returns a truthy value when `full_raw_pagename` starts with "Appendix:Gestures/", or starts with
+	-- "Unsupported titles/" and contains no backtick; otherwise a falsy value.
+	-- NOTE(review): presumably callers use this to suppress categories on such pages -- confirm at the call site.
-	return full_raw_pagename:find("^Appendix:Gestures/")
+	return full_raw_pagename:find("^Appendix:Gestures/") or
+		-- Unsupported titles with descriptive names.
+		(full_raw_pagename:find("^Unsupported titles/") and not full_raw_pagename:find("`"))
 end
+-- Wrap an already-formatted head in its language/script tagging (face "head"), then attach the head's qualifiers,
+-- labels, references and separator. `j` is the index of the head; `data.id` is handed to tag_text() only for the
+-- first head (j == 1).
+local function tag_text_and_add_quals_and_refs(data, head, formatted, j)
+	-- Only the first head carries the id; a missing/false id is passed on as nil.
+	local id
+	if j == 1 and data.id then
+		id = data.id
+	end
+	local tagged = tag_text(formatted, data.lang, head.sc, "head", nil, id)
+	return format_term_with_qualifiers_and_refs(data.lang, head, tagged, j)
+end
 -- Format a headword with transliterations.
 local function format_headword(data)
-	local m_scriptutils = require(script_utilities_module)
 	-- Are there non-empty transliterations?
 	local has_translits = false
@@ Line 190: / Line 317: @@
 	local unique_head_parts = {}
-	local has_multiple_heads = #data.heads > 1
+	local has_multiple_heads = not not data.heads[2]
 	for j, head in ipairs(data.heads) do
@@ Line 204: / Line 331: @@
 		-- Apply processing to the headword, for formatting links and such.
 		if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Image" then
-			formatted = require(links_module).language_link{term = head.term, lang = data.lang}
+			formatted = language_link{term = head.term, lang = data.lang}
 		else
 			formatted = data.lang:makeDisplayText(head.term, head.sc, true)
 		end
-		local function tag_text_and_add_quals_and_refs(head, formatted, j)
+		local head_part = tag_text_and_add_quals_and_refs(data, head, formatted, j)
-			-- Add language and script wrapper.
-			formatted = m_scriptutils.tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)
-			-- Add qualifiers, references and separator.
-			return format_term_with_qualifiers_and_refs(head, formatted, j)
-		end
-		local head_part = tag_text_and_add_quals_and_refs(head, formatted, j)
 		insert(head_parts, head_part)
@@ Line 227: / Line 346: @@
 				unique_head_part = head_part
 			else
-				unique_head_part = tag_text_and_add_quals_and_refs(head, formatted, 1)
+				unique_head_part = tag_text_and_add_quals_and_refs(data, head, formatted, 1)
 			end
 			unique_head_parts[unique_head_part] = true
@@ Line 255: / Line 374: @@
 				local this_parts = {}
 				if head.tr then
-					insert(this_parts, m_scriptutils.tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
+					insert(this_parts, tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
 					if head.ts then
 						insert(this_parts, " ")
@@ Line 261: / Line 380: @@
 				end
 				if head.ts then
-					insert(this_parts, "/" .. m_scriptutils.tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
+					insert(this_parts, "/" .. tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
 				end
 				insert(translit_parts, concat(this_parts))
@@ Line 270: / Line 389: @@
 		local langname = data.lang:getCanonicalName()
-		local transliteration_page = mw.title.new(langname .. " transliteration", "Wiktionary")
+		local transliteration_page = new_title(langname .. " transliteration")
 		local saw_translit_page = false
 		if transliteration_page and transliteration_page.exists then
-			translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
+			translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
 			saw_translit_page = true
 		end
@@ Line 281: / Line 400: @@
 		if not saw_translit_page and data.lang:hasType("etymology-only") then
 			langname = data.lang:getFullName()
-			transliteration_page = mw.title.new(langname .. " transliteration", "Wiktionary")
+			transliteration_page = new_title(langname .. " transliteration")
 			if transliteration_page and transliteration_page.exists then
-				translits_formatted = " [[Wiktionary:" .. langname .. " transliteration|•]]" .. translits_formatted
+				translits_formatted = " [[" .. langname .. " transliteration|•]]" .. translits_formatted
 			end
 		end
@@ Line 304: / Line 423: @@
-local function format_genders(data)
+local function format_headword_genders(data)
 	local retval = ""
-	if data.genders and #data.genders > 0 then
+	if data.genders and data.genders[1] then
 		if data.gloss then
 			retval = ","
 		end
 		local pos_for_cat
-		if not data.nogendercat and not m_data.no_gender_cat[data.lang:getCode()] and
+		if not data.nogendercat then
-			not m_data.no_gender_cat[data.lang:getFullCode()] then
+			local no_gender_cat = (m_data or get_data()).no_gender_cat
-			local pos_category = data.pos_category:gsub("^reconstructed ", "")
+			if not (no_gender_cat[data.lang:getCode()] or no_gender_cat[data.lang:getFullCode()]) then
-			pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]
+				pos_for_cat = (m_data or get_data()).pos_for_gender_number_cat[data.pos_category:gsub("^reconstructed ", "")]
+			end
 		end
-		local text, cats = require(gender_and_number_module).format_genders(data.genders, data.lang, pos_for_cat)
+		local text, cats = format_genders(data.genders, data.lang, pos_for_cat)
-		for _, cat in ipairs(cats) do
+		if cats then
-			insert(data.categories, cat)
+			extend(data.categories, cats)
 		end
 		retval = retval .. "&nbsp;" .. text
@@ Line 325: / Line 445: @@
 end
+-- Forward reference
+local format_inflections
 local function format_inflection_parts(data, parts)
-	local any_part_translit = false
 	for j, part in ipairs(parts) do
 		if type(part) ~= "table" then
@@ Line 338: / Line 458: @@
 		if face ~= "bold" and face ~= "plain" and face ~= "hypothetical" then
 			error("The face `" .. face .. "` " .. (
-				mw.loadData(script_utilities_data_module).faces[face] and
+				(script_utilities_data or get_script_utilities_data()).faces[face] and
 				"should not be used for non-headword terms on the headword line." or
 				"is invalid."
@@ Line 347: / Line 467: @@
 		-- right into the 'data' table to disable inflection links of the entire headword
 		-- when inflected forms aren't entry-worthy, e.g.: in Vulgar Latin
-		local nolinkinfl = data.nolinkinfl
+		local nolinkinfl = part.face == "hypothetical" or part.nolinkinfl or data.nolinkinfl
 		local formatted
@@ Line 361: / Line 481: @@
 			-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
 			-- to be enabled in languages with more complex scripts (e.g. Arabic).
-			local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
+			--
-			if tr ~= "-" then
+			-- FIXME: With nested inflections, should we also respect `enable_auto_translit` at the top level of the
-				any_part_translit = true
+			-- nested inflections structure?
+			local tr = part.tr or not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil
+			-- FIXME: Temporary errors added 2025-10-03. Remove after a month or so.
+			if part.translit then
+				error("Internal error: Use field `tr` not `translit` for specifying an inflection part translit")
 			end
-			formatted = require(links_module).full_link(
+			if part.transcription then
+				error("Internal error: Use field `ts` not `transcription` for specifying an inflection part transcription")
+			end
+			local postprocess_annotations
+			if part.inflections then
+				postprocess_annotations = function(infldata)
+					insert(infldata.annotations, format_inflections(data, part.inflections))
+				end
+			end
+			formatted = full_link(
 				{
 					term = not nolinkinfl and part.term or nil,
@@ Line 371: / Line 505: @@
 					lang = part.lang or data.lang,
 					sc = part.sc or parts.sc or nil,
+					gloss = part.gloss,
+					pos = part.pos,
+					lit = part.lit,
 					id = part.id,
 					genders = part.genders,
 					tr = tr,
-					ts = part.transcription,
+					ts = part.ts,
 					accel = partaccel or parts.accel,
+					postprocess_annotations = postprocess_annotations,
 				},
 				face
@@ Line 381: / Line 519: @@
 		end
-		parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j)
+		parts[j] = format_term_with_qualifiers_and_refs(part.lang or data.lang, part,
+			formatted, j)
 	end
 	local parts_output
-	if #parts > 0 then
+	if parts[1] then
 		parts_output = (parts.label and " " or "") .. concat(parts)
 	elseif parts.request then
@@ Line 396: / Line 535: @@
 	local parts_label = parts.label and ("<i>" .. parts.label .. "</i>") or ""
-	return parts_label .. parts_output, any_part_translit
+	return format_term_with_qualifiers_and_refs(data.lang, parts, parts_label .. parts_output, 1)
 end
--- Format the inflections following the headword.
+-- Format the inflections following the headword or nested after a given inflection.
+-- `inflections` is an array whose entries are each passed to format_inflection_parts(); the formatted strings are
+-- joined with ", ". Returns "" when `inflections` is nil or has no first element.
+-- NOTE: each entry of `inflections` is overwritten in place with its formatted string, so the table passed in is
+-- modified.
-local function format_inflections(data)
+format_inflections = function(data, inflections)
-	local any_part_translit = false
+	if inflections and inflections[1] then
-	if data.inflections and #data.inflections > 0 then
 		-- Format each inflection individually.
-		for key, infl in ipairs(data.inflections) do
+		for key, infl in ipairs(inflections) do
-			local this_any_part_translit
+			inflections[key] = format_inflection_parts(data, infl)
-			data.inflections[key], this_any_part_translit = format_inflection_parts(data, infl)
-			if this_any_part_translit then
-				any_part_translit = true
-			end
 		end
-		local concat_result = concat(data.inflections, ", ")
+		return concat(inflections, ", ")
-		return " (" .. concat_result .. ")"
 	else
 		return ""
 	end
 end
+-- Format `data.inflections` for display directly after the headword: the comma-separated output of
+-- format_inflections(), wrapped as " (...)". When there is nothing to show, the empty string is returned as is.
+local function format_top_level_inflections(data)
+	local formatted = format_inflections(data, data.inflections)
+	if formatted == "" then
+		return formatted
+	end
+	return " (" .. formatted .. ")"
+end
 --[==[
--- Returns the plural form of `pos`, a raw part of speech input, which could be singular or
+Returns the plural form of `pos`, a raw part of speech input, which could be singular or
--- plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
+plural. Irregular plural POS are taken into account (e.g. "kanji" pluralizes to
--- "kanji").]==]
+"kanji").
+]==]
 function export.pluralize_pos(pos)
+	-- Order of preference: the entry for `pos` in the data module's `irregular_plurals` table; `pos` itself,
+	-- unchanged, when it already ends in "s"; otherwise whatever pluralize() returns for it.
-	return m_data.irregular_plurals[pos] or
+	-- Make the plural form of the part of speech
+	return (m_data or get_data()).irregular_plurals[pos] or
 		pos:sub(-1) == "s" and pos or
-		-- Make the plural form of the part of speech
+		pluralize(pos)
-		require("Module:string utilities").pluralize(pos)
 end
 --[==[
--- Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
+Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
--- if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
+if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
--- If you have a POS in its singular form, call export.pluralize_pos() above to pluralize it
+If you have a POS in its singular form, call {export.pluralize_pos()} above to pluralize it
--- in a smart fashion that knows when to add "-s" and when to add "-es", and also takes
+in a smart fashion that knows when to add "-s" and when to add "-es", and also takes
--- into account any irregular plurals.]==]
+into account any irregular plurals.
---
--- If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
+If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
--- based on whether it ends in " forms"; otherwise, return nil.]==]
+based on whether it ends in " forms"; otherwise, return nil.
+]==]
 function export.pos_lemma_or_nonlemma(plpos, best_guess)
+	local m_headword_data = m_data or get_data()
+	local isLemma = m_headword_data.lemmas
 	-- Is it a lemma category?
 	if isLemma[plpos] then
@@ Line 450: / Line 600: @@
 	end
 	-- Is it a nonlemma category?
+	local isNonLemma = m_headword_data.nonlemmas
 	if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
 		return "non-lemma form"
@@ Line 463: / Line 614: @@
 end
+--[==[
+Canonicalize a part of speech as specified in 2= in {{tl|head}}. POS aliases are resolved first; failing that, a
+trailing 'f' is treated as shorthand for the non-lemma "... forms" POS (with the remainder itself resolved as an alias
+where possible). The result is then handed to {export.pluralize_pos()}, which leaves invariable plurals alone.
+]==]
+function export.canonicalize_pos(pos)
+	-- FIXME: Temporary errors for aliases that were retired as too ambiguous; these checks will go away.
+	if pos == "pre" then
+		-- 'pref' is deliberately not rejected, as it's an alias for "prefix".
+		error("POS 'pre' for 'preposition' no longer allowed as it's too ambiguous; use 'prep'")
+	elseif pos == "pro" or pos == "prof" then
+		error("POS 'pro' for 'pronoun' no longer allowed as it's too ambiguous; use 'pron'")
+	end
+	local aliases = (m_data or get_data()).pos_aliases
+	local canonical = aliases[pos]
+	if canonical then
+		pos = canonical
+	elseif pos:sub(-1) == "f" then
+		-- Strip the final 'f', expand what is left if it is an alias, and append " forms".
+		local stem = pos:sub(1, -2)
+		pos = (aliases[stem] or stem) .. " forms"
+	end
+	return export.pluralize_pos(pos)
+end
 -- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
@@ Line 475: / Line 649: @@
 	local typ = type(data[element])
 	if typ ~= "table" then
-		error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
+		error(("Internal error: In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
 	end
 	for k, v in pairs(data[element]) do
 		if k ~= "maxindex" then
 			if type(k) ~= "number" then
-				error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
+				error(("Internal error: Unrecognized non-numeric key '%s' in `data.%s`"):format(k, element))
 			end
 			if k > maxind then
@@ Line 487: / Line 661: @@
 			if v then
 				if type(v) ~= "string" then
-					error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
+					error(("Internal error: For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
 				end
 				if not allow_blank_string and v == "" then
-					error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
+					error(("Internal error: For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
 				end
 			end
@@ Line 519: / Line 693: @@
 		-- that.
 		if tbl == true then
-			if page.raw_defaultsort ~= sortkey then
-				insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
-			end
 			return
 		end
@@ Line 531: / Line 702: @@
 				different = true
 			end
-		end
-		if redundant then
-			insert(lang_cats, lang:getFullName() .. " terms with redundant sortkeys")
-		end
-		if different then
-			insert(lang_cats, lang:getFullName() .. " terms with non-redundant non-automated sortkeys")
 		end
 		return sortkey
 	end
 	function export.maintenance_cats(page, lang, lang_cats, page_cats)
-		for _, cat in ipairs(page.cats) do
+		extend(page_cats, page.cats)
-			insert(page_cats, cat)
-		end
 		lang = lang:getFull() -- since we are just generating categories
 		local canonical = lang:getCanonicalName()
@@ Line 550: / Line 713: @@
 		if tbl then
 			sortkey = handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
-			insert(lang_cats, canonical .. " entries with topic categories using raw markup")
 		end
 		tbl = page.wikitext_langname_cat[canonical]
 		if tbl then
 			handle_raw_sortkeys(tbl, sortkey, page, lang, lang_cats)
-			insert(lang_cats, canonical .. " entries with language name categories using raw markup")
-		end
-		if require(utilities_module).get_current_L2() ~= canonical then
-			insert(lang_cats, canonical .. " entries with incorrect language header")
 		end
 	end
@@ Line 570: / Line 728: @@
 ]==]
 function export.full_headword(data)
-	local remove_links = require(links_module).remove_links
-	local format_categories = require(utilities_module).format_categories
 	-- Prevent data from being destructively modified.
-	local data = require(table_module).shallowcopy(data)
+	local data = shallow_copy(data)
 	------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------
 	if data.getCanonicalName then
-		error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
+		error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
 	end
 	if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
-		error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
+		error("Internal error: In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
 	end
 	if data.id and type(data.id) ~= "string" then
-		error("The id in the data table should be a string.")
+		error("Internal error: The id in the data table should be a string.")
 	end
@@ Line 593: / Line 748: @@
 	local langcode = data.lang:getCode()
-	local full_langcode = langcode
+	local full_langcode = data.lang:getFullCode()
 	local langname = data.lang:getCanonicalName()
-	local full_langname = langname
+	local full_langname = data.lang:getFullName()
-	local raw_pagename, page = data.pagename
+	local raw_pagename = data.pagename
-	if raw_pagename and raw_pagename ~= m_data.pagename then -- for testing, doc pages, etc.
+	local page
-		page = require(headword_page_module).process_page(raw_pagename)
+	local m_headword_data = m_data or get_data()
+	if raw_pagename and raw_pagename ~= m_headword_data.pagename then -- for testing, doc pages, etc.
+		-- data.pagename is often set on documentation and test pages through the pagename= parameter of various
+		-- templates, to emulate running on that page. Having a large number of such test templates on a single
+		-- page often leads to timeouts, because we fetch and parse the contents of each page in turn. However,
+		-- we don't really need to do that and can function fine without fetching and parsing the contents of a
+		-- given page, so turn off content fetching/parsing (and also setting the DEFAULTSORT key through a parser
+		-- function, which is *slooooow*) in certain namespaces where test and documentation templates are likely to
+		-- be found and where actual content does not live (User, Template, Module).
+		local actual_namespace = m_headword_data.page.namespace
+		local no_fetch_content = actual_namespace == "User" or actual_namespace == "Template" or
+			actual_namespace == "Module"
+		page = process_page(raw_pagename, no_fetch_content)
 	else
-		page = m_data.page
+		page = m_headword_data.page
 	end
-	-- Check the namespace against the language type.
+	local namespace = page.namespace
-	if page.namespace == "" then
-		if data.lang:hasType("reconstructed") then
-			error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
-		elseif data.lang:hasType("appendix-constructed") then
-			error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
-		end
-	end
 	------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
@@ Line 618: / Line 778: @@
 		-- new-style
 		if data.translits or data.transcriptions then
-			error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
+			error("Internal error: In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
 		end
 	else
 		-- convert old-style `heads`, `translits` and `transcriptions` to new-style
-		local maxind = math.max(
+		local maxind = max(
-			init_and_find_maximum_index(data, "heads", true),
+			init_and_find_maximum_index(data, "heads"),
 			init_and_find_maximum_index(data, "translits", true),
 			init_and_find_maximum_index(data, "transcriptions", true)
@@ Line 643: / Line 803: @@
 	------------ 4. Initialize and validate `data.categories` and `data.whole_page_categories`, and determine `pos_category` if not given, and add basic categories. ------------
-	init_and_find_maximum_index(data, "categories", true)
+	-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
-	init_and_find_maximum_index(data, "whole_page_categories", true)
+	if data.altform then
+		data.noposcat = true
+	end
+	init_and_find_maximum_index(data, "categories")
+	init_and_find_maximum_index(data, "whole_page_categories")
 	local pos_category_already_present = false
-	if #data.categories > 0 then
+	if data.categories[1] then
 		local escaped_langname = pattern_escape(full_langname)
 		local matches_lang_pattern = "^" .. escaped_langname .. " "
@@ Line 660: / Line 825: @@
 	if not data.pos_category then
-		error("`data.pos_category` not specified and could not be inferred from the categories given in "
+		error("Internal error: `data.pos_category` not specified and could not be inferred from the categories given in "
 			.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
 			.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
@@ Line 680: / Line 845: @@
 	local postype = export.pos_lemma_or_nonlemma(data.pos_category)
 	if not data.noposcat then
+		if postype == "lemma" then
+			postype = data.lang:getMainCategoryName()
+		end
 		insert(data.categories, 1, full_langname .. " " .. postype .. "s")
+	end
+	insert(data.categories, 1, "Contionary")
+	-- EXPERIMENTAL: see [[Wiktionary:Beer parlour/2024/June#Decluttering the altform mess]]
+	if data.altform then
+		insert(data.categories, 1, full_langname .. " alternative forms")
 	end
 	------------ 5. Create a default headword, and add links to multiword page names. ------------
+	-- Determine if this is an "anti-asterisk" term, i.e. an attested term in a language that must normally be
+	-- reconstructed.
+	local is_anti_asterisk = data.heads[1].term and data.heads[1].term:find("^!!")
+	local lang_reconstructed = data.lang:hasType("reconstructed")
+	if is_anti_asterisk then
+		if not lang_reconstructed then
+			error("Anti-asterisk feature (head= beginning with !!) can only be used with reconstructed languages")
+		end
+		lang_reconstructed = false
+	end
 	-- Determine if term is reconstructed
-	local is_reconstructed = page.namespace == "Reconstruction" or data.lang:hasType("reconstructed")
+	local is_reconstructed = namespace == "Reconstruction" or lang_reconstructed
 	-- Create a default headword based on the pagename, which is determined in
@@ Line 693: / Line 881: @@
 	-- Add links to multi-word page names when appropriate
-	if not data.nolinkhead and not m_data.no_multiword_links[langcode] and not m_data.no_multiword_links[full_langcode]
+	if not (is_reconstructed or data.nolinkhead) then
-		and	not is_reconstructed and export.head_is_multiword(default_head) then
+		local no_links = m_headword_data.no_multiword_links
-		default_head = export.add_multiword_links(default_head, true)
+		if not (no_links[langcode] or no_links[full_langcode]) and export.head_is_multiword(default_head) then
+			default_head = export.add_multiword_links(default_head, true)
+		end
 	end
@@ Line 702: / Line 892: @@
 	end
-	------------ 6. Fill in missing values in `data.heads`. ------------
+	------------ 6. Check the namespace against the language type. ------------
+	if namespace == "" then
+		if lang_reconstructed then
+			error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
+		elseif data.lang:hasType("appendix-constructed") then
+			error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
+		end
+	elseif namespace == "Citations" or namespace == "Thesaurus" then
+		error("Headword templates should not be used in the " .. namespace .. ": namespace.")
+	end
+	------------ 7. Fill in missing values in `data.heads`. ------------
 	-- True if any script among the headword scripts has spaces in it.
@@ Line 711: / Line 913: @@
 	for _, head in ipairs(data.heads) do
-		------ 6a. If missing head, replace with default head.
+		------ 7a. If missing head, replace with default head.
 		if not head.term then
 			head.term = default_head
 		elseif head.term == default_head then
 			has_redundant_head_param = true
+		elseif is_anti_asterisk and head.term == "!!" then
+			-- If explicit head=!! is given, it's an anti-asterisk term and we fill in the default head.
+			head.term = "!!" .. default_head
+		elseif head.term:find("^[!?]$") then
+			-- If explicit head= just consists of ! or ?, add it to the end of the default head.
+			head.term = default_head .. head.term
 		end
+		head.term_no_initial_bang_bang = is_anti_asterisk and head.term:sub(3) or head.term
 		if is_reconstructed then
 			local head_term = head.term
 			if head_term:find("%[%[") then
-				head_term = require(links_module).remove_links(head_term)
+				head_term = remove_links(head_term)
 			end
 			if head_term:sub(1, 1) ~= "*" then
@@ Line 728: / Line 937: @@
 		end
-		------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
+		------ 7b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
 		------     otherwise fall back to the overall script if given. If neither given, autodetect the script.
-		local auto_sc = require("Module:scripts").getByCode(data.lang:getScriptCodes(head.term))
+		local auto_sc = data.lang:findBestScript(head.term)
 		if (
 			auto_sc:getCode() == "None" and
-			require(scripts_module).findBestScriptWithoutLang(head.term):getCode() ~= "None"
+			find_best_script_without_lang(head.term):getCode() ~= "None"
 		) then
 			insert(data.categories, full_langname .. " terms in nonstandard scripts")
@@ Line 743: / Line 952: @@
 			if not head.sc then -- Overall script code given.
 				head.sc = data.sc
-			end
-			-- Track uses of sc parameter.
-			if head.sc:getCode() == auto_sc:getCode() then
-				insert(data.categories, full_langname .. " terms with redundant script codes")
-			else
-				insert(data.categories, full_langname .. " terms with non-redundant manual script codes")
 			end
 		end
@@ Line 762: / Line 965: @@
 		any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
-		------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
+		------ 7c. Create automatic transliterations for any non-Latin headwords without manual translit given
 		------     (provided automatic translit is available, e.g. not in Persian or Hebrew).
 		-- Make transliterations
 		head.tr_manual = nil
 		-- Try to generate a transliteration if necessary
 		if head.tr == "-" then
 			head.tr = nil
-		elseif not notranslit[langcode] and not notranslit[full_langcode] and head.sc:isTransliterated() then
+		else
-			head.tr_manual = not not head.tr
+			local notranslit = m_headword_data.notranslit
+			if not (notranslit[langcode] or notranslit[full_langcode]) and head.sc:isTransliterated() then
+				head.tr_manual = not not head.tr
-			local text = head.term
+				local text = head.term_no_initial_bang_bang
-			if not data.lang:link_tr(head.sc) then
+				if not data.lang:link_tr(head.sc) then
-				text = remove_links(text)
+					text = remove_links(text)
-			end
+				end
-			local automated_tr, tr_categories
+				local automated_tr, tr_categories
-			automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)
+				automated_tr, head.tr_fail, tr_categories = data.lang:transliterate(text, head.sc)
-			if automated_tr or head.tr_fail then
+				if automated_tr or head.tr_fail then
-				local manual_tr = head.tr
+					local manual_tr = head.tr
-				if manual_tr then
+					if not manual_tr then
-					if (remove_links(manual_tr) == remove_links(automated_tr)) and (not head.tr_fail) then
+						head.tr = automated_tr
-						insert(data.categories, full_langname .. " terms with redundant transliterations")
+						extend(data.categories, tr_categories)
-					elseif not head.tr_fail then
-						insert(data.categories, full_langname .. " terms with non-redundant manual transliterations")
 					end
 				end
-				if not manual_tr then
+				-- There is still no transliteration?
-					head.tr = automated_tr
+				-- Add the entry to a cleanup category.
-					for _, category in ipairs(tr_categories) do
+				if not head.tr then
-						insert(data.categories, category)
+					head.tr = "<small>transliteration needed</small>"
-					end
+					-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
+					-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
+					insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
+				else
+					-- Otherwise, trim it.
+					head.tr = trim(head.tr)
 				end
-			end
-			-- There is still no transliteration?
-			-- Add the entry to a cleanup category.
-			if not head.tr then
-				head.tr = "<small>transliteration needed</small>"
-				-- FIXME: No current support for 'Request for transliteration of Classical Persian terms' or similar.
-				-- Consider adding this support in [[Module:category tree/poscatboiler/data/entry maintenance]].
-				insert(data.categories, "Requests for transliteration of " .. full_langname .. " terms")
-			else
-				-- Otherwise, trim it.
-				head.tr = mw.text.trim(head.tr)
 			end
 		end
@@ Line 816: / Line 1,013: @@
 		-- Link to the transliteration entry for languages that require this.
 		if head.tr and data.lang:link_tr(head.sc) then
-			head.tr = require(links_module).full_link {
+			head.tr = full_link{
 				term = head.tr,
 				lang = data.lang,
-				sc = require(scripts_module).getByCode("Latn"),
+				sc = get_script("Latn"),
 				tr = "-"
 			}
@@ Line 825: / Line 1,022: @@
 	end
-	------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
+	------------ 8. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
 	-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
@@ Line 837: / Line 1,034: @@
 	local dt_script = data.heads[1].sc
 	local dt_script_code = dt_script:getCode()
-	local page_non_ascii = page.namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
+	local page_non_ascii = namespace == "" and not page.pagename:find("^[%z\1-\127]+$")
 	local unsupported_pagename, unsupported = page.full_raw_pagename:gsub("^Unsupported titles/", "")
 	if unsupported == 1 and page.unsupported_titles[unsupported_pagename] then
 		display_title = 'Unsupported titles/<span class="' .. dt_script_code .. '">' .. page.unsupported_titles[unsupported_pagename] .. '</span>'
-	elseif page_non_ascii and toBeTagged[dt_script_code]
+	elseif page_non_ascii and m_headword_data.toBeTagged[dt_script_code]
 		or (dt_script_code == "Jpan" and (text_in_script(page.pagename, "Hira") or text_in_script(page.pagename, "Kana")))
 		or (dt_script_code == "Kore" and text_in_script(page.pagename, "Hang")) then
@@ Line 848: / Line 1,045: @@
 	elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
 		display_title = '<span class="Hani">' .. page.full_raw_pagename .. '</span>'
-	elseif page.namespace == "Reconstruction" then
+	elseif namespace == "Reconstruction" then
 		local matched
-		display_title, matched = rsubn(
+		display_title, matched = ugsub(
 			page.full_raw_pagename,
 			"^(Reconstruction:[^/]+/)(.+)$",
 			function(before, term)
-				return before ..
+				return before .. tag_text(term, data.lang, dt_script)
-					require(script_utilities_module).tag_text(
-						term,
-						data.lang,
-						dt_script
-					)
 			end
 		)
@@ Line 865: / Line 1,057: @@
 			display_title = nil
 		end
+	end
+	-- FIXME: Generalize this.
+	-- If the current language uses ur-Arab (for Urdu, etc.), ku-Arab (Central Kurdish) or pa-Arab
+	-- (Shahmukhi, for Punjabi) and there's more than one language on the page, don't set the display title.
+	-- These three scripts display in Nastaliq, and we don't want terms that also exist in other languages
+	-- that don't display in Nastaliq (e.g. Arabic or Persian) to display in Nastaliq. Because the word
+	-- "Urdu" occurs near the end of the alphabet, Urdu fonts tend to override the fonts of other languages.
+	-- FIXME: This is checking for more than one language on the page but instead needs to check if there are any
+	-- languages using scripts other than the ones just mentioned.
+	if (dt_script_code == "ur-Arab" or dt_script_code == "ku-Arab" or dt_script_code == "pa-Arab") and page.L2_list.n > 1 then
+		display_title = nil
 	end
@@ Line 874: / Line 1,077: @@
 	end
-	------------ 8. Insert additional categories. ------------
+	------------ 9. Insert additional categories. ------------
-	if has_redundant_head_param then
+	-- If the pagename is multiword, maybe insert into "LANG multiword terms"; otherwise maybe insert into "Long LANG words".
-		if not data.no_redundant_head_cat then
+	if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" then
-			insert(data.categories, full_langname .. " terms with redundant head parameter")
+		local no_multiword_cat = m_headword_data.no_multiword_cat
-		end
+		if not (no_multiword_cat[langcode] or no_multiword_cat[full_langcode]) then
-	end
+			-- Check for spaces or hyphens, but exclude prefixes and suffixes.
+			-- Use the pagename, not the head= value, because the latter may have extra
+			-- junk in it, e.g. superscripted text that throws off the algorithm.
+			local no_hyphen = m_headword_data.hyphen_not_multiword_sep
+			-- Exclude hyphens if the data module states that they should be excluded for this language.
+			local checkpattern = (no_hyphen[langcode] or no_hyphen[full_langcode]) and ".[%s፡]." or ".[%s%-፡]."
+			local is_multiword = umatch(page.pagename, checkpattern)
-	-- If the first head is multiword (after removing links), maybe insert into "LANG multiword terms".
+			if is_multiword and not non_categorizable(page.full_raw_pagename) then
-	if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" and
+				insert(data.categories, full_langname .. " multiword terms")
-		not m_data.no_multiword_cat[langcode] and not m_data.no_multiword_cat[full_langcode] then
+			elseif not is_multiword then
-		-- Check for spaces or hyphens, but exclude prefixes and suffixes.
+				local long_word_threshold = m_headword_data.long_word_thresholds[langcode]
-		-- Use the pagename, not the head= value, because the latter may have extra
+				if long_word_threshold and ulen(page.pagename) >= long_word_threshold then
-		-- junk in it, e.g. superscripted text that throws off the algorithm.
+					insert(data.categories, "Long " .. full_langname .. " words")
-		local checkpattern = ".[%s%-፡]."
+				end
-		if m_data.hyphen_not_multiword_sep[langcode] or m_data.hyphen_not_multiword_sep[full_langcode] then
+			end
-			-- Exclude hyphens if the data module states that they should for this language
-			checkpattern = ".[%s፡]."
-		end
-		if rmatch(page.pagename, checkpattern) and not non_categorizable(page.full_raw_pagename) then
-			insert(data.categories, full_langname .. " multiword terms")
 		end
 	end
@@ Line 906: / Line 1,110: @@
 	-- Reconstructed terms often use weird combinations of scripts and realistically aren't spelled so much as notated.
-	if page.namespace ~= "Reconstruction" then
+	if namespace ~= "Reconstruction" then
 		-- Map from languages to a string containing the characters to ignore when considering whether a term has
 		-- multiple written scripts in it. Typically these are Greek or Cyrillic letters used for their phonetic
@@ Line 956: / Line 1,160: @@
 		local ch_to_ignore = characters_to_ignore[full_langcode]
 		if ch_to_ignore then
-			canon_pagename = rsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
+			canon_pagename = ugsub(canon_pagename, "[" .. ch_to_ignore .. "]", "")
 		end
-		local script_data = mw.loadData(scripts_data_module)
 		while true do
 			if canon_pagename == "" or num_seen_scripts >= 2 or num_loops >= 10 then
@@ Line 965: / Line 1,168: @@
 			-- Make sure we don't get into a loop checking the same script over and over again; happens with e.g. [[ᠪᡳ]]
 			num_loops = num_loops + 1
-			local pagename_script = require(scripts_module).findBestScriptWithoutLang(canon_pagename, "None only as last resort")
+			local pagename_script = find_best_script_without_lang(canon_pagename, "None only as last resort")
 			local script_chars = pagename_script.characters
 			if not script_chars then
@@ Line 973: / Line 1,176: @@
 			local script_code = pagename_script:getCode()
 			local replaced
-			canon_pagename, replaced = rsubn(canon_pagename, "[" .. script_chars .. "]", "")
+			canon_pagename, replaced = ugsub(canon_pagename, "[" .. script_chars .. "]", "")
-			if replaced and script_code ~= "Zmth" and script_data[script_code] and
+			if (
-				script_data[script_code].character_category ~= false then
+				replaced and
+				script_code ~= "Zmth" and
+				(script_data or get_script_data())[script_code] and
+				script_data[script_code].character_category ~= false
+			) then
 				script_code = script_code:gsub("^.-%-", "")
 				if not seen_scripts[script_code] then
@@ Line 988: / Line 1,195: @@
 		end
 	end
 	-- Categorise for unusual characters. Takes into account combining characters, so that we can categorise for characters with diacritics that aren't encoded as atomic characters (e.g. U̠). These can be in two formats: single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character + diacritic(s) + character). Each can have any number of diacritics.
 	local standard = data.lang:getStandardCharacters()
@@ Line 1,032: / Line 1,239: @@
 							return ""
 						end
-						local sc_standard = rsub(sc_standard, page.comb_chars.combined_double, explode)
+						local sc_standard = ugsub(sc_standard, page.comb_chars.combined_double, explode)
-						sc_standard = rsub(sc_standard,page.comb_chars.combined_single, explode)
+						sc_standard = ugsub(sc_standard,page.comb_chars.combined_single, explode)
 							:gsub(".[\128-\191]*", explode)
 						local num_cat_inserted
@@ Line 1,043: / Line 1,250: @@
 										num_cat_inserted = true
 									end
-								elseif rfind(char, page.emoji_pattern) then
+								elseif ufind(char, page.emoji_pattern) then
 									insert(data.categories, full_langname .. " terms spelled with emoji")
 								else
@@ Line 1,057: / Line 1,264: @@
 					-- If a diacritic doesn't appear in any of the standard characters, also categorise for it generally.
 					sc_standard = toNFD(sc_standard)
-					for diacritic in rgmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
+					for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_single) do
-						if not rmatch(sc_standard, diacritic) then
+						if not umatch(sc_standard, diacritic) then
 							insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic)
 						end
 					end
-					for diacritic in rgmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
+					for diacritic in ugmatch(page.decompose_pagename, page.comb_chars.diacritics_double) do
-						if not rmatch(sc_standard, diacritic) then
+						if not umatch(sc_standard, diacritic) then
 							insert(data.categories, full_langname .. " terms spelled with ◌" .. diacritic .. "◌")
 						end
@@ Line 1,071: / Line 1,278: @@
 		-- Ancient Greek, Hindi and Lao handled the old way for now, as their standard chars still need to be converted to the new format (because there are a lot of them).
 		elseif ulen(page.pagename) ~= 1 then
-			for character in rgmatch(page.pagename, "([^" .. standard .. "])") do
+			for character in ugmatch(page.pagename, "([^" .. standard .. "])") do
 				local upper = char_category(character)
-				if not rmatch(upper, "[" .. standard .. "]") then
+				if not umatch(upper, "[" .. standard .. "]") then
 					character = upper
 				end
@@ Line 1,080: / Line 1,287: @@
 		end
 	end
 	if data.heads[1].sc:isSystem("alphabet") then
 		local pagename, i = page.pagename:ulower(), 2
-		while rmatch(pagename, "(%a)" .. ("%1"):rep(i)) do
+		while umatch(pagename, "(%a)" .. ("%1"):rep(i)) do
 			i = i + 1
 			insert(data.categories, full_langname .. " terms with " .. i .. " consecutive instances of the same letter")
 		end
 	end
 	-- Categorise for palindromes
-	if not data.nopalindromecat and page.namespace ~= "Reconstruction" and ulen(page.pagename) > 2
+	if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(page.pagename) > 2
 		-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
 		-- multiple scripts?
-		and require(palindromes_module).is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
+		and is_palindrome(page.pagename, data.lang, data.heads[1].sc) then
 		insert(data.categories, full_langname .. " palindromes")
 	end
-	if page.namespace == "" and not data.lang:hasType("reconstructed") then
+	if data.affix then
-		local m_links = require(links_module)
+		for _, aff in ipairs(data.affix) do
+			if mw.ustring.match(aff, "^%-[^-]*%-$") then
+				table.insert(data.categories, data.lang:getCanonicalName() .. " words interfixed with " .. aff)
+			elseif mw.ustring.match(aff, "%-%s%-") then
+				table.insert(data.categories, data.lang:getCanonicalName() .. " words circumfixed with " .. aff)
+			elseif mw.ustring.match(aff, "%-$") then
+				table.insert(data.categories, data.lang:getCanonicalName() .. " words prefixed with " .. aff)
+			elseif mw.ustring.match(aff, "^%-") then
+				table.insert(data.categories, data.lang:getCanonicalName() .. " words suffixed with " .. aff)
+			end
+		end
 	end
 	-- Add to various maintenance categories.
 	export.maintenance_cats(page, data.lang, data.categories, data.whole_page_categories)
-	------------ 9. Format and return headwords, genders, inflections and categories. ------------
+	------------ 10. Format and return headwords, genders, inflections and categories. ------------
 	-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
@@ Line 1,110: / Line 1,327: @@
 	local text = '<span class="headword-line">' ..
 		format_headword(data) ..
-		format_genders(data) ..
+		format_headword_genders(data) ..
-		format_inflections(data) .. '</span>'
+		format_top_level_inflections(data) .. '</span>'
 	-- Language-specific categories.