Module:string utilities: Difference between revisions

Line 1:

local export = {}

local function_module = "Module:fun"

local load_module = "Module:load"

local memoize_module = "Module:memoize"

local mw = mw

local string = string

Line 12:

Line 18:

local gsub = string.gsub

local len = string.len

~~local load_data = mw.loadData~~

local lower = string.lower

local match = string.match

local next = next

local require = require

local reverse = string.reverse

local select = select

Line 34:

Line 40:

local usub = ustring.sub

local uupper = ustring.upper

local memoize = require(memoize_module)

-- Defined below.

local charset_escape

Line 39:

Line 48:

local explode_utf8

local format_fun

local get_charset

local get_indefinite_article

local gsplit

local pattern_escape

local pattern_simplifier

Line 47:

Line 58:

local ulen

local ~~module_name~~ = ~~"string_utilities"~~

--[==[

Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]

-- Lazy loader for `is_callable`. On the first call it overwrites the local
-- `is_callable` with the function of the same name exported by the module
-- named in `function_module`, then forwards its arguments to that function.
-- Later calls go straight to the loaded function.
local function is_callable(...)

is_callable = require(function_module).is_callable

return is_callable(...)

end

local ~~export~~ = {}

-- Lazy loader for `load_data`. On the first call it overwrites the local
-- `load_data` with the function of the same name exported by the module named
-- in `load_module`, then forwards its arguments to that function. Later calls
-- go straight to the loaded function.
local function load_data(...)

load_data = require(load_module).load_data

return load_data(...)

end

-- Works out which library the split/escape iterators should use for `str`.
-- Returns four values: the pattern (or find function) to use, the length of
-- `str` in the units of the chosen library, the library table itself, and
-- whether the pattern is a callable.
--   * `str_lib` or `plain` set: string library, byte length, pattern untouched.
--   * Non-callable pattern that `pattern_simplifier` can convert: string
--     library, byte length, simplified pattern.
--   * Otherwise (callable, or not convertible): ustring library and `ulen`.
local function prepare_iter(str, pattern, str_lib, plain)
	local callable = is_callable(pattern)
	if str_lib or plain then
		return pattern, #str, string, callable
	elseif not callable then
		local simple = pattern_simplifier(pattern)
		if simple then
			return simple, #str, string, false
		end
	end
	-- Fallback for callables and for patterns that could not be simplified.
	return pattern, ulen(str), ustring, callable
end

--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]

Line 62:

Line 94:

explode_utf8 = export.explode_utf8

--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|~~patterns~~]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{~~code|~~lua|"^$()%.[]*+-?"}} becomes {{~~code|~~lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]

do

-- Escapes for the characters that are magic inside a character set, plus the
-- null character (written as %z in patterns).
local charset_chars = {
	["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^"
}
-- Lets `charset_chars` act as a metatable: tables that use it fall back to
-- these entries for keys they do not define themselves.
charset_chars.__index = charset_chars

-- Escapes for the remaining pattern magic characters; lookups for the
-- character-set ones fall through to `charset_chars` via the metatable.
local chars = setmetatable({
	["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+",
	["."] = "%.", ["?"] = "%?", ["["] = "%["
}, charset_chars)

--[==[Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>, and converts the null character to <code>%z</code>. For example, {{lua|"^$()%.[]*+-?\0"}} becomes {{lua|"%^%$%(%)%%%.%[%]%*%+%-%?%z"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]

function export.pattern_escape(str)
	-- Every magic character (and the null character) is replaced by its entry
	-- in `chars`. Only the resulting string is returned, not gsub's count.
	local escaped = gsub(str, "[%z$%%()*+%-.?[%]^]", chars)
	return escaped
end

pattern_escape = export.pattern_escape

--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>, and converts the null character to <code>%z</code>.]==]

function export.charset_escape(str)
	-- Only the characters that are magic inside a character set (and the null
	-- character) are replaced, using `charset_chars`. Only the resulting
	-- string is returned, not gsub's count.
	local escaped = gsub(str, "[%z%%%-%]^]", charset_chars)
	return escaped
end

charset_escape = export.charset_escape

--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]

function export.replacement_escape(str)
	-- Doubles every "%" so it is taken literally in a gsub replacement string.
	-- Only the resulting string is returned, not gsub's count.
	local escaped = gsub(str, "%%", "%%%%")
	return escaped
end

replacement_escape = export.replacement_escape

-- Converts a single character into a pattern fragment that matches it in
-- either case. A character with no case distinction is returned as itself,
-- escaped via `chars` if it is magic. Otherwise the result is a two-character
-- set "[Xx]", with each member escaped via `charset_chars` where needed.
local function case_insensitive_char(ch)
	local upper_ch = uupper(ch)
	if upper_ch == ch then
		-- `ch` is already its own uppercase form, so derive the lowercase one.
		ch = ulower(ch)
		if ch == upper_ch then
			-- Upper and lower forms are identical: caseless character.
			return chars[ch] or ch
		end
	end
	return "[" .. (charset_chars[upper_ch] or upper_ch) .. (charset_chars[ch] or ch) .. "]"
end

--[=~~=[Escapes only the magic characters used in [~~[~~mw:Extension:Scribunto/Lua reference manual#Patterns|pattern~~]~~] character sets:~~ <~~code~~>~~%-]^~~<~~/code>.~~]==]

-- One step of the loop in export.case_insensitive_pattern. `loc1` and `loc2`
-- are the start and end indices of the current match (nil if none). Appends
-- the unmatched text before the match to `text` with magic characters
-- escaped, then the match itself with every character made case-insensitive.
-- Returns the new element count and the next start position, or nothing
-- once the string is exhausted.
local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2)
	if not (loc1 and start <= str_len) then
		-- Add final chunk and return.
		n = n + 1
		text[n] = _gsub(_sub(str, start), ".", chars)
		return
	elseif loc2 < loc1 then
		-- Empty match: consume one character so the loop always advances.
		-- With the string library, step over the whole UTF-8 sequence by
		-- reading its length from the leading byte.
		if _sub == sub then
			local b = byte(str, loc1)
			if b and b >= 128 then
				loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
			end
		end
		n = n + 1
		text[n] = _gsub(_sub(str, start, loc1), ".", chars)
		start = loc1 + 1
		if start > str_len then
			return
		end
	else
		-- Add chunk up to the current match.
		n = n + 1
		text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars)
		-- Add current match.
		n = n + 1
		text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char)
		start = loc2 + 1
	end
	return n, start
end

--[==[Escapes ~~only <code>%</code>, which is~~ the ~~only~~ magic ~~character~~ used in ~~replacement~~ [[mw:Extension:Scribunto/Lua reference manual#Patterns|~~patterns~~]] ~~with string.gsub~~ and mw.~~ustring.gsub~~.]==]

--[==[
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns any pattern matching facilities off in the optional pattern supplied.]==]
function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
	-- No pattern: make every character of `str` case-insensitive.
	if pattern_or_func == nil then
		return (gsub(str, str_lib and "[^\128-\255]" or ".[\128-\191]*", case_insensitive_char))
	end
	local text, n, start, str_len, _string, callable = {}, 0, 1
	pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
	local _find, _gsub, _sub = _string.find, _string.gsub, _string.sub
	if callable then
		repeat
			n, start = iterate(str, str_len, text, n, start, _gsub, _sub, pattern_or_func(str, start))
		until not start
	-- Special case if the pattern is anchored to the start: "^" always
	-- anchors to the start position, not the start of the string, so get
	-- around this by only attempting one match with the pattern, then match
	-- the end of the string. This only applies when pattern matching is
	-- active: with `plain` set, a leading "^" is an ordinary character and
	-- the pattern must go through the general loop below.
	elseif not plain and byte(pattern_or_func) == 0x5E then -- ^
		n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start))
		if start ~= nil then
			iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, "$", start))
		end
	else
		repeat
			n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
		until not start
	end
	return concat(text)
end

~~replacement_escape = export.replacement_escape~~

do

local character_classes

-- Builds the `character_classes` lookup on first use. The keys are the byte
-- values of the letters listed on each line, each mapped to true. The same
-- multiple assignment sets `get_character_classes` itself to nil, so callers
-- use the `character_classes or get_character_classes()` form.
local function get_character_classes()

character_classes, get_character_classes = {

[0x41] = true, [0x61] = true, -- Aa

[0x43] = true, [0x63] = true, -- Cc

[0x44] = true, [0x64] = true, -- Dd

[0x4C] = true, [0x6C] = true, -- Ll

[0x50] = true, [0x70] = true, -- Pp

[0x53] = true, [0x73] = true, -- Ss

[0x55] = true, [0x75] = true, -- Uu

[0x57] = true, [0x77] = true, -- Ww

[0x58] = true, [0x78] = true, -- Xx

[0x5A] = true, -- Z only; lowercase z (0x7A) is dealt with separately.

}, nil

return character_classes

end

local function check_sets_equal(set1, set2)

local k2

Line 129:

Line 265:

-- Scans a character set in `pattern` starting at `pos` and checks that it is
-- made up of single-byte characters only. Returns the position just after
-- the closing "]" on success. Returns false if the set contains a byte in
-- the range \192-\255, or a "%" escape followed by nothing, by a non-ASCII
-- byte, or by a character-class letter (acdlpsuwxACDLPSUWXZ, but not z).
local function parse_1_byte_charset(pattern, pos)
	local ch
	while true do
		-- Jump to the next "%", "]" or multibyte lead byte.
		pos, ch = match(pattern, "()([%%%]\192-\255])", pos)
		if ch == "%" then
			local nxt = byte(pattern, pos + 1)
			if not nxt or nxt >= 128 or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWXZ, but not z
				return false
			end
			-- Skip the escape and the character it escapes.
			pos = pos + 2
		elseif ch == "]" then
			pos = pos + 1
			return pos
		else
			-- Either a multibyte lead byte, or no match at all (unterminated set).
			return false
		end
	end
end

--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]

pattern_simplifier ~~= require("Module:fun").memoize(function~~(pattern)

function pattern_simplifier(pattern)

if type(pattern) == "number" then

return tostring(pattern)

end

local pos, ~~captures~~, start, n, output = 1, 0, 1, 0

local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0

while true do

~~local ch, nxt_pos~~

-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails.

pos, ch, nxt_pos = match(pattern, "()([%%(.[\~~194~~-\~~244~~][\128-\191]*)()", pos)

pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos)

if not ch then

break

end

local nxt = ~~sub~~(pattern~~, nxt_pos~~, nxt_pos)

local nxt = byte(pattern, nxt_pos)

if ch == "%" then

if nxt == "b~~" then~~

if nxt == 0x62 then -- b

~~if not match~~(pattern, "^(~~)[^\~~128~~-\255][^\~~128~~-\255]", pos + 2~~) then

local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3)

if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then

return false

end

pos = pos + 4

elseif nxt == "f~~" then~~

elseif nxt == 0x66 then -- f

~~pos~~ = ~~pos~~ + 2

nxt_pos = nxt_pos + 2

~~if not match~~(pattern, ~~"^(~~)%~~[[^^]"~~, ~~pos~~) then

local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos)

-- Only possible to convert a positive %f charset which is

-- all ASCII, so use parse_1_byte_charset.

if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^

return false

elseif nxt3 == 0x5D then -- Initial ] is non-magic.

nxt_pos = nxt_pos + 1

end

~~-- Only possible to convert a %f charset which is all~~

pos = parse_1_byte_charset(pattern, nxt_pos)

~~-- ASCII, so use parse_1_byte_charset.~~

pos = parse_1_byte_charset(pattern, ~~pos~~)

if not pos then

return false

end

elseif nxt == "Z~~" then~~

elseif nxt == 0x5A then -- Z

~~pos = pos + 2~~

nxt = byte(pattern, nxt_pos + 1)

nxt = ~~sub~~(pattern, ~~pos, pos~~)

if nxt == 0x2A or nxt == 0x2D then -- *-

if nxt == ~~"*"~~ or nxt == ~~"+" or nxt == "~~-~~" then~~

pos = pos + 3

pos = pos + 1

else

output = output or {}

if output == nil then

output = {}

end

local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]"

n = n + 1

if nxt == ~~"?"~~ then

if nxt == 0x2B then -- +

output[n] = ~~sub(pattern, start, pos - 3)~~ .. "~~[\1~~-~~\127\194~~-~~\244~~]?[\128-\191]*"

output[n] = ins .. "%Z*"

pos = pos + 1

pos = pos + 3

elseif nxt == 0x3F then -- ?

output[n] = ins .. "?[\128-\191]*"

pos = pos + 3

else

output[n] = ~~sub(pattern, start, pos - 3)~~ .. "~~[\1-\127\194-\244]~~[\128-\191]*"

output[n] = ins .. "[\128-\191]*"

pos = pos + 2

end

start = pos

end

elseif ~~find~~("acdlpsuwxACDLPSUWX", ~~nxt, 1, true) then~~

elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz

return false

-- Skip the next character if it's ASCII. Otherwise, we will

-- still need to do length checks.

else

pos = pos + ~~(byte~~(nxt) < 128 and 2 or 1)

pos = pos + (nxt < 128 and 2 or 1)

end

elseif ch == "(" then

if nxt == ~~")"~~ or ~~captures~~ == 32 then

if nxt == 0x29 or capture_groups == 32 then -- )

return false

end

~~captures~~ = ~~captures~~ + 1

capture_groups = capture_groups + 1

pos = pos + 1

elseif ch == "." then

if nxt == ~~"*"~~ or nxt == ~~"+" or nxt == "~~-~~" then~~

if nxt == 0x2A or nxt == 0x2D then -- *-

pos = pos + 2

else

output = output or {}

if output == nil then

output = {}

end

local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]"

n = n + 1

if nxt == "?" then

if nxt == 0x2B then -- +

output[n] = ~~sub(pattern, start, pos - 1)~~ .. "~~[^\128-\191]~~?[\128-\191]*"

output[n] = ins .. ".*"

pos = pos + 2

elseif nxt == 0x3F then -- ?

output[n] = ins .. "?[\128-\191]*"

pos = pos + 2

else

output[n] = ~~sub(pattern, start, pos - 1)~~ .. "~~[^\128-\191]~~[\128-\191]*"

output[n] = ins .. "[\128-\191]*"

pos = pos + 1

end

Line 224:

Line 375:

elseif ch == "[" then

-- Fail negative charsets. TODO: 1-byte charsets should be safe.

if nxt == "^~~" then~~

if nxt == 0x5E then -- ^

return false

-- If the first character is "%", ch_len is determined by the

-- next one instead.

elseif nxt == "%" then

elseif nxt == 0x25 then -- %

nxt = byte(pattern, nxt_pos + 1)

elseif nxt == 0x5D then -- Initial ] is non-magic.

nxt_pos = nxt_pos + 1

~~nxt = sub(pattern, nxt_pos, nxt_pos)~~

end

local ch_len = ~~#match(pattern, "^.[\~~128~~-\191]*", nxt_pos)~~

if not nxt then

return false

end

local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4

if ch_len == 1 then -- Single-byte charset.

pos = parse_1_byte_charset(pattern, ~~pos + 1~~)

pos = parse_1_byte_charset(pattern, nxt_pos)

if not pos then

return false

end

else -- Multibyte charset.

-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST.

local charset_pos, bytes = pos

pos = pos + 1

while true do -- TODO: non-ASCII charset ranges.

pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)

pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos)

~~if not ch then~~

~~return false~~

-- If escaped, get the next character. No need to

-- distinguish magic characters or character classes,

-- as they'll all fail for having the wrong length

-- anyway.

~~elseif~~ ch == "%" then

if ch == "%" then

pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", ~~pos~~)

pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos)

elseif ch == "]" then

pos = nxt_pos

break

end

if ch_len ~~~= #ch~~ then

if not (ch and nxt_pos - pos == ch_len) then

return false

elseif bytes == nil then

bytes = {}

end

~~bytes = bytes or {}~~

local bytes, last = bytes, nxt_pos - 1

local bytes = bytes

for i = pos, last - 1 do

for i = 1, ~~ch_len~~ - 1 do

local b = byte(pattern, i)

local b = byte(~~ch, i~~, i)

local bytes_b = bytes[b]

bytes[b] = bytes[b] ~~or {}~~

if bytes_b == nil then

~~bytes =~~ bytes[b]

bytes_b = {}

bytes[b] = bytes_b

end

bytes[b], bytes = bytes_b, bytes_b

end

bytes[byte(ch, -1)] = true

bytes[byte(pattern, last)] = true

pos = nxt_pos

end

Line 271:

Line 430:

return false

end

~~local~~ nxt = ~~sub~~(pattern~~, pos~~, pos)

nxt = byte(pattern, pos)

if (

(nxt == ~~"?"~~ or nxt == ~~"*"~~ or nxt == ~~"-"~~) or

(nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-?

(nxt == ~~"+"~~ and ch_len > 2) or

(nxt == 0x2B and ch_len > 2) or -- +

not check_sets(bytes)

) then

Line 292:

Line 451:

bytes = next_byte

until next_byte == true

if nxt == "+~~" then~~

if nxt == 0x2B then -- +

local range1, range2 = ranges[1], ranges[2]

ranges[1] = make_charset(range1)

ranges[1], ranges[3] = make_charset(range1), make_charset(range2)

~~ranges[3] =~~ make_charset(range2)

local n = #range2

for i = 1, #range1 do

Line 308:

Line 466:

end

output = output or {}

if output == nil then

output = {}

end

nxt = byte(pattern, pos)

n = n + 1

output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)

output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) ..

((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped

start = pos

end

elseif nxt == "+~~" then~~

elseif not nxt then

if ~~#ch~~ ~= 2 then

break

elseif nxt == 0x2B then -- +

if nxt_pos - pos ~= 2 then

return false

elseif output == nil then

output = {}

end

~~output~~ = ~~output or {}~~

pos, nxt_pos = pos + 1, nxt_pos + 1

nxt = byte(pattern, nxt_pos)

local ch2 = sub(pattern, pos, pos)

n = n + 1

output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. ~~sub~~(~~ch, 2, 2~~)

output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 ..

pos = nxt_pos ~~+ 1~~

((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped

~~start = pos~~

pos, start = nxt_pos, nxt_pos

elseif nxt == ~~"?"~~ or nxt == ~~"*"~~ or nxt == "-~~" then~~

elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-?

return false

else

Line 332:

Line 500:

end

return concat(output) .. sub(pattern, start)

end, true)

end

export.pattern_simplifier = pattern_simplifier -- ~~For testing~~.

pattern_simplifier = memoize(pattern_simplifier, true)

export.pattern_simplifier = pattern_simplifier

end

--[==[Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring library pattern (e.g. {{lua|"abcd-g"}} becomes {{lua|"[abcd-g]"}}, and {{lua|"[]"}} becomes {{lua|"[[%]]"}}).

The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used (e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary characters.]==]

-- Normalizes `charset` (the interior of a character set) into a complete
-- "[...]" pattern, escaping "]" and any "-" that does not form a valid range.
-- A number is returned as its string form. If nothing needed escaping, the
-- input is simply wrapped in brackets.
function get_charset(charset)
	if type(charset) == "number" then
		return tostring(charset)
	end
	local pos, start, n, output = 1, 1, 0
	-- Step over a leading "^" (negation) so it is never escaped.
	if byte(charset) == 0x5E then -- ^
		pos = pos + 1
	end
	-- FIXME: "]" is non-magic if it's the first character in a charset.
	local nxt_pos, nxt
	while true do
		local new_pos, ch = match(charset, "()([%%%-%]])", pos)
		if not ch then
			break
		-- Skip percent escapes. Ranges can't start with them, either.
		elseif ch == "%" then
			pos = new_pos + 2
		else
			-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`.
			if ch == "-" and new_pos > pos then
				pos, nxt_pos, nxt = new_pos - 1, new_pos, ch
				ch = sub(charset, pos, pos)
			else
				pos, nxt_pos = new_pos, new_pos + 1
				nxt = sub(charset, nxt_pos, nxt_pos)
			end
			-- Range.
			if nxt == "-" then
				if output == nil then
					output = {}
				end
				n = n + 1
				output[n] = sub(charset, start, pos - 1)
				nxt_pos = nxt_pos + 1
				nxt = sub(charset, nxt_pos, nxt_pos)
				-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour.
				if nxt == "" or nxt == "%" then
					n = n + 1
					output[n] = (ch == "]" and "%]" or ch) .. "%-"
					start = nxt_pos
					nxt_pos = nxt_pos + 2
				-- A range can't contain "%]" (because it is escaped), so range inputs like "]-z" or "a-]" must be adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is omitted if the range would be empty (i.e. if the first byte is greater than the second).
				else
					n = n + 1
					output[n] = (ch == "]" and (byte(nxt) >= 0x5D and "%]^" or "^") or ch) .. "-" ..
						(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt)
					nxt_pos = nxt_pos + 1
					start = nxt_pos
				end
			elseif ch == "-" or ch == "]" then
				-- A lone "-" or "]": escape it.
				if output == nil then
					output = {}
				end
				n = n + 1
				output[n] = sub(charset, start, pos - 1) .. "%" .. ch
				start = nxt_pos
			end
			pos = nxt_pos
		end
	end
	-- `start` only moves once something has been rewritten.
	if start == 1 then
		return "[" .. charset .. "]"
	end
	return "[" .. concat(output) .. sub(charset, start) .. "]"
end

get_charset = memoize(get_charset, true)

export.get_charset = get_charset

function export.len(str)

Line 430:

Line 670:

--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]

function export.reverse(str)
	-- First reverse the bytes of each multibyte sequence (lead byte followed
	-- by its continuation bytes) in place, then reverse the whole string. The
	-- second reversal puts each sequence's bytes back in their original
	-- order. The inner parentheses drop gsub's replacement count.
	return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse)))
end

Line 441:

Line 681:

cp = tonumber(cp)

if cp < 0 then

err("-0x~~" .. format("~~%X", -cp ~~+ 1~~))

err(format("-0x%X", -cp))

elseif cp < 0x80 then

return char(cp)

Line 466:

Line 706:

)

end

err(~~"0x" ..~~ format("%X", cp))

err(format("0x%X", cp))

end

Line 567:

Line 807:

do

local function add_captures(~~text~~, n, ...)

local function add_captures(t, n, ...)

if ... == nil then

return

end

-- Insert any captures from the splitting pattern.

local offset, capture = n - 1, ...

while capture do

n = n + 1

~~text~~[n] = capture

t[n] = capture

capture = select(n - offset, ...)

end

Line 578:

Line 821:

end

--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.

In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil if there are no further matches. By default, the start index will be calculated using the ustring library, unless `str_lib` or `plain` is set.]==]
function export.split(str, pattern_or_func, str_lib, plain)
	local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0
	-- Each call to `iter` yields the next chunk plus any captures, and
	-- `add_captures` appends them all to `t`. The loop stops when
	-- `add_captures` returns nil, which it does once `iter` yields nothing.
	repeat
		n = add_captures(t, n, iter())
	until n == nil
	return t
end

export.capturing_split = export.split -- To be removed.

end

do

--[==[Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the string up to the splitting pattern, with any capture groups being returned as additional values on that iteration.]==]

function export.gsplit(str, pattern_or_func, str_lib, plain)
	local start, final, str_len, _string, callable = 1
	pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
	local _find, _sub = _string.find, _string.sub

	-- Takes the results of one find call (`loc1`, `loc2`, then any captures)
	-- and returns the next chunk followed by those captures, updating
	-- `start` and `final` as it goes.
	local function iter(loc1, loc2, ...)
		-- If no match, return the final chunk.
		if not loc1 then
			final = true
			return _sub(str, start)
		end
		-- Special case: If we match the empty string, then eat the
		-- next character; this avoids an infinite loop, and makes
		-- splitting by the empty string work the way mw.text.gsplit() does
		-- (including non-adjacent empty string matches with %f). If we
		-- reach the end of the string this way, set `final` to true, so we
		-- don't get stuck matching the empty string at the end.
		local chunk
		if loc2 < loc1 then
			-- If using the string library, we need to make sure we advance
			-- by one UTF-8 character.
			if _sub == sub then
				local b = byte(str, loc1)
				if b and b >= 128 then
					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
				end
			end
			chunk = _sub(str, start, loc1)
			if loc1 >= str_len then
				final = true
			else
				start = loc1 + 1
			end
		-- Eat chunk up to the current match.
		else
			chunk = _sub(str, start, loc1 - 1)
			start = loc2 + 1
		end
		return chunk, ...
	end

	if callable then
		return function()
			if not final then
				return iter(pattern_or_func(str, start))
			end
		end
	-- Special case if the pattern is anchored to the start: "^" always
	-- anchors to the start position, not the start of the string, so get
	-- around this by only attempting one match with the pattern, then match
	-- the end of the string. This only applies when pattern matching is
	-- active: with `plain` set, a leading "^" is an ordinary character and
	-- the pattern must go through the general iterator below.
	elseif not plain and byte(pattern_or_func) == 0x5E then -- ^
		local returned
		return function()
			if not returned then
				returned = true
				return iter(_find(str, pattern_or_func, start))
			elseif not final then
				return iter(_find(str, "$", start))
			end
		end
	end
	return function()
		if not final then
			return iter(_find(str, pattern_or_func, start, plain))
		end
	end
end

gsplit = export.gsplit

-- Trims `str` at both ends. With no `charset`, whitespace (%s) is trimmed.
-- An empty `charset` returns `str` unchanged. Otherwise `charset` is the
-- interior of a character set: it is escaped literally if `plain` is set,
-- and normalized by `get_charset` if not. `str_lib` forces the string
-- library; without it the ustring library is used whenever the pattern
-- cannot be simplified.
function export.trim(str, charset, str_lib, plain)
	if charset == nil then
		-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to "" first.
		return match(gsub(str, "^%s*", ""), "^.*%S") or ""
	elseif charset == "" then
		return str
	end
	charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
	-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there would be two callbacks into PHP, which is slower.
	local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
	if not str_lib then
		local simple = pattern_simplifier(pattern)
		if not simple then
			return umatch(str, pattern)
		end
		pattern = simple
	end
	return match(str, pattern)
end

do

local entities

-- Loads the named-entity table from [[Module:data/entities]] on first use
-- and stores it in `entities`. The same multiple assignment sets
-- `get_entities` itself to nil, so callers use the
-- `entities or get_entities()` form.
local function get_entities()
	entities, get_entities = load_data("Module:data/entities"), nil
	return entities
end

-- gsub callback for export.decode_entities. `hash` is "#" for numeric
-- entities and "" for named ones, `x` is "x"/"X" for hexadecimal numeric
-- entities, and `code` is the rest of the entity body. Returns the decoded
-- character, or nil (which leaves the text unchanged) if it isn't valid.
local function decode_entity(hash, x, code)
	if hash == "" then
		-- Named entity: any "x"/"X" captured separately is part of the name.
		return (entities or get_entities())[x .. code]
	end
	local cp
	if x == "" then
		cp = match(code, "^()%d+$") and tonumber(code)
	else
		cp = match(code, "^()%x+$") and tonumber(code, 16)
	end
	-- Reject anything at or above 0x110000.
	return cp and cp < 0x110000 and u(cp) or nil
end

-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].

function export.decode_entities(str)
	-- Cheap plain-text pre-check: only run gsub if there is an "&" with a
	-- ";" somewhere after it. Otherwise return `str` as it is.
	local amp = find(str, "&", nil, true)
	return amp and find(str, ";", amp, true) and gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
end

do

local entities

-- Builds the character-to-entity table on first use and stores it in
-- `entities`. The same multiple assignment sets `get_entities` itself to
-- nil, so callers use the `entities or get_entities()` form.
local function get_entities()
	-- Memoized HTML entities (taken from mw.text.lua).
	entities, get_entities = {
		["\""] = "&quot;",
		["&"] = "&amp;",
		["'"] = "&#039;",
		["<"] = "&lt;",
		[">"] = "&gt;",
		["\194\160"] = "&nbsp;",
	}, nil
	return entities
end

-- gsub callback for export.encode_entities. Returns the entity for `ch`,
-- using the stored entry where one exists and otherwise building a decimal
-- numeric entity from its codepoint. New results are stored in `entities`,
-- so each character is only computed once.
local function encode_entity(ch)
	local entity = (entities or get_entities())[ch]
	if entity == nil then
		entity = "&#" .. codepoint(ch) .. ";"
		entities[ch] = entity
	end
	return entity
end

function export.encode_entities(str, charset, str_lib, plain)

~~-- Memoized HTML entities (taken from mw.text.lua).~~

if charset == nil then

~~html_entities = html_entities or {~~

return (gsub(str, "[\"&'<>\194]\160?", entities or get_entities()))

~~["\""]~~ = ~~""",~~

elseif charset == "" then

~~["&"]~~ = ~~"&",~~

return str

~~["'"] = "'",~~

end

~~["<"] = "<",~~

local pattern = plain and ("[" .. charset_escape(charset) .. "]") or charset == "." and charset or get_charset(charset)

~~[">"] = ">",~~

if not str_lib then

~~["\194\160"] = " ",~~

local simple = pattern_simplifier(pattern)

}

if not simple then

~~if not charset~~ then

return (ugsub(str, pattern, encode_entity))

return (gsub(str, "[\"&'<>\194]\160?", ~~html_entities~~))

elseif ~~plain~~ then

return (~~gsub(str,~~ "[" .. charset_escape(charset) .. "]"~~, encode_entity~~))

~~elseif~~ str_lib then

if not ~~match(charset, "^()[^\128-\255]*$")~~ then

~~error~~(~~"Cannot use the string library with a character set that contains a character with a codepoint above U+007F."~~)

end

~~return (gsub(str, "[" .. charset .. "]", encode_entity))~~

pattern = simple

end

~~local pattern = charset and "[" .. charset .. "]"~~

return (gsub(str, pattern, encode_entity))

~~local simple = pattern_simplifier(pattern)~~

~~if simple then~~

return (gsub~~(str, simple, encode_entity))~~

~~end~~

~~return (ugsub~~(str, pattern, encode_entity))

end

Line 787:

Line 1,013:

enctype = enctype and upper(enctype) or "QUERY"

if enctype == "PATH" then

return find(str, "%", 1, true) and

return find(str, "%", nil, true) and gsub(str, "%%(%x%x)", decode_path) or str

gsub(str, "%%(%x%x)", decode_path) or str

elseif enctype == "QUERY" then

return (find(str, "%", 1, true) or find(str, "+", 1, true)) and

return (find(str, "%", nil, true) or find(str, "+", nil, true)) and gsub(str, "([%%%+])(%x?%x?)", decode) or str

gsub(str, "([%%%+])(%x?%x?)", decode) or str

elseif enctype == "WIKI" then

return (find(str, "%", 1, true) or find(str, "_", 1, true)) and

return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str

gsub(str, "([%%_])(%x?%x?)", decode) or str

end

error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)

Line 802:

Line 1,025:

do

local function _remove_comments(str, pre)

local head = find(str, "<!--", 1, true)

local head = find(str, "<!--", nil, true)

if not head then

return str

Line 843:

Line 1,066:

end

--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{~~code|~~lua|"\0"}}, {{~~code|~~lua|"\t"}}, {{~~code|~~lua|"\n"}}, {{~~code|~~lua|"\v"}}, {{~~code|~~lua|"\r"}} and {{~~code|~~lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]

--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{lua|"\0"}}, {{lua|"\t"}}, {{lua|"\n"}}, {{lua|"\v"}}, {{lua|"\r"}} and {{lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]

function export.php_trim(str)

-- A frontier pattern with a greedy quantifier is faster than the algorithms used by export.trim, but can only be used if the character set includes \0, since %z matches the start/end of the string, as well as \0. This is also immune to catastrophic backtracking.

return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""

end

php_trim = export.php_trim

--[==[Takes a parameter name as ~~an input~~, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{~~code|~~lua|frame.args}} table). For example, {{~~code|~~lua|"1"}} is normalized to {{~~code|~~lua|1}} (a number), ~~and~~ {{~~code|~~lua|" foo "}} is normalized to {{~~code|~~lua|"foo"}}. If the ~~input~~ is not ~~a string~~, it is ~~returned unchanged~~.

--[==[Takes a parameter name as either a string or number, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{lua|frame.args}} table). For example, {{lua|"1"}} (a string) is normalized to {{lua|1}} (a number), {{lua|" foo "}} is normalized to {{lua|"foo"}}, and {{lua|1.5}} (a number) is normalized to {{lua|"1.5"}} (a string). Inputs which cannot be normalized (e.g. booleans) return {{lua|nil}}. If the `no_trim` flag is set, string parameters are not trimmed, but strings may still be converted to numbers if they do not contain whitespace; this is necessary when normalizing keys into the form received by PHP during callbacks, before any trimming occurs (e.g. in the table of arguments when calling {{lua|frame:expandTemplate()}}).

~~After being~~ trimmed with {{~~code|~~lua|export.php_trim}}, ~~strings~~ are converted to numbers if:

Strings are trimmed with {{lua|export.php_trim}}, unless the `no_trim` flag is set. They are then converted to numbers if '''all''' of the following are true:

# They are integers~~, with~~ no decimals (2.0) or ~~leading zeroes (~~02).

# They are integers; i.e. no decimals or leading zeroes (e.g. {{lua|"2"}}, but not {{lua|"2.0"}} or {{lua|"02"}}).

# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.

# ~~For positive values~~, ~~they do~~ not ~~have a~~ leading {{~~code~~|lua|+}} ~~sign~~.]==]

# There is no leading sign unless < 0 (e.g. {{lua|"2"}} or {{lua|"-2"}}, but not {{lua|"+2"}} or {{lua|"-0"}}).

function export.scribunto_param_key(key)

# They contain no leading or trailing whitespace (which may be present when the `no_trim` flag is set).

if type(key) ~= "string" then

return key

Numbers are converted to strings if '''either''':

# They are not integers (e.g. {{lua|1.5}}).

# They are > 2{{sup|53}} or < -2{{sup|53}}.

When converted to strings, integers ≤ 2{{sup|63}} and ≥ -2{{sup|63}} are formatted as integers (i.e. all digits are given), which is the range of PHP's integer precision, though the actual output may be imprecise since Lua integers are only precise from -2{{sup|53}} to 2{{sup|53}}. All other numbers use the standard formatting output by {{lua|tostring()}}.]==]

function export.scribunto_param_key(key, no_trim)

local tp = type(key)

if tp == "string" then

if not no_trim then

key = php_trim(key)

end

if match(key, "^()-?[1-9]%d*$") then

local num = tonumber(key)

-- Lua integers are only precise to 2^53 - 1, so specifically check for 2^53 and -2^53 as strings, since a numerical comparison won't work as it can't distinguish 2^53 from 2^53 + 1.

return (

num <= 9007199254740991 and num >= -9007199254740991 or

key == "9007199254740992" or

key == "-9007199254740992"

) and num or key

end

return key == "0" and 0 or key

elseif tp == "number" then

-- No special handling needed for inf or NaN.

return key % 1 == 0 and (

key <= 9007199254740992 and key >= -9007199254740992 and key or

key <= 9223372036854775808 and key >= -9223372036854775808 and format("%d", key)

) or tostring(key)

end

~~key = php_trim(key)~~

return nil

~~if match(key, "^-?[1-9]%d*$") then~~

~~local num = tonumber(key)~~

~~-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true.~~

return (

~~num <= 9007199254740991 and num >= -9007199254740991 or~~

~~key == "9007199254740992" or~~

~~key == "-9007199254740992"~~

~~) and num or key~~

~~elseif key == "0" then~~

~~return 0~~

~~end~~

~~return key~~

end

do

local byte_escapes

local function get_byte_escapes()

byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil

return byte_escapes

end

local function escape_byte(b)

return byte_escapes[b] or format("\\%03d", byte(b))

return (byte_escapes or get_byte_escapes())[b] or format("\\%03d", byte(b))

end

function export.escape_bytes(str)

~~byte_escapes = byte_escapes or load_data("Module:string utilities/data").byte_escapes~~

return (gsub(str, ".", escape_byte))

end

Line 892:

Line 1,134:

return name == "op" and "{" or

name == "cl" and "}" or

error(~~module_name~~ .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")

error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'")

elseif fun(name) and type(fun(name)) ~= "string" then

error(~~module_name~~ .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")

error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")

end

return fun(name) or error(~~module_name~~ .. ".format: \"" .. name .. "\" not found in table")

return fun(name) or error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table")

end))

end

format_fun = export.format_fun

--[==[This function, unlike {{~~code|~~lua|string.format}} and {{~~code|~~lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{~~code|~~lua|{param_name}}} in the format string with the table's entry for {{~~code|~~lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.

--[==[This function, unlike {{lua|string.format}} and {{lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{lua|{param_name}}} in the format string with the table's entry for {{lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.

====Examples====

* {{~~code|~~lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}

* {{lua|=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}

*: produces: {{~~code|~~lua|"one fish, two fish, red fish, blue fish"}}

*: produces: {{lua|"one fish, two fish, red fish, blue fish"}}

* {{~~code|~~lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}

* {{lua|=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}

*: produces: {{~~code|~~lua|"The set {1, 2, 3} contains three elements."}}

*: produces: {{lua|"The set {1, 2, 3} contains three elements."}}

*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]

function export.format(str, tbl)

Line 957:

Line 1,199:

end

do

function export.pluralize(...) -- To be removed once all calling modules have been changed to call Module:en-utilities directly.

~~local~~ function ~~word_ends_in_consonant_plus_y(str)~~

export.pluralize = require("Module:en-utilities").pluralize

~~-- FIXME, a subrule of rule #1 above says the -ies ending doesn't~~

return export.pluralize(...)

~~-- apply to proper nouns, hence "the Gettys", "the public Ivys"~~.

~~-- We should maybe consider applying this rule here; but it may not~~

~~-- be important as this function is almost always called on common nouns~~

-- (e.g. ~~parts of speech, place types)~~.

~~return find(str, "[^aeiouyAEIOUY ]y$"~~)

~~end~~

~~local function word_takes_es_plural(str)~~

~~return find(str, "[sxz]$") or find(str, "[csz]h$")~~

~~end~~

~~local function do_pluralize(str)~~

~~if word_ends_in_consonant_plus_y(str) then~~

-- ~~avoid returning multiple values~~

~~return (gsub(str, "y$", "ies"))~~

~~elseif word_takes_es_plural(str) then~~

~~return str .. "es"~~

~~end~~

~~return str .. "s"~~

~~end~~

~~--[==[~~

~~Pluralize a word in a smart fashion, according~~ to ~~normal English rules.~~

~~# If word ends in consonant + -y, replace the -y with -ies.~~

~~# If the word ends in -s, -x, -z, -ch, -sh, -zh, add -es.~~

~~# Otherwise, add -s.~~

~~This handles links correctly~~:

~~# If a piped link, change the second part appropriately.~~

~~# If a non~~-~~piped link and rule #1 above applies, convert to a piped link with the second part containing the plural~~.

~~# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.~~

~~]==]~~

~~function~~ export.pluralize~~(str)~~

~~if type(str) == "table" then~~

~~-- allow calling from a template~~

~~str~~ = ~~str.args[1]~~

~~end~~

~~-- Check for a link. This pattern matches both piped and unpiped links.~~

~~-- If the link is not piped, the second capture (linktext) will be empty.~~

~~local beginning, link, linktext = match~~(~~str,~~ "~~^(.*)%[%[([^|%]]+)%|?(.~~-~~)%]%]$")~~

~~if not link then~~

~~return do_pluralize(str)~~

~~elseif linktext ~= "" then~~

~~return beginning ..~~ "~~[[" .. link .. "|" .. do_pluralize(linktext~~) .~~. "]]"~~

~~elseif word_ends_in_consonant_plus_y(link) then~~

return ~~beginning~~ .~~. "[[" .. link .. "|" .. gsub~~(~~link, "y$", "ies") .. "]]"~~

~~end~~

~~return beginning .. "[[" .. link .~~. ~~"]]"~~ .. ~~(word_takes_es_plural(link) and "es" or "s"~~)

~~end~~

end

Line 1,072:

Line 1,265:

function export.get_indefinite_article(str, ucfirst)

str = str or ""

~~local is_vowel = false~~

-- If there's a link at the beginning, examine the first letter of the

-- link text. This pattern matches both piped and unpiped links.

-- If the link is not piped, the second capture (linktext) will be empty.

local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")

if link ~~then~~

if match(link and (linktext ~= "" and linktext or link) or str, "^()[AEIOUaeiou]") then

~~is_vowel = find~~(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")

return ucfirst and "An" or "an"

~~else~~

~~is_vowel = find(str,~~ "~~^[AEIOUaeiou]~~")

end

return ~~is_vowel and (ucfirst and "An" or "an") or (~~ucfirst and "A" or "a")

return ucfirst and "A" or "a"

end

get_indefinite_article = export.get_indefinite_article

@@ Line 1: / Line 1: @@
+local export = {}
+local function_module = "Module:fun"
+local load_module = "Module:load"
+local memoize_module = "Module:memoize"
 local mw = mw
 local string = string
@@ Line 12: / Line 18: @@
 local gsub = string.gsub
 local len = string.len
-local load_data = mw.loadData
 local lower = string.lower
 local match = string.match
 local next = next
+local require = require
 local reverse = string.reverse
 local select = select
@@ Line 34: / Line 40: @@
 local usub = ustring.sub
 local uupper = ustring.upper
+local memoize = require(memoize_module)
 -- Defined below.
 local charset_escape
@@ Line 39: / Line 48: @@
 local explode_utf8
 local format_fun
+local get_charset
 local get_indefinite_article
+local gsplit
 local pattern_escape
 local pattern_simplifier
@@ Line 47: / Line 58: @@
 local ulen
-local module_name = "string_utilities"
+--[==[
+Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
+local function is_callable(...)
+	is_callable = require(function_module).is_callable
+	return is_callable(...)
+end
-local export = {}
+local function load_data(...)
+	load_data = require(load_module).load_data
+	return load_data(...)
+end
+local function prepare_iter(str, pattern, str_lib, plain)
+	local callable = is_callable(pattern)
+	if str_lib or plain then
+		return pattern, #str, string, callable
+	elseif not callable then
+		local simple = pattern_simplifier(pattern)
+		if simple then
+			return simple, #str, string, false
+		end
+	end
+	return pattern, ulen(str), ustring, callable
+end
 --[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
@@ Line 62: / Line 94: @@
 explode_utf8 = export.explode_utf8
---[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
+do
-function export.pattern_escape(str)
+	local charset_chars = {
-	return (gsub(str, "[$%%()*+%-.?[%]^]", "%%%0"))
+		["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^"
-end
+	}
-pattern_escape = export.pattern_escape
+	charset_chars.__index = charset_chars
+	local chars = setmetatable({
+		["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+",
+		["."] = "%.", ["?"] = "%?", ["["] = "%["
+	}, charset_chars)
+	--[==[Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>, and converts the null character to <code>%z</code>. For example, {{lua|"^$()%.[]*+-?\0"}} becomes {{lua|"%^%$%(%)%%%.%[%]%*%+%-%?%z"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
+	function export.pattern_escape(str)
+		return (gsub(str, "[%z$%%()*+%-.?[%]^]", chars))
+	end
+	pattern_escape = export.pattern_escape
+	--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>, and converts the null character to <code>%z</code>.]==]
+	function export.charset_escape(str)
+		return (gsub(str, "[%z%%%-%]^]", charset_chars))
+	end
+	charset_escape = export.charset_escape
+	--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
+	function export.replacement_escape(str)
+		return (gsub(str, "%%", "%%%%"))
+	end
+	replacement_escape = export.replacement_escape
+	local function case_insensitive_char(ch)
+		local upper_ch = uupper(ch)
+		if upper_ch == ch then
+			ch = ulower(ch)
+			if ch == upper_ch then
+				return chars[ch] or ch
+			end
+		end
+		return "[" .. (charset_chars[upper_ch] or upper_ch) .. (charset_chars[ch] or ch) .. "]"
+	end
---[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>.]==]
+	local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2)
-function export.charset_escape(str)
+		if not (loc1 and start <= str_len) then
-	return (gsub(str, "[%%%-%]^]", "%%%0"))
+			-- Add final chunk and return.
-end
+			n = n + 1
-charset_escape = export.charset_escape
+			text[n] = _gsub(_sub(str, start), ".", chars)
+			return
+		elseif loc2 < loc1 then
+			if _sub == sub then
+				local b = byte(str, loc1)
+				if b and b >= 128 then
+					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
+				end
+			end
+			n = n + 1
+			text[n] = _gsub(_sub(str, start, loc1), ".", chars)
+			start = loc1 + 1
+			if start > str_len then
+				return
+			end
+		else
+			-- Add chunk up to the current match.
+			n = n + 1
+			text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars)
+			-- Add current match.
+			n = n + 1
+			text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char)
+			start = loc2 + 1
+		end
+		return n, start
+	end
---[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
+	--[==[
-function export.replacement_escape(str)
+	Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns any pattern matching facilities off in the optional pattern supplied.]==]
-	return (gsub(str, "%%", "%%%%"))
+	function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
+		if pattern_or_func == nil then
+			return (gsub(str, str_lib and "[^\128-\255]" or ".[\128-\191]*", case_insensitive_char))
+		end
+		local text, n, start, str_len, _string, callable = {}, 0, 1
+		pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
+		local _find, _gsub, _sub = _string.find, _string.gsub, _string.sub
+		if callable then
+			repeat
+				n, start = iterate(str, str_len, text, n, start, _gsub, _sub, pattern_or_func(str, start))
+			until not start
+		-- Special case if the pattern is anchored to the start: "^" always
+		-- anchors to the start position, not the start of the string, so get
+		-- around this by only attempting one match with the pattern, then match
+		-- the end of the string.
+		elseif byte(pattern_or_func) == 0x5E then -- ^
+			n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
+			if start ~= nil then
+				iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, "$", start, plain))
+			end
+		else
+			repeat
+				n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
+			until not start
+		end
+		return concat(text)
+	end
 end
-replacement_escape = export.replacement_escape
 do
+	local character_classes
+	local function get_character_classes()
+		character_classes, get_character_classes = {
+			[0x41] = true, [0x61] = true, -- Aa
+			[0x43] = true, [0x63] = true, -- Cc
+			[0x44] = true, [0x64] = true, -- Dd
+			[0x4C] = true, [0x6C] = true, -- Ll
+			[0x50] = true, [0x70] = true, -- Pp
+			[0x53] = true, [0x73] = true, -- Ss
+			[0x55] = true, [0x75] = true, -- Uu
+			[0x57] = true, [0x77] = true, -- Ww
+			[0x58] = true, [0x78] = true, -- Xx
+			[0x5A] = true, -- z dealt with separately.
+		}, nil
+		return character_classes
+	end
 	local function check_sets_equal(set1, set2)
 		local k2
@@ Line 129: / Line 265: @@
 	local function parse_1_byte_charset(pattern, pos)
+		local ch
 		while true do
-			local ch, nxt_pos
+			pos, ch = match(pattern, "()([%%%]\192-\255])", pos)
-			pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
+			if ch == "%" then
-			if not ch then
+				local nxt = byte(pattern, pos + 1)
-				return false
+				if not nxt or nxt >= 128 or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWXZ, but not z
-			elseif ch == "%" then
-				if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then
 					return false
 				end
 				pos = pos + 2
 			elseif ch == "]" then
-				pos = nxt_pos
+				pos = pos + 1
 				return pos
 			else
 				return false
 			end
 		end
 	end
 	--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
-	pattern_simplifier = require("Module:fun").memoize(function(pattern)
+	function pattern_simplifier(pattern)
 		if type(pattern) == "number" then
 			return tostring(pattern)
 		end
-		local pos, captures, start, n, output = 1, 0, 1, 0
+		local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0
 		while true do
-			local ch, nxt_pos
+			-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails.
-			pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
+			pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos)
 			if not ch then
 				break
 			end
-			local nxt = sub(pattern, nxt_pos, nxt_pos)
+			local nxt = byte(pattern, nxt_pos)
 			if ch == "%" then
-				if nxt == "b" then
+				if nxt == 0x62 then -- b
-					if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
+					local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3)
+					if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then
 						return false
 					end
 					pos = pos + 4
-				elseif nxt == "f" then
+				elseif nxt == 0x66 then -- f
-					pos = pos + 2
+					nxt_pos = nxt_pos + 2
-					if not match(pattern, "^()%[[^^]", pos) then
+					local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos)
+					-- Only possible to convert a positive %f charset which is
+					-- all ASCII, so use parse_1_byte_charset.
+					if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^
 						return false
+					elseif nxt3 == 0x5D then -- Initial ] is non-magic.
+						nxt_pos = nxt_pos + 1
 					end
-					-- Only possible to convert a %f charset which is all
+					pos = parse_1_byte_charset(pattern, nxt_pos)
-					-- ASCII, so use parse_1_byte_charset.
-					pos = parse_1_byte_charset(pattern, pos)
 					if not pos then
 						return false
 					end
-				elseif nxt == "Z" then
+				elseif nxt == 0x5A then -- Z
-					pos = pos + 2
+					nxt = byte(pattern, nxt_pos + 1)
-					nxt = sub(pattern, pos, pos)
+					if nxt == 0x2A or nxt == 0x2D then -- *-
-					if nxt == "*" or nxt == "+" or nxt == "-" then
+						pos = pos + 3
-						pos = pos + 1
 					else
-						output = output or {}
+						if output == nil then
+							output = {}
+						end
+						local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]"
 						n = n + 1
-						if nxt == "?" then
+						if nxt == 0x2B then -- +
-							output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
+							output[n] = ins .. "%Z*"
-							pos = pos + 1
+							pos = pos + 3
+						elseif nxt == 0x3F then -- ?
+							output[n] = ins .. "?[\128-\191]*"
+							pos = pos + 3
 						else
-							output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
+							output[n] = ins .. "[\128-\191]*"
+							pos = pos + 2
 						end
 						start = pos
 					end
-				elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
+				elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz
 					return false
 				-- Skip the next character if it's ASCII. Otherwise, we will
 				-- still need to do length checks.
 				else
-					pos = pos + (byte(nxt) < 128 and 2 or 1)
+					pos = pos + (nxt < 128 and 2 or 1)
 				end
 			elseif ch == "(" then
-				if nxt == ")" or captures == 32 then
+				if nxt == 0x29 or capture_groups == 32 then -- )
 					return false
 				end
-				captures = captures + 1
+				capture_groups = capture_groups + 1
 				pos = pos + 1
 			elseif ch == "." then
-				if nxt == "*" or nxt == "+" or nxt == "-" then
+				if nxt == 0x2A or nxt == 0x2D then -- *-
 					pos = pos + 2
 				else
-					output = output or {}
+					if output == nil then
+						output = {}
+					end
+					local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]"
 					n = n + 1
-					if nxt == "?" then
+					if nxt == 0x2B then -- +
-						output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
+						output[n] = ins .. ".*"
+						pos = pos + 2
+					elseif nxt == 0x3F then -- ?
+						output[n] = ins .. "?[\128-\191]*"
 						pos = pos + 2
 					else
-						output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
+						output[n] = ins .. "[\128-\191]*"
 						pos = pos + 1
 					end
@@ Line 224: / Line 375: @@
 			elseif ch == "[" then
 				-- Fail negative charsets. TODO: 1-byte charsets should be safe.
-				if nxt == "^" then
+				if nxt == 0x5E then -- ^
 					return false
 				-- If the first character is "%", ch_len is determined by the
 				-- next one instead.
-				elseif nxt == "%" then
+				elseif nxt == 0x25 then -- %
+					nxt = byte(pattern, nxt_pos + 1)
+				elseif nxt == 0x5D then -- Initial ] is non-magic.
 					nxt_pos = nxt_pos + 1
-					nxt = sub(pattern, nxt_pos, nxt_pos)
 				end
-				local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
+				if not nxt then
+					return false
+				end
+				local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4
 				if ch_len == 1 then -- Single-byte charset.
-					pos = parse_1_byte_charset(pattern, pos + 1)
+					pos = parse_1_byte_charset(pattern, nxt_pos)
 					if not pos then
 						return false
 					end
 				else -- Multibyte charset.
+					-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST.
 					local charset_pos, bytes = pos
 					pos = pos + 1
 					while true do -- TODO: non-ASCII charset ranges.
-						pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
+						pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos)
-						if not ch then
-							return false
 						-- If escaped, get the next character. No need to
 						-- distinguish magic characters or character classes,
 						-- as they'll all fail for having the wrong length
 						-- anyway.
-						elseif ch == "%" then
+						if ch == "%" then
-							pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
+							pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos)
 						elseif ch == "]" then
 							pos = nxt_pos
 							break
 						end
-						if ch_len ~= #ch then
+						if not (ch and nxt_pos - pos == ch_len) then
 							return false
+						elseif bytes == nil then
+							bytes = {}
 						end
-						bytes = bytes or {}
+						local bytes, last = bytes, nxt_pos - 1
-						local bytes = bytes
+						for i = pos, last - 1 do
-						for i = 1, ch_len - 1 do
+							local b = byte(pattern, i)
-							local b = byte(ch, i, i)
+							local bytes_b = bytes[b]
-							bytes[b] = bytes[b] or {}
+							if bytes_b == nil then
-							bytes = bytes[b]
+								bytes_b = {}
+								bytes[b] = bytes_b
+							end
+							bytes[b], bytes = bytes_b, bytes_b
 						end
-						bytes[byte(ch, -1)] = true
+						bytes[byte(pattern, last)] = true
 						pos = nxt_pos
 					end
@@ Line 271: / Line 430: @@
 						return false
 					end
-					local nxt = sub(pattern, pos, pos)
+					nxt = byte(pattern, pos)
 					if (
-						(nxt == "?" or nxt == "*" or nxt == "-") or
+						(nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-?
-						(nxt == "+" and ch_len > 2) or
+						(nxt == 0x2B and ch_len > 2) or -- +
 						not check_sets(bytes)
 					) then
@@ Line 292: / Line 451: @@
 						bytes = next_byte
 					until next_byte == true
-					if nxt == "+" then
+					if nxt == 0x2B then -- +
 						local range1, range2 = ranges[1], ranges[2]
-						ranges[1] = make_charset(range1)
+						ranges[1], ranges[3] = make_charset(range1), make_charset(range2)
-						ranges[3] = make_charset(range2)
 						local n = #range2
 						for i = 1, #range1 do
@@ Line 308: / Line 466: @@
 						end
 					end
-					output = output or {}
+					if output == nil then
+						output = {}
+					end
+					nxt = byte(pattern, pos)
 					n = n + 1
-					output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
+					output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) ..
+						((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
 					start = pos
 				end
-			elseif nxt == "+" then
+			elseif not nxt then
-				if #ch ~= 2 then
+				break
+			elseif nxt == 0x2B then -- +
+				if nxt_pos - pos ~= 2 then
 					return false
+				elseif output == nil then
+					output = {}
 				end
-				output = output or {}
+				pos, nxt_pos = pos + 1, nxt_pos + 1
+				nxt = byte(pattern, nxt_pos)
+				local ch2 = sub(pattern, pos, pos)
 				n = n + 1
-				output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
+				output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 ..
-				pos = nxt_pos + 1
+					((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
-				start = pos
+				pos, start = nxt_pos, nxt_pos
-			elseif nxt == "?" or nxt == "*" or nxt == "-" then
+			elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-?
 				return false
 			else
@@ Line 332: / Line 500: @@
 		end
 		return concat(output) .. sub(pattern, start)
-	end, true)
+	end
-	export.pattern_simplifier = pattern_simplifier -- For testing.
+	pattern_simplifier = memoize(pattern_simplifier, true)
+	export.pattern_simplifier = pattern_simplifier
+end
+--[==[Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring library pattern (e.g. {{lua|"abcd-g"}} becomes {{lua|"[abcd-g]"}}, and {{lua|"[]"}} becomes {{lua|"[[%]]"}}).
+The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used (e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary characters.]==]
+function get_charset(charset)
+	if type(charset) == "number" then
+		return tostring(charset)
+	end
+	local pos, start, n, output = 1, 1, 0
+	if byte(charset) == 0x5E then -- ^
+		pos = pos + 1
+	end
+	 -- FIXME: "]" is non-magic if it's the first character in a charset.
+	local nxt_pos, nxt
+	while true do
+		local new_pos, ch = match(charset, "()([%%%-%]])", pos)
+		if not ch then
+			break
+		-- Skip percent escapes. Ranges can't start with them, either.
+		elseif ch == "%" then
+			pos = new_pos + 2
+		else
+			-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`.
+			if ch == "-" and new_pos > pos then
+				pos, nxt_pos, nxt = new_pos - 1, new_pos, ch
+				ch = sub(charset, pos, pos)
+			else
+				pos, nxt_pos = new_pos, new_pos + 1
+				nxt = sub(charset, nxt_pos, nxt_pos)
+			end
+			-- Range.
+			if nxt == "-" then
+				if output == nil then
+					output = {}
+				end
+				n = n + 1
+				output[n] = sub(charset, start, pos - 1)
+				nxt_pos = nxt_pos + 1
+				nxt = sub(charset, nxt_pos, nxt_pos)
+				-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour.
+				if nxt == "" or nxt == "%" then
+					n = n + 1
+					output[n] = (ch == "]" and "%]" or ch) .. "%-"
+					start = nxt_pos
+					nxt_pos = nxt_pos + 2
+				-- Because "]" has to be escaped as "%]", and a percent escape can't be used as a range endpoint, range inputs like "]-z" or "a-]" must be adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is omitted if the range would be empty (i.e. if the first byte is greater than the second).
+				else
+					n = n + 1
+					output[n] = (ch == "]" and (byte(nxt) >= 0x5D and "%]^" or "^") or ch) .. "-" ..
+						(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt)
+					nxt_pos = nxt_pos + 1
+					start = nxt_pos
+				end
+			elseif ch == "-" or ch == "]" then
+				if output == nil then
+					output = {}
+				end
+				n = n + 1
+				output[n] = sub(charset, start, pos - 1) .. "%" .. ch
+				start = nxt_pos
+			end
+			pos = nxt_pos
+		end
+	end
+	if start == 1 then
+		return "[" .. charset .. "]"
+	end
+	return "[" .. concat(output) .. sub(charset, start) .. "]"
 end
+get_charset = memoize(get_charset, true)
+export.get_charset = get_charset
 function export.len(str)
@@ Line 430: / Line 670: @@
 --[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
 function export.reverse(str)
-	return reverse(gsub(str, "[\194-\244][\128-\191]*", reverse))
+	return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse)))
 end
@@ Line 441: / Line 681: @@
 		cp = tonumber(cp)
 		if cp < 0 then
-			err("-0x" .. format("%X", -cp + 1))
+			err(format("-0x%X", -cp))
 		elseif cp < 0x80 then
 			return char(cp)
@@ Line 466: / Line 706: @@
 			)
 		end
-		err("0x" .. format("%X", cp))
+		err(format("0x%X", cp))
 	end
@@ Line 567: / Line 807: @@
 do
-	local function add_captures(text, n, ...)
+	local function add_captures(t, n, ...)
+		if ... == nil then
+			return
+		end
 		-- Insert any captures from the splitting pattern.
 		local offset, capture = n - 1, ...
 		while capture do
 			n = n + 1
-			text[n] = capture
+			t[n] = capture
 			capture = select(n - offset, ...)
 		end
@@ Line 578: / Line 821: @@
 	end
-	local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
+	--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.
-		if not (loc1 and start <= str_len) then
-			-- If no match, or there is but we're past the end of the string
-			-- (which happens when the match is the empty string), then add
-			-- the final chunk and return.
-			n = n + 1
-			text[n] = _sub(str, start)
-			return
-		elseif loc2 < loc1 then
-			-- Special case: If we match the empty string, then include the
-			-- next character; this avoids an infinite loop, and makes
-			-- splitting by an empty string work the way mw.text.split() does
-			-- (including non-adjacent empty string matches with %f). If we
-			-- reach the end of the string this way, return immediately, so we
-			-- don't get a final empty string. If using the string library, we
-			-- need to make sure we advance by one UTF-8 character.
-			if _sub == sub then
-				loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
-			end
-			n = n + 1
-			text[n] = _sub(str, start, loc1)
-			start = loc1 + 1
-			if start > str_len then
-				return ... and add_captures(text, n, ...) or n
-			end
-		else
-			-- Add chunk up to the current match.
-			n = n + 1
-			text[n] = _sub(str, start, loc1 - 1)
-			start = loc2 + 1
-		end
-		return (... and add_captures(text, n, ...) or n), start
-	end
-	local function _split(str, pattern, str_len, _sub, _find, plain)
+		In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil if there are no further matches. By default, the start index will be calculated using the ustring library, unless `str_lib` or `plain` is set.]==]
-		local text, n, start = {}, 0, 1
+	function export.split(str, pattern_or_func, str_lib, plain)
+		local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0
 		repeat
-			n, start = iterate(str, str_len, text, n, start, _sub, _find(str, pattern, start, plain))
+			n = add_captures(t, n, iter())
-		until not start
+		until n == nil
+		return t
-		return text
-	end
-	--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.]==]
-	function export.split(str, pattern, str_lib, plain)
-		if str_lib or plain then
-			return _split(str, pattern, #str, sub, find, plain)
-		end
-		local simple = pattern_simplifier(pattern)
-		if simple then
-			return _split(str, simple, #str, sub, find)
-		end
-		return _split(str, pattern, ulen(str), usub, ufind)
 	end
 	export.capturing_split = export.split -- To be removed.
 end
-do
+--[==[Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the string up to the splitting pattern, with any capture groups being returned as additional values on that iteration.]==]
-	-- TODO: merge this with export.split. Not clear how to do this while
+function export.gsplit(str, pattern_or_func, str_lib, plain)
-	-- maintaining the same level of performance, as gsplit is slower.
+	local start, final, str_len, _string, callable = 1
-	local function _split(str, pattern, str_len, _sub, _find, plain)
+	pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
-		local start, final = 1
+	local _find, _sub = _string.find, _string.sub
-		local function iter(loc1, loc2, ...)
+	local function iter(loc1, loc2, ...)
-			-- If no match, return the final chunk.
+		-- If no match, or there is but we're past the end of the string
-			if not loc1 then
+		-- (which happens when the match is the empty string), then return
+		-- the final chunk.
+		if not loc1 then
+			final = true
+			return _sub(str, start)
+		end
+		-- Special case: If we match the empty string, then eat the
+		-- next character; this avoids an infinite loop, and makes
+		-- splitting by the empty string work the way mw.text.gsplit() does
+		-- (including non-adjacent empty string matches with %f). If we
+		-- reach the end of the string this way, set `final` to true, so we
+		-- don't get stuck matching the empty string at the end.
+		local chunk
+		if loc2 < loc1 then
+			-- If using the string library, we need to make sure we advance
+			-- by one UTF-8 character.
+			if _sub == sub then
+				local b = byte(str, loc1)
+				if b and b >= 128 then
+					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
+				end
+			end
+			chunk = _sub(str, start, loc1)
+			if loc1 >= str_len then
 				final = true
-				return _sub(str, start)
-			end
-			-- Special case: If we match the empty string, then eat the
-			-- next character; this avoids an infinite loop, and makes
-			-- splitting by the empty string work the way mw.text.gsplit() does
-			-- (including non-adjacent empty string matches with %f). If we
-			-- reach the end of the string this way, set `final` to true, so we
-			-- don't get stuck matching the empty string at the end.
-			local chunk
-			if loc2 < loc1 then
-				-- If using the string library, we need to make sure we advance
-				-- by one UTF-8 character.
-				if _sub == sub then
-					loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
-				end
-				chunk = _sub(str, start, loc1)
-				if loc1 >= str_len then
-					final = true
-				else
-					start = loc1 + 1
-				end
-			-- Eat chunk up to the current match.
 			else
-				chunk = _sub(str, start, loc1 - 1)
+				start = loc1 + 1
-				start = loc2 + 1
 			end
-			return chunk, ...
+		-- Eat chunk up to the current match.
+		else
+			chunk = _sub(str, start, loc1 - 1)
+			start = loc2 + 1
 		end
+		return chunk, ...
+	end
+	if callable then
 		return function()
 			if not final then
-				return iter(_find(str, pattern, start, plain))
+				return iter(pattern_or_func(str, start))
+			end
+		end
+	-- Special case if the pattern is anchored to the start: "^" always
+	-- anchors to the start position, not the start of the string, so get
+	-- around this by only attempting one match with the pattern, then match
+	-- the end of the string.
+	elseif byte(pattern_or_func) == 0x5E then -- ^
+		local returned
+		return function()
+			if not returned then
+				returned = true
+				return iter(_find(str, pattern_or_func, start, plain))
+			elseif not final then
+				return iter(_find(str, "$", start, plain))
 			end
-			return nil
 		end
 	end
+	return function()
-	function export.gsplit(str, pattern, str_lib, plain)
+		if not final then
-		if str_lib or plain then
+			return iter(_find(str, pattern_or_func, start, plain))
-			return _split(str, pattern, #str, sub, find, plain)
-		end
-		local simple = pattern_simplifier(pattern)
-		if simple then
-			return _split(str, simple, #str, sub, find)
 		end
-		return _split(str, pattern, ulen(str), usub, ufind)
 	end
 end
+gsplit = export.gsplit
-function export.trim(str, charset)
+function export.trim(str, charset, str_lib, plain)
-	if not charset then
+	if charset == nil then
-		return match(str, "^()%s*$") and "" or match(str, "^%s*(.*%S)")
+		-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to "" first.
-	elseif match(charset, "^()[^\128-\255]*$") then
+		return match(gsub(str, "^%s*", ""), "^.*%S") or ""
-		return match(str, "^()[" .. charset .. "]*$") and "" or match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
+	elseif charset == "" then
+		return str
 	end
-	return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
+	charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
+	-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there would be two callbacks into PHP, which is slower.
+	local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
+	if not str_lib then
+		local simple = pattern_simplifier(pattern)
+		if not simple then
+			return umatch(str, pattern)
+		end
+		pattern = simple
+	end
+	return match(str, pattern)
 end
 do
 	local entities
+	local function get_entities()
-	local function decode_numeric_entity(code, pattern, base)
+		entities, get_entities = load_data("Module:data/entities"), nil
-		local cp = match(code, pattern) and tonumber(code, base)
+		return entities
-		return cp and cp < 0x110000 and u(cp) or nil
 	end
 	local function decode_entity(hash, x, code)
-		if hash == "#" then
+		if hash == "" then
-			return x == "" and decode_numeric_entity(code, "^%d+$") or
+			return (entities or get_entities())[x .. code]
-				decode_numeric_entity(code, "^%x+$", 16)
+		end
+		local cp
+		if x == "" then
+			cp = match(code, "^()%d+$") and tonumber(code)
+		else
+			cp = match(code, "^()%x+$") and tonumber(code, 16)
 		end
-		entities = entities or load_data("Module:data/entities")
+		return cp and cp < 0x110000 and u(cp) or nil
-		return entities[x .. code]
 	end
 	-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
 	function export.decode_entities(str)
-		return find(str, "&", 1, true) and
+		local amp = find(str, "&", nil, true)
-			gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
+		return amp and find(str, ";", amp, true) and gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
 	end
 end
 do
-	local html_entities
+	local entities
+	local function get_entities()
+		-- Memoized HTML entities (taken from mw.text.lua).
+		entities, get_entities = {
+			["\""] = "&quot;",
+			["&"] = "&amp;",
+			["'"] = "&#039;",
+			["<"] = "&lt;",
+			[">"] = "&gt;",
+			["\194\160"] = "&nbsp;",
+		}, nil
+		return entities
+	end
 	local function encode_entity(ch)
-		local entity = html_entities[ch]
+		local entity = (entities or get_entities())[ch]
-		if entity then
+		if entity == nil then
-			return entity
+			entity = "&#" .. codepoint(ch) .. ";"
+			entities[ch] = entity
 		end
-		entity = "&#" .. codepoint(ch) .. ";"
-		html_entities[ch] = entity
 		return entity
 	end
 	function export.encode_entities(str, charset, str_lib, plain)
-		-- Memoized HTML entities (taken from mw.text.lua).
+		if charset == nil then
-		html_entities = html_entities or {
+			return (gsub(str, "[\"&'<>\194]\160?", entities or get_entities()))
-			["\""] = "&quot;",
+		elseif charset == "" then
-			["&"] = "&amp;",
+			return str
-			["'"] = "&#039;",
+		end
-			["<"] = "&lt;",
+		local pattern = plain and ("[" .. charset_escape(charset) .. "]") or charset == "." and charset or get_charset(charset)
-			[">"] = "&gt;",
+		if not str_lib then
-			["\194\160"] = "&nbsp;",
+			local simple = pattern_simplifier(pattern)
-		}
+			if not simple then
-		if not charset then
+				return (ugsub(str, pattern, encode_entity))
-			return (gsub(str, "[\"&'<>\194]\160?", html_entities))
-		elseif plain then
-			return (gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity))
-		elseif str_lib then
-			if not match(charset, "^()[^\128-\255]*$") then
-				error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
 			end
-			return (gsub(str, "[" .. charset .. "]", encode_entity))
+			pattern = simple
 		end
-		local pattern = charset and "[" .. charset .. "]"
+		return (gsub(str, pattern, encode_entity))
-		local simple = pattern_simplifier(pattern)
-		if simple then
-			return (gsub(str, simple, encode_entity))
-		end
-		return (ugsub(str, pattern, encode_entity))
 	end
 end
@@ Line 787: / Line 1,013: @@
 		enctype = enctype and upper(enctype) or "QUERY"
 		if enctype == "PATH" then
-			return find(str, "%", 1, true) and
+			return find(str, "%", nil, true) and gsub(str, "%%(%x%x)", decode_path) or str
-				gsub(str, "%%(%x%x)", decode_path) or str
 		elseif enctype == "QUERY" then
-			return (find(str, "%", 1, true) or find(str, "+", 1, true)) and
+			return (find(str, "%", nil, true) or find(str, "+", nil, true)) and gsub(str, "([%%%+])(%x?%x?)", decode) or str
-				gsub(str, "([%%%+])(%x?%x?)", decode) or str
 		elseif enctype == "WIKI" then
-			return (find(str, "%", 1, true) or find(str, "_", 1, true)) and
+			return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str
-				gsub(str, "([%%_])(%x?%x?)", decode) or str
 		end
 		error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
@@ Line 802: / Line 1,025: @@
 do
 	local function _remove_comments(str, pre)
-		local head = find(str, "<!--", 1, true)
+		local head = find(str, "<!--", nil, true)
 		if not head then
 			return str
@@ Line 843: / Line 1,066: @@
 end
---[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
+--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{lua|"\0"}}, {{lua|"\t"}}, {{lua|"\n"}}, {{lua|"\v"}}, {{lua|"\r"}} and {{lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
 function export.php_trim(str)
+	-- A frontier pattern with a greedy quantifier is faster than the algorithms used by export.trim, but can only be used if the character set includes \0, since %z matches the start/end of the string, as well as \0. This is also immune to catastrophic backtracking.
 	return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
 end
 php_trim = export.php_trim
---[==[Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} is normalized to {{code|lua|1}} (a number), and {{code|lua|" foo "}} is normalized to {{code|lua|"foo"}}. If the input is not a string, it is returned unchanged.
+--[==[Takes a parameter name as either a string or number, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{lua|frame.args}} table). For example, {{lua|"1"}} (a string) is normalized to {{lua|1}} (a number), {{lua|" foo "}} is normalized to {{lua|"foo"}}, and {{lua|1.5}} (a number) is normalized to {{lua|"1.5"}} (a string). Inputs which cannot be normalized (e.g. booleans) return {{lua|nil}}. If the `no_trim` flag is set, string parameters are not trimmed, but strings may still be converted to numbers if they do not contain whitespace; this is necessary when normalizing keys into the form received by PHP during callbacks, before any trimming occurs (e.g. in the table of arguments when calling {{lua|frame:expandTemplate()}}).
-After being trimmed with {{code|lua|export.php_trim}}, strings are converted to numbers if:
+Strings are trimmed with {{lua|export.php_trim}}, unless the `no_trim` flag is set. They are then converted to numbers if '''all''' of the following are true:
-# They are integers, with no decimals (2.0) or leading zeroes (02).
+# They are integers; i.e. no decimals or leading zeroes (e.g. {{lua|"2"}}, but not {{lua|"2.0"}} or {{lua|"02"}}).
 # They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
-# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
+# There is no leading sign unless < 0 (e.g. {{lua|"2"}} or {{lua|"-2"}}, but not {{lua|"+2"}} or {{lua|"-0"}}).
-function export.scribunto_param_key(key)
+# They contain no leading or trailing whitespace (which may be present when the `no_trim` flag is set).
-	if type(key) ~= "string" then
-		return key
+Numbers are converted to strings if '''either''':
+# They are not integers (e.g. {{lua|1.5}}).
+# They are > 2{{sup|53}} or < -2{{sup|53}}.
+When converted to strings, integers ≤ 2{{sup|63}} and ≥ -2{{sup|63}} are formatted as integers (i.e. all digits are given), which is the range of PHP's integer precision, though the actual output may be imprecise since Lua integers are only precise between -2{{sup|53}} and 2{{sup|53}}. All other numbers use the standard formatting output by {{lua|tostring()}}.]==]
+function export.scribunto_param_key(key, no_trim)
+	local tp = type(key)
+	if tp == "string" then
+		if not no_trim then
+			key = php_trim(key)
+		end
+		if match(key, "^()-?[1-9]%d*$") then
+			local num = tonumber(key)
+			-- Lua integers are only precise to 2^53 - 1, so specifically check for 2^53 and -2^53 as strings, since a numerical comparison won't work as it can't distinguish 2^53 from 2^53 + 1.
+			return (
+				num <= 9007199254740991 and num >= -9007199254740991 or
+				key == "9007199254740992" or
+				key == "-9007199254740992"
+			) and num or key
+		end
+		return key == "0" and 0 or key
+	elseif tp == "number" then
+		-- No special handling needed for inf or NaN.
+		return key % 1 == 0 and (
+			key <= 9007199254740992 and key >= -9007199254740992 and key or
+			key <= 9223372036854775808 and key >= -9223372036854775808 and format("%d", key)
+		) or tostring(key)
 	end
-	key = php_trim(key)
+	return nil
-	if match(key, "^-?[1-9]%d*$") then
-		local num = tonumber(key)
-		-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true.
-		return (
-			num <= 9007199254740991 and num >= -9007199254740991 or
-			key == "9007199254740992" or
-			key == "-9007199254740992"
-		) and num or key
-	elseif key == "0" then
-		return 0
-	end
-	return key
 end
 do
 	local byte_escapes
+	local function get_byte_escapes()
+		byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil
+		return byte_escapes
+	end
 	local function escape_byte(b)
-		return byte_escapes[b] or format("\\%03d", byte(b))
+		return (byte_escapes or get_byte_escapes())[b] or format("\\%03d", byte(b))
 	end
 	function export.escape_bytes(str)
-		byte_escapes = byte_escapes or load_data("Module:string utilities/data").byte_escapes
 		return (gsub(str, ".", escape_byte))
 	end
@@ Line 892: / Line 1,134: @@
 			return name == "op" and "{" or
 				name == "cl" and "}" or
-				error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
+				error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'")
 		elseif fun(name) and type(fun(name)) ~= "string" then
-			error(module_name .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
+			error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
 		end
-		return fun(name) or error(module_name .. ".format: \"" .. name .. "\" not found in table")
+		return fun(name) or error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table")
 	end))
 end
 format_fun = export.format_fun
---[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
+--[==[This function, unlike {{lua|string.format}} and {{lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{lua|{param_name}}} in the format string with the table's entry for {{lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
 ====Examples====
-* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
+* {{lua|=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
-*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
+*: produces: {{lua|"one fish, two fish, red fish, blue fish"}}
-* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
+* {{lua|=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
-*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
+*: produces: {{lua|"The set {1, 2, 3} contains three elements."}}
 *:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
 function export.format(str, tbl)
@@ Line 957: / Line 1,199: @@
 end
-do
+function export.pluralize(...) -- To be removed once all calling modules have been changed to call Module:en-utilities directly.
-	local function word_ends_in_consonant_plus_y(str)
+	export.pluralize = require("Module:en-utilities").pluralize
-		-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
+	return export.pluralize(...)
-		-- apply to proper nouns, hence "the Gettys", "the public Ivys".
-		-- We should maybe consider applying this rule here; but it may not
-		-- be important as this function is almost always called on common nouns
-		-- (e.g. parts of speech, place types).
-		return find(str, "[^aeiouyAEIOUY ]y$")
-	end
-	local function word_takes_es_plural(str)
-		return find(str, "[sxz]$") or find(str, "[csz]h$")
-	end
-	local function do_pluralize(str)
-		if word_ends_in_consonant_plus_y(str) then
-			-- avoid returning multiple values
-			return (gsub(str, "y$", "ies"))
-		elseif word_takes_es_plural(str) then
-			return str .. "es"
-		end
-		return str .. "s"
-	end
-	--[==[
-	Pluralize a word in a smart fashion, according to normal English rules.
-	# If word ends in consonant + -y, replace the -y with -ies.
-	# If the word ends in -s, -x, -z, -ch, -sh, -zh, add -es.
-	# Otherwise, add -s.
-	This handles links correctly:
-	# If a piped link, change the second part appropriately.
-	# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
-	# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
-	]==]
-	function export.pluralize(str)
-		if type(str) == "table" then
-			-- allow calling from a template
-			str = str.args[1]
-		end
-		-- Check for a link. This pattern matches both piped and unpiped links.
-		-- If the link is not piped, the second capture (linktext) will be empty.
-		local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
-		if not link then
-			return do_pluralize(str)
-		elseif linktext ~= "" then
-			return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
-		elseif word_ends_in_consonant_plus_y(link) then
-			return beginning .. "[[" .. link .. "|" .. gsub(link, "y$", "ies") .. "]]"
-		end
-		return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
-	end
 end
@@ Line 1,072: / Line 1,265: @@
 function export.get_indefinite_article(str, ucfirst)
 	str = str or ""
-	local is_vowel = false
 	-- If there's a link at the beginning, examine the first letter of the
 	-- link text. This pattern matches both piped and unpiped links.
 	-- If the link is not piped, the second capture (linktext) will be empty.
 	local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
-	if link then
+	if match(link and (linktext ~= "" and linktext or link) or str, "^()[AEIOUaeiou]") then
-		is_vowel = find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
+		return ucfirst and "An" or "an"
-	else
-		is_vowel = find(str, "^[AEIOUaeiou]")
 	end
-	return is_vowel and (ucfirst and "An" or "an") or (ucfirst and "A" or "a")
+	return ucfirst and "A" or "a"
 end
 get_indefinite_article = export.get_indefinite_article