Module:string utilities: Difference between revisions

No edit summary
make ucfirst() and lcfirst() work in the presence of wrapping HTML spans and such, and document them; reformat remaining documentation to 120 chars per line max
Line 4: Line 4:
local load_module = "Module:load"
local load_module = "Module:load"
local memoize_module = "Module:memoize"
local memoize_module = "Module:memoize"
local string_char_module = "Module:string/char"
local string_charset_escape_module = "Module:string/charsetEscape"


local mw = mw
local mw = mw
Line 17: Line 19:
local gmatch = string.gmatch
local gmatch = string.gmatch
local gsub = string.gsub
local gsub = string.gsub
local insert = table.insert
local len = string.len
local len = string.len
local lower = string.lower
local lower = string.lower
Line 36: Line 39:
local ulower = ustring.lower
local ulower = ustring.lower
local umatch = ustring.match
local umatch = ustring.match
local unpack = unpack
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
local upper = string.upper
local upper = string.upper
local usub = ustring.sub
local usub = ustring.sub
Line 44: Line 47:


-- Defined below.
-- Defined below.
local charset_escape
local codepoint
local codepoint
local explode_utf8
local explode_utf8
local format_fun
local format_fun
local get_charset
local get_charset
local get_indefinite_article
local gsplit
local gsplit
local pattern_escape
local pattern_escape
local pattern_simplifier
local pattern_simplifier
local php_trim
local replacement_escape
local replacement_escape
local title_case
local trim
local trim
local u
local ucfirst
local ulen
local ulen


--[==[
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures
modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no
overhead after the first call, since the target functions are called directly in any subsequent calls.
]==]
local function charset_escape(...)
charset_escape = require(string_charset_escape_module)
return charset_escape(...)
end
 
local function is_callable(...)
local function is_callable(...)
is_callable = require(function_module).is_callable
is_callable = require(function_module).is_callable
Line 69: Line 78:
load_data = require(load_module).load_data
load_data = require(load_module).load_data
return load_data(...)
return load_data(...)
end
local function u(...)
u = require(string_char_module)
return u(...)
end
end


Line 84: Line 98:
end
end


--[==[Returns {nil} if the input value is the empty string, or otherwise the same value.
--[==[
Returns {nil} if the input value is the empty string, or otherwise the same value.


If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is the empty string, returns {nil}.
If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is
the empty string, returns {nil}.


If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also be embedded this way (e.g. {"''foo''"} returns {"'foo'"}).]==]
If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input
string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation
marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also
be embedded this way (e.g. {"''foo''"} returns {"'foo'"}).
]==]
function export.is_not_empty(str, do_trim, quote_delimiters)
function export.is_not_empty(str, do_trim, quote_delimiters)
if str == "" then
if str == "" then
Line 103: Line 123:
end
end


--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
--[==[
Explodes a string into an array of UTF-8 characters. '''Warning''': this function assumes that the input is valid UTF-8
in order to optimize speed and memory use. Passing in an input containing non-UTF-8 byte sequences could result in
unexpected behaviour.
]==]
function export.explode_utf8(str)
function export.explode_utf8(str)
local text, i = {}, 0
local text, i = {}, 0
Line 113: Line 137:
end
end
explode_utf8 = export.explode_utf8
explode_utf8 = export.explode_utf8
--[==[
Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true:
* It has the expected number of bytes, which is determined by value of the leading byte: 1-byte characters are `0x00` to
  `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte
  characters start with `0xF0` to `0xF4`.
* The leading byte must not fall outside of the above ranges.
* The trailing byte(s) (if any), must be between `0x80` to `0xBF`.
* The character's codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`).
* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to
  U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings
  that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte
  character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000).
  Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but
  `0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`.
If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in
UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of
surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher
codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances
where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly
4 hexadecimal digits; under the scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates,
even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`).
]==]
function export.isutf8(str, allow_surrogates)
for ch in gmatch(str, "[\128-\255][\128-\191]*") do
if #ch > 4 then
return false
end
local b1, b2, b3, b4 = byte(ch, 1, 4)
if not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
return false -- 1-byte is always invalid, as gmatch excludes 0x00 to 0x7F
elseif not b3 then -- 2-byte
if not (b1 >= 0xC2 and b1 <= 0xDF) then -- b1 == 0xC0 or b1 == 0xC1 is overlong
return false
end
elseif not (b3 >= 0x80 and b3 <= 0xBF) then -- trailing byte
return false
elseif not b4 then -- 3-byte
if b1 > 0xEF then
return false
elseif b2 < 0xA0 then
if b1 < 0xE1 then -- b1 == 0xE0 and b2 < 0xA0 is overlong
return false
end
elseif b1 < 0xE0 or (b1 == 0xED and not allow_surrogates) then -- b1 == 0xED and b2 >= 0xA0 is a surrogate
return false
end
elseif not (b4 >= 0x80 and b4 <= 0xBF) then -- 4-byte
return false
elseif b2 < 0x90 then
if not (b1 >= 0xF1 and b1 <= 0xF4) then -- b1 == 0xF0 and b2 < 0x90 is overlong
return false
end
elseif not (b1 >= 0xF0 and b1 <= 0xF3) then -- b1 == 0xF4 and b2 >= 0x90 is too high
return false
end
end
return true
end


do
do
Line 125: Line 209:
}, charset_chars)
}, charset_chars)


--[==[Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example, {"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
--[==[
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's
version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example,
{"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving
arbitrary text (e.g. from user input).
]==]
function export.pattern_escape(str)
function export.pattern_escape(str)
return (gsub(str, "[%z$%%()*+%-.?[%]^]", chars))
return (gsub(str, "[%z$%%()*+%-.?[%]^]", chars))
Line 131: Line 220:
pattern_escape = export.pattern_escape
pattern_escape = export.pattern_escape


--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: {%-]^}, and converts the null character to {%z}.]==]
--[==[
function export.charset_escape(str)
Escapes only {%}, which is the only magic character used in replacement
return (gsub(str, "[%z%%%-%]^]", charset_chars))
[[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.
end
]==]
charset_escape = export.charset_escape
 
--[==[Escapes only {%}, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
function export.replacement_escape(str)
function export.replacement_escape(str)
return (gsub(str, "%%", "%%%%"))
return (gsub(str, "%%", "%%%%"))
Line 186: Line 272:


--[==[
--[==[
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns any pattern matching facilities off in the optional pattern supplied.]==]
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes
all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second
argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns
any pattern matching facilities off in the optional pattern supplied.
]==]
function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
if pattern_or_func == nil then
if pattern_or_func == nil then
Line 303: Line 393:
end
end
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
--[==[
Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion
isn't possible, returns false.
]==]
function pattern_simplifier(pattern)
function pattern_simplifier(pattern)
if type(pattern) == "number" then
if type(pattern) == "number" then
Line 525: Line 618:
end
end


--[==[Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}).
--[==[
Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring
library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}).


The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used (e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary characters.]==]
The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used
(e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary
characters.
]==]
function get_charset(charset)
function get_charset(charset)
if type(charset) == "number" then
if type(charset) == "number" then
Line 569: Line 667:
start = nxt_pos
start = nxt_pos
nxt_pos = nxt_pos + 2
nxt_pos = nxt_pos + 2
-- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is omitted if the range would be empty (i.e. if the first byte is greater than the second).
-- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be
-- adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is
-- omitted if the range would be empty (i.e. if the first byte is greater than the second).
else
else
n = n + 1
n = n + 1
Line 683: Line 783:
end
end


--[==[Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.]==]
--[==[
Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.
]==]
function export.plain_gsub(str, pattern, repl, n)
function export.plain_gsub(str, pattern, repl, n)
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n)
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n)
end
end


--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
--[==[
Reverses a UTF-8 string; equivalent to string.reverse.
]==]
function export.reverse(str)
function export.reverse(str)
return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse)))
return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse)))
end
function export.char(...) -- To be moved to [[Module:string/char]].
return u(...)
end
end


do
do
local function err(cp)
local function utf8_err(func_name)
error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2)
error(format("bad argument #1 to '%s' (string is not UTF-8)", func_name), 4)
end
 
local function utf8_char(cp)
cp = tonumber(cp)
if cp < 0 then
err(format("-0x%X", -cp))
elseif cp < 0x80 then
return char(cp)
elseif cp < 0x800 then
return char(
0xC0 + cp / 0x40,
0x80 + cp % 0x40
)
elseif cp < 0x10000 then
if cp >= 0xD800 and cp < 0xE000 then
return "?" -- mw.ustring.char returns "?" for surrogates.
end
return char(
0xE0 + cp / 0x1000,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
elseif cp < 0x110000 then
return char(
0xF0 + cp / 0x40000,
0x80 + cp / 0x1000 % 0x40,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
end
err(format("0x%X", cp))
end
end


function export.char(cp, ...)
local function get_codepoint(func_name, b1, b2, b3, b4)
if ... == nil then
if b1 <= 0x7F then
return utf8_char(cp)
end
local ret = {cp, ...}
for i = 1, select("#", cp, ...) do
ret[i] = utf8_char(ret[i])
end
return concat(ret)
end
u = export.char
end
 
do
local function get_codepoint(b1, b2, b3, b4)
if b1 < 128 then
return b1, 1
return b1, 1
elseif b1 < 224 then
elseif not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
return 0x40 * b1 + b2 - 0x3080, 2
utf8_err(func_name)
elseif b1 < 240 then
elseif b1 <= 0xDF then
return 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080, 3
local cp = 0x40 * b1 + b2 - 0x3080
return cp >= 0x80 and cp or utf8_err(func_name), 2
elseif not (b3 and b3 >= 0x80 and b3 <= 0xBF) then
utf8_err(func_name)
elseif b1 <= 0xEF then
local cp = 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080
return cp >= 0x800 and cp or utf8_err(func_name), 3
elseif not (b4 and b4 >= 0x80 and b4 <= 0xBF) then
utf8_err(func_name)
end
end
return 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080, 4
local cp = 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080
return cp >= 0x10000 and cp <= 0x10FFFF and cp or utf8_err(func_name), 4
end
end


function export.codepoint(str, i, j)
function export.codepoint(str, i, j)
if type(str) == "number" then
if str == "" then
return -- return nothing
elseif type(str) == "number" then
return byte(str, i, j)
return byte(str, i, j)
end
end
i, j = i or 1, j == -1 and #str or i or 1
i, j = i or 1, j == -1 and #str or i or 1
if i == 1 and j == 1 then
if i == 1 and j == 1 then
return (get_codepoint(byte(str, 1, 4)))
return (get_codepoint("codepoint", byte(str, 1, 4)))
elseif i < 0 or j < 0 then
elseif i < 0 or j < 0 then
return ucodepoint(str, i, j) -- FIXME
return ucodepoint(str, i, j) -- FIXME
Line 777: Line 851:
nr = nr + 1
nr = nr + 1
local add
local add
ret[nr], add = get_codepoint(b1, b2, b3, b4)
ret[nr], add = get_codepoint("codepoint", b1, b2, b3, b4)
nb = nb + add
nb = nb + add
end
end
Line 809: Line 883:
return nil
return nil
end
end
local ret, add = get_codepoint(b1, b2, b3, b4)
local ret, add = get_codepoint("gcodepoint", b1, b2, b3, b4)
nb = nb + add
nb = nb + add
return ret
return ret
Line 816: Line 890:
end
end


--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
do
function export.lower(str)
local _ulower = ulower
return (match(str, "^()[^\128-\255]*$") and lower or ulower)(str)
 
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.lower(str)
return (match(str, "^()[^\128-\255]*$") and lower or _ulower)(str)
end
end
end


--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
do
function export.upper(str)
local _uupper = uupper
return (match(str, "^()[^\128-\255]*$") and upper or uupper)(str)
 
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.upper(str)
return (match(str, "^()[^\128-\255]*$") and upper or _uupper)(str)
end
end
end


Line 841: Line 923:
end
end
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.
--[==[
Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like
Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by
one character at a time; Python returns the whole remainder of the string). When possible, it will use the string
library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the
string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.
In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil if there are no further matches. By default, the start index will be calculated using the ustring library, unless `str_lib` or `plain` is set.]==]
In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start
index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil
if there are no further matches. By default, the start index will be calculated using the ustring library, unless
`str_lib` or `plain` is set.
]==]
function export.split(str, pattern_or_func, str_lib, plain)
function export.split(str, pattern_or_func, str_lib, plain)
local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0
local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0
Line 854: Line 945:
end
end


--[==[Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the string up the splitting pattern, with any capture groups being returned as additional values on that iteration.]==]
--[==[
Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the
string up the splitting pattern, with any capture groups being returned as additional values on that iteration.
]==]
function export.gsplit(str, pattern_or_func, str_lib, plain)
function export.gsplit(str, pattern_or_func, str_lib, plain)
local start, final, str_len, _string, callable = 1
local start, final, str_len, _string, callable = 1
Line 926: Line 1,020:
end
end
gsplit = export.gsplit
gsplit = export.gsplit
function export.count(str, pattern, plain)
if plain then
return select(2, gsub(str, pattern_escape(pattern), ""))
end
local simple = pattern_simplifier(pattern)
if simple then
return select(2, gsub(str, pattern, ""))
end
return select(2, ugsub(str, pattern, ""))
end


function export.trim(str, charset, str_lib, plain)
function export.trim(str, charset, str_lib, plain)
if charset == nil then
if charset == nil then
-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to "" first.
-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are
-- very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to ""
-- first.
return match(gsub(str, "^%s*", ""), "^.*%S") or ""
return match(gsub(str, "^%s*", ""), "^.*%S") or ""
elseif charset == "" then
elseif charset == "" then
Line 935: Line 1,042:
end
end
charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there would be two callbacks into PHP, which is slower.
-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets
-- are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there
-- would be two callbacks into PHP, which is slower.
local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
if not str_lib then
if not str_lib then
Line 965: Line 1,074:
cp = match(code, "^()%x+$") and tonumber(code, 16)
cp = match(code, "^()%x+$") and tonumber(code, 16)
end
end
return cp and cp < 0x110000 and u(cp) or nil
return cp and (cp <= 0xD7FF or cp >= 0xE000 and cp <= 0x10FFFF) and u(cp) or nil
end
end


-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases
-- which have also been included in [[Module:data/entities]].
function export.decode_entities(str)
function export.decode_entities(str)
local amp = find(str, "&", nil, true)
local amp = find(str, "&", nil, true)
Line 989: Line 1,099:
return entities
return entities
end
end
 
local function encode_entity(ch)
local function encode_entity(ch)
local entity = (entities or get_entities())[ch]
local entity = (entities or get_entities())[ch]
if entity == nil then
if entity == nil then
entity = "&#" .. codepoint(ch) .. ";"
local cp = codepoint(ch)
-- U+D800 to U+DFFF are surrogates, so can't be encoded as entities.
entity = cp and (cp <= 0xD7FF or cp >= 0xE000) and format("&#%d;", cp) or false
entities[ch] = entity
entities[ch] = entity
end
end
return entity
return entity or nil
end
end
 
function export.encode_entities(str, charset, str_lib, plain)
function export.encode_entities(str, charset, str_lib, plain)
if charset == nil then
if charset == nil then
Line 1,040: Line 1,152:
return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str
return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str
end
end
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
error("bad argument #2 to 'decode_uri' (expected QUERY, PATH, or WIKI)", 2)
end
end
end
end
Line 1,068: Line 1,180:
end
end
--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
--[==[
* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
Removes any HTML comments from the input text. `stage` can be one of three options:
* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {"PRE"} method will have already been applied by the native parser.
* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all
* {"BOTH"} applies {"PRE"} then {"POST"}.]==]
  {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed
  {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or
  [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the
  preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags);
  if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops
  over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g.
  {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed
  {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs,
  where the {"PRE"} method will have already been applied by the native parser.
* {"BOTH"} applies {"PRE"} then {"POST"}.
]==]
function export.remove_comments(str, stage)
function export.remove_comments(str, stage)
if not stage or stage == "PRE" then
if not stage or stage == "PRE" then
Line 1,078: Line 1,201:
local processed = stage == "POST" and _remove_comments(str) or
local processed = stage == "POST" and _remove_comments(str) or
stage == "BOTH" and _remove_comments(str, true) or
stage == "BOTH" and _remove_comments(str, true) or
error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2)
error("bad argument #2 to 'remove_comments' (expected PRE, POST, or BOTH)", 2)
while processed ~= str do
while processed ~= str do
str = processed
str = processed
Line 1,085: Line 1,208:
return str
return str
end
end
end
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {"\0"}, {"\t"}, {"\n"}, {"\v"}, {"\r"} and {" "}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
function export.php_trim(str)
-- A frontier pattern with a greedy quantifier is faster than the algorithms used by export.trim, but can be only be used if the character set includes \0, since %z matches the start/end of the string, as well as \0. This is also immune to catastrophic backtracking.
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
end
php_trim = export.php_trim
--[==[Takes a parameter name as either a string or number, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {frame.args} table). For example, {"1"} (a string) is normalized to {1} (a number), {" foo "} is normalized to {"foo"}, and {1.5} (a number) is normalized to {"1.5"} (a string). Inputs which cannot be normalized (e.g. booleans) return {nil}. If the `no_trim` flag is set, string parameters are not trimmed, but strings may still be converted to numbers if they do not contain whitespace; this is necessary when normalizing keys into the form received by PHP during callbacks, before any trimming occurs (e.g. in the table of arguments when calling {frame:expandTemplates()}).
Strings are trimmed with {export.php_trim}, unless the `no_trim` flag is set. They are then converted to numbers if '''all''' of the following are true:
# They are integers; i.e. no decimals or leading zeroes (e.g. {"2"}, but not {"2.0"} or {"02"}).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# There is no leading sign unless < 0 (e.g. {"2"} or {"-2"}, but not {"+2"} or {"-0"}).
# They contain no leading or trailing whitespace (which may be present when the `no_trim` flag is set).
Numbers are converted to strings if '''either''':
# They are not integers (e.g. {1.5}).
# They are > 2{{sup|53}} or < -2{{sup|53}}.
When converted to strings, integers ≤ 2{{sup|63}} and ≥ -2{{sup|63}} are formatted as integers (i.e. all digits are given), which is the range of PHP's integer precision, though the actual output may be imprecise since Lua's integer precision is > 2{{sup|53}} to < -2{{sup|53}}. All other numbers use the standard formatting output by {tostring()}.]==]
function export.scribunto_param_key(key, no_trim)
local tp = type(key)
if tp == "string" then
if not no_trim then
key = php_trim(key)
end
if match(key, "^()-?[1-9]%d*$") then
local num = tonumber(key)
-- Lua integers are only precise to 2^53 - 1, so specifically check for 2^53 and -2^53 as strings, since a numerical comparison won't work as it can't distinguish 2^53 from 2^53 + 1.
return (
num <= 9007199254740991 and num >= -9007199254740991 or
key == "9007199254740992" or
key == "-9007199254740992"
) and num or key
end
return key == "0" and 0 or key
elseif tp == "number" then
-- No special handling needed for inf or NaN.
return key % 1 == 0 and (
key <= 9007199254740992 and key >= -9007199254740992 and key or
key <= 9223372036854775808 and key >= -9223372036854775808 and format("%d", key)
) or tostring(key)
end
return nil
end
end


do
do
local byte_escapes
local byte_escapes
local function get_byte_escapes()
local function get_byte_escapes()
byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil
byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil
Line 1,164: Line 1,240:
format_fun = export.format_fun
format_fun = export.format_fun


--[==[This function, unlike {string.format} and {mw.ustring.format}, takes just two parameters—a format string and a table—and replaces all instances of { {param_name} } in the format string with the table's entry for {param_name}. The opening and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
--[==[
This function, unlike {string.format} and {mw.ustring.format}, takes just two parameters, a format string and a table,
and replaces all instances of { {param_name} } in the format string with the table's entry for {param_name}. The opening
and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry beginning with a
slash can be escaped by doubling the initial slash.


====Examples====
====Examples====
Line 1,171: Line 1,251:
* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}
* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}
*: produces: {"The set {1, 2, 3} contains three elements."}
*: produces: {"The set {1, 2, 3} contains three elements."}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.
]==]
function export.format(str, tbl)
function export.format(str, tbl)
return format_fun(str, function(key)
return format_fun(str, function(key)
Line 1,180: Line 1,261:
do
	-- Re-case the first letter of `str` using `case_func` (e.g. mw.ustring.upper or mw.ustring.lower).
	-- The pattern grabs the first byte plus any UTF-8 continuation bytes (0x80-0xBF), i.e. one full
	-- UTF-8 character, so multibyte characters are re-cased as a unit.
	local function do_uclcfirst(str, case_func)
		-- Re-case the first letter.
		local first, remainder = match(str, "^(.[\128-\191]*)(.*)")
		return first and (case_func(first) .. remainder) or ""
	end

	-- Shared implementation of ucfirst()/lcfirst(): skips over leading HTML tags and re-cases
	-- the first letter of the first link (piped or unpiped) or of the bare text.
	local function uclcfirst(str, case_func)
		-- Strip off any HTML tags at the beginning. This currently does not handle comments or <ref>...</ref>
		-- correctly; it's intended for text wrapped in <span> or the like, as happens when passing text through
		-- [[Module:links]].
		local html_at_beginning = nil
		if str:match("^<") then
			while true do
				local html_tag, rest = str:match("^(<.->)(.*)$")
				if not html_tag then
					break
				end
				if not html_at_beginning then
					html_at_beginning = {}
				end
				insert(html_at_beginning, html_tag)
				str = rest
			end
		end
		-- If there's a link at the beginning, re-case the first letter of the
		-- link text. This pattern matches both piped and unpiped links.
		-- If the link is not piped, the second capture (linktext) will be empty.
		local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
		local retval
		if link then
			retval = "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
		else
			retval = do_uclcfirst(str, case_func)
		end
		if html_at_beginning then
			-- Reattach the HTML tags stripped off above.
			retval = concat(html_at_beginning) .. retval
		end
		return retval
	end

	--[==[
	Uppercase the first character of the input string, correctly handling one-part and two-part links, optionally
	surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
	uppercase the first character of text that may include links that have been passed through `full_link()` in
	[[Module:links]] or a similar function.
	]==]
	function export.ucfirst(str)
		return uclcfirst(str, uupper)
	end
	ucfirst = export.ucfirst

	--[==[
	Lowercase the first character of the input string, correctly handling one-part and two-part links, optionally
	surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
	lowercase the first character of text that may include links that have been passed through `full_link()` in
	[[Module:links]] or a similar function.
	]==]
	function export.lcfirst(str)
		return uclcfirst(str, ulower)
	end

	-- gsub callback for capitalize(): uppercase the first letter of one word.
	local function capitalize(w)
		return uclcfirst(w, uupper)
	end

	--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==]
	function export.capitalize(str)
		if type(str) == "table" then
			-- allow calling from a template
			str = str.args[1]
		end
		-- Capitalize multi-word that is separated by spaces
		-- by uppercasing the first letter of each part.
		return (ugsub(str, "%S+", capitalize))
	end
end


function export.pluralize(...) -- To be removed once all calling modules have been changed to call Module:en-utilities directly.
	export.pluralize = require("Module:en-utilities").pluralize
	return export.pluralize(...)
end

-- gsub callback for title_case(): receives the first letter and the rest of a word;
-- uppercases the first letter and lowercases the remainder.
local function do_title_case(first, remainder)
	first = uupper(first)
	return remainder == "" and first or (first .. ulower(remainder))
end

--[==[
Capitalizes each word of the input string, with any further letters in each word being converted to lowercase.
]==]
function export.title_case(str)
	return str == "" and "" or ugsub(str, "(%w)(%w*)", do_title_case)
end
title_case = export.title_case

do
	-- Core singularization rules, applied to bare text (link handling is done by the caller).
	local function do_singularize(str)
		local sing = match(str, "^(.-)ies$")
		if sing then
			return sing .. "y"
		end
		-- Handle cases like "[[parish]]es"
		return match(str, "^(.-[cs]h%]*)es$") or -- not -zhes
			-- Handle cases like "[[box]]es"
			match(str, "^(.-x%]*)es$") or -- not -ses or -zes
			-- Handle regular plurals
			match(str, "^(.-)s$") or
			-- Otherwise, return input
			str
	end

	-- Rebuild a two-part link, collapsing it to a simple link when both parts are the same.
	local function collapse_link(link, linktext)
		if link == linktext then
			return "[[" .. link .. "]]"
		end
		return "[[" .. link .. "|" .. linktext .. "]]"
	end

	--[==[
	Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.

	'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" ->
	"passe", "eyries" -> "eyry".
	# If word ends in -ies, replace -ies with -y.
	# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
	# Otherwise, remove -s.

	This handles links correctly:
	# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up
	  the same.
	# If a non-piped link, singularize the link.
	# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ]
	  characters between the 'sh' etc. and final -es.
	]==]
	function export.singularize(str)
		if type(str) == "table" then
			-- allow calling from a template
			str = str.args[1]
		end
		-- Check for a link. This pattern matches both piped and unpiped links.
		-- If the link is not piped, the second capture (linktext) will be empty.
		local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
		if not link then
			return do_singularize(str)
		elseif linktext ~= "" then
			return beginning .. collapse_link(link, do_singularize(linktext))
		end
		return beginning .. "[[" .. do_singularize(link) .. "]]"
	end
end

--[==[
Converts the input string to {{w|Camel case|CamelCase}}. Any non-word characters are treated as breaks between
words. If `lower_first` is set, then the first character of the string will be lowercase (e.g. camelCase).
]==]
function export.camel_case(str, lower_first)
	str = ugsub(str, "%W*(%w*)", title_case)
	if lower_first then
		-- Lowercase the first character, matching one full UTF-8 sequence
		-- (lead byte plus any continuation bytes 0x80-0xBF).
		local first, remainder = match(str, "^(.[\128-\191]*)(.*)")
		str = first and (ulower(first) .. remainder) or ""
	end
	return str
end


do
	-- gsub callback for snake_case(): joins each word to the preceding text with "_",
	-- except at the very start of the string (where `nonword` is empty).
	local function do_snake_case(nonword, word)
		return nonword == "" and word or "_" .. word
	end

	--[==[
	Converts the input string to {{w|Snake case|snake_case}}. Any non-word characters are treated as breaks between
	words.
	]==]
	function export.snake_case(str)
		return (ugsub(str, "(%W*)(%w*)", do_snake_case))
	end
end

--[==[
Return the appropriate indefinite article to prefix to `str`. Correctly handles links and capitalized text.
Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning with
a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.get_indefinite_article(str, ucfirst)
	str = str or ""
	-- If there's a link at the beginning, examine the first letter of the
	-- link text. This pattern matches both piped and unpiped links.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
	if match(link and (linktext ~= "" and linktext or link) or str, "^()[AEIOUaeiou]") then
		return ucfirst and "An" or "an"
	end
	return ucfirst and "A" or "a"
end
get_indefinite_article = export.get_indefinite_article

--[==[
Prefix `text` with the appropriate indefinite article to prefix to `text`. Correctly handles links and capitalized
text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning
with a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.add_indefinite_article(text, ucfirst)
	return get_indefinite_article(text, ucfirst) .. " " .. text
end


-- Export the module table (deduplicated: the diff rendering repeated this line).
return export