Module:string utilities: Difference between revisions

From Linguifex
Jump to navigation Jump to search
No edit summary
Tag: Reverted
m 1 revision imported
 
(5 intermediate revisions by 2 users not shown)
Line 1: Line 1:
local module_name = "string_utilities"
local export = {}
local export = {}


local format_escapes = {
local function_module = "Module:fun"
    ["op"] = "{",
local load_module = "Module:load"
    ["cl"] = "}",
local memoize_module = "Module:memoize"
}
local string_char_module = "Module:string/char"
local string_charset_escape_module = "Module:string/charsetEscape"


function export.format_fun(str, fun)
local mw = mw
    return (str:gsub("{(\\?)((\\?)[^{}]*)}", function (p1, name, p2)
local string = string
        if #p1 + #p2 == 1 then
local table = table
            return format_escapes[name] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
local ustring = mw.ustring
        else
 
        if fun(name) and type(fun(name)) ~= "string" then
local byte = string.byte
        error(module_name .. ".format: '" .. name .. "' is a " .. type(fun(name)) .. ", not a string")
local char = string.char
        end
local concat = table.concat
            return fun(name) or error(module_name .. ".format: '" .. name .. "' not found in table")
local find = string.find
        end
local format = string.format
    end))
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local len = string.len
local lower = string.lower
local match = string.match
local next = next
local require = require
local reverse = string.reverse
local select = select
local sort = table.sort
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ucodepoint = ustring.codepoint
local ufind = ustring.find
local ugcodepoint = ustring.gcodepoint
local ugmatch = ustring.gmatch
local ugsub = ustring.gsub
local ulower = ustring.lower
local umatch = ustring.match
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
local upper = string.upper
local usub = ustring.sub
local uupper = ustring.upper
 
local memoize = require(memoize_module)
 
-- Defined below.
local codepoint
local explode_utf8
local format_fun
local get_charset
local gsplit
local pattern_escape
local pattern_simplifier
local replacement_escape
local title_case
local trim
local ucfirst
local ulen
 
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures
modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no
overhead after the first call, since the target functions are called directly in any subsequent calls.
]==]
local function charset_escape(...)
    -- First call only: load the target module, rebind this local to the real
    -- function so later calls skip the loader, then forward the arguments.
    local loaded = require(string_charset_escape_module)
    charset_escape = loaded
    return loaded(...)
end
 
local function is_callable(...)
    -- First call only: fetch `is_callable` from the function module, rebind
    -- this local to it, then forward the arguments.
    local loaded = require(function_module).is_callable
    is_callable = loaded
    return loaded(...)
end
 
local function load_data(...)
    -- First call only: fetch `load_data` from the load module, rebind this
    -- local to it, then forward the arguments.
    local loaded = require(load_module).load_data
    load_data = loaded
    return loaded(...)
end
end


--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
local function u(...)
====Examples====
u = require(string_char_module)
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
return u(...)
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
    -- Adapt the table to the lookup-function interface taken by format_fun:
    -- each placeholder name is resolved by indexing `tbl`.
    local function lookup(key)
        return tbl[key]
    end
    return export.format_fun(str, lookup)
end
end


-- A helper function which takes a string, position and type ("byte" or "char"), and returns the equivalent position for the other type (e.g. iterate_utf8("字典", 2, "char") returns 4, because character 2 of "字典" begins with byte 4). `pos` can be positive or negative, and the function will iterate over the string forwards or backwards (respectively) until it reaches the input position. It checks byte-by-byte, skipping over trailing bytes and calculating the expected byte trail for any leading byte (i.e. how many trailing bytes should follow); these trailing bytes are then checked together.
local function prepare_iter(str, pattern, str_lib, plain)
-- The optional parameters `init_from_type` and `init_to_type` can be used to start part-way through an iteration to improve performance, if multiple values need to be returned from the same string. For example, iterate_utf8("слова́рь", 11, "byte", 5, 3) will begin checking at byte 5/the start of character 3. Note: The function won't check if these values match each other (as the only way to do this would be to run the iteration from the beginning), so mismatched values will return incorrect results.
local callable = is_callable(pattern)
local function iterate_utf8(text, pos, from_type, init_from_type, init_to_type)
if str_lib or plain then
-- Position 0 is always valid and never changes.
return pattern, #str, string, callable
if pos == 0 then
elseif not callable then
return pos
local simple = pattern_simplifier(pattern)
if simple then
return simple, #str, string, false
end
end
end
return pattern, ulen(str), ustring, callable
local to_type
end
if from_type == "char" then
 
to_type = "byte"
--[==[
else
Returns {nil} if the input value is the empty string, or otherwise the same value.
to_type = "char"
 
If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is
the empty string, returns {nil}.
 
If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input
string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation
marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also
be embedded this way (e.g. {"''foo''"} returns {"'foo'"}).
]==]
function export.is_not_empty(str, do_trim, quote_delimiters)
if str == "" then
return nil
elseif not (str and type(str) == "string") then
return str
elseif do_trim then
str = trim(str)
if str == "" then
return nil
end
end
end
return quote_delimiters and gsub(str, "^(['\"])(.*)%1$", "%2") or str
-- Positive positions iterate forwards; negative positions iterate backwards.
end
local iterate_val
 
if pos > 0 then
--[==[
iterate_val = 1
Explodes a string into an array of UTF-8 characters. '''Warning''': this function assumes that the input is valid UTF-8
else
in order to optimize speed and memory use. Passing in an input containing non-UTF-8 byte sequences could result in
iterate_val = -1
unexpected behaviour.
]==]
function export.explode_utf8(str)
local text, i = {}, 0
for ch in gmatch(str, ".[\128-\191]*") do
i = i + 1
text[i] = ch
end
end
return text
-- Adjust init_from_type and init_to_type to the iteration before, so that matches for the position given by them will work.
end
local trail, cp, min, b = 0
explode_utf8 = export.explode_utf8
local c, leading_byte = {}
 
c[from_type] = init_from_type and init_from_type ~= 0 and init_from_type - iterate_val or 0
--[==[
c[to_type] = init_to_type and init_to_type ~= 0 and init_to_type - iterate_val or 0
Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true:
* It has the expected number of bytes, which is determined by value of the leading byte: 1-byte characters are `0x00` to
while true do
  `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte
if pos > 0 then
  characters start with `0xF0` to `0xF4`.
b = text:byte(c.byte + 1)
* The leading byte must not fall outside of the above ranges.
else
* The trailing byte(s) (if any), must be between `0x80` to `0xBF`.
b = text:byte(text:len() + c.byte)
* The character's codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`).
* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to
  U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings
  that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte
  character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000).
  Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but
  `0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`.
 
If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in
UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of
surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher
codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances
where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly
4 hexadecimal digits; under the scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates,
even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`).
]==]
function export.isutf8(str, allow_surrogates)
for ch in gmatch(str, "[\128-\255][\128-\191]*") do
if #ch > 4 then
return false
end
end
-- Position byte doesn't exist, so iterate the return value and return it.
local b1, b2, b3, b4 = byte(ch, 1, 4)
if not b then
if not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
return c[to_type] + iterate_val
return false -- 1-byte is always invalid, as gmatch excludes 0x00 to 0x7F
elseif b < 0x80 then
elseif not b3 then -- 2-byte
-- 1-byte codepoint, 00-7F.
if not (b1 >= 0xC2 and b1 <= 0xDF) then -- b1 == 0xC0 or b1 == 0xC1 is overlong
trail = 0
return false
cp = b
min = 0
leading_byte = true
elseif b < 0xc0 then
-- A trailing byte.
leading_byte = false
elseif b < 0xc2 then
-- An overlong encoding for a 1-byte codepoint.
error("String " .. text .. " is not UTF-8.")
elseif b < 0xe0 then
-- 2-byte codepoint, C2-DF.
trail = 1
cp = b - 0xc0
min = 0x80
leading_byte = true
elseif b < 0xf0 then
-- 3-byte codepoint, E0-EF.
trail = 2
cp = b - 0xe0
min = 0x800
leading_byte = true
elseif b < 0xf4 then
-- 4-byte codepoint, F0-F3.
trail = 3
cp = b - 0xf0
min = 0x10000
leading_byte = true
elseif b == 0xf4 then
-- 4-byte codepoint, F4.
-- Make sure it doesn't decode to over U+10FFFF.
if text:byte(c.byte + 2) > 0x8f then
error("String " .. text .. " is not UTF-8.")
end
end
trail = 3
elseif not (b3 >= 0x80 and b3 <= 0xBF) then -- trailing byte
cp = 4
return false
min = 0x100000
elseif not b4 then -- 3-byte
leading_byte = true
if b1 > 0xEF then
else
return false
-- Codepoint over U+10FFFF, or invalid byte.
elseif b2 < 0xA0 then
error("String " .. text .. " is not UTF-8.")
if b1 < 0xE1 then -- b1 == 0xE0 and b2 < 0xA0 is overlong
return false
end
elseif b1 < 0xE0 or (b1 == 0xED and not allow_surrogates) then -- b1 == 0xED and b2 >= 0xA0 is a surrogate
return false
end
elseif not (b4 >= 0x80 and b4 <= 0xBF) then -- 4-byte
return false
elseif b2 < 0x90 then
if not (b1 >= 0xF1 and b1 <= 0xF4) then -- b1 == 0xF0 and b2 < 0x90 is overlong
return false
end
elseif not (b1 >= 0xF0 and b1 <= 0xF3) then -- b1 == 0xF4 and b2 >= 0x90 is too high
return false
end
end
end
-- Check subsequent bytes for multibyte codepoints.
return true
if leading_byte then
end
local from, to
 
if pos > 0 then
do
from, to = c.byte + 2, c.byte + 1 + trail
local charset_chars = {
else
["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^"
from, to = text:len() + c.byte + 1, text:len() + c.byte + trail
}
charset_chars.__index = charset_chars
local chars = setmetatable({
["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+",
["."] = "%.", ["?"] = "%?", ["["] = "%["
}, charset_chars)
 
--[==[
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's
version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example,
{"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving
arbitrary text (e.g. from user input).
]==]
function export.pattern_escape(str)
    -- Each matched magic character (or NUL) is looked up in `chars` for its
    -- escaped form. Assigning to a single local drops gsub's second result
    -- (the substitution count), so exactly one value is returned.
    local escaped = gsub(str, "[%z$%%()*+%-.?[%]^]", chars)
    return escaped
end
pattern_escape = export.pattern_escape
 
--[==[
Escapes only {%}, which is the only magic character used in replacement
[[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.
]==]
function export.replacement_escape(str)
    -- Double every "%". Assigning to a single local drops gsub's second
    -- result (the substitution count), so exactly one value is returned.
    local escaped = gsub(str, "%%", "%%%%")
    return escaped
end
replacement_escape = export.replacement_escape
 
local function case_insensitive_char(ch)
local upper_ch = uupper(ch)
if upper_ch == ch then
ch = ulower(ch)
if ch == upper_ch then
return chars[ch] or ch
end
end
for trailing_byte = from, to do
end
b = text:byte(trailing_byte)
return "[" .. (charset_chars[upper_ch] or upper_ch) .. (charset_chars[ch] or ch) .. "]"
if not b or b < 0x80 or b > 0xbf then
end
error("String " .. text .. " is not UTF-8.")
 
local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2)
if not (loc1 and start <= str_len) then
-- Add final chunk and return.
n = n + 1
text[n] = _gsub(_sub(str, start), ".", chars)
return
elseif loc2 < loc1 then
if _sub == sub then
local b = byte(str, loc1)
if b and b >= 128 then
loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
end
end
cp = cp * 0x40 + b - 0x80
end
end
local next_byte = text:byte(to + 1)
n = n + 1
if next_byte and next_byte >= 0x80 and next_byte <= 0xbf then
text[n] = _gsub(_sub(str, start, loc1), ".", chars)
-- Too many trailing bytes.
start = loc1 + 1
error("String " .. text .. " is not UTF-8.")
if start > str_len then
elseif cp < min then
return
-- Overlong encoding.
error("String " .. text .. " is not UTF-8.")
end
end
else
-- Add chunk up to the current match.
n = n + 1
text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars)
-- Add current match.
n = n + 1
text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char)
start = loc2 + 1
end
return n, start
end
--[==[
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes
all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second
argument; the third argument (`str_lib`) forces use of the string library, and the fourth argument (`plain`) turns
off any pattern-matching facilities in the optional pattern supplied.
]==]
function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
if pattern_or_func == nil then
return (gsub(str, str_lib and "[^\128-\255]" or ".[\128-\191]*", case_insensitive_char))
end
end
c.byte = c.byte + iterate_val
local text, n, start, str_len, _string, callable = {}, 0, 1
if leading_byte then
pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
c.char = c.char + iterate_val
local _find, _gsub, _sub = _string.find, _string.gsub, _string.sub
if callable then
repeat
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, pattern_or_func(str, start))
until not start
-- Special case if the pattern is anchored to the start: "^" always
-- anchors to the start position, not the start of the string, so get
-- around this by only attempting one match with the pattern, then match
-- the end of the string.
elseif byte(pattern_or_func) == 0x5E then -- ^
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
if start ~= nil then
iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, "$", start, plain))
end
else
repeat
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
until not start
end
end
if c[from_type] == pos then
return concat(text)
return c[to_type]
end
end
end
end
end


--[==[Converts a character position to the equivalent byte position.]==]
do
function export.charsToBytes(text, pos)
local character_classes
return iterate_utf8(text, pos, "char")
local function get_character_classes()
end
character_classes, get_character_classes = {
 
[0x41] = true, [0x61] = true, -- Aa
--[==[Converts a byte position to the equivalent character position.]==]
[0x43] = true, [0x63] = true, -- Cc
function export.bytesToChars(text, pos)
[0x44] = true, [0x64] = true, -- Dd
local byte = text:byte(pos)
[0x4C] = true, [0x6C] = true, -- Ll
if byte and byte >= 0x80 and byte <= 0xbf then
[0x50] = true, [0x70] = true, -- Pp
error("Byte " .. pos .. " is not a leading byte.")
[0x53] = true, [0x73] = true, -- Ss
[0x55] = true, [0x75] = true, -- Uu
[0x57] = true, [0x77] = true, -- Ww
[0x58] = true, [0x78] = true, -- Xx
[0x5A] = true, -- z dealt with separately.
}, nil
return character_classes
end
-- Returns true if `set1` and `set2` are structurally equal nested sets, i.e.
-- they have exactly the same keys, and each pair of corresponding values is
-- either identical or is itself a pair of structurally equal sets.
-- A leaf compared against a sub-table (or any two differing non-table values)
-- yields false rather than raising an error.
local function check_sets_equal(set1, set2)
    -- Guard: `next` and indexing below require tables. Without this, a leaf
    -- (`true`) meeting a sub-table in the other set would throw.
    if type(set1) ~= "table" or type(set2) ~= "table" then
        return set1 == set2
    end
    -- `k2` walks set2 in lockstep, one step per key of set1, purely to count
    -- keys: if set2 still has a key left after the loop, it is larger.
    local k2
    for k1, v1 in next, set1 do
        local v2 = set2[k1]
        if v1 ~= v2 and (v2 == nil or not check_sets_equal(v1, v2)) then
            return false
        end
        k2 = next(set2, k2)
    end
    return next(set2, k2) == nil
end
end
return iterate_utf8(text, pos, "byte")
end
local function check_sets(bytes)
 
local key, set1, set = next(bytes)
-- A helper function which iterates through a pattern, and returns two values: a potentially modified version of the pattern, and a boolean indicating whether the returned pattern is simple (i.e. whether it can be used with the stock string library); if not, then the pattern is complex (i.e. it must be used with the ustring library, which is much more resource-intensive).
if set1 == true then
local function patternSimplifier(text, pattern, plain)
return true
pattern = tostring(pattern)
elseif not check_sets(set1) then
-- If `plain` is set, then the pattern is treated as literal (so is always simple). Only used by find.
return false
if plain then
end
return pattern, true
while true do
--If none of these are present, then the pattern has to be simple.
key, set = next(bytes, key)
elseif not (
if not key then
pattern:match("%[.-[\128-\255].-%]") or
return true
pattern:match("[\128-\255][%*%+%?%-]") or
elseif not check_sets_equal(set, set1) then
pattern:match("%%[abcdlpsuwxACDLPSUWXZ]") or
return false
pattern:match("%[%^[^%]]+%]") or
end
pattern:match("%.[^%*%+%-]") or
end
pattern:match("%.$") or
pattern:match("%%b.?[\128-\255]") or
pattern:match("()", 1, true)
) then
return pattern, true
end
end
-- Otherwise, the pattern could go either way.
-- Build up the new pattern in a table, then concatenate at the end. we do it this way, as occasionally entries get modified along the way.
local new_pattern = {}
local len, pos, b = pattern:len(), 0
local char, next_char
-- `escape` and `balanced` are counters, which ensure the effects of % or %b (respectively) are distributed over the following bytes.
local function make_charset(range)
-- `set` is a boolean that states whether the current byte is in a charset.
if #range == 1 then
-- `capture` keeps track of how many layers of capture groups the position is in, while `captures` keeps a tally of how many groups have been detected (due to the string library limit of 32).
return char(range[1])
local escape, set, balanced, capture, captures = 0, false, 0, 0, 0
end
sort(range)
local compressed, n, start = {}, 0, range[1]
for i = 1, #range do
local this, nxt = range[i], range[i + 1]
if nxt ~= this + 1 then
n = n + 1
compressed[n] = this == start and char(this) or
char(start) .. "-" .. char(this)
start = nxt
end
end
return "[" .. concat(compressed) .. "]"
end
while pos < len do
local function parse_1_byte_charset(pattern, pos)
pos = pos + 1
local ch
b = pattern:byte(pos)
while true do
if escape > 0 then escape = escape - 1 end
pos, ch = match(pattern, "()([%%%]\192-\255])", pos)
if balanced > 0 then balanced = balanced - 1 end
if ch == "%" then
char = next_char or pattern:sub(pos, pos)
local nxt = byte(pattern, pos + 1)
next_char = pattern:sub(pos + 1, pos + 1)
if not nxt or nxt >= 128 or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWXZ, but not z
if escape == 0 then
return false
if char == "%" then
-- Apply % escape.
if next_char == "." or next_char == "%" or next_char == "[" or next_char == "]" then
escape = 2
if balanced > 0 then balanced = balanced + 1 end
-- These charsets make the pattern complex.
elseif next_char:match("[acdlpsuwxACDLPSUWXZ]") then
return pattern, false
-- This is "%b".
elseif next_char == "b" then
balanced = 4
end
-- Enter or leave a charset.
elseif char == "[" then
set = true
elseif char == "]" then
set = false
elseif char == "(" then
capture = capture + 1
elseif char == ")" then
if capture > 0 and set == false and balanced == 0 then
captures = captures + 1
capture = capture - 1
end
end
pos = pos + 2
elseif ch == "]" then
pos = pos + 1
return pos
else
return false
end
end
end
end
end
-- Multibyte char.
if b > 0x7f then
--[==[
-- If followed by "*", "+" or "-", then 2-byte chars can be converted into charsets. However, this is not possible with 3 or 4-byte chars, as the charset would be too permissive, because if the trailing bytes were in a different order then this could be a different valid character.
Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion
if next_char == "*" or next_char == "+" or next_char == "-" then
isn't possible, returns false.
local prev_pos = pattern:byte(pos - 1)
]==]
if prev_pos > 0xc1 and prev_pos < 0xe0 then
function pattern_simplifier(pattern)
new_pattern[#new_pattern] = "[" .. new_pattern[#new_pattern]
if type(pattern) == "number" then
table.insert(new_pattern, char .. "]")
return tostring(pattern)
end
local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0
while true do
-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails.
pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos)
if not ch then
break
end
local nxt = byte(pattern, nxt_pos)
if ch == "%" then
if nxt == 0x62 then -- b
local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3)
if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then
return false
end
pos = pos + 4
elseif nxt == 0x66 then -- f
nxt_pos = nxt_pos + 2
local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos)
-- Only possible to convert a positive %f charset which is
-- all ASCII, so use parse_1_byte_charset.
if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^
return false
elseif nxt3 == 0x5D then -- Initial ] is non-magic.
nxt_pos = nxt_pos + 1
end
pos = parse_1_byte_charset(pattern, nxt_pos)
if not pos then
return false
end
elseif nxt == 0x5A then -- Z
nxt = byte(pattern, nxt_pos + 1)
if nxt == 0x2A or nxt == 0x2D then -- *-
pos = pos + 3
else
if output == nil then
output = {}
end
local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]"
n = n + 1
if nxt == 0x2B then -- +
output[n] = ins .. "%Z*"
pos = pos + 3
elseif nxt == 0x3F then -- ?
output[n] = ins .. "?[\128-\191]*"
pos = pos + 3
else
output[n] = ins .. "[\128-\191]*"
pos = pos + 2
end
start = pos
end
elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz
return false
-- Skip the next character if it's ASCII. Otherwise, we will
-- still need to do length checks.
else
else
return pattern, false
pos = pos + (nxt < 128 and 2 or 1)
end
end
-- If in a charset or used in "%b", then the pattern is complex.
elseif ch == "(" then
-- If followed by "?", add "?" after each byte.
if nxt == 0x29 or capture_groups == 32 then -- )
elseif next_char == "?" then
return false
table.insert(new_pattern, char .. "?")
local check_pos, check_b, i = pos, pattern:byte(pos), #new_pattern
while check_b and check_b < 0xc0 do
check_pos = check_pos - 1
check_b = pattern:byte(check_pos)
i = i - 1
new_pattern[i] = new_pattern[i] .. "?"
end
end
capture_groups = capture_groups + 1
pos = pos + 1
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
elseif ch == "." then
elseif set or balanced > 0 then
if nxt == 0x2A or nxt == 0x2D then -- *-
return pattern, false
pos = pos + 2
else
if output == nil then
output = {}
end
local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]"
n = n + 1
if nxt == 0x2B then -- +
output[n] = ins .. ".*"
pos = pos + 2
elseif nxt == 0x3F then -- ?
output[n] = ins .. "?[\128-\191]*"
pos = pos + 2
else
output[n] = ins .. "[\128-\191]*"
pos = pos + 1
end
start = pos
end
elseif ch == "[" then
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
if nxt == 0x5E then -- ^
return false
-- If the first character is "%", ch_len is determined by the
-- next one instead.
elseif nxt == 0x25 then -- %
nxt = byte(pattern, nxt_pos + 1)
elseif nxt == 0x5D then -- Initial ] is non-magic.
nxt_pos = nxt_pos + 1
end
if not nxt then
return false
end
local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4
if ch_len == 1 then -- Single-byte charset.
pos = parse_1_byte_charset(pattern, nxt_pos)
if not pos then
return false
end
else -- Multibyte charset.
-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST.
local charset_pos, bytes = pos
pos = pos + 1
while true do -- TODO: non-ASCII charset ranges.
pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos)
-- If escaped, get the next character. No need to
-- distinguish magic characters or character classes,
-- as they'll all fail for having the wrong length
-- anyway.
if ch == "%" then
pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos)
elseif ch == "]" then
pos = nxt_pos
break
end
if not (ch and nxt_pos - pos == ch_len) then
return false
elseif bytes == nil then
bytes = {}
end
local bytes, last = bytes, nxt_pos - 1
for i = pos, last - 1 do
local b = byte(pattern, i)
local bytes_b = bytes[b]
if bytes_b == nil then
bytes_b = {}
bytes[b] = bytes_b
end
bytes[b], bytes = bytes_b, bytes_b
end
bytes[byte(pattern, last)] = true
pos = nxt_pos
end
if not pos then
return false
end
nxt = byte(pattern, pos)
if (
(nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-?
(nxt == 0x2B and ch_len > 2) or -- +
not check_sets(bytes)
) then
return false
end
local ranges, b, key, next_byte = {}, 0
repeat
key, next_byte = next(bytes)
local range, n = {key}, 1
-- Loop starts on the second iteration.
for key in next, bytes, key do
n = n + 1
range[n] = key
end
b = b + 1
ranges[b] = range
bytes = next_byte
until next_byte == true
if nxt == 0x2B then -- +
local range1, range2 = ranges[1], ranges[2]
ranges[1], ranges[3] = make_charset(range1), make_charset(range2)
local n = #range2
for i = 1, #range1 do
n = n + 1
range2[n] = range1[i]
end
ranges[2] = make_charset(range2) .. "*"
pos = pos + 1
else
for i = 1, #ranges do
ranges[i] = make_charset(ranges[i])
end
end
if output == nil then
output = {}
end
nxt = byte(pattern, pos)
n = n + 1
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) ..
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
start = pos
end
elseif not nxt then
break
elseif nxt == 0x2B then -- +
if nxt_pos - pos ~= 2 then
return false
elseif output == nil then
output = {}
end
pos, nxt_pos = pos + 1, nxt_pos + 1
nxt = byte(pattern, nxt_pos)
local ch2 = sub(pattern, pos, pos)
n = n + 1
output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 ..
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
pos, start = nxt_pos, nxt_pos
elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-?
return false
else
else
table.insert(new_pattern, char)
pos = nxt_pos
end
end
elseif char == "." then
-- "*", "+", "-" are always okay after ".", as they don't care how many bytes a char has.
if set or next_char == "*" or next_char == "+" or next_char == "-" or escape > 0 then
table.insert(new_pattern, char)
-- If followed by "?", make sure "?" is after the leading byte of the UTF-8 char pattern, then skip forward one.
elseif next_char == "?" then
table.insert(new_pattern, "[%z\1-\127\194-\244]?[\128-\191]*")
pos = pos + 1
next_char = pattern:sub(pos + 1, pos + 1)
-- If used with "%b", pattern is complex.
elseif balanced > 0 then
return pattern, false
-- Otherwise, add the UTF-8 char pattern.
else
table.insert(new_pattern, "[%z\1-\127\194-\244][\128-\191]*")
end
-- Negative charsets are always complex, unless the text has no UTF-8 chars.
elseif char == "[" and next_char == "^" and escape == 0 and text:match("[\128-\255]") then
return pattern, false
-- "()" matches the position unless escaped or used with "%b", so always necessitates ustring (as we need it to match the char position, not the byte one).
elseif char == "(" and next_char == ")" and balanced == 0 and escape == 0 and text:match("[\128-\255]") then
return pattern, false
else
table.insert(new_pattern, char)
end
end
if start == 1 then
return pattern
end
return concat(output) .. sub(pattern, start)
end
end
if captures > 32 then
pattern_simplifier = memoize(pattern_simplifier, true)
return pattern, false
export.pattern_simplifier = pattern_simplifier
else
pattern = table.concat(new_pattern)
return pattern, true
end
end
end


--[==[A version of len which uses string.len, but returns the same result as mw.ustring.len.]==]
--[==[
function export.len(text)
Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring
text = tostring(text)
library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}).
local len_bytes = text:len()
 
if not text:match("[\128-\255]") then
The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used
return len_bytes
(e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary
else
characters.
return iterate_utf8(text, len_bytes, "byte")
]==]
function get_charset(charset)
if type(charset) == "number" then
return tostring(charset)
end
end
end
local pos, start, n, output = 1, 1, 0
 
if byte(charset) == 0x5E then -- ^
--[==[A version of sub which uses string.sub, but returns the same result as mw.ustring.sub.]==]
pos = pos + 1
function export.sub(text, i_char, j_char)
text = tostring(text)
if not text:match("[\128-\255]") then
return text:sub(i_char, j_char)
end
end
local i_byte, j_byte
-- FIXME: "]" is non-magic if it's the first character in a charset.
if j_char then
local nxt_pos, nxt
if i_char > 0 and j_char > 0 then
while true do
if j_char < i_char then return "" end
local new_pos, ch = match(charset, "()([%%%-%]])", pos)
i_byte = iterate_utf8(text, i_char, "char")
if not ch then
j_byte = iterate_utf8(text, j_char + 1, "char", i_char, i_byte) - 1
break
elseif i_char < 0 and j_char < 0 then
-- Skip percent escapes. Ranges can't start with them, either.
if j_char < i_char then return "" end
elseif ch == "%" then
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
pos = new_pos + 2
i_byte = iterate_utf8(text, i_char, "char", j_char, j_byte)
-- For some reason, mw.ustring.sub with i=0, j=0 returns the same result as for i=1, j=1, while string.sub always returns "". However, mw.ustring.sub does return "" with i=1, j=0. As such, we need to adjust j_char to 1 if i_char is either 0, or negative with a magnitude greater than the length of the string.
elseif j_char == 0 then
i_byte = iterate_utf8(text, i_char, "char")
if i_byte == 0 or -i_byte > text:len() then j_char = 1 end
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
else
else
i_byte = iterate_utf8(text, i_char, "char")
-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`.
j_byte = iterate_utf8(text, j_char + 1, "char") - 1
if ch == "-" and new_pos > pos then
pos, nxt_pos, nxt = new_pos - 1, new_pos, ch
ch = sub(charset, pos, pos)
else
pos, nxt_pos = new_pos, new_pos + 1
nxt = sub(charset, nxt_pos, nxt_pos)
end
-- Range.
if nxt == "-" then
if output == nil then
output = {}
end
n = n + 1
output[n] = sub(charset, start, pos - 1)
nxt_pos = nxt_pos + 1
nxt = sub(charset, nxt_pos, nxt_pos)
-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour.
if nxt == "" or nxt == "%" then
n = n + 1
output[n] = (ch == "]" and "%]" or ch) .. "%-"
start = nxt_pos
nxt_pos = nxt_pos + 2
-- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be
-- adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is
-- omitted if the range would be empty (i.e. if the first byte is greater than the second).
else
n = n + 1
output[n] = (ch == "]" and (byte(nxt) >= 0x5D and "%]^" or "^") or ch) .. "-" ..
(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt)
nxt_pos = nxt_pos + 1
start = nxt_pos
end
elseif ch == "-" or ch == "]" then
if output == nil then
output = {}
end
n = n + 1
output[n] = sub(charset, start, pos - 1) .. "%" .. ch
start = nxt_pos
end
pos = nxt_pos
end
end
else
i_byte = iterate_utf8(text, i_char, "char")
end
end
return text:sub(i_byte, j_byte)
if start == 1 then
return "[" .. charset .. "]"
end
return "[" .. concat(output) .. sub(charset, start) .. "]"
end
end
get_charset = memoize(get_charset, true)
export.get_charset = get_charset


--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.len(str)
function export.lower(text)
return type(str) == "number" and len(str) or
text = tostring(text)
#str - #gsub(str, "[^\128-\191]+", "")
if not text:match("[\128-\255]") then
return text:lower()
else
return mw.ustring.lower(text)
end
end
end
ulen = export.len


--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.sub(str, i, j)
function export.upper(text)
str, i = type(str) == "number" and tostring(str) or str, i or 1
text = tostring(text)
if i < 0 or j and j < 0 then
if not text:match("[\128-\255]") then
return usub(str, i, j)
return text:upper()
elseif j and i > j or i > #str then
else
return ""
return mw.ustring.upper(text)
end
local n, new_i = 0
for loc1, loc2 in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do
n = n + loc2 - loc1
if not new_i and n >= i then
new_i = loc2 - (n - i) - 1
if not j then
return sub(str, new_i)
end
end
if j and n > j then
return sub(str, new_i, loc2 - (n - j) - 1)
end
end
end
return new_i and sub(str, new_i) or ""
end
end


--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==]
do
function export.find(text, pattern, init_char, plain)
local function _find(str, loc1, loc2, ...)
text = tostring(text)
if loc1 and not match(str, "^()[^\128-\255]*$") then
local simple
-- Use raw values of loc1 and loc2 to get loc1 and the length of the match.
pattern, simple = patternSimplifier(text, pattern, plain)
loc1, loc2 = ulen(sub(str, 1, loc1)), ulen(sub(str, loc1, loc2))
-- If the pattern is simple but multibyte characters are present, then init_char needs to be converted into bytes for string.find to work properly, and the return values need to be converted back into chars.
-- Offset length with loc1 to get loc2.
if simple then
loc2 = loc1 + loc2 - 1
if not text:match("[\128-\255]") then
end
return text:find(pattern, init_char, plain)
return loc1, loc2, ...
else
end
local init_byte = init_char and iterate_utf8(text, init_char, "char")
local byte1, byte2, c1, c2, c3, c4, c5, c6, c7, c8, c9 = text:find(pattern, init_byte, plain)
--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==]
function export.find(str, pattern, init, plain)
-- If string.find returned nil, then return nil.
init = init or 1
if not (byte1 and byte2) then
if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
return nil
return ufind(str, pattern, init, plain)
end
elseif plain then
return _find(str, find(str, pattern, init, true))
-- Get first return value. If we have a positive init_char, we can save resources by resuming at that point.
end
local char1, char2
local simple = pattern_simplifier(pattern)
if (not init_char) or init_char > 0 then
if simple then
char1 = iterate_utf8(text, byte1, "byte", init_byte, init_char)
return _find(str, find(str, simple, init))
else
char1 = iterate_utf8(text, byte1, "byte")
end
-- If byte1 and byte2 are the same, don't bother running iterate_utf8 twice. Otherwise, resume iterate_utf8 from byte1 to find char2.
if byte1 == byte2 then
char2 = char1
else
char2 = iterate_utf8(text, byte2, "byte", byte1, char1)
end
return unpack{char1, char2, c1, c2, c3, c4, c5, c6, c7, c8, c9}
end
end
else
return ufind(str, pattern, init)
return mw.ustring.find(text, pattern, init_char, plain)
end
end
end
end


--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
function export.match(text, pattern, init)
function export.match(str, pattern, init)
text = tostring(text)
init = init or 1
local simple
if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
pattern, simple = patternSimplifier(text, pattern)
return umatch(str, pattern, init)
end
local simple = pattern_simplifier(pattern)
if simple then
if simple then
if init and text:find("[\128-\255]") then
return match(str, simple, init)
init = iterate_utf8(text, init, "char")
end
return text:match(pattern, init)
else
return mw.ustring.match(text, pattern, init)
end
end
return umatch(str, pattern, init)
end
end


--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
function export.gmatch(text, pattern)
function export.gmatch(str, pattern)
text = tostring(text)
local simple = pattern_simplifier(pattern)
local simple
pattern, simple = patternSimplifier(text, pattern)
if simple then
if simple then
return text:gmatch(pattern)
return gmatch(str, simple)
else
return mw.ustring.gmatch(text, pattern)
end
end
return ugmatch(str, pattern)
end
end


--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
function export.gsub(text, pattern, repl, n)
function export.gsub(str, pattern, repl, n)
text = tostring(text)
local simple = pattern_simplifier(pattern)
local simple
pattern, simple = patternSimplifier(text, pattern)
if simple then
if simple then
return text:gsub(pattern, repl, n)
return gsub(str, simple, repl, n)
else
return mw.ustring.gsub(text, pattern, repl, n)
end
end
return ugsub(str, pattern, repl, n)
end
end


--[==[
--[==[
-- Reimplementation of mw.ustring.split() that includes any capturing
Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.
-- groups in the splitting pattern. This works like Python's re.split()
-- function, except that it has Lua's behavior when the split pattern
-- is empty (i.e. advancing by one character at a time; Python returns the
-- whole remainder of the string).
]==]
]==]
function export.capturing_split(str, pattern)
function export.plain_gsub(str, pattern, repl, n)
    local ret = {}
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n)
    -- (.-) corresponds to (.*?) in Python or Perl; () captures the
    -- current position after matching.
    pattern = "(.-)" .. pattern .. "()"
    local start = 1
    while true do
        -- Did we reach the end of the string?
        if start > #str then
            table.insert(ret, "")
            return ret
        end
        -- match() returns all captures as multiple return values;
        -- we need to insert into a table to get them all.
        local captures = {export.match(str, pattern, start)}
        -- If no match, add the remainder of the string.
        if #captures == 0 then
            table.insert(ret, export.sub(str, start))
            return ret
        end
        local newstart = table.remove(captures)
        -- Special case: If we don't advance by any characters, then advance
        -- by one character; this avoids an infinite loop, and makes splitting
        -- by an empty string work the way mw.ustring.split() does. If we
        -- reach the end of the string this way, return immediately, so we
        -- don't get a final empty string.
        if newstart == start then
            table.insert(ret, export.sub(str, start, start))
            table.remove(captures, 1)
            start = start + 1
            if start > #str then
            return ret
            end
        else
            table.insert(ret, table.remove(captures, 1))
            start = newstart
        end
        -- Insert any captures from the splitting pattern.
        for _, x in ipairs(captures) do
            table.insert(ret, x)
        end
    end
end
end


local function uclcfirst(text, dolower)
--[==[
local function douclcfirst(text)
Reverses a UTF-8 string; equivalent to string.reverse.
-- Actual function to re-case of the first letter.
]==]
local first_letter = export.sub(text, 1, 1)
function export.reverse(str)
first_letter = dolower and export.lower(first_letter) or export.upper(first_letter)
return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse)))
return first_letter .. export.sub(text, 2)
end
 
function export.char(...) -- To be moved to [[Module:string/char]].
return u(...)
end
 
do
local function utf8_err(func_name)
error(format("bad argument #1 to '%s' (string is not UTF-8)", func_name), 4)
end
end
-- If there's a link at the beginning, re-case the first letter of the
 
-- link text. This pattern matches both piped and unpiped links.
local function get_codepoint(func_name, b1, b2, b3, b4)
-- If the link is not piped, the second capture (linktext) will be empty.
if b1 <= 0x7F then
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
return b1, 1
if link then
elseif not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
return "[[" .. link .. "|" .. douclcfirst(linktext ~= "" and linktext or link) .. "]]" .. remainder
utf8_err(func_name)
elseif b1 <= 0xDF then
local cp = 0x40 * b1 + b2 - 0x3080
return cp >= 0x80 and cp or utf8_err(func_name), 2
elseif not (b3 and b3 >= 0x80 and b3 <= 0xBF) then
utf8_err(func_name)
elseif b1 <= 0xEF then
local cp = 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080
return cp >= 0x800 and cp or utf8_err(func_name), 3
elseif not (b4 and b4 >= 0x80 and b4 <= 0xBF) then
utf8_err(func_name)
end
local cp = 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080
return cp >= 0x10000 and cp <= 0x10FFFF and cp or utf8_err(func_name), 4
end
 
--[==[
Returns the codepoints of the characters of `str` from character index `i` to character index `j` (both 1-based and
counted in UTF-8 characters, not bytes). `i` defaults to 1 and `j` defaults to `i`; a `j` of -1 means "to the end of the
string". Returns nothing for the empty string or when the requested range lies past the end of the string. Any other
negative index is delegated to the ustring library. Throws an error if a malformed UTF-8 sequence is encountered
inside the requested range.
]==]
function export.codepoint(str, i, j)
	if str == "" then
		return -- return nothing
	elseif type(str) == "number" then
		return byte(str, i, j)
	end
	i = i or 1
	if j == -1 then
		-- The byte length is an upper bound on the character count; the loop below stops at the end of the string.
		j = #str
	elseif j == nil then
		j = i
	end
	if i == 1 and j == 1 then
		-- Fast path for the very common "first character" request.
		return (get_codepoint("codepoint", byte(str, 1, 4)))
	elseif i < 0 or j < 0 then
		return ucodepoint(str, i, j) -- FIXME
	end
	-- n: current character index; nb: byte offset of that character; ret/nr: collected codepoints and their count.
	local n, nb, ret, nr = 0, 1, {}, 0
	while n < j do
		n = n + 1
		if n < i then
			-- Still before the requested range: skip the character using only its lead byte.
			local b = byte(str, nb)
			if not b then
				-- `i` is beyond the end of the string, so there is nothing to return.
				break
			end
			nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
		else
			local b1, b2, b3, b4 = byte(str, nb, nb + 3)
			if not b1 then
				break
			end
			nr = nr + 1
			local add
			ret[nr], add = get_codepoint("codepoint", b1, b2, b3, b4)
			nb = nb + add
		end
	end
	return unpack(ret)
end
codepoint = export.codepoint
-- Returns an iterator over the codepoints of `str` from character index `i` (default 1) to `j` (default: end of
-- string; -1 also means the end). Negative indices are delegated to the ustring library.
function export.gcodepoint(str, i, j)
	i = i or 1
	if j == -1 or not j then
		j = nil
	end
	if i < 0 or (j and j < 0) then
		return ugcodepoint(str, i, j) -- FIXME
	end
	local char_index, byte_pos = 1, 1
	-- Advance to the first requested character by looking at lead bytes only.
	while char_index < i do
		local lead = byte(str, byte_pos)
		if lead == nil then
			break
		end
		local width
		if lead < 128 then
			width = 1
		elseif lead < 224 then
			width = 2
		elseif lead < 240 then
			width = 3
		else
			width = 4
		end
		byte_pos = byte_pos + width
		char_index = char_index + 1
	end
	-- Each call decodes one character and moves the byte cursor past it.
	local function next_codepoint()
		if j and char_index > j then
			return nil
		end
		char_index = char_index + 1
		local b1, b2, b3, b4 = byte(str, byte_pos, byte_pos + 3)
		if b1 == nil then
			return nil
		end
		local cp, width = get_codepoint("gcodepoint", b1, b2, b3, b4)
		byte_pos = byte_pos + width
		return cp
	end
	return next_codepoint
end
end
return douclcfirst(text)
end
end


function export.ucfirst(text)
do
return uclcfirst(text, false)
local _ulower = ulower
 
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.lower(str)
return (match(str, "^()[^\128-\255]*$") and lower or _ulower)(str)
end
end
end


function export.lcfirst(text)
do
return uclcfirst(text, true)
local _uupper = uupper
 
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.upper(str)
return (match(str, "^()[^\128-\255]*$") and upper or _uupper)(str)
end
end
 
do
-- Appends the varargs to array `t`, which currently holds `n` items, stopping at the first nil/false value.
-- Returns the new item count, or nothing if the first vararg is nil (i.e. the iterator is exhausted).
local function add_captures(t, n, ...)
	local capture = ...
	if capture == nil then
		return
	end
	-- Insert the chunk plus any captures from the splitting pattern.
	local arg_index = 1
	while capture do
		n = n + 1
		t[n] = capture
		arg_index = arg_index + 1
		capture = select(arg_index, ...)
	end
	return n
end
--[==[
Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like
Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by
one character at a time; Python returns the whole remainder of the string). When possible, it will use the string
library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the
string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.
In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start
index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil
if there are no further matches. By default, the start index will be calculated using the ustring library, unless
`str_lib` or `plain` is set.
]==]
function export.split(str, pattern_or_func, str_lib, plain)
	local next_chunk = gsplit(str, pattern_or_func, str_lib, plain)
	local parts, count = {}, 0
	-- add_captures returns nothing once the iterator is exhausted, which ends the loop.
	while count ~= nil do
		count = add_captures(parts, count, next_chunk())
	end
	return parts
end
export.capturing_split = export.split -- To be removed.
end
end


-- Almost identical to mw.text.nowiki, but with minor changes to be identical to the PHP equivalent: ";" always escapes, and colons in certain protocols only escape after regex \b. Also about 2-3 times as fast.
--[==[
function export.nowiki(text)
Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the
return (text
string up to the splitting pattern, with any capture groups being returned as additional values on that iteration.
:gsub("[\"&'<=>%[%]{|};]", {
]==]
["\""] = "&#34;", ["&"] = "&#38;", ["'"] = "&#39;",
function export.gsplit(str, pattern_or_func, str_lib, plain)
["<"] = "&#60;", ["="] = "&#61;", [">"] = "&#62;",
local start, final, str_len, _string, callable = 1
["["] = "&#91;", ["]"] = "&#93;", ["{"] = "&#123;",
pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
["|"] = "&#124;", ["}"] = "&#125;", [";"] = "&#59;"
local _find, _sub = _string.find, _string.sub
})
:gsub("%f[^%z\r\n][#*: \n\r\t]", {
local function iter(loc1, loc2, ...)
["#"] = "&#35;", ["*"] = "&#42;", [":"] = "&#58;",
-- If no match, or there is but we're past the end of the string
[" "] = "&#32;", ["\n"] = "&#10;", ["\r"] = "&#13;",
-- (which happens when the match is the empty string), then return
["\t"] = "&#9;"
-- the final chunk.
})
if not loc1 then
:gsub("(%f[^%z\r\n])%-(%-%-%-)", "%1&#45;%2")
final = true
:gsub("__", "_&#95;")
return _sub(str, start)
:gsub("://", "&#58;//")
end
:gsub("([IP]?[MRS][BFI][CDN])([\t\n\f\r ])", function(m1, m2)
-- Special case: If we match the empty string, then eat the
if m1 == "ISBN" or m1 == "RFC" or m1 == "PMID" then
-- next character; this avoids an infinite loop, and makes
return m1 .. m2:gsub(".", {
-- splitting by the empty string work the way mw.text.gsplit() does
["\t"] = "&#9;", ["\n"] = "&#10;", ["\f"] = "&#12;",
-- (including non-adjacent empty string matches with %f). If we
["\r"] = "&#13;", [" "] = "&#32;"
-- reach the end of the string this way, set `final` to true, so we
})
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
local b = byte(str, loc1)
if b and b >= 128 then
loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
end
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
else
start = loc1 + 1
end
-- Eat chunk up to the current match.
else
chunk = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
return chunk, ...
end
if callable then
return function()
if not final then
return iter(pattern_or_func(str, start))
end
end
-- Special case if the pattern is anchored to the start: "^" always
-- anchors to the start position, not the start of the string, so get
-- around this by only attempting one match with the pattern, then match
-- the end of the string.
elseif byte(pattern_or_func) == 0x5E then -- ^
local returned
return function()
if not returned then
returned = true
return iter(_find(str, pattern_or_func, start, plain))
elseif not final then
return iter(_find(str, "$", start, plain))
end
end
end)
end
:gsub("[%w_]+:", {
end
["bitcoin:"] = "bitcoin&#58;", ["geo:"] = "geo&#58;", ["magnet:"] = "magnet&#58;",
return function()
["mailto:"] = "mailto&#58;", ["matrix:"] = "matrix&#58;", ["news:"] = "news&#58;",
if not final then
["sip:"] = "sip&#58;", ["sips:"] = "sips&#58;", ["sms:"] = "sms&#58;",
return iter(_find(str, pattern_or_func, start, plain))
["tel:"] = "tel&#58;", ["urn:"] = "urn&#58;", ["xmpp:"] = "xmpp&#58;"
end
}))
end
end
end
gsplit = export.gsplit


function export.capitalize(text)
function export.count(str, pattern, plain)
if type(text) == "table" then
if plain then
-- allow calling from a template
return select(2, gsub(str, pattern_escape(pattern), ""))
text = text.args[1]
end
end
-- Capitalize multi-word that is separated by spaces
local simple = pattern_simplifier(pattern)
-- by uppercasing the first letter of each part.
if simple then
-- I assume nobody will input all CAP text.
return select(2, gsub(str, pattern, ""))
w2 = {}
for w in export.gmatch(text, "%S+") do
table.insert(w2, uclcfirst(w, false))
end
end
return table.concat(w2, " ")
return select(2, ugsub(str, pattern, ""))
end
end


function export.pluralize(text)
function export.trim(str, charset, str_lib, plain)
if type(text) == "table" then
if charset == nil then
-- allow calling from a template
-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are
text = text.args[1]
-- very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to ""
-- first.
return match(gsub(str, "^%s*", ""), "^.*%S") or ""
elseif charset == "" then
return str
end
end
-- Pluralize a word in a smart fashion, according to normal English rules.
charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
-- 1. If word ends in consonant + -y, replace the -y with -ies.
-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets
-- 2. If the word ends in -s, -x, -z, -sh, -ch, add -es.
-- are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there
-- 3. Otherwise, add -s.
-- would be two callbacks into PHP, which is slower.
-- This handles links correctly:
local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
-- 1. If a piped link, change the second part appropriately.
if not str_lib then
-- 2. If a non-piped link and rule #1 above applies, convert to a piped link
local simple = pattern_simplifier(pattern)
--    with the second part containing the plural.
if not simple then
-- 3. If a non-piped link and rules #2 or #3 above apply, add the plural
return umatch(str, pattern)
--   outside the link.
end
pattern = simple
local function word_ends_in_consonant_plus_y(text)
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
-- We should maybe consider applying this rule here; but it may not
-- be important as this function is almost always called on common nouns
-- (e.g. parts of speech, place types).
return text:find("[^aeiouAEIOU ]y$")
end
end
return match(str, pattern)
local function word_takes_es_plural(text)
end
return text:find("[sxz]$") or text:find("[cs]h$")
trim = export.trim
 
do
local entities
local function get_entities()
entities, get_entities = load_data("Module:data/entities"), nil
return entities
end
end
 
local function do_pluralize(text)
local function decode_entity(hash, x, code)
if word_ends_in_consonant_plus_y(text) then
if hash == "" then
-- avoid returning multiple values
return (entities or get_entities())[x .. code]
local hack_single_retval = text:gsub("y$", "ies")
end
return hack_single_retval
local cp
elseif word_takes_es_plural(text) then
if x == "" then
return text .. "es"
cp = match(code, "^()%d+$") and tonumber(code)
else
else
return text .. "s"
cp = match(code, "^()%x+$") and tonumber(code, 16)
end
end
return cp and (cp <= 0xD7FF or cp >= 0xE000 and cp <= 0x10FFFF) and u(cp) or nil
end
end
 
-- Check for a link. This pattern matches both piped and unpiped links.
-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases
-- If the link is not piped, the second capture (linktext) will be empty.
-- which have also been included in [[Module:data/entities]].
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
function export.decode_entities(str)
if link then
local amp = find(str, "&", nil, true)
if linktext ~= "" then
return amp and find(str, ";", amp, true) and gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
end
end
 
do
local entities
-- Lazily builds the character -> entity lookup table used when encoding. The table is stored in the `entities`
-- upvalue and, in the same assignment, `get_entities` is set to nil, so this initialiser is only intended to run
-- once; callers use the form `entities or get_entities()`.
local function get_entities()
-- Memoized HTML entities (taken from mw.text.lua).
entities, get_entities = {
["\""] = "&quot;",
["&"] = "&amp;",
["'"] = "&#039;",
["<"] = "&lt;",
[">"] = "&gt;",
-- "\194\160" is the UTF-8 encoding of U+00A0 (no-break space).
["\194\160"] = "&nbsp;",
}, nil
return entities
end
 
-- Returns the HTML entity for the character `ch`, or nil if it cannot be encoded. Results (including failures,
-- stored as false) are cached in the `entities` table.
local function encode_entity(ch)
	local cached = (entities or get_entities())[ch]
	if cached ~= nil then
		return cached or nil
	end
	local cp = codepoint(ch)
	local entity
	-- U+D800 to U+DFFF are surrogates, so can't be encoded as entities.
	if cp and (cp <= 0xD7FF or cp >= 0xE000) then
		entity = format("&#%d;", cp)
	else
		entity = false
	end
	entities[ch] = entity
	return entity or nil
end
 
function export.encode_entities(str, charset, str_lib, plain)
if charset == nil then
return (gsub(str, "[\"&'<>\194]\160?", entities or get_entities()))
elseif charset == "" then
return str
end
end
if word_ends_in_consonant_plus_y(link) then
local pattern = plain and ("[" .. charset_escape(charset) .. "]") or charset == "." and charset or get_charset(charset)
return beginning .. "[[" .. link .. "|" .. link:gsub("y$", "ies") .. "]]"
if not str_lib then
local simple = pattern_simplifier(pattern)
if not simple then
return (ugsub(str, pattern, encode_entity))
end
pattern = simple
end
end
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
return (gsub(str, pattern, encode_entity))
end
end
return do_pluralize(text)
end
end


function export.singularize(text)
do
if type(text) == "table" then
local function decode_path(code)
-- allow calling from a template
return char(tonumber(code, 16))
text = text.args[1]
end
end
-- Singularize a word in a smart fashion, according to normal English rules.
-- Works analogously to pluralize().
local function decode(lead, trail)
-- NOTE: This doesn't always work as well as pluralize(). Beware. It will
if lead == "+" or lead == "_" then
-- mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
return " " .. trail
-- 1. If word ends in -ies, replace -ies with -y.
elseif #trail == 2 then
-- 2. If the word ends in -xes, -shes, -ches, remove -es. [Does not affect
return decode_path(trail)
--    -ses, cf. "houses", "impasses".]
-- 3. Otherwise, remove -s.
-- This handles links correctly:
-- 1. If a piped link, change the second part appropriately. Collapse the
--    link to a simple link if both parts end up the same.
-- 2. If a non-piped link, singularize the link.
-- 3. A link like "[[parish]]es" will be handled correctly because the
--    code that checks for -shes etc. allows ] characters between the
--    'sh' etc. and final -es.
local function do_singularize(text)
local sing = text:match("^(.-)ies$")
if sing then
return sing .. "y"
end
end
-- Handle cases like "[[parish]]es"
return lead .. trail
local sing = text:match("^(.-[sc]h%]*)es$")
end
if sing then
return sing
-- Decodes percent-encoding in `str`. `enctype` (case-insensitive, default "QUERY") selects the scheme: "PATH" only
-- decodes %XX sequences, "QUERY" also handles "+", and "WIKI" also handles "_". Any other value raises an error
-- attributed to the caller.
function export.decode_uri(str, enctype)
	local mode = "QUERY"
	if enctype then
		mode = upper(enctype)
	end
	if mode == "PATH" then
		-- Plain find first, so strings with nothing to decode are returned untouched.
		if find(str, "%", nil, true) then
			return (gsub(str, "%%(%x%x)", decode_path))
		end
		return str
	elseif mode == "QUERY" then
		if find(str, "%", nil, true) or find(str, "+", nil, true) then
			return (gsub(str, "([%%%+])(%x?%x?)", decode))
		end
		return str
	elseif mode == "WIKI" then
		if find(str, "%", nil, true) or find(str, "_", nil, true) then
			return (gsub(str, "([%%_])(%x?%x?)", decode))
		end
		return str
	end
	error("bad argument #2 to 'decode_uri' (expected QUERY, PATH, or WIKI)", 2)
end
end
 
do
-- Strips every closed "<!-- ... -->" pair from `str` in a single left-to-right pass. If a "<!--" is never closed,
-- the text from it onwards is dropped when `pre` is truthy and kept verbatim otherwise.
local function _remove_comments(str, pre)
	local open = find(str, "<!--", nil, true)
	if open == nil then
		return str
	end
	-- Pieces of text lying outside comments, in order.
	local kept = {sub(str, 1, open - 1)}
	repeat
		local close = find(str, "-->", open + 4, true)
		if close == nil then
			-- Unclosed comment.
			local text = concat(kept)
			if pre then
				return text
			end
			return text .. sub(str, open)
		end
		local resume = close + 3
		open = find(str, "<!--", resume, true)
		if open == nil then
			-- No further comments: keep the rest of the string.
			return concat(kept) .. sub(str, resume)
		end
		kept[#kept + 1] = sub(str, resume, open - 1)
	until false
end
-- Handle cases like "[[box]]es"
end
local sing = text:match("^(.-x%]*)es$")
if sing then
--[==[
return sing
Removes any HTML comments from the input text. `stage` can be one of three options:
* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all
  {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed
  {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or
  [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the
  preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags);
  if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops
  over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g.
  {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed
  {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs,
  where the {"PRE"} method will have already been applied by the native parser.
* {"BOTH"} applies {"PRE"} then {"POST"}.
]==]
function export.remove_comments(str, stage)
if not stage or stage == "PRE" then
return _remove_comments(str, true)
end
end
local sing = text:match("^(.-)s$")
local processed = stage == "POST" and _remove_comments(str) or
if sing then
stage == "BOTH" and _remove_comments(str, true) or
return sing
error("bad argument #2 to 'remove_comments' (expected PRE, POST, or BOTH)", 2)
while processed ~= str do
str = processed
processed = _remove_comments(str)
end
end
return text
return str
end
end
end


local function collapse_link(link, linktext)
do
if link == linktext then
local byte_escapes
return "[[" .. link .. "]]"
-- Lazily fetches the `byte_escapes` field of the "Module:string utilities/data" data module into the `byte_escapes`
-- upvalue; the same assignment sets `get_byte_escapes` to nil so the loader is not called again.
-- NOTE(review): presumably the table maps single bytes to short escape sequences - confirm against the data module.
local function get_byte_escapes()
byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil
return byte_escapes
end
-- Returns the escape for the single byte `b`: the table entry if there is one, otherwise a backslash followed by the
-- byte's value as a zero-padded, three-digit decimal number.
local function escape_byte(b)
return (byte_escapes or get_byte_escapes())[b] or format("\\%03d", byte(b))
end
-- Replaces every byte of `str` with its escape (see escape_byte). The parentheses discard gsub's second return value
-- (the substitution count).
function export.escape_bytes(str)
return (gsub(str, ".", escape_byte))
end
end
 
-- Replaces every {name} placeholder in `str` with the string returned by `fun(name)`. The escapes {\op} and {\cl}
-- produce literal "{" and "}"; a name that itself begins with a backslash is written with the backslash doubled.
-- Throws an error for an unrecognized escape, for a lookup that returns a non-string value, and for a lookup that
-- returns nil/false. `fun` is called exactly once per placeholder, so the type check and the substituted value always
-- refer to the same result.
function export.format_fun(str, fun)
	return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2)
		-- Exactly one leading backslash marks an escape sequence; two mark an escaped name.
		if #p1 + #p2 == 1 then
			return name == "op" and "{" or
				name == "cl" and "}" or
				error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'")
		end
		local value = fun(name)
		if value and type(value) ~= "string" then
			error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(value) .. ", not a string")
		end
		return value or error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table")
	end))
end
format_fun = export.format_fun
 
--[==[
This function, unlike {string.format} and {mw.ustring.format}, takes just two parameters, a format string and a table,
and replaces all instances of { {param_name} } in the format string with the table's entry for {param_name}. The opening
and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry beginning with a
backslash can be escaped by doubling the initial backslash.
 
====Examples====
* {string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"}) }
*: produces: {"one fish, two fish, red fish, blue fish"}
* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}
*: produces: {"The set {1, 2, 3} contains three elements."}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.
]==]
function export.format(str, tbl)
	-- Adapt the table to the lookup-function interface expected by format_fun.
	local function lookup(key)
		return tbl[key]
	end
	return format_fun(str, lookup)
end
 
do
local function do_uclcfirst(str, case_func)
-- Re-case the first letter.
local first, remainder = match(str, "^(.[\128-\191]*)(.*)")
return first and (case_func(first) .. remainder) or ""
end
local function uclcfirst(str, case_func)
-- Strip off any HTML tags at the beginning. This currently does not handle comments or <ref>...</ref>
-- correctly; it's intended for text wrapped in <span> or the like, as happens when passing text through
-- [[Module:links]].
local html_at_beginning = nil
if str:match("^<") then
while true do
local html_tag, rest = str:match("^(<.->)(.*)$")
if not html_tag then
break
end
if not html_at_beginning then
html_at_beginning = {}
end
insert(html_at_beginning, html_tag)
str = rest
end
end
 
-- If there's a link at the beginning, re-case the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
local retval
if link then
retval = "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
else
else
return "[[" .. link .. "|" .. linktext .. "]]"
retval = do_uclcfirst(str, case_func)
end
if html_at_beginning then
retval = concat(html_at_beginning) .. retval
end
end
return retval
end
--[==[
Uppercase the first character of the input string, correctly handling one-part and two-part links, optionally
surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
uppercase the first character of text that may include links that have been passed through `full_link()` in
[[Module:links]] or a similar function.
]==]
function export.ucfirst(str)
return uclcfirst(str, uupper)
end
end
ucfirst = export.ucfirst


-- Check for a link. This pattern matches both piped and unpiped links.
--[==[
-- If the link is not piped, the second capture (linktext) will be empty.
Lowercase the first character of the input string, correctly handling one-part and two-part links, optionally
local beginning, link, linktext = export.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
if link then
lowercase the first character of text that may include links that have been passed through `full_link()` in
if linktext ~= "" then
[[Module:links]] or a similar function.
return beginning .. collapse_link(link, do_singularize(linktext))
]==]
end
function export.lcfirst(str)
return beginning .. "[[" .. do_singularize(link) .. "]]"
return uclcfirst(str, ulower)
end
--[==[Capitalizes each word of the input string. WARNING: May be broken in the presence of multiword links.]==]
function export.capitalize(str)
-- Capitalize multi-word that is separated by spaces
-- by uppercasing the first letter of each part.
return (ugsub(str, "%w+", ucfirst))
end
end


return do_singularize(text)
local function do_title_case(first, remainder)
end
first = uupper(first)
return remainder == "" and first or (first .. ulower(remainder))
end


--[==[
Capitalizes each word of the input string, with any further letters in each word being converted to lowercase.
]==]
function export.title_case(str)
return str == "" and "" or ugsub(str, "(%w)(%w*)", do_title_case)
end
title_case = export.title_case


function export.add_indefinite_article(text, uppercase)
--[==[
local is_vowel = false
Converts the input string to {{w|Camel case|CamelCase}}. Any non-word characters are treated as breaks between
-- If there's a link at the beginning, examine the first letter of the
words. If `lower_first` is set, then the first character of the string will be lowercase (e.g. camelCase).
-- link text. This pattern matches both piped and unpiped links.
]==]
-- If the link is not piped, the second capture (linktext) will be empty.
function export.camel_case(str, lower_first)
local link, linktext, remainder = export.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
str = ugsub(str, "%W*(%w*)", title_case)
if link then
return lower_first and do_uclcfirst(str, ulower) or str
is_vowel = export.find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
else
is_vowel = export.find(text, "^[AEIOUaeiou]")
end
end
return (is_vowel and (uppercase and "An " or "an ") or (uppercase and "A " or "a ")) .. text
end
end


-- Convert risky characters to HTML entities, which minimizes interference once returned (e.g. for "sms:a", "<!-- -->" etc.).
do
function export.escape_risky_characters(text)
local function do_snake_case(nonword, word)
if text:match("\"'") then
return nonword == "" and word or "_" .. word
for _, pattern in ipairs(require("Module:languages/data/patterns")) do
text = text:gsub(pattern, function(m1) return mw.text.encode(m1, "\"'") end)
end
end
end
-- Spacing characters in isolation generally need to be escaped in order to be properly processed by the MediaWiki software.
 
if not mw.ustring.match(text, "%S") then
--[==[
return mw.text.encode(text, "%s")
Converts the input string to {{w|Snake case|snake_case}}. Any non-word characters are treated as breaks between
else
words.
return mw.text.encode(text, "!#%%&*+/:;<=>?@[\\%]_{|}")
]==]
function export.snake_case(str)
return (ugsub(str, "(%W*)(%w*)", do_snake_case))
end
end
end
end


return export
return export

Latest revision as of 17:47, 4 November 2025



local export = {}

local function_module = "Module:fun"
local load_module = "Module:load"
local memoize_module = "Module:memoize"
local string_char_module = "Module:string/char"
local string_charset_escape_module = "Module:string/charsetEscape"

local mw = mw
local string = string
local table = table
local ustring = mw.ustring

local byte = string.byte
local char = string.char
local concat = table.concat
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local len = string.len
local lower = string.lower
local match = string.match
local next = next
local require = require
local reverse = string.reverse
local select = select
local sort = table.sort
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ucodepoint = ustring.codepoint
local ufind = ustring.find
local ugcodepoint = ustring.gcodepoint
local ugmatch = ustring.gmatch
local ugsub = ustring.gsub
local ulower = ustring.lower
local umatch = ustring.match
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
local upper = string.upper
local usub = ustring.sub
local uupper = ustring.upper

local memoize = require(memoize_module)

-- Defined below.
local codepoint
local explode_utf8
local format_fun
local get_charset
local gsplit
local pattern_escape
local pattern_simplifier
local replacement_escape
local title_case
local trim
local ucfirst
local ulen

--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures
modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no
overhead after the first call, since the target functions are called directly in any subsequent calls.
]==]
local function charset_escape(...)
	-- First call: require the target module, rebind this local to what it
	-- returned, then forward the arguments to it.
	local loaded = require(string_charset_escape_module)
	charset_escape = loaded
	return loaded(...)
end

local function is_callable(...)
	-- First call: fetch `is_callable` from the function module, rebind this
	-- local to it, then forward the arguments.
	local loaded = require(function_module).is_callable
	is_callable = loaded
	return loaded(...)
end

local function load_data(...)
	-- First call: fetch `load_data` from the load module, rebind this local to
	-- it, then forward the arguments.
	local loaded = require(load_module).load_data
	load_data = loaded
	return loaded(...)
end

local function u(...)
	-- First call: require the string/char module, rebind this local to what it
	-- returned, then forward the arguments to it.
	local loaded = require(string_char_module)
	u = loaded
	return loaded(...)
end

-- Chooses which library (string or ustring) an iteration over `str` should
-- use. Returns four values: the pattern to use, the length of `str` measured
-- with the chosen library, the library table itself, and whether the pattern
-- is callable.
local function prepare_iter(str, pattern, str_lib, plain)
	local pattern_is_func = is_callable(pattern)
	-- Forced string library, or plain matching: work on bytes directly.
	if str_lib or plain then
		return pattern, #str, string, pattern_is_func
	end
	-- A real pattern may be convertible to an equivalent byte pattern.
	if not pattern_is_func then
		local byte_pattern = pattern_simplifier(pattern)
		if byte_pattern then
			return byte_pattern, #str, string, false
		end
	end
	-- Otherwise fall back to the ustring library.
	return pattern, ulen(str), ustring, pattern_is_func
end

--[==[
Returns {nil} if the input value is the empty string, or otherwise the same value.

If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is
the empty string, returns {nil}.

If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input
string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation
marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also
be embedded this way (e.g. {"''foo''"} returns {"'foo'"}).
]==]
function export.is_not_empty(str, do_trim, quote_delimiters)
	if str == "" then
		return nil
	end
	-- Non-strings (including nil and false) are passed through untouched.
	if type(str) ~= "string" then
		return str
	end
	if do_trim then
		str = trim(str)
		if str == "" then
			return nil
		end
	end
	if not quote_delimiters then
		return str
	end
	-- Strip one matching outer pair of quotation marks, if there is one.
	local unquoted = gsub(str, "^(['\"])(.*)%1$", "%2")
	return unquoted
end

--[==[
Explodes a string into an array of UTF-8 characters. '''Warning''': this function assumes that the input is valid UTF-8
in order to optimize speed and memory use. Passing in an input containing non-UTF-8 byte sequences could result in
unexpected behaviour.
]==]
function export.explode_utf8(str)
	local chars = {}
	-- Each match is one byte plus any continuation bytes (128-191) after it.
	for ch in gmatch(str, ".[\128-\191]*") do
		chars[#chars + 1] = ch
	end
	return chars
end
explode_utf8 = export.explode_utf8

--[==[
Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true:
* It has the expected number of bytes, which is determined by value of the leading byte: 1-byte characters are `0x00` to
  `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte
  characters start with `0xF0` to `0xF4`.
* The leading byte must not fall outside of the above ranges.
* The trailing byte(s) (if any), must be between `0x80` to `0xBF`.
* The character's codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`).
* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to
  U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings
  that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte
  character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000).
  Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but
  `0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`.

If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in
UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of
surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher
codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances
where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly
4 hexadecimal digits; under the scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates,
even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`).
]==]
function export.isutf8(str, allow_surrogates)
	-- Only non-ASCII runs need checking: each match is one byte >= 0x80 plus
	-- the continuation bytes (0x80-0xBF) that immediately follow it.
	for seq in gmatch(str, "[\128-\255][\128-\191]*") do
		local size = #seq
		local lead, second = byte(seq, 1, 2)
		local valid
		if size == 2 then
			-- 0xC0 and 0xC1 would be overlong encodings.
			valid = lead >= 0xC2 and lead <= 0xDF
		elseif size == 3 then
			if lead > 0xEF then
				valid = false
			elseif second < 0xA0 then
				-- 0xE0 followed by a byte below 0xA0 is overlong.
				valid = lead >= 0xE1
			else
				-- 0xED followed by 0xA0 or above is a surrogate.
				valid = lead >= 0xE0 and (allow_surrogates or lead ~= 0xED)
			end
		elseif size == 4 then
			if second < 0x90 then
				-- 0xF0 followed by a byte below 0x90 is overlong.
				valid = lead >= 0xF1 and lead <= 0xF4
			else
				-- 0xF4 followed by 0x90 or above is beyond U+10FFFF.
				valid = lead >= 0xF0 and lead <= 0xF3
			end
		else
			-- A lone high byte, or more than 4 bytes, is never valid.
			valid = false
		end
		if not valid then
			return false
		end
	end
	return true
end

do
	-- Escapes for the characters that are escaped both inside and outside a
	-- character set; the null character maps to the %z class.
	local charset_chars = {
		["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^"
	}
	-- Lets `charset_chars` act as a metatable whose lookups fall back to itself.
	charset_chars.__index = charset_chars
	
	-- Escapes for the remaining magic characters; any key not found here is
	-- looked up in `charset_chars` through the metatable.
	local chars = setmetatable({
		["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+",
		["."] = "%.", ["?"] = "%?", ["["] = "%["
	}, charset_chars)

	--[==[
	Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's
	version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example,
	{"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving
	arbitrary text (e.g. from user input).
	]==]
	function export.pattern_escape(str)
		-- Every magic character (and the null byte) is swapped for its entry in
		-- the `chars` lookup table; only the resulting string is returned.
		local escaped = gsub(str, "[%z$%%()*+%-.?[%]^]", chars)
		return escaped
	end
	pattern_escape = export.pattern_escape

	--[==[
	Escapes only {%}, which is the only magic character used in replacement
	[[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.
	]==]
	function export.replacement_escape(str)
		-- Double every percent sign; only the resulting string is returned.
		local escaped = gsub(str, "%%", "%%%%")
		return escaped
	end
	replacement_escape = export.replacement_escape

	-- Turns one character into a pattern fragment that matches it in either
	-- case: a two-character set for cased characters, or the (escaped)
	-- character itself when it has no case distinction.
	local function case_insensitive_char(ch)
		local upper_form, other_form = uupper(ch), ch
		if upper_form == ch then
			-- Already uppercase (or caseless): try the lowercase form instead.
			other_form = ulower(ch)
			if other_form == upper_form then
				return chars[other_form] or other_form
			end
		end
		local escaped_upper = charset_chars[upper_form] or upper_form
		local escaped_other = charset_chars[other_form] or other_form
		return "[" .. escaped_upper .. escaped_other .. "]"
	end

	-- Processes one find result for case_insensitive_pattern, appending the
	-- produced pieces to `text`.
	--   str, str_len: the input and its length in the units of the library in use.
	--   text, n: output array and its current size.
	--   start: position from which `str` has not yet been consumed.
	--   _gsub, _sub: the gsub/sub functions of the library in use.
	--   loc1, loc2: start and end of the match, as returned by the caller's find call.
	-- Returns the new `n` and `start`, or nothing once the input is finished
	-- (which leaves the caller's `start` as nil).
	local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2)
		if not (loc1 and start <= str_len) then
			-- Add final chunk and return.
			n = n + 1
			text[n] = _gsub(_sub(str, start), ".", chars)
			return
		elseif loc2 < loc1 then
			-- Empty match: consume one character so the next search advances.
			if _sub == sub then
				-- Byte-based string.sub is in use, so extend loc1 over the
				-- trailing bytes of a multibyte character, judged by its
				-- leading byte.
				local b = byte(str, loc1)
				if b and b >= 128 then
					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
				end
			end
			n = n + 1
			text[n] = _gsub(_sub(str, start, loc1), ".", chars)
			start = loc1 + 1
			if start > str_len then
				return
			end
		else
			-- Add chunk up to the current match.
			n = n + 1
			text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars)
			-- Add current match.
			n = n + 1
			text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char)
			start = loc2 + 1
		end
		return n, start
	end

	--[==[
	Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes
	all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second
	argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns
	any pattern matching facilities off in the optional pattern supplied.
	]==]
	function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
		-- Without a pattern, every character is made case-insensitive in a
		-- single pass over the string.
		if pattern_or_func == nil then
			local char_pattern = str_lib and "[^\128-\255]" or ".[\128-\191]*"
			return (gsub(str, char_pattern, case_insensitive_char))
		end

		local pieces, count, cursor = {}, 0, 1
		local finder, total, lib, is_func = prepare_iter(str, pattern_or_func, str_lib, plain)
		local lib_find, lib_gsub, lib_sub = lib.find, lib.gsub, lib.sub

		if is_func then
			-- Custom find function: call it repeatedly until the input runs out.
			while cursor do
				count, cursor = iterate(str, total, pieces, count, cursor, lib_gsub, lib_sub, finder(str, cursor))
			end
		elseif byte(finder) == 0x5E then -- ^
			-- An anchored pattern gets exactly one attempt, because "^" anchors
			-- to the search position rather than to the start of the string.
			-- The remainder is then handled by matching the end of the string.
			count, cursor = iterate(str, total, pieces, count, cursor, lib_gsub, lib_sub, lib_find(str, finder, cursor, plain))
			if cursor ~= nil then
				iterate(str, total, pieces, count, cursor, lib_gsub, lib_sub, lib_find(str, "$", cursor, plain))
			end
		else
			while cursor do
				count, cursor = iterate(str, total, pieces, count, cursor, lib_gsub, lib_sub, lib_find(str, finder, cursor, plain))
			end
		end

		return concat(pieces)
	end
end

do
	local character_classes
	-- Builds the byte-keyed set of character class letters on first use, stores
	-- it in `character_classes`, and removes itself so it cannot run twice.
	local function get_character_classes()
		local classes = {[0x5A] = true} -- Z; z is dealt with separately.
		for letter in gmatch("ACDLPSUWX", ".") do
			local code = byte(letter)
			-- The lowercase form of an ASCII letter is 0x20 above the uppercase.
			classes[code], classes[code + 0x20] = true, true
		end
		character_classes, get_character_classes = classes, nil
		return classes
	end
	
	-- Recursively compares two nested sets: true if both have exactly the same
	-- keys, with values that are either identical or themselves equal sets.
	local function check_sets_equal(set1, set2)
		local cursor
		for key, value1 in next, set1 do
			local value2 = set2[key]
			if value1 ~= value2 then
				if value2 == nil or not check_sets_equal(value1, value2) then
					return false
				end
			end
			-- Step through set2 in parallel, so leftover keys can be detected.
			cursor = next(set2, cursor)
		end
		-- set2 must be exhausted as well, or it holds keys that set1 lacks.
		return next(set2, cursor) == nil
	end
	
	-- Checks that every entry of `bytes` holds an equal nested set, at every
	-- level of nesting. A level whose first value is `true` is accepted as-is.
	local function check_sets(bytes)
		local first_key, first_set = next(bytes)
		if first_set == true then
			return true
		end
		if not check_sets(first_set) then
			return false
		end
		-- Compare each remaining entry against the first one.
		for _, other_set in next, bytes, first_key do
			if not check_sets_equal(other_set, first_set) then
				return false
			end
		end
		return true
	end
	
	-- Converts an array of byte values into a pattern fragment: the bare
	-- character for a single byte, otherwise a bracketed set in which runs of
	-- consecutive bytes are written as ranges. Sorts `range` in place.
	local function make_charset(range)
		local size = #range
		if size == 1 then
			return char(range[1])
		end
		sort(range)
		local pieces, run_start = {}, range[1]
		for idx = 1, size do
			local current, following = range[idx], range[idx + 1]
			-- A run ends where the next byte is not the direct successor.
			if following ~= current + 1 then
				local piece = char(current)
				if current ~= run_start then
					piece = char(run_start) .. "-" .. piece
				end
				pieces[#pieces + 1] = piece
				run_start = following
			end
		end
		return "[" .. concat(pieces) .. "]"
	end
	
	-- Scans a character set in `pattern` from `pos`, and returns the position
	-- just after its closing "]". Returns false if the set is unterminated, or
	-- contains a byte from 192 to 255, a character class, or an escape that is
	-- missing or non-ASCII.
	local function parse_1_byte_charset(pattern, pos)
		while true do
			local found
			pos, found = match(pattern, "()([%%%]\192-\255])", pos)
			if found == "]" then
				return pos + 1
			elseif found ~= "%" then
				-- Either no match at all, or a leading byte of a multibyte char.
				return false
			end
			local escaped = byte(pattern, pos + 1)
			if not escaped or escaped >= 128 or (character_classes or get_character_classes())[escaped] then -- acdlpsuwxACDLPSUWXZ, but not z
				return false
			end
			-- Skip over the "%" and the character it escapes.
			pos = pos + 2
		end
	end
	
	--[==[
	Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion
	isn't possible, returns false.
	]==]
	function pattern_simplifier(pattern)
		-- Numbers are simply converted to their string form.
		if type(pattern) == "number" then
			return tostring(pattern)
		end
		-- `pos` is the scan position; `start` is the beginning of the part of
		-- `pattern` not yet copied to `output`; `output` (with size `n`) holds
		-- the rewritten pieces and is only created once a rewrite is needed.
		local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0
		while true do
			-- Find the next "%", "(", ".", "[" or byte from 192 to 255, together
			-- with any bytes from 128 to 191 that follow it.
			-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails.
			pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos)
			if not ch then
				break
			end
			local nxt = byte(pattern, nxt_pos)
			if ch == "%" then
				if nxt == 0x62 then -- b
					-- %b takes two delimiter characters; both must be below 128.
					local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3)
					if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then
						return false
					end
					pos = pos + 4
				elseif nxt == 0x66 then -- f
					nxt_pos = nxt_pos + 2
					local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos)
					-- Only possible to convert a positive %f charset which is
					-- all ASCII, so use parse_1_byte_charset.
					if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^
						return false
					elseif nxt3 == 0x5D then -- Initial ] is non-magic.
						nxt_pos = nxt_pos + 1
					end
					pos = parse_1_byte_charset(pattern, nxt_pos)
					if not pos then
						return false
					end
				elseif nxt == 0x5A then -- Z
					nxt = byte(pattern, nxt_pos + 1)
					if nxt == 0x2A or nxt == 0x2D then -- *-
						-- "%Z*" and "%Z-" are left as they are.
						pos = pos + 3
					else
						if output == nil then
							output = {}
						end
						-- Rewrite %Z as one non-null, non-continuation byte,
						-- followed by whatever the quantifier requires.
						local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]"
						n = n + 1
						if nxt == 0x2B then -- +
							output[n] = ins .. "%Z*"
							pos = pos + 3
						elseif nxt == 0x3F then -- ?
							output[n] = ins .. "?[\128-\191]*"
							pos = pos + 3
						else
							output[n] = ins .. "[\128-\191]*"
							pos = pos + 2
						end
						start = pos
					end
				elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz
					return false
				-- Skip the next character if it's ASCII. Otherwise, we will
				-- still need to do length checks.
				else
					pos = pos + (nxt < 128 and 2 or 1)
				end
			elseif ch == "(" then
				-- Fails on "()" and on reaching 32 capture groups.
				if nxt == 0x29 or capture_groups == 32 then -- )
					return false
				end
				capture_groups = capture_groups + 1
				pos = pos + 1
			elseif ch == "." then
				if nxt == 0x2A or nxt == 0x2D then -- *-
					-- ".*" and ".-" are left as they are.
					pos = pos + 2
				else
					if output == nil then
						output = {}
					end
					-- Rewrite "." as one non-continuation byte, followed by
					-- whatever the quantifier requires.
					local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]"
					n = n + 1
					if nxt == 0x2B then -- +
						output[n] = ins .. ".*"
						pos = pos + 2
					elseif nxt == 0x3F then -- ?
						output[n] = ins .. "?[\128-\191]*"
						pos = pos + 2
					else
						output[n] = ins .. "[\128-\191]*"
						pos = pos + 1
					end
					start = pos
				end
			elseif ch == "[" then
				-- Fail negative charsets. TODO: 1-byte charsets should be safe.
				if nxt == 0x5E then -- ^
					return false
				-- If the first character is "%", ch_len is determined by the
				-- next one instead.
				elseif nxt == 0x25 then -- %
					nxt = byte(pattern, nxt_pos + 1)
				elseif nxt == 0x5D then -- Initial ] is non-magic.
					nxt_pos = nxt_pos + 1
				end
				if not nxt then
					return false
				end
				-- Byte length of the first character, judged by its leading byte.
				local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4
				if ch_len == 1 then -- Single-byte charset.
					pos = parse_1_byte_charset(pattern, nxt_pos)
					if not pos then
						return false
					end
				else -- Multibyte charset.
					-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST.
					local charset_pos, bytes = pos
					pos = pos + 1
					while true do -- TODO: non-ASCII charset ranges.
						pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos)
						-- If escaped, get the next character. No need to
						-- distinguish magic characters or character classes,
						-- as they'll all fail for having the wrong length
						-- anyway.
						if ch == "%" then
							pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos)
						elseif ch == "]" then
							pos = nxt_pos
							break
						end
						-- Every character in the set must have the same byte
						-- length as the first one.
						if not (ch and nxt_pos - pos == ch_len) then
							return false
						elseif bytes == nil then
							bytes = {}
						end
						-- Record the character in a nested table keyed by each
						-- byte in turn, with the final byte mapped to `true`.
						local bytes, last = bytes, nxt_pos - 1
						for i = pos, last - 1 do
							local b = byte(pattern, i)
							local bytes_b = bytes[b]
							if bytes_b == nil then
								bytes_b = {}
								bytes[b] = bytes_b
							end
							bytes[b], bytes = bytes_b, bytes_b
						end
						bytes[byte(pattern, last)] = true
						pos = nxt_pos
					end
					if not pos then
						return false
					end
					nxt = byte(pattern, pos)
					if (
						(nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-?
						(nxt == 0x2B and ch_len > 2) or -- +
						not check_sets(bytes)
					) then
						return false
					end
					-- Flatten the nested table into one array of byte values
					-- per byte position.
					local ranges, b, key, next_byte = {}, 0
					repeat
						key, next_byte = next(bytes)
						local range, n = {key}, 1
						-- Loop starts on the second iteration.
						for key in next, bytes, key do
							n = n + 1
							range[n] = key
						end
						b = b + 1
						ranges[b] = range
						bytes = next_byte
					until next_byte == true
					if nxt == 0x2B then -- +
						-- Only reached for 2-byte characters: emit the leading
						-- set, then any number of bytes from either set, then
						-- the trailing set.
						local range1, range2 = ranges[1], ranges[2]
						ranges[1], ranges[3] = make_charset(range1), make_charset(range2)
						local n = #range2
						for i = 1, #range1 do
							n = n + 1
							range2[n] = range1[i]
						end
						ranges[2] = make_charset(range2) .. "*"
						pos = pos + 1
					else
						for i = 1, #ranges do
							ranges[i] = make_charset(ranges[i])
						end
					end
					if output == nil then
						output = {}
					end
					nxt = byte(pattern, pos)
					n = n + 1
					output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) ..
						((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
					start = pos
				end
			-- From here on, `ch` is a literal byte from 192 to 255.
			elseif not nxt then
				break
			elseif nxt == 0x2B then -- +
				-- Only convertible when the character is exactly 2 bytes long.
				if nxt_pos - pos ~= 2 then
					return false
				elseif output == nil then
					output = {}
				end
				pos, nxt_pos = pos + 1, nxt_pos + 1
				nxt = byte(pattern, nxt_pos)
				local ch2 = sub(pattern, pos, pos)
				n = n + 1
				output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 ..
					((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
				pos, start = nxt_pos, nxt_pos
			elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-?
				return false
			else
				pos = nxt_pos
			end
		end
		-- `start` never moved, so nothing was rewritten: return the pattern unchanged.
		if start == 1 then
			return pattern
		end
		return concat(output) .. sub(pattern, start)
	end
	-- Wrapped with the function returned by the memoize module.
	-- NOTE(review): presumably this caches results per pattern; the meaning of
	-- the second argument is not visible here - confirm against that module.
	pattern_simplifier = memoize(pattern_simplifier, true)
	export.pattern_simplifier = pattern_simplifier
end

--[==[
Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring
library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}).

The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used
(e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary
characters.
]==]
function get_charset(charset)
	-- Numbers are simply converted to their string form.
	if type(charset) == "number" then
		return tostring(charset)
	end
	-- `pos` is the scan position; `start` is the beginning of the part of
	-- `charset` not yet copied to `output`; `output` (with size `n`) holds the
	-- rewritten pieces and is only created once a rewrite is needed.
	local pos, start, n, output = 1, 1, 0
	-- Skip a leading "^", so it is never treated as part of a range.
	if byte(charset) == 0x5E then -- ^
		pos = pos + 1
	end
	 -- FIXME: "]" is non-magic if it's the first character in a charset.
	local nxt_pos, nxt
	while true do
		-- Find the next "%", "-" or "]".
		local new_pos, ch = match(charset, "()([%%%-%]])", pos)
		if not ch then
			break
		-- Skip percent escapes. Ranges can't start with them, either.
		elseif ch == "%" then
			pos = new_pos + 2
		else
			-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`.
			if ch == "-" and new_pos > pos then
				pos, nxt_pos, nxt = new_pos - 1, new_pos, ch
				ch = sub(charset, pos, pos)
			else
				pos, nxt_pos = new_pos, new_pos + 1
				nxt = sub(charset, nxt_pos, nxt_pos)
			end
			-- Range.
			if nxt == "-" then 
				if output == nil then
					output = {}
				end
				-- Copy everything before the range unchanged.
				n = n + 1
				output[n] = sub(charset, start, pos - 1)
				nxt_pos = nxt_pos + 1
				nxt = sub(charset, nxt_pos, nxt_pos)
				-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour.
				if nxt == "" or nxt == "%" then
					n = n + 1
					output[n] = (ch == "]" and "%]" or ch) .. "%-"
					start = nxt_pos
					nxt_pos = nxt_pos + 2
				-- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be
				-- adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is
				-- omitted if the range would be empty (i.e. if the first byte is greater than the second).
				else
					n = n + 1
					output[n] = (ch == "]" and (byte(nxt) >= 0x5D and "%]^" or "^") or ch) .. "-" ..
						(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt)
					nxt_pos = nxt_pos + 1
					start = nxt_pos
				end
			-- A stray "-" or "]" which is not part of a range gets escaped.
			elseif ch == "-" or ch == "]" then
				if output == nil then
					output = {}
				end
				n = n + 1
				output[n] = sub(charset, start, pos - 1) .. "%" .. ch
				start = nxt_pos
			end
			pos = nxt_pos
		end
	end
	-- `start` never moved, so nothing was rewritten: wrap the input as it is.
	if start == 1 then
		return "[" .. charset .. "]"
	end
	return "[" .. concat(output) .. sub(charset, start) .. "]"
end
-- Wrapped with the function returned by the memoize module.
-- NOTE(review): the meaning of the second argument is not visible here -
-- confirm against that module.
get_charset = memoize(get_charset, true)
export.get_charset = get_charset

-- Returns the length of `str` in UTF-8 characters, computed as the number of
-- bytes which are not continuation bytes (128 to 191). Numbers are measured
-- with string.len.
function export.len(str)
	if type(str) == "number" then
		return len(str)
	end
	-- Delete every run of non-continuation bytes; what is left over is
	-- exactly the continuation bytes.
	local continuation_bytes = gsub(str, "[^\128-\191]+", "")
	return #str - #continuation_bytes
end
ulen = export.len

-- Returns the substring of `str` running from character `i` to character `j`,
-- counted in UTF-8 characters. `i` defaults to 1; if `j` is omitted, the
-- substring runs to the end of the string. Negative indices are delegated to
-- mw.ustring.sub.
function export.sub(str, i, j)
	str, i = type(str) == "number" and tostring(str) or str, i or 1
	if i < 0 or j and j < 0 then
		return usub(str, i, j)
	-- `#str` is the byte length, which is never less than the character count.
	elseif j and i > j or i > #str then
		return ""
	end
	-- `n` counts the characters seen so far; `new_i` is the byte offset of
	-- character `i`, once it has been found.
	local n, new_i = 0
	-- Each match is a run of non-continuation bytes (one per character),
	-- followed by the continuation bytes of the last character in the run.
	-- `loc2` is the position just after the run of non-continuation bytes.
	for loc1, loc2 in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do
		n = n + loc2 - loc1
		if not new_i and n >= i then
			-- Step back from the end of the run to the first byte of character `i`.
			new_i = loc2 - (n - i) - 1
			if not j then
				return sub(str, new_i)
			end
		end
		-- Character `j + 1` is in this run, so character `j` ends on the byte
		-- immediately before it.
		if j and n > j then
			return sub(str, new_i, loc2 - (n - j) - 1)
		end
	end
	-- `j` is at or beyond the final character.
	return new_i and sub(str, new_i) or ""
end

do
	-- Converts the byte offsets returned by string.find into character
	-- offsets, passing any captures through unchanged. Nothing is converted if
	-- there was no match, or if `str` contains only bytes below 128.
	local function _find(str, loc1, loc2, ...)
		if not loc1 or match(str, "^()[^\128-\255]*$") then
			return loc1, loc2, ...
		end
		-- Character index of the first byte of the match.
		local first = ulen(sub(str, 1, loc1))
		-- Length of the match in characters.
		local width = ulen(sub(str, loc1, loc2))
		return first, first + width - 1, ...
	end
	
	--[==[A version of find which uses string.find when possible, but otherwise uses mw.ustring.find.]==]
	function export.find(str, pattern, init, plain)
		init = init or 1
		-- A start offset other than 1 in a string with bytes above 127 would
		-- need converting to a byte offset, so leave that case to ustring.
		if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
			return ufind(str, pattern, init, plain)
		end
		if plain then
			return _find(str, find(str, pattern, init, true))
		end
		local byte_pattern = pattern_simplifier(pattern)
		if not byte_pattern then
			return ufind(str, pattern, init)
		end
		return _find(str, find(str, byte_pattern, init))
	end
end

--[==[A version of match which uses string.match when possible, but otherwise uses mw.ustring.match.]==]
function export.match(str, pattern, init)
	init = init or 1
	-- A start offset other than 1 in a string with bytes above 127 is always
	-- left to ustring, since the offset counts characters rather than bytes.
	local needs_ustring = init ~= 1 and not match(str, "^()[^\128-\255]*$")
	if not needs_ustring then
		local byte_pattern = pattern_simplifier(pattern)
		if byte_pattern then
			return match(str, byte_pattern, init)
		end
	end
	return umatch(str, pattern, init)
end

--[==[A version of gmatch which uses string.gmatch when possible, but otherwise uses mw.ustring.gmatch.]==]
function export.gmatch(str, pattern)
	local byte_pattern = pattern_simplifier(pattern)
	if not byte_pattern then
		-- The pattern has no byte-level equivalent.
		return ugmatch(str, pattern)
	end
	return gmatch(str, byte_pattern)
end

--[==[A version of gsub which uses string.gsub when possible, but otherwise uses mw.ustring.gsub.]==]
function export.gsub(str, pattern, repl, n)
	local byte_pattern = pattern_simplifier(pattern)
	if not byte_pattern then
		-- The pattern has no byte-level equivalent.
		return ugsub(str, pattern, repl, n)
	end
	return gsub(str, byte_pattern, repl, n)
end

--[==[
Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.
]==]
function export.plain_gsub(str, pattern, repl, n)
	local literal_pattern = pattern_escape(pattern)
	-- Only string replacements have magic characters which need neutralising;
	-- any other kind of replacement is handed to gsub untouched.
	if type(repl) == "string" then
		repl = replacement_escape(repl)
	end
	return gsub(str, literal_pattern, repl, n)
end

--[==[
Reverses a UTF-8 string; equivalent to string.reverse.
]==]
function export.reverse(str)
	-- First reverse the bytes within each multibyte character, so that the
	-- byte-wise reversal of the whole string puts them back in order.
	local flipped = gsub(str, "[\192-\255][\128-\191]*", reverse)
	return reverse(flipped)
end

-- Thin wrapper: forwards all arguments to the file-local helper `u` and returns whatever it returns.
-- NOTE(review): `u` is defined outside this section; presumably it converts codepoints to a string - confirm.
function export.char(...) -- To be moved to [[Module:string/char]].
	return u(...)
end

do
	-- Raises a "string is not UTF-8" error on behalf of `func_name`. Level 4 skips this function, get_codepoint and
	-- the function which called get_codepoint, so the error is attributed to that function's caller.
	local function utf8_err(func_name)
		error(format("bad argument #1 to '%s' (string is not UTF-8)", func_name), 4)
	end

	-- Decodes a single UTF-8 sequence from its leading byte `b1` and up to three following bytes (any of which may
	-- be nil if the string ends early). Returns the codepoint and the number of bytes consumed, or throws via
	-- utf8_err if a required continuation byte is missing or outside 0x80-0xBF, or if the decoded value is below the
	-- minimum for the sequence length or above 0x10FFFF. The constant subtracted in each branch removes the marker
	-- bits contributed by the leading and continuation bytes.
	local function get_codepoint(func_name, b1, b2, b3, b4)
		if b1 <= 0x7F then
			return b1, 1
		elseif not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
			utf8_err(func_name)
		elseif b1 <= 0xDF then
			local cp = 0x40 * b1 + b2 - 0x3080
			return cp >= 0x80 and cp or utf8_err(func_name), 2
		elseif not (b3 and b3 >= 0x80 and b3 <= 0xBF) then
			utf8_err(func_name)
		elseif b1 <= 0xEF then
			local cp = 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080
			return cp >= 0x800 and cp or utf8_err(func_name), 3
		elseif not (b4 and b4 >= 0x80 and b4 <= 0xBF) then
			utf8_err(func_name)
		end
		local cp = 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080
		return cp >= 0x10000 and cp <= 0x10FFFF and cp or utf8_err(func_name), 4
	end

	--[==[
	Returns the codepoints of the characters of `str` from character position `i` (default 1) to character position
	`j` (default `i`). A `j` of -1 means "up to the end of the string". Returns nothing for the empty string, or if
	the requested range lies beyond the end of the string. Any other negative `i` or `j` is delegated to
	mw.ustring.codepoint, and a number input is delegated to string.byte. Throws an error if invalid UTF-8 is
	encountered while decoding.
	]==]
	function export.codepoint(str, i, j)
		if str == "" then
			return -- return nothing
		elseif type(str) == "number" then
			return byte(str, i, j)
		end
		i = i or 1
		if j == -1 then
			-- The byte length is an upper bound on the number of characters; the loop below stops by itself once
			-- the end of the string has been reached.
			j = #str
		else
			-- Honour an explicit `j`; only fall back to `i` when it has been omitted.
			j = j or i
		end
		if i == 1 and j == 1 then
			-- Fast path for the first character. The parentheses discard the byte count.
			return (get_codepoint("codepoint", byte(str, 1, 4)))
		elseif i < 0 or j < 0 then
			return ucodepoint(str, i, j) -- FIXME
		end
		-- n: current character index; nb: byte offset of that character; ret/nr: results and their count.
		local n, nb, ret, nr = 0, 1, {}, 0
		while n < j do
			n = n + 1
			if n < i then
				-- Skip over characters before `i`, using the leading byte to determine the sequence length.
				local b = byte(str, nb)
				if not b then
					-- `i` lies beyond the end of the string, so there is nothing to return.
					break
				end
				nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
			else
				local b1, b2, b3, b4 = byte(str, nb, nb + 3)
				if not b1 then
					break
				end
				nr = nr + 1
				local add
				ret[nr], add = get_codepoint("codepoint", b1, b2, b3, b4)
				nb = nb + add
			end
		end
		return unpack(ret)
	end
	codepoint = export.codepoint
	
	--[==[
	Returns an iterator function which yields, one per call, the codepoints of the characters of `str` from
	character position `i` (default 1) to character position `j` (default, or if -1: the end of the string). Any
	other negative `i` or `j` is delegated to mw.ustring.gcodepoint. The iterator throws an error if it encounters
	invalid UTF-8.
	]==]
	function export.gcodepoint(str, i, j)
		i, j = i or 1, j ~= -1 and j or nil
		if i < 0 or j and j < 0 then
			return ugcodepoint(str, i, j) -- FIXME
		end
		-- n: index of the next character to be returned; nb: its byte offset.
		local n, nb = 1, 1
		-- Skip over characters before `i`, stopping early if the string ends first.
		while n < i do
			local b = byte(str, nb)
			if not b then
				break
			end
			nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
			n = n + 1
		end
		
		return function()
			if j and n > j then
				return nil
			end
			n = n + 1
			local b1, b2, b3, b4 = byte(str, nb, nb + 3)
			if not b1 then
				return nil
			end
			local ret, add = get_codepoint("gcodepoint", b1, b2, b3, b4)
			nb = nb + add
			return ret
		end
	end
end

do
	local _ulower = ulower

	--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
	function export.lower(str)
		-- The string library is only used if `str` contains no bytes in the range 128-255.
		if match(str, "^()[^\128-\255]*$") then
			return lower(str)
		end
		return _ulower(str)
	end
end

do
	local _uupper = uupper

	--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
	function export.upper(str)
		-- The string library is only used if `str` contains no bytes in the range 128-255.
		if match(str, "^()[^\128-\255]*$") then
			return upper(str)
		end
		return _uupper(str)
	end
end

do
	-- Appends the values in `...` to the array `t`, whose current length is `n`, stopping at the first nil or false
	-- value. Returns the new length, or nothing if the first value is nil (which is how the iterator signals that
	-- it has finished).
	local function add_captures(t, n, ...)
		if ... == nil then
			return
		end
		-- The first value is the chunk itself; any further values are captures from the splitting pattern.
		local index, value = 1, ...
		while value do
			n = n + 1
			t[n] = value
			index = index + 1
			value = select(index, ...)
		end
		return n
	end
	
	--[==[
	Reimplementation of mw.text.split() which also returns any capturing groups in the splitting pattern, in the same
	way as Python's re.split() function. The difference from Python is the behaviour when the split pattern is empty:
	this function follows Lua in advancing by one character at a time, whereas Python returns the whole remainder of
	the string. The string library is used when possible, and the ustring library otherwise. There are two optional
	parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off,
	treating `pattern` as literal.
	
	Instead of a pattern, a custom find function (or callable table) may be given. It receives the input string and
	the start index as its two arguments, and must return the start and end index of the match, plus any optional
	captures, or nil if there are no further matches. By default, the start index will be calculated using the
	ustring library, unless `str_lib` or `plain` is set.
	]==]
	function export.split(str, pattern_or_func, str_lib, plain)
		local parts, count = {}, 0
		local next_chunk = gsplit(str, pattern_or_func, str_lib, plain)
		-- add_captures returns nothing once the iterator is exhausted, which ends the loop.
		while count ~= nil do
			count = add_captures(parts, count, next_chunk())
		end
		return parts
	end
	export.capturing_split = export.split -- To be removed.
end

--[==[
Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the
string up to the splitting pattern, with any capture groups being returned as additional values on that iteration.
The parameters are the same as those of {split}.
]==]
function export.gsplit(str, pattern_or_func, str_lib, plain)
	-- `start` is the index at which the next chunk begins; `final` is set once the last chunk has been returned.
	local start, final, str_len, _string, callable = 1
	pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
	local _find, _sub = _string.find, _string.sub
	
	-- Takes the return values of a find call, and returns the next chunk followed by any captures.
	local function iter(loc1, loc2, ...)
		-- If no match, or there is but we're past the end of the string
		-- (which happens when the match is the empty string), then return
		-- the final chunk.
		if not loc1 then
			final = true
			return _sub(str, start)
		end
		-- Special case: If we match the empty string, then eat the
		-- next character; this avoids an infinite loop, and makes
		-- splitting by the empty string work the way mw.text.gsplit() does
		-- (including non-adjacent empty string matches with %f). If we
		-- reach the end of the string this way, set `final` to true, so we
		-- don't get stuck matching the empty string at the end.
		local chunk
		if loc2 < loc1 then
			-- If using the string library, we need to make sure we advance
			-- by one UTF-8 character.
			if _sub == sub then
				local b = byte(str, loc1)
				if b and b >= 128 then
					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
				end
			end
			chunk = _sub(str, start, loc1)
			if loc1 >= str_len then
				final = true
			else
				start = loc1 + 1
			end
		-- Eat chunk up to the current match.
		else
			chunk = _sub(str, start, loc1 - 1)
			start = loc2 + 1
		end
		return chunk, ...
	end
	
	if callable then
		return function()
			if not final then
				return iter(pattern_or_func(str, start))
			end
		end
	-- Special case if the pattern is anchored to the start: "^" always
	-- anchors to the start position, not the start of the string, so get
	-- around this by only attempting one match with the pattern, then match
	-- the end of the string. This must not apply when `plain` is set, since
	-- a leading "^" is then a literal character rather than an anchor, and
	-- every occurrence of the separator has to be found.
	elseif not plain and byte(pattern_or_func) == 0x5E then -- ^
		local returned
		return function()
			if not returned then
				returned = true
				return iter(_find(str, pattern_or_func, start, plain))
			elseif not final then
				return iter(_find(str, "$", start, plain))
			end
		end
	end
	return function()
		if not final then
			return iter(_find(str, pattern_or_func, start, plain))
		end
	end
end
gsplit = export.gsplit

--[==[
Returns the number of matches of `pattern` in `str`, which is obtained from the substitution count returned by gsub.
If `plain` is set, `pattern` is treated as literal. Otherwise, the string library is used when the pattern can be
simplified for it, and the ustring library is used if not.
]==]
function export.count(str, pattern, plain)
	if plain then
		return select(2, gsub(str, pattern_escape(pattern), ""))
	end
	local simple = pattern_simplifier(pattern)
	if simple then
		-- The string library must be given the simplified pattern, not the original one.
		return select(2, gsub(str, simple, ""))
	end
	return select(2, ugsub(str, pattern, ""))
end

--[==[
Trims characters from both ends of `str`. If `charset` is omitted, characters matching `%s` are trimmed; if it is the
empty string, `str` is returned unchanged; otherwise, the characters in `charset` are trimmed. If `plain` is set,
`charset` is escaped and wrapped in square brackets instead of being interpreted. If `str_lib` is set, the resulting
pattern is passed straight to string.match; otherwise, the string library is only used if the pattern can be
simplified for it, with mw.ustring.match being used if not.
]==]
function export.trim(str, charset, str_lib, plain)
	if charset == nil then
		-- "^.*%S" is the fastest trim algorithm, but it is very slow on strings which only consist of characters to
		-- be trimmed, due to catastrophic backtracking. Stripping the leading whitespace with gsub first avoids
		-- this, since such strings are reduced to "".
		local left_trimmed = gsub(str, "^%s*", "")
		return match(left_trimmed, "^.*%S") or ""
	elseif charset == "" then
		return str
	end
	if plain then
		charset = "[" .. charset_escape(charset) .. "]"
	else
		charset = get_charset(charset)
	end
	-- A non-greedy quantifier is used here instead of the algorithm used for %s, because negative character sets
	-- are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). In addition, the %s algorithm would need two
	-- callbacks into PHP if the ustring library has to be used, which is slower.
	local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
	if str_lib then
		return match(str, pattern)
	end
	local simplified = pattern_simplifier(pattern)
	if simplified then
		return match(str, simplified)
	end
	return umatch(str, pattern)
end
trim = export.trim

do
	local entities
	-- Loads the entity data on first use, then removes itself so that the data is only loaded once.
	local function get_entities()
		entities = load_data("Module:data/entities")
		get_entities = nil
		return entities
	end

	-- gsub callback which receives the three captures of an entity: `hash` ("#" or ""), `x` ("x", "X" or "") and
	-- `code`. Returns the replacement, or nil to leave the entity as it is.
	local function decode_entity(hash, x, code)
		if hash == "" then
			-- Named entity: a leading "x" or "X" is part of the name.
			return (entities or get_entities())[x .. code]
		end
		local cp
		if x ~= "" then
			cp = match(code, "^()%x+$") and tonumber(code, 16)
		else
			cp = match(code, "^()%d+$") and tonumber(code)
		end
		if not cp then
			return nil
		end
		-- Only decode values up to 0x10FFFF, excluding 0xD800 to 0xDFFF.
		if cp <= 0xD7FF or (cp >= 0xE000 and cp <= 0x10FFFF) then
			return u(cp) or nil
		end
		return nil
	end

	-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases
	-- which have also been included in [[Module:data/entities]].
	function export.decode_entities(str)
		-- Entities are only possible if there is a "&" with a ";" somewhere after it.
		local amp = find(str, "&", nil, true)
		if amp and find(str, ";", amp, true) then
			return (gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity))
		end
		return str
	end
end

do
	local entities
	-- Creates the entity table on first use, then removes itself so that the table is only created once.
	local function get_entities()
		-- Memoized HTML entities (taken from mw.text.lua).
		entities = {
			["\""] = "&quot;",
			["&"] = "&amp;",
			["'"] = "&#039;",
			["<"] = "&lt;",
			[">"] = "&gt;",
			["\194\160"] = "&nbsp;",
		}
		get_entities = nil
		return entities
	end

	-- gsub callback which returns the entity for the character `ch`, or nil to leave it as it is. Results are
	-- memoized in the entity table, with false being stored for characters that cannot be encoded.
	local function encode_entity(ch)
		local cache = entities or get_entities()
		local entity = cache[ch]
		if entity ~= nil then
			return entity or nil
		end
		local cp = codepoint(ch)
		-- U+D800 to U+DFFF are surrogates, so can't be encoded as entities.
		if cp and (cp <= 0xD7FF or cp >= 0xE000) then
			entity = format("&#%d;", cp)
		else
			entity = false
		end
		cache[ch] = entity
		return entity or nil
	end

	--[==[
	Replaces characters in `str` with HTML entities. If `charset` is omitted, the characters replaced are the ones
	in the default entity table (double quote, ampersand, apostrophe, less-than, greater-than and the non-breaking
	space); if it is the empty string, `str` is returned unchanged; otherwise, each character matching `charset` is
	replaced by a decimal numeric entity (or by its entry in the entity table, if it has one). If `plain` is set,
	`charset` is escaped and wrapped in square brackets instead of being interpreted. If `str_lib` is set, the
	pattern is passed straight to string.gsub; otherwise, the string library is only used if the pattern can be
	simplified for it, with mw.ustring.gsub being used if not.
	]==]
	function export.encode_entities(str, charset, str_lib, plain)
		if charset == nil then
			return (gsub(str, "[\"&'<>\194]\160?", entities or get_entities()))
		elseif charset == "" then
			return str
		end
		local pattern
		if plain then
			pattern = "[" .. charset_escape(charset) .. "]"
		elseif charset == "." then
			pattern = charset
		else
			pattern = get_charset(charset)
		end
		if str_lib then
			return (gsub(str, pattern, encode_entity))
		end
		local simplified = pattern_simplifier(pattern)
		if simplified then
			return (gsub(str, simplified, encode_entity))
		end
		return (ugsub(str, pattern, encode_entity))
	end
end

do
	-- Converts a two-digit hexadecimal code into the byte it represents.
	local function decode_path(code)
		return char(tonumber(code, 16))
	end
	
	-- gsub callback for the QUERY and WIKI modes. `lead` is the character that was matched, and `trail` is the (up
	-- to two) hexadecimal digits which follow it.
	local function decode(lead, trail)
		if lead ~= "+" and lead ~= "_" then
			if #trail == 2 then
				return decode_path(trail)
			end
			-- Not a complete percent-escape, so leave it as it is.
			return lead .. trail
		end
		-- "+" and "_" stand for a space; any digits which were captured after them are kept.
		return " " .. trail
	end
	
	--[==[
	Decodes a percent-encoded string. `enctype` is case-insensitive, and can be one of three options:
	* {"QUERY"} (default) decodes percent-escapes, and converts `+` into a space.
	* {"PATH"} only decodes percent-escapes.
	* {"WIKI"} decodes percent-escapes, and converts `_` into a space.
	Throws an error if `enctype` is anything else.
	]==]
	function export.decode_uri(str, enctype)
		if enctype then
			enctype = upper(enctype)
		else
			enctype = "QUERY"
		end
		-- In each mode, gsub is only called if the string contains a character which could need decoding.
		if enctype == "PATH" then
			if find(str, "%", nil, true) then
				return (gsub(str, "%%(%x%x)", decode_path))
			end
			return str
		elseif enctype == "QUERY" then
			if find(str, "%", nil, true) or find(str, "+", nil, true) then
				return (gsub(str, "([%%%+])(%x?%x?)", decode))
			end
			return str
		elseif enctype == "WIKI" then
			if find(str, "%", nil, true) or find(str, "_", nil, true) then
				return (gsub(str, "([%%_])(%x?%x?)", decode))
			end
			return str
		end
		error("bad argument #2 to 'decode_uri' (expected QUERY, PATH, or WIKI)", 2)
	end
end

do
	-- Performs a single pass over `str`, removing every `<!-- ... -->` pair. If an opening `<!--` has no closing
	-- `-->`, then everything from that point onwards is dropped if `pre` is set, and kept if not.
	local function _remove_comments(str, pre)
		local open_at = find(str, "<!--", nil, true)
		if not open_at then
			return str
		end
		-- `kept` collects the pieces of text found outside of comments.
		local kept, count = {sub(str, 1, open_at - 1)}, 1
		while true do
			local close_at = find(str, "-->", open_at + 4, true)
			if not close_at then
				-- Unclosed comment.
				if pre then
					return concat(kept)
				end
				return concat(kept) .. sub(str, open_at)
			end
			-- Continue from just after the "-->".
			open_at = close_at + 3
			local next_open = find(str, "<!--", open_at, true)
			if not next_open then
				-- No more comments, so the rest of the string is kept.
				return concat(kept) .. sub(str, open_at)
			end
			count = count + 1
			kept[count] = sub(str, open_at, next_open - 1)
			open_at = next_open
		end
	end
	
	--[==[
	Removes any HTML comments from the input text. `stage` can be one of three options:
	* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all
	  {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed
	  {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or
	  [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the
	  preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags);
	  if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
	* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops
	  over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g.
	  {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed
	  {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs,
	  where the {"PRE"} method will have already been applied by the native parser.
	* {"BOTH"} applies {"PRE"} then {"POST"}.
	Throws an error if `stage` is anything else.
	]==]
	function export.remove_comments(str, stage)
		if not stage or stage == "PRE" then
			return _remove_comments(str, true)
		end
		local processed
		if stage == "POST" then
			processed = _remove_comments(str)
		elseif stage == "BOTH" then
			processed = _remove_comments(str, true)
		else
			error("bad argument #2 to 'remove_comments' (expected PRE, POST, or BOTH)", 2)
		end
		-- Repeat the POST method until a pass leaves the text unchanged.
		while processed ~= str do
			str = processed
			processed = _remove_comments(str)
		end
		return str
	end
end

do
	local byte_escapes
	-- Loads the table of byte escapes on first use, then removes itself so that the data is only loaded once.
	local function get_byte_escapes()
		byte_escapes = load_data("Module:string utilities/data").byte_escapes
		get_byte_escapes = nil
		return byte_escapes
	end
	
	-- gsub callback which returns the escape for a single byte: its entry in the table of byte escapes if there is
	-- one, or else a backslash followed by its value as three decimal digits.
	local function escape_byte(b)
		local known = (byte_escapes or get_byte_escapes())[b]
		if known then
			return known
		end
		return format("\\%03d", byte(b))
	end
	
	--[==[
	Replaces every byte of `str` with an escape, which is either taken from the `byte_escapes` table in
	[[Module:string utilities/data]], or is a backslash followed by the byte's value as three decimal digits.
	]==]
	function export.escape_bytes(str)
		return (gsub(str, ".", escape_byte))
	end
end

--[==[
Replaces each instance of { {name} } in the format string `str` with the string returned by `fun(name)`, where `fun`
is called exactly once for each instance. The opening and closing brace characters can be escaped with { {\op} } and
{ {\cl} }, respectively, and a name beginning with a backslash can be given by doubling that initial backslash. Throws
an error on any other escape sequence, if `fun` returns nil or false, or if `fun` returns something which is not a
string.
]==]
function export.format_fun(str, fun)
	return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2)
		-- Exactly one leading backslash means an escape sequence; with two, the second one belongs to the name.
		if #p1 + #p2 == 1 then
			return name == "op" and "{" or
				name == "cl" and "}" or
				error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'")
		end
		-- Call `fun` once only, so that the value which is checked is also the value which is returned.
		local value = fun(name)
		if not value then
			error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table")
		elseif type(value) ~= "string" then
			error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(value) .. ", not a string")
		end
		return value
	end))
end
format_fun = export.format_fun

--[==[
Unlike {string.format} and {mw.ustring.format}, this function takes just two parameters: a format string and a table.
Every instance of { {param_name} } in the format string is replaced with the table's entry for {param_name}. The
opening and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry
beginning with a backslash can be referred to by doubling the initial backslash.

====Examples====
* {string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"}) }
*: produces: {"one fish, two fish, red fish, blue fish"}
* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}
*: produces: {"The set {1, 2, 3} contains three elements."}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.
]==]
function export.format(str, tbl)
	-- Look each parameter name up in the table.
	local function lookup(key)
		return tbl[key]
	end
	return format_fun(str, lookup)
end

do
	-- Applies `case_func` to the first character of `str` (i.e. the first byte, plus any continuation bytes which
	-- follow it), and returns the result with the rest of the string appended. Returns "" if there is no first
	-- character.
	local function do_uclcfirst(str, case_func)
		local head, tail = match(str, "^(.[\128-\191]*)(.*)")
		if not head then
			return ""
		end
		return case_func(head) .. tail
	end
	
	local function uclcfirst(str, case_func)
		-- Peel off any HTML tags at the beginning, so that they can be put back afterwards. Comments and
		-- <ref>...</ref> are not currently handled correctly; this is intended for text wrapped in <span> or the
		-- like, as happens when passing text through [[Module:links]].
		local leading_tags
		if str:match("^<") then
			local tag, rest = str:match("^(<.->)(.*)$")
			while tag do
				leading_tags = leading_tags or {}
				insert(leading_tags, tag)
				str = rest
				tag, rest = str:match("^(<.->)(.*)$")
			end
		end

		-- If the text begins with a link, the first letter of the link text is the one to re-case. The pattern
		-- matches both piped and unpiped links; the second capture (linktext) is empty for unpiped links, in which
		-- case the re-cased link target becomes the link text.
		local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
		local result
		if link then
			local display = linktext ~= "" and linktext or link
			result = "[[" .. link .. "|" .. do_uclcfirst(display, case_func) .. "]]" .. remainder
		else
			result = do_uclcfirst(str, case_func)
		end
		if leading_tags then
			return concat(leading_tags) .. result
		end
		return result
	end
	
	--[==[
	Uppercases the first character of the input string. One-part and two-part links are handled correctly, as are
	any HTML tags which surround them, such as `<nowiki><span>...</span></nowiki>` (which may be nested). This is
	intended for text that may include links which have been passed through `full_link()` in [[Module:links]] or a
	similar function.
	]==]
	function export.ucfirst(str)
		return uclcfirst(str, uupper)
	end
	ucfirst = export.ucfirst

	--[==[
	Lowercases the first character of the input string. One-part and two-part links are handled correctly, as are
	any HTML tags which surround them, such as `<nowiki><span>...</span></nowiki>` (which may be nested). This is
	intended for text that may include links which have been passed through `full_link()` in [[Module:links]] or a
	similar function.
	]==]
	function export.lcfirst(str)
		return uclcfirst(str, ulower)
	end
	
	--[==[Capitalizes each word of the input string. WARNING: May be broken in the presence of multiword links.]==]
	function export.capitalize(str)
		-- Every run of word characters has its first letter uppercased.
		local capitalized = ugsub(str, "%w+", ucfirst)
		return capitalized
	end

	-- ugsub callback for title_case: uppercases the first character of a word, and lowercases the rest of it.
	local function do_title_case(first, remainder)
		local initial = uupper(first)
		if remainder == "" then
			return initial
		end
		return initial .. ulower(remainder)
	end

	--[==[
	Capitalizes each word of the input string, and converts any further letters in each word to lowercase.
	]==]
	function export.title_case(str)
		if str == "" then
			return ""
		end
		return (ugsub(str, "(%w)(%w*)", do_title_case))
	end
	title_case = export.title_case

	--[==[
	Converts the input string to {{w|Camel case|CamelCase}}, with any non-word characters being treated as breaks
	between words. If `lower_first` is set, then the first character of the string will be lowercase (e.g.
	camelCase).
	]==]
	function export.camel_case(str, lower_first)
		-- Each word is title-cased, and the non-word characters in front of it are dropped.
		local joined = ugsub(str, "%W*(%w*)", title_case)
		if lower_first then
			return do_uclcfirst(joined, ulower)
		end
		return joined
	end
end

do
	-- ugsub callback: returns the word as it is if no non-word characters came before it, and prefixes it with an
	-- underscore if any did.
	local function do_snake_case(nonword, word)
		if nonword == "" then
			return word
		end
		return "_" .. word
	end

	--[==[
	Converts the input string to {{w|Snake case|snake_case}}, with any non-word characters being treated as breaks
	between words.
	]==]
	function export.snake_case(str)
		local converted = ugsub(str, "(%W*)(%w*)", do_snake_case)
		return converted
	end
end

return export