Home
Random

Log in

Settings

About Linguifex
Disclaimers

Module:string utilities: Difference between revisions

Language
Watch
View history
View source

@@ Line 1: / Line 1: @@
-local module_name = "string_utilities"
 local export = {}
-local rfind = mw.ustring.find
+local function_module = "Module:fun"
+local load_module = "Module:load"
+local memoize_module = "Module:memoize"
+local string_char_module = "Module:string/char"
+local string_charset_escape_module = "Module:string/charsetEscape"
-local format_escapes = {
+local mw = mw
-    ["op"] = "{",
+local string = string
-    ["cl"] = "}",
+local table = table
-}
+local ustring = mw.ustring
-function export.format(str, tbl)
+local byte = string.byte
-    return (string.gsub(str, "{(\\?)((\\?)[^{}]*)}", function (p1, name, p2)
+local char = string.char
-        if #p1 + #p2 == 1 then
+local concat = table.concat
-            return format_escapes[name] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
+local find = string.find
-        else
+local format = string.format
-        	if tbl[name] and type(tbl[name]) ~= "string" then
+local gmatch = string.gmatch
-        		error(module_name .. ".format: '" .. name .. "' is a " .. type(tbl[name]) .. ", not a string")
+local gsub = string.gsub
-        	end
+local insert = table.insert
-            return tbl[name] or error(module_name .. ".format: '" .. name .. "' not found in table")
+local len = string.len
-        end
+local lower = string.lower
-    end))
+local match = string.match
+local next = next
+local require = require
+local reverse = string.reverse
+local select = select
+local sort = table.sort
+local sub = string.sub
+local tonumber = tonumber
+local tostring = tostring
+local type = type
+local ucodepoint = ustring.codepoint
+local ufind = ustring.find
+local ugcodepoint = ustring.gcodepoint
+local ugmatch = ustring.gmatch
+local ugsub = ustring.gsub
+local ulower = ustring.lower
+local umatch = ustring.match
+local unpack = unpack or table.unpack -- Lua 5.2 compatibility
+local upper = string.upper
+local usub = ustring.sub
+local uupper = ustring.upper
+local memoize = require(memoize_module)
+-- Defined below.
+local codepoint
+local explode_utf8
+local format_fun
+local get_charset
+local gsplit
+local pattern_escape
+local pattern_simplifier
+local replacement_escape
+local title_case
+local trim
+local ucfirst
+local ulen
+--[==[
+Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures
+modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no
+overhead after the first call, since the target functions are called directly in any subsequent calls.
+]==]
+-- Lazy loader: the first call fetches the real function, rebinds this local to it, then forwards the arguments.
+local function charset_escape(...)
+	local loaded = require(string_charset_escape_module)
+	charset_escape = loaded
+	return loaded(...)
+end
+-- Lazy loader: the first call fetches `is_callable` from the function module, rebinds this local to it, then forwards.
+local function is_callable(...)
+	local loaded = require(function_module).is_callable
+	is_callable = loaded
+	return loaded(...)
+end
+-- Lazy loader: the first call fetches `load_data` from the load module, rebinds this local to it, then forwards.
+local function load_data(...)
+	local loaded = require(load_module).load_data
+	load_data = loaded
+	return loaded(...)
 end
--- Reimplementation of mw.ustring.split() that includes any capturing
+local function u(...)
--- groups in the splitting pattern. This works like Python's re.split()
+	u = require(string_char_module)
--- function, except that it has Lua's behavior when the split pattern
+	return u(...)
--- is empty (i.e. advancing by one character at a time; Python returns the
--- whole remainder of the string).
-function export.capturing_split(str, pattern)
-    local ret = {}
-    -- (.-) corresponds to (.*?) in Python or Perl; () captures the
-    -- current position after matching.
-    pattern = "(.-)" .. pattern .. "()"
-    local start = 1
-    while true do
-        -- Did we reach the end of the string?
-        if start > #str then
-            table.insert(ret, "")
-            return ret
-        end
-        -- match() returns all captures as multiple return values;
-        -- we need to insert into a table to get them all.
-        local captures = {mw.ustring.match(str, pattern, start)}
-        -- If no match, add the remainder of the string.
-        if #captures == 0 then
-            table.insert(ret, mw.ustring.sub(str, start))
-            return ret
-        end
-        local newstart = table.remove(captures)
-        -- Special case: If we don't advance by any characters, then advance
-        -- by one character; this avoids an infinite loop, and makes splitting
-        -- by an empty string work the way mw.ustring.split() does. If we
-        -- reach the end of the string this way, return immediately, so we
-        -- don't get a final empty string.
-        if newstart == start then
-            table.insert(ret, mw.ustring.sub(str, start, start))
-            table.remove(captures, 1)
-            start = start + 1
-            if start > #str then
-            	return ret
-            end
-        else
-            table.insert(ret, table.remove(captures, 1))
-            start = newstart
-        end
-        -- Insert any captures from the splitting pattern.
-        for _, x in ipairs(captures) do
-            table.insert(ret, x)
-        end
-    end
 end
-local function uclcfirst(text, dolower)
+local function prepare_iter(str, pattern, str_lib, plain)
-	local function douclcfirst(text)
+	local callable = is_callable(pattern)
-		-- Actual function to re-case of the first letter.
+	if str_lib or plain then
-		local first_letter = mw.ustring.sub(text, 1, 1)
+		return pattern, #str, string, callable
-		first_letter = dolower and mw.ustring.lower(first_letter) or mw.ustring.upper(first_letter)
+	elseif not callable then
-		return first_letter .. mw.ustring.sub(text, 2)
+		local simple = pattern_simplifier(pattern)
+		if simple then
+			return simple, #str, string, false
+		end
 	end
-	-- If there's a link at the beginning, re-case the first letter of the
+	return pattern, ulen(str), ustring, callable
-	-- link text. This pattern matches both piped and unpiped links.
+end
-	-- If the link is not piped, the second capture (linktext) will be empty.
-	local link, linktext, remainder = mw.ustring.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
+--[==[
-	if link then
+Returns {nil} if the input value is the empty string, or otherwise the same value.
-		return "[[" .. link .. "|" .. douclcfirst(linktext ~= "" and linktext or link) .. "]]" .. remainder
+If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is
+the empty string, returns {nil}.
+If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input
+string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation
+marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also
+be embedded this way (e.g. {"''foo''"} returns {"'foo'"}).
+]==]
+function export.is_not_empty(str, do_trim, quote_delimiters)
+	if str == "" then
+		return nil
+	elseif not (str and type(str) == "string") then -- nil, false and non-string values pass through unchanged
+		return str
+	elseif do_trim then
+		str = trim(str)
+		if str == "" then
+			return nil
+		end
 	end
-	return douclcfirst(text)
+	return quote_delimiters and gsub(str, "^(['\"])(.*)%1$", "%2") or str -- %1 forces the closing quote to match the opening one; the and/or expression keeps only gsub's first result
 end
-function export.ucfirst(text)
+--[==[
-	return uclcfirst(text, false)
+Explodes a string into an array of UTF-8 characters. '''Warning''': this function assumes that the input is valid UTF-8
+in order to optimize speed and memory use. Passing in an input containing non-UTF-8 byte sequences could result in
+unexpected behaviour.
+]==]
+function export.explode_utf8(str)
+	local chars = {}
+	-- Each match is one byte plus every continuation byte (0x80-0xBF) that directly follows it.
+	for ch in gmatch(str, ".[\128-\191]*") do
+		chars[#chars + 1] = ch
+	end
+	return chars
 end
+explode_utf8 = export.explode_utf8
+--[==[
+Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true:
+* It has the expected number of bytes, which is determined by the value of the leading byte: 1-byte characters are `0x00` to
+  `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte
+  characters start with `0xF0` to `0xF4`.
+* The leading byte must not fall outside of the above ranges.
+* The trailing byte(s), if any, must be between `0x80` and `0xBF`.
+* The character's codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`).
+* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to
+  U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings
+  that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte
+  character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000).
+  Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but
+  `0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`.
-function export.lcfirst(text)
+If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in
-	return uclcfirst(text, true)
+UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of
+surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher
+codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances
+where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly
+four hexadecimal digits; under this scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates,
+even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`).
+]==]
+function export.isutf8(str, allow_surrogates)
+	for ch in gmatch(str, "[\128-\255][\128-\191]*") do -- each run is one non-ASCII byte plus the continuation bytes after it; ASCII is never inspected
+		if #ch > 4 then -- more continuation bytes than any character may have
+			return false
+		end
+		local b1, b2, b3, b4 = byte(ch, 1, 4)
+		if not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
+			return false -- 1-byte is always invalid, as gmatch excludes 0x00 to 0x7F
+		elseif not b3 then -- 2-byte
+			if not (b1 >= 0xC2 and b1 <= 0xDF) then -- b1 == 0xC0 or b1 == 0xC1 is overlong
+				return false
+			end
+		elseif not (b3 >= 0x80 and b3 <= 0xBF) then -- trailing byte
+			return false
+		elseif not b4 then -- 3-byte
+			if b1 > 0xEF then -- a 4-byte leading byte followed by only two trailing bytes
+				return false
+			elseif b2 < 0xA0 then
+				if b1 < 0xE1 then -- b1 == 0xE0 and b2 < 0xA0 is overlong
+					return false
+				end
+			elseif b1 < 0xE0 or (b1 == 0xED and not allow_surrogates) then -- b1 == 0xED and b2 >= 0xA0 is a surrogate
+				return false
+			end
+		elseif not (b4 >= 0x80 and b4 <= 0xBF) then -- 4-byte
+			return false
+		elseif b2 < 0x90 then
+			if not (b1 >= 0xF1 and b1 <= 0xF4) then -- b1 == 0xF0 and b2 < 0x90 is overlong
+				return false
+			end
+		elseif not (b1 >= 0xF0 and b1 <= 0xF3) then -- b1 == 0xF4 and b2 >= 0x90 is too high
+			return false
+		end
+	end
+	return true -- no invalid run was found (also reached for empty or pure-ASCII input)
 end
-function export.pluralize(text)
+do
-	if type(text) == "table" then
+	local charset_chars = {
-		-- allow calling from a template
+		["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^"
-		text = text.args[1]
+	}
+	charset_chars.__index = charset_chars
+	local chars = setmetatable({
+		["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+",
+		["."] = "%.", ["?"] = "%?", ["["] = "%["
+	}, charset_chars)
+	--[==[
+	Returns `str` with each magic character of a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]]
+	(Lua's version of regular expressions), i.e. {$%()*+-.?[]^}, prefixed by {%}, and with the null character rewritten
+	as {%z}. For example, {"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. Use this whenever a pattern is
+	built from arbitrary text (e.g. user input) which has to be matched literally.
+	]==]
+	function export.pattern_escape(str)
+		-- Binding to a local keeps only the escaped string and drops gsub's substitution count.
+		local escaped = gsub(str, "[%z$%%()*+%-.?[%]^]", chars)
+		return escaped
+	end
+	pattern_escape = export.pattern_escape
+	--[==[
+	Doubles every {%} in `str`. {%} is the only magic character in the replacement string given to string.gsub and
+	mw.ustring.gsub (see [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]]), so the result is inserted
+	literally when used as a replacement.
+	]==]
+	function export.replacement_escape(str)
+		-- Binding to a local keeps only the escaped string and drops gsub's substitution count.
+		local escaped = gsub(str, "%%", "%%%%")
+		return escaped
+	end
+	replacement_escape = export.replacement_escape
+	-- Converts one character into a pattern fragment matching it in either case: a two-member charset if the
+	-- character has distinct upper and lower forms, or otherwise the (escaped) character itself.
+	local function case_insensitive_char(ch)
+		local upper_form, other_form = uupper(ch), ch
+		if upper_form == ch then
+			-- `ch` is already its own uppercase form, so look for a lowercase partner.
+			other_form = ulower(ch)
+			if other_form == upper_form then
+				-- Caseless character: just escape it if it's magic.
+				return chars[other_form] or other_form
+			end
+		end
+		local upper_member = charset_chars[upper_form] or upper_form
+		local other_member = charset_chars[other_form] or other_form
+		return "[" .. upper_member .. other_member .. "]"
+	end
+	local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2) -- appends the next piece(s) of output to `text`; loc1/loc2 are the bounds of the latest match (nil if there was none)
+		if not (loc1 and start <= str_len) then
+			-- Add final chunk and return.
+			n = n + 1
+			text[n] = _gsub(_sub(str, start), ".", chars)
+			return -- returning nothing leaves the caller's `start` nil, which ends its loop
+		elseif loc2 < loc1 then -- empty match
+			if _sub == sub then -- byte-based library in use: widen loc1 so that it covers a whole multibyte character
+				local b = byte(str, loc1)
+				if b and b >= 128 then
+					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
+				end
+			end
+			n = n + 1
+			text[n] = _gsub(_sub(str, start, loc1), ".", chars) -- escaped only: nothing was matched, so nothing is made case-insensitive
+			start = loc1 + 1
+			if start > str_len then
+				return
+			end
+		else
+			-- Add chunk up to the current match.
+			n = n + 1
+			text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars)
+			-- Add current match.
+			n = n + 1
+			text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char)
+			start = loc2 + 1
+		end
+		return n, start -- updated buffer length and the position at which the next search begins
+	end
+	--[==[
+	Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes
+	characters case-insensitive. If only `str` is given, every character is made case-insensitive. If the second
+	argument, an optional pattern or find function (see {split}), is supplied, only the parts of `str` which it matches
+	are made case-insensitive, and the rest is merely escaped. The third argument (`str_lib`) forces use of the string
+	library, while the fourth argument (`plain`) turns any pattern matching facilities off in the optional pattern
+	supplied.
+	]==]
+	function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
+		if pattern_or_func == nil then
+			return (gsub(str, str_lib and "[^\128-\255]" or ".[\128-\191]*", case_insensitive_char)) -- with `str_lib`, only ASCII bytes are processed
+		end
+		local text, n, start, str_len, _string, callable = {}, 0, 1
+		pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
+		local _find, _gsub, _sub = _string.find, _string.gsub, _string.sub
+		if callable then
+			repeat
+				n, start = iterate(str, str_len, text, n, start, _gsub, _sub, pattern_or_func(str, start))
+			until not start
+		-- Special case if the pattern is anchored to the start: "^" always
+		-- anchors to the start position, not the start of the string, so get
+		-- around this by only attempting one match with the pattern, then match
+		-- the end of the string.
+		elseif byte(pattern_or_func) == 0x5E then -- ^
+			n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
+			if start ~= nil then
+				iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, "$", start, plain))
+			end
+		else
+			repeat
+				n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
+			until not start
+		end
+		return concat(text) -- `text` holds the escaped and case-insensitive pieces in order
+	end
+end
+do
+	local character_classes
+	-- Builds the set of bytes which form a character class when preceded by "%", caches it in `character_classes`,
+	-- and removes this initializer, since callers consult the cache before calling it.
+	local function get_character_classes()
+		local classes = {
+			[0x41] = true, [0x61] = true, -- Aa
+			[0x43] = true, [0x63] = true, -- Cc
+			[0x44] = true, [0x64] = true, -- Dd
+			[0x4C] = true, [0x6C] = true, -- Ll
+			[0x50] = true, [0x70] = true, -- Pp
+			[0x53] = true, [0x73] = true, -- Ss
+			[0x55] = true, [0x75] = true, -- Uu
+			[0x57] = true, [0x77] = true, -- Ww
+			[0x58] = true, [0x78] = true, -- Xx
+			[0x5A] = true, -- z dealt with separately.
+		}
+		character_classes = classes
+		get_character_classes = nil
+		return classes
+	end
+	local function check_sets_equal(set1, set2) -- recursively compares two nested tables key by key
+		local k2
+		for k1, v1 in next, set1 do
+			local v2 = set2[k1]
+			if v1 ~= v2 and (v2 == nil or not check_sets_equal(v1, v2)) then -- differing values are only acceptable if they compare equal recursively
+				return false
+			end
+			k2 = next(set2, k2) -- advance through set2 once per key of set1, purely to count keys
+		end
+		return next(set2, k2) == nil -- true only if set2 has no keys left over
+	end
+	local function check_sets(bytes) -- checks that all values at each level of the nested table `bytes` are equal to one another
+		local key, set1, set = next(bytes)
+		if set1 == true then -- the first value is a leaf marker, so there is nothing nested to compare
+			return true
+		elseif not check_sets(set1) then -- the first subtable must itself be consistent
+			return false
+		end
+		while true do
+			key, set = next(bytes, key)
+			if not key then -- every remaining value has been compared
+				return true
+			elseif not check_sets_equal(set, set1) then -- each remaining value must equal the first one
+				return false
+			end
+		end
 	end
-	-- Pluralize a word in a smart fashion, according to normal English rules.
-	-- 1. If word ends in consonant + -y, replace the -y with -ies.
-	-- 2. If the word ends in -s, -x, -z, -sh, -ch, add -es.
-	-- 3. Otherwise, add -s.
-	-- This handles links correctly:
-	-- 1. If a piped link, change the second part appropriately.
-	-- 2. If a non-piped link and rule #1 above applies, convert to a piped link
-	--    with the second part containing the plural.
-	-- 3. If a non-piped link and rules #2 or #3 above apply, add the plural
-	--    outside the link.
-	local function word_ends_in_consonant_plus_y(text)
+	local function make_charset(range)
-		-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
+		if #range == 1 then
-		-- apply to proper nouns, hence "the Gettys", "the public Ivys".
+			return char(range[1])
-		-- We should maybe consider applying this rule here; but it may not
+		end
-		-- be important as this function is almost always called on common nouns
+		sort(range)
-		-- (e.g. parts of speech, place types).
+		local compressed, n, start = {}, 0, range[1]
-		return text:find("[^aeiouAEIOU ]y$")
+		for i = 1, #range do
+			local this, nxt = range[i], range[i + 1]
+			if nxt ~= this + 1 then
+				n = n + 1
+				compressed[n] = this == start and char(this) or
+					char(start) .. "-" .. char(this)
+				start = nxt
+			end
+		end
+		return "[" .. concat(compressed) .. "]"
 	end
-	local function word_takes_es_plural(text)
+	local function parse_1_byte_charset(pattern, pos)
-		return text:find("[sxz]$") or text:find("[cs]h$")
+		local ch
+		while true do
+			pos, ch = match(pattern, "()([%%%]\192-\255])", pos)
+			if ch == "%" then
+				local nxt = byte(pattern, pos + 1)
+				if not nxt or nxt >= 128 or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWXZ, but not z
+					return false
+				end
+				pos = pos + 2
+			elseif ch == "]" then
+				pos = pos + 1
+				return pos
+			else
+				return false
+			end
+		end
 	end
-	local function do_pluralize(text)
+	--[==[
-		if word_ends_in_consonant_plus_y(text) then
+	Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion
-			-- avoid returning multiple values
+	isn't possible, returns false.
-			local hack_single_retval = text:gsub("y$", "ies")
+	]==]
-			return hack_single_retval
+	function pattern_simplifier(pattern)
-		elseif word_takes_es_plural(text) then
+		if type(pattern) == "number" then
-			return text .. "es"
+			return tostring(pattern)
+		end
+		local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0
+		while true do
+			-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails.
+			pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos)
+			if not ch then
+				break
+			end
+			local nxt = byte(pattern, nxt_pos)
+			if ch == "%" then
+				if nxt == 0x62 then -- b
+					local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3)
+					if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then
+						return false
+					end
+					pos = pos + 4
+				elseif nxt == 0x66 then -- f
+					nxt_pos = nxt_pos + 2
+					local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos)
+					-- Only possible to convert a positive %f charset which is
+					-- all ASCII, so use parse_1_byte_charset.
+					if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^
+						return false
+					elseif nxt3 == 0x5D then -- Initial ] is non-magic.
+						nxt_pos = nxt_pos + 1
+					end
+					pos = parse_1_byte_charset(pattern, nxt_pos)
+					if not pos then
+						return false
+					end
+				elseif nxt == 0x5A then -- Z
+					nxt = byte(pattern, nxt_pos + 1)
+					if nxt == 0x2A or nxt == 0x2D then -- *-
+						pos = pos + 3
+					else
+						if output == nil then
+							output = {}
+						end
+						local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]"
+						n = n + 1
+						if nxt == 0x2B then -- +
+							output[n] = ins .. "%Z*"
+							pos = pos + 3
+						elseif nxt == 0x3F then -- ?
+							output[n] = ins .. "?[\128-\191]*"
+							pos = pos + 3
+						else
+							output[n] = ins .. "[\128-\191]*"
+							pos = pos + 2
+						end
+						start = pos
+					end
+				elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz
+					return false
+				-- Skip the next character if it's ASCII. Otherwise, we will
+				-- still need to do length checks.
+				else
+					pos = pos + (nxt < 128 and 2 or 1)
+				end
+			elseif ch == "(" then
+				if nxt == 0x29 or capture_groups == 32 then -- )
+					return false
+				end
+				capture_groups = capture_groups + 1
+				pos = pos + 1
+			elseif ch == "." then
+				if nxt == 0x2A or nxt == 0x2D then -- *-
+					pos = pos + 2
+				else
+					if output == nil then
+						output = {}
+					end
+					local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]"
+					n = n + 1
+					if nxt == 0x2B then -- +
+						output[n] = ins .. ".*"
+						pos = pos + 2
+					elseif nxt == 0x3F then -- ?
+						output[n] = ins .. "?[\128-\191]*"
+						pos = pos + 2
+					else
+						output[n] = ins .. "[\128-\191]*"
+						pos = pos + 1
+					end
+					start = pos
+				end
+			elseif ch == "[" then
+				-- Fail negative charsets. TODO: 1-byte charsets should be safe.
+				if nxt == 0x5E then -- ^
+					return false
+				-- If the first character is "%", ch_len is determined by the
+				-- next one instead.
+				elseif nxt == 0x25 then -- %
+					nxt = byte(pattern, nxt_pos + 1)
+				elseif nxt == 0x5D then -- Initial ] is non-magic.
+					nxt_pos = nxt_pos + 1
+				end
+				if not nxt then
+					return false
+				end
+				local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4
+				if ch_len == 1 then -- Single-byte charset.
+					pos = parse_1_byte_charset(pattern, nxt_pos)
+					if not pos then
+						return false
+					end
+				else -- Multibyte charset.
+					-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST.
+					local charset_pos, bytes = pos
+					pos = pos + 1
+					while true do -- TODO: non-ASCII charset ranges.
+						pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos)
+						-- If escaped, get the next character. No need to
+						-- distinguish magic characters or character classes,
+						-- as they'll all fail for having the wrong length
+						-- anyway.
+						if ch == "%" then
+							pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos)
+						elseif ch == "]" then
+							pos = nxt_pos
+							break
+						end
+						if not (ch and nxt_pos - pos == ch_len) then
+							return false
+						elseif bytes == nil then
+							bytes = {}
+						end
+						local bytes, last = bytes, nxt_pos - 1
+						for i = pos, last - 1 do
+							local b = byte(pattern, i)
+							local bytes_b = bytes[b]
+							if bytes_b == nil then
+								bytes_b = {}
+								bytes[b] = bytes_b
+							end
+							bytes[b], bytes = bytes_b, bytes_b
+						end
+						bytes[byte(pattern, last)] = true
+						pos = nxt_pos
+					end
+					if not pos then
+						return false
+					end
+					nxt = byte(pattern, pos)
+					if (
+						(nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-?
+						(nxt == 0x2B and ch_len > 2) or -- +
+						not check_sets(bytes)
+					) then
+						return false
+					end
+					local ranges, b, key, next_byte = {}, 0
+					repeat
+						key, next_byte = next(bytes)
+						local range, n = {key}, 1
+						-- Loop starts on the second iteration.
+						for key in next, bytes, key do
+							n = n + 1
+							range[n] = key
+						end
+						b = b + 1
+						ranges[b] = range
+						bytes = next_byte
+					until next_byte == true
+					if nxt == 0x2B then -- +
+						local range1, range2 = ranges[1], ranges[2]
+						ranges[1], ranges[3] = make_charset(range1), make_charset(range2)
+						local n = #range2
+						for i = 1, #range1 do
+							n = n + 1
+							range2[n] = range1[i]
+						end
+						ranges[2] = make_charset(range2) .. "*"
+						pos = pos + 1
+					else
+						for i = 1, #ranges do
+							ranges[i] = make_charset(ranges[i])
+						end
+					end
+					if output == nil then
+						output = {}
+					end
+					nxt = byte(pattern, pos)
+					n = n + 1
+					output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) ..
+						((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
+					start = pos
+				end
+			elseif not nxt then
+				break
+			elseif nxt == 0x2B then -- +
+				if nxt_pos - pos ~= 2 then
+					return false
+				elseif output == nil then
+					output = {}
+				end
+				pos, nxt_pos = pos + 1, nxt_pos + 1
+				nxt = byte(pattern, nxt_pos)
+				local ch2 = sub(pattern, pos, pos)
+				n = n + 1
+				output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 ..
+					((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
+				pos, start = nxt_pos, nxt_pos
+			elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-?
+				return false
+			else
+				pos = nxt_pos
+			end
+		end
+		if start == 1 then
+			return pattern
+		end
+		return concat(output) .. sub(pattern, start)
+	end
+	pattern_simplifier = memoize(pattern_simplifier, true)
+	export.pattern_simplifier = pattern_simplifier
+end
+--[==[
+Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring
+library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}).
+The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used
+(e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary
+characters.
+]==]
+function get_charset(charset)
+	if type(charset) == "number" then
+		return tostring(charset) -- numbers are returned as their string form, with no brackets added
+	end
+	local pos, start, n, output = 1, 1, 0
+	if byte(charset) == 0x5E then -- ^ (a leading negation marker is stepped over, so the scan begins after it)
+		pos = pos + 1
+	end
+	 -- FIXME: "]" is non-magic if it's the first character in a charset.
+	local nxt_pos, nxt
+	while true do
+		local new_pos, ch = match(charset, "()([%%%-%]])", pos) -- next "%", "-" or "]" at or after `pos`
+		if not ch then
+			break
+		-- Skip percent escapes. Ranges can't start with them, either.
+		elseif ch == "%" then
+			pos = new_pos + 2
 		else
-			return text .. "s"
+			-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`.
+			if ch == "-" and new_pos > pos then
+				pos, nxt_pos, nxt = new_pos - 1, new_pos, ch
+				ch = sub(charset, pos, pos)
+			else
+				pos, nxt_pos = new_pos, new_pos + 1
+				nxt = sub(charset, nxt_pos, nxt_pos)
+			end
+			-- Range.
+			if nxt == "-" then
+				if output == nil then
+					output = {}
+				end
+				n = n + 1
+				output[n] = sub(charset, start, pos - 1) -- copy the untouched text before the range
+				nxt_pos = nxt_pos + 1
+				nxt = sub(charset, nxt_pos, nxt_pos)
+				-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour.
+				if nxt == "" or nxt == "%" then
+					n = n + 1
+					output[n] = (ch == "]" and "%]" or ch) .. "%-"
+					start = nxt_pos
+					nxt_pos = nxt_pos + 2
+				-- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be
+				-- adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is
+				-- omitted if the range would be empty (i.e. if the first byte is greater than the second).
+				else
+					n = n + 1
+					output[n] = (ch == "]" and (byte(nxt) >= 0x5D and "%]^" or "^") or ch) .. "-" ..
+						(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt)
+					nxt_pos = nxt_pos + 1
+					start = nxt_pos
+				end
+			elseif ch == "-" or ch == "]" then -- a lone "-" or "]" is escaped so that it is matched literally
+				if output == nil then
+					output = {}
+				end
+				n = n + 1
+				output[n] = sub(charset, start, pos - 1) .. "%" .. ch
+				start = nxt_pos
+			end
+			pos = nxt_pos
+		end
+	end
+	if start == 1 then -- nothing was rewritten, so the input only needs wrapping in brackets
+		return "[" .. charset .. "]"
+	end
+	return "[" .. concat(output) .. sub(charset, start) .. "]"
+end
+get_charset = memoize(get_charset, true)
+export.get_charset = get_charset
+-- Returns the number of characters in `str`, counted as the bytes which are not UTF-8 continuation bytes (0x80-0xBF).
+-- Numbers are measured with string.len.
+function export.len(str)
+	if type(str) == "number" then
+		return len(str)
+	end
+	-- Deleting every run of non-continuation bytes leaves only the continuation bytes behind.
+	local continuation_bytes = gsub(str, "[^\128-\191]+", "")
+	return #str - #continuation_bytes
+end
+ulen = export.len
+function export.sub(str, i, j) -- substring by character offsets `i` to `j`, located by scanning bytes
+	str, i = type(str) == "number" and tostring(str) or str, i or 1 -- numbers become strings; `i` defaults to 1
+	if i < 0 or j and j < 0 then -- negative offsets are delegated to mw.ustring.sub
+		return usub(str, i, j)
+	elseif j and i > j or i > #str then
+		return ""
+	end
+	local n, new_i = 0
+	for loc1, loc2 in gmatch(str, "()[^\128-\191]+()[\128-\191]*") do -- a run of non-continuation bytes (loc1 to loc2 - 1), then any continuation bytes after it
+		n = n + loc2 - loc1 -- each non-continuation byte starts one character, so `n` is the number of characters seen so far
+		if not new_i and n >= i then
+			new_i = loc2 - (n - i) - 1 -- byte offset at which character `i` starts
+			if not j then
+				return sub(str, new_i)
+			end
+		end
+		if j and n > j then
+			return sub(str, new_i, loc2 - (n - j) - 1) -- ends on the byte just before character `j` + 1
 		end
 	end
+	return new_i and sub(str, new_i) or "" -- `j` is at or past the last character; "" if character `i` was never reached
+end
+do
+	-- Converts the byte offsets returned by string.find into character offsets, passing any captures through
+	-- untouched. Failed searches and pure-ASCII strings need no conversion.
+	local function _find(str, loc1, loc2, ...)
+		if not loc1 or match(str, "^()[^\128-\255]*$") then
+			return loc1, loc2, ...
+		end
+		-- The character offset of the match is the length in characters of everything up to byte `loc1`.
+		local first_char = ulen(sub(str, 1, loc1))
+		local match_chars = ulen(sub(str, loc1, loc2))
+		return first_char, first_char + match_chars - 1, ...
+	end
+	--[==[
+	A version of find which takes the string.find route whenever it can, and falls back to mw.ustring.find otherwise.
+	]==]
+	function export.find(str, pattern, init, plain)
+		init = init or 1
+		-- A start offset other than 1 counts characters, which only coincide with bytes in pure-ASCII strings.
+		if init ~= 1 and not match(str, "^()[^\128-\255]*$") then
+			return ufind(str, pattern, init, plain)
+		end
+		if plain then
+			return _find(str, find(str, pattern, init, true))
+		end
+		local simplified = pattern_simplifier(pattern)
+		if not simplified then
+			return ufind(str, pattern, init)
+		end
+		return _find(str, find(str, simplified, init))
+	end
+end
+--[==[
+A version of match which takes the string.match route whenever it can, and falls back to mw.ustring.match otherwise.
+]==]
+function export.match(str, pattern, init)
+	init = init or 1
+	-- A start offset other than 1 counts characters, which only coincide with bytes in pure-ASCII strings.
+	local byte_safe = init == 1 or match(str, "^()[^\128-\255]*$")
+	local simplified = byte_safe and pattern_simplifier(pattern)
+	if simplified then
+		return match(str, simplified, init)
+	end
+	return umatch(str, pattern, init)
+end
+--[==[
+A version of gmatch which takes the string.gmatch route whenever the pattern can be converted, and falls back to
+mw.ustring.gmatch otherwise.
+]==]
+function export.gmatch(str, pattern)
+	local simplified = pattern_simplifier(pattern)
+	if not simplified then
+		return ugmatch(str, pattern)
+	end
+	return gmatch(str, simplified)
+end
+--[==[
+A version of gsub which takes the string.gsub route whenever the pattern can be converted, and falls back to
+mw.ustring.gsub otherwise.
+]==]
+function export.gsub(str, pattern, repl, n)
+	local simplified = pattern_simplifier(pattern)
+	if not simplified then
+		return ugsub(str, pattern, repl, n)
+	end
+	return gsub(str, simplified, repl, n)
+end
+--[==[
+A gsub with pattern matching switched off: `pattern` is matched literally, and `repl` is inserted literally if it is a
+string. A `repl` of any other type is handed to gsub unchanged.
+]==]
+function export.plain_gsub(str, pattern, repl, n)
+	local literal_pattern = pattern_escape(pattern)
+	if type(repl) == "string" then
+		repl = replacement_escape(repl)
+	end
+	return gsub(str, literal_pattern, repl, n)
+end
+--[==[
+Reverses the order of the characters in a UTF-8 string; the UTF-8-aware counterpart of string.reverse.
+]==]
+function export.reverse(str)
+	-- Flip the bytes inside each multibyte character first, so that reversing the whole string byte by byte puts them
+	-- back in the right order.
+	local pre_flipped = gsub(str, "[\192-\255][\128-\191]*", reverse)
+	return reverse(pre_flipped)
+end
+function export.char(...) -- Thin wrapper kept for existing callers. To be moved to [[Module:string/char]].
+	return u(...) -- `u` is the lazily-loaded function returned by require(string_char_module)
+end
+do
+	-- Raises a "string is not UTF-8" argument error naming `func_name`. The error is raised at level 4, i.e. it is
+	-- attributed three call frames above this helper.
+	local function utf8_err(func_name)
+		error(format("bad argument #1 to '%s' (string is not UTF-8)", func_name), 4)
+	end
+	-- Decodes one UTF-8 sequence from up to four byte values (`b1` is the lead byte; `b2`-`b4` may be nil).
+	-- Returns the codepoint and the number of bytes it occupies, or raises via utf8_err(func_name) if a required
+	-- continuation byte is missing or outside 0x80-0xBF, or if the decoded value is below the minimum for its length
+	-- (or, for four bytes, above 0x10FFFF).
+	local function get_codepoint(func_name, b1, b2, b3, b4)
+		if b1 <= 0x7F then
+			-- Single byte.
+			return b1, 1
+		elseif not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
+			utf8_err(func_name)
+		elseif b1 <= 0xDF then
+			-- 0x3080 = 0x40 * 0xC0 + 0x80: removes the lead-byte and continuation-byte tag bits in one subtraction.
+			local cp = 0x40 * b1 + b2 - 0x3080
+			-- A result below 0x80 also catches a lead byte in 0x80-0xBF, for which the formula goes negative.
+			return cp >= 0x80 and cp or utf8_err(func_name), 2
+		elseif not (b3 and b3 >= 0x80 and b3 <= 0xBF) then
+			utf8_err(func_name)
+		elseif b1 <= 0xEF then
+			-- 0xE2080 = 0x1000 * 0xE0 + 0x40 * 0x80 + 0x80.
+			local cp = 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080
+			return cp >= 0x800 and cp or utf8_err(func_name), 3
+		elseif not (b4 and b4 >= 0x80 and b4 <= 0xBF) then
+			utf8_err(func_name)
+		end
+		-- 0x3C82080 = 0x40000 * 0xF0 + 0x1000 * 0x80 + 0x40 * 0x80 + 0x80.
+		local cp = 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080
+		return cp >= 0x10000 and cp <= 0x10FFFF and cp or utf8_err(func_name), 4
+	end
+	--[==[
+	Returns the codepoints of the characters at character positions `i` to `j` of `str`, as multiple return values.
+	`i` defaults to 1 and `j` defaults to `i`; a `j` of -1 means "up to the end of the string". Returns nothing for
+	the empty string. A number argument is passed straight to string.byte. Any other negative `i` or `j` is delegated
+	to mw.ustring.codepoint. Raises an error if a decoded sequence is not valid UTF-8.
+	]==]
+	function export.codepoint(str, i, j)
+		if str == "" then
+			return -- return nothing
+		elseif type(str) == "number" then
+			return byte(str, i, j)
+		end
+		-- An explicit `j` must be honoured; only a missing `j` falls back to `i`. For -1 the byte length is used as
+		-- an upper bound on the character count, since the loop below stops once the bytes run out.
+		i, j = i or 1, j == -1 and #str or j or i or 1
+		if i == 1 and j == 1 then
+			-- Fast path for the first character; the parentheses drop the byte-count return value.
+			return (get_codepoint("codepoint", byte(str, 1, 4)))
+		elseif i < 0 or j < 0 then
+			return ucodepoint(str, i, j) -- FIXME
+		end
+		-- n: current character index; nb: byte offset of that character; ret/nr: collected codepoints and count.
+		local n, nb, ret, nr = 0, 1, {}, 0
+		while n < j do
+			n = n + 1
+			if n < i then
+				-- Skip over characters before `i` using only the lead byte to find the sequence length.
+				local b = byte(str, nb)
+				if not b then
+					-- `i` lies beyond the end of the string: nothing to return.
+					break
+				end
+				nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
+			else
+				local b1, b2, b3, b4 = byte(str, nb, nb + 3)
+				if not b1 then
+					break
+				end
+				nr = nr + 1
+				local add
+				ret[nr], add = get_codepoint("codepoint", b1, b2, b3, b4)
+				nb = nb + add
+			end
+		end
+		return unpack(ret)
+	end
+	codepoint = export.codepoint
+	function export.gcodepoint(str, i, j)
+		i, j = i or 1, j ~= -1 and j or nil
+		if i < 0 or j and j < 0 then
+			return ugcodepoint(str, i, j) -- FIXME
+		end
+		local n, nb = 1, 1
+		while n < i do
+			local b = byte(str, nb)
+			if not b then
+				break
+			end
+			nb = nb + (b < 128 and 1 or b < 224 and 2 or b < 240 and 3 or 4)
+			n = n + 1
+		end
-	-- Check for a link. This pattern matches both piped and unpiped links.
+		return function()
-	-- If the link is not piped, the second capture (linktext) will be empty.
+			if j and n > j then
-	local beginning, link, linktext = mw.ustring.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
+				return nil
-	if link then
+			end
-		if linktext ~= "" then
+			n = n + 1
-			return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
+			local b1, b2, b3, b4 = byte(str, nb, nb + 3)
+			if not b1 then
+				return nil
+			end
+			local ret, add = get_codepoint("gcodepoint", b1, b2, b3, b4)
+			nb = nb + add
+			return ret
+		end
+	end
+end
+do
+	-- Keep a private reference to the file-level `ulower` as it is at this point.
+	local unicode_lower = ulower
+	--[==[
+	A version of lower which uses string.lower if `str` contains no bytes above 127, and mw.ustring.lower otherwise.
+	]==]
+	function export.lower(str)
+		if match(str, "^()[^\128-\255]*$") then
+			return lower(str)
+		end
+		return unicode_lower(str)
+	end
+end
+do
+	-- Keep a private reference to the file-level `uupper` as it is at this point.
+	local unicode_upper = uupper
+	--[==[
+	A version of upper which uses string.upper if `str` contains no bytes above 127, and mw.ustring.upper otherwise.
+	]==]
+	function export.upper(str)
+		if match(str, "^()[^\128-\255]*$") then
+			return upper(str)
+		end
+		return unicode_upper(str)
+	end
+end
+do
+	local function add_captures(t, n, ...)
+		if ... == nil then
+			return
 		end
-		if word_ends_in_consonant_plus_y(link) then
+		-- Insert any captures from the splitting pattern.
-			return beginning .. "[[" .. link .. "|" .. link:gsub("y$", "ies") .. "]]"
+		local offset, capture = n - 1, ...
+		while capture do
+			n = n + 1
+			t[n] = capture
+			capture = select(n - offset, ...)
 		end
-		return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
+		return n
 	end
-	return do_pluralize(text)
+	--[==[
+	Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like
+	Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by
+	one character at a time; Python returns the whole remainder of the string). When possible, it will use the string
+	library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the
+	string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.
+	In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start
+	index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil
+	if there are no further matches. By default, the start index will be calculated using the ustring library, unless
+	`str_lib` or `plain` is set.
+	]==]
+	function export.split(str, pattern_or_func, str_lib, plain)
+		local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0
+		repeat
+			n = add_captures(t, n, iter())
+		until n == nil
+		return t
+	end
+	export.capturing_split = export.split -- To be removed.
 end
-function export.singularize(text)
+--[==[
-	if type(text) == "table" then
+Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the
-		-- allow calling from a template
+string up to the splitting pattern, with any capture groups being returned as additional values on that iteration.
-		text = text.args[1]
+]==]
+function export.gsplit(str, pattern_or_func, str_lib, plain)
+	local start, final, str_len, _string, callable = 1
+	pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
+	local _find, _sub = _string.find, _string.sub
+	local function iter(loc1, loc2, ...)
+		-- If no match, or there is but we're past the end of the string
+		-- (which happens when the match is the empty string), then return
+		-- the final chunk.
+		if not loc1 then
+			final = true
+			return _sub(str, start)
+		end
+		-- Special case: If we match the empty string, then eat the
+		-- next character; this avoids an infinite loop, and makes
+		-- splitting by the empty string work the way mw.text.gsplit() does
+		-- (including non-adjacent empty string matches with %f). If we
+		-- reach the end of the string this way, set `final` to true, so we
+		-- don't get stuck matching the empty string at the end.
+		local chunk
+		if loc2 < loc1 then
+			-- If using the string library, we need to make sure we advance
+			-- by one UTF-8 character.
+			if _sub == sub then
+				local b = byte(str, loc1)
+				if b and b >= 128 then
+					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
+				end
+			end
+			chunk = _sub(str, start, loc1)
+			if loc1 >= str_len then
+				final = true
+			else
+				start = loc1 + 1
+			end
+		-- Eat chunk up to the current match.
+		else
+			chunk = _sub(str, start, loc1 - 1)
+			start = loc2 + 1
+		end
+		return chunk, ...
 	end
-	-- Singularize a word in a smart fashion, according to normal English rules.
-	-- Works analogously to pluralize().
+	if callable then
-	-- NOTE: This doesn't always work as well as pluralize(). Beware. It will
+		return function()
-	-- mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
+			if not final then
-	-- 1. If word ends in -ies, replace -ies with -y.
+				return iter(pattern_or_func(str, start))
-	-- 2. If the word ends in -xes, -shes, -ches, remove -es. [Does not affect
+			end
-	--    -ses, cf. "houses", "impasses".]
-	-- 3. Otherwise, remove -s.
-	-- This handles links correctly:
-	-- 1. If a piped link, change the second part appropriately. Collapse the
-	--    link to a simple link if both parts end up the same.
-	-- 2. If a non-piped link, singularize the link.
-	-- 3. A link like "[[parish]]es" will be handled correctly because the
-	--    code that checks for -shes etc. allows ] characters between the
-	--    'sh' etc. and final -es.
-	local function do_singularize(text)
-		local sing = text:match("^(.-)ies$")
-		if sing then
-			return sing .. "y"
 		end
-		-- Handle cases like "[[parish]]es"
+	-- Special case if the pattern is anchored to the start: "^" always
-		local sing = text:match("^(.-[sc]h%]*)es$")
+	-- anchors to the start position, not the start of the string, so get
-		if sing then
+	-- around this by only attempting one match with the pattern, then match
-			return sing
+	-- the end of the string.
+	elseif byte(pattern_or_func) == 0x5E then -- ^
+		local returned
+		return function()
+			if not returned then
+				returned = true
+				return iter(_find(str, pattern_or_func, start, plain))
+			elseif not final then
+				return iter(_find(str, "$", start, plain))
+			end
 		end
-		-- Handle cases like "[[box]]es"
+	end
-		local sing = text:match("^(.-x%]*)es$")
+	return function()
-		if sing then
+		if not final then
-			return sing
+			return iter(_find(str, pattern_or_func, start, plain))
 		end
-		local sing = text:match("^(.-)s$")
+	end
-		if sing then
+end
-			return sing
+gsplit = export.gsplit
+--[==[
+Returns the number of matches of `pattern` in `str`, as counted by gsub. If `plain` is set, `pattern` is treated as
+literal text. Otherwise the string library is used when the pattern can be simplified, and the ustring library when it
+cannot.
+]==]
+function export.count(str, pattern, plain)
+	if plain then
+		return select(2, gsub(str, pattern_escape(pattern), ""))
+	end
+	local simple = pattern_simplifier(pattern)
+	if simple then
+		-- The byte-level gsub must be given the simplified pattern, not the original one.
+		return select(2, gsub(str, simple, ""))
+	end
+	return select(2, ugsub(str, pattern, ""))
+end
+function export.trim(str, charset, str_lib, plain)
+	if charset == nil then
+		-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are
+		-- very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to ""
+		-- first.
+		return match(gsub(str, "^%s*", ""), "^.*%S") or ""
+	elseif charset == "" then
+		return str
+	end
+	charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
+	-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets
+	-- are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there
+	-- would be two callbacks into PHP, which is slower.
+	local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
+	if not str_lib then
+		local simple = pattern_simplifier(pattern)
+		if not simple then
+			return umatch(str, pattern)
 		end
-		return text
+		pattern = simple
+	end
+	return match(str, pattern)
+end
+trim = export.trim
+do
+	local entities
+	local function get_entities()
+		entities, get_entities = load_data("Module:data/entities"), nil
+		return entities
 	end
-	local function collapse_link(link, linktext)
+	local function decode_entity(hash, x, code)
-		if link == linktext then
+		if hash == "" then
-			return "[[" .. link .. "]]"
+			return (entities or get_entities())[x .. code]
+		end
+		local cp
+		if x == "" then
+			cp = match(code, "^()%d+$") and tonumber(code)
 		else
-			return "[[" .. link .. "|" .. linktext .. "]]"
+			cp = match(code, "^()%x+$") and tonumber(code, 16)
 		end
+		return cp and (cp <= 0xD7FF or cp >= 0xE000 and cp <= 0x10FFFF) and u(cp) or nil
 	end
-	-- Check for a link. This pattern matches both piped and unpiped links.
+	-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases
-	-- If the link is not piped, the second capture (linktext) will be empty.
+	-- which have also been included in [[Module:data/entities]].
-	local beginning, link, linktext = mw.ustring.match(text, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
+	function export.decode_entities(str)
-	if link then
+		local amp = find(str, "&", nil, true)
-		if linktext ~= "" then
+		return amp and find(str, ";", amp, true) and gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
-			return beginning .. collapse_link(link, do_singularize(linktext))
+	end
+end
+do
+	local entities
+	-- Creates the `entities` lookup table on first use and returns it. The same assignment sets `get_entities` itself
+	-- to nil, so callers must go through `entities or get_entities()`.
+	local function get_entities()
+		-- Memoized HTML entities (taken from mw.text.lua).
+		entities, get_entities = {
+			["\""] = "&quot;",
+			["&"] = "&amp;",
+			["'"] = "&#039;",
+			["<"] = "&lt;",
+			[">"] = "&gt;",
+			["\194\160"] = "&nbsp;", -- UTF-8 bytes of U+00A0
+		}, nil
+		return entities
+	end
+	local function encode_entity(ch)
+		local entity = (entities or get_entities())[ch]
+		if entity == nil then
+			local cp = codepoint(ch)
+			-- U+D800 to U+DFFF are surrogates, so can't be encoded as entities.
+			entity = cp and (cp <= 0xD7FF or cp >= 0xE000) and format("&#%d;", cp) or false
+			entities[ch] = entity
 		end
-		return beginning .. "[[" .. do_singularize(link) .. "]]"
+		return entity or nil
 	end
-	return do_singularize(text)
+	-- Replaces characters of `str` with HTML entities. With no `charset`, only the characters in the built-in entity
+	-- table are replaced; an empty `charset` returns `str` unchanged. Otherwise every character matching `charset`
+	-- is passed to encode_entity. `plain` treats `charset` as a literal list of characters, and `str_lib` forces the
+	-- string library instead of trying to simplify the pattern first.
+	function export.encode_entities(str, charset, str_lib, plain)
+		if charset == "" then
+			return str
+		elseif charset == nil then
+			-- Matches that are not keys of the table are left as they are by gsub.
+			local encoded = gsub(str, "[\"&'<>\194]\160?", entities or get_entities())
+			return encoded
+		end
+		local pattern
+		if plain then
+			pattern = "[" .. charset_escape(charset) .. "]"
+		elseif charset == "." then
+			pattern = charset
+		else
+			pattern = get_charset(charset)
+		end
+		if not str_lib then
+			local byte_pattern = pattern_simplifier(pattern)
+			if not byte_pattern then
+				local encoded = ugsub(str, pattern, encode_entity)
+				return encoded
+			end
+			pattern = byte_pattern
+		end
+		local encoded = gsub(str, pattern, encode_entity)
+		return encoded
+	end
 end
+do
+	-- Converts a string of hexadecimal digits into the single byte it denotes.
+	local function decode_path(code)
+		local byte_value = tonumber(code, 16)
+		return char(byte_value)
+	end
+	-- gsub callback for the QUERY and WIKI modes. `lead` is the matched marker character and `trail` the zero to two
+	-- hexadecimal digits that follow it.
+	local function decode(lead, trail)
+		if lead == "+" or lead == "_" then
+			return " " .. trail
+		end
+		if #trail ~= 2 then
+			-- "%" without two hex digits is left as it was.
+			return lead .. trail
+		end
+		return decode_path(trail)
+	end
+	-- Decodes percent-encoding in `str`. `enctype` is case-insensitive and defaults to "QUERY", in which "+" also
+	-- becomes a space; "WIKI" turns "_" into a space instead; "PATH" only decodes "%xx" sequences. Any other value
+	-- raises an error.
+	function export.decode_uri(str, enctype)
+		if enctype then
+			enctype = upper(enctype)
+		else
+			enctype = "QUERY"
+		end
+		-- In every mode, a cheap plain search first avoids calling gsub on strings with nothing to decode.
+		if enctype == "PATH" then
+			if find(str, "%", nil, true) then
+				return (gsub(str, "%%(%x%x)", decode_path))
+			end
+			return str
+		elseif enctype == "QUERY" then
+			if find(str, "%", nil, true) or find(str, "+", nil, true) then
+				return (gsub(str, "([%%%+])(%x?%x?)", decode))
+			end
+			return str
+		elseif enctype == "WIKI" then
+			if find(str, "%", nil, true) or find(str, "_", nil, true) then
+				return (gsub(str, "([%%_])(%x?%x?)", decode))
+			end
+			return str
+		end
+		error("bad argument #2 to 'decode_uri' (expected QUERY, PATH, or WIKI)", 2)
+	end
+end
+do
+	-- Performs a single left-to-right pass over `str`, dropping every "<!-- ... -->" pair. If a "<!--" has no closing
+	-- "-->", the text from it onwards is dropped when `pre` is truthy and kept otherwise.
+	local function _remove_comments(str, pre)
+		local open = find(str, "<!--", nil, true)
+		if not open then
+			return str
+		end
+		-- kept/count: the pieces of text outside comments, and how many there are so far.
+		local kept, count = {sub(str, 1, open - 1)}, 1
+		repeat
+			local close = find(str, "-->", open + 4, true)
+			if not close then
+				local text = concat(kept)
+				if pre then
+					return text
+				end
+				return text .. sub(str, open)
+			end
+			-- First byte after the "-->" that closed the comment.
+			local resume = close + 3
+			open = find(str, "<!--", resume, true)
+			if not open then
+				return concat(kept) .. sub(str, resume)
+			end
+			count = count + 1
+			kept[count] = sub(str, resume, open - 1)
+		until false
+	end
+	--[==[
+	Removes any HTML comments from the input text. `stage` can be one of three options:
+	* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all
+	  {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed
+	  {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or
+	  [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the
+	  preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags);
+	  if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
+	* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops
+	  over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g.
+	  {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed
+	  {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs,
+	  where the {"PRE"} method will have already been applied by the native parser.
+	* {"BOTH"} applies {"PRE"} then {"POST"}.
+	Any other value of `stage` raises an error.
+	]==]
+	function export.remove_comments(str, stage)
+		local current
+		if not stage or stage == "PRE" then
+			return _remove_comments(str, true)
+		elseif stage == "POST" then
+			current = _remove_comments(str)
+		elseif stage == "BOTH" then
+			current = _remove_comments(str, true)
+		else
+			error("bad argument #2 to 'remove_comments' (expected PRE, POST, or BOTH)", 2)
+		end
+		-- Repeat the non-PRE pass until a pass leaves the text unchanged.
+		while current ~= str do
+			str = current
+			current = _remove_comments(str)
+		end
+		return str
+	end
+end
+do
+	local escape_table
+	-- Loads the `byte_escapes` table from the data module on first use, then sets itself to nil so that it is only
+	-- ever reached through `escape_table or load_escape_table()`.
+	local function load_escape_table()
+		local loaded = load_data("Module:string utilities/data").byte_escapes
+		escape_table, load_escape_table = loaded, nil
+		return escape_table
+	end
+	-- gsub callback: returns the table's escape for the byte `b`, or a three-digit decimal escape if it has none.
+	local function escape_byte(b)
+		local known = (escape_table or load_escape_table())[b]
+		if known then
+			return known
+		end
+		return format("\\%03d", byte(b))
+	end
+	-- Returns `str` with every byte replaced by its escape.
+	function export.escape_bytes(str)
+		local escaped = gsub(str, ".", escape_byte)
+		return escaped
+	end
+end
+--[==[
+Replaces every `{name}` placeholder in `str` with the string returned by `fun(name)`. `{\op}` and `{\cl}` produce a
+literal opening and closing brace; a name that really begins with a backslash is written with the backslash doubled.
+Raises an error for an unrecognized escape sequence, for a truthy non-string value returned by `fun`, and for a name
+for which `fun` returns nil or false.
+]==]
+function export.format_fun(str, fun)
+	return (gsub(str, "{(\\?)((\\?)[^{}]*)}", function(p1, name, p2)
+		-- Exactly one leading backslash marks an escape sequence; two mean that the name itself starts with one.
+		if #p1 + #p2 == 1 then
+			return name == "op" and "{" or
+				name == "cl" and "}" or
+				error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'")
+		end
+		-- Look the name up once only: `fun` may be expensive, and repeated calls need not agree with each other.
+		local value = fun(name)
+		if value and type(value) ~= "string" then
+			error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(value) .. ", not a string")
+		end
+		return value or error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table")
+	end))
+end
+format_fun = export.format_fun
+--[==[
+This function, unlike {string.format} and {mw.ustring.format}, takes just two parameters, a format string and a table,
+and replaces all instances of { {param_name} } in the format string with the table's entry for {param_name}. The opening
+and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry beginning with a
+backslash can be escaped by doubling the initial backslash.
+====Examples====
+* {string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"}) }
+*: produces: {"one fish, two fish, red fish, blue fish"}
+* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}
+*: produces: {"The set {1, 2, 3} contains three elements."}
+*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.
+]==]
+function export.format(str, tbl)
+	-- Delegates to format_fun, with a lookup function that simply indexes `tbl`.
+	return format_fun(str, function(key)
+		return tbl[key]
+	end)
+end
+do
+	-- Applies `case_func` to the first character of `str` (one byte plus any UTF-8 continuation bytes after it) and
+	-- returns the result followed by the untouched remainder. Returns "" if `str` is empty.
+	local function do_uclcfirst(str, case_func)
+		local first, remainder = match(str, "^(.[\128-\191]*)(.*)")
+		if not first then
+			return ""
+		end
+		return case_func(first) .. remainder
+	end
+	-- Shared implementation of ucfirst and lcfirst.
+	local function uclcfirst(str, case_func)
+		-- Peel off any HTML tags at the beginning. This currently does not handle comments or <ref>...</ref>
+		-- correctly; it's intended for text wrapped in <span> or the like, as happens when passing text through
+		-- [[Module:links]].
+		local leading_tags, tag_count = nil, 0
+		if str:match("^<") then
+			local html_tag, rest = str:match("^(<.->)(.*)$")
+			while html_tag do
+				if not leading_tags then
+					leading_tags = {}
+				end
+				tag_count = tag_count + 1
+				leading_tags[tag_count] = html_tag
+				str = rest
+				html_tag, rest = str:match("^(<.->)(.*)$")
+			end
+		end
+		-- If the text now starts with a link, re-case the first letter of what the link displays. The pattern
+		-- matches both piped and unpiped links; for an unpiped link the second capture (linktext) is empty.
+		local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
+		local recased
+		if link then
+			local displayed = linktext ~= "" and linktext or link
+			recased = "[[" .. link .. "|" .. do_uclcfirst(displayed, case_func) .. "]]" .. remainder
+		else
+			recased = do_uclcfirst(str, case_func)
+		end
+		if leading_tags then
+			return concat(leading_tags) .. recased
+		end
+		return recased
+	end
+	--[==[
+	Uppercases the first character of the input string, correctly handling one-part and two-part links, optionally
+	preceded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
+	uppercase the first character of text that may include links that have been passed through `full_link()` in
+	[[Module:links]] or a similar function.
+	]==]
+	function export.ucfirst(str)
+		return uclcfirst(str, uupper)
+	end
+	ucfirst = export.ucfirst
+	--[==[
+	Lowercases the first character of the input string, correctly handling one-part and two-part links, optionally
+	preceded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
+	lowercase the first character of text that may include links that have been passed through `full_link()` in
+	[[Module:links]] or a similar function.
+	]==]
+	function export.lcfirst(str)
+		return uclcfirst(str, ulower)
+	end
+	--[==[Capitalizes each word of the input string. WARNING: May be broken in the presence of multiword links.]==]
+	function export.capitalize(str)
+		-- Every run of word characters is passed to ucfirst, which uppercases its first letter.
+		local capitalized = ugsub(str, "%w+", ucfirst)
+		return capitalized
+	end
+	-- gsub callback for title_case: uppercases `first` and lowercases `remainder`.
+	local function do_title_case(first, remainder)
+		first = uupper(first)
+		if remainder == "" then
+			return first
+		end
+		return first .. ulower(remainder)
+	end
+	--[==[
+	Capitalizes each word of the input string, with any further letters in each word being converted to lowercase.
+	]==]
+	function export.title_case(str)
+		if str == "" then
+			return ""
+		end
+		return (ugsub(str, "(%w)(%w*)", do_title_case))
+	end
+	title_case = export.title_case
+	--[==[
+	Converts the input string to {{w|Camel case|CamelCase}}. Any non-word characters are treated as breaks between
+	words. If `lower_first` is set, then the first character of the string will be lowercase (e.g. camelCase).
+	]==]
+	function export.camel_case(str, lower_first)
+		-- Each match covers a run of non-word characters plus the word after it; only the title-cased word is kept.
+		str = ugsub(str, "%W*(%w*)", title_case)
+		if lower_first then
+			return do_uclcfirst(str, ulower)
+		end
+		return str
+	end
+end
+do
+	-- gsub callback: returns `word` unchanged when no non-word characters precede it, otherwise `word` with a single
+	-- underscore in front.
+	local function do_snake_case(nonword, word)
+		if nonword == "" then
+			return word
+		end
+		return "_" .. word
+	end
-function export.add_indefinite_article(text, uppercase)
+	--[==[
-	local is_vowel = false
+	Converts the input string to {{w|Snake case|snake_case}}. Any non-word characters are treated as breaks between
-	-- If there's a link at the beginning, examine the first letter of the
+	words.
-	-- link text. This pattern matches both piped and unpiped links.
+	]==]
-	-- If the link is not piped, the second capture (linktext) will be empty.
+	function export.snake_case(str)
-	local link, linktext, remainder = mw.ustring.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
+		return (ugsub(str, "(%W*)(%w*)", do_snake_case))
-	if link then
-		is_vowel = rfind(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
-	else
-		is_vowel = rfind(text, "^[AEIOUaeiou]")
 	end
-	return (is_vowel and (uppercase and "An " or "an ") or (uppercase and "A " or "a ")) .. text
 end
 return export

Retrieved from "https://duo.linguifex.com/wiki/Module:string_utilities"

Languages

This page is not available in other languages.

Linguifex

Privacy policy
About Linguifex
Disclaimers
Desktop