Module:string utilities: Difference between revisions

No edit summary
m 1 revision imported
 
(3 intermediate revisions by 2 users not shown)
Line 1: Line 1:
local export = {}
local function_module = "Module:fun"
local load_module = "Module:load"
local memoize_module = "Module:memoize"
local string_char_module = "Module:string/char"
local string_charset_escape_module = "Module:string/charsetEscape"
local mw = mw
local mw = mw
local string = string
local string = string
Line 11: Line 19:
local gmatch = string.gmatch
local gmatch = string.gmatch
local gsub = string.gsub
local gsub = string.gsub
local insert = table.insert
local len = string.len
local len = string.len
local load_data = mw.loadData
local lower = string.lower
local lower = string.lower
local match = string.match
local match = string.match
local next = next
local next = next
local require = require
local reverse = string.reverse
local reverse = string.reverse
local select = select
local select = select
Line 30: Line 39:
local ulower = ustring.lower
local ulower = ustring.lower
local umatch = ustring.match
local umatch = ustring.match
local unpack = unpack
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
local upper = string.upper
local upper = string.upper
local usub = ustring.sub
local usub = ustring.sub
local uupper = ustring.upper
local uupper = ustring.upper
local memoize = require(memoize_module)
-- Defined below.
-- Defined below.
local charset_escape
local codepoint
local codepoint
local explode_utf8
local explode_utf8
local format_fun
local format_fun
local get_indefinite_article
local get_charset
local gsplit
local pattern_escape
local pattern_escape
local pattern_simplifier
local pattern_simplifier
local php_trim
local replacement_escape
local replacement_escape
local u
local title_case
local trim
local ucfirst
local ulen
local ulen


local module_name = "string_utilities"
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures
modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no
overhead after the first call, since the target functions are called directly in any subsequent calls.
]==]
-- Lazy loader: on first call, loads [[Module:string/charsetEscape]] (which
-- returns a function) and overwrites this stub with it, so subsequent calls
-- go straight to the target function with no loading overhead.
local function charset_escape(...)
charset_escape = require(string_charset_escape_module)
return charset_escape(...)
end
 
-- Lazy loader: on first call, fetches is_callable from [[Module:fun]] and
-- overwrites this stub with it, so subsequent calls have no overhead.
local function is_callable(...)
is_callable = require(function_module).is_callable
return is_callable(...)
end
 
-- Lazy loader: on first call, fetches load_data from [[Module:load]] and
-- overwrites this stub with it.
local function load_data(...)
load_data = require(load_module).load_data
return load_data(...)
end
 
-- Lazy loader for u(): on first call, loads [[Module:string/char]] (which
-- returns the codepoint-to-character function) and overwrites this stub.
local function u(...)
u = require(string_char_module)
return u(...)
end
 
-- Chooses the library (string vs. ustring) for iterating `str` against
-- `pattern`. Returns the (possibly simplified) pattern, the relevant length
-- of `str` (bytes for string, characters for ustring), the library table,
-- and whether `pattern` is a callable matcher rather than a pattern string.
local function prepare_iter(str, pattern, str_lib, plain)
	local callable = is_callable(pattern)
	if not (str_lib or plain) then
		if not callable then
			-- Try to convert the ustring pattern into an equivalent string
			-- library pattern, which is much faster when possible.
			local simplified = pattern_simplifier(pattern)
			if simplified then
				return simplified, #str, string, false
			end
		end
		return pattern, ulen(str), ustring, callable
	end
	return pattern, #str, string, callable
end
 
--[==[
Returns {nil} if the input value is the empty string, or otherwise the same value.


local export = {}
If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is
the empty string, returns {nil}.
 
If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input
string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation
marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also
be embedded this way (e.g. {"''foo''"} returns {"'foo'"}).
]==]
-- Returns nil if the input is the empty string; otherwise returns the value.
-- Strings are trimmed first when `do_trim` is set (and nil is returned if the
-- trimmed result is empty). When `quote_delimiters` is set, one outer pair of
-- matching quotation marks ('...' or "...") is stripped, allowing callers to
-- embed spaces, the empty string or literal quotes in an input.
function export.is_not_empty(str, do_trim, quote_delimiters)
	if str == "" then
		return nil
	end
	if type(str) ~= "string" then
		-- Non-string values (including nil and false) pass through untouched.
		return str
	end
	if do_trim then
		str = trim(str)
		if str == "" then
			return nil
		end
	end
	if not quote_delimiters then
		return str
	end
	-- Strip one matching pair of outer quotes, if present; the extra
	-- parentheses discard gsub's substitution count.
	return (gsub(str, "^(['\"])(.*)%1$", "%2"))
end


--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
--[==[
Explodes a string into an array of UTF-8 characters. '''Warning''': this function assumes that the input is valid UTF-8
in order to optimize speed and memory use. Passing in an input containing non-UTF-8 byte sequences could result in
unexpected behaviour.
]==]
function export.explode_utf8(str)
function export.explode_utf8(str)
local text, i = {}, 0
local text, i = {}, 0
Line 62: Line 138:
explode_utf8 = export.explode_utf8
explode_utf8 = export.explode_utf8


--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
--[==[
function export.pattern_escape(str)
Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true:
return (gsub(str, "[$%%()*+%-.?[%]^]", "%%%0"))
* It has the expected number of bytes, which is determined by value of the leading byte: 1-byte characters are `0x00` to
  `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte
  characters start with `0xF0` to `0xF4`.
* The leading byte must not fall outside of the above ranges.
* The trailing byte(s) (if any), must be between `0x80` to `0xBF`.
* The character's codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`).
* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to
  U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings
  that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte
  character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000).
  Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but
  `0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`.
 
If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in
UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of
surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher
codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances
where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly
4 hexadecimal digits; under the scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates,
even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`).
]==]
-- Validates that `str` is well-formed UTF-8. gmatch only yields runs that
-- start with a non-ASCII byte (0x80-0xFF) followed by any trailing bytes
-- (0x80-0xBF), so pure-ASCII spans are skipped and are trivially valid; each
-- yielded run is checked for correct length, valid lead/trail byte ranges,
-- overlong encodings, surrogates and the U+10FFFF ceiling.
function export.isutf8(str, allow_surrogates)
for ch in gmatch(str, "[\128-\255][\128-\191]*") do
-- UTF-8 characters are at most 4 bytes long.
if #ch > 4 then
return false
end
local b1, b2, b3, b4 = byte(ch, 1, 4)
if not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
return false -- 1-byte is always invalid, as gmatch excludes 0x00 to 0x7F
elseif not b3 then -- 2-byte
if not (b1 >= 0xC2 and b1 <= 0xDF) then -- b1 == 0xC0 or b1 == 0xC1 is overlong
return false
end
elseif not (b3 >= 0x80 and b3 <= 0xBF) then -- trailing byte
return false
elseif not b4 then -- 3-byte
if b1 > 0xEF then
return false
elseif b2 < 0xA0 then
if b1 < 0xE1 then -- b1 == 0xE0 and b2 < 0xA0 is overlong
return false
end
elseif b1 < 0xE0 or (b1 == 0xED and not allow_surrogates) then -- b1 == 0xED and b2 >= 0xA0 is a surrogate
return false
end
elseif not (b4 >= 0x80 and b4 <= 0xBF) then -- 4-byte
return false
elseif b2 < 0x90 then
if not (b1 >= 0xF1 and b1 <= 0xF4) then -- b1 == 0xF0 and b2 < 0x90 is overlong
return false
end
elseif not (b1 >= 0xF0 and b1 <= 0xF3) then -- b1 == 0xF4 and b2 >= 0x90 is too high
return false
end
end
return true
end
end
pattern_escape = export.pattern_escape


--[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>.]==]
do
function export.charset_escape(str)
local charset_chars = {
return (gsub(str, "[%%%-%]^]", "%%%0"))
["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^"
end
}
charset_escape = export.charset_escape
charset_chars.__index = charset_chars
local chars = setmetatable({
["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+",
["."] = "%.", ["?"] = "%?", ["["] = "%["
}, charset_chars)
 
--[==[
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's
version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example,
{"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving
arbitrary text (e.g. from user input).
]==]
-- Escapes the pattern magic characters $%()*+-.?[]^ and converts NUL to %z,
-- using the `chars` lookup table as the gsub replacement; the substitution
-- count returned by gsub is deliberately discarded.
function export.pattern_escape(str)
	local escaped = gsub(str, "[%z$%%()*+%-.?[%]^]", chars)
	return escaped
end
pattern_escape = export.pattern_escape
 
--[==[
Escapes only {%}, which is the only magic character used in replacement
[[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.
]==]
-- Doubles every "%", the only magic character in string.gsub and
-- mw.ustring.gsub replacement strings; gsub's count result is discarded.
function export.replacement_escape(str)
	local doubled = gsub(str, "%%", "%%%%")
	return doubled
end
replacement_escape = export.replacement_escape
 
-- Converts a single character into a charset that matches both its uppercase
-- and lowercase forms (e.g. "a" becomes "[Aa]"). Characters with no case
-- distinction are returned as-is, escaped via `chars` when magic.
local function case_insensitive_char(ch)
local upper_ch = uupper(ch)
if upper_ch == ch then
ch = ulower(ch)
if ch == upper_ch then
-- Caseless character: return it directly (escaped if necessary).
return chars[ch] or ch
end
end
-- Both forms go into the charset, each escaped via `charset_chars` when
-- magic inside a character set.
return "[" .. (charset_chars[upper_ch] or upper_ch) .. (charset_chars[ch] or ch) .. "]"
end
 
local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2)
if not (loc1 and start <= str_len) then
-- Add final chunk and return.
n = n + 1
text[n] = _gsub(_sub(str, start), ".", chars)
return
elseif loc2 < loc1 then
if _sub == sub then
local b = byte(str, loc1)
if b and b >= 128 then
loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
end
end
n = n + 1
text[n] = _gsub(_sub(str, start, loc1), ".", chars)
start = loc1 + 1
if start > str_len then
return
end
else
-- Add chunk up to the current match.
n = n + 1
text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars)
-- Add current match.
n = n + 1
text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char)
start = loc2 + 1
end
return n, start
end


--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
--[==[
function export.replacement_escape(str)
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes
return (gsub(str, "%%", "%%%%"))
all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second
argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns
any pattern matching facilities off in the optional pattern supplied.
]==]
-- Escapes the pattern magic characters in `str` and makes every cased
-- character case-insensitive. With no second argument, the whole string is
-- converted character-by-character; otherwise `pattern_or_func` (a pattern or
-- a find-like function) splits the string into matched/unmatched chunks that
-- are processed by iterate() above. `str_lib` forces the string library and
-- `plain` disables pattern-matching facilities in the supplied pattern.
function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
if pattern_or_func == nil then
-- No splitter: convert each character (bytes when using the string
-- library, full UTF-8 characters otherwise).
return (gsub(str, str_lib and "[^\128-\255]" or ".[\128-\191]*", case_insensitive_char))
end
local text, n, start, str_len, _string, callable = {}, 0, 1
pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
local _find, _gsub, _sub = _string.find, _string.gsub, _string.sub
if callable then
repeat
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, pattern_or_func(str, start))
until not start
-- Special case if the pattern is anchored to the start: "^" always
-- anchors to the start position, not the start of the string, so get
-- around this by only attempting one match with the pattern, then match
-- the end of the string.
elseif byte(pattern_or_func) == 0x5E then -- ^
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
if start ~= nil then
iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, "$", start, plain))
end
else
repeat
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
until not start
end
return concat(text)
end
end
end
replacement_escape = export.replacement_escape


do
do
local character_classes
-- Lazily builds the set of byte values for the letters that form "%"
-- character classes (acdlpsuwx in both cases, plus uppercase Z; lowercase z
-- is handled separately by callers). Stores the set in the upvalue
-- `character_classes` and clears itself, so it runs at most once.
local function get_character_classes()
	local classes = {
		[0x41] = true, [0x61] = true, -- Aa
		[0x43] = true, [0x63] = true, -- Cc
		[0x44] = true, [0x64] = true, -- Dd
		[0x4C] = true, [0x6C] = true, -- Ll
		[0x50] = true, [0x70] = true, -- Pp
		[0x53] = true, [0x73] = true, -- Ss
		[0x55] = true, [0x75] = true, -- Uu
		[0x57] = true, [0x77] = true, -- Ww
		[0x58] = true, [0x78] = true, -- Xx
		[0x5A] = true, -- Z; lowercase z dealt with separately.
	}
	character_classes, get_character_classes = classes, nil
	return classes
end
local function check_sets_equal(set1, set2)
local function check_sets_equal(set1, set2)
local k2
local k2
Line 129: Line 375:
local function parse_1_byte_charset(pattern, pos)
local function parse_1_byte_charset(pattern, pos)
local ch
while true do
while true do
local ch, nxt_pos
pos, ch = match(pattern, "()([%%%]\192-\255])", pos)
pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
if ch == "%" then
if not ch then
local nxt = byte(pattern, pos + 1)
return false
if not nxt or nxt >= 128 or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWXZ, but not z
elseif ch == "%" then
if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then
return false
return false
end
end
pos = pos + 2
pos = pos + 2
elseif ch == "]" then
elseif ch == "]" then
pos = nxt_pos
pos = pos + 1
return pos
return pos
else
else
return false
return false
end
end
end
end
end
end
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
--[==[
pattern_simplifier = require("Module:fun").memoize(function(pattern)
Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion
isn't possible, returns false.
]==]
function pattern_simplifier(pattern)
if type(pattern) == "number" then
if type(pattern) == "number" then
return tostring(pattern)
return tostring(pattern)
end
end
local pos, captures, start, n, output = 1, 0, 1, 0
local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0
while true do
while true do
local ch, nxt_pos
-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails.
pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos)
if not ch then
if not ch then
break
break
end
end
local nxt = sub(pattern, nxt_pos, nxt_pos)
local nxt = byte(pattern, nxt_pos)
if ch == "%" then
if ch == "%" then
if nxt == "b" then
if nxt == 0x62 then -- b
if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3)
if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then
return false
return false
end
end
pos = pos + 4
pos = pos + 4
elseif nxt == "f" then
elseif nxt == 0x66 then -- f
pos = pos + 2
nxt_pos = nxt_pos + 2
if not match(pattern, "^()%[[^^]", pos) then
local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos)
-- Only possible to convert a positive %f charset which is
-- all ASCII, so use parse_1_byte_charset.
if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^
return false
return false
elseif nxt3 == 0x5D then -- Initial ] is non-magic.
nxt_pos = nxt_pos + 1
end
end
-- Only possible to convert a %f charset which is all
pos = parse_1_byte_charset(pattern, nxt_pos)
-- ASCII, so use parse_1_byte_charset.
pos = parse_1_byte_charset(pattern, pos)
if not pos then
if not pos then
return false
return false
end
end
elseif nxt == "Z" then
elseif nxt == 0x5A then -- Z
pos = pos + 2
nxt = byte(pattern, nxt_pos + 1)
nxt = sub(pattern, pos, pos)
if nxt == 0x2A or nxt == 0x2D then -- *-
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 3
pos = pos + 1
else
else
output = output or {}
if output == nil then
output = {}
end
local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]"
n = n + 1
n = n + 1
if nxt == "?" then
if nxt == 0x2B then -- +
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
output[n] = ins .. "%Z*"
pos = pos + 1
pos = pos + 3
elseif nxt == 0x3F then -- ?
output[n] = ins .. "?[\128-\191]*"
pos = pos + 3
else
else
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
output[n] = ins .. "[\128-\191]*"
pos = pos + 2
end
end
start = pos
start = pos
end
end
elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz
return false
return false
-- Skip the next character if it's ASCII. Otherwise, we will
-- Skip the next character if it's ASCII. Otherwise, we will
-- still need to do length checks.
-- still need to do length checks.
else
else
pos = pos + (byte(nxt) < 128 and 2 or 1)
pos = pos + (nxt < 128 and 2 or 1)
end
end
elseif ch == "(" then
elseif ch == "(" then
if nxt == ")" or captures == 32 then
if nxt == 0x29 or capture_groups == 32 then -- )
return false
return false
end
end
captures = captures + 1
capture_groups = capture_groups + 1
pos = pos + 1
pos = pos + 1
elseif ch == "." then
elseif ch == "." then
if nxt == "*" or nxt == "+" or nxt == "-" then
if nxt == 0x2A or nxt == 0x2D then -- *-
pos = pos + 2
pos = pos + 2
else
else
output = output or {}
if output == nil then
output = {}
end
local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]"
n = n + 1
n = n + 1
if nxt == "?" then
if nxt == 0x2B then -- +
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
output[n] = ins .. ".*"
pos = pos + 2
elseif nxt == 0x3F then -- ?
output[n] = ins .. "?[\128-\191]*"
pos = pos + 2
pos = pos + 2
else
else
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
output[n] = ins .. "[\128-\191]*"
pos = pos + 1
pos = pos + 1
end
end
Line 224: Line 488:
elseif ch == "[" then
elseif ch == "[" then
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
if nxt == "^" then
if nxt == 0x5E then -- ^
return false
return false
-- If the first character is "%", ch_len is determined by the
-- If the first character is "%", ch_len is determined by the
-- next one instead.
-- next one instead.
elseif nxt == "%" then
elseif nxt == 0x25 then -- %
nxt = byte(pattern, nxt_pos + 1)
elseif nxt == 0x5D then -- Initial ] is non-magic.
nxt_pos = nxt_pos + 1
nxt_pos = nxt_pos + 1
nxt = sub(pattern, nxt_pos, nxt_pos)
end
end
local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
if not nxt then
return false
end
local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4
if ch_len == 1 then -- Single-byte charset.
if ch_len == 1 then -- Single-byte charset.
pos = parse_1_byte_charset(pattern, pos + 1)
pos = parse_1_byte_charset(pattern, nxt_pos)
if not pos then
if not pos then
return false
return false
end
end
else -- Multibyte charset.
else -- Multibyte charset.
-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST.
local charset_pos, bytes = pos
local charset_pos, bytes = pos
pos = pos + 1
pos = pos + 1
while true do -- TODO: non-ASCII charset ranges.
while true do -- TODO: non-ASCII charset ranges.
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos)
if not ch then
return false
-- If escaped, get the next character. No need to
-- If escaped, get the next character. No need to
-- distinguish magic characters or character classes,
-- distinguish magic characters or character classes,
-- as they'll all fail for having the wrong length
-- as they'll all fail for having the wrong length
-- anyway.
-- anyway.
elseif ch == "%" then
if ch == "%" then
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos)
elseif ch == "]" then
elseif ch == "]" then
pos = nxt_pos
pos = nxt_pos
break
break
end
end
if ch_len ~= #ch then
if not (ch and nxt_pos - pos == ch_len) then
return false
return false
elseif bytes == nil then
bytes = {}
end
end
bytes = bytes or {}
local bytes, last = bytes, nxt_pos - 1
local bytes = bytes
for i = pos, last - 1 do
for i = 1, ch_len - 1 do
local b = byte(pattern, i)
local b = byte(ch, i, i)
local bytes_b = bytes[b]
bytes[b] = bytes[b] or {}
if bytes_b == nil then
bytes = bytes[b]
bytes_b = {}
bytes[b] = bytes_b
end
bytes[b], bytes = bytes_b, bytes_b
end
end
bytes[byte(ch, -1)] = true
bytes[byte(pattern, last)] = true
pos = nxt_pos
pos = nxt_pos
end
end
Line 271: Line 543:
return false
return false
end
end
local nxt = sub(pattern, pos, pos)
nxt = byte(pattern, pos)
if (
if (
(nxt == "?" or nxt == "*" or nxt == "-") or
(nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-?
(nxt == "+" and ch_len > 2) or
(nxt == 0x2B and ch_len > 2) or -- +
not check_sets(bytes)
not check_sets(bytes)
) then
) then
Line 292: Line 564:
bytes = next_byte
bytes = next_byte
until next_byte == true
until next_byte == true
if nxt == "+" then
if nxt == 0x2B then -- +
local range1, range2 = ranges[1], ranges[2]
local range1, range2 = ranges[1], ranges[2]
ranges[1] = make_charset(range1)
ranges[1], ranges[3] = make_charset(range1), make_charset(range2)
ranges[3] = make_charset(range2)
local n = #range2
local n = #range2
for i = 1, #range1 do
for i = 1, #range1 do
Line 308: Line 579:
end
end
end
end
output = output or {}
if output == nil then
output = {}
end
nxt = byte(pattern, pos)
n = n + 1
n = n + 1
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) ..
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
start = pos
start = pos
end
end
elseif nxt == "+" then
elseif not nxt then
if #ch ~= 2 then
break
elseif nxt == 0x2B then -- +
if nxt_pos - pos ~= 2 then
return false
return false
elseif output == nil then
output = {}
end
end
output = output or {}
pos, nxt_pos = pos + 1, nxt_pos + 1
nxt = byte(pattern, nxt_pos)
local ch2 = sub(pattern, pos, pos)
n = n + 1
n = n + 1
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 ..
pos = nxt_pos + 1
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
start = pos
pos, start = nxt_pos, nxt_pos
elseif nxt == "?" or nxt == "*" or nxt == "-" then
elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-?
return false
return false
else
else
Line 332: Line 613:
end
end
return concat(output) .. sub(pattern, start)
return concat(output) .. sub(pattern, start)
end, true)
end
export.pattern_simplifier = pattern_simplifier -- For testing.
pattern_simplifier = memoize(pattern_simplifier, true)
export.pattern_simplifier = pattern_simplifier
end
 
--[==[
Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring
library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}).
 
The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used
(e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary
characters.
]==]
-- Normalizes `charset` (the interior of a character set) into a full string
-- or ustring pattern charset, sanitizing "[" / "]" / bare hyphens so they
-- behave as ordinary characters while leaving "^", "-" ranges and "%"
-- escapes functional. `output` stays nil until a fix-up is actually needed,
-- so the common case returns "[" .. charset .. "]" with no table allocation.
function get_charset(charset)
if type(charset) == "number" then
return tostring(charset)
end
local pos, start, n, output = 1, 1, 0
-- Skip over a leading "^" so it keeps its negation meaning.
if byte(charset) == 0x5E then -- ^
pos = pos + 1
end
-- FIXME: "]" is non-magic if it's the first character in a charset.
local nxt_pos, nxt
while true do
-- Find the next character that needs attention: "%", "-" or "]".
local new_pos, ch = match(charset, "()([%%%-%]])", pos)
if not ch then
break
-- Skip percent escapes. Ranges can't start with them, either.
elseif ch == "%" then
pos = new_pos + 2
else
-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`.
if ch == "-" and new_pos > pos then
pos, nxt_pos, nxt = new_pos - 1, new_pos, ch
ch = sub(charset, pos, pos)
else
pos, nxt_pos = new_pos, new_pos + 1
nxt = sub(charset, nxt_pos, nxt_pos)
end
-- Range.
if nxt == "-" then
if output == nil then
output = {}
end
n = n + 1
output[n] = sub(charset, start, pos - 1)
nxt_pos = nxt_pos + 1
nxt = sub(charset, nxt_pos, nxt_pos)
-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour.
if nxt == "" or nxt == "%" then
n = n + 1
output[n] = (ch == "]" and "%]" or ch) .. "%-"
start = nxt_pos
nxt_pos = nxt_pos + 2
-- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be
-- adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is
-- omitted if the range would be empty (i.e. if the first byte is greater than the second).
else
n = n + 1
output[n] = (ch == "]" and (byte(nxt) >= 0x5D and "%]^" or "^") or ch) .. "-" ..
(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt)
nxt_pos = nxt_pos + 1
start = nxt_pos
end
elseif ch == "-" or ch == "]" then
-- Bare hyphen or bracket: escape it with "%".
if output == nil then
output = {}
end
n = n + 1
output[n] = sub(charset, start, pos - 1) .. "%" .. ch
start = nxt_pos
end
pos = nxt_pos
end
end
-- Nothing needed escaping: wrap the input unchanged.
if start == 1 then
return "[" .. charset .. "]"
end
return "[" .. concat(output) .. sub(charset, start) .. "]"
end
end
get_charset = memoize(get_charset, true)
export.get_charset = get_charset


function export.len(str)
function export.len(str)
Line 423: Line 783:
end
end


--[==[Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.]==]
--[==[
Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.
]==]
function export.plain_gsub(str, pattern, repl, n)
function export.plain_gsub(str, pattern, repl, n)
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n)
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n)
end
end


--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
--[==[
Reverses a UTF-8 string; equivalent to string.reverse.
]==]
function export.reverse(str)
function export.reverse(str)
return reverse(gsub(str, "[\194-\244][\128-\191]*", reverse))
return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse)))
end
 
function export.char(...) -- To be moved to [[Module:string/char]].
return u(...)
end
end


do
do
local function err(cp)
local function utf8_err(func_name)
error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2)
error(format("bad argument #1 to '%s' (string is not UTF-8)", func_name), 4)
end
end


local function utf8_char(cp)
local function get_codepoint(func_name, b1, b2, b3, b4)
cp = tonumber(cp)
if b1 <= 0x7F then
if cp < 0 then
err("-0x" .. format("%X", -cp + 1))
elseif cp < 0x80 then
return char(cp)
elseif cp < 0x800 then
return char(
0xC0 + cp / 0x40,
0x80 + cp % 0x40
)
elseif cp < 0x10000 then
if cp >= 0xD800 and cp < 0xE000 then
return "?" -- mw.ustring.char returns "?" for surrogates.
end
return char(
0xE0 + cp / 0x1000,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
elseif cp < 0x110000 then
return char(
0xF0 + cp / 0x40000,
0x80 + cp / 0x1000 % 0x40,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
end
err("0x" .. format("%X", cp))
end
 
function export.char(cp, ...)
if ... == nil then
return utf8_char(cp)
end
local ret = {cp, ...}
for i = 1, select("#", cp, ...) do
ret[i] = utf8_char(ret[i])
end
return concat(ret)
end
u = export.char
end
 
do
local function get_codepoint(b1, b2, b3, b4)
if b1 < 128 then
return b1, 1
return b1, 1
elseif b1 < 224 then
elseif not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
return 0x40 * b1 + b2 - 0x3080, 2
utf8_err(func_name)
elseif b1 < 240 then
elseif b1 <= 0xDF then
return 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080, 3
local cp = 0x40 * b1 + b2 - 0x3080
return cp >= 0x80 and cp or utf8_err(func_name), 2
elseif not (b3 and b3 >= 0x80 and b3 <= 0xBF) then
utf8_err(func_name)
elseif b1 <= 0xEF then
local cp = 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080
return cp >= 0x800 and cp or utf8_err(func_name), 3
elseif not (b4 and b4 >= 0x80 and b4 <= 0xBF) then
utf8_err(func_name)
end
end
return 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080, 4
local cp = 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080
return cp >= 0x10000 and cp <= 0x10FFFF and cp or utf8_err(func_name), 4
end
end


function export.codepoint(str, i, j)
function export.codepoint(str, i, j)
if type(str) == "number" then
if str == "" then
return -- return nothing
elseif type(str) == "number" then
return byte(str, i, j)
return byte(str, i, j)
end
end
i, j = i or 1, j == -1 and #str or i or 1
i, j = i or 1, j == -1 and #str or i or 1
if i == 1 and j == 1 then
if i == 1 and j == 1 then
return (get_codepoint(byte(str, 1, 4)))
return (get_codepoint("codepoint", byte(str, 1, 4)))
elseif i < 0 or j < 0 then
elseif i < 0 or j < 0 then
return ucodepoint(str, i, j) -- FIXME
return ucodepoint(str, i, j) -- FIXME
Line 517: Line 851:
nr = nr + 1
nr = nr + 1
local add
local add
ret[nr], add = get_codepoint(b1, b2, b3, b4)
ret[nr], add = get_codepoint("codepoint", b1, b2, b3, b4)
nb = nb + add
nb = nb + add
end
end
Line 549: Line 883:
return nil
return nil
end
end
local ret, add = get_codepoint(b1, b2, b3, b4)
local ret, add = get_codepoint("gcodepoint", b1, b2, b3, b4)
nb = nb + add
nb = nb + add
return ret
return ret
Line 556: Line 890:
end
end


--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
do
function export.lower(str)
local _ulower = ulower
return (match(str, "^()[^\128-\255]*$") and lower or ulower)(str)
 
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.lower(str)
return (match(str, "^()[^\128-\255]*$") and lower or _ulower)(str)
end
end
end


--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
do
function export.upper(str)
local _uupper = uupper
return (match(str, "^()[^\128-\255]*$") and upper or uupper)(str)
 
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.upper(str)
return (match(str, "^()[^\128-\255]*$") and upper or _uupper)(str)
end
end
end


do
do
local function add_captures(text, n, ...)
local function add_captures(t, n, ...)
if ... == nil then
return
end
-- Insert any captures from the splitting pattern.
-- Insert any captures from the splitting pattern.
local offset, capture = n - 1, ...
local offset, capture = n - 1, ...
while capture do
while capture do
n = n + 1
n = n + 1
text[n] = capture
t[n] = capture
capture = select(n - offset, ...)
capture = select(n - offset, ...)
end
end
Line 578: Line 923:
end
end
local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
--[==[
if not (loc1 and start <= str_len) then
Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like
-- If no match, or there is but we're past the end of the string
Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by
-- (which happens when the match is the empty string), then add
one character at a time; Python returns the whole remainder of the string). When possible, it will use the string
-- the final chunk and return.
library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the
n = n + 1
string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.
text[n] = _sub(str, start)
return
elseif loc2 < loc1 then
-- Special case: If we match the empty string, then include the
-- next character; this avoids an infinite loop, and makes
-- splitting by an empty string work the way mw.text.split() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, return immediately, so we
-- don't get a final empty string. If using the string library, we
-- need to make sure we advance by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
n = n + 1
text[n] = _sub(str, start, loc1)
start = loc1 + 1
if start > str_len then
return ... and add_captures(text, n, ...) or n
end
else
-- Add chunk up to the current match.
n = n + 1
text[n] = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
return (... and add_captures(text, n, ...) or n), start
end
local function _split(str, pattern, str_len, _sub, _find, plain)
In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start
local text, n, start = {}, 0, 1
index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil
if there are no further matches. By default, the start index will be calculated using the ustring library, unless
`str_lib` or `plain` is set.
]==]
function export.split(str, pattern_or_func, str_lib, plain)
local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0
repeat
repeat
n, start = iterate(str, str_len, text, n, start, _sub, _find(str, pattern, start, plain))
n = add_captures(t, n, iter())
until not start
until n == nil
return t
return text
end
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.]==]
function export.split(str, pattern, str_lib, plain)
if str_lib or plain then
return _split(str, pattern, #str, sub, find, plain)
end
local simple = pattern_simplifier(pattern)
if simple then
return _split(str, simple, #str, sub, find)
end
return _split(str, pattern, ulen(str), usub, ufind)
end
end
export.capturing_split = export.split -- To be removed.
export.capturing_split = export.split -- To be removed.
end
end


do
--[==[
-- TODO: merge this with export.split. Not clear how to do this while
Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the
-- maintaining the same level of performance, as gsplit is slower.
string up the splitting pattern, with any capture groups being returned as additional values on that iteration.
local function _split(str, pattern, str_len, _sub, _find, plain)
]==]
local start, final = 1
function export.gsplit(str, pattern_or_func, str_lib, plain)
local start, final, str_len, _string, callable = 1
local function iter(loc1, loc2, ...)
pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
-- If no match, return the final chunk.
local _find, _sub = _string.find, _string.sub
if not loc1 then
local function iter(loc1, loc2, ...)
-- If no match, or there is but we're past the end of the string
-- (which happens when the match is the empty string), then return
-- the final chunk.
if not loc1 then
final = true
return _sub(str, start)
end
-- Special case: If we match the empty string, then eat the
-- next character; this avoids an infinite loop, and makes
-- splitting by the empty string work the way mw.text.gsplit() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, set `final` to true, so we
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
local b = byte(str, loc1)
if b and b >= 128 then
loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
end
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
final = true
return _sub(str, start)
end
-- Special case: If we match the empty string, then eat the
-- next character; this avoids an infinite loop, and makes
-- splitting by the empty string work the way mw.text.gsplit() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, set `final` to true, so we
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
else
start = loc1 + 1
end
-- Eat chunk up to the current match.
else
else
chunk = _sub(str, start, loc1 - 1)
start = loc1 + 1
start = loc2 + 1
end
end
return chunk, ...
-- Eat chunk up to the current match.
else
chunk = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
end
return chunk, ...
end
if callable then
return function()
return function()
if not final then
if not final then
return iter(_find(str, pattern, start, plain))
return iter(pattern_or_func(str, start))
end
end
-- Special case if the pattern is anchored to the start: "^" always
-- anchors to the start position, not the start of the string, so get
-- around this by only attempting one match with the pattern, then match
-- the end of the string.
elseif byte(pattern_or_func) == 0x5E then -- ^
local returned
return function()
if not returned then
returned = true
return iter(_find(str, pattern_or_func, start, plain))
elseif not final then
return iter(_find(str, "$", start, plain))
end
end
return nil
end
end
end
end
return function()
function export.gsplit(str, pattern, str_lib, plain)
if not final then
if str_lib or plain then
return iter(_find(str, pattern_or_func, start, plain))
return _split(str, pattern, #str, sub, find, plain)
end
end
local simple = pattern_simplifier(pattern)
if simple then
return _split(str, simple, #str, sub, find)
end
return _split(str, pattern, ulen(str), usub, ufind)
end
end
end
gsplit = export.gsplit
function export.count(str, pattern, plain)
if plain then
return select(2, gsub(str, pattern_escape(pattern), ""))
end
local simple = pattern_simplifier(pattern)
if simple then
return select(2, gsub(str, pattern, ""))
end
return select(2, ugsub(str, pattern, ""))
end
end


function export.trim(str, charset)
function export.trim(str, charset, str_lib, plain)
if not charset then
if charset == nil then
return match(str, "^()%s*$") and "" or match(str, "^%s*(.*%S)")
-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are
elseif match(charset, "^()[^\128-\255]*$") then
-- very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to ""
return match(str, "^()[" .. charset .. "]*$") and "" or match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
-- first.
return match(gsub(str, "^%s*", ""), "^.*%S") or ""
elseif charset == "" then
return str
end
end
return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets
-- are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there
-- would be two callbacks into PHP, which is slower.
local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
if not str_lib then
local simple = pattern_simplifier(pattern)
if not simple then
return umatch(str, pattern)
end
pattern = simple
end
return match(str, pattern)
end
end
trim = export.trim


do
do
local entities
local entities
 
local function get_entities()
local function decode_numeric_entity(code, pattern, base)
entities, get_entities = load_data("Module:data/entities"), nil
local cp = match(code, pattern) and tonumber(code, base)
return entities
return cp and cp < 0x110000 and u(cp) or nil
end
end


local function decode_entity(hash, x, code)
local function decode_entity(hash, x, code)
if hash == "#" then
if hash == "" then
return x == "" and decode_numeric_entity(code, "^%d+$") or
return (entities or get_entities())[x .. code]
decode_numeric_entity(code, "^%x+$", 16)
end
end
entities = entities or load_data("Module:data/entities")
local cp
return entities[x .. code]
if x == "" then
cp = match(code, "^()%d+$") and tonumber(code)
else
cp = match(code, "^()%x+$") and tonumber(code, 16)
end
return cp and (cp <= 0xD7FF or cp >= 0xE000 and cp <= 0x10FFFF) and u(cp) or nil
end
end


-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases
-- which have also been included in [[Module:data/entities]].
function export.decode_entities(str)
function export.decode_entities(str)
return find(str, "&", 1, true) and
local amp = find(str, "&", nil, true)
gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
return amp and find(str, ";", amp, true) and gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
end
end
end
end


do
do
local html_entities
local entities
local function get_entities()
local function encode_entity(ch)
local entity = html_entities[ch]
if entity then
return entity
end
entity = "&#" .. codepoint(ch) .. ";"
html_entities[ch] = entity
return entity
end
function export.encode_entities(str, charset, str_lib, plain)
-- Memoized HTML entities (taken from mw.text.lua).
-- Memoized HTML entities (taken from mw.text.lua).
html_entities = html_entities or {
entities, get_entities = {
["\""] = "&quot;",
["\""] = "&quot;",
["&"] = "&amp;",
["&"] = "&amp;",
Line 750: Line 1,096:
[">"] = "&gt;",
[">"] = "&gt;",
["\194\160"] = "&nbsp;",
["\194\160"] = "&nbsp;",
}
}, nil
if not charset then
return entities
return (gsub(str, "[\"&'<>\194]\160?", html_entities))
end
elseif plain then
 
return (gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity))
local function encode_entity(ch)
elseif str_lib then
local entity = (entities or get_entities())[ch]
if not match(charset, "^()[^\128-\255]*$") then
if entity == nil then
error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
local cp = codepoint(ch)
-- U+D800 to U+DFFF are surrogates, so can't be encoded as entities.
entity = cp and (cp <= 0xD7FF or cp >= 0xE000) and format("&#%d;", cp) or false
entities[ch] = entity
end
return entity or nil
end
 
function export.encode_entities(str, charset, str_lib, plain)
if charset == nil then
return (gsub(str, "[\"&'<>\194]\160?", entities or get_entities()))
elseif charset == "" then
return str
end
local pattern = plain and ("[" .. charset_escape(charset) .. "]") or charset == "." and charset or get_charset(charset)
if not str_lib then
local simple = pattern_simplifier(pattern)
if not simple then
return (ugsub(str, pattern, encode_entity))
end
end
return (gsub(str, "[" .. charset .. "]", encode_entity))
pattern = simple
end
local pattern = charset and "[" .. charset .. "]"
local simple = pattern_simplifier(pattern)
if simple then
return (gsub(str, simple, encode_entity))
end
end
return (ugsub(str, pattern, encode_entity))
return (gsub(str, pattern, encode_entity))
end
end
end
end
Line 787: Line 1,146:
enctype = enctype and upper(enctype) or "QUERY"
enctype = enctype and upper(enctype) or "QUERY"
if enctype == "PATH" then
if enctype == "PATH" then
return find(str, "%", 1, true) and
return find(str, "%", nil, true) and gsub(str, "%%(%x%x)", decode_path) or str
gsub(str, "%%(%x%x)", decode_path) or str
elseif enctype == "QUERY" then
elseif enctype == "QUERY" then
return (find(str, "%", 1, true) or find(str, "+", 1, true)) and
return (find(str, "%", nil, true) or find(str, "+", nil, true)) and gsub(str, "([%%%+])(%x?%x?)", decode) or str
gsub(str, "([%%%+])(%x?%x?)", decode) or str
elseif enctype == "WIKI" then
elseif enctype == "WIKI" then
return (find(str, "%", 1, true) or find(str, "_", 1, true)) and
return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str
gsub(str, "([%%_])(%x?%x?)", decode) or str
end
end
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
error("bad argument #2 to 'decode_uri' (expected QUERY, PATH, or WIKI)", 2)
end
end
end
end
Line 802: Line 1,158:
do
do
local function _remove_comments(str, pre)
local function _remove_comments(str, pre)
local head = find(str, "<!--", 1, true)
local head = find(str, "<!--", nil, true)
if not head then
if not head then
return str
return str
Line 824: Line 1,180:
end
end
--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
--[==[
* {{lua|"PRE"}} (default) applies the method used by MediaWiki's preprocessor: all {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
Removes any HTML comments from the input text. `stage` can be one of three options:
* {{lua|"POST"}} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code||<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser.
* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all
* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.]==]
  {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed
  {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or
  [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the
  preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags);
  if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops
  over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g.
  {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed
  {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs,
  where the {"PRE"} method will have already been applied by the native parser.
* {"BOTH"} applies {"PRE"} then {"POST"}.
]==]
function export.remove_comments(str, stage)
function export.remove_comments(str, stage)
if not stage or stage == "PRE" then
if not stage or stage == "PRE" then
Line 834: Line 1,201:
local processed = stage == "POST" and _remove_comments(str) or
local processed = stage == "POST" and _remove_comments(str) or
stage == "BOTH" and _remove_comments(str, true) or
stage == "BOTH" and _remove_comments(str, true) or
error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2)
error("bad argument #2 to 'remove_comments' (expected PRE, POST, or BOTH)", 2)
while processed ~= str do
while processed ~= str do
str = processed
str = processed
Line 841: Line 1,208:
return str
return str
end
end
end
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
function export.php_trim(str)
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
end
php_trim = export.php_trim
--[==[Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} is normalized to {{code|lua|1}} (a number), and {{code|lua|" foo "}} is normalized to {{code|lua|"foo"}}. If the input is not a string, it is returned unchanged.
After being trimmed with {{code|lua|export.php_trim}}, strings are converted to numbers if:
# They are integers, with no decimals (2.0) or leading zeroes (02).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
function export.scribunto_param_key(key)
if type(key) ~= "string" then
return key
end
key = php_trim(key)
if match(key, "^-?[1-9]%d*$") then
local num = tonumber(key)
-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true.
return (
num <= 9007199254740991 and num >= -9007199254740991 or
key == "9007199254740992" or
key == "-9007199254740992"
) and num or key
elseif key == "0" then
return 0
end
return key
end
end


do
do
local byte_escapes
local byte_escapes
local function get_byte_escapes()
byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil
return byte_escapes
end
local function escape_byte(b)
local function escape_byte(b)
return byte_escapes[b] or format("\\%03d", byte(b))
return (byte_escapes or get_byte_escapes())[b] or format("\\%03d", byte(b))
end
end
function export.escape_bytes(str)
function export.escape_bytes(str)
byte_escapes = byte_escapes or load_data("Module:string utilities/data").byte_escapes
return (gsub(str, ".", escape_byte))
return (gsub(str, ".", escape_byte))
end
end
Line 892: Line 1,231:
return name == "op" and "{" or
return name == "op" and "{" or
name == "cl" and "}" or
name == "cl" and "}" or
error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'")
elseif fun(name) and type(fun(name)) ~= "string" then
elseif fun(name) and type(fun(name)) ~= "string" then
error(module_name .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
end
end
return fun(name) or error(module_name .. ".format: \"" .. name .. "\" not found in table")
return fun(name) or error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table")
end))
end))
end
end
format_fun = export.format_fun
format_fun = export.format_fun


--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
--[==[
This function, unlike {string.format} and {mw.ustring.format}, takes just two parameters, a format string and a table,
and replaces all instances of { {param_name} } in the format string with the table's entry for {param_name}. The opening
and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry beginning with a
slash can be escaped by doubling the initial slash.
 
====Examples====
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
* {string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"}) }
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
*: produces: {"one fish, two fish, red fish, blue fish"}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*: produces: {"The set {1, 2, 3} contains three elements."}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.
]==]
function export.format(str, tbl)
function export.format(str, tbl)
return format_fun(str, function(key)
return format_fun(str, function(key)
Line 916: Line 1,261:
do
do
local function do_uclcfirst(str, case_func)
local function do_uclcfirst(str, case_func)
-- Actual function to re-case of the first letter.
-- Re-case the first letter.
local first_letter = case_func(match(str, "^.[\128-\191]*") or "")
local first, remainder = match(str, "^(.[\128-\191]*)(.*)")
return first_letter .. sub(str, #first_letter + 1)
return first and (case_func(first) .. remainder) or ""
end
end
local function uclcfirst(str, case_func)
local function uclcfirst(str, case_func)
-- Strip off any HTML tags at the beginning. This currently does not handle comments or <ref>...</ref>
-- correctly; it's intended for text wrapped in <span> or the like, as happens when passing text through
-- [[Module:links]].
local html_at_beginning = nil
if str:match("^<") then
while true do
local html_tag, rest = str:match("^(<.->)(.*)$")
if not html_tag then
break
end
if not html_at_beginning then
html_at_beginning = {}
end
insert(html_at_beginning, html_tag)
str = rest
end
end
-- If there's a link at the beginning, re-case the first letter of the
-- If there's a link at the beginning, re-case the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
local retval
if link then
if link then
return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
retval = "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
else
retval = do_uclcfirst(str, case_func)
end
if html_at_beginning then
retval = concat(html_at_beginning) .. retval
end
end
return do_uclcfirst(str, case_func)
return retval
end
end
--[==[
Uppercase the first character of the input string, correctly handling one-part and two-part links, optionally
surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
uppercase the first character of text that may include links that have been passed through `full_link()` in
[[Module:links]] or a similar function.
]==]
function export.ucfirst(str)
function export.ucfirst(str)
return uclcfirst(str, uupper)
return uclcfirst(str, uupper)
end
end
ucfirst = export.ucfirst


--[==[
Lowercase the first character of the input string, correctly handling one-part and two-part links, optionally
surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
lowercase the first character of text that may include links that have been passed through `full_link()` in
[[Module:links]] or a similar function.
]==]
function export.lcfirst(str)
function export.lcfirst(str)
return uclcfirst(str, ulower)
return uclcfirst(str, ulower)
end
end
local function capitalize(w)
--[==[Capitalizes each word of the input string. WARNING: May be broken in the presence of multiword links.]==]
return uclcfirst(w, uupper)
end
--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==]
function export.capitalize(str)
function export.capitalize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
-- Capitalize multi-word that is separated by spaces
-- Capitalize multi-word that is separated by spaces
-- by uppercasing the first letter of each part.
-- by uppercasing the first letter of each part.
-- I assume nobody will input all CAP text.
return (ugsub(str, "%w+", ucfirst))
return (ugsub(str, "%S+", capitalize))
end
end
end


do
local function do_title_case(first, remainder)
local function word_ends_in_consonant_plus_y(str)
first = uupper(first)
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
return remainder == "" and first or (first .. ulower(remainder))
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
-- We should maybe consider applying this rule here; but it may not
-- be important as this function is almost always called on common nouns
-- (e.g. parts of speech, place types).
return find(str, "[^aeiouyAEIOUY ]y$")
end
end
 
local function word_takes_es_plural(str)
--[==[
return find(str, "[sxz]$") or find(str, "[csz]h$")
Capitalizes each word of the input string, with any further letters in each word being converted to lowercase.
]==]
function export.title_case(str)
return str == "" and "" or ugsub(str, "(%w)(%w*)", do_title_case)
end
end
title_case = export.title_case
local function do_pluralize(str)
 
if word_ends_in_consonant_plus_y(str) then
-- avoid returning multiple values
return (gsub(str, "y$", "ies"))
elseif word_takes_es_plural(str) then
return str .. "es"
end
return str .. "s"
end
--[==[
--[==[
Pluralize a word in a smart fashion, according to normal English rules.
Converts the input string to {{w|Camel case|CamelCase}}. Any non-word characters are treated as breaks between
# If word ends in consonant + -y, replace the -y with -ies.
words. If `lower_first` is set, then the first character of the string will be lowercase (e.g. camelCase).
# If the word ends in -s, -x, -z, -ch, -sh, -zh, add -es.
# Otherwise, add -s.
 
This handles links correctly:
# If a piped link, change the second part appropriately.
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
]==]
]==]
function export.pluralize(str)
function export.camel_case(str, lower_first)
if type(str) == "table" then
str = ugsub(str, "%W*(%w*)", title_case)
-- allow calling from a template
return lower_first and do_uclcfirst(str, ulower) or str
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_pluralize(str)
elseif linktext ~= "" then
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
elseif word_ends_in_consonant_plus_y(link) then
return beginning .. "[[" .. link .. "|" .. gsub(link, "y$", "ies") .. "]]"
end
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
end
end
end
end


do
do
local function do_singularize(str)
local function do_snake_case(nonword, word)
local sing = match(str, "^(.-)ies$")
return nonword == "" and word or "_" .. word
if sing then
return sing .. "y"
end
-- Handle cases like "[[parish]]es"
return match(str, "^(.-[cs]h%]*)es$") or -- not -zhes
-- Handle cases like "[[box]]es"
match(str, "^(.-x%]*)es$") or -- not -ses or -zes
-- Handle regular plurals
match(str, "^(.-)s$") or
-- Otherwise, return input
str
end
end
 
local function collapse_link(link, linktext)
if link == linktext then
return "[[" .. link .. "]]"
end
return "[[" .. link .. "|" .. linktext .. "]]"
end
--[==[
--[==[
Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.
Converts the input string to {{w|Snake case|snake_case}}. Any non-word characters are treated as breaks between
 
words.
'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
# If word ends in -ies, replace -ies with -y.
# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
# Otherwise, remove -s.
 
This handles links correctly:
# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
# If a non-piped link, singularize the link.
# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the
  'sh' etc. and final -es.
]==]
]==]
function export.singularize(str)
function export.snake_case(str)
if type(str) == "table" then
return (ugsub(str, "(%W*)(%w*)", do_snake_case))
-- allow calling from a template
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_singularize(str)
elseif linktext ~= "" then
return beginning .. collapse_link(link, do_singularize(linktext))
end
return beginning .. "[[" .. do_singularize(link) .. "]]"
end
end
end
--[==[
Return the appropriate indefinite article to prefix to `str`. Correctly handles links and capitalized text.
Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning with
a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.get_indefinite_article(str, ucfirst)
str = str or ""
local is_vowel = false
-- If there's a link at the beginning, examine the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
if link then
is_vowel = find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
else
is_vowel = find(str, "^[AEIOUaeiou]")
end
return is_vowel and (ucfirst and "An" or "an") or (ucfirst and "A" or "a")
end
get_indefinite_article = export.get_indefinite_article
--[==[
Prefix `text` with the appropriate indefinite article to prefix to `text`. Correctly handles links and capitalized
text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning
with a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.add_indefinite_article(text, ucfirst)
return get_indefinite_article(text, ucfirst) .. " " .. text
end
end


return export
return export