Module:string utilities: Difference between revisions

No edit summary
m 1 revision imported
 
(3 intermediate revisions by 2 users not shown)
Line 1: Line 1:
local export = {}
local function_module = "Module:fun"
local load_module = "Module:load"
local memoize_module = "Module:memoize"
local string_char_module = "Module:string/char"
local string_charset_escape_module = "Module:string/charsetEscape"
local mw = mw
local mw = mw
local string = string
local string = string
Line 11: Line 19:
local gmatch = string.gmatch
local gmatch = string.gmatch
local gsub = string.gsub
local gsub = string.gsub
local insert = table.insert
local len = string.len
local len = string.len
local load_data = mw.loadData
local lower = string.lower
local lower = string.lower
local match = string.match
local match = string.match
local next = next
local next = next
local require = require
local reverse = string.reverse
local reverse = string.reverse
local select = select
local select = select
Line 30: Line 39:
local ulower = ustring.lower
local ulower = ustring.lower
local umatch = ustring.match
local umatch = ustring.match
local unpack = unpack
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
local upper = string.upper
local upper = string.upper
local usub = ustring.sub
local usub = ustring.sub
local uupper = ustring.upper
local uupper = ustring.upper
local memoize = require(memoize_module)
-- Defined below.
-- Defined below.
local charset_escape
local codepoint
local codepoint
local explode_utf8
local explode_utf8
local format_fun
local format_fun
local get_indefinite_article
local get_charset
local gsplit
local pattern_escape
local pattern_escape
local pattern_simplifier
local pattern_simplifier
local php_trim
local replacement_escape
local replacement_escape
local u
local title_case
local trim
local ucfirst
local ulen
local ulen


local module_name = "string_utilities"
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures
modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no
overhead after the first call, since the target functions are called directly in any subsequent calls.
]==]
-- Lazy loader: on first call, loads [[Module:string/charsetEscape]] (which
-- returns a function) and overwrites this stub with it, so subsequent calls
-- go straight to the target function with no loading overhead.
local function charset_escape(...)
charset_escape = require(string_charset_escape_module)
return charset_escape(...)
end
 
-- Lazy loader: on first call, fetches is_callable from [[Module:fun]] and
-- overwrites this stub with it, so subsequent calls have no overhead.
local function is_callable(...)
is_callable = require(function_module).is_callable
return is_callable(...)
end
 
-- Lazy loader: on first call, fetches load_data from [[Module:load]] and
-- overwrites this stub with it.
local function load_data(...)
load_data = require(load_module).load_data
return load_data(...)
end
 
-- Lazy loader for u(): on first call, loads [[Module:string/char]] (which
-- returns the codepoint-to-character function) and overwrites this stub.
local function u(...)
u = require(string_char_module)
return u(...)
end
 
-- Chooses the library (string vs. ustring) for iterating `str` against
-- `pattern`. Returns the (possibly simplified) pattern, the relevant length
-- of `str` (bytes for string, characters for ustring), the library table,
-- and whether `pattern` is a callable matcher rather than a pattern string.
local function prepare_iter(str, pattern, str_lib, plain)
	local callable = is_callable(pattern)
	if not (str_lib or plain) then
		if not callable then
			-- Try to convert the ustring pattern into an equivalent string
			-- library pattern, which is much faster when possible.
			local simplified = pattern_simplifier(pattern)
			if simplified then
				return simplified, #str, string, false
			end
		end
		return pattern, ulen(str), ustring, callable
	end
	return pattern, #str, string, callable
end
 
--[==[
Returns {nil} if the input value is the empty string, or otherwise the same value.


local export = {}
If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is
the empty string, returns {nil}.
 
If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input
string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation
marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also
be embedded this way (e.g. {"''foo''"} returns {"'foo'"}).
]==]
-- Returns nil if the input is the empty string; otherwise returns the value.
-- Strings are trimmed first when `do_trim` is set (and nil is returned if the
-- trimmed result is empty). When `quote_delimiters` is set, one outer pair of
-- matching quotation marks ('...' or "...") is stripped, allowing callers to
-- embed spaces, the empty string or literal quotes in an input.
function export.is_not_empty(str, do_trim, quote_delimiters)
	if str == "" then
		return nil
	end
	if type(str) ~= "string" then
		-- Non-string values (including nil and false) pass through untouched.
		return str
	end
	if do_trim then
		str = trim(str)
		if str == "" then
			return nil
		end
	end
	if not quote_delimiters then
		return str
	end
	-- Strip one matching pair of outer quotes, if present; the extra
	-- parentheses discard gsub's substitution count.
	return (gsub(str, "^(['\"])(.*)%1$", "%2"))
end


--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
--[==[
Explodes a string into an array of UTF-8 characters. '''Warning''': this function assumes that the input is valid UTF-8
in order to optimize speed and memory use. Passing in an input containing non-UTF-8 byte sequences could result in
unexpected behaviour.
]==]
function export.explode_utf8(str)
function export.explode_utf8(str)
local text, i = {}, 0
local text, i = {}, 0
Line 62: Line 138:
explode_utf8 = export.explode_utf8
explode_utf8 = export.explode_utf8


--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
--[==[
function export.pattern_escape(str)
Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true:
return (gsub(str, "[$%%()*+%-.?[%]^]", "%%%0"))
* It has the expected number of bytes, which is determined by value of the leading byte: 1-byte characters are `0x00` to
  `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte
  characters start with `0xF0` to `0xF4`.
* The leading byte must not fall outside of the above ranges.
* The trailing byte(s) (if any), must be between `0x80` to `0xBF`.
* The character's codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`).
* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to
  U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings
  that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte
  character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000).
  Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but
  `0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`.
 
If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in
UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of
surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher
codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances
where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly
4 hexadecimal digits; under the scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates,
even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`).
]==]
-- Validates that `str` is well-formed UTF-8. gmatch only yields runs that
-- start with a non-ASCII byte (0x80-0xFF) followed by any trailing bytes
-- (0x80-0xBF), so pure-ASCII spans are skipped and are trivially valid; each
-- yielded run is checked for correct length, valid lead/trail byte ranges,
-- overlong encodings, surrogates and the U+10FFFF ceiling.
function export.isutf8(str, allow_surrogates)
for ch in gmatch(str, "[\128-\255][\128-\191]*") do
-- UTF-8 characters are at most 4 bytes long.
if #ch > 4 then
return false
end
local b1, b2, b3, b4 = byte(ch, 1, 4)
if not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
return false -- 1-byte is always invalid, as gmatch excludes 0x00 to 0x7F
elseif not b3 then -- 2-byte
if not (b1 >= 0xC2 and b1 <= 0xDF) then -- b1 == 0xC0 or b1 == 0xC1 is overlong
return false
end
elseif not (b3 >= 0x80 and b3 <= 0xBF) then -- trailing byte
return false
elseif not b4 then -- 3-byte
if b1 > 0xEF then
return false
elseif b2 < 0xA0 then
if b1 < 0xE1 then -- b1 == 0xE0 and b2 < 0xA0 is overlong
return false
end
elseif b1 < 0xE0 or (b1 == 0xED and not allow_surrogates) then -- b1 == 0xED and b2 >= 0xA0 is a surrogate
return false
end
elseif not (b4 >= 0x80 and b4 <= 0xBF) then -- 4-byte
return false
elseif b2 < 0x90 then
if not (b1 >= 0xF1 and b1 <= 0xF4) then -- b1 == 0xF0 and b2 < 0x90 is overlong
return false
end
elseif not (b1 >= 0xF0 and b1 <= 0xF3) then -- b1 == 0xF4 and b2 >= 0x90 is too high
return false
end
end
return true
end
end
pattern_escape = export.pattern_escape


--[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>.]==]
do
function export.charset_escape(str)
local charset_chars = {
return (gsub(str, "[%%%-%]^]", "%%%0"))
["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^"
end
}
charset_escape = export.charset_escape
charset_chars.__index = charset_chars
local chars = setmetatable({
["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+",
["."] = "%.", ["?"] = "%?", ["["] = "%["
}, charset_chars)
 
--[==[
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's
version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example,
{"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving
arbitrary text (e.g. from user input).
]==]
-- Escapes the pattern magic characters $%()*+-.?[]^ and converts NUL to %z,
-- using the `chars` lookup table as the gsub replacement; the substitution
-- count returned by gsub is deliberately discarded.
function export.pattern_escape(str)
	local escaped = gsub(str, "[%z$%%()*+%-.?[%]^]", chars)
	return escaped
end
pattern_escape = export.pattern_escape
 
--[==[
Escapes only {%}, which is the only magic character used in replacement
[[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.
]==]
-- Doubles every "%", the only magic character in string.gsub and
-- mw.ustring.gsub replacement strings; gsub's count result is discarded.
function export.replacement_escape(str)
	local doubled = gsub(str, "%%", "%%%%")
	return doubled
end
replacement_escape = export.replacement_escape
 
-- Converts a single character into a charset that matches both its uppercase
-- and lowercase forms (e.g. "a" becomes "[Aa]"). Characters with no case
-- distinction are returned as-is, escaped via `chars` when magic.
local function case_insensitive_char(ch)
local upper_ch = uupper(ch)
if upper_ch == ch then
ch = ulower(ch)
if ch == upper_ch then
-- Caseless character: return it directly (escaped if necessary).
return chars[ch] or ch
end
end
-- Both forms go into the charset, each escaped via `charset_chars` when
-- magic inside a character set.
return "[" .. (charset_chars[upper_ch] or upper_ch) .. (charset_chars[ch] or ch) .. "]"
end
 
local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2)
if not (loc1 and start <= str_len) then
-- Add final chunk and return.
n = n + 1
text[n] = _gsub(_sub(str, start), ".", chars)
return
elseif loc2 < loc1 then
if _sub == sub then
local b = byte(str, loc1)
if b and b >= 128 then
loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
end
end
n = n + 1
text[n] = _gsub(_sub(str, start, loc1), ".", chars)
start = loc1 + 1
if start > str_len then
return
end
else
-- Add chunk up to the current match.
n = n + 1
text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars)
-- Add current match.
n = n + 1
text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char)
start = loc2 + 1
end
return n, start
end


--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
--[==[
function export.replacement_escape(str)
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes
return (gsub(str, "%%", "%%%%"))
all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second
argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns
any pattern matching facilities off in the optional pattern supplied.
]==]
-- Escapes the pattern magic characters in `str` and makes every cased
-- character case-insensitive. With no second argument, the whole string is
-- converted character-by-character; otherwise `pattern_or_func` (a pattern or
-- a find-like function) splits the string into matched/unmatched chunks that
-- are processed by iterate() above. `str_lib` forces the string library and
-- `plain` disables pattern-matching facilities in the supplied pattern.
function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
if pattern_or_func == nil then
-- No splitter: convert each character (bytes when using the string
-- library, full UTF-8 characters otherwise).
return (gsub(str, str_lib and "[^\128-\255]" or ".[\128-\191]*", case_insensitive_char))
end
local text, n, start, str_len, _string, callable = {}, 0, 1
pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
local _find, _gsub, _sub = _string.find, _string.gsub, _string.sub
if callable then
repeat
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, pattern_or_func(str, start))
until not start
-- Special case if the pattern is anchored to the start: "^" always
-- anchors to the start position, not the start of the string, so get
-- around this by only attempting one match with the pattern, then match
-- the end of the string.
elseif byte(pattern_or_func) == 0x5E then -- ^
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
if start ~= nil then
iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, "$", start, plain))
end
else
repeat
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
until not start
end
return concat(text)
end
end
end
replacement_escape = export.replacement_escape


do
do
local character_classes
-- Lazily builds the set of byte values for the letters that form "%"
-- character classes (acdlpsuwx in both cases, plus uppercase Z; lowercase z
-- is handled separately by callers). Stores the set in the upvalue
-- `character_classes` and clears itself, so it runs at most once.
local function get_character_classes()
	local classes = {
		[0x41] = true, [0x61] = true, -- Aa
		[0x43] = true, [0x63] = true, -- Cc
		[0x44] = true, [0x64] = true, -- Dd
		[0x4C] = true, [0x6C] = true, -- Ll
		[0x50] = true, [0x70] = true, -- Pp
		[0x53] = true, [0x73] = true, -- Ss
		[0x55] = true, [0x75] = true, -- Uu
		[0x57] = true, [0x77] = true, -- Ww
		[0x58] = true, [0x78] = true, -- Xx
		[0x5A] = true, -- Z; lowercase z dealt with separately.
	}
	character_classes, get_character_classes = classes, nil
	return classes
end
local function check_sets_equal(set1, set2)
local function check_sets_equal(set1, set2)
local k2
local k2
Line 129: Line 375:
local function parse_1_byte_charset(pattern, pos)
local function parse_1_byte_charset(pattern, pos)
local ch
while true do
while true do
local ch, nxt_pos
pos, ch = match(pattern, "()([%%%]\192-\255])", pos)
pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
if ch == "%" then
if not ch then
local nxt = byte(pattern, pos + 1)
return false
if not nxt or nxt >= 128 or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWXZ, but not z
elseif ch == "%" then
if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then
return false
return false
end
end
pos = pos + 2
pos = pos + 2
elseif ch == "]" then
elseif ch == "]" then
pos = nxt_pos
pos = pos + 1
return pos
return pos
else
else
return false
return false
end
end
end
end
end
end
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
--[==[
pattern_simplifier = require("Module:fun").memoize(function(pattern)
Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion
isn't possible, returns false.
]==]
function pattern_simplifier(pattern)
if type(pattern) == "number" then
if type(pattern) == "number" then
return tostring(pattern)
return tostring(pattern)
end
end
local pos, captures, start, n, output = 1, 0, 1, 0
local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0
while true do
while true do
local ch, nxt_pos
-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails.
pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos)
if not ch then
if not ch then
break
break
end
end
local nxt = sub(pattern, nxt_pos, nxt_pos)
local nxt = byte(pattern, nxt_pos)
if ch == "%" then
if ch == "%" then
if nxt == "b" then
if nxt == 0x62 then -- b
if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3)
if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then
return false
return false
end
end
pos = pos + 4
pos = pos + 4
elseif nxt == "f" then
elseif nxt == 0x66 then -- f
pos = pos + 2
nxt_pos = nxt_pos + 2
if not match(pattern, "^()%[[^^]", pos) then
local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos)
-- Only possible to convert a positive %f charset which is
-- all ASCII, so use parse_1_byte_charset.
if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^
return false
return false
elseif nxt3 == 0x5D then -- Initial ] is non-magic.
nxt_pos = nxt_pos + 1
end
end
-- Only possible to convert a %f charset which is all
pos = parse_1_byte_charset(pattern, nxt_pos)
-- ASCII, so use parse_1_byte_charset.
pos = parse_1_byte_charset(pattern, pos)
if not pos then
if not pos then
return false
return false
end
end
elseif nxt == "Z" then
elseif nxt == 0x5A then -- Z
pos = pos + 2
nxt = byte(pattern, nxt_pos + 1)
nxt = sub(pattern, pos, pos)
if nxt == 0x2A or nxt == 0x2D then -- *-
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 3
pos = pos + 1
else
else
output = output or {}
if output == nil then
output = {}
end
local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]"
n = n + 1
n = n + 1
if nxt == "?" then
if nxt == 0x2B then -- +
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
output[n] = ins .. "%Z*"
pos = pos + 1
pos = pos + 3
elseif nxt == 0x3F then -- ?
output[n] = ins .. "?[\128-\191]*"
pos = pos + 3
else
else
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
output[n] = ins .. "[\128-\191]*"
pos = pos + 2
end
end
start = pos
start = pos
end
end
elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz
return false
return false
-- Skip the next character if it's ASCII. Otherwise, we will
-- Skip the next character if it's ASCII. Otherwise, we will
-- still need to do length checks.
-- still need to do length checks.
else
else
pos = pos + (byte(nxt) < 128 and 2 or 1)
pos = pos + (nxt < 128 and 2 or 1)
end
end
elseif ch == "(" then
elseif ch == "(" then
if nxt == ")" or captures == 32 then
if nxt == 0x29 or capture_groups == 32 then -- )
return false
return false
end
end
captures = captures + 1
capture_groups = capture_groups + 1
pos = pos + 1
pos = pos + 1
elseif ch == "." then
elseif ch == "." then
if nxt == "*" or nxt == "+" or nxt == "-" then
if nxt == 0x2A or nxt == 0x2D then -- *-
pos = pos + 2
pos = pos + 2
else
else
output = output or {}
if output == nil then
output = {}
end
local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]"
n = n + 1
n = n + 1
if nxt == "?" then
if nxt == 0x2B then -- +
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
output[n] = ins .. ".*"
pos = pos + 2
elseif nxt == 0x3F then -- ?
output[n] = ins .. "?[\128-\191]*"
pos = pos + 2
pos = pos + 2
else
else
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
output[n] = ins .. "[\128-\191]*"
pos = pos + 1
pos = pos + 1
end
end
Line 224: Line 488:
elseif ch == "[" then
elseif ch == "[" then
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
if nxt == "^" then
if nxt == 0x5E then -- ^
return false
return false
-- If the first character is "%", ch_len is determined by the
-- If the first character is "%", ch_len is determined by the
-- next one instead.
-- next one instead.
elseif nxt == "%" then
elseif nxt == 0x25 then -- %
nxt = byte(pattern, nxt_pos + 1)
elseif nxt == 0x5D then -- Initial ] is non-magic.
nxt_pos = nxt_pos + 1
nxt_pos = nxt_pos + 1
nxt = sub(pattern, nxt_pos, nxt_pos)
end
end
local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
if not nxt then
return false
end
local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4
if ch_len == 1 then -- Single-byte charset.
if ch_len == 1 then -- Single-byte charset.
pos = parse_1_byte_charset(pattern, pos + 1)
pos = parse_1_byte_charset(pattern, nxt_pos)
if not pos then
if not pos then
return false
return false
end
end
else -- Multibyte charset.
else -- Multibyte charset.
-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST.
local charset_pos, bytes = pos
local charset_pos, bytes = pos
pos = pos + 1
pos = pos + 1
while true do -- TODO: non-ASCII charset ranges.
while true do -- TODO: non-ASCII charset ranges.
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos)
if not ch then
return false
-- If escaped, get the next character. No need to
-- If escaped, get the next character. No need to
-- distinguish magic characters or character classes,
-- distinguish magic characters or character classes,
-- as they'll all fail for having the wrong length
-- as they'll all fail for having the wrong length
-- anyway.
-- anyway.
elseif ch == "%" then
if ch == "%" then
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos)
elseif ch == "]" then
elseif ch == "]" then
pos = nxt_pos
pos = nxt_pos
break
break
end
end
if ch_len ~= #ch then
if not (ch and nxt_pos - pos == ch_len) then
return false
return false
elseif bytes == nil then
bytes = {}
end
end
bytes = bytes or {}
local bytes, last = bytes, nxt_pos - 1
local bytes = bytes
for i = pos, last - 1 do
for i = 1, ch_len - 1 do
local b = byte(pattern, i)
local b = byte(ch, i, i)
local bytes_b = bytes[b]
bytes[b] = bytes[b] or {}
if bytes_b == nil then
bytes = bytes[b]
bytes_b = {}
bytes[b] = bytes_b
end
bytes[b], bytes = bytes_b, bytes_b
end
end
bytes[byte(ch, -1)] = true
bytes[byte(pattern, last)] = true
pos = nxt_pos
pos = nxt_pos
end
end
Line 271: Line 543:
return false
return false
end
end
local nxt = sub(pattern, pos, pos)
nxt = byte(pattern, pos)
if (
if (
(nxt == "?" or nxt == "*" or nxt == "-") or
(nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-?
(nxt == "+" and ch_len > 2) or
(nxt == 0x2B and ch_len > 2) or -- +
not check_sets(bytes)
not check_sets(bytes)
) then
) then
Line 292: Line 564:
bytes = next_byte
bytes = next_byte
until next_byte == true
until next_byte == true
if nxt == "+" then
if nxt == 0x2B then -- +
local range1, range2 = ranges[1], ranges[2]
local range1, range2 = ranges[1], ranges[2]
ranges[1] = make_charset(range1)
ranges[1], ranges[3] = make_charset(range1), make_charset(range2)
ranges[3] = make_charset(range2)
local n = #range2
local n = #range2
for i = 1, #range1 do
for i = 1, #range1 do
Line 308: Line 579:
end
end
end
end
output = output or {}
if output == nil then
output = {}
end
nxt = byte(pattern, pos)
n = n + 1
n = n + 1
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) ..
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
start = pos
start = pos
end
end
elseif nxt == "+" then
elseif not nxt then
if #ch ~= 2 then
break
elseif nxt == 0x2B then -- +
if nxt_pos - pos ~= 2 then
return false
return false
elseif output == nil then
output = {}
end
end
output = output or {}
pos, nxt_pos = pos + 1, nxt_pos + 1
nxt = byte(pattern, nxt_pos)
local ch2 = sub(pattern, pos, pos)
n = n + 1
n = n + 1
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 ..
pos = nxt_pos + 1
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
start = pos
pos, start = nxt_pos, nxt_pos
elseif nxt == "?" or nxt == "*" or nxt == "-" then
elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-?
return false
return false
else
else
Line 332: Line 613:
end
end
return concat(output) .. sub(pattern, start)
return concat(output) .. sub(pattern, start)
end, true)
end
export.pattern_simplifier = pattern_simplifier -- For testing.
pattern_simplifier = memoize(pattern_simplifier, true)
export.pattern_simplifier = pattern_simplifier
end
 
--[==[
Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring
library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}).
 
The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used
(e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary
characters.
]==]
-- Normalizes `charset` (the interior of a character set) into a full string
-- or ustring pattern charset, sanitizing "[" / "]" / bare hyphens so they
-- behave as ordinary characters while leaving "^", "-" ranges and "%"
-- escapes functional. `output` stays nil until a fix-up is actually needed,
-- so the common case returns "[" .. charset .. "]" with no table allocation.
function get_charset(charset)
if type(charset) == "number" then
return tostring(charset)
end
local pos, start, n, output = 1, 1, 0
-- Skip over a leading "^" so it keeps its negation meaning.
if byte(charset) == 0x5E then -- ^
pos = pos + 1
end
-- FIXME: "]" is non-magic if it's the first character in a charset.
local nxt_pos, nxt
while true do
-- Find the next character that needs attention: "%", "-" or "]".
local new_pos, ch = match(charset, "()([%%%-%]])", pos)
if not ch then
break
-- Skip percent escapes. Ranges can't start with them, either.
elseif ch == "%" then
pos = new_pos + 2
else
-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`.
if ch == "-" and new_pos > pos then
pos, nxt_pos, nxt = new_pos - 1, new_pos, ch
ch = sub(charset, pos, pos)
else
pos, nxt_pos = new_pos, new_pos + 1
nxt = sub(charset, nxt_pos, nxt_pos)
end
-- Range.
if nxt == "-" then
if output == nil then
output = {}
end
n = n + 1
output[n] = sub(charset, start, pos - 1)
nxt_pos = nxt_pos + 1
nxt = sub(charset, nxt_pos, nxt_pos)
-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour.
if nxt == "" or nxt == "%" then
n = n + 1
output[n] = (ch == "]" and "%]" or ch) .. "%-"
start = nxt_pos
nxt_pos = nxt_pos + 2
-- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be
-- adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is
-- omitted if the range would be empty (i.e. if the first byte is greater than the second).
else
n = n + 1
output[n] = (ch == "]" and (byte(nxt) >= 0x5D and "%]^" or "^") or ch) .. "-" ..
(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt)
nxt_pos = nxt_pos + 1
start = nxt_pos
end
elseif ch == "-" or ch == "]" then
-- Bare hyphen or bracket: escape it with "%".
if output == nil then
output = {}
end
n = n + 1
output[n] = sub(charset, start, pos - 1) .. "%" .. ch
start = nxt_pos
end
pos = nxt_pos
end
end
-- Nothing needed escaping: wrap the input unchanged.
if start == 1 then
return "[" .. charset .. "]"
end
return "[" .. concat(output) .. sub(charset, start) .. "]"
end
end
get_charset = memoize(get_charset, true)
export.get_charset = get_charset


function export.len(str)
function export.len(str)
Line 423: Line 783:
end
end


--[==[Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.]==]
--[==[
Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.
]==]
function export.plain_gsub(str, pattern, repl, n)
function export.plain_gsub(str, pattern, repl, n)
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n)
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n)
end
end


--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
--[==[
Reverses a UTF-8 string; equivalent to string.reverse.
]==]
function export.reverse(str)
function export.reverse(str)
return reverse(gsub(str, "[\194-\244][\128-\191]*", reverse))
return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse)))
end
 
function export.char(...) -- To be moved to [[Module:string/char]].
return u(...)
end
end


do
do
local function err(cp)
local function utf8_err(func_name)
error("Codepoint " .. cp .. " is out of range: codepoints must be between 0x0 and 0x10FFFF.", 2)
error(format("bad argument #1 to '%s' (string is not UTF-8)", func_name), 4)
end
end


local function utf8_char(cp)
local function get_codepoint(func_name, b1, b2, b3, b4)
cp = tonumber(cp)
if b1 <= 0x7F then
if cp < 0 then
err("-0x" .. format("%X", -cp + 1))
elseif cp < 0x80 then
return char(cp)
elseif cp < 0x800 then
return char(
0xC0 + cp / 0x40,
0x80 + cp % 0x40
)
elseif cp < 0x10000 then
if cp >= 0xD800 and cp < 0xE000 then
return "?" -- mw.ustring.char returns "?" for surrogates.
end
return char(
0xE0 + cp / 0x1000,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
elseif cp < 0x110000 then
return char(
0xF0 + cp / 0x40000,
0x80 + cp / 0x1000 % 0x40,
0x80 + cp / 0x40 % 0x40,
0x80 + cp % 0x40
)
end
err("0x" .. format("%X", cp))
end
 
function export.char(cp, ...)
if ... == nil then
return utf8_char(cp)
end
local ret = {cp, ...}
for i = 1, select("#", cp, ...) do
ret[i] = utf8_char(ret[i])
end
return concat(ret)
end
u = export.char
end
 
do
local function get_codepoint(b1, b2, b3, b4)
if b1 < 128 then
return b1, 1
return b1, 1
elseif b1 < 224 then
elseif not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
return 0x40 * b1 + b2 - 0x3080, 2
utf8_err(func_name)
elseif b1 < 240 then
elseif b1 <= 0xDF then
return 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080, 3
local cp = 0x40 * b1 + b2 - 0x3080
return cp >= 0x80 and cp or utf8_err(func_name), 2
elseif not (b3 and b3 >= 0x80 and b3 <= 0xBF) then
utf8_err(func_name)
elseif b1 <= 0xEF then
local cp = 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080
return cp >= 0x800 and cp or utf8_err(func_name), 3
elseif not (b4 and b4 >= 0x80 and b4 <= 0xBF) then
utf8_err(func_name)
end
end
return 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080, 4
local cp = 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080
return cp >= 0x10000 and cp <= 0x10FFFF and cp or utf8_err(func_name), 4
end
end


function export.codepoint(str, i, j)
function export.codepoint(str, i, j)
if type(str) == "number" then
if str == "" then
return -- return nothing
elseif type(str) == "number" then
return byte(str, i, j)
return byte(str, i, j)
end
end
i, j = i or 1, j == -1 and #str or i or 1
i, j = i or 1, j == -1 and #str or i or 1
if i == 1 and j == 1 then
if i == 1 and j == 1 then
return (get_codepoint(byte(str, 1, 4)))
return (get_codepoint("codepoint", byte(str, 1, 4)))
elseif i < 0 or j < 0 then
elseif i < 0 or j < 0 then
return ucodepoint(str, i, j) -- FIXME
return ucodepoint(str, i, j) -- FIXME
Line 517: Line 851:
nr = nr + 1
nr = nr + 1
local add
local add
ret[nr], add = get_codepoint(b1, b2, b3, b4)
ret[nr], add = get_codepoint("codepoint", b1, b2, b3, b4)
nb = nb + add
nb = nb + add
end
end
Line 549: Line 883:
return nil
return nil
end
end
local ret, add = get_codepoint(b1, b2, b3, b4)
local ret, add = get_codepoint("gcodepoint", b1, b2, b3, b4)
nb = nb + add
nb = nb + add
return ret
return ret
Line 556: Line 890:
end
end


--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
do
function export.lower(str)
local _ulower = ulower
return (match(str, "^()[^\128-\255]*$") and lower or ulower)(str)
 
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==]
function export.lower(str)
return (match(str, "^()[^\128-\255]*$") and lower or _ulower)(str)
end
end
end


--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
do
function export.upper(str)
local _uupper = uupper
return (match(str, "^()[^\128-\255]*$") and upper or uupper)(str)
 
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==]
function export.upper(str)
return (match(str, "^()[^\128-\255]*$") and upper or _uupper)(str)
end
end
end


do
do
local function add_captures(text, n, ...)
local function add_captures(t, n, ...)
if ... == nil then
return
end
-- Insert any captures from the splitting pattern.
-- Insert any captures from the splitting pattern.
local offset, capture = n - 1, ...
local offset, capture = n - 1, ...
while capture do
while capture do
n = n + 1
n = n + 1
text[n] = capture
t[n] = capture
capture = select(n - offset, ...)
capture = select(n - offset, ...)
end
end
Line 578: Line 923:
end
end
local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
--[==[
if not (loc1 and start <= str_len) then
Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like
-- If no match, or there is but we're past the end of the string
Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by
-- (which happens when the match is the empty string), then add
one character at a time; Python returns the whole remainder of the string). When possible, it will use the string
-- the final chunk and return.
library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the
n = n + 1
string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.
text[n] = _sub(str, start)
return
elseif loc2 < loc1 then
-- Special case: If we match the empty string, then include the
-- next character; this avoids an infinite loop, and makes
-- splitting by an empty string work the way mw.text.split() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, return immediately, so we
-- don't get a final empty string. If using the string library, we
-- need to make sure we advance by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
n = n + 1
text[n] = _sub(str, start, loc1)
start = loc1 + 1
if start > str_len then
return ... and add_captures(text, n, ...) or n
end
else
-- Add chunk up to the current match.
n = n + 1
text[n] = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
return (... and add_captures(text, n, ...) or n), start
end
local function _split(str, pattern, str_len, _sub, _find, plain)
In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start
local text, n, start = {}, 0, 1
index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil
if there are no further matches. By default, the start index will be calculated using the ustring library, unless
`str_lib` or `plain` is set.
]==]
function export.split(str, pattern_or_func, str_lib, plain)
local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0
repeat
repeat
n, start = iterate(str, str_len, text, n, start, _sub, _find(str, pattern, start, plain))
n = add_captures(t, n, iter())
until not start
until n == nil
return t
return text
end
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.]==]
function export.split(str, pattern, str_lib, plain)
if str_lib or plain then
return _split(str, pattern, #str, sub, find, plain)
end
local simple = pattern_simplifier(pattern)
if simple then
return _split(str, simple, #str, sub, find)
end
return _split(str, pattern, ulen(str), usub, ufind)
end
end
export.capturing_split = export.split -- To be removed.
export.capturing_split = export.split -- To be removed.
end
end


do
--[==[
-- TODO: merge this with export.split. Not clear how to do this while
Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the
-- maintaining the same level of performance, as gsplit is slower.
string up the splitting pattern, with any capture groups being returned as additional values on that iteration.
local function _split(str, pattern, str_len, _sub, _find, plain)
]==]
local start, final = 1
function export.gsplit(str, pattern_or_func, str_lib, plain)
local start, final, str_len, _string, callable = 1
local function iter(loc1, loc2, ...)
pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
-- If no match, return the final chunk.
local _find, _sub = _string.find, _string.sub
if not loc1 then
local function iter(loc1, loc2, ...)
-- If no match, or there is but we're past the end of the string
-- (which happens when the match is the empty string), then return
-- the final chunk.
if not loc1 then
final = true
return _sub(str, start)
end
-- Special case: If we match the empty string, then eat the
-- next character; this avoids an infinite loop, and makes
-- splitting by the empty string work the way mw.text.gsplit() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, set `final` to true, so we
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
local b = byte(str, loc1)
if b and b >= 128 then
loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
end
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
final = true
return _sub(str, start)
end
-- Special case: If we match the empty string, then eat the
-- next character; this avoids an infinite loop, and makes
-- splitting by the empty string work the way mw.text.gsplit() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, set `final` to true, so we
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
else
start = loc1 + 1
end
-- Eat chunk up to the current match.
else
else
chunk = _sub(str, start, loc1 - 1)
start = loc1 + 1
start = loc2 + 1
end
end
return chunk, ...
-- Eat chunk up to the current match.
else
chunk = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
end
return chunk, ...
end
if callable then
return function()
return function()
if not final then
if not final then
return iter(_find(str, pattern, start, plain))
return iter(pattern_or_func(str, start))
end
end
-- Special case if the pattern is anchored to the start: "^" always
-- anchors to the start position, not the start of the string, so get
-- around this by only attempting one match with the pattern, then match
-- the end of the string.
elseif byte(pattern_or_func) == 0x5E then -- ^
local returned
return function()
if not returned then
returned = true
return iter(_find(str, pattern_or_func, start, plain))
elseif not final then
return iter(_find(str, "$", start, plain))
end
end
return nil
end
end
end
end
return function()
function export.gsplit(str, pattern, str_lib, plain)
if not final then
if str_lib or plain then
return iter(_find(str, pattern_or_func, start, plain))
return _split(str, pattern, #str, sub, find, plain)
end
end
local simple = pattern_simplifier(pattern)
if simple then
return _split(str, simple, #str, sub, find)
end
return _split(str, pattern, ulen(str), usub, ufind)
end
end
end
gsplit = export.gsplit
function export.count(str, pattern, plain)
if plain then
return select(2, gsub(str, pattern_escape(pattern), ""))
end
local simple = pattern_simplifier(pattern)
if simple then
return select(2, gsub(str, pattern, ""))
end
return select(2, ugsub(str, pattern, ""))
end
end


function export.trim(str, charset)
function export.trim(str, charset, str_lib, plain)
if not charset then
if charset == nil then
return match(str, "^()%s*$") and "" or match(str, "^%s*(.*%S)")
-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are
elseif match(charset, "^()[^\128-\255]*$") then
-- very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to ""
return match(str, "^()[" .. charset .. "]*$") and "" or match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
-- first.
return match(gsub(str, "^%s*", ""), "^.*%S") or ""
elseif charset == "" then
return str
end
end
return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets
-- are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there
-- would be two callbacks into PHP, which is slower.
local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
if not str_lib then
local simple = pattern_simplifier(pattern)
if not simple then
return umatch(str, pattern)
end
pattern = simple
end
return match(str, pattern)
end
end
trim = export.trim


do
do
local entities
local entities
 
local function get_entities()
local function decode_numeric_entity(code, pattern, base)
entities, get_entities = load_data("Module:data/entities"), nil
local cp = match(code, pattern) and tonumber(code, base)
return entities
return cp and cp < 0x110000 and u(cp) or nil
end
end


local function decode_entity(hash, x, code)
local function decode_entity(hash, x, code)
if hash == "#" then
if hash == "" then
return x == "" and decode_numeric_entity(code, "^%d+$") or
return (entities or get_entities())[x .. code]
decode_numeric_entity(code, "^%x+$", 16)
end
end
entities = entities or load_data("Module:data/entities")
local cp
return entities[x .. code]
if x == "" then
cp = match(code, "^()%d+$") and tonumber(code)
else
cp = match(code, "^()%x+$") and tonumber(code, 16)
end
return cp and (cp <= 0xD7FF or cp >= 0xE000 and cp <= 0x10FFFF) and u(cp) or nil
end
end


-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases
-- which have also been included in [[Module:data/entities]].
function export.decode_entities(str)
function export.decode_entities(str)
return find(str, "&", 1, true) and
local amp = find(str, "&", nil, true)
gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
return amp and find(str, ";", amp, true) and gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
end
end
end
end


do
do
local html_entities
local entities
local function get_entities()
local function encode_entity(ch)
local entity = html_entities[ch]
if entity then
return entity
end
entity = "&#" .. codepoint(ch) .. ";"
html_entities[ch] = entity
return entity
end
function export.encode_entities(str, charset, str_lib, plain)
-- Memoized HTML entities (taken from mw.text.lua).
-- Memoized HTML entities (taken from mw.text.lua).
html_entities = html_entities or {
entities, get_entities = {
["\""] = "&quot;",
["\""] = "&quot;",
["&"] = "&amp;",
["&"] = "&amp;",
Line 750: Line 1,096:
[">"] = "&gt;",
[">"] = "&gt;",
["\194\160"] = "&nbsp;",
["\194\160"] = "&nbsp;",
}
}, nil
if not charset then
return entities
return (gsub(str, "[\"&'<>\194]\160?", html_entities))
end
elseif plain then
 
return (gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity))
local function encode_entity(ch)
elseif str_lib then
local entity = (entities or get_entities())[ch]
if not match(charset, "^()[^\128-\255]*$") then
if entity == nil then
error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
local cp = codepoint(ch)
-- U+D800 to U+DFFF are surrogates, so can't be encoded as entities.
entity = cp and (cp <= 0xD7FF or cp >= 0xE000) and format("&#%d;", cp) or false
entities[ch] = entity
end
return entity or nil
end
 
function export.encode_entities(str, charset, str_lib, plain)
if charset == nil then
return (gsub(str, "[\"&'<>\194]\160?", entities or get_entities()))
elseif charset == "" then
return str
end
local pattern = plain and ("[" .. charset_escape(charset) .. "]") or charset == "." and charset or get_charset(charset)
if not str_lib then
local simple = pattern_simplifier(pattern)
if not simple then
return (ugsub(str, pattern, encode_entity))
end
end
return (gsub(str, "[" .. charset .. "]", encode_entity))
pattern = simple
end
local pattern = charset and "[" .. charset .. "]"
local simple = pattern_simplifier(pattern)
if simple then
return (gsub(str, simple, encode_entity))
end
end
return (ugsub(str, pattern, encode_entity))
return (gsub(str, pattern, encode_entity))
end
end
end
end
Line 787: Line 1,146:
enctype = enctype and upper(enctype) or "QUERY"
enctype = enctype and upper(enctype) or "QUERY"
if enctype == "PATH" then
if enctype == "PATH" then
return find(str, "%", 1, true) and
return find(str, "%", nil, true) and gsub(str, "%%(%x%x)", decode_path) or str
gsub(str, "%%(%x%x)", decode_path) or str
elseif enctype == "QUERY" then
elseif enctype == "QUERY" then
return (find(str, "%", 1, true) or find(str, "+", 1, true)) and
return (find(str, "%", nil, true) or find(str, "+", nil, true)) and gsub(str, "([%%%+])(%x?%x?)", decode) or str
gsub(str, "([%%%+])(%x?%x?)", decode) or str
elseif enctype == "WIKI" then
elseif enctype == "WIKI" then
return (find(str, "%", 1, true) or find(str, "_", 1, true)) and
return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str
gsub(str, "([%%_])(%x?%x?)", decode) or str
end
end
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
error("bad argument #2 to 'decode_uri' (expected QUERY, PATH, or WIKI)", 2)
end
end
end
end
Line 802: Line 1,158:
do
do
local function _remove_comments(str, pre)
local function _remove_comments(str, pre)
local head = find(str, "<!--", 1, true)
local head = find(str, "<!--", nil, true)
if not head then
if not head then
return str
return str
Line 824: Line 1,180:
end
end
--[==[Removes any HTML comments from the input text. `stage` can be one of three options:
--[==[
* {{lua|"PRE"}} (default) applies the method used by MediaWiki's preprocessor: all {{code||<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code||<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
Removes any HTML comments from the input text. `stage` can be one of three options:
* {{lua|"POST"}} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code||<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code||<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code||<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {{lua|"PRE"}} method will have already been applied by the native parser.
* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all
* {{lua|"BOTH"}} applies {{lua|"PRE"}} then {{lua|"POST"}}.]==]
  {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed
  {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or
  [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the
  preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags);
  if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops
  over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g.
  {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed
  {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs,
  where the {"PRE"} method will have already been applied by the native parser.
* {"BOTH"} applies {"PRE"} then {"POST"}.
]==]
function export.remove_comments(str, stage)
function export.remove_comments(str, stage)
if not stage or stage == "PRE" then
if not stage or stage == "PRE" then
Line 834: Line 1,201:
local processed = stage == "POST" and _remove_comments(str) or
local processed = stage == "POST" and _remove_comments(str) or
stage == "BOTH" and _remove_comments(str, true) or
stage == "BOTH" and _remove_comments(str, true) or
error("bad argument #2 to \"remove_comments\" (expected PRE, POST, or BOTH)", 2)
error("bad argument #2 to 'remove_comments' (expected PRE, POST, or BOTH)", 2)
while processed ~= str do
while processed ~= str do
str = processed
str = processed
Line 841: Line 1,208:
return str
return str
end
end
end
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
function export.php_trim(str)
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
end
php_trim = export.php_trim
--[==[Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} is normalized to {{code|lua|1}} (a number), and {{code|lua|" foo "}} is normalized to {{code|lua|"foo"}}. If the input is not a string, it is returned unchanged.
After being trimmed with {{code|lua|export.php_trim}}, strings are converted to numbers if:
# They are integers, with no decimals (2.0) or leading zeroes (02).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
function export.scribunto_param_key(key)
if type(key) ~= "string" then
return key
end
key = php_trim(key)
if match(key, "^-?[1-9]%d*$") then
local num = tonumber(key)
-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true.
return (
num <= 9007199254740991 and num >= -9007199254740991 or
key == "9007199254740992" or
key == "-9007199254740992"
) and num or key
elseif key == "0" then
return 0
end
return key
end
end


do
do
local byte_escapes
local byte_escapes
local function get_byte_escapes()
byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil
return byte_escapes
end
local function escape_byte(b)
local function escape_byte(b)
return byte_escapes[b] or format("\\%03d", byte(b))
return (byte_escapes or get_byte_escapes())[b] or format("\\%03d", byte(b))
end
end
function export.escape_bytes(str)
function export.escape_bytes(str)
byte_escapes = byte_escapes or load_data("Module:string utilities/data").byte_escapes
return (gsub(str, ".", escape_byte))
return (gsub(str, ".", escape_byte))
end
end
Line 892: Line 1,231:
return name == "op" and "{" or
return name == "op" and "{" or
name == "cl" and "}" or
name == "cl" and "}" or
error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'")
elseif fun(name) and type(fun(name)) ~= "string" then
elseif fun(name) and type(fun(name)) ~= "string" then
error(module_name .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
end
end
return fun(name) or error(module_name .. ".format: \"" .. name .. "\" not found in table")
return fun(name) or error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table")
end))
end))
end
end
format_fun = export.format_fun
format_fun = export.format_fun


--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
--[==[
This function, unlike {string.format} and {mw.ustring.format}, takes just two parameters, a format string and a table,
and replaces all instances of { {param_name} } in the format string with the table's entry for {param_name}. The opening
and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry beginning with a
slash can be escaped by doubling the initial slash.
 
====Examples====
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
* {string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"}) }
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
*: produces: {"one fish, two fish, red fish, blue fish"}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*: produces: {"The set {1, 2, 3} contains three elements."}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.
]==]
function export.format(str, tbl)
function export.format(str, tbl)
return format_fun(str, function(key)
return format_fun(str, function(key)
Line 916: Line 1,261:
do
do
local function do_uclcfirst(str, case_func)
local function do_uclcfirst(str, case_func)
-- Actual function to re-case of the first letter.
-- Re-case the first letter.
local first_letter = case_func(match(str, "^.[\128-\191]*") or "")
local first, remainder = match(str, "^(.[\128-\191]*)(.*)")
return first_letter .. sub(str, #first_letter + 1)
return first and (case_func(first) .. remainder) or ""
end
end
local function uclcfirst(str, case_func)
local function uclcfirst(str, case_func)
-- Strip off any HTML tags at the beginning. This currently does not handle comments or <ref>...</ref>
-- correctly; it's intended for text wrapped in <span> or the like, as happens when passing text through
-- [[Module:links]].
local html_at_beginning = nil
if str:match("^<") then
while true do
local html_tag, rest = str:match("^(<.->)(.*)$")
if not html_tag then
break
end
if not html_at_beginning then
html_at_beginning = {}
end
insert(html_at_beginning, html_tag)
str = rest
end
end
-- If there's a link at the beginning, re-case the first letter of the
-- If there's a link at the beginning, re-case the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
local retval
if link then
if link then
return "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
retval = "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
else
retval = do_uclcfirst(str, case_func)
end
if html_at_beginning then
retval = concat(html_at_beginning) .. retval
end
end
return do_uclcfirst(str, case_func)
return retval
end
end
--[==[
Uppercase the first character of the input string, correctly handling one-part and two-part links, optionally
surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
uppercase the first character of text that may include links that have been passed through `full_link()` in
[[Module:links]] or a similar function.
]==]
function export.ucfirst(str)
function export.ucfirst(str)
return uclcfirst(str, uupper)
return uclcfirst(str, uupper)
end
end
ucfirst = export.ucfirst


--[==[
Lowercase the first character of the input string, correctly handling one-part and two-part links, optionally
surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
lowercase the first character of text that may include links that have been passed through `full_link()` in
[[Module:links]] or a similar function.
]==]
function export.lcfirst(str)
function export.lcfirst(str)
return uclcfirst(str, ulower)
return uclcfirst(str, ulower)
end
end
local function capitalize(w)
--[==[Capitalizes each word of the input string. WARNING: May be broken in the presence of multiword links.]==]
return uclcfirst(w, uupper)
end
--[==[Capitalize each word of a string. WARNING: May be broken in the presence of multiword links.]==]
function export.capitalize(str)
function export.capitalize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
-- Capitalize multi-word that is separated by spaces
-- Capitalize multi-word that is separated by spaces
-- by uppercasing the first letter of each part.
-- by uppercasing the first letter of each part.
-- I assume nobody will input all CAP text.
return (ugsub(str, "%w+", ucfirst))
return (ugsub(str, "%S+", capitalize))
end
end
end


do
local function do_title_case(first, remainder)
local function word_ends_in_consonant_plus_y(str)
first = uupper(first)
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
return remainder == "" and first or (first .. ulower(remainder))
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
-- We should maybe consider applying this rule here; but it may not
-- be important as this function is almost always called on common nouns
-- (e.g. parts of speech, place types).
return find(str, "[^aeiouyAEIOUY ]y$")
end
end
 
local function word_takes_es_plural(str)
--[==[
return find(str, "[sxz]$") or find(str, "[csz]h$")
Capitalizes each word of the input string, with any further letters in each word being converted to lowercase.
]==]
function export.title_case(str)
return str == "" and "" or ugsub(str, "(%w)(%w*)", do_title_case)
end
end
title_case = export.title_case
local function do_pluralize(str)
 
if word_ends_in_consonant_plus_y(str) then
-- avoid returning multiple values
return (gsub(str, "y$", "ies"))
elseif word_takes_es_plural(str) then
return str .. "es"
end
return str .. "s"
end
--[==[
--[==[
Pluralize a word in a smart fashion, according to normal English rules.
Converts the input string to {{w|Camel case|CamelCase}}. Any non-word characters are treated as breaks between
# If word ends in consonant + -y, replace the -y with -ies.
words. If `lower_first` is set, then the first character of the string will be lowercase (e.g. camelCase).
# If the word ends in -s, -x, -z, -ch, -sh, -zh, add -es.
# Otherwise, add -s.
 
This handles links correctly:
# If a piped link, change the second part appropriately.
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
]==]
]==]
function export.pluralize(str)
function export.camel_case(str, lower_first)
if type(str) == "table" then
str = ugsub(str, "%W*(%w*)", title_case)
-- allow calling from a template
return lower_first and do_uclcfirst(str, ulower) or str
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_pluralize(str)
elseif linktext ~= "" then
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
elseif word_ends_in_consonant_plus_y(link) then
return beginning .. "[[" .. link .. "|" .. gsub(link, "y$", "ies") .. "]]"
end
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
end
end
end
end


do
do
local function do_singularize(str)
local function do_snake_case(nonword, word)
local sing = match(str, "^(.-)ies$")
return nonword == "" and word or "_" .. word
if sing then
return sing .. "y"
end
-- Handle cases like "[[parish]]es"
return match(str, "^(.-[cs]h%]*)es$") or -- not -zhes
-- Handle cases like "[[box]]es"
match(str, "^(.-x%]*)es$") or -- not -ses or -zes
-- Handle regular plurals
match(str, "^(.-)s$") or
-- Otherwise, return input
str
end
end
 
local function collapse_link(link, linktext)
if link == linktext then
return "[[" .. link .. "]]"
end
return "[[" .. link .. "|" .. linktext .. "]]"
end
--[==[
--[==[
Singularize a word in a smart fashion, according to normal English rules. Works analogously to {pluralize()}.
Converts the input string to {{w|Snake case|snake_case}}. Any non-word characters are treated as breaks between
 
words.
'''NOTE''': This doesn't always work as well as {pluralize()}. Beware. It will mishandle cases like "passes" -> "passe", "eyries" -> "eyry".
# If word ends in -ies, replace -ies with -y.
# If the word ends in -xes, -shes, -ches, remove -es. [Does not affect -ses, cf. "houses", "impasses".]
# Otherwise, remove -s.
 
This handles links correctly:
# If a piped link, change the second part appropriately. Collapse the link to a simple link if both parts end up the same.
# If a non-piped link, singularize the link.
# A link like "[[parish]]es" will be handled correctly because the code that checks for -shes etc. allows ] characters between the
  'sh' etc. and final -es.
]==]
]==]
function export.singularize(str)
function export.snake_case(str)
if type(str) == "table" then
return (ugsub(str, "(%W*)(%w*)", do_snake_case))
-- allow calling from a template
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_singularize(str)
elseif linktext ~= "" then
return beginning .. collapse_link(link, do_singularize(linktext))
end
return beginning .. "[[" .. do_singularize(link) .. "]]"
end
end
end
--[==[
Return the appropriate indefinite article to prefix to `str`. Correctly handles links and capitalized text.
Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning with
a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.get_indefinite_article(str, ucfirst)
str = str or ""
local is_vowel = false
-- If there's a link at the beginning, examine the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
if link then
is_vowel = find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
else
is_vowel = find(str, "^[AEIOUaeiou]")
end
return is_vowel and (ucfirst and "An" or "an") or (ucfirst and "A" or "a")
end
get_indefinite_article = export.get_indefinite_article
--[==[
Prefix `text` with the appropriate indefinite article to prefix to `text`. Correctly handles links and capitalized
text. Does not correctly handle words like [[union]], [[uniform]] and [[university]] that take "a" despite beginning
with a 'u'. The returned article will have its first letter capitalized if `ucfirst` is specified, otherwise lowercase.
]==]
function export.add_indefinite_article(text, ucfirst)
return get_indefinite_article(text, ucfirst) .. " " .. text
end
end


return export
return export