Module:string utilities: Difference between revisions
No edit summary |
No edit summary |
||
| Line 1: | Line 1: | ||
local export = {} | |||
local function_module = "Module:fun" | |||
local load_module = "Module:load" | |||
local memoize_module = "Module:memoize" | |||
local mw = mw | local mw = mw | ||
local string = string | local string = string | ||
| Line 12: | Line 18: | ||
local gsub = string.gsub | local gsub = string.gsub | ||
local len = string.len | local len = string.len | ||
local lower = string.lower | local lower = string.lower | ||
local match = string.match | local match = string.match | ||
local next = next | local next = next | ||
local require = require | |||
local reverse = string.reverse | local reverse = string.reverse | ||
local select = select | local select = select | ||
| Line 34: | Line 40: | ||
local usub = ustring.sub | local usub = ustring.sub | ||
local uupper = ustring.upper | local uupper = ustring.upper | ||
local memoize = require(memoize_module) | |||
-- Defined below. | -- Defined below. | ||
local charset_escape | local charset_escape | ||
| Line 39: | Line 48: | ||
local explode_utf8 | local explode_utf8 | ||
local format_fun | local format_fun | ||
local get_charset | |||
local get_indefinite_article | local get_indefinite_article | ||
local gsplit | |||
local pattern_escape | local pattern_escape | ||
local pattern_simplifier | local pattern_simplifier | ||
| Line 47: | Line 58: | ||
local ulen | local ulen | ||
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
local function is_callable(...)
	-- Replace this stub with the real function, then forward the call.
	is_callable = require(function_module).is_callable
	return is_callable(...)
end

local function load_data(...)
	-- Replace this stub with the real function, then forward the call.
	load_data = require(load_module).load_data
	return load_data(...)
end
-- Shared setup for the pattern-iterating functions in this module. Decides
-- which library is used to match `pattern` against `str`, and returns four
-- values:
--   1. the pattern (or find function) to use, which may be a simplified
--      string-library equivalent of the pattern supplied;
--   2. the length of `str` as measured by the chosen library (bytes for the
--      string library, `ulen(str)` for the ustring library);
--   3. the library table itself (`string` or `ustring`);
--   4. whether the pattern is callable (i.e. a custom find function).
local function prepare_iter(str, pattern, str_lib, plain)
	local callable = is_callable(pattern)
	-- The string library is used unconditionally if the caller asked for it,
	-- or if pattern matching is switched off.
	if str_lib or plain then
		return pattern, #str, string, callable
	elseif not callable then
		-- Otherwise, only use the string library if the pattern can be
		-- converted into an equivalent string-library pattern.
		local simple = pattern_simplifier(pattern)
		if simple then
			return simple, #str, string, false
		end
	end
	return pattern, ulen(str), ustring, callable
end
--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==] | --[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==] | ||
| Line 62: | Line 94: | ||
explode_utf8 = export.explode_utf8 | explode_utf8 = export.explode_utf8 | ||
do
	-- Escapes for the characters which are magic inside a character set. It
	-- doubles as the metatable of `chars`, so lookups which miss in `chars`
	-- fall through to it.
	local charset_chars = {
		["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^"
	}
	charset_chars.__index = charset_chars
	
	-- Escapes for the characters which are magic in a pattern but not in a
	-- character set; the rest are inherited from `charset_chars`.
	local chars = setmetatable({
		["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+",
		["."] = "%.", ["?"] = "%?", ["["] = "%["
	}, charset_chars)
	
	--[==[Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>, and converts the null character to <code>%z</code>. For example, {{lua|"^$()%.[]*+-?\0"}} becomes {{lua|"%^%$%(%)%%%.%[%]%*%+%-%?%z"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
	function export.pattern_escape(str)
		-- Parentheses discard the substitution count returned by gsub.
		return (gsub(str, "[%z$%%()*+%-.?[%]^]", chars))
	end
	pattern_escape = export.pattern_escape
	
	--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>, and converts the null character to <code>%z</code>.]==]
	function export.charset_escape(str)
		return (gsub(str, "[%z%%%-%]^]", charset_chars))
	end
	charset_escape = export.charset_escape
	
	--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
	function export.replacement_escape(str)
		return (gsub(str, "%%", "%%%%"))
	end
	replacement_escape = export.replacement_escape
	
	-- Converts a single character into a pattern matching it in either case.
	-- Characters with no distinct upper/lowercase forms are simply
	-- pattern-escaped; otherwise the result is a two-character set containing
	-- the uppercase and lowercase forms, each escaped for use in a set.
	local function case_insensitive_char(ch)
		local upper_ch = uupper(ch)
		if upper_ch == ch then
			ch = ulower(ch)
			if ch == upper_ch then
				return chars[ch] or ch
			end
		end
		return "[" .. (charset_chars[upper_ch] or upper_ch) .. (charset_chars[ch] or ch) .. "]"
	end
	
	-- Processes one match (`loc1`, `loc2`) of the delimiting pattern: text
	-- between matches is appended to `text` pattern-escaped, while the matched
	-- text itself is made case-insensitive. Returns the updated `n` and
	-- `start`, or nothing once the end of the string has been dealt with.
	local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2)
		if not (loc1 and start <= str_len) then
			-- Add final chunk and return.
			n = n + 1
			text[n] = _gsub(_sub(str, start), ".", chars)
			return
		elseif loc2 < loc1 then
			-- Empty match: consume one character to guarantee progress. With
			-- the string library, that means the whole UTF-8 sequence.
			if _sub == sub then
				local b = byte(str, loc1)
				if b and b >= 128 then
					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
				end
			end
			n = n + 1
			text[n] = _gsub(_sub(str, start, loc1), ".", chars)
			start = loc1 + 1
			if start > str_len then
				return
			end
		else
			-- Add chunk up to the current match.
			n = n + 1
			text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars)
			-- Add current match.
			n = n + 1
			text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char)
			start = loc2 + 1
		end
		return n, start
	end
	
	--[==[
	Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns any pattern matching facilities off in the optional pattern supplied.]==]
	function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
		if pattern_or_func == nil then
			return (gsub(str, str_lib and "[^\128-\255]" or ".[\128-\191]*", case_insensitive_char))
		end
		local text, n, start, str_len, _string, callable = {}, 0, 1
		pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
		local _find, _gsub, _sub = _string.find, _string.gsub, _string.sub
		if callable then
			repeat
				n, start = iterate(str, str_len, text, n, start, _gsub, _sub, pattern_or_func(str, start))
			until not start
		-- Special case if the pattern is anchored to the start: "^" always
		-- anchors to the start position, not the start of the string, so get
		-- around this by only attempting one match with the pattern, then match
		-- the end of the string. This must not apply when `plain` is set, since
		-- a leading "^" is then an ordinary character (and "$" would be
		-- searched for literally).
		elseif not plain and byte(pattern_or_func) == 0x5E then -- ^
			n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
			if start ~= nil then
				iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, "$", start, plain))
			end
		else
			repeat
				n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
			until not start
		end
		return concat(text)
	end
end
do | do | ||
local character_classes | |||
local function get_character_classes() | |||
character_classes, get_character_classes = { | |||
[0x41] = true, [0x61] = true, -- Aa | |||
[0x43] = true, [0x63] = true, -- Cc | |||
[0x44] = true, [0x64] = true, -- Dd | |||
[0x4C] = true, [0x6C] = true, -- Ll | |||
[0x50] = true, [0x70] = true, -- Pp | |||
[0x53] = true, [0x73] = true, -- Ss | |||
[0x55] = true, [0x75] = true, -- Uu | |||
[0x57] = true, [0x77] = true, -- Ww | |||
[0x58] = true, [0x78] = true, -- Xx | |||
[0x5A] = true, -- z dealt with separately. | |||
}, nil | |||
return character_classes | |||
end | |||
local function check_sets_equal(set1, set2) | local function check_sets_equal(set1, set2) | ||
local k2 | local k2 | ||
| Line 129: | Line 265: | ||
local function parse_1_byte_charset(pattern, pos) | local function parse_1_byte_charset(pattern, pos) | ||
local ch | |||
while true do | while true do | ||
pos, ch = match(pattern, "()([%%%]\192-\255])", pos) | |||
pos, ch | if ch == "%" then | ||
if | local nxt = byte(pattern, pos + 1) | ||
if not nxt or nxt >= 128 or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWXZ, but not z | |||
return false | return false | ||
end | end | ||
pos = pos + 2 | pos = pos + 2 | ||
elseif ch == "]" then | elseif ch == "]" then | ||
pos = | pos = pos + 1 | ||
return pos | return pos | ||
else | else | ||
return false | return false | ||
end | end | ||
end | end | ||
end | end | ||
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==] | --[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==] | ||
pattern_simplifier | function pattern_simplifier(pattern) | ||
if type(pattern) == "number" then | if type(pattern) == "number" then | ||
return tostring(pattern) | return tostring(pattern) | ||
end | end | ||
local pos, | local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0 | ||
while true do | while true do | ||
-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails. | |||
pos, ch, nxt_pos = match(pattern, "()([%%(.[\ | pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos) | ||
if not ch then | if not ch then | ||
break | break | ||
end | end | ||
local nxt = | local nxt = byte(pattern, nxt_pos) | ||
if ch == "%" then | if ch == "%" then | ||
if nxt == | if nxt == 0x62 then -- b | ||
local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3) | |||
if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then | |||
return false | return false | ||
end | end | ||
pos = pos + 4 | pos = pos + 4 | ||
elseif nxt == | elseif nxt == 0x66 then -- f | ||
nxt_pos = nxt_pos + 2 | |||
local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos) | |||
-- Only possible to convert a positive %f charset which is | |||
-- all ASCII, so use parse_1_byte_charset. | |||
if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^ | |||
return false | return false | ||
elseif nxt3 == 0x5D then -- Initial ] is non-magic. | |||
nxt_pos = nxt_pos + 1 | |||
end | end | ||
pos = parse_1_byte_charset(pattern, nxt_pos) | |||
pos = parse_1_byte_charset(pattern, | |||
if not pos then | if not pos then | ||
return false | return false | ||
end | end | ||
elseif nxt == | elseif nxt == 0x5A then -- Z | ||
nxt = byte(pattern, nxt_pos + 1) | |||
nxt = | if nxt == 0x2A or nxt == 0x2D then -- *- | ||
if nxt == | pos = pos + 3 | ||
pos = pos + | |||
else | else | ||
output = output | if output == nil then | ||
output = {} | |||
end | |||
local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]" | |||
n = n + 1 | n = n + 1 | ||
if nxt == | if nxt == 0x2B then -- + | ||
output[n] = | output[n] = ins .. "%Z*" | ||
pos = pos + | pos = pos + 3 | ||
elseif nxt == 0x3F then -- ? | |||
output[n] = ins .. "?[\128-\191]*" | |||
pos = pos + 3 | |||
else | else | ||
output[n] = | output[n] = ins .. "[\128-\191]*" | ||
pos = pos + 2 | |||
end | end | ||
start = pos | start = pos | ||
end | end | ||
elseif | elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz | ||
return false | return false | ||
-- Skip the next character if it's ASCII. Otherwise, we will | -- Skip the next character if it's ASCII. Otherwise, we will | ||
-- still need to do length checks. | -- still need to do length checks. | ||
else | else | ||
pos = pos + | pos = pos + (nxt < 128 and 2 or 1) | ||
end | end | ||
elseif ch == "(" then | elseif ch == "(" then | ||
if nxt == | if nxt == 0x29 or capture_groups == 32 then -- ) | ||
return false | return false | ||
end | end | ||
capture_groups = capture_groups + 1 | |||
pos = pos + 1 | pos = pos + 1 | ||
elseif ch == "." then | elseif ch == "." then | ||
if nxt == | if nxt == 0x2A or nxt == 0x2D then -- *- | ||
pos = pos + 2 | pos = pos + 2 | ||
else | else | ||
output = output | if output == nil then | ||
output = {} | |||
end | |||
local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]" | |||
n = n + 1 | n = n + 1 | ||
if nxt == " | if nxt == 0x2B then -- + | ||
output[n] = | output[n] = ins .. ".*" | ||
pos = pos + 2 | |||
elseif nxt == 0x3F then -- ? | |||
output[n] = ins .. "?[\128-\191]*" | |||
pos = pos + 2 | pos = pos + 2 | ||
else | else | ||
output[n] = | output[n] = ins .. "[\128-\191]*" | ||
pos = pos + 1 | pos = pos + 1 | ||
end | end | ||
| Line 224: | Line 375: | ||
elseif ch == "[" then | elseif ch == "[" then | ||
-- Fail negative charsets. TODO: 1-byte charsets should be safe. | -- Fail negative charsets. TODO: 1-byte charsets should be safe. | ||
if nxt == | if nxt == 0x5E then -- ^ | ||
return false | return false | ||
-- If the first character is "%", ch_len is determined by the | -- If the first character is "%", ch_len is determined by the | ||
-- next one instead. | -- next one instead. | ||
elseif nxt == | elseif nxt == 0x25 then -- % | ||
nxt = byte(pattern, nxt_pos + 1) | |||
elseif nxt == 0x5D then -- Initial ] is non-magic. | |||
nxt_pos = nxt_pos + 1 | nxt_pos = nxt_pos + 1 | ||
end | end | ||
local ch_len = | if not nxt then | ||
return false | |||
end | |||
local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4 | |||
if ch_len == 1 then -- Single-byte charset. | if ch_len == 1 then -- Single-byte charset. | ||
pos = parse_1_byte_charset(pattern, | pos = parse_1_byte_charset(pattern, nxt_pos) | ||
if not pos then | if not pos then | ||
return false | return false | ||
end | end | ||
else -- Multibyte charset. | else -- Multibyte charset. | ||
-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST. | |||
local charset_pos, bytes = pos | local charset_pos, bytes = pos | ||
pos = pos + 1 | pos = pos + 1 | ||
while true do -- TODO: non-ASCII charset ranges. | while true do -- TODO: non-ASCII charset ranges. | ||
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]* | pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos) | ||
-- If escaped, get the next character. No need to | -- If escaped, get the next character. No need to | ||
-- distinguish magic characters or character classes, | -- distinguish magic characters or character classes, | ||
-- as they'll all fail for having the wrong length | -- as they'll all fail for having the wrong length | ||
-- anyway. | -- anyway. | ||
if ch == "%" then | |||
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]* | pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos) | ||
elseif ch == "]" then | elseif ch == "]" then | ||
pos = nxt_pos | pos = nxt_pos | ||
break | break | ||
end | end | ||
if ch_len | if not (ch and nxt_pos - pos == ch_len) then | ||
return false | return false | ||
elseif bytes == nil then | |||
bytes = {} | |||
end | end | ||
local bytes, last = bytes, nxt_pos - 1 | |||
local bytes = bytes | for i = pos, last - 1 do | ||
for i = | local b = byte(pattern, i) | ||
local b = byte( | local bytes_b = bytes[b] | ||
bytes[b] = bytes[b] | if bytes_b == nil then | ||
bytes_b = {} | |||
bytes[b] = bytes_b | |||
end | |||
bytes[b], bytes = bytes_b, bytes_b | |||
end | end | ||
bytes[byte( | bytes[byte(pattern, last)] = true | ||
pos = nxt_pos | pos = nxt_pos | ||
end | end | ||
| Line 271: | Line 430: | ||
return false | return false | ||
end | end | ||
nxt = byte(pattern, pos) | |||
if ( | if ( | ||
(nxt == | (nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-? | ||
(nxt == | (nxt == 0x2B and ch_len > 2) or -- + | ||
not check_sets(bytes) | not check_sets(bytes) | ||
) then | ) then | ||
| Line 292: | Line 451: | ||
bytes = next_byte | bytes = next_byte | ||
until next_byte == true | until next_byte == true | ||
if nxt == | if nxt == 0x2B then -- + | ||
local range1, range2 = ranges[1], ranges[2] | local range1, range2 = ranges[1], ranges[2] | ||
ranges[1] = make_charset(range1) | ranges[1], ranges[3] = make_charset(range1), make_charset(range2) | ||
local n = #range2 | local n = #range2 | ||
for i = 1, #range1 do | for i = 1, #range1 do | ||
| Line 308: | Line 466: | ||
end | end | ||
end | end | ||
output = output | if output == nil then | ||
output = {} | |||
end | |||
nxt = byte(pattern, pos) | |||
n = n + 1 | n = n + 1 | ||
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) | output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) .. | ||
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped | |||
start = pos | start = pos | ||
end | end | ||
elseif nxt == | elseif not nxt then | ||
if | break | ||
elseif nxt == 0x2B then -- + | |||
if nxt_pos - pos ~= 2 then | |||
return false | return false | ||
elseif output == nil then | |||
output = {} | |||
end | end | ||
pos, nxt_pos = pos + 1, nxt_pos + 1 | |||
nxt = byte(pattern, nxt_pos) | |||
local ch2 = sub(pattern, pos, pos) | |||
n = n + 1 | n = n + 1 | ||
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. | output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 .. | ||
pos = nxt_pos | ((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped | ||
pos, start = nxt_pos, nxt_pos | |||
elseif nxt == | elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-? | ||
return false | return false | ||
else | else | ||
| Line 332: | Line 500: | ||
end | end | ||
return concat(output) .. sub(pattern, start) | return concat(output) .. sub(pattern, start) | ||
end, true) | end | ||
export.pattern_simplifier = pattern_simplifier -- | pattern_simplifier = memoize(pattern_simplifier, true) | ||
export.pattern_simplifier = pattern_simplifier | |||
end | |||
--[==[Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring library pattern (e.g. {{lua|"abcd-g"}} becomes {{lua|"[abcd-g]"}}, and {{lua|"[]"}} becomes {{lua|"[[%]]"}}).

The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used (e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary characters.

Numbers are returned as strings without being wrapped in brackets. The result is memoized.]==]
function get_charset(charset)
	if type(charset) == "number" then
		return tostring(charset)
	end
	-- `start` marks the beginning of the text not yet copied to `output`,
	-- which is only created once something actually needs rewriting.
	local pos, start, n, output = 1, 1, 0
	if byte(charset) == 0x5E then -- ^
		pos = pos + 1
	end
	-- FIXME: "]" is non-magic if it's the first character in a charset.
	local nxt_pos, nxt
	while true do
		local new_pos, ch = match(charset, "()([%%%-%]])", pos)
		if not ch then
			break
		-- Skip percent escapes. Ranges can't start with them, either.
		elseif ch == "%" then
			pos = new_pos + 2
		else
			-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`.
			if ch == "-" and new_pos > pos then
				pos, nxt_pos, nxt = new_pos - 1, new_pos, ch
				ch = sub(charset, pos, pos)
			else
				pos, nxt_pos = new_pos, new_pos + 1
				nxt = sub(charset, nxt_pos, nxt_pos)
			end
			-- Range.
			if nxt == "-" then
				if output == nil then
					output = {}
				end
				n = n + 1
				output[n] = sub(charset, start, pos - 1)
				nxt_pos = nxt_pos + 1
				nxt = sub(charset, nxt_pos, nxt_pos)
				-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour.
				if nxt == "" or nxt == "%" then
					n = n + 1
					output[n] = (ch == "]" and "%]" or ch) .. "%-"
					start = nxt_pos
					nxt_pos = nxt_pos + 2
				-- Ranges can't contain "%]" (the escaped form of "]"), so range inputs like "]-z" or "a-]" must be adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is omitted if the range would be empty (i.e. if the first byte is greater than the second).
				else
					n = n + 1
					output[n] = (ch == "]" and (byte(nxt) >= 0x5D and "%]^" or "^") or ch) .. "-" ..
						(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt)
					nxt_pos = nxt_pos + 1
					start = nxt_pos
				end
			-- A lone hyphen or closing bracket is escaped.
			elseif ch == "-" or ch == "]" then
				if output == nil then
					output = {}
				end
				n = n + 1
				output[n] = sub(charset, start, pos - 1) .. "%" .. ch
				start = nxt_pos
			end
			pos = nxt_pos
		end
	end
	-- Nothing needed rewriting, so the input can be wrapped as-is.
	if start == 1 then
		return "[" .. charset .. "]"
	end
	return "[" .. concat(output) .. sub(charset, start) .. "]"
end
get_charset = memoize(get_charset, true)
export.get_charset = get_charset
function export.len(str) | function export.len(str) | ||
| Line 430: | Line 670: | ||
--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
function export.reverse(str)
	-- First reverse the bytes within each multibyte character, so that the
	-- bytewise reversal of the whole string restores them to the correct
	-- order. The inner parentheses discard the substitution count returned by
	-- gsub.
	return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse)))
end
| Line 441: | Line 681: | ||
cp = tonumber(cp) | cp = tonumber(cp) | ||
if cp < 0 then | if cp < 0 then | ||
err("-0x | err(format("-0x%X", -cp)) | ||
elseif cp < 0x80 then | elseif cp < 0x80 then | ||
return char(cp) | return char(cp) | ||
| Line 466: | Line 706: | ||
) | ) | ||
end | end | ||
err( | err(format("0x%X", cp)) | ||
end | end | ||
| Line 567: | Line 807: | ||
do | do | ||
local function add_captures( | local function add_captures(t, n, ...) | ||
if ... == nil then | |||
return | |||
end | |||
-- Insert any captures from the splitting pattern. | -- Insert any captures from the splitting pattern. | ||
local offset, capture = n - 1, ... | local offset, capture = n - 1, ... | ||
while capture do | while capture do | ||
n = n + 1 | n = n + 1 | ||
t[n] = capture | |||
capture = select(n - offset, ...) | capture = select(n - offset, ...) | ||
end | end | ||
| Line 578: | Line 821: | ||
end | end | ||
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal. | |||
In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil if there are no further matches. By default, the start index will be calculated using the ustring library, unless `str_lib` or `plain` is set.]==] | |||
local | function export.split(str, pattern_or_func, str_lib, plain) | ||
local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0 | |||
repeat | repeat | ||
n | n = add_captures(t, n, iter()) | ||
until | until n == nil | ||
return t | |||
end | end | ||
export.capturing_split = export.split -- To be removed. | export.capturing_split = export.split -- To be removed. | ||
end | end | ||
--[==[Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the string up to the splitting pattern, with any capture groups being returned as additional values on that iteration.]==]
function export.gsplit(str, pattern_or_func, str_lib, plain)
	local start, final, str_len, _string, callable = 1
	pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
	local _find, _sub = _string.find, _string.sub
	
	-- Takes the results of one find call (start index, end index, then any
	-- captures), and returns the next chunk followed by the captures.
	local function iter(loc1, loc2, ...)
		-- If there is no match, return the final chunk.
		if not loc1 then
			final = true
			return _sub(str, start)
		end
		-- Special case: If we match the empty string, then eat the
		-- next character; this avoids an infinite loop, and makes
		-- splitting by the empty string work the way mw.text.gsplit() does
		-- (including non-adjacent empty string matches with %f). If we
		-- reach the end of the string this way, set `final` to true, so we
		-- don't get stuck matching the empty string at the end.
		local chunk
		if loc2 < loc1 then
			-- If using the string library, we need to make sure we advance
			-- by one UTF-8 character.
			if _sub == sub then
				local b = byte(str, loc1)
				if b and b >= 128 then
					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
				end
			end
			chunk = _sub(str, start, loc1)
			if loc1 >= str_len then
				final = true
			else
				start = loc1 + 1
			end
		-- Eat chunk up to the current match.
		else
			chunk = _sub(str, start, loc1 - 1)
			start = loc2 + 1
		end
		return chunk, ...
	end
	
	if callable then
		return function()
			if not final then
				return iter(pattern_or_func(str, start))
			end
		end
	-- Special case if the pattern is anchored to the start: "^" always
	-- anchors to the start position, not the start of the string, so get
	-- around this by only attempting one match with the pattern, then match
	-- the end of the string. This must not apply when `plain` is set, since
	-- a leading "^" is then an ordinary character (and "$" would be searched
	-- for literally, cutting the split short after the first separator).
	elseif not plain and byte(pattern_or_func) == 0x5E then -- ^
		local returned
		return function()
			if not returned then
				returned = true
				return iter(_find(str, pattern_or_func, start, plain))
			elseif not final then
				return iter(_find(str, "$", start, plain))
			end
		end
	end
	return function()
		if not final then
			return iter(_find(str, pattern_or_func, start, plain))
		end
	end
end
gsplit = export.gsplit
--[==[Trims characters from both ends of `str`. If `charset` is not supplied, whitespace (`%s`) is trimmed; if it is the empty string, `str` is returned unchanged. Otherwise, `charset` is the interior of a character set (see {get_charset}) specifying the characters to trim, or a literal list of characters if `plain` is set. The string library is used if `str_lib` is set or the resulting pattern can be simplified; otherwise the ustring library is used.]==]
function export.trim(str, charset, str_lib, plain)
	if charset == nil then
		-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to "" first.
		-- Note: gsub is not the final argument, so only its first return value is passed to match.
		return match(gsub(str, "^%s*", ""), "^.*%S") or ""
	elseif charset == "" then
		return str
	end
	charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
	-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there would be two callbacks into PHP, which is slower.
	local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
	if not str_lib then
		local simple = pattern_simplifier(pattern)
		if not simple then
			return umatch(str, pattern)
		end
		pattern = simple
	end
	return match(str, pattern)
end
do
	local entities
	
	-- Loads the named entity data on first use, then removes itself.
	local function get_entities()
		entities, get_entities = load_data("Module:data/entities"), nil
		return entities
	end
	
	-- gsub callback: receives the three captures of the entity pattern ("#"
	-- or "", "x"/"X" or "", and the remaining name or digits). Returning nil
	-- leaves the matched text unchanged.
	local function decode_entity(hash, x, code)
		-- Named entity: any "x"/"X" captured separately is part of the name.
		if hash == "" then
			return (entities or get_entities())[x .. code]
		end
		-- Numeric entity: decimal without "x", hexadecimal with it.
		local cp
		if x == "" then
			cp = match(code, "^()%d+$") and tonumber(code)
		else
			cp = match(code, "^()%x+$") and tonumber(code, 16)
		end
		return cp and cp < 0x110000 and u(cp) or nil
	end
	
	-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
	function export.decode_entities(str)
		-- Cheap plain-text checks first: substitution is only attempted if
		-- there is an "&" with a ";" somewhere after it.
		local amp = find(str, "&", nil, true)
		return amp and find(str, ";", amp, true) and gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
	end
end
do
	local entities
	
	-- Builds the entity table on first use, then removes itself.
	local function get_entities()
		-- Memoized HTML entities (taken from mw.text.lua).
		entities, get_entities = {
			["\""] = "&quot;",
			["&"] = "&amp;",
			["'"] = "&#039;",
			["<"] = "&lt;",
			[">"] = "&gt;",
			["\194\160"] = "&nbsp;",
		}, nil
		return entities
	end
	
	-- gsub callback: returns the entity for `ch`, generating a decimal numeric
	-- entity (and memoizing it) if there is no entry for it yet.
	local function encode_entity(ch)
		local entity = (entities or get_entities())[ch]
		if entity == nil then
			entity = "&#" .. codepoint(ch) .. ";"
			entities[ch] = entity
		end
		return entity
	end
	
	--[==[Converts characters in `str` into HTML entities. If `charset` is not supplied, the characters <code>"&'<></code> and the non-breaking space are converted; if it is the empty string, `str` is returned unchanged. Otherwise, `charset` is the interior of a character set (see {get_charset}) specifying the characters to convert, a literal list of characters if `plain` is set, or {{lua|"."}} to convert every character. The string library is used if `str_lib` is set or the resulting pattern can be simplified; otherwise the ustring library is used.]==]
	function export.encode_entities(str, charset, str_lib, plain)
		if charset == nil then
			-- The entity table is used directly as the replacement: matches
			-- with no entry in it are left unchanged by gsub.
			return (gsub(str, "[\"&'<>\194]\160?", entities or get_entities()))
		elseif charset == "" then
			return str
		end
		local pattern = plain and ("[" .. charset_escape(charset) .. "]") or charset == "." and charset or get_charset(charset)
		if not str_lib then
			local simple = pattern_simplifier(pattern)
			if not simple then
				return (ugsub(str, pattern, encode_entity))
			end
			pattern = simple
		end
		return (gsub(str, pattern, encode_entity))
	end
end
| Line 787: | Line 1,013: | ||
enctype = enctype and upper(enctype) or "QUERY" | enctype = enctype and upper(enctype) or "QUERY" | ||
if enctype == "PATH" then | if enctype == "PATH" then | ||
return find(str, "%", | return find(str, "%", nil, true) and gsub(str, "%%(%x%x)", decode_path) or str | ||
elseif enctype == "QUERY" then | elseif enctype == "QUERY" then | ||
return (find(str, "%", | return (find(str, "%", nil, true) or find(str, "+", nil, true)) and gsub(str, "([%%%+])(%x?%x?)", decode) or str | ||
elseif enctype == "WIKI" then | elseif enctype == "WIKI" then | ||
return (find(str, "%", | return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str | ||
end | end | ||
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2) | error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2) | ||
| Line 802: | Line 1,025: | ||
do | do | ||
local function _remove_comments(str, pre) | local function _remove_comments(str, pre) | ||
local head = find(str, "<!--", | local head = find(str, "<!--", nil, true) | ||
if not head then | if not head then | ||
return str | return str | ||
| Line 843: | Line 1,066: | ||
end | end | ||
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{lua|"\0"}}, {{lua|"\t"}}, {{lua|"\n"}}, {{lua|"\v"}}, {{lua|"\r"}} and {{lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
function export.php_trim(str)
	-- A frontier pattern with a greedy quantifier is faster than the algorithms used by export.trim, but can only be used if the character set includes \0, since %z matches the start/end of the string, as well as \0. This is also immune to catastrophic backtracking.
	return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
end
php_trim = export.php_trim
--[==[Takes a parameter name as either a string or number, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{lua|frame.args}} table). For example, {{lua|"1"}} (a string) is normalized to {{lua|1}} (a number), {{lua|" foo "}} is normalized to {{lua|"foo"}}, and {{lua|1.5}} (a number) is normalized to {{lua|"1.5"}} (a string). Inputs which cannot be normalized (e.g. booleans) return {{lua|nil}}. If the `no_trim` flag is set, string parameters are not trimmed, but strings may still be converted to numbers if they do not contain whitespace; this is necessary when normalizing keys into the form received by PHP during callbacks, before any trimming occurs (e.g. in the table of arguments when calling {{lua|frame:expandTemplate()}}).

Strings are trimmed with {{lua|export.php_trim}}, unless the `no_trim` flag is set. They are then converted to numbers if '''all''' of the following are true:
# They are integers; i.e. no decimals or leading zeroes (e.g. {{lua|"2"}}, but not {{lua|"2.0"}} or {{lua|"02"}}).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# There is no leading sign unless < 0 (e.g. {{lua|"2"}} or {{lua|"-2"}}, but not {{lua|"+2"}} or {{lua|"-0"}}).
# They contain no leading or trailing whitespace (which may be present when the `no_trim` flag is set).

Numbers are converted to strings if '''either''':
# They are not integers (e.g. {{lua|1.5}}).
# They are > 2{{sup|53}} or < -2{{sup|53}}.

When converted to strings, integers ≤ 2{{sup|63}} and ≥ -2{{sup|63}} are formatted as integers (i.e. all digits are given), which is the range of PHP's integer precision, though the actual output may be imprecise since Lua's integer precision is > 2{{sup|53}} to < -2{{sup|53}}. All other numbers use the standard formatting output by {{lua|tostring()}}.]==]
function export.scribunto_param_key(key, no_trim)
	local tp = type(key)
	if tp == "string" then
		if not no_trim then
			key = php_trim(key)
		end
		-- The leading "()" position capture makes match() return a number instead of copying the matched string.
		-- [1-9] as the first digit rules out "0", "-0" and leading zeroes; "0" is handled separately below.
		if match(key, "^()-?[1-9]%d*$") then
			local num = tonumber(key)
			-- Lua integers are only precise to 2^53 - 1, so specifically check for 2^53 and -2^53 as strings, since a numerical comparison won't work as it can't distinguish 2^53 from 2^53 + 1.
			return (
				num <= 9007199254740991 and num >= -9007199254740991 or
				key == "9007199254740992" or
				key == "-9007199254740992"
			) and num or key
		end
		return key == "0" and 0 or key
	elseif tp == "number" then
		-- No special handling needed for inf or NaN.
		-- NOTE(review): 2^63 itself lies just outside the signed 64-bit range, so the result of format("%d", key) at exactly that upper bound may depend on the platform's float-to-integer conversion; confirm.
		return key % 1 == 0 and (
			key <= 9007199254740992 and key >= -9007199254740992 and key or
			key <= 9223372036854775808 and key >= -9223372036854775808 and format("%d", key)
		) or tostring(key)
	end
	-- Any other type (boolean, table, nil etc.) cannot be a parameter key.
	return nil
end
do | do | ||
local byte_escapes | local byte_escapes | ||
local function get_byte_escapes() | |||
byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil | |||
return byte_escapes | |||
end | |||
local function escape_byte(b) | local function escape_byte(b) | ||
return byte_escapes[b] or format("\\%03d", byte(b)) | return (byte_escapes or get_byte_escapes())[b] or format("\\%03d", byte(b)) | ||
end | end | ||
function export.escape_bytes(str) | function export.escape_bytes(str) | ||
return (gsub(str, ".", escape_byte)) | return (gsub(str, ".", escape_byte)) | ||
end | end | ||
| Line 892: | Line 1,134: | ||
return name == "op" and "{" or | return name == "op" and "{" or | ||
name == "cl" and "}" or | name == "cl" and "}" or | ||
error( | error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'") | ||
elseif fun(name) and type(fun(name)) ~= "string" then | elseif fun(name) and type(fun(name)) ~= "string" then | ||
error( | error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string") | ||
end | end | ||
return fun(name) or error( | return fun(name) or error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table") | ||
end)) | end)) | ||
end | end | ||
format_fun = export.format_fun | format_fun = export.format_fun | ||
--[==[This function, unlike {{lua|string.format}} and {{lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{lua|{param_name}}} in the format string with the table's entry for {{lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a backslash can be escaped by doubling the initial backslash.
====Examples====
* {{lua|=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{lua|"one fish, two fish, red fish, blue fish"}}
* {{lua|=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl) | function export.format(str, tbl) | ||
| Line 957: | Line 1,199: | ||
end | end | ||
-- Compatibility shim that forwards to Module:en-utilities. On the first call it replaces itself in the export table with the real function, so the module is only loaded if pluralize is actually used.
function export.pluralize(...) -- To be removed once all calling modules have been changed to call Module:en-utilities directly.
	export.pluralize = require("Module:en-utilities").pluralize
	return export.pluralize(...)
end
| Line 1,072: | Line 1,265: | ||
-- Returns the indefinite article for `str`: "an" if the relevant text begins with one of the letters A, E, I, O, U (either case), otherwise "a". If `ucfirst` is set, the article is capitalized ("An"/"A"). A nil `str` is treated as the empty string. The check is purely orthographic (first letter only), not phonetic.
function export.get_indefinite_article(str, ucfirst)
	str = str or ""
	-- If there's a link at the beginning, examine the first letter of the
	-- link text. This pattern matches both piped and unpiped links.
	-- If the link is not piped, the second capture (linktext) will be empty.
	local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
	-- Text to test: the displayed link text if piped, the link target if unpiped, or the whole string if there is no leading link.
	-- The "()" position capture avoids creating a one-character substring just to test for a match.
	if match(link and (linktext ~= "" and linktext or link) or str, "^()[AEIOUaeiou]") then
		return ucfirst and "An" or "an"
	end
	return ucfirst and "A" or "a"
end
get_indefinite_article = export.get_indefinite_article