Module:string utilities: Difference between revisions

No edit summary
No edit summary
Line 1: Line 1:
local export = {}
local function_module = "Module:fun"
local load_module = "Module:load"
local memoize_module = "Module:memoize"
local mw = mw
local mw = mw
local string = string
local string = string
Line 12: Line 18:
local gsub = string.gsub
local gsub = string.gsub
local len = string.len
local len = string.len
local load_data = mw.loadData
local lower = string.lower
local lower = string.lower
local match = string.match
local match = string.match
local next = next
local next = next
local require = require
local reverse = string.reverse
local reverse = string.reverse
local select = select
local select = select
Line 34: Line 40:
local usub = ustring.sub
local usub = ustring.sub
local uupper = ustring.upper
local uupper = ustring.upper
local memoize = require(memoize_module)
-- Defined below.
-- Defined below.
local charset_escape
local charset_escape
Line 39: Line 48:
local explode_utf8
local explode_utf8
local format_fun
local format_fun
local get_charset
local get_indefinite_article
local get_indefinite_article
local gsplit
local pattern_escape
local pattern_escape
local pattern_simplifier
local pattern_simplifier
Line 47: Line 58:
local ulen
local ulen


local module_name = "string_utilities"
--[==[
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==]
-- Lazy loader: the first call fetches `is_callable` from the module named by
-- `function_module`, rebinds this local to the fetched function, and forwards
-- the arguments to it. Subsequent calls go straight to the fetched function.
local function is_callable(...)
	local target = require(function_module).is_callable
	is_callable = target
	return target(...)
end


local export = {}
-- Lazy loader: the first call fetches `load_data` from the module named by
-- `load_module`, rebinds this local to the fetched function, and forwards the
-- arguments to it. Subsequent calls go straight to the fetched function.
local function load_data(...)
	local target = require(load_module).load_data
	load_data = target
	return target(...)
end
 
-- Works out which library an iteration over `str` should use.
-- Returns four values: the pattern (or callable) to search with, the length
-- of `str` as measured by the chosen library, the library table itself
-- (`string` or `ustring`), and whether `pattern` is callable.
local function prepare_iter(str, pattern, str_lib, plain)
	local is_func = is_callable(pattern)
	-- Forced string library or plain matching: use byte semantics as given.
	if str_lib or plain then
		return pattern, #str, string, is_func
	end
	-- A non-callable pattern may be convertible to a string library pattern.
	if not is_func then
		local simplified = pattern_simplifier(pattern)
		if simplified then
			return simplified, #str, string, false
		end
	end
	-- Otherwise fall back to the ustring library.
	return pattern, ulen(str), ustring, is_func
end


--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function has no safety checks for non-UTF-8 byte sequences, to optimize speed and memory use. Inputs containing them therefore result in undefined behaviour.]==]
Line 62: Line 94:
explode_utf8 = export.explode_utf8
explode_utf8 = export.explode_utf8


--[==[Escapes the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] (Lua's version of regular expressions): <code>$%()*+-.?[]^</code>. For example, {{code|lua|"^$()%.[]*+-?"}} becomes {{code|lua|"%^%$%(%)%%%.%[%]%*%+%-%?"}}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==]
do
function export.pattern_escape(str)
local charset_chars = {
return (gsub(str, "[$%%()*+%-.?[%]^]", "%%%0"))
["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^"
end
}
pattern_escape = export.pattern_escape
charset_chars.__index = charset_chars
local chars = setmetatable({
["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+",
["."] = "%.", ["?"] = "%?", ["["] = "%["
}, charset_chars)
 
--[==[Returns `str` with each of the [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] magic characters <code>$%()*+-.?[]^</code> prefixed by <code>%</code>, and each null character replaced by <code>%z</code>, so that the result can be embedded in a pattern and match `str` literally. For example, {{lua|"^$()%.[]*+-?\0"}} becomes {{lua|"%^%$%(%)%%%.%[%]%*%+%-%?%z"}}. Use this when building a pattern from arbitrary text (e.g. user input).]==]
function export.pattern_escape(str)
	-- Only the first result of gsub (the string) is returned, not the count.
	local escaped = gsub(str, "[%z$%%()*+%-.?[%]^]", chars)
	return escaped
end
pattern_escape = export.pattern_escape
 
--[==[Returns `str` with each of the characters that are magic inside a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character set (<code>%-]^</code>) prefixed by <code>%</code>, and each null character replaced by <code>%z</code>, so that the result can be placed between square brackets in a pattern.]==]
function export.charset_escape(str)
	-- Only the first result of gsub (the string) is returned, not the count.
	local escaped = gsub(str, "[%z%%%-%]^]", charset_chars)
	return escaped
end
charset_escape = export.charset_escape
 
--[==[Doubles every <code>%</code> in `str`. This is the only magic character in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] used with string.gsub and mw.ustring.gsub, so the result can be used as a literal replacement string.]==]
function export.replacement_escape(str)
	-- Only the first result of gsub (the string) is returned, not the count.
	local escaped = gsub(str, "%%", "%%%%")
	return escaped
end
replacement_escape = export.replacement_escape
 
-- gsub callback which turns a single character into a pattern fragment that
-- matches it in either case. A character whose upper- and lowercase forms are
-- the same is returned pattern-escaped (via `chars`); otherwise the result is
-- a two-member character set, with each member escaped via `charset_chars`.
local function case_insensitive_char(ch)
	local upper_ch = uupper(ch)
	local lower_ch = ch
	if upper_ch == ch then
		-- `ch` is already uppercase (or caseless), so compute the lowercase form.
		lower_ch = ulower(ch)
		if lower_ch == upper_ch then
			return chars[lower_ch] or lower_ch
		end
	end
	return "[" .. (charset_chars[upper_ch] or upper_ch) .. (charset_chars[lower_ch] or lower_ch) .. "]"
end


--[==[Escapes only the magic characters used in [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] character sets: <code>%-]^</code>.]==]
local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2)
function export.charset_escape(str)
if not (loc1 and start <= str_len) then
return (gsub(str, "[%%%-%]^]", "%%%0"))
-- Add final chunk and return.
end
n = n + 1
charset_escape = export.charset_escape
text[n] = _gsub(_sub(str, start), ".", chars)
return
elseif loc2 < loc1 then
if _sub == sub then
local b = byte(str, loc1)
if b and b >= 128 then
loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
end
end
n = n + 1
text[n] = _gsub(_sub(str, start, loc1), ".", chars)
start = loc1 + 1
if start > str_len then
return
end
else
-- Add chunk up to the current match.
n = n + 1
text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars)
-- Add current match.
n = n + 1
text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char)
start = loc2 + 1
end
return n, start
end


--[==[Escapes only <code>%</code>, which is the only magic character used in replacement [[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub.]==]
--[==[
function export.replacement_escape(str)
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns any pattern matching facilities off in the optional pattern supplied.]==]
return (gsub(str, "%%", "%%%%"))
function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain)
	-- Parameters:
	--   str             the text to convert into a pattern.
	--   pattern_or_func optional pattern, or find function, passed through
	--                   prepare_iter; if nil, every character of `str` is
	--                   passed to case_insensitive_char.
	--   str_lib         if set, forces use of the string library.
	--   plain           if set, `pattern_or_func` is searched for literally.
	-- Returns the resulting pattern as a single string.
	if pattern_or_func == nil then
		-- With the string library only ASCII bytes are converted; otherwise
		-- each lead byte is taken together with its continuation bytes.
		return (gsub(str, str_lib and "[^\128-\255]" or ".[\128-\191]*", case_insensitive_char))
	end
	local text, n, start, str_len, _string, callable = {}, 0, 1
	pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
	local _find, _gsub, _sub = _string.find, _string.gsub, _string.sub
	-- `iterate` appends chunks to `text` and returns the new count and start
	-- position, or nothing once the final chunk has been added.
	if callable then
		repeat
			n, start = iterate(str, str_len, text, n, start, _gsub, _sub, pattern_or_func(str, start))
		until not start
	-- Special case if the pattern is anchored to the start: "^" always
	-- anchors to the start position, not the start of the string, so get
	-- around this by only attempting one match with the pattern, then match
	-- the end of the string. This must not apply when `plain` is set, since
	-- a leading "^" is then an ordinary character and "$" would be searched
	-- for literally instead of matching the end of the string.
	elseif not plain and byte(pattern_or_func) == 0x5E then -- ^
		n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
		if start ~= nil then
			iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, "$", start, plain))
		end
	else
		repeat
			n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain))
		until not start
	end
	return concat(text)
end
end
end
replacement_escape = export.replacement_escape


do
do
local character_classes
-- Builds the set of bytes which name a character class after "%" in a
-- pattern, stores it in `character_classes`, and then removes itself by
-- setting `get_character_classes` to nil, so it can only be called once.
-- Returns the set, which is keyed by byte value.
local function get_character_classes()
	local classes = {}
	-- Uppercase bytes for A C D L P S U W X; each is also a class in lowercase.
	local upper_bytes = {0x41, 0x43, 0x44, 0x4C, 0x50, 0x53, 0x55, 0x57, 0x58}
	for i = 1, #upper_bytes do
		local b = upper_bytes[i]
		classes[b] = true
		classes[b + 0x20] = true
	end
	-- Z is included, but z is dealt with separately.
	classes[0x5A] = true
	character_classes = classes
	get_character_classes = nil
	return classes
end
local function check_sets_equal(set1, set2)
local function check_sets_equal(set1, set2)
local k2
local k2
Line 129: Line 265:
local function parse_1_byte_charset(pattern, pos)
local function parse_1_byte_charset(pattern, pos)
local ch
while true do
while true do
local ch, nxt_pos
pos, ch = match(pattern, "()([%%%]\192-\255])", pos)
pos, ch, nxt_pos = match(pattern, "()([%%%]\194-\244][\128-\191]*)()", pos)
if ch == "%" then
if not ch then
local nxt = byte(pattern, pos + 1)
return false
if not nxt or nxt >= 128 or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWXZ, but not z
elseif ch == "%" then
if match(pattern, "^[acdlpsuwxACDLPSUWXZ\128-\255]", nxt_pos) then
return false
return false
end
end
pos = pos + 2
pos = pos + 2
elseif ch == "]" then
elseif ch == "]" then
pos = nxt_pos
pos = pos + 1
return pos
return pos
else
else
return false
return false
end
end
end
end
end
end
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==]
pattern_simplifier = require("Module:fun").memoize(function(pattern)
function pattern_simplifier(pattern)
if type(pattern) == "number" then
if type(pattern) == "number" then
return tostring(pattern)
return tostring(pattern)
end
end
local pos, captures, start, n, output = 1, 0, 1, 0
local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0
while true do
while true do
local ch, nxt_pos
-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails.
pos, ch, nxt_pos = match(pattern, "()([%%(.[\194-\244][\128-\191]*)()", pos)
pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos)
if not ch then
if not ch then
break
break
end
end
local nxt = sub(pattern, nxt_pos, nxt_pos)
local nxt = byte(pattern, nxt_pos)
if ch == "%" then
if ch == "%" then
if nxt == "b" then
if nxt == 0x62 then -- b
if not match(pattern, "^()[^\128-\255][^\128-\255]", pos + 2) then
local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3)
if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then
return false
return false
end
end
pos = pos + 4
pos = pos + 4
elseif nxt == "f" then
elseif nxt == 0x66 then -- f
pos = pos + 2
nxt_pos = nxt_pos + 2
if not match(pattern, "^()%[[^^]", pos) then
local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos)
-- Only possible to convert a positive %f charset which is
-- all ASCII, so use parse_1_byte_charset.
if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^
return false
return false
elseif nxt3 == 0x5D then -- Initial ] is non-magic.
nxt_pos = nxt_pos + 1
end
end
-- Only possible to convert a %f charset which is all
pos = parse_1_byte_charset(pattern, nxt_pos)
-- ASCII, so use parse_1_byte_charset.
pos = parse_1_byte_charset(pattern, pos)
if not pos then
if not pos then
return false
return false
end
end
elseif nxt == "Z" then
elseif nxt == 0x5A then -- Z
pos = pos + 2
nxt = byte(pattern, nxt_pos + 1)
nxt = sub(pattern, pos, pos)
if nxt == 0x2A or nxt == 0x2D then -- *-
if nxt == "*" or nxt == "+" or nxt == "-" then
pos = pos + 3
pos = pos + 1
else
else
output = output or {}
if output == nil then
output = {}
end
local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]"
n = n + 1
n = n + 1
if nxt == "?" then
if nxt == 0x2B then -- +
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244]?[\128-\191]*"
output[n] = ins .. "%Z*"
pos = pos + 1
pos = pos + 3
elseif nxt == 0x3F then -- ?
output[n] = ins .. "?[\128-\191]*"
pos = pos + 3
else
else
output[n] = sub(pattern, start, pos - 3) .. "[\1-\127\194-\244][\128-\191]*"
output[n] = ins .. "[\128-\191]*"
pos = pos + 2
end
end
start = pos
start = pos
end
end
elseif find("acdlpsuwxACDLPSUWX", nxt, 1, true) then
elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz
return false
return false
-- Skip the next character if it's ASCII. Otherwise, we will
-- Skip the next character if it's ASCII. Otherwise, we will
-- still need to do length checks.
-- still need to do length checks.
else
else
pos = pos + (byte(nxt) < 128 and 2 or 1)
pos = pos + (nxt < 128 and 2 or 1)
end
end
elseif ch == "(" then
elseif ch == "(" then
if nxt == ")" or captures == 32 then
if nxt == 0x29 or capture_groups == 32 then -- )
return false
return false
end
end
captures = captures + 1
capture_groups = capture_groups + 1
pos = pos + 1
pos = pos + 1
elseif ch == "." then
elseif ch == "." then
if nxt == "*" or nxt == "+" or nxt == "-" then
if nxt == 0x2A or nxt == 0x2D then -- *-
pos = pos + 2
pos = pos + 2
else
else
output = output or {}
if output == nil then
output = {}
end
local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]"
n = n + 1
n = n + 1
if nxt == "?" then
if nxt == 0x2B then -- +
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191]?[\128-\191]*"
output[n] = ins .. ".*"
pos = pos + 2
elseif nxt == 0x3F then -- ?
output[n] = ins .. "?[\128-\191]*"
pos = pos + 2
pos = pos + 2
else
else
output[n] = sub(pattern, start, pos - 1) .. "[^\128-\191][\128-\191]*"
output[n] = ins .. "[\128-\191]*"
pos = pos + 1
pos = pos + 1
end
end
Line 224: Line 375:
elseif ch == "[" then
elseif ch == "[" then
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
-- Fail negative charsets. TODO: 1-byte charsets should be safe.
if nxt == "^" then
if nxt == 0x5E then -- ^
return false
return false
-- If the first character is "%", ch_len is determined by the
-- If the first character is "%", ch_len is determined by the
-- next one instead.
-- next one instead.
elseif nxt == "%" then
elseif nxt == 0x25 then -- %
nxt = byte(pattern, nxt_pos + 1)
elseif nxt == 0x5D then -- Initial ] is non-magic.
nxt_pos = nxt_pos + 1
nxt_pos = nxt_pos + 1
nxt = sub(pattern, nxt_pos, nxt_pos)
end
end
local ch_len = #match(pattern, "^.[\128-\191]*", nxt_pos)
if not nxt then
return false
end
local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4
if ch_len == 1 then -- Single-byte charset.
if ch_len == 1 then -- Single-byte charset.
pos = parse_1_byte_charset(pattern, pos + 1)
pos = parse_1_byte_charset(pattern, nxt_pos)
if not pos then
if not pos then
return false
return false
end
end
else -- Multibyte charset.
else -- Multibyte charset.
-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST.
local charset_pos, bytes = pos
local charset_pos, bytes = pos
pos = pos + 1
pos = pos + 1
while true do -- TODO: non-ASCII charset ranges.
while true do -- TODO: non-ASCII charset ranges.
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos)
if not ch then
return false
-- If escaped, get the next character. No need to
-- If escaped, get the next character. No need to
-- distinguish magic characters or character classes,
-- distinguish magic characters or character classes,
-- as they'll all fail for having the wrong length
-- as they'll all fail for having the wrong length
-- anyway.
-- anyway.
elseif ch == "%" then
if ch == "%" then
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]*)()", pos)
pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos)
elseif ch == "]" then
elseif ch == "]" then
pos = nxt_pos
pos = nxt_pos
break
break
end
end
if ch_len ~= #ch then
if not (ch and nxt_pos - pos == ch_len) then
return false
return false
elseif bytes == nil then
bytes = {}
end
end
bytes = bytes or {}
local bytes, last = bytes, nxt_pos - 1
local bytes = bytes
for i = pos, last - 1 do
for i = 1, ch_len - 1 do
local b = byte(pattern, i)
local b = byte(ch, i, i)
local bytes_b = bytes[b]
bytes[b] = bytes[b] or {}
if bytes_b == nil then
bytes = bytes[b]
bytes_b = {}
bytes[b] = bytes_b
end
bytes[b], bytes = bytes_b, bytes_b
end
end
bytes[byte(ch, -1)] = true
bytes[byte(pattern, last)] = true
pos = nxt_pos
pos = nxt_pos
end
end
Line 271: Line 430:
return false
return false
end
end
local nxt = sub(pattern, pos, pos)
nxt = byte(pattern, pos)
if (
if (
(nxt == "?" or nxt == "*" or nxt == "-") or
(nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-?
(nxt == "+" and ch_len > 2) or
(nxt == 0x2B and ch_len > 2) or -- +
not check_sets(bytes)
not check_sets(bytes)
) then
) then
Line 292: Line 451:
bytes = next_byte
bytes = next_byte
until next_byte == true
until next_byte == true
if nxt == "+" then
if nxt == 0x2B then -- +
local range1, range2 = ranges[1], ranges[2]
local range1, range2 = ranges[1], ranges[2]
ranges[1] = make_charset(range1)
ranges[1], ranges[3] = make_charset(range1), make_charset(range2)
ranges[3] = make_charset(range2)
local n = #range2
local n = #range2
for i = 1, #range1 do
for i = 1, #range1 do
Line 308: Line 466:
end
end
end
end
output = output or {}
if output == nil then
output = {}
end
nxt = byte(pattern, pos)
n = n + 1
n = n + 1
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges)
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) ..
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
start = pos
start = pos
end
end
elseif nxt == "+" then
elseif not nxt then
if #ch ~= 2 then
break
elseif nxt == 0x2B then -- +
if nxt_pos - pos ~= 2 then
return false
return false
elseif output == nil then
output = {}
end
end
output = output or {}
pos, nxt_pos = pos + 1, nxt_pos + 1
nxt = byte(pattern, nxt_pos)
local ch2 = sub(pattern, pos, pos)
n = n + 1
n = n + 1
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. sub(ch, 2, 2)
output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 ..
pos = nxt_pos + 1
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped
start = pos
pos, start = nxt_pos, nxt_pos
elseif nxt == "?" or nxt == "*" or nxt == "-" then
elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-?
return false
return false
else
else
Line 332: Line 500:
end
end
return concat(output) .. sub(pattern, start)
return concat(output) .. sub(pattern, start)
end, true)
end
export.pattern_simplifier = pattern_simplifier -- For testing.
pattern_simplifier = memoize(pattern_simplifier, true)
export.pattern_simplifier = pattern_simplifier
end
 
--[==[Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring library pattern (e.g. {{lua|"abcd-g"}} becomes {{lua|"[abcd-g]"}}, and {{lua|"[]"}} becomes {{lua|"[[%]]"}}).

The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used (e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary characters.

If `charset` is a number, it is returned as a string without surrounding brackets.]==]
function get_charset(charset)
	if type(charset) == "number" then
		return tostring(charset)
	end
	-- `pos` is the scan position, `start` is the beginning of the input not
	-- yet copied to `output`, and `n` is the length of `output`, which is only
	-- created once something needs rewriting.
	local pos, start, n, output = 1, 1, 0
	-- Step over a leading "^" so that it is copied through unchanged.
	if byte(charset) == 0x5E then -- ^
		pos = pos + 1
	end
	-- FIXME: "]" is non-magic if it's the first character in a charset.
	local nxt_pos, nxt
	while true do
		-- Find the next "%", "-" or "]" at or after `pos`.
		local new_pos, ch = match(charset, "()([%%%-%]])", pos)
		if not ch then
			break
		-- Skip percent escapes. Ranges can't start with them, either.
		elseif ch == "%" then
			pos = new_pos + 2
		else
			-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`.
			if ch == "-" and new_pos > pos then
				pos, nxt_pos, nxt = new_pos - 1, new_pos, ch
				ch = sub(charset, pos, pos)
			else
				pos, nxt_pos = new_pos, new_pos + 1
				nxt = sub(charset, nxt_pos, nxt_pos)
			end
			-- Range.
			if nxt == "-" then
				if output == nil then
					output = {}
				end
				-- Copy everything before the range unchanged.
				n = n + 1
				output[n] = sub(charset, start, pos - 1)
				nxt_pos = nxt_pos + 1
				nxt = sub(charset, nxt_pos, nxt_pos)
				-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour.
				if nxt == "" or nxt == "%" then
					n = n + 1
					output[n] = (ch == "]" and "%]" or ch) .. "%-"
					start = nxt_pos
					nxt_pos = nxt_pos + 2
				-- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is omitted if the range would be empty (i.e. if the first byte is greater than the second).
				else
					local first
					if ch ~= "]" then
						first = ch
					elseif byte(nxt) >= 0x5D then
						first = "%]^"
					elseif pos == 1 then
						-- The range is empty and is the first item in the
						-- set: a bare "^" here would negate the whole set,
						-- so use "_" (which also sorts above every possible
						-- `nxt` in this branch) to keep the range empty.
						first = "_"
					else
						first = "^"
					end
					n = n + 1
					output[n] = first .. "-" ..
						(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt)
					nxt_pos = nxt_pos + 1
					start = nxt_pos
				end
			-- A lone hyphen or closing bracket is escaped.
			elseif ch == "-" or ch == "]" then
				if output == nil then
					output = {}
				end
				n = n + 1
				output[n] = sub(charset, start, pos - 1) .. "%" .. ch
				start = nxt_pos
			end
			pos = nxt_pos
		end
	end
	-- `start` only moves once something has been rewritten, so if it hasn't
	-- then the input can be wrapped as-is.
	if start == 1 then
		return "[" .. charset .. "]"
	end
	return "[" .. concat(output) .. sub(charset, start) .. "]"
end
end
get_charset = memoize(get_charset, true)
export.get_charset = get_charset


function export.len(str)
function export.len(str)
Line 430: Line 670:
--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==]
function export.reverse(str)
function export.reverse(str)
return reverse(gsub(str, "[\194-\244][\128-\191]*", reverse))
return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse)))
end
end


Line 441: Line 681:
cp = tonumber(cp)
cp = tonumber(cp)
if cp < 0 then
if cp < 0 then
err("-0x" .. format("%X", -cp + 1))
err(format("-0x%X", -cp))
elseif cp < 0x80 then
elseif cp < 0x80 then
return char(cp)
return char(cp)
Line 466: Line 706:
)
)
end
end
err("0x" .. format("%X", cp))
err(format("0x%X", cp))
end
end


Line 567: Line 807:


do
do
local function add_captures(text, n, ...)
local function add_captures(t, n, ...)
if ... == nil then
return
end
-- Insert any captures from the splitting pattern.
-- Insert any captures from the splitting pattern.
local offset, capture = n - 1, ...
local offset, capture = n - 1, ...
while capture do
while capture do
n = n + 1
n = n + 1
text[n] = capture
t[n] = capture
capture = select(n - offset, ...)
capture = select(n - offset, ...)
end
end
Line 578: Line 821:
end
end
local function iterate(str, str_len, text, n, start, _sub, loc1, loc2, ...)
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.
if not (loc1 and start <= str_len) then
-- If no match, or there is but we're past the end of the string
-- (which happens when the match is the empty string), then add
-- the final chunk and return.
n = n + 1
text[n] = _sub(str, start)
return
elseif loc2 < loc1 then
-- Special case: If we match the empty string, then include the
-- next character; this avoids an infinite loop, and makes
-- splitting by an empty string work the way mw.text.split() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, return immediately, so we
-- don't get a final empty string. If using the string library, we
-- need to make sure we advance by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
n = n + 1
text[n] = _sub(str, start, loc1)
start = loc1 + 1
if start > str_len then
return ... and add_captures(text, n, ...) or n
end
else
-- Add chunk up to the current match.
n = n + 1
text[n] = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
return (... and add_captures(text, n, ...) or n), start
end
local function _split(str, pattern, str_len, _sub, _find, plain)
In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil if there are no further matches. By default, the start index will be calculated using the ustring library, unless `str_lib` or `plain` is set.]==]
local text, n, start = {}, 0, 1
function export.split(str, pattern_or_func, str_lib, plain)
local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0
repeat
repeat
n, start = iterate(str, str_len, text, n, start, _sub, _find(str, pattern, start, plain))
n = add_captures(t, n, iter())
until not start
until n == nil
return t
return text
end
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.]==]
function export.split(str, pattern, str_lib, plain)
if str_lib or plain then
return _split(str, pattern, #str, sub, find, plain)
end
local simple = pattern_simplifier(pattern)
if simple then
return _split(str, simple, #str, sub, find)
end
return _split(str, pattern, ulen(str), usub, ufind)
end
end
export.capturing_split = export.split -- To be removed.
export.capturing_split = export.split -- To be removed.
end
end


do
--[==[Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the string up to the splitting pattern, with any capture groups being returned as additional values on that iteration.]==]
-- TODO: merge this with export.split. Not clear how to do this while
function export.gsplit(str, pattern_or_func, str_lib, plain)
-- maintaining the same level of performance, as gsplit is slower.
local start, final, str_len, _string, callable = 1
local function _split(str, pattern, str_len, _sub, _find, plain)
pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
local start, final = 1
local _find, _sub = _string.find, _string.sub
local function iter(loc1, loc2, ...)
local function iter(loc1, loc2, ...)
-- If no match, return the final chunk.
-- If no match, or there is but we're past the end of the string
if not loc1 then
-- (which happens when the match is the empty string), then return
-- the final chunk.
if not loc1 then
final = true
return _sub(str, start)
end
-- Special case: If we match the empty string, then eat the
-- next character; this avoids an infinite loop, and makes
-- splitting by the empty string work the way mw.text.gsplit() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, set `final` to true, so we
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
local b = byte(str, loc1)
if b and b >= 128 then
loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
end
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
final = true
return _sub(str, start)
end
-- Special case: If we match the empty string, then eat the
-- next character; this avoids an infinite loop, and makes
-- splitting by the empty string work the way mw.text.gsplit() does
-- (including non-adjacent empty string matches with %f). If we
-- reach the end of the string this way, set `final` to true, so we
-- don't get stuck matching the empty string at the end.
local chunk
if loc2 < loc1 then
-- If using the string library, we need to make sure we advance
-- by one UTF-8 character.
if _sub == sub then
loc1 = loc1 + #match(str, "^[\128-\191]*", loc1 + 1)
end
chunk = _sub(str, start, loc1)
if loc1 >= str_len then
final = true
else
start = loc1 + 1
end
-- Eat chunk up to the current match.
else
else
chunk = _sub(str, start, loc1 - 1)
start = loc1 + 1
start = loc2 + 1
end
end
return chunk, ...
-- Eat chunk up to the current match.
else
chunk = _sub(str, start, loc1 - 1)
start = loc2 + 1
end
end
return chunk, ...
end
if callable then
return function()
return function()
if not final then
if not final then
return iter(_find(str, pattern, start, plain))
return iter(pattern_or_func(str, start))
end
end
-- Special case if the pattern is anchored to the start: "^" always
-- anchors to the start position, not the start of the string, so get
-- around this by only attempting one match with the pattern, then match
-- the end of the string.
elseif byte(pattern_or_func) == 0x5E then -- ^
local returned
return function()
if not returned then
returned = true
return iter(_find(str, pattern_or_func, start, plain))
elseif not final then
return iter(_find(str, "$", start, plain))
end
end
return nil
end
end
end
end
return function()
function export.gsplit(str, pattern, str_lib, plain)
if not final then
if str_lib or plain then
return iter(_find(str, pattern_or_func, start, plain))
return _split(str, pattern, #str, sub, find, plain)
end
local simple = pattern_simplifier(pattern)
if simple then
return _split(str, simple, #str, sub, find)
end
end
return _split(str, pattern, ulen(str), usub, ufind)
end
end
end
end
gsplit = export.gsplit


function export.trim(str, charset)
function export.trim(str, charset, str_lib, plain)
if not charset then
if charset == nil then
return match(str, "^()%s*$") and "" or match(str, "^%s*(.*%S)")
-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to "" first.
elseif match(charset, "^()[^\128-\255]*$") then
return match(gsub(str, "^%s*", ""), "^.*%S") or ""
return match(str, "^()[" .. charset .. "]*$") and "" or match(str, "^[" .. charset .. "]*(.*[^" .. charset .. "])")
elseif charset == "" then
return str
end
end
return umatch(str, "^[" .. charset .. "]*(.-)[" .. charset .. "]*$")
charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there would be two callbacks into PHP, which is slower.
local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
if not str_lib then
local simple = pattern_simplifier(pattern)
if not simple then
return umatch(str, pattern)
end
pattern = simple
end
return match(str, pattern)
end
end


do
do
local entities
local entities
 
local function get_entities()
local function decode_numeric_entity(code, pattern, base)
entities, get_entities = load_data("Module:data/entities"), nil
local cp = match(code, pattern) and tonumber(code, base)
return entities
return cp and cp < 0x110000 and u(cp) or nil
end
end


local function decode_entity(hash, x, code)
local function decode_entity(hash, x, code)
if hash == "#" then
if hash == "" then
return x == "" and decode_numeric_entity(code, "^%d+$") or
return (entities or get_entities())[x .. code]
decode_numeric_entity(code, "^%x+$", 16)
end
local cp
if x == "" then
cp = match(code, "^()%d+$") and tonumber(code)
else
cp = match(code, "^()%x+$") and tonumber(code, 16)
end
end
entities = entities or load_data("Module:data/entities")
return cp and cp < 0x110000 and u(cp) or nil
return entities[x .. code]
end
end


-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]].
function export.decode_entities(str)
function export.decode_entities(str)
return find(str, "&", 1, true) and
local amp = find(str, "&", nil, true)
gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
return amp and find(str, ";", amp, true) and gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
end
end
end
end


do
do
local html_entities
local entities
local function get_entities()
-- Memoized HTML entities (taken from mw.text.lua).
entities, get_entities = {
["\""] = "&quot;",
["&"] = "&amp;",
["'"] = "&#039;",
["<"] = "&lt;",
[">"] = "&gt;",
["\194\160"] = "&nbsp;",
}, nil
return entities
end
local function encode_entity(ch)
local function encode_entity(ch)
local entity = html_entities[ch]
local entity = (entities or get_entities())[ch]
if entity then
if entity == nil then
return entity
entity = "&#" .. codepoint(ch) .. ";"
entities[ch] = entity
end
end
entity = "&#" .. codepoint(ch) .. ";"
html_entities[ch] = entity
return entity
return entity
end
end
function export.encode_entities(str, charset, str_lib, plain)
function export.encode_entities(str, charset, str_lib, plain)
-- Memoized HTML entities (taken from mw.text.lua).
if charset == nil then
html_entities = html_entities or {
return (gsub(str, "[\"&'<>\194]\160?", entities or get_entities()))
["\""] = "&quot;",
elseif charset == "" then
["&"] = "&amp;",
return str
["'"] = "&#039;",
end
["<"] = "&lt;",
local pattern = plain and ("[" .. charset_escape(charset) .. "]") or charset == "." and charset or get_charset(charset)
[">"] = "&gt;",
if not str_lib then
["\194\160"] = "&nbsp;",
local simple = pattern_simplifier(pattern)
}
if not simple then
if not charset then
return (ugsub(str, pattern, encode_entity))
return (gsub(str, "[\"&'<>\194]\160?", html_entities))
elseif plain then
return (gsub(str, "[" .. charset_escape(charset) .. "]", encode_entity))
elseif str_lib then
if not match(charset, "^()[^\128-\255]*$") then
error("Cannot use the string library with a character set that contains a character with a codepoint above U+007F.")
end
end
return (gsub(str, "[" .. charset .. "]", encode_entity))
pattern = simple
end
end
local pattern = charset and "[" .. charset .. "]"
return (gsub(str, pattern, encode_entity))
local simple = pattern_simplifier(pattern)
if simple then
return (gsub(str, simple, encode_entity))
end
return (ugsub(str, pattern, encode_entity))
end
end
end
end
Line 787: Line 1,013:
enctype = enctype and upper(enctype) or "QUERY"
enctype = enctype and upper(enctype) or "QUERY"
if enctype == "PATH" then
if enctype == "PATH" then
return find(str, "%", 1, true) and
return find(str, "%", nil, true) and gsub(str, "%%(%x%x)", decode_path) or str
gsub(str, "%%(%x%x)", decode_path) or str
elseif enctype == "QUERY" then
elseif enctype == "QUERY" then
return (find(str, "%", 1, true) or find(str, "+", 1, true)) and
return (find(str, "%", nil, true) or find(str, "+", nil, true)) and gsub(str, "([%%%+])(%x?%x?)", decode) or str
gsub(str, "([%%%+])(%x?%x?)", decode) or str
elseif enctype == "WIKI" then
elseif enctype == "WIKI" then
return (find(str, "%", 1, true) or find(str, "_", 1, true)) and
return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str
gsub(str, "([%%_])(%x?%x?)", decode) or str
end
end
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
error("bad argument #2 to \"decode_uri\" (expected QUERY, PATH, or WIKI)", 2)
Line 802: Line 1,025:
do
do
local function _remove_comments(str, pre)
local function _remove_comments(str, pre)
local head = find(str, "<!--", 1, true)
local head = find(str, "<!--", nil, true)
if not head then
if not head then
return str
return str
Line 843: Line 1,066:
end
end


--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{code|lua|"\0"}}, {{code|lua|"\t"}}, {{code|lua|"\n"}}, {{code|lua|"\v"}}, {{code|lua|"\r"}} and {{code|lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
--[==[Lua equivalent of PHP's {{code|php|trim($string)}}, which trims {{lua|"\0"}}, {{lua|"\t"}}, {{lua|"\n"}}, {{lua|"\v"}}, {{lua|"\r"}} and {{lua|" "}}. This is useful when dealing with template parameters, since the native parser trims them like this.]==]
function export.php_trim(str)
function export.php_trim(str)
-- A frontier pattern with a greedy quantifier is faster than the algorithms used by export.trim, but can only be used if the character set includes \0, since %z matches the start/end of the string, as well as \0. This is also immune to catastrophic backtracking.
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
return match(str, "%f[^%z\t\n\v\r ].*%f[%z\t\n\v\r ]") or ""
end
end
php_trim = export.php_trim
php_trim = export.php_trim


--[==[Takes a parameter name as an input, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{code|lua|frame.args}} table). For example, {{code|lua|"1"}} is normalized to {{code|lua|1}} (a number), and {{code|lua|" foo "}} is normalized to {{code|lua|"foo"}}. If the input is not a string, it is returned unchanged.
--[==[Takes a parameter name as either a string or number, and returns the Scribunto-normalized form (i.e. the key that that parameter would have in a {{lua|frame.args}} table). For example, {{lua|"1"}} (a string) is normalized to {{lua|1}} (a number), {{lua|" foo "}} is normalized to {{lua|"foo"}}, and {{lua|1.5}} (a number) is normalized to {{lua|"1.5"}} (a string). Inputs which cannot be normalized (e.g. booleans) return {{lua|nil}}. If the `no_trim` flag is set, string parameters are not trimmed, but strings may still be converted to numbers if they do not contain whitespace; this is necessary when normalizing keys into the form received by PHP during callbacks, before any trimming occurs (e.g. in the table of arguments when calling {{lua|frame:expandTemplate()}}).


After being trimmed with {{code|lua|export.php_trim}}, strings are converted to numbers if:
Strings are trimmed with {{lua|export.php_trim}}, unless the `no_trim` flag is set. They are then converted to numbers if '''all''' of the following are true:
# They are integers, with no decimals (2.0) or leading zeroes (02).
# They are integers; i.e. no decimals or leading zeroes (e.g. {{lua|"2"}}, but not {{lua|"2.0"}} or {{lua|"02"}}).
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# They are ≤ 2{{sup|53}} and ≥ -2{{sup|53}}.
# For positive values, they do not have a leading {{code|lua|+}} sign.]==]
# There is no leading sign unless < 0 (e.g. {{lua|"2"}} or {{lua|"-2"}}, but not {{lua|"+2"}} or {{lua|"-0"}}).
function export.scribunto_param_key(key)
# They contain no leading or trailing whitespace (which may be present when the `no_trim` flag is set).
if type(key) ~= "string" then
 
return key
Numbers are converted to strings if '''either''':
# They are not integers (e.g. {{lua|1.5}}).
# They are > 2{{sup|53}} or < -2{{sup|53}}.
 
When converted to strings, integers ≤ 2{{sup|63}} and ≥ -2{{sup|63}} are formatted as integers (i.e. all digits are given), which is the range of PHP's integer precision, though the actual output may be imprecise since Lua can only represent integers exactly from -2{{sup|53}} to 2{{sup|53}}. All other numbers use the standard formatting output by {{lua|tostring()}}.]==]
function export.scribunto_param_key(key, no_trim)
local tp = type(key)
if tp == "string" then
if not no_trim then
key = php_trim(key)
end
if match(key, "^()-?[1-9]%d*$") then
local num = tonumber(key)
-- Lua integers are only precise to 2^53 - 1, so specifically check for 2^53 and -2^53 as strings, since a numerical comparison won't work as it can't distinguish 2^53 from 2^53 + 1.
return (
num <= 9007199254740991 and num >= -9007199254740991 or
key == "9007199254740992" or
key == "-9007199254740992"
) and num or key
end
return key == "0" and 0 or key
elseif tp == "number" then
-- No special handling needed for inf or NaN.
return key % 1 == 0 and (
key <= 9007199254740992 and key >= -9007199254740992 and key or
key <= 9223372036854775808 and key >= -9223372036854775808 and format("%d", key)
) or tostring(key)
end
end
key = php_trim(key)
return nil
if match(key, "^-?[1-9]%d*$") then
local num = tonumber(key)
-- Lua integers are only accurate to 2^53 - 1, so we have to specifically check for 2^53 and -2^53, since 2^53 == 2^53 + 1 evaluates to true.
return (
num <= 9007199254740991 and num >= -9007199254740991 or
key == "9007199254740992" or
key == "-9007199254740992"
) and num or key
elseif key == "0" then
return 0
end
return key
end
end


do
do
local byte_escapes
local byte_escapes
local function get_byte_escapes()
byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil
return byte_escapes
end
local function escape_byte(b)
local function escape_byte(b)
return byte_escapes[b] or format("\\%03d", byte(b))
return (byte_escapes or get_byte_escapes())[b] or format("\\%03d", byte(b))
end
end
function export.escape_bytes(str)
function export.escape_bytes(str)
byte_escapes = byte_escapes or load_data("Module:string utilities/data").byte_escapes
return (gsub(str, ".", escape_byte))
return (gsub(str, ".", escape_byte))
end
end
Line 892: Line 1,134:
return name == "op" and "{" or
return name == "op" and "{" or
name == "cl" and "}" or
name == "cl" and "}" or
error(module_name .. ".format: unrecognized escape sequence '{\\" .. name .. "}'")
error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'")
elseif fun(name) and type(fun(name)) ~= "string" then
elseif fun(name) and type(fun(name)) ~= "string" then
error(module_name .. ".format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string")
end
end
return fun(name) or error(module_name .. ".format: \"" .. name .. "\" not found in table")
return fun(name) or error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table")
end))
end))
end
end
format_fun = export.format_fun
format_fun = export.format_fun


--[==[This function, unlike {{code|lua|string.format}} and {{code|lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{code|lua|{param_name}}} in the format string with the table's entry for {{code|lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
--[==[This function, unlike {{lua|string.format}} and {{lua|mw.ustring.format}}, takes just two parameters—a format string and a table—and replaces all instances of {{lua|{param_name}}} in the format string with the table's entry for {{lua|param_name}}. The opening and closing brace characters can be escaped with <code>{\op}</code> and <code>{\cl}</code>, respectively. A table entry beginning with a slash can be escaped by doubling the initial slash.
====Examples====
====Examples====
* {{code|lua|2=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
* {{lua|=string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"})}}
*: produces: {{code|lua|"one fish, two fish, red fish, blue fish"}}
*: produces: {{lua|"one fish, two fish, red fish, blue fish"}}
* {{code|lua|2=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
* {{lua|=string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}}
*: produces: {{code|lua|"The set {1, 2, 3} contains three elements."}}
*: produces: {{lua|"The set {1, 2, 3} contains three elements."}}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==]
function export.format(str, tbl)
function export.format(str, tbl)
Line 957: Line 1,199:
end
end


do
function export.pluralize(...) -- To be removed once all calling modules have been changed to call Module:en-utilities directly.
local function word_ends_in_consonant_plus_y(str)
export.pluralize = require("Module:en-utilities").pluralize
-- FIXME, a subrule of rule #1 above says the -ies ending doesn't
return export.pluralize(...)
-- apply to proper nouns, hence "the Gettys", "the public Ivys".
-- We should maybe consider applying this rule here; but it may not
-- be important as this function is almost always called on common nouns
-- (e.g. parts of speech, place types).
return find(str, "[^aeiouyAEIOUY ]y$")
end
local function word_takes_es_plural(str)
return find(str, "[sxz]$") or find(str, "[csz]h$")
end
local function do_pluralize(str)
if word_ends_in_consonant_plus_y(str) then
-- avoid returning multiple values
return (gsub(str, "y$", "ies"))
elseif word_takes_es_plural(str) then
return str .. "es"
end
return str .. "s"
end
--[==[
Pluralize a word in a smart fashion, according to normal English rules.
# If word ends in consonant + -y, replace the -y with -ies.
# If the word ends in -s, -x, -z, -ch, -sh, -zh, add -es.
# Otherwise, add -s.
 
This handles links correctly:
# If a piped link, change the second part appropriately.
# If a non-piped link and rule #1 above applies, convert to a piped link with the second part containing the plural.
# If a non-piped link and rules #2 or #3 above apply, add the plural outside the link.
]==]
function export.pluralize(str)
if type(str) == "table" then
-- allow calling from a template
str = str.args[1]
end
-- Check for a link. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
local beginning, link, linktext = match(str, "^(.*)%[%[([^|%]]+)%|?(.-)%]%]$")
if not link then
return do_pluralize(str)
elseif linktext ~= "" then
return beginning .. "[[" .. link .. "|" .. do_pluralize(linktext) .. "]]"
elseif word_ends_in_consonant_plus_y(link) then
return beginning .. "[[" .. link .. "|" .. gsub(link, "y$", "ies") .. "]]"
end
return beginning .. "[[" .. link .. "]]" .. (word_takes_es_plural(link) and "es" or "s")
end
end
end


Line 1,072: Line 1,265:
function export.get_indefinite_article(str, ucfirst)
function export.get_indefinite_article(str, ucfirst)
str = str or ""
str = str or ""
local is_vowel = false
-- If there's a link at the beginning, examine the first letter of the
-- If there's a link at the beginning, examine the first letter of the
-- link text. This pattern matches both piped and unpiped links.
-- link text. This pattern matches both piped and unpiped links.
-- If the link is not piped, the second capture (linktext) will be empty.
-- If the link is not piped, the second capture (linktext) will be empty.
local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
local link, linktext = match(str, "^%[%[([^|%]]+)%|?(.-)%]%]")
if link then
if match(link and (linktext ~= "" and linktext or link) or str, "^()[AEIOUaeiou]") then
is_vowel = find(linktext ~= "" and linktext or link, "^[AEIOUaeiou]")
return ucfirst and "An" or "an"
else
is_vowel = find(str, "^[AEIOUaeiou]")
end
end
return is_vowel and (ucfirst and "An" or "an") or (ucfirst and "A" or "a")
return ucfirst and "A" or "a"
end
end
get_indefinite_article = export.get_indefinite_article
get_indefinite_article = export.get_indefinite_article