Module:string utilities: Difference between revisions
No edit summary |
m 1 revision imported |
||
| (3 intermediate revisions by 2 users not shown) | |||
| Line 1: | Line 1: | ||
local export = {} | |||
local function_module = "Module:fun" | |||
local load_module = "Module:load" | |||
local memoize_module = "Module:memoize" | |||
local string_char_module = "Module:string/char" | |||
local string_charset_escape_module = "Module:string/charsetEscape" | |||
local mw = mw | local mw = mw | ||
local string = string | local string = string | ||
| Line 11: | Line 19: | ||
local gmatch = string.gmatch | local gmatch = string.gmatch | ||
local gsub = string.gsub | local gsub = string.gsub | ||
local insert = table.insert | |||
local len = string.len | local len = string.len | ||
local lower = string.lower | local lower = string.lower | ||
local match = string.match | local match = string.match | ||
local next = next | local next = next | ||
local require = require | |||
local reverse = string.reverse | local reverse = string.reverse | ||
local select = select | local select = select | ||
| Line 30: | Line 39: | ||
local ulower = ustring.lower | local ulower = ustring.lower | ||
local umatch = ustring.match | local umatch = ustring.match | ||
local unpack = unpack | local unpack = unpack or table.unpack -- Lua 5.2 compatibility | ||
local upper = string.upper | local upper = string.upper | ||
local usub = ustring.sub | local usub = ustring.sub | ||
local uupper = ustring.upper | local uupper = ustring.upper | ||
local memoize = require(memoize_module) | |||
-- Defined below. | -- Defined below. | ||
local codepoint | local codepoint | ||
local explode_utf8 | local explode_utf8 | ||
local format_fun | local format_fun | ||
local | local get_charset | ||
local gsplit | |||
local pattern_escape | local pattern_escape | ||
local pattern_simplifier | local pattern_simplifier | ||
local replacement_escape | local replacement_escape | ||
local | local title_case | ||
local trim | |||
local ucfirst | |||
local ulen | local ulen | ||
local | --[==[ | ||
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures | |||
modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no | |||
overhead after the first call, since the target functions are called directly in any subsequent calls. | |||
]==] | |||
local function charset_escape(...) | |||
charset_escape = require(string_charset_escape_module) | |||
return charset_escape(...) | |||
end | |||
local function is_callable(...) | |||
is_callable = require(function_module).is_callable | |||
return is_callable(...) | |||
end | |||
local function load_data(...) | |||
load_data = require(load_module).load_data | |||
return load_data(...) | |||
end | |||
local function u(...) | |||
u = require(string_char_module) | |||
return u(...) | |||
end | |||
local function prepare_iter(str, pattern, str_lib, plain) | |||
local callable = is_callable(pattern) | |||
if str_lib or plain then | |||
return pattern, #str, string, callable | |||
elseif not callable then | |||
local simple = pattern_simplifier(pattern) | |||
if simple then | |||
return simple, #str, string, false | |||
end | |||
end | |||
return pattern, ulen(str), ustring, callable | |||
end | |||
--[==[ | |||
Returns {nil} if the input value is the empty string, or otherwise the same value. | |||
If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is | |||
the empty string, returns {nil}. | |||
If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input | |||
string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation | |||
marks stripped, as this option exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also | |||
be embedded this way (e.g. {"''foo''"} returns {"'foo'"}). | |||
]==] | |||
function export.is_not_empty(str, do_trim, quote_delimiters) | |||
if str == "" then | |||
return nil | |||
elseif not (str and type(str) == "string") then | |||
return str | |||
elseif do_trim then | |||
str = trim(str) | |||
if str == "" then | |||
return nil | |||
end | |||
end | |||
return quote_delimiters and gsub(str, "^(['\"])(.*)%1$", "%2") or str | |||
end | |||
--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function | --[==[ | ||
Explodes a string into an array of UTF-8 characters. '''Warning''': this function assumes that the input is valid UTF-8 | |||
in order to optimize speed and memory use. Passing in an input containing non-UTF-8 byte sequences could result in | |||
unexpected behaviour. | |||
]==] | |||
function export.explode_utf8(str) | function export.explode_utf8(str) | ||
local text, i = {}, 0 | local text, i = {}, 0 | ||
| Line 62: | Line 138: | ||
explode_utf8 = export.explode_utf8 | explode_utf8 = export.explode_utf8 | ||
--[==[ | --[==[ | ||
function export. | Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true: | ||
* It has the expected number of bytes, which is determined by the value of the leading byte: 1-byte characters are `0x00` to | |||
`0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte | |||
characters start with `0xF0` to `0xF4`. | |||
* The leading byte must not fall outside of the above ranges. | |||
* The trailing byte(s), if any, must be between `0x80` and `0xBF`. | |||
* The character's codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`). | |||
* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to | |||
U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings | |||
that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte | |||
character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000). | |||
Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but | |||
`0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`. | |||
If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in | |||
UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of | |||
surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher | |||
codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances | |||
where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly | |||
4 hexadecimal digits; under the scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates, | |||
even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`). | |||
]==] | |||
function export.isutf8(str, allow_surrogates) | |||
for ch in gmatch(str, "[\128-\255][\128-\191]*") do | |||
if #ch > 4 then | |||
return false | |||
end | |||
local b1, b2, b3, b4 = byte(ch, 1, 4) | |||
if not (b2 and b2 >= 0x80 and b2 <= 0xBF) then | |||
return false -- 1-byte is always invalid, as gmatch excludes 0x00 to 0x7F | |||
elseif not b3 then -- 2-byte | |||
if not (b1 >= 0xC2 and b1 <= 0xDF) then -- b1 == 0xC0 or b1 == 0xC1 is overlong | |||
return false | |||
end | |||
elseif not (b3 >= 0x80 and b3 <= 0xBF) then -- trailing byte | |||
return false | |||
elseif not b4 then -- 3-byte | |||
if b1 > 0xEF then | |||
return false | |||
elseif b2 < 0xA0 then | |||
if b1 < 0xE1 then -- b1 == 0xE0 and b2 < 0xA0 is overlong | |||
return false | |||
end | |||
elseif b1 < 0xE0 or (b1 == 0xED and not allow_surrogates) then -- b1 == 0xED and b2 >= 0xA0 is a surrogate | |||
return false | |||
end | |||
elseif not (b4 >= 0x80 and b4 <= 0xBF) then -- 4-byte | |||
return false | |||
elseif b2 < 0x90 then | |||
if not (b1 >= 0xF1 and b1 <= 0xF4) then -- b1 == 0xF0 and b2 < 0x90 is overlong | |||
return false | |||
end | |||
elseif not (b1 >= 0xF0 and b1 <= 0xF3) then -- b1 == 0xF4 and b2 >= 0x90 is too high | |||
return false | |||
end | |||
end | |||
return true | |||
end | end | ||
--[==[Escapes | do | ||
function export. | local charset_chars = { | ||
["\0"] = "%z", ["%"] = "%%", ["-"] = "%-", ["]"] = "%]", ["^"] = "%^" | |||
end | } | ||
charset_chars.__index = charset_chars | |||
local chars = setmetatable({ | |||
["$"] = "%$", ["("] = "%(", [")"] = "%)", ["*"] = "%*", ["+"] = "%+", | |||
["."] = "%.", ["?"] = "%?", ["["] = "%[" | |||
}, charset_chars) | |||
--[==[ | |||
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's | |||
version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example, | |||
{"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving | |||
arbitrary text (e.g. from user input). | |||
]==] | |||
function export.pattern_escape(str) | |||
return (gsub(str, "[%z$%%()*+%-.?[%]^]", chars)) | |||
end | |||
pattern_escape = export.pattern_escape | |||
--[==[ | |||
Escapes only {%}, which is the only magic character used in replacement | |||
[[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub. | |||
]==] | |||
function export.replacement_escape(str) | |||
return (gsub(str, "%%", "%%%%")) | |||
end | |||
replacement_escape = export.replacement_escape | |||
local function case_insensitive_char(ch) | |||
local upper_ch = uupper(ch) | |||
if upper_ch == ch then | |||
ch = ulower(ch) | |||
if ch == upper_ch then | |||
return chars[ch] or ch | |||
end | |||
end | |||
return "[" .. (charset_chars[upper_ch] or upper_ch) .. (charset_chars[ch] or ch) .. "]" | |||
end | |||
local function iterate(str, str_len, text, n, start, _gsub, _sub, loc1, loc2) | |||
if not (loc1 and start <= str_len) then | |||
-- Add final chunk and return. | |||
n = n + 1 | |||
text[n] = _gsub(_sub(str, start), ".", chars) | |||
return | |||
elseif loc2 < loc1 then | |||
if _sub == sub then | |||
local b = byte(str, loc1) | |||
if b and b >= 128 then | |||
loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3) | |||
end | |||
end | |||
n = n + 1 | |||
text[n] = _gsub(_sub(str, start, loc1), ".", chars) | |||
start = loc1 + 1 | |||
if start > str_len then | |||
return | |||
end | |||
else | |||
-- Add chunk up to the current match. | |||
n = n + 1 | |||
text[n] = _gsub(_sub(str, start, loc1 - 1), ".", chars) | |||
-- Add current match. | |||
n = n + 1 | |||
text[n] = _gsub(_sub(str, loc1, loc2), ".", case_insensitive_char) | |||
start = loc2 + 1 | |||
end | |||
return n, start | |||
end | |||
--[==[Escapes | --[==[ | ||
function export. | Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes | ||
all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second | |||
argument; the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns | |||
any pattern matching facilities off in the optional pattern supplied. | |||
]==] | |||
function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain) | |||
if pattern_or_func == nil then | |||
return (gsub(str, str_lib and "[^\128-\255]" or ".[\128-\191]*", case_insensitive_char)) | |||
end | |||
local text, n, start, str_len, _string, callable = {}, 0, 1 | |||
pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain) | |||
local _find, _gsub, _sub = _string.find, _string.gsub, _string.sub | |||
if callable then | |||
repeat | |||
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, pattern_or_func(str, start)) | |||
until not start | |||
-- Special case if the pattern is anchored to the start: "^" always | |||
-- anchors to the start position, not the start of the string, so get | |||
-- around this by only attempting one match with the pattern, then match | |||
-- the end of the string. | |||
elseif byte(pattern_or_func) == 0x5E then -- ^ | |||
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain)) | |||
if start ~= nil then | |||
iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, "$", start, plain)) | |||
end | |||
else | |||
repeat | |||
n, start = iterate(str, str_len, text, n, start, _gsub, _sub, _find(str, pattern_or_func, start, plain)) | |||
until not start | |||
end | |||
return concat(text) | |||
end | |||
end | end | ||
do | do | ||
local character_classes | |||
local function get_character_classes() | |||
character_classes, get_character_classes = { | |||
[0x41] = true, [0x61] = true, -- Aa | |||
[0x43] = true, [0x63] = true, -- Cc | |||
[0x44] = true, [0x64] = true, -- Dd | |||
[0x4C] = true, [0x6C] = true, -- Ll | |||
[0x50] = true, [0x70] = true, -- Pp | |||
[0x53] = true, [0x73] = true, -- Ss | |||
[0x55] = true, [0x75] = true, -- Uu | |||
[0x57] = true, [0x77] = true, -- Ww | |||
[0x58] = true, [0x78] = true, -- Xx | |||
[0x5A] = true, -- z dealt with separately. | |||
}, nil | |||
return character_classes | |||
end | |||
local function check_sets_equal(set1, set2) | local function check_sets_equal(set1, set2) | ||
local k2 | local k2 | ||
| Line 129: | Line 375: | ||
local function parse_1_byte_charset(pattern, pos) | local function parse_1_byte_charset(pattern, pos) | ||
local ch | |||
while true do | while true do | ||
pos, ch = match(pattern, "()([%%%]\192-\255])", pos) | |||
pos, ch | if ch == "%" then | ||
if | local nxt = byte(pattern, pos + 1) | ||
if not nxt or nxt >= 128 or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWXZ, but not z | |||
return false | return false | ||
end | end | ||
pos = pos + 2 | pos = pos + 2 | ||
elseif ch == "]" then | elseif ch == "]" then | ||
pos = | pos = pos + 1 | ||
return pos | return pos | ||
else | else | ||
return false | return false | ||
end | end | ||
end | end | ||
end | end | ||
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==] | --[==[ | ||
pattern_simplifier | Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion | ||
isn't possible, returns false. | |||
]==] | |||
function pattern_simplifier(pattern) | |||
if type(pattern) == "number" then | if type(pattern) == "number" then | ||
return tostring(pattern) | return tostring(pattern) | ||
end | end | ||
local pos, | local pos, capture_groups, start, n, output, ch, nxt_pos = 1, 0, 1, 0 | ||
while true do | while true do | ||
-- FIXME: use "()([%%(.[\128-\255])[\128-\191]?[\128-\191]?[\128-\191]?()" and ensure non-UTF8 always fails. | |||
pos, ch, nxt_pos = match(pattern, "()([%%(.[\ | pos, ch, nxt_pos = match(pattern, "()([%%(.[\192-\255])[\128-\191]*()", pos) | ||
if not ch then | if not ch then | ||
break | break | ||
end | end | ||
local nxt = | local nxt = byte(pattern, nxt_pos) | ||
if ch == "%" then | if ch == "%" then | ||
if nxt == | if nxt == 0x62 then -- b | ||
local nxt2, nxt3 = byte(pattern, pos + 2, pos + 3) | |||
if not (nxt2 and nxt2 < 128 and nxt3 and nxt3 < 128) then | |||
return false | return false | ||
end | end | ||
pos = pos + 4 | pos = pos + 4 | ||
elseif nxt == | elseif nxt == 0x66 then -- f | ||
nxt_pos = nxt_pos + 2 | |||
local nxt2, nxt3 = byte(pattern, nxt_pos - 1, nxt_pos) | |||
-- Only possible to convert a positive %f charset which is | |||
-- all ASCII, so use parse_1_byte_charset. | |||
if not (nxt2 == 0x5B and nxt3 and nxt3 ~= 0x5E and nxt3 < 128) then -- [^ | |||
return false | return false | ||
elseif nxt3 == 0x5D then -- Initial ] is non-magic. | |||
nxt_pos = nxt_pos + 1 | |||
end | end | ||
pos = parse_1_byte_charset(pattern, nxt_pos) | |||
pos = parse_1_byte_charset(pattern, | |||
if not pos then | if not pos then | ||
return false | return false | ||
end | end | ||
elseif nxt == | elseif nxt == 0x5A then -- Z | ||
nxt = byte(pattern, nxt_pos + 1) | |||
nxt = | if nxt == 0x2A or nxt == 0x2D then -- *- | ||
if nxt == | pos = pos + 3 | ||
pos = pos + | |||
else | else | ||
output = output | if output == nil then | ||
output = {} | |||
end | |||
local ins = sub(pattern, start, pos - 1) .. "[\1-\127\192-\255]" | |||
n = n + 1 | n = n + 1 | ||
if nxt == | if nxt == 0x2B then -- + | ||
output[n] = | output[n] = ins .. "%Z*" | ||
pos = pos + | pos = pos + 3 | ||
elseif nxt == 0x3F then -- ? | |||
output[n] = ins .. "?[\128-\191]*" | |||
pos = pos + 3 | |||
else | else | ||
output[n] = | output[n] = ins .. "[\128-\191]*" | ||
pos = pos + 2 | |||
end | end | ||
start = pos | start = pos | ||
end | end | ||
elseif | elseif not nxt or (character_classes or get_character_classes())[nxt] then -- acdlpsuwxACDLPSUWX, but not Zz | ||
return false | return false | ||
-- Skip the next character if it's ASCII. Otherwise, we will | -- Skip the next character if it's ASCII. Otherwise, we will | ||
-- still need to do length checks. | -- still need to do length checks. | ||
else | else | ||
pos = pos + | pos = pos + (nxt < 128 and 2 or 1) | ||
end | end | ||
elseif ch == "(" then | elseif ch == "(" then | ||
if nxt == | if nxt == 0x29 or capture_groups == 32 then -- ) | ||
return false | return false | ||
end | end | ||
capture_groups = capture_groups + 1 | |||
pos = pos + 1 | pos = pos + 1 | ||
elseif ch == "." then | elseif ch == "." then | ||
if nxt == | if nxt == 0x2A or nxt == 0x2D then -- *- | ||
pos = pos + 2 | pos = pos + 2 | ||
else | else | ||
output = output | if output == nil then | ||
output = {} | |||
end | |||
local ins = sub(pattern, start, pos - 1) .. "[^\128-\191]" | |||
n = n + 1 | n = n + 1 | ||
if nxt == " | if nxt == 0x2B then -- + | ||
output[n] = | output[n] = ins .. ".*" | ||
pos = pos + 2 | |||
elseif nxt == 0x3F then -- ? | |||
output[n] = ins .. "?[\128-\191]*" | |||
pos = pos + 2 | pos = pos + 2 | ||
else | else | ||
output[n] = | output[n] = ins .. "[\128-\191]*" | ||
pos = pos + 1 | pos = pos + 1 | ||
end | end | ||
| Line 224: | Line 488: | ||
elseif ch == "[" then | elseif ch == "[" then | ||
-- Fail negative charsets. TODO: 1-byte charsets should be safe. | -- Fail negative charsets. TODO: 1-byte charsets should be safe. | ||
if nxt == | if nxt == 0x5E then -- ^ | ||
return false | return false | ||
-- If the first character is "%", ch_len is determined by the | -- If the first character is "%", ch_len is determined by the | ||
-- next one instead. | -- next one instead. | ||
elseif nxt == | elseif nxt == 0x25 then -- % | ||
nxt = byte(pattern, nxt_pos + 1) | |||
elseif nxt == 0x5D then -- Initial ] is non-magic. | |||
nxt_pos = nxt_pos + 1 | nxt_pos = nxt_pos + 1 | ||
end | end | ||
local ch_len = | if not nxt then | ||
return false | |||
end | |||
local ch_len = nxt < 128 and 1 or nxt < 224 and 2 or nxt < 240 and 3 or 4 | |||
if ch_len == 1 then -- Single-byte charset. | if ch_len == 1 then -- Single-byte charset. | ||
pos = parse_1_byte_charset(pattern, | pos = parse_1_byte_charset(pattern, nxt_pos) | ||
if not pos then | if not pos then | ||
return false | return false | ||
end | end | ||
else -- Multibyte charset. | else -- Multibyte charset. | ||
-- TODO: 1-byte chars should be safe to mix with multibyte chars. CONFIRM THIS FIRST. | |||
local charset_pos, bytes = pos | local charset_pos, bytes = pos | ||
pos = pos + 1 | pos = pos + 1 | ||
while true do -- TODO: non-ASCII charset ranges. | while true do -- TODO: non-ASCII charset ranges. | ||
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]* | pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", pos) | ||
-- If escaped, get the next character. No need to | -- If escaped, get the next character. No need to | ||
-- distinguish magic characters or character classes, | -- distinguish magic characters or character classes, | ||
-- as they'll all fail for having the wrong length | -- as they'll all fail for having the wrong length | ||
-- anyway. | -- anyway. | ||
if ch == "%" then | |||
pos, ch, nxt_pos = match(pattern, "()([^\128-\191][\128-\191]* | pos, ch, nxt_pos = match(pattern, "^()([^\128-\191])[\128-\191]*()", nxt_pos) | ||
elseif ch == "]" then | elseif ch == "]" then | ||
pos = nxt_pos | pos = nxt_pos | ||
break | break | ||
end | end | ||
if ch_len | if not (ch and nxt_pos - pos == ch_len) then | ||
return false | return false | ||
elseif bytes == nil then | |||
bytes = {} | |||
end | end | ||
local bytes, last = bytes, nxt_pos - 1 | |||
local bytes = bytes | for i = pos, last - 1 do | ||
for i = | local b = byte(pattern, i) | ||
local b = byte( | local bytes_b = bytes[b] | ||
bytes[b] = bytes[b] | if bytes_b == nil then | ||
bytes_b = {} | |||
bytes[b] = bytes_b | |||
end | |||
bytes[b], bytes = bytes_b, bytes_b | |||
end | end | ||
bytes[byte( | bytes[byte(pattern, last)] = true | ||
pos = nxt_pos | pos = nxt_pos | ||
end | end | ||
| Line 271: | Line 543: | ||
return false | return false | ||
end | end | ||
nxt = byte(pattern, pos) | |||
if ( | if ( | ||
(nxt == | (nxt == 0x2A or nxt == 0x2D or nxt == 0x3F) or -- *-? | ||
(nxt == | (nxt == 0x2B and ch_len > 2) or -- + | ||
not check_sets(bytes) | not check_sets(bytes) | ||
) then | ) then | ||
| Line 292: | Line 564: | ||
bytes = next_byte | bytes = next_byte | ||
until next_byte == true | until next_byte == true | ||
if nxt == | if nxt == 0x2B then -- + | ||
local range1, range2 = ranges[1], ranges[2] | local range1, range2 = ranges[1], ranges[2] | ||
ranges[1] = make_charset(range1) | ranges[1], ranges[3] = make_charset(range1), make_charset(range2) | ||
local n = #range2 | local n = #range2 | ||
for i = 1, #range1 do | for i = 1, #range1 do | ||
| Line 308: | Line 579: | ||
end | end | ||
end | end | ||
output = output | if output == nil then | ||
output = {} | |||
end | |||
nxt = byte(pattern, pos) | |||
n = n + 1 | n = n + 1 | ||
output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) | output[n] = sub(pattern, start, charset_pos - 1) .. concat(ranges) .. | ||
((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped | |||
start = pos | start = pos | ||
end | end | ||
elseif nxt == | elseif not nxt then | ||
if | break | ||
elseif nxt == 0x2B then -- + | |||
if nxt_pos - pos ~= 2 then | |||
return false | return false | ||
elseif output == nil then | |||
output = {} | |||
end | end | ||
pos, nxt_pos = pos + 1, nxt_pos + 1 | |||
nxt = byte(pattern, nxt_pos) | |||
local ch2 = sub(pattern, pos, pos) | |||
n = n + 1 | n = n + 1 | ||
output[n] = sub(pattern, start, pos) .. "[" .. ch .. "]*" .. | output[n] = sub(pattern, start, pos - 1) .. "[" .. ch .. ch2 .. "]*" .. ch2 .. | ||
pos = nxt_pos | ((nxt == 0x2A or nxt == 0x2B or nxt == 0x2D or nxt == 0x3F) and "%" or "") -- following *+-? now have to be escaped | ||
pos, start = nxt_pos, nxt_pos | |||
elseif nxt == | elseif nxt == 0x2A or nxt == 0x2D or nxt == 0x3F then -- *-? | ||
return false | return false | ||
else | else | ||
| Line 332: | Line 613: | ||
end | end | ||
return concat(output) .. sub(pattern, start) | return concat(output) .. sub(pattern, start) | ||
end, true) | end | ||
export.pattern_simplifier = pattern_simplifier -- | pattern_simplifier = memoize(pattern_simplifier, true) | ||
export.pattern_simplifier = pattern_simplifier | |||
end | |||
--[==[ | |||
Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring | |||
library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}). | |||
The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used | |||
(e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary | |||
characters. | |||
]==] | |||
function get_charset(charset) | |||
if type(charset) == "number" then | |||
return tostring(charset) | |||
end | |||
local pos, start, n, output = 1, 1, 0 | |||
if byte(charset) == 0x5E then -- ^ | |||
pos = pos + 1 | |||
end | |||
-- FIXME: "]" is non-magic if it's the first character in a charset. | |||
local nxt_pos, nxt | |||
while true do | |||
local new_pos, ch = match(charset, "()([%%%-%]])", pos) | |||
if not ch then | |||
break | |||
-- Skip percent escapes. Ranges can't start with them, either. | |||
elseif ch == "%" then | |||
pos = new_pos + 2 | |||
else | |||
-- If `ch` is a hyphen, get the character before iff it's at or ahead of `pos`. | |||
if ch == "-" and new_pos > pos then | |||
pos, nxt_pos, nxt = new_pos - 1, new_pos, ch | |||
ch = sub(charset, pos, pos) | |||
else | |||
pos, nxt_pos = new_pos, new_pos + 1 | |||
nxt = sub(charset, nxt_pos, nxt_pos) | |||
end | |||
-- Range. | |||
if nxt == "-" then | |||
if output == nil then | |||
output = {} | |||
end | |||
n = n + 1 | |||
output[n] = sub(charset, start, pos - 1) | |||
nxt_pos = nxt_pos + 1 | |||
nxt = sub(charset, nxt_pos, nxt_pos) | |||
-- Ranges fail if they end with a percent escape, so escape the hyphen to avoid undefined behaviour. | |||
if nxt == "" or nxt == "%" then | |||
n = n + 1 | |||
output[n] = (ch == "]" and "%]" or ch) .. "%-" | |||
start = nxt_pos | |||
nxt_pos = nxt_pos + 2 | |||
-- Ranges can't contain "%]", since it's escaped, so range inputs like "]-z" or "a-]" must be | |||
-- adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is | |||
-- omitted if the range would be empty (i.e. if the first byte is greater than the second). | |||
else | |||
n = n + 1 | |||
output[n] = (ch == "]" and (byte(nxt) >= 0x5D and "%]^" or "^") or ch) .. "-" .. | |||
(nxt == "]" and (byte(ch) <= 0x5D and "\\%]" or "\\") or nxt) | |||
nxt_pos = nxt_pos + 1 | |||
start = nxt_pos | |||
end | |||
elseif ch == "-" or ch == "]" then | |||
if output == nil then | |||
output = {} | |||
end | |||
n = n + 1 | |||
output[n] = sub(charset, start, pos - 1) .. "%" .. ch | |||
start = nxt_pos | |||
end | |||
pos = nxt_pos | |||
end | |||
end | |||
if start == 1 then | |||
return "[" .. charset .. "]" | |||
end | |||
return "[" .. concat(output) .. sub(charset, start) .. "]" | |||
end | end | ||
get_charset = memoize(get_charset, true) | |||
export.get_charset = get_charset | |||
function export.len(str) | function export.len(str) | ||
| Line 423: | Line 783: | ||
end | end | ||
--[==[Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal.]==] | --[==[ | ||
Like gsub, but pattern-matching facilities are turned off, so `pattern` and `repl` (if a string) are treated as literal. | |||
]==] | |||
function export.plain_gsub(str, pattern, repl, n) | function export.plain_gsub(str, pattern, repl, n) | ||
return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n) | return gsub(str, pattern_escape(pattern), type(repl) == "string" and replacement_escape(repl) or repl, n) | ||
end | end | ||
--[==[Reverses a UTF-8 string; equivalent to string.reverse.]==] | --[==[ | ||
Reverses a UTF-8 string; the UTF-8-aware counterpart of string.reverse (multibyte characters are kept intact). | |||
]==] | |||
function export.reverse(str) | function export.reverse(str) | ||
return reverse(gsub(str, "[\ | return reverse((gsub(str, "[\192-\255][\128-\191]*", reverse))) | ||
end | |||
function export.char(...) -- To be moved to [[Module:string/char]]. | |||
return u(...) | |||
end | end | ||
do | do | ||
local function | local function utf8_err(func_name) | ||
error(" | error(format("bad argument #1 to '%s' (string is not UTF-8)", func_name), 4) | ||
end | end | ||
local function | local function get_codepoint(func_name, b1, b2, b3, b4) | ||
if b1 <= 0x7F then | |||
if b1 < | |||
return b1, 1 | return b1, 1 | ||
elseif b1 < | elseif not (b2 and b2 >= 0x80 and b2 <= 0xBF) then | ||
utf8_err(func_name) | |||
elseif b1 < | elseif b1 <= 0xDF then | ||
local cp = 0x40 * b1 + b2 - 0x3080 | |||
return cp >= 0x80 and cp or utf8_err(func_name), 2 | |||
elseif not (b3 and b3 >= 0x80 and b3 <= 0xBF) then | |||
utf8_err(func_name) | |||
elseif b1 <= 0xEF then | |||
local cp = 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080 | |||
return cp >= 0x800 and cp or utf8_err(func_name), 3 | |||
elseif not (b4 and b4 >= 0x80 and b4 <= 0xBF) then | |||
utf8_err(func_name) | |||
end | end | ||
local cp = 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080 | |||
return cp >= 0x10000 and cp <= 0x10FFFF and cp or utf8_err(func_name), 4 | |||
end | end | ||
function export.codepoint(str, i, j) | function export.codepoint(str, i, j) | ||
if type(str) == "number" then | if str == "" then | ||
return -- return nothing | |||
elseif type(str) == "number" then | |||
return byte(str, i, j) | return byte(str, i, j) | ||
end | end | ||
i, j = i or 1, j == -1 and #str or i or 1 | i, j = i or 1, j == -1 and #str or i or 1 | ||
if i == 1 and j == 1 then | if i == 1 and j == 1 then | ||
return (get_codepoint(byte(str, 1, 4))) | return (get_codepoint("codepoint", byte(str, 1, 4))) | ||
elseif i < 0 or j < 0 then | elseif i < 0 or j < 0 then | ||
return ucodepoint(str, i, j) -- FIXME | return ucodepoint(str, i, j) -- FIXME | ||
| Line 517: | Line 851: | ||
nr = nr + 1 | nr = nr + 1 | ||
local add | local add | ||
ret[nr], add = get_codepoint(b1, b2, b3, b4) | ret[nr], add = get_codepoint("codepoint", b1, b2, b3, b4) | ||
nb = nb + add | nb = nb + add | ||
end | end | ||
| Line 549: | Line 883: | ||
return nil | return nil | ||
end | end | ||
local ret, add = get_codepoint(b1, b2, b3, b4) | local ret, add = get_codepoint("gcodepoint", b1, b2, b3, b4) | ||
nb = nb + add | nb = nb + add | ||
return ret | return ret | ||
| Line 556: | Line 890: | ||
end | end | ||
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==] | do | ||
function export.lower(str) | local _ulower = ulower | ||
--[==[A version of lower which uses string.lower when possible, but otherwise uses mw.ustring.lower.]==] | |||
function export.lower(str) | |||
return (match(str, "^()[^\128-\255]*$") and lower or _ulower)(str) | |||
end | |||
end | end | ||
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==] | do | ||
function export.upper(str) | local _uupper = uupper | ||
--[==[A version of upper which uses string.upper when possible, but otherwise uses mw.ustring.upper.]==] | |||
function export.upper(str) | |||
return (match(str, "^()[^\128-\255]*$") and upper or _uupper)(str) | |||
end | |||
end | end | ||
do
	-- Appends the values produced by one iteration of the gsplit iterator (the chunk, followed by any captures from
	-- the splitting pattern) to `t`, starting at index `n + 1`. Returns the index of the last value written, or
	-- nothing if the iteration produced no values (i.e. the iterator is exhausted).
	local function add_captures(t, n, ...)
		if ... == nil then
			return
		end
		-- Insert any captures from the splitting pattern.
		local offset, capture = n - 1, ...
		while capture do
			n = n + 1
			t[n] = capture
			-- `n - offset` is the position of the next value in the vararg list; select() returns nothing once it
			-- runs past the end, which terminates the loop.
			capture = select(n - offset, ...)
		end
		-- The caller relies on this value: it is both the running length of `t` and the "not finished" signal.
		return n
	end

	--[==[
	Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like
	Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by
	one character at a time; Python returns the whole remainder of the string). When possible, it will use the string
	library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the
	string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal.

	In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start
	index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil
	if there are no further matches. By default, the start index will be calculated using the ustring library, unless
	`str_lib` or `plain` is set.
	]==]
	function export.split(str, pattern_or_func, str_lib, plain)
		local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0
		repeat
			n = add_captures(t, n, iter())
		until n == nil
		return t
	end
	export.capturing_split = export.split -- To be removed.
end
--[==[
Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the
string up to the splitting pattern, with any capture groups being returned as additional values on that iteration.
]==]
function export.gsplit(str, pattern_or_func, str_lib, plain)
	-- `start` is the index at which the next search begins; `final` is set once the last chunk has been returned.
	local start, final, str_len, _string, callable = 1
	pattern_or_func, str_len, _string, callable = prepare_iter(str, pattern_or_func, str_lib, plain)
	local _find, _sub = _string.find, _string.sub

	-- Receives the results of one find call (start index, end index, captures) and returns the chunk preceding the
	-- match, followed by the captures, updating `start` and `final` for the next iteration.
	local function iter(loc1, loc2, ...)
		-- If no match, or there is but we're past the end of the string
		-- (which happens when the match is the empty string), then return
		-- the final chunk.
		if not loc1 then
			final = true
			return _sub(str, start)
		end
		-- Special case: If we match the empty string, then eat the
		-- next character; this avoids an infinite loop, and makes
		-- splitting by the empty string work the way mw.text.gsplit() does
		-- (including non-adjacent empty string matches with %f). If we
		-- reach the end of the string this way, set `final` to true, so we
		-- don't get stuck matching the empty string at the end.
		local chunk
		if loc2 < loc1 then
			-- If using the string library, we need to make sure we advance
			-- by one UTF-8 character.
			if _sub == sub then
				local b = byte(str, loc1)
				if b and b >= 128 then
					loc1 = loc1 + (b < 224 and 1 or b < 240 and 2 or 3)
				end
			end
			chunk = _sub(str, start, loc1)
			if loc1 >= str_len then
				final = true
			else
				start = loc1 + 1
			end
		-- Eat chunk up to the current match.
		else
			chunk = _sub(str, start, loc1 - 1)
			start = loc2 + 1
		end
		return chunk, ...
	end

	if callable then
		-- Custom find function: it is called with the string and the current start index.
		return function()
			if not final then
				return iter(pattern_or_func(str, start))
			end
		end
	-- Special case if the pattern is anchored to the start: "^" always
	-- anchors to the start position, not the start of the string, so get
	-- around this by only attempting one match with the pattern, then match
	-- the end of the string.
	elseif byte(pattern_or_func) == 0x5E then -- ^
		local returned
		return function()
			if not returned then
				returned = true
				return iter(_find(str, pattern_or_func, start, plain))
			elseif not final then
				return iter(_find(str, "$", start, plain))
			end
		end
	end

	-- General case: search for the pattern from the current start index on every call.
	return function()
		if not final then
			return iter(_find(str, pattern_or_func, start, plain))
		end
	end
end
gsplit = export.gsplit
--[==[
Returns the number of matches of `pattern` in `str`, i.e. the substitution count reported by gsub when every match is
replaced. If `plain` is set, `pattern` is escaped with pattern_escape and matched literally using the string library.
Otherwise, the string library is used when pattern_simplifier can produce an equivalent string-library pattern, and the
ustring library is used as the fallback.
]==]
function export.count(str, pattern, plain)
	if plain then
		return select(2, gsub(str, pattern_escape(pattern), ""))
	end
	local simple = pattern_simplifier(pattern)
	if simple then
		-- Use the simplified pattern, not the original: the original is written for the ustring library, so passing
		-- it to string.gsub would match on bytes rather than on characters.
		return select(2, gsub(str, simple, ""))
	end
	return select(2, ugsub(str, pattern, ""))
end
--[==[
Trims characters from both ends of `str`. If `charset` is nil, characters matching %s are trimmed; if it is the empty
string, `str` is returned unchanged. Otherwise `charset` determines the characters to trim: if `plain` is set it is
treated as a literal list of characters (escaped with charset_escape), and if not it is processed by get_charset.
`str_lib` forces use of the string library; without it, the string library is only used when pattern_simplifier can
convert the resulting pattern, with the ustring library as the fallback.
]==]
function export.trim(str, charset, str_lib, plain)
	if charset == nil then
		-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are
		-- very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to ""
		-- first.
		return match(gsub(str, "^%s*", ""), "^.*%S") or ""
	elseif charset == "" then
		return str
	end
	charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset)
	-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets
	-- are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there
	-- would be two callbacks into PHP, which is slower.
	local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$"
	if not str_lib then
		local simple = pattern_simplifier(pattern)
		if not simple then
			return umatch(str, pattern)
		end
		pattern = simple
	end
	return match(str, pattern)
end
trim = export.trim
do
	local entities

	-- Loads the named-entity table on first use, then removes itself so that later calls go straight to the cached
	-- `entities` upvalue.
	local function get_entities()
		entities, get_entities = load_data("Module:data/entities"), nil
		return entities
	end

	-- gsub callback. `hash` is "#" for numeric references and "" for named ones; `x` is "x"/"X" for hexadecimal
	-- references (or a leading "x"/"X" that is really part of an entity name); `code` is the rest of the reference.
	-- Returning nil makes gsub keep the original text.
	local function decode_entity(hash, x, code)
		if hash == "" then
			-- Named entity: the optional "x"/"X" capture belongs to the name.
			return (entities or get_entities())[x .. code]
		end
		local cp
		if x == "" then
			cp = match(code, "^()%d+$") and tonumber(code)
		else
			cp = match(code, "^()%x+$") and tonumber(code, 16)
		end
		-- Only decode valid codepoints: U+D800 to U+DFFF (surrogates) and anything above U+10FFFF are rejected.
		return cp and (cp <= 0xD7FF or cp >= 0xE000 and cp <= 0x10FFFF) and u(cp) or nil
	end

	--[==[
	Converts any HTML entities in `str` (named, decimal or hexadecimal) into the characters they represent. Entities
	which are not recognized, or which refer to invalid codepoints, are left unchanged.
	]==]
	-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases
	-- which have also been included in [[Module:data/entities]].
	function export.decode_entities(str)
		-- Cheap plain-text checks first: an entity needs a "&" followed somewhere by a ";".
		local amp = find(str, "&", nil, true)
		return amp and find(str, ";", amp, true) and gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity) or str
	end
end
do
	local entities

	-- Creates the entity table on first use, then removes itself so that later calls go straight to the cached
	-- `entities` upvalue.
	local function get_entities()
		-- Memoized HTML entities (taken from mw.text.lua).
		entities, get_entities = {
			["\""] = "&quot;",
			["&"] = "&amp;",
			["'"] = "&#039;",
			["<"] = "&lt;",
			[">"] = "&gt;",
			["\194\160"] = "&nbsp;",
		}, nil
		return entities
	end

	-- gsub callback used when a custom charset is given. Returns the entity for `ch`, computing a decimal numeric
	-- entity for characters without a named one and caching the result in `entities`. Returning nil makes gsub keep
	-- the original text.
	local function encode_entity(ch)
		local entity = (entities or get_entities())[ch]
		if entity == nil then
			local cp = codepoint(ch)
			-- U+D800 to U+DFFF are surrogates, so can't be encoded as entities.
			entity = cp and (cp <= 0xD7FF or cp >= 0xE000) and format("&#%d;", cp) or false
			-- `false` is cached for characters that can't be encoded, so they are not recomputed.
			entities[ch] = entity
		end
		return entity or nil
	end

	--[==[
	Replaces characters in `str` with HTML entities. If `charset` is nil, the characters {"}, {&}, {'}, {<}, {>} and
	the non-breaking space are replaced; if it is the empty string, `str` is returned unchanged. Otherwise, every
	character matched by `charset` is replaced: if `plain` is set, `charset` is treated as a literal list of characters
	(escaped with charset_escape); if it is ".", every character is matched; and if not it is processed by
	get_charset. `str_lib` forces use of the string library; without it, the string library is only used when
	pattern_simplifier can convert the pattern, with the ustring library as the fallback.
	]==]
	function export.encode_entities(str, charset, str_lib, plain)
		if charset == nil then
			-- "\194\160" is the UTF-8 encoding of the non-breaking space; matches with no entry in the table (e.g. a
			-- "\194" that starts some other character) are left unchanged by gsub.
			return (gsub(str, "[\"&'<>\194]\160?", entities or get_entities()))
		elseif charset == "" then
			return str
		end
		local pattern = plain and ("[" .. charset_escape(charset) .. "]") or charset == "." and charset or get_charset(charset)
		if not str_lib then
			local simple = pattern_simplifier(pattern)
			if not simple then
				return (ugsub(str, pattern, encode_entity))
			end
			pattern = simple
		end
		return (gsub(str, pattern, encode_entity))
	end
end
| Line 787: | Line 1,146: | ||
enctype = enctype and upper(enctype) or "QUERY" | enctype = enctype and upper(enctype) or "QUERY" | ||
if enctype == "PATH" then | if enctype == "PATH" then | ||
return find(str, "%", | return find(str, "%", nil, true) and gsub(str, "%%(%x%x)", decode_path) or str | ||
elseif enctype == "QUERY" then | elseif enctype == "QUERY" then | ||
return (find(str, "%", | return (find(str, "%", nil, true) or find(str, "+", nil, true)) and gsub(str, "([%%%+])(%x?%x?)", decode) or str | ||
elseif enctype == "WIKI" then | elseif enctype == "WIKI" then | ||
return (find(str, "%", | return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str | ||
end | end | ||
error("bad argument #2 to | error("bad argument #2 to 'decode_uri' (expected QUERY, PATH, or WIKI)", 2) | ||
end | end | ||
end | end | ||
| Line 802: | Line 1,158: | ||
do | do | ||
local function _remove_comments(str, pre) | local function _remove_comments(str, pre) | ||
local head = find(str, "<!--", | local head = find(str, "<!--", nil, true) | ||
if not head then | if not head then | ||
return str | return str | ||
| Line 824: | Line 1,180: | ||
end | end | ||
--[==[
Removes any HTML comments from the input text. `stage` can be one of three options:
* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all
  {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed
  {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or
  [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the
  preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags);
  if full accuracy is absolutely necessary, use [[Module:template parser]] instead).
* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops
  over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g.
  {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed
  {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs,
  where the {"PRE"} method will have already been applied by the native parser.
* {"BOTH"} applies {"PRE"} then {"POST"}.
]==]
function export.remove_comments(str, stage) | function export.remove_comments(str, stage) | ||
if not stage or stage == "PRE" then | if not stage or stage == "PRE" then | ||
| Line 834: | Line 1,201: | ||
local processed = stage == "POST" and _remove_comments(str) or | local processed = stage == "POST" and _remove_comments(str) or | ||
stage == "BOTH" and _remove_comments(str, true) or | stage == "BOTH" and _remove_comments(str, true) or | ||
error("bad argument #2 to | error("bad argument #2 to 'remove_comments' (expected PRE, POST, or BOTH)", 2) | ||
while processed ~= str do | while processed ~= str do | ||
str = processed | str = processed | ||
| Line 841: | Line 1,208: | ||
return str | return str | ||
end | end | ||
end | end | ||
do | do | ||
local byte_escapes

-- Loads the table of byte escapes on first use, then removes itself so that later calls go straight to the cached
-- `byte_escapes` upvalue.
local function get_byte_escapes()
	byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil
	return byte_escapes
end

-- gsub callback: returns the escape listed for the byte `b` in the data table, or a three-digit decimal escape
-- (e.g. "\065") if it has no entry.
local function escape_byte(b)
	return (byte_escapes or get_byte_escapes())[b] or format("\\%03d", byte(b))
end

-- Returns `str` with every byte replaced by its escape (see escape_byte).
function export.escape_bytes(str)
	return (gsub(str, ".", escape_byte))
end
| Line 892: | Line 1,231: | ||
return name == "op" and "{" or | return name == "op" and "{" or | ||
name == "cl" and "}" or | name == "cl" and "}" or | ||
error( | error(mw.getCurrentFrame():getTitle() .. " format: unrecognized escape sequence '{\\" .. name .. "}'") | ||
elseif fun(name) and type(fun(name)) ~= "string" then | elseif fun(name) and type(fun(name)) ~= "string" then | ||
error( | error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" is a " .. type(fun(name)) .. ", not a string") | ||
end | end | ||
return fun(name) or error( | return fun(name) or error(mw.getCurrentFrame():getTitle() .. " format: \"" .. name .. "\" not found in table") | ||
end)) | end)) | ||
end | end | ||
format_fun = export.format_fun | format_fun = export.format_fun | ||
--[==[
This function, unlike {string.format} and {mw.ustring.format}, takes just two parameters, a format string and a table,
and replaces all instances of { {param_name} } in the format string with the table's entry for {param_name}. The opening
and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry beginning with a
slash can be escaped by doubling the initial slash.

====Examples====
* {string_utilities.format("{foo} fish, {bar} fish, {baz} fish, {quux} fish", {["foo"]="one", ["bar"]="two", ["baz"]="red", ["quux"]="blue"}) }
*: produces: {"one fish, two fish, red fish, blue fish"}
* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})}
*: produces: {"The set {1, 2, 3} contains three elements."}
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.
]==]
function export.format(str, tbl) | function export.format(str, tbl) | ||
return format_fun(str, function(key) | return format_fun(str, function(key) | ||
| Line 916: | Line 1,261: | ||
do
	-- Applies `case_func` to the first character of `str` (the first byte plus any UTF-8 continuation bytes which
	-- follow it) and returns the result joined to the unchanged remainder. Returns "" for the empty string.
	local function do_uclcfirst(str, case_func)
		-- Re-case the first letter.
		local first, remainder = match(str, "^(.[\128-\191]*)(.*)")
		return first and (case_func(first) .. remainder) or ""
	end

	-- Shared implementation of ucfirst and lcfirst: skips leading HTML tags, then re-cases the first character of
	-- the text, or of the display text if the text starts with a link.
	local function uclcfirst(str, case_func)
		-- Strip off any HTML tags at the beginning. This currently does not handle comments or <ref>...</ref>
		-- correctly; it's intended for text wrapped in <span> or the like, as happens when passing text through
		-- [[Module:links]].
		local html_at_beginning = nil
		if str:match("^<") then
			while true do
				local html_tag, rest = str:match("^(<.->)(.*)$")
				if not html_tag then
					break
				end
				if not html_at_beginning then
					html_at_beginning = {}
				end
				insert(html_at_beginning, html_tag)
				str = rest
			end
		end

		-- If there's a link at the beginning, re-case the first letter of the
		-- link text. This pattern matches both piped and unpiped links.
		-- If the link is not piped, the second capture (linktext) will be empty.
		local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
		local retval
		if link then
			-- The link target is left untouched; an unpiped link becomes a piped one with re-cased display text.
			retval = "[[" .. link .. "|" .. do_uclcfirst(linktext ~= "" and linktext or link, case_func) .. "]]" .. remainder
		else
			retval = do_uclcfirst(str, case_func)
		end
		if html_at_beginning then
			-- Put the stripped tags back in front of the result.
			retval = concat(html_at_beginning) .. retval
		end
		return retval
	end

	--[==[
	Uppercase the first character of the input string, correctly handling one-part and two-part links, optionally
	surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
	uppercase the first character of text that may include links that have been passed through `full_link()` in
	[[Module:links]] or a similar function.
	]==]
	function export.ucfirst(str)
		return uclcfirst(str, uupper)
	end
	ucfirst = export.ucfirst

	--[==[
	Lowercase the first character of the input string, correctly handling one-part and two-part links, optionally
	surrounded by HTML tags such as `<nowiki><span>...</span></nowiki>`, possibly nested. Intended to correctly
	lowercase the first character of text that may include links that have been passed through `full_link()` in
	[[Module:links]] or a similar function.
	]==]
	function export.lcfirst(str)
		return uclcfirst(str, ulower)
	end

	--[==[Capitalizes each word of the input string. WARNING: May be broken in the presence of multiword links.]==]
	function export.capitalize(str)
		-- Capitalize multi-word that is separated by spaces
		-- by uppercasing the first letter of each part.
		return (ugsub(str, "%w+", ucfirst))
	end

	-- ugsub callback for title_case: uppercases the first character of a word and lowercases the rest of it.
	local function do_title_case(first, remainder)
		first = uupper(first)
		return remainder == "" and first or (first .. ulower(remainder))
	end

	--[==[
	Capitalizes each word of the input string, with any further letters in each word being converted to lowercase.
	]==]
	function export.title_case(str)
		-- The `or` expression truncates ugsub's results to the string alone, dropping the substitution count.
		return str == "" and "" or ugsub(str, "(%w)(%w*)", do_title_case)
	end
	title_case = export.title_case

	--[==[
	Converts the input string to {{w|Camel case|CamelCase}}. Any non-word characters are treated as breaks between
	words. If `lower_first` is set, then the first character of the string will be lowercase (e.g. camelCase).
	]==]
	function export.camel_case(str, lower_first)
		-- Each match is replaced by the title-cased word capture, which discards the non-word characters before it.
		str = ugsub(str, "%W*(%w*)", title_case)
		return lower_first and do_uclcfirst(str, ulower) or str
	end
end
do
	-- ugsub callback for snake_case: a run of non-word characters before a word is replaced by a single
	-- underscore; a word with nothing before it is returned as-is.
	local function do_snake_case(nonword, word)
		return nonword == "" and word or "_" .. word
	end

	--[==[
	Converts the input string to {{w|Snake case|snake_case}}. Any non-word characters are treated as breaks between
	words.
	]==]
	function export.snake_case(str)
		return (ugsub(str, "(%W*)(%w*)", do_snake_case))
	end
end
-- Expose the module's public functions.
return export