Module:string utilities: Difference between revisions
No edit summary |
make ucfirst() and lcfirst() work in the presence of wrapping HTML spans and such, and document them; reformat remaining documentation to 120 chars per line max |
||
| Line 4: | Line 4: | ||
local load_module = "Module:load" | local load_module = "Module:load" | ||
local memoize_module = "Module:memoize" | local memoize_module = "Module:memoize" | ||
local string_char_module = "Module:string/char" | |||
local string_charset_escape_module = "Module:string/charsetEscape" | |||
local mw = mw | local mw = mw | ||
| Line 17: | Line 19: | ||
local gmatch = string.gmatch | local gmatch = string.gmatch | ||
local gsub = string.gsub | local gsub = string.gsub | ||
local insert = table.insert | |||
local len = string.len | local len = string.len | ||
local lower = string.lower | local lower = string.lower | ||
| Line 36: | Line 39: | ||
local ulower = ustring.lower | local ulower = ustring.lower | ||
local umatch = ustring.match | local umatch = ustring.match | ||
local unpack = unpack | local unpack = unpack or table.unpack -- Lua 5.2 compatibility | ||
local upper = string.upper | local upper = string.upper | ||
local usub = ustring.sub | local usub = ustring.sub | ||
| Line 44: | Line 47: | ||
-- Defined below. | -- Defined below. | ||
local codepoint | local codepoint | ||
local explode_utf8 | local explode_utf8 | ||
local format_fun | local format_fun | ||
local get_charset | local get_charset | ||
local gsplit | local gsplit | ||
local pattern_escape | local pattern_escape | ||
local pattern_simplifier | local pattern_simplifier | ||
local replacement_escape | local replacement_escape | ||
local title_case | |||
local trim | local trim | ||
local | local ucfirst | ||
local ulen | local ulen | ||
--[==[ | --[==[ | ||
Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no overhead after the first call, since the target functions are called directly in any subsequent calls.]==] | Loaders for functions in other modules, which overwrite themselves with the target function when called. This ensures | ||
modules are only loaded when needed, retains the speed/convenience of locally-declared pre-loaded functions, and has no | |||
overhead after the first call, since the target functions are called directly in any subsequent calls. | |||
]==] | |||
-- Lazy loader: the first call loads the module named by `string_charset_escape_module`, rebinds `charset_escape` to
-- the loaded function so later calls go straight to it, then forwards the arguments.
local function charset_escape(...)
	local loaded = require(string_charset_escape_module)
	charset_escape = loaded
	return loaded(...)
end
local function is_callable(...) | local function is_callable(...) | ||
is_callable = require(function_module).is_callable | is_callable = require(function_module).is_callable | ||
| Line 69: | Line 78: | ||
load_data = require(load_module).load_data | load_data = require(load_module).load_data | ||
return load_data(...) | return load_data(...) | ||
end | |||
-- Lazy loader: the first call loads the module named by `string_char_module`, rebinds `u` to the loaded function so
-- later calls go straight to it, then forwards the arguments.
local function u(...)
	local loaded = require(string_char_module)
	u = loaded
	return loaded(...)
end
| Line 84: | Line 98: | ||
end | end | ||
--[==[Returns {nil} if the input value is the empty string, or otherwise the same value. | --[==[ | ||
Returns {nil} if the input value is the empty string, or otherwise the same value. | |||
If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is the empty string, returns {nil}. | If the input is a string and `do_trim` is set, the input value will be trimmed before returning; if the trimmed value is | ||
the empty string, returns {nil}. | |||
If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also be embedded this way (e.g. {"''foo''"} returns {"'foo'"}).]==] | If `quote_delimiters` is set, then any outer pair of quotation marks ({' '} or {" "}) surrounding the rest of the input | ||
string will be stripped, if present. The string will not be trimmed again, converted to {nil}, or have further quotation | |||
marks stripped, as it exists as a way to embed spaces or the empty string in an input. Genuine quotation marks may also | |||
be embedded this way (e.g. {"''foo''"} returns {"'foo'"}). | |||
]==] | |||
function export.is_not_empty(str, do_trim, quote_delimiters) | function export.is_not_empty(str, do_trim, quote_delimiters) | ||
if str == "" then | if str == "" then | ||
| Line 103: | Line 123: | ||
end | end | ||
--[==[Explodes a string into an array of UTF-8 characters. '''Warning''': this function | --[==[ | ||
Explodes a string into an array of UTF-8 characters. '''Warning''': this function assumes that the input is valid UTF-8 | |||
in order to optimize speed and memory use. Passing in an input containing non-UTF-8 byte sequences could result in | |||
unexpected behaviour. | |||
]==] | |||
function export.explode_utf8(str) | function export.explode_utf8(str) | ||
local text, i = {}, 0 | local text, i = {}, 0 | ||
| Line 113: | Line 137: | ||
end | end | ||
explode_utf8 = export.explode_utf8 | explode_utf8 = export.explode_utf8 | ||
--[==[ | |||
Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true: | |||
* It has the expected number of bytes, which is determined by the value of the leading byte: 1-byte characters are `0x00` to | |||
`0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte | |||
characters start with `0xF0` to `0xF4`. | |||
* The leading byte must not fall outside of the above ranges. | |||
* The trailing byte(s) (if any) must be between `0x80` and `0xBF`. | |||
* The character's codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`). | |||
* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to | |||
U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings | |||
that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte | |||
character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000). | |||
Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but | |||
`0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`. | |||
If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in | |||
UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of | |||
surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher | |||
codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances | |||
where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly | |||
4 hexadecimal digits; under the scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates, | |||
even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`). | |||
]==] | |||
function export.isutf8(str, allow_surrogates)
	-- Each match is one non-ASCII byte followed by every continuation byte (0x80-0xBF) after it, so the trailing
	-- bytes are already known to be in range; only the length and the first two bytes need checking.
	for seq in gmatch(str, "[\128-\255][\128-\191]*") do
		local size = #seq
		local lead, second = byte(seq, 1, 2)
		local ok
		if size == 2 then
			-- 0xC0 and 0xC1 would be overlong; 0x80-0xBF cannot lead.
			ok = lead >= 0xC2 and lead <= 0xDF
		elseif size == 3 then
			if second < 0xA0 then
				-- 0xE0 with a second byte below 0xA0 is overlong.
				ok = lead >= 0xE1 and lead <= 0xEF
			else
				-- 0xED with a second byte of 0xA0 or above is a surrogate (U+D800 to U+DFFF).
				ok = lead >= 0xE0 and lead <= 0xEF and (allow_surrogates or lead ~= 0xED)
			end
		elseif size == 4 then
			if second < 0x90 then
				-- 0xF0 with a second byte below 0x90 is overlong.
				ok = lead >= 0xF1 and lead <= 0xF4
			else
				-- 0xF4 with a second byte of 0x90 or above is beyond U+10FFFF.
				ok = lead >= 0xF0 and lead <= 0xF3
			end
		else
			-- A lone non-ASCII byte, or a run longer than 4 bytes, is never valid.
			ok = false
		end
		if not ok then
			return false
		end
	end
	return true
end
do | do | ||
| Line 125: | Line 209: | ||
}, charset_chars) | }, charset_chars) | ||
--[==[Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example, {"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving arbitrary text (e.g. from user input).]==] | --[==[ | ||
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]] (Lua's | |||
version of regular expressions): {$%()*+-.?[]^}, and converts the null character to {%z}. For example, | |||
{"^$()%.[]*+-?\0"} becomes {"%^%$%(%)%%%.%[%]%*%+%-%?%z"}. This is necessary when constructing a pattern involving | |||
arbitrary text (e.g. from user input). | |||
]==] | |||
function export.pattern_escape(str) | function export.pattern_escape(str) | ||
return (gsub(str, "[%z$%%()*+%-.?[%]^]", chars)) | return (gsub(str, "[%z$%%()*+%-.?[%]^]", chars)) | ||
| Line 131: | Line 220: | ||
pattern_escape = export.pattern_escape | pattern_escape = export.pattern_escape | ||
--[==[ | --[==[ | ||
Escapes only {%}, which is the only magic character used in replacement | |||
[[mw:Extension:Scribunto/Lua reference manual#Patterns|patterns]] with string.gsub and mw.ustring.gsub. | |||
]==] | |||
function export.replacement_escape(str) | function export.replacement_escape(str) | ||
return (gsub(str, "%%", "%%%%")) | return (gsub(str, "%%", "%%%%")) | ||
| Line 186: | Line 272: | ||
--[==[ | --[==[ | ||
Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns any pattern matching facilities off in the optional pattern supplied.]==] | Escapes the magic characters used in a [[mw:Extension:Scribunto/Lua reference manual#Patterns|pattern]], and makes | ||
all characters case-insensitive. An optional pattern or find function (see {split}) may be supplied as the second | |||
argument, the third argument (`str_lib`) forces use of the string library, while the fourth argument (`plain`) turns | |||
any pattern matching facilities off in the optional pattern supplied. | |||
]==] | |||
function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain) | function export.case_insensitive_pattern(str, pattern_or_func, str_lib, plain) | ||
if pattern_or_func == nil then | if pattern_or_func == nil then | ||
| Line 303: | Line 393: | ||
end | end | ||
--[==[Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion isn't possible, returns false.]==] | --[==[ | ||
Parses `pattern`, a ustring library pattern, and attempts to convert it into a string library pattern. If conversion | |||
isn't possible, returns false. | |||
]==] | |||
function pattern_simplifier(pattern) | function pattern_simplifier(pattern) | ||
if type(pattern) == "number" then | if type(pattern) == "number" then | ||
| Line 525: | Line 618: | ||
end | end | ||
--[==[Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}). | --[==[ | ||
Parses `charset`, the interior of a string or ustring library character set, and normalizes it into a string or ustring | |||
library pattern (e.g. {"abcd-g"} becomes {"[abcd-g]"}, and {"[]"} becomes {"[[%]]"}). | |||
The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used (e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary characters.]==] | The negative (`^`), range (`-`) and literal (`%`) magic characters work as normal, and character classes may be used | ||
(e.g. `%d` and `%w`), but opening and closing square brackets are sanitized so that they behave like ordinary | |||
characters. | |||
]==] | |||
function get_charset(charset) | function get_charset(charset) | ||
if type(charset) == "number" then | if type(charset) == "number" then | ||
| Line 569: | Line 667: | ||
start = nxt_pos | start = nxt_pos | ||
nxt_pos = nxt_pos + 2 | nxt_pos = nxt_pos + 2 | ||
-- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is omitted if the range would be empty (i.e. if the first byte is greater than the second). | -- Since ranges can't contain "%]", since it's escaped, range inputs like "]-z" or "a-]" must be | ||
-- adjusted to the character before or after, plus "%]" (e.g. "%]^-z" or "a-\\%]"). The escaped "%]" is | |||
-- omitted if the range would be empty (i.e. if the first byte is greater than the second). | |||
else | else | ||
n = n + 1 | n = n + 1 | ||
| Line 683: | Line 783: | ||
end | end | ||
--[==[
A literal-text version of gsub: `pattern` is escaped so that it matches itself exactly, and `repl` is escaped too if it
is a string (non-string replacements are passed through unchanged). Returns the same values as gsub.
]==]
function export.plain_gsub(str, pattern, repl, n)
	local literal_pattern = pattern_escape(pattern)
	if type(repl) == "string" then
		repl = replacement_escape(repl)
	end
	return gsub(str, literal_pattern, repl, n)
end
--[==[
Reverses a UTF-8 string character by character; the UTF-8 counterpart of string.reverse.
]==]
function export.reverse(str)
	-- Reverse the bytes of each multi-byte character first, so that reversing the whole string afterwards puts the
	-- bytes of every character back in their original order.
	local prepared = gsub(str, "[\192-\255][\128-\191]*", reverse)
	return reverse(prepared)
end
-- Exported wrapper which forwards all of its arguments to `u` and returns whatever `u` returns.
-- To be moved to [[Module:string/char]].
function export.char(...)
	return u(...)
end
do | do | ||
-- Raises the "string is not UTF-8" argument error on behalf of the exported function called `func_name`. The error
-- level is 4, so the position reported is three call frames above this helper.
local function utf8_err(func_name)
	local message = format("bad argument #1 to '%s' (string is not UTF-8)", func_name)
	error(message, 4)
end
-- Decodes the UTF-8 character that starts with byte `b1` (`b2` to `b4` are the following bytes, if any). Returns the
-- codepoint and the number of bytes consumed. Calls utf8_err with `func_name` if a required trailing byte is missing
-- or out of range, or if the decoded value is too small for its length (overlong) or above U+10FFFF.
local function get_codepoint(func_name, b1, b2, b3, b4)
	-- 1 byte: ASCII.
	if b1 <= 0x7F then
		return b1, 1
	end
	-- All longer forms need a first trailing byte.
	if not (b2 and b2 >= 0x80 and b2 <= 0xBF) then
		utf8_err(func_name)
	end
	-- 2 bytes. The subtracted constant removes the marker bits of both bytes (0xC0 * 0x40 + 0x80).
	if b1 <= 0xDF then
		local cp = 0x40 * b1 + b2 - 0x3080
		if cp < 0x80 then
			utf8_err(func_name)
		end
		return cp, 2
	end
	if not (b3 and b3 >= 0x80 and b3 <= 0xBF) then
		utf8_err(func_name)
	end
	-- 3 bytes.
	if b1 <= 0xEF then
		local cp = 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080
		if cp < 0x800 then
			utf8_err(func_name)
		end
		return cp, 3
	end
	if not (b4 and b4 >= 0x80 and b4 <= 0xBF) then
		utf8_err(func_name)
	end
	-- 4 bytes.
	local cp = 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080
	if cp < 0x10000 or cp > 0x10FFFF then
		utf8_err(func_name)
	end
	return cp, 4
end
function export.codepoint(str, i, j) | function export.codepoint(str, i, j) | ||
if type(str) == "number" then | if str == "" then | ||
return -- return nothing | |||
elseif type(str) == "number" then | |||
return byte(str, i, j) | return byte(str, i, j) | ||
end | end | ||
i, j = i or 1, j == -1 and #str or i or 1 | i, j = i or 1, j == -1 and #str or i or 1 | ||
if i == 1 and j == 1 then | if i == 1 and j == 1 then | ||
return (get_codepoint(byte(str, 1, 4))) | return (get_codepoint("codepoint", byte(str, 1, 4))) | ||
elseif i < 0 or j < 0 then | elseif i < 0 or j < 0 then | ||
return ucodepoint(str, i, j) -- FIXME | return ucodepoint(str, i, j) -- FIXME | ||
| Line 777: | Line 851: | ||
nr = nr + 1 | nr = nr + 1 | ||
local add | local add | ||
ret[nr], add = get_codepoint(b1, b2, b3, b4) | ret[nr], add = get_codepoint("codepoint", b1, b2, b3, b4) | ||
nb = nb + add | nb = nb + add | ||
end | end | ||
| Line 809: | Line 883: | ||
return nil | return nil | ||
end | end | ||
local ret, add = get_codepoint(b1, b2, b3, b4) | local ret, add = get_codepoint("gcodepoint", b1, b2, b3, b4) | ||
nb = nb + add | nb = nb + add | ||
return ret | return ret | ||
| Line 816: | Line 890: | ||
end | end | ||
do
	local _ulower = ulower

	--[==[
	Lowercases `str`, using string.lower if the string contains no bytes in the range 0x80 to 0xFF, and
	mw.ustring.lower otherwise.
	]==]
	function export.lower(str)
		if match(str, "^()[^\128-\255]*$") then
			return lower(str)
		end
		return _ulower(str)
	end
end
do
	local _uupper = uupper

	--[==[
	Uppercases `str`, using string.upper if the string contains no bytes in the range 0x80 to 0xFF, and
	mw.ustring.upper otherwise.
	]==]
	function export.upper(str)
		if match(str, "^()[^\128-\255]*$") then
			return upper(str)
		end
		return _uupper(str)
	end
end
| Line 841: | Line 923: | ||
end | end | ||
--[==[Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by one character at a time; Python returns the whole remainder of the string). When possible, it will use the string library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal. | --[==[ | ||
Reimplementation of mw.text.split() that includes any capturing groups in the splitting pattern. This works like | |||
Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by | |||
one character at a time; Python returns the whole remainder of the string). When possible, it will use the string | |||
library, but otherwise uses the ustring library. There are two optional parameters: `str_lib` forces use of the | |||
string library, while `plain` turns any pattern matching facilities off, treating `pattern` as literal. | |||
In addition, `pattern` may be a custom find function (or callable table), which takes the input string and start | |||
index as its two arguments, and must return the start and end index of the match, plus any optional captures, or nil | |||
if there are no further matches. By default, the start index will be calculated using the ustring library, unless | |||
`str_lib` or `plain` is set. | |||
]==] | |||
function export.split(str, pattern_or_func, str_lib, plain) | function export.split(str, pattern_or_func, str_lib, plain) | ||
local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0 | local iter, t, n = gsplit(str, pattern_or_func, str_lib, plain), {}, 0 | ||
| Line 854: | Line 945: | ||
end | end | ||
--[==[Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the string up the splitting pattern, with any capture groups being returned as additional values on that iteration.]==] | --[==[ | ||
Returns an iterator function, which iterates over the substrings returned by {split}. The first value returned is the | |||
string up to the splitting pattern, with any capture groups being returned as additional values on that iteration. | |||
]==] | |||
function export.gsplit(str, pattern_or_func, str_lib, plain) | function export.gsplit(str, pattern_or_func, str_lib, plain) | ||
local start, final, str_len, _string, callable = 1 | local start, final, str_len, _string, callable = 1 | ||
| Line 926: | Line 1,020: | ||
end | end | ||
gsplit = export.gsplit | gsplit = export.gsplit | ||
--[==[
Returns the number of matches of `pattern` in `str`, as counted by gsub. If `plain` is set, `pattern` is treated as
literal text. Otherwise, `pattern` is treated as a ustring library pattern: if `pattern_simplifier` can convert it into
an equivalent string library pattern then the (faster) string library is used with the converted pattern, and if not,
the ustring library is used with the original pattern.
]==]
function export.count(str, pattern, plain)
	local _, n
	if plain then
		_, n = gsub(str, pattern_escape(pattern), "")
		return n
	end
	local simple = pattern_simplifier(pattern)
	if simple then
		-- The converted pattern must be used here, not the original: string.gsub works on bytes, so passing it the
		-- unconverted ustring pattern would miscount anything involving non-ASCII characters (e.g. "." would match
		-- each byte of a multi-byte character separately).
		_, n = gsub(str, simple, "")
	else
		_, n = ugsub(str, pattern, "")
	end
	return n
end
function export.trim(str, charset, str_lib, plain) | function export.trim(str, charset, str_lib, plain) | ||
if charset == nil then | if charset == nil then | ||
-- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to "" first. | -- "^.*%S" is the fastest trim algorithm except when strings only consist of characters to be trimmed, which are | ||
-- very slow due to catastrophic backtracking. gsub with "^%s*" gets around this by trimming such strings to "" | |||
-- first. | |||
return match(gsub(str, "^%s*", ""), "^.*%S") or "" | return match(gsub(str, "^%s*", ""), "^.*%S") or "" | ||
elseif charset == "" then | elseif charset == "" then | ||
| Line 935: | Line 1,042: | ||
end | end | ||
charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset) | charset = plain and ("[" .. charset_escape(charset) .. "]") or get_charset(charset) | ||
-- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there would be two callbacks into PHP, which is slower. | -- The pattern uses a non-greedy quantifier instead of the algorithm used for %s, because negative character sets | ||
-- are non-trivial to compute (e.g. "[^^-z]" becomes "[%^_-z]"). Plus, if the ustring library has to be used, there | |||
-- would be two callbacks into PHP, which is slower. | |||
local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$" | local pattern = "^" .. charset .. "*(.-)" .. charset .. "*$" | ||
if not str_lib then | if not str_lib then | ||
| Line 965: | Line 1,074: | ||
cp = match(code, "^()%x+$") and tonumber(code, 16) | cp = match(code, "^()%x+$") and tonumber(code, 16) | ||
end | end | ||
return cp and cp < | return cp and (cp <= 0xD7FF or cp >= 0xE000 and cp <= 0x10FFFF) and u(cp) or nil | ||
end | end | ||
-- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases which have also been included in [[Module:data/entities]]. | -- Non-ASCII characters aren't valid in proper HTML named entities, but MediaWiki uses them in some custom aliases | ||
-- which have also been included in [[Module:data/entities]]. | |||
function export.decode_entities(str) | function export.decode_entities(str) | ||
local amp = find(str, "&", nil, true) | local amp = find(str, "&", nil, true) | ||
| Line 989: | Line 1,099: | ||
return entities | return entities | ||
end | end | ||
-- Returns the HTML entity for the character `ch`, or nil if it has none. Results are looked up in the `entities`
-- table (fetched with get_entities() if not yet set); characters not found there get a decimal numeric entity, which is
-- then stored back in the table. Failures are stored as false so that they are not recomputed.
local function encode_entity(ch)
	local cached = (entities or get_entities())[ch]
	if cached ~= nil then
		return cached or nil
	end
	local cp = codepoint(ch)
	-- U+D800 to U+DFFF are surrogates, so can't be encoded as entities.
	local entity = cp and (cp <= 0xD7FF or cp >= 0xE000) and format("&#%d;", cp) or false
	entities[ch] = entity
	return entity or nil
end
function export.encode_entities(str, charset, str_lib, plain) | function export.encode_entities(str, charset, str_lib, plain) | ||
if charset == nil then | if charset == nil then | ||
| Line 1,040: | Line 1,152: | ||
return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str | return (find(str, "%", nil, true) or find(str, "_", nil, true)) and gsub(str, "([%%_])(%x?%x?)", decode) or str | ||
end | end | ||
error("bad argument #2 to | error("bad argument #2 to 'decode_uri' (expected QUERY, PATH, or WIKI)", 2) | ||
end | end | ||
end | end | ||
| Line 1,068: | Line 1,180: | ||
end | end | ||
--[==[Removes any HTML comments from the input text. `stage` can be one of three options: | --[==[ | ||
* {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed {{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or [[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); if full accuracy is absolutely necessary, use [[Module:template parser]] instead). | Removes any HTML comments from the input text. `stage` can be one of three options: | ||
* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. {{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed {{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, where the {"PRE"} method will have already been applied by the native parser. | * {"PRE"} (default) applies the method used by MediaWiki's preprocessor: all | ||
* {"BOTH"} applies {"PRE"} then {"POST"}.]==] | {{code|html|<nowiki><!-- ... --></nowiki>}} pairs are removed, as well as any text after an unclosed | ||
{{code|html|<nowiki><!--</nowiki>}}. This is generally suitable when parsing raw template or | |||
[[mw:Parser extension tags|parser extension tag]] code. (Note, however, that the actual method used by the | |||
preprocessor is considerably more complex and differs under certain conditions (e.g. comments inside nowiki tags); | |||
if full accuracy is absolutely necessary, use [[Module:template parser]] instead). | |||
* {"POST"} applies the method used to generate the final page output once all templates have been expanded: it loops | |||
over the text, removing any {{code|html|<nowiki><!-- ... --></nowiki>}} pairs until no more are found (e.g. | |||
{{code|html|<nowiki><!-<!-- ... -->- ... --></nowiki>}} would be fully removed), but any unclosed | |||
{{code|html|<nowiki><!--</nowiki>}} is ignored. This is suitable for handling links embedded in template inputs, | |||
where the {"PRE"} method will have already been applied by the native parser. | |||
* {"BOTH"} applies {"PRE"} then {"POST"}. | |||
]==] | |||
function export.remove_comments(str, stage) | function export.remove_comments(str, stage) | ||
if not stage or stage == "PRE" then | if not stage or stage == "PRE" then | ||
| Line 1,078: | Line 1,201: | ||
local processed = stage == "POST" and _remove_comments(str) or | local processed = stage == "POST" and _remove_comments(str) or | ||
stage == "BOTH" and _remove_comments(str, true) or | stage == "BOTH" and _remove_comments(str, true) or | ||
error("bad argument #2 to | error("bad argument #2 to 'remove_comments' (expected PRE, POST, or BOTH)", 2) | ||
while processed ~= str do | while processed ~= str do | ||
str = processed | str = processed | ||
| Line 1,085: | Line 1,208: | ||
return str | return str | ||
end | end | ||
end | end | ||
do | do | ||
local byte_escapes | local byte_escapes | ||
local function get_byte_escapes() | local function get_byte_escapes() | ||
byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil | byte_escapes, get_byte_escapes = load_data("Module:string utilities/data").byte_escapes, nil | ||
| Line 1,164: | Line 1,240: | ||
format_fun = export.format_fun | format_fun = export.format_fun | ||
--[==[This function, unlike {string.format} and {mw.ustring.format}, takes just two | --[==[ | ||
This function, unlike {string.format} and {mw.ustring.format}, takes just two parameters, a format string and a table, | |||
and replaces all instances of { {param_name} } in the format string with the table's entry for {param_name}. The opening | |||
and closing brace characters can be escaped with { {\op} } and { {\cl} }, respectively. A table entry beginning with a | |||
slash can be escaped by doubling the initial slash. | |||
====Examples==== | ====Examples==== | ||
| Line 1,171: | Line 1,251: | ||
* {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})} | * {string_utilities.format("The set {\\op}1, 2, 3{\\cl} contains {\\\\hello} elements.", {["\\hello"]="three"})} | ||
*: produces: {"The set {1, 2, 3} contains three elements."} | *: produces: {"The set {1, 2, 3} contains three elements."} | ||
*:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string.]==] | *:* Note that the single and double backslashes should be entered as double and quadruple backslashes when quoted in a literal string. | ||
]==] | |||
function export.format(str, tbl) | function export.format(str, tbl) | ||
return format_fun(str, function(key) | return format_fun(str, function(key) | ||
| Line 1,180: | Line 1,261: | ||
do
	-- Applies `case_func` to the first character of `str` (one byte plus any UTF-8 continuation bytes that follow it)
	-- and returns the result joined to the unchanged remainder. Returns "" if `str` is empty.
	local function do_uclcfirst(str, case_func)
		local head, tail = match(str, "^(.[\128-\191]*)(.*)")
		if not head then
			return ""
		end
		return case_func(head) .. tail
	end

	-- Shared implementation of ucfirst() and lcfirst(): re-cases the first character of the visible text of `str`,
	-- looking past any leading HTML tags and inside a leading wikilink.
	local function uclcfirst(str, case_func)
		-- Peel off any HTML tags at the beginning, to be re-attached at the end. This currently does not handle
		-- comments or <ref>...</ref> correctly; it's intended for text wrapped in <span> or the like, as happens when
		-- passing text through [[Module:links]].
		local leading_tags
		if str:match("^<") then
			while true do
				local tag, after = str:match("^(<.->)(.*)$")
				if not tag then
					break
				end
				leading_tags = leading_tags or {}
				leading_tags[#leading_tags + 1] = tag
				str = after
			end
		end
		-- If there's a link at the beginning, re-case the first letter of the link text. The pattern matches both
		-- piped and unpiped links; for an unpiped link, the second capture (linktext) is empty, so the link target
		-- doubles as the display text.
		local link, linktext, remainder = match(str, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
		local result
		if link then
			local display = linktext ~= "" and linktext or link
			result = "[[" .. link .. "|" .. do_uclcfirst(display, case_func) .. "]]" .. remainder
		else
			result = do_uclcfirst(str, case_func)
		end
		if leading_tags then
			result = concat(leading_tags) .. result
		end
		return result
	end

	--[==[
	Uppercases the first character of the input string. One-part and two-part links at the start of the string are
	handled correctly, as are leading HTML tags such as `<nowiki><span>...</span></nowiki>` (possibly nested). This
	is intended for text that may include links which have been passed through `full_link()` in [[Module:links]] or
	a similar function.
	]==]
	function export.ucfirst(str)
		return uclcfirst(str, uupper)
	end
	ucfirst = export.ucfirst

	--[==[
	Lowercases the first character of the input string. One-part and two-part links at the start of the string are
	handled correctly, as are leading HTML tags such as `<nowiki><span>...</span></nowiki>` (possibly nested). This
	is intended for text that may include links which have been passed through `full_link()` in [[Module:links]] or
	a similar function.
	]==]
	function export.lcfirst(str)
		return uclcfirst(str, ulower)
	end

	--[==[
	Capitalizes each word of the input string. WARNING: May be broken in the presence of multiword links.
	]==]
	function export.capitalize(str)
		-- Every run of word characters is passed to ucfirst, which uppercases its first letter.
		local capitalized = ugsub(str, "%w+", ucfirst)
		return capitalized
	end

	-- Replacement callback for title_case: uppercases the first character of a word and lowercases the rest.
	local function do_title_case(first, remainder)
		local initial = uupper(first)
		if remainder == "" then
			return initial
		end
		return initial .. ulower(remainder)
	end

	--[==[
	Capitalizes each word of the input string, with any further letters in each word being converted to lowercase.
	]==]
	function export.title_case(str)
		if str == "" then
			return ""
		end
		-- The parentheses discard the substitution count.
		return (ugsub(str, "(%w)(%w*)", do_title_case))
	end
	title_case = export.title_case

	--[==[
	Converts the input string to {{w|Camel case|CamelCase}}. Any non-word characters are treated as breaks between
	words. If `lower_first` is set, then the first character of the string will be lowercase (e.g. camelCase).
	]==]
	function export.camel_case(str, lower_first)
		-- Each run of non-word characters is dropped, and the word after it is put into title case.
		local joined = ugsub(str, "%W*(%w*)", title_case)
		if lower_first then
			return do_uclcfirst(joined, ulower)
		end
		return joined
	end
end
do
	-- Replacement callback for snake_case: a word preceded by a non-empty run of non-word characters gets a single
	-- underscore in place of that run; a word with nothing before it is returned as-is.
	local function do_snake_case(nonword, word)
		if nonword == "" then
			return word
		end
		return "_" .. word
	end

	--[==[
	Converts the input string to {{w|Snake case|snake_case}}. Any non-word characters are treated as breaks between
	words.
	]==]
	function export.snake_case(str)
		-- Only the resulting string is returned; the substitution count is dropped.
		local converted = ugsub(str, "(%W*)(%w*)", do_snake_case)
		return converted
	end
end
return export | return export | ||