-- Module:debug/escape
-- Documentation for this module may be created at Module:debug/escape/doc
-- Name of the UTF-8 validation module. It is only passed to `require` on the
-- first call to `isutf8`, not when this module is loaded.
local string_isutf8_module = "Module:string/isutf8"
-- Local aliases for the library functions and globals used below.
local byte = string.byte
local dump = mw.dumpObject
local error = error
local format = string.format
local gsub = string.gsub
local sub = string.sub
local type = type
-- Lazy loader for the UTF-8 validator. The first call requires the validator
-- module, rebinds the local `isutf8` to what `require` returned so later calls
-- go straight to it, and forwards the arguments.
local function isutf8(...)
	local loaded = require(string_isutf8_module)
	isutf8 = loaded
	return loaded(...)
end
-- Maps each accepted `quotes` flag to the quotation mark characters that get
-- added to the escape pattern. Built on first use.
local quote_options
-- Builds `quote_options`, then clears itself so it can only run once.
local function get_quote_options()
	local options = {}
	options.noquotes = ""
	options.single = "'"
	options.double = '"'
	options.quotes = "'\""
	quote_options = options
	get_quote_options = nil
	return options
end
-- Maps single characters to their named backslash escape sequences. Built on
-- first use.
local escapes
-- Builds `escapes`, then clears itself so it can only run once.
local function get_escapes()
	local named = {
		["\a"] = [[\a]],
		["\b"] = [[\b]],
		["\t"] = [[\t]],
		["\n"] = [[\n]],
		["\v"] = [[\v]],
		["\f"] = [[\f]],
		["\r"] = [[\r]],
		['"'] = [[\"]],
		["'"] = [[\']],
		["\\"] = [[\\]],
	}
	escapes = named
	get_escapes = nil
	return named
end
-- Escapes a single byte: uses the named escape if the byte has one, otherwise
-- a backslash followed by its zero-padded three-digit decimal value.
local function escape_byte(ch)
	local named = (escapes or get_escapes())[ch]
	if named then
		return named
	end
	return format([[\%03d]], byte(ch))
end
-- Escapes every byte of `b` individually with `escape_byte`.
local function escape_bytes(b)
	-- Assign to a single local so that gsub's substitution count is dropped.
	local escaped = gsub(b, ".", escape_byte)
	return escaped
end
-- Given a valid UTF-8 character `ch` and its leading byte `b`, returns either
-- the escaped form of `ch` or `ch` unchanged.
local function maybe_escape_char(ch, b)
	-- Leading byte 0xC2 with a second byte up to 0xA0 covers the control
	-- characters U+0080 to U+009F and the no-break space U+00A0, which are
	-- escaped.
	if b == 0xC2 then
		local second = byte(ch, 2)
		if second <= 0xA0 then
			return escape_bytes(ch)
		end
	end
	-- Every other character passes through untouched.
	return ch
end
-- Processes one matched chunk: an escapable byte followed by any number of
-- continuation bytes (\x80 to \xBF). Valid UTF-8 characters are kept (subject
-- to `maybe_escape_char`); everything else is escaped byte by byte.
local function escape_chunk(chunk)
	local len = #chunk
	-- A lone byte is always escaped.
	if len == 1 then
		return escape_byte(chunk)
	end

	local lead = byte(chunk)
	-- A 1-byte character (\x00 to \x7F) or a byte that can never lead a UTF-8
	-- character (\x80 to \xC1 or \xF5 to \xFF) means the whole chunk is
	-- escaped.
	if lead < 0xC2 or lead > 0xF4 then
		return escape_bytes(chunk)
	end

	-- Length of a UTF-8 character that starts with `lead`.
	local want
	if lead < 0xE0 then
		want = 2
	elseif lead < 0xF0 then
		want = 3
	else
		want = 4
	end

	-- Too short to be a complete character: escape all of it.
	if len < want then
		return escape_bytes(chunk)
	end

	-- The candidate character is the whole chunk when the length is exact, or
	-- the first `want` bytes when there are surplus continuation bytes.
	local exact = len == want
	local head = exact and chunk or sub(chunk, 1, want)

	-- An invalid candidate means the whole chunk is escaped.
	if not isutf8(head) then
		return escape_bytes(chunk)
	end

	local kept = maybe_escape_char(head, lead)
	if exact then
		return kept
	end
	-- Keep the valid character, but escape the surplus bytes after it.
	return kept .. escape_bytes(sub(chunk, want + 1))
end
--[==[
Escapes control characters, backslash, the no-break space, bytes that aren't used in UTF-8 and invalid UTF-8 character sequences. Returns the escaped string.
The optional {quotes} flag controls how quotation marks are handled, which takes a string value:
* {"quotes"}: escapes {'} and {"} (default)
* {"single"}: escapes {'} only
* {"double"}: escapes {"} only
* {"noquotes"}: no quotation mark escapes
Throws an error if {quotes} is any other value.]==]
return function(str, quotes)
	-- `q` holds the quotation mark characters to add to the escape pattern;
	-- it is the empty string (still truthy) for "noquotes".
	local q = (quote_options or get_quote_options())[quotes == nil and "quotes" or quotes]
	if not q then
		local quotes_type = type(quotes)
		-- List every accepted value, including "noquotes", and use level 2
		-- so the error is attributed to the caller that passed the bad flag.
		error('`quotes` must be "quotes", "single", "double", "noquotes" or nil; received ' ..
			(quotes_type == "string" and dump(quotes) or "a " .. quotes_type), 2)
	end
	-- TODO: handle Unicode normalization.
	-- Each match is one escapable byte (a control character, a selected
	-- quotation mark, a backslash, or any byte from \128 to \255) followed by
	-- any run of continuation bytes (\128 to \191); `escape_chunk` decides
	-- what to keep. The parentheses drop gsub's substitution count.
	return (gsub(str, format("[%%c%s\\\128-\255][\128-\191]*", q), escape_chunk))
end