Module:debug/escape

From Linguifex
Jump to navigation Jump to search

Documentation for this module may be created at Module:debug/escape/doc

local string_isutf8_module = "Module:string/isutf8"

local byte = string.byte
local dump = mw.dumpObject
local error = error
local format = string.format
local gsub = string.gsub
local sub = string.sub
local type = type

-- Lazy loader for the UTF-8 validity check. The first call requires
-- `string_isutf8_module`, rebinds the local `isutf8` to whatever that module
-- returns, and forwards its arguments; every later call goes straight to the
-- loaded function.
local function isutf8(...)
	local loaded = require(string_isutf8_module)
	isutf8 = loaded
	return loaded(...)
end

-- Cache for the table mapping each accepted `quotes` flag to the quotation
-- mark characters that should be escaped for it.
local quote_options
-- Builds the table on first use, stores it in `quote_options`, and then
-- discards itself (sets `get_quote_options` to nil) so it can only run once.
local function get_quote_options()
	local options = {
		double = '"',
		noquotes = "",
		quotes = "'\"",
		single = "'",
	}
	quote_options = options
	get_quote_options = nil
	return options
end

-- Cache for the table mapping single characters to their named backslash
-- escape sequences.
local escapes
-- Builds the table on first use, stores it in `escapes`, and then discards
-- itself (sets `get_escapes` to nil) so it can only run once.
local function get_escapes()
	local map = {
		["\a"] = [[\a]],
		["\b"] = [[\b]],
		["\t"] = [[\t]],
		["\n"] = [[\n]],
		["\v"] = [[\v]],
		["\f"] = [[\f]],
		["\r"] = [[\r]],
		['"'] = [[\"]],
		["'"] = [[\']],
		["\\"] = [[\\]],
	}
	escapes = map
	get_escapes = nil
	return map
end

-- Escapes a single byte: uses the named escape from the `escapes` table when
-- one exists, otherwise a backslash followed by the byte value as a
-- zero-padded three-digit decimal.
local function escape_byte(ch)
	local named = (escapes or get_escapes())[ch]
	if named then
		return named
	end
	return format([[\%03d]], byte(ch))
end

-- Escapes every byte of `b` individually via `escape_byte` and returns the
-- resulting string (the substitution count from `gsub` is dropped).
local function escape_bytes(b)
	local escaped = gsub(b, ".", escape_byte)
	return escaped
end

-- Given a valid UTF-8 character `ch` and its leading byte `b`, returns `ch`
-- unchanged unless it is one of the control characters U+0080 to U+009F or
-- the no-break space U+00A0, in which case its bytes are escaped.
local function maybe_escape_char(ch, b)
	-- Those code points are exactly the two-byte sequences with leading byte
	-- 0xC2 and a second byte of at most 0xA0; anything else passes through.
	if b ~= 0xC2 or byte(ch, 2) > 0xA0 then
		return ch
	end
	return escape_bytes(ch)
end

-- Processes one character-like raw chunk of escapable bytes: a valid UTF-8
-- character at the start of the chunk is kept (subject to
-- `maybe_escape_char`), and everything else is escaped byte by byte.
local function escape_chunk(chunk)
	local len = #chunk
	-- A lone byte can never be a multi-byte character.
	if len == 1 then
		return escape_byte(chunk)
	end
	local lead = byte(chunk)
	-- Leading bytes \x00 to \x7F are 1-byte characters, and \x80 to \xC1 or
	-- \xF5 to \xFF cannot start a UTF-8 character: escape the whole chunk.
	if lead < 0xC2 or lead > 0xF4 then
		return escape_bytes(chunk)
	end
	-- Length of a UTF-8 character that starts with `lead`.
	local want
	if lead < 0xE0 then
		want = 2
	elseif lead < 0xF0 then
		want = 3
	else
		want = 4
	end
	-- Too short to be a complete character.
	if len < want then
		return escape_bytes(chunk)
	end
	-- The candidate character is the whole chunk when the lengths match, or
	-- its first `want` bytes when the chunk is longer.
	local head = len == want and chunk or sub(chunk, 1, want)
	if not isutf8(head) then
		return escape_bytes(chunk)
	end
	local kept = maybe_escape_char(head, lead)
	if len == want then
		return kept
	end
	-- Keep the valid character intact, but escape the trailing bytes.
	return kept .. escape_bytes(sub(chunk, want + 1))
end

--[==[
Escapes control characters, backslash, the no-break space, bytes that aren't used in UTF-8 and invalid UTF-8 character sequences.

The optional {quotes} flag controls how quotation marks are handled, which takes a string value:
* {"quotes"}: escapes {'} and {"} (default)
* {"single"}: escapes {'} only
* {"double"}: escapes {"} only
* {"noquotes"}: no quotation mark escapes

Throws an error if {quotes} is any other value.]==]
return function(str, quotes)
	-- A nil flag selects the default "quotes" entry; any other value is looked
	-- up as-is, so unknown values yield nil and fall into the error below.
	local q = (quote_options or get_quote_options())[quotes == nil and "quotes" or quotes]
	if not q then
		local quotes_type = type(quotes)
		-- The message lists every accepted value, including "noquotes".
		error('`quotes` must be "quotes", "single", "double", "noquotes" or nil; received ' ..
			(quotes_type == "string" and dump(quotes) or "a " .. quotes_type))
	end
	-- TODO: handle Unicode normalization.
	-- Each match is one escapable byte (a control character, one of the
	-- selected quotation marks, a backslash, or any byte from \128 to \255)
	-- together with any bytes in the range \128 to \191 that follow it. The
	-- outer parentheses drop the substitution count returned by `gsub`.
	return (gsub(str, format("[%%c%s\\\128-\255][\128-\191]*", q), escape_chunk))
end