Module:links/data: Difference between revisions

From Linguifex
Jump to navigation Jump to search
No edit summary
Tag: Reverted
No edit summary
 
(4 intermediate revisions by 2 users not shown)
Line 1: Line 1:
local encode = mw.text.encode
local u = mw.ustring.char
local data = {}
local data = {}


data.ignore_cap = {
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
["ko"] = true,
local u = require("Module:string utilities").char
}


data.phonetic_extraction = {
data.phonetic_extraction = {
Line 12: Line 9:
}
}


data.pos_tags = {
data.ignored_prefixes = {
["a"] = "adjective",
["cat"] = true,
["adv"] = "adverb",
["category"] = true,
["int"] = "interjection",
["file"] = true,
["n"] = "noun",
["image"] = true
["pron"] = "pronoun",
}
["v"] = "verb",
 
["vi"] = "intransitive verb",
-- Scheme for using unsupported characters in titles.
["vt"] = "transitive verb",
data.unsupported_characters = {
["vti"] = "transitive and intransitive verb",
["#"] = "`num`",
["%"] = "`percnt`", -- only escaped in percent encoding
["&"] = "`amp`", -- only escaped in HTML entities
["."] = "`period`", -- only escaped in dot-slash notation
["<"] = "`lt`",
[">"] = "`gt`",
["["] = "`lsqb`",
["]"] = "`rsqb`",
["_"] = "`lowbar`",
["`"] = "`grave`", -- used to enclose unsupported characters in the scheme, so a raw use in an unsupported title must be escaped to prevent interference
["{"] = "`lcub`",
["|"] = "`vert`",
["}"] = "`rcub`",
["~"] = "`tilde`", -- only escaped when 3 or more are consecutive
["\239\191\189"] = "`repl`" -- replacement character U+FFFD, which can't be typed directly here due to an abuse filter
}
}


--[[ The "actual title" is the page name with the prefix "Unsupported titles/" removed.
-- Manually specified unsupported titles. Only put titles here if there is a different reason why they are unsupported, and not just because they contain one of the unsupported characters above.
["displayed_title"] = "actual title" ]]
data.unsupported_titles = {
data.unsupported_titles = {
[" "] = "Space",
[" "] = "Space",
["# #"] = "Enclosing number signs",
["&amp;"] = "`amp`amp;",
["#"] = "Number sign",
["#MeToo"] = "MeToo",
["#MeTooed"] = "MeTooed",
["#MeTooing"] = "MeTooing",
["#MeToos"] = "MeToos",
["&amp;"] = "Amp",
["¯\\_(ツ)_/¯"] = "¯\\ (ツ) /¯",
["¯_(ツ)_/¯"] = "¯ (ツ) /¯",
["(^_^)"] = "(^ ^)",
["*_*"] = "* *",
["."] = "Full stop",
[".."] = "Double period",
["./."] = "Period slash period",
[": :"] = ": :",
[":"] = ":",
[":-{"] = "Colon hyphen left curly bracket",
[":{"] = "Colon left curly bracket",
[":|"] = "Colon vertical line",
["=_="] = "= =",
["[ ]"] = "Square brackets",
["["] = "Left square bracket",
["[…]"] = "Square bracketed ellipsis",
["[...]"] = "Left square bracket ... right square bracket",
["[-0-]"] = "Australian Aboriginal Flag emoticon alternative",
["[citation needed]"] = "Square bracketed citation needed",
["[-o-]"] = "Australian Aboriginal Flag emoticon",
["]"] = "Right square bracket",
["^_^"] = "^ ^",
["_ _"] = "Underscore space underscore",
["-_-"] = "- -",
["_"] = "Underscore",
["{ }"] = "Curly brackets",
["{"] = "Left curly bracket",
["| |"] = "Enclosing vertical lines",
["|"] = "Vertical line",
["-||-"] = "Hyphen vertical line vertical line hyphen",
["||"] = "Vertical line vertical line",
["}"] = "Right curly bracket",
["</s>"] = "End s tag",
["< />"] = "Less than trailing slash greater than",
["< > </ >"] = "HTML start tag end tag",
["< >"] = "Enclosing less than greater than",
["<!-- -->"] = "HTML comment",
["<-"] = "Less than hyphen",
["<"] = "Less than",
["</3"] = "Less than slash three",
["<\\3"] = "Less than backslash three",
["<<"] = "Double less than",
["<<<"] = "Triple less than",
["<="] = "Less than equal",
["<>"] = "Less than greater than",
["<3"] = "Less than three",
["<g>"] = "g tag",
["=<"] = "Equal less than",
["=>"] = "Equal greater than",
[">"] = "Greater than",
["->"] = "Hyphen greater than",
[">_<"] = "Greater than low line less than",
[">="] = "Greater than equal",
[">>"] = "Double greater than",
[">>>"] = "Triple greater than",
["×_×"] = "× ×",
["9_9"] = "9 9",
["C#"] = "C sharp",
["C|N>K"] = "C through N to K",
["eq #"] = "eq number sign",
["f##k"] = "f double number sign k",
["f##ked"] = "f double number sign ked",
["f##king"] = "f double number sign king",
["f##ks"] = "f double number sign ks",
["hr #"] = "hr number sign",
["n_n"] = "n n",
["O_O"] = "O O",
["O_o"] = "O o",
["o_O"] = "o O",
["o_o"] = "o o",
["snake_case"] = "snake case",
["T_T"] = "T T",
["u_u"] = "u u",
["X_X"] = "X X",
["x_x"] = "x x",
["x86_64"] = "x86 64",
["λοπαδοτεμαχοσελαχογαλεοκρανιολειψανοδριμυποτριμματοσιλφιοκαραβομελιτοκατακεχυμενοκιχλεπικοσσυφοφαττοπεριστεραλεκτρυονοπτοκεφαλλιοκιγκλοπελειολαγῳοσιραιοβαφητραγανοπτερύγων"] = "Ancient Greek dish",
["λοπαδοτεμαχοσελαχογαλεοκρανιολειψανοδριμυποτριμματοσιλφιοκαραβομελιτοκατακεχυμενοκιχλεπικοσσυφοφαττοπεριστεραλεκτρυονοπτοκεφαλλιοκιγκλοπελειολαγῳοσιραιοβαφητραγανοπτερύγων"] = "Ancient Greek dish",
["о/."] = "о slash dot",
["ಠ_ಠ"] = "ಠ ಠ",
["ಥ_ಥ"] = "ಥ ಥ",
["┬─┬ノ( º _ ºノ)"] = "┬─┬ノ( º ºノ)",
["กรุงเทพมหานคร อมรรัตนโกสินทร์ มหินทรายุธยา มหาดิลกภพ นพรัตนราชธานีบูรีรมย์ อุดมราชนิเวศน์มหาสถาน อมรพิมานอวตารสถิต สักกะทัตติยวิษณุกรรมประสิทธิ์"] = "Thai name of Bangkok",
["กรุงเทพมหานคร อมรรัตนโกสินทร์ มหินทรายุธยา มหาดิลกภพ นพรัตนราชธานีบูรีรมย์ อุดมราชนิเวศน์มหาสถาน อมรพิมานอวตารสถิต สักกะทัตติยวิษณุกรรมประสิทธิ์"] = "Thai name of Bangkok",
[u(0x1680)] = "Ogham space",
[u(0x1680)] = "Ogham space",
[u(0x3000)] = "Ideographic space",
[u(0x3000)] = "Ideographic space"
[u(0xFFFD)] = "Replacement character",
}
}


data.display_change = {
-- Mammoth pages contain only Translingual and English entries, if present. The remaining L2s are placed on subpages.
[" "] = "] [", -- Space
-- The same subpage titles are used across all mammoth pages for the convenience of bot and script operators.
[u(0x00A0)] = "]" .. u(0x00A0) .. "[", -- No-break space
-- Assuming that most mammoth pages will be Latin-script terms, the subpage groupings are determined by dividing the
[u(0x180E)] = "]" .. u(0x180E) .. "[", -- Mongolian vowel separator
-- list of Latin-script languages known to Wiktionary into two (three, ...) roughly equal alphabetic divisions. This is
[u(0x2000)] = "]" .. u(0x2000) .. "[", -- En quad
-- easily done by looking at Petscan's output:
[u(0x2001)] = "]" .. u(0x2001) .. "[", -- Em quad
-- https://petscan.wmcloud.org/?sortby=title&language=en&ns%5B14%5D=1&categories=Latin+script+languages&project=wiktionary&doit=
[u(0x2002)] = "]" .. u(0x2002) .. "[", -- En space
-- This data structure contains types of splits, each of which is a list of names of splits and Lua patterns applied to
[u(0x2003)] = "]" .. u(0x2003) .. "[", -- Em space
-- the decomposed L2 name (with apostrophes and double quotes removed and certain other transformations applied; see
[u(0x2004)] = "]" .. u(0x2004) .. "[", -- Three-per-em space
-- get_L2_sort_key() in [[Module:headword/page]]), or "true" for the final catch-all subpage (which includes anything
[u(0x2005)] = "]" .. u(0x2005) .. "[", -- Four-per-em space
-- not beginning with a Latin letter after the transformations are applied; this includes e.g. ǃKung but not 'Are'are,
[u(0x2006)] = "]" .. u(0x2006) .. "[", -- Six-per-em space
-- which sorts with A, and not Àhàn, which likewise sorts with A). The patterns must be suitable for use with plain
[u(0x2007)] = "]" .. u(0x2007) .. "[", -- Figure space
-- string functions, not their mw.ustring equivalents.
[u(0x2008)] = "]" .. u(0x2008) .. "[", -- Punctuation space
data.mammoth_page_subpage_types = {
[u(0x2009)] = "]" .. u(0x2009) .. "[", -- Thin space
twos = {
[u(0x200A)] = "]" .. u(0x200A) .. "[", -- Hair space
{"languages A to L", "^[A-L]"},
[u(0x202F)] = "]" .. u(0x202F) .. "[", -- Narrow no-break space
{"languages M to Z", true},
[u(0x205F)] = "]" .. u(0x205F) .. "[", -- Medium mathematical space
},
[u(0x3000)] = "]" .. u(0x3000) .. "[", -- Ideographic space
threes = {
{"languages A to I", "^[A-I]"},
{"languages J to Q", "^[J-Q]"},
{"languages R to Z", true},
},
CJK = {
{"languages A to C", "^[A-C]"}, -- Translingual and Chinese on one page
{"languages D to Z", true}, -- all the remainder (mostly Japanese, Korean, Vietnamese) on the other
},
}
}


-- Valid URI schemes in external links, which therefore have to be escaped if used in entry names (e.g. [[sms:a]]).
-- "Mammoth pages" are pages whose entries cannot be housed on a single page because of MediaWiki limits. The key is
local uri_schemes = {
-- the page and the value is the subpage type, as defined above in `mammoth_page_subpage_types`.
"bitcoin:",
data.mammoth_pages = {
"ftp://",
["mammoth page test"] = "twos",   -- required for testing purposes - please leave here
"ftps://",
"geo:",
"git://",
"gopher://",
"http://",
"https://",
"irc:",
"ircs:",
"magnet:",
"mailto:",
"mms://",
"news:",
"nntp://",
"redis://",
"sftp://",
"sip:",
"sips:",
"sms:",
"ssh://",
"svn://",
"tel:",
"telnet://",
"urn:",
"worldwind://",
"xmpp:",
}
}
-- Convert into lookup table.
local uri_lookup = {}
for _, scheme in ipairs(uri_schemes) do
uri_lookup[scheme] = encode(scheme, ":")
end
data.uri_schemes = uri_lookup


return data
return data

Latest revision as of 15:11, 29 April 2026



-- Table that collects every exported field; it is what the module returns at the end of the file.
local data = {}

-- NOTE(review): `unpack` is bound here but is not referenced anywhere else in this file as shown.
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
-- `u` is the `char` function of [[Module:string utilities]]. It is called below with a numeric code point
-- (e.g. u(0x1680)) to produce a table key; presumably it returns that code point's character — confirm in that module.
local u = require("Module:string utilities").char

-- Maps a short code to the name of a module.
-- NOTE(review): the keys look like language codes (th, km); how consumers use the named module is not visible here.
data.phonetic_extraction = {
	th = "Module:th",
	km = "Module:km",
}

-- Set of lowercase prefixes, stored as prefix -> true so that membership is a single table lookup.
-- NOTE(review): presumably link prefixes that callers skip over — confirm against the consuming module.
data.ignored_prefixes = {
	cat = true,
	category = true,
	file = true,
	image = true,
}

-- Scheme for using unsupported characters in titles.
-- Each key is one raw character; each value is the name that stands in for it, enclosed in backticks.
-- The trailing comments record the conditions under which a character is escaped at all.
data.unsupported_characters = {
	["#"] = "`num`",
	["%"] = "`percnt`", -- only escaped in percent encoding
	["&"] = "`amp`", -- only escaped in HTML entities
	["."] = "`period`", -- only escaped in dot-slash notation
	["<"] = "`lt`",
	[">"] = "`gt`",
	["["] = "`lsqb`",
	["]"] = "`rsqb`",
	["_"] = "`lowbar`",
	["`"] = "`grave`", -- used to enclose unsupported characters in the scheme, so a raw use in an unsupported title must be escaped to prevent interference
	["{"] = "`lcub`",
	["|"] = "`vert`",
	["}"] = "`rcub`",
	["~"] = "`tilde`", -- only escaped when 3 or more are consecutive
	-- The key below is written as three decimal byte escapes: the UTF-8 encoding of U+FFFD.
	["\239\191\189"] = "`repl`" -- replacement character U+FFFD, which can't be typed directly here due to an abuse filter
}

-- Manually specified unsupported titles. Only put titles here if there is a different reason why they are unsupported, and not just because they contain one of the unsupported characters above.
-- Each key is the title text itself; each value is the string recorded for that title.
-- Keys written as u(0x....) are computed when the module loads, using the `u` helper bound at the top of the file.
data.unsupported_titles = {
	[" "] = "Space",
	["&amp;"] = "`amp`amp;", -- the value spells "&" as `amp`, the same name data.unsupported_characters gives for "&"
	["λοπαδοτεμαχοσελαχογαλεοκρανιολειψανοδριμυποτριμματοσιλφιοκαραβομελιτοκατακεχυμενοκιχλεπικοσσυφοφαττοπεριστεραλεκτρυονοπτοκεφαλλιοκιγκλοπελειολαγῳοσιραιοβαφητραγανοπτερύγων"] = "Ancient Greek dish",
	["กรุงเทพมหานคร อมรรัตนโกสินทร์ มหินทรายุธยา มหาดิลกภพ นพรัตนราชธานีบูรีรมย์ อุดมราชนิเวศน์มหาสถาน อมรพิมานอวตารสถิต สักกะทัตติยวิษณุกรรมประสิทธิ์"] = "Thai name of Bangkok",
	[u(0x1680)] = "Ogham space",
	[u(0x3000)] = "Ideographic space"
}

-- Split schemes for mammoth pages. A mammoth page keeps only its Translingual and English entries (when present);
-- every other L2 is placed on a subpage. The same subpage titles are used on all mammoth pages, for the convenience
-- of bot and script operators.
--
-- The groupings assume that most mammoth pages are Latin-script terms: the list of Latin-script languages known to
-- Wiktionary is divided into two (three, ...) roughly equal alphabetic ranges, which is easily done from Petscan's
-- output:
-- https://petscan.wmcloud.org/?sortby=title&language=en&ns%5B14%5D=1&categories=Latin+script+languages&project=wiktionary&doit=
--
-- Each scheme is a list of {subpage name, matcher} pairs. The matcher is either a Lua pattern applied to the decomposed
-- L2 name (with apostrophes and double quotes removed and certain other transformations applied; see
-- get_L2_sort_key() in [[Module:headword/page]]), or `true` for the final catch-all subpage. The catch-all also takes
-- anything that does not begin with a Latin letter after those transformations, e.g. ǃKung; 'Are'are and Àhàn are not
-- such cases, since both sort with A. The patterns must be suitable for the plain string functions, not their
-- mw.ustring equivalents.
local CATCH_ALL = true -- matcher value used for the last subpage of every scheme

data.mammoth_page_subpage_types = {
	twos = {
		{"languages A to L", "^[A-L]"},
		{"languages M to Z", CATCH_ALL},
	},
	threes = {
		{"languages A to I", "^[A-I]"},
		{"languages J to Q", "^[J-Q]"},
		{"languages R to Z", CATCH_ALL},
	},
	CJK = {
		-- Translingual and Chinese share the first subpage.
		{"languages A to C", "^[A-C]"},
		-- Everything else (mostly Japanese, Korean, Vietnamese) goes on the second.
		{"languages D to Z", CATCH_ALL},
	},
}

-- "Mammoth pages" are pages whose entries cannot be housed on a single page because of MediaWiki limits. The key is
-- the page and the value is the subpage type, as defined above in `mammoth_page_subpage_types`.
-- Each value is therefore expected to be one of that table's keys ("twos", "threes" or "CJK").
data.mammoth_pages = {
	["mammoth page test"] = "twos",   -- required for testing purposes - please leave here
}

-- Hand the assembled table back as the module's value.
return data