Module:Cyrs-translit: Difference between revisions
Jump to navigation
Jump to search
Created page with "local export = {} local numbers = mw.loadData("Module:Cyrs-translit/numbers") local ugsub = mw.ustring.gsub local toNFC = mw.ustring.toNFC local toNFD = mw.ustring.toNFD local U = mw.ustring.char local umatch = mw.ustring.match local usub = mw.ustring.sub local ulower = mw.ustring.lower local acute = U(0x301) local grave = U(0x300) local circumflex = U(0x302) local palatalization = U(0x0484) local titlo = U(0x0483) local dasia = U(0x0485) local psili = U(0x0486) local..." |
m 1 revision imported |
||
| (One intermediate revision by one other user not shown) | |||
| Line 14: | Line 14: | ||
local grave = U(0x300) | local grave = U(0x300) | ||
local circumflex = U(0x302) | local circumflex = U(0x302) | ||
local kamora = U(0x0311) | |||
local palatalization = U(0x0484) | local palatalization = U(0x0484) | ||
local titlo = U(0x0483) | local titlo = U(0x0483) | ||
| Line 22: | Line 23: | ||
local breathing = psili .. dasia | local breathing = psili .. dasia | ||
local accent = "[" .. acute .. grave .. circumflex .. breathing .. "]*" | local accent = "[" .. acute .. grave .. circumflex .. breathing .. "]*" | ||
local vowels = " | local vowels = "aAæÆeEiIoOœŒꝏꝎuUyY" | ||
local vowel_or_soft = "[" .. vowels .. "ʹ]" | local vowel_or_soft = "[" .. vowels .. "ʹ]" | ||
| Line 38: | Line 39: | ||
["І"] = 'I', ["і"] = 'i', -- Contrastive with "И". | ["І"] = 'I', ["і"] = 'i', -- Contrastive with "И". | ||
["Й"] = 'J', ["й"] = 'j', | ["Й"] = 'J', ["й"] = 'j', | ||
["Ꙉ"] = ' | ["Ꙉ"] = 'Ǵ', ["ꙉ"] = 'ǵ', | ||
["К"] = 'K', ["к"] = 'k', | ["К"] = 'K', ["к"] = 'k', | ||
["Л"] = 'L', ["л"] = 'l', | ["Л"] = 'L', ["л"] = 'l', | ||
| Line 44: | Line 45: | ||
["Н"] = 'N', ["н"] = 'n', | ["Н"] = 'N', ["н"] = 'n', | ||
["О"] = 'O', ["о"] = 'o', | ["О"] = 'O', ["о"] = 'o', | ||
["Ꚙ"] = 'Ꝏ', ["ꚙ"] = 'ꝏ', | |||
["П"] = 'P', ["п"] = 'p', | ["П"] = 'P', ["п"] = 'p', | ||
["Р"] = 'R', ["р"] = 'r', | ["Р"] = 'R', ["р"] = 'r', | ||
["С"] = 'S', ["с"] = 's', | ["С"] = 'S', ["с"] = 's', | ||
["Т"] = 'T', ["т"] = 't', | ["Т"] = 'T', ["т"] = 't', | ||
["У"] = 'U', ["у"] = 'u', | |||
["Ꙋ"] = 'U', ["ꙋ"] = 'u', | ["Ꙋ"] = 'U', ["ꙋ"] = 'u', | ||
["Ф"] = 'F', ["ф"] = 'f', | ["Ф"] = 'F', ["ф"] = 'f', | ||
["Х"] = 'X', ["х"] = 'x', | ["Х"] = 'X', ["х"] = 'x', | ||
| Line 58: | Line 60: | ||
["Ꙡ"] = 'Ć', ["ꙡ"] = 'ć', -- From a merger of "Ц" and "Ч" in Old Novgorodian. | ["Ꙡ"] = 'Ć', ["ꙡ"] = 'ć', -- From a merger of "Ц" and "Ч" in Old Novgorodian. | ||
["Ч"] = 'Č', ["ч"] = 'č', | ["Ч"] = 'Č', ["ч"] = 'č', | ||
["Џ"] = 'Dž', ["џ"] = 'dž', | |||
["Ш"] = 'Š', ["ш"] = 'š', | ["Ш"] = 'Š', ["ш"] = 'š', | ||
["Щ"] = 'Št', ["щ"] = 'št', | ["Щ"] = 'Št', ["щ"] = 'št', | ||
| Line 65: | Line 68: | ||
["Ѣ"] = 'Ě', ["ѣ"] = 'ě', | ["Ѣ"] = 'Ě', ["ѣ"] = 'ě', | ||
["Ꙓ"] = 'Jě', ["ꙓ"] = 'jě', | ["Ꙓ"] = 'Jě', ["ꙓ"] = 'jě', | ||
["Ю"] = 'Ju', ["ю"] = 'ju', | |||
["Ꙗ"] = 'Ja', ["ꙗ"] = 'ja', | ["Ꙗ"] = 'Ja', ["ꙗ"] = 'ja', | ||
["Ѥ"] = 'Je', ["ѥ"] = 'je', | ["Ѥ"] = 'Je', ["ѥ"] = 'je', | ||
[" | ["Ѧ"] = 'Ę', ["ѧ"] = 'ę', | ||
["Ѫ"] = 'Ǫ', ["ѫ"] = 'ǫ', | ["Ѫ"] = 'Ǫ', ["ѫ"] = 'ǫ', | ||
["Ѩ"] = 'Ję', ["ѩ"] = 'ję', | |||
["Ѭ"] = 'Jǫ', ["ѭ"] = 'jǫ', | ["Ѭ"] = 'Jǫ', ["ѭ"] = 'jǫ', | ||
[" | ["Ꙛ"] = 'Œ', ["ꙛ"] = 'œ', -- Becomes "œ̨". | ||
["Ѯ"] = 'Ks', ["ѯ"] = 'ks', | ["Ѯ"] = 'Ks', ["ѯ"] = 'ks', | ||
["Ѱ"] = 'Ps', ["ѱ"] = 'ps', | ["Ѱ"] = 'Ps', ["ѱ"] = 'ps', | ||
| Line 78: | Line 82: | ||
["Ѷ"] = 'Ü', ["ѷ"] = 'ü', -- Contrastive with "Ѵ". | ["Ѷ"] = 'Ü', ["ѷ"] = 'ü', -- Contrastive with "Ѵ". | ||
["Ҁ"] = 'Q', ["ҁ"] = 'q', | ["Ҁ"] = 'Q', ["ҁ"] = 'q', | ||
[psili] = '', | |||
[kamora] = circumflex, | |||
} | } | ||
| Line 83: | Line 89: | ||
["ᲀ"] = 'в', | ["ᲀ"] = 'в', | ||
["Ґ"] = 'Г', ["ґ"] = 'г', | ["Ґ"] = 'Г', ["ґ"] = 'г', | ||
-- ["Ђ"] = 'Ꙉ', ["ђ"] = 'ꙉ', | |||
["ᲁ"] = 'д', | ["ᲁ"] = 'д', | ||
["Ꙣ"] = 'Д' .. palatalization, ["ꙣ"] = 'д' .. palatalization, | ["Ꙣ"] = 'Д' .. palatalization, ["ꙣ"] = 'д' .. palatalization, | ||
| Line 93: | Line 100: | ||
["Ї"] = 'І', ["ї"] = 'і', | ["Ї"] = 'І', ["ї"] = 'і', | ||
["Ꙇ"] = 'І', ["ꙇ"] = 'і', | ["Ꙇ"] = 'І', ["ꙇ"] = 'і', | ||
-- ["Ћ"] = 'Ꙉ', ["ћ"] = 'ꙉ', | |||
["Ꙥ"] = 'Л' .. palatalization, ["ꙥ"] = 'л' .. palatalization, | ["Ꙥ"] = 'Л' .. palatalization, ["ꙥ"] = 'л' .. palatalization, | ||
["Ꙧ"] = 'М' .. palatalization, ["ꙧ"] = 'м' .. palatalization, | ["Ꙧ"] = 'М' .. palatalization, ["ꙧ"] = 'м' .. palatalization, | ||
| Line 100: | Line 108: | ||
["Ꙫ"] = 'О', ["ꙫ"] = 'о', | ["Ꙫ"] = 'О', ["ꙫ"] = 'о', | ||
["Ꚛ"] = 'О', ["ꚛ"] = 'о', | ["Ꚛ"] = 'О', ["ꚛ"] = 'о', | ||
["Ꙭ"] = 'Ꚙ', ["ꙭ"] = 'ꚙ', | |||
["Ꙭ"] = ' | |||
["ꙮ"] = 'о', | ["ꙮ"] = 'о', | ||
["ᲂ"] = 'о', | ["ᲂ"] = 'о', | ||
| Line 114: | Line 121: | ||
["Я"] = 'Ꙗ', ["я"] = 'ꙗ', | ["Я"] = 'Ꙗ', ["я"] = 'ꙗ', | ||
["Ꙕ"] = 'Ю', ["ꙕ"] = 'ю', | ["Ꙕ"] = 'Ю', ["ꙕ"] = 'ю', | ||
["Ꙙ"] = 'Ѧ', ["ꙙ"] = 'ѧ', | ["Ꙙ"] = 'Ѧ', ["ꙙ"] = 'ѧ', | ||
["Ꙝ"] = 'Ѩ', ["ꙝ"] = 'ѩ', | ["Ꙝ"] = 'Ѩ', ["ꙝ"] = 'ѩ', | ||
} | |||
-- A second round of substitutions, e.g. if the final output isn't a precomposed character, but needs to behave like one during processing. | |||
local final_substitutions = { | |||
["Œ"] = "Œ̨", ["œ"] = "œ̨", | |||
[titlo] = ":", [vzmet] = ":" | |||
} | } | ||
| Line 149: | Line 161: | ||
-- Old Novgorodian | -- Old Novgorodian | ||
lang_letters["zle-ono"] = setmetatable({ | lang_letters["zle-ono"] = setmetatable({ | ||
["Ц"] = ' | ["Ц"] = 'Cʹ', ["ц"] = 'cʹ', | ||
["Ч"] = ' | ["Ч"] = 'Cʹ', ["ч"] = 'cʹ', | ||
["Щ"] = ' | ["Щ"] = 'Sʹcʹ', ["щ"] = 'sʹcʹ', | ||
}, {__index = common_letters}) | }, {__index = common_letters}) | ||
| Line 159: | Line 171: | ||
-- Old Pskovian | -- Old Pskovian | ||
lang_letters["zle-ops"] = setmetatable({ -- In addition to zle-ono above. | lang_letters["zle-ops"] = setmetatable({ -- In addition to zle-ono above. | ||
["Ж"] = ' | ["Ж"] = 'Zʹ', ["ж"] = 'zʹ', | ||
["Ѕ"] = ' | ["Ѕ"] = 'Dzʹ', ["ѕ"] = 'dzʹ', | ||
["З"] = ' | ["З"] = 'Zʹ', ["з"] = 'zʹ', | ||
["С"] = ' | ["С"] = 'Sʹ', ["с"] = 'sʹ', | ||
["Ш"] = ' | ["Џ"] = 'Dzʹ', ["џ"] = 'dzʹ', | ||
["Щ"] = ' | ["Ш"] = 'Sʹ', ["ш"] = 'sʹ', | ||
["Щ"] = 'Sʹk', ["щ"] = 'sʹk', | |||
["Ѣ"] = 'Æ', ["ѣ"] = 'æ', | |||
["Ꙓ"] = 'Jæ', ["ꙓ"] = 'jæ', | |||
}, {__index = lang_letters["zle-ono"]}) | }, {__index = lang_letters["zle-ono"]}) | ||
| Line 185: | Line 200: | ||
end | end | ||
local function | local function handle_rough_breathing(base1, base2, diacritics) | ||
-- | -- Mark rough breathing with "h". | ||
if | local base2_lower = ulower(base2) | ||
return | if not vowels:match(base2_lower) then | ||
return base1 .. base2 .. diacritics .. "h" | |||
end | end | ||
local base1_lower = ulower(base1) | |||
if not vowels:match(base1_lower) then | |||
return ( | return base1 .. (base2_lower == base2 and "h" or "H") .. base2_lower .. diacritics | ||
end | |||
return (base1_lower == base1 and "h" or "H") .. base1 .. base2 .. diacritics | |||
end | end | ||
| Line 268: | Line 286: | ||
-- In some languages, treat "уо" ("uo") as "у" ("u"). | -- In some languages, treat "уо" ("uo") as "у" ("u"). | ||
if uo_is_u[lang] then | if uo_is_u[lang] then | ||
-- Not "ꚙ", which is an orthographically doubled "о". | |||
text = ugsub(text, "([уУѵѴѷѶ]" .. accent .. ")[оО]", "%1") | text = ugsub(text, "([уУѵѴѷѶ]" .. accent .. ")[оО]", "%1") | ||
end | end | ||
-- Treat "оу" ("ou") as "у" ("u"). | -- Treat "оу" ("ou") as "у" ("u") (but not "ꚙ"). | ||
text = ugsub(text, "([оО])(" .. accent .. ")[уУѵѴѷѶ]", handle_ou) | text = ugsub(text, "([оО])(" .. accent .. ")[уУѵѴѷѶ]", handle_ou) | ||
| Line 282: | Line 301: | ||
text = text:gsub(".[\128-\191]*", letters) | text = text:gsub(".[\128-\191]*", letters) | ||
-- Handle any breathing marks. | -- Handle any rough breathing marks. | ||
text = ugsub(toNFD(text), "([ | -- FIXME: this can't handle various edge cases. | ||
text = ugsub(toNFD(text), "(%w)(%w?)([^%w%s]*)[" .. dasia .. "]", handle_rough_breathing) | |||
if umatch(text, "[" .. breathing .. "]") then | if umatch(text, "[" .. breathing .. "]") then | ||
| Line 289: | Line 309: | ||
end | end | ||
-- | -- Final substitutions. | ||
text = | text = text:gsub(".[\128-\191]*", final_substitutions) | ||
return toNFC(text) | return toNFC(text) | ||
Latest revision as of 12:46, 21 April 2026
- The following documentation is generated by Module:documentation/functions/translit. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module will transliterate text in the Old Cyrillic script. It is used to transliterate Old Church Slavonic, Old East Slavic, Bulgar, Old Novgorodian, and Church Slavonic.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{xlit}}.
Within a module, use Module:languages#Language:transliterate.
For testcases, see Module:Cyrs-translit/testcases.
Functions
tr(text, lang, sc) - Transliterates a given piece of
text written in the script specified by the code sc, and language specified by the code lang. When the transliteration fails, returns
nil.
local export = {}
local numbers = mw.loadData("Module:Cyrs-translit/numbers")
local ugsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local U = mw.ustring.char
local umatch = mw.ustring.match
local usub = mw.ustring.sub
local ulower = mw.ustring.lower
-- Combining marks that can occur in the input or are produced in the output.
local acute = U(0x301) -- U+0301 combining acute accent
local grave = U(0x300) -- U+0300 combining grave accent
local circumflex = U(0x302) -- U+0302 combining circumflex accent
local kamora = U(0x0311) -- U+0311 combining inverted breve
local palatalization = U(0x0484) -- U+0484 combining Cyrillic palatalization
local titlo = U(0x0483) -- U+0483 combining Cyrillic titlo
local dasia = U(0x0485) -- U+0485 combining Cyrillic dasia pneumata (rough breathing)
local psili = U(0x0486) -- U+0486 combining Cyrillic psili pneumata (smooth breathing)
local vzmet = U(0xA66F) -- U+A66F combining Cyrillic vzmet
-- Both breathing marks concatenated, for use inside a character class.
local breathing = psili .. dasia
-- Pattern fragment matching a (possibly empty) run of accents and breathing marks.
-- NOTE(review): kamora is not part of this class - confirm that is intentional.
local accent = "[" .. acute .. grave .. circumflex .. breathing .. "]*"
-- Latin base vowel letters. These are compared against NFD-decomposed
-- transliterations, so letters carrying diacritics are covered by their base letter.
local vowels = "aAæÆeEiIoOœŒꝏꝎuUyY"
-- Character class: a vowel, or the prime used to transliterate the palatalization mark.
local vowel_or_soft = "[" .. vowels .. "ʹ]"
-- Default mapping from canonical Old Cyrillic letters to their Latin
-- transliteration. It is used directly when a language has no table of its own,
-- and as the __index fallback of the language-specific tables.
local common_letters = {
["А"] = 'A', ["а"] = 'a',
["Б"] = 'B', ["б"] = 'b',
["В"] = 'V', ["в"] = 'v',
["Г"] = 'G', ["г"] = 'g',
["Д"] = 'D', ["д"] = 'd',
["Е"] = 'E', ["е"] = 'e',
["Ж"] = 'Ž', ["ж"] = 'ž',
["Ѕ"] = 'Dz', ["ѕ"] = 'dz',
["З"] = 'Z', ["з"] = 'z',
["И"] = 'I', ["и"] = 'i',
["І"] = 'I', ["і"] = 'i', -- Contrastive with "И".
["Й"] = 'J', ["й"] = 'j',
["Ꙉ"] = 'Ǵ', ["ꙉ"] = 'ǵ',
["К"] = 'K', ["к"] = 'k',
["Л"] = 'L', ["л"] = 'l',
["М"] = 'M', ["м"] = 'm',
["Н"] = 'N', ["н"] = 'n',
["О"] = 'O', ["о"] = 'o',
["Ꚙ"] = 'Ꝏ', ["ꚙ"] = 'ꝏ',
["П"] = 'P', ["п"] = 'p',
["Р"] = 'R', ["р"] = 'r',
["С"] = 'S', ["с"] = 's',
["Т"] = 'T', ["т"] = 't',
["У"] = 'U', ["у"] = 'u',
["Ꙋ"] = 'U', ["ꙋ"] = 'u',
["Ф"] = 'F', ["ф"] = 'f',
["Х"] = 'X', ["х"] = 'x',
["Ѡ"] = 'O', ["ѡ"] = 'o', -- Contrastive with "О".
["Ѿ"] = 'Ot', ["ѿ"] = 'ot', -- Becomes "otŭ" as appropriate.
["Ѽ"] = 'Ô', ["ѽ"] = 'ô',
["Ц"] = 'C', ["ц"] = 'c',
["Ꙡ"] = 'Ć', ["ꙡ"] = 'ć', -- From a merger of "Ц" and "Ч" in Old Novgorodian.
["Ч"] = 'Č', ["ч"] = 'č',
["Џ"] = 'Dž', ["џ"] = 'dž',
["Ш"] = 'Š', ["ш"] = 'š',
["Щ"] = 'Št', ["щ"] = 'št',
["Ъ"] = 'Ŭ', ["ъ"] = 'ŭ',
["Ꙑ"] = 'Y', ["ꙑ"] = 'y',
["Ь"] = 'Ĭ', ["ь"] = 'ĭ',
["Ѣ"] = 'Ě', ["ѣ"] = 'ě',
["Ꙓ"] = 'Jě', ["ꙓ"] = 'jě',
["Ю"] = 'Ju', ["ю"] = 'ju',
["Ꙗ"] = 'Ja', ["ꙗ"] = 'ja',
["Ѥ"] = 'Je', ["ѥ"] = 'je',
["Ѧ"] = 'Ę', ["ѧ"] = 'ę',
["Ѫ"] = 'Ǫ', ["ѫ"] = 'ǫ',
["Ѩ"] = 'Ję', ["ѩ"] = 'ję',
["Ѭ"] = 'Jǫ', ["ѭ"] = 'jǫ',
["Ꙛ"] = 'Œ', ["ꙛ"] = 'œ', -- Becomes "œ̨" (the ogonek is added by final_substitutions).
["Ѯ"] = 'Ks', ["ѯ"] = 'ks',
["Ѱ"] = 'Ps', ["ѱ"] = 'ps',
["Ѳ"] = 'Θ', ["ѳ"] = 'θ',
["Ѵ"] = 'Ü', ["ѵ"] = 'ü',
["Ѷ"] = 'Ü', ["ѷ"] = 'ü', -- Contrastive with "Ѵ".
["Ҁ"] = 'Q', ["ҁ"] = 'q',
-- The smooth breathing mark is dropped from the output.
[psili] = '',
-- The kamora is rendered as a circumflex.
[kamora] = circumflex,
}
-- Variant letterforms mapped to the canonical Cyrillic letter (sometimes
-- followed by the palatalization mark) that the other tables are keyed on.
-- tr applies this table before any other letter processing. Keys and values
-- are both Cyrillic; nothing here is Latin output.
local variants = {
["ᲀ"] = 'в',
["Ґ"] = 'Г', ["ґ"] = 'г',
-- Disabled mapping, kept for reference:
-- ["Ђ"] = 'Ꙉ', ["ђ"] = 'ꙉ',
["ᲁ"] = 'д',
["Ꙣ"] = 'Д' .. palatalization, ["ꙣ"] = 'д' .. palatalization,
["Є"] = 'Е', ["є"] = 'е',
["Э"] = 'Е', ["э"] = 'е',
["Ꙃ"] = 'Ѕ', ["ꙃ"] = 'ѕ',
["Ꙅ"] = 'Ѕ', ["ꙅ"] = 'ѕ',
["Ꙁ"] = 'З', ["ꙁ"] = 'з',
["Ӥ"] = 'И', ["ӥ"] = 'и',
["Ї"] = 'І', ["ї"] = 'і',
["Ꙇ"] = 'І', ["ꙇ"] = 'і',
-- Disabled mapping, kept for reference:
-- ["Ћ"] = 'Ꙉ', ["ћ"] = 'ꙉ',
["Ꙥ"] = 'Л' .. palatalization, ["ꙥ"] = 'л' .. palatalization,
["Ꙧ"] = 'М' .. palatalization, ["ꙧ"] = 'м' .. palatalization,
["Ҥ"] = 'Н' .. palatalization, ["ҥ"] = 'н' .. palatalization,
["Ѻ"] = 'О', ["ѻ"] = 'о',
["Ꙩ"] = 'О', ["ꙩ"] = 'о',
["Ꙫ"] = 'О', ["ꙫ"] = 'о',
["Ꚛ"] = 'О', ["ꚛ"] = 'о',
["Ꙭ"] = 'Ꚙ', ["ꙭ"] = 'ꚙ',
["ꙮ"] = 'о',
["ᲂ"] = 'о',
["ᲃ"] = 'с',
["ᲄ"] = 'т',
["ᲅ"] = 'т',
["Ѹ"] = 'Ꙋ', ["ѹ"] = 'ꙋ', ["ᲈ"] = 'ꙋ',
["Ꙍ"] = 'Ѡ', ["ꙍ"] = 'ѡ',
["Ы"] = 'Ꙑ', ["ы"] = 'ꙑ',
["ᲆ"] = 'ъ',
["ᲇ"] = 'ѣ',
["Я"] = 'Ꙗ', ["я"] = 'ꙗ',
["Ꙕ"] = 'Ю', ["ꙕ"] = 'ю',
["Ꙙ"] = 'Ѧ', ["ꙙ"] = 'ѧ',
["Ꙝ"] = 'Ѩ', ["ꙝ"] = 'ѩ',
}
-- A second round of substitutions, applied by tr as its last step before the
-- result is NFC-normalized. It is for output that is not a precomposed
-- character but has to behave like a single character during processing:
-- "Œ"/"œ" gain their combining ogonek only here, and the titlo and vzmet marks
-- are rendered as ":".
local final_substitutions = {
["Œ"] = "Œ̨", ["œ"] = "œ̨",
[titlo] = ":", [vzmet] = ":"
}
-- Letters converted to their iotated equivalents when word-initial.
-- Default table; tr uses it when the language has no entry in
-- lang_iotated_initial. Keys and values are both Cyrillic.
local common_iotated_initial = {
["Ѣ"] = 'Ꙓ', ["ѣ"] = 'ꙓ',
}
-- Letters converted to their iotated equivalents after vowels or a
-- palatalization mark.
-- Default table; tr uses it when the language has no entry in
-- lang_iotated_after_vowel_or_soft.
local common_iotated_after_vowel_or_soft = {
["Е"] = 'Ѥ', ["е"] = 'ѥ',
["Ѣ"] = 'Ꙓ', ["ѣ"] = 'ꙓ',
["Ѧ"] = 'Ѩ', ["ѧ"] = 'ѩ',
}
-- Per-language overrides, keyed by language code. For each of the three
-- lookup tables below, tr falls back to the corresponding common_* table when
-- the language has no entry.
-- Cyrillic-to-Latin letter mappings.
local lang_letters = {}
-- Word-initial iotation mappings.
local lang_iotated_initial = {}
-- Iotation mappings applied after a vowel or palatalization mark.
local lang_iotated_after_vowel_or_soft = {}
-- Set of language codes for which tr collapses "уо" into "у".
local uo_is_u = {}
-- Old East Slavic
-- Only "Щ" differs from the common mapping; every other letter is resolved
-- through the __index fallback.
lang_letters["orv"] = setmetatable({
["Щ"] = 'Šč', ["щ"] = 'šč',
}, {__index = common_letters})
-- Word-initial "Е" and "Ѧ" are iotated in addition to the common entries.
lang_iotated_initial["orv"] = setmetatable({
["Е"] = 'Ѥ', ["е"] = 'ѥ',
["Ѧ"] = 'Ѩ', ["ѧ"] = 'ѩ',
}, {__index = common_iotated_initial})
-- Old Novgorodian
-- "Ц" and "Ч" share one transliteration here; other letters fall back to
-- common_letters through __index.
lang_letters["zle-ono"] = setmetatable({
["Ц"] = 'Cʹ', ["ц"] = 'cʹ',
["Ч"] = 'Cʹ', ["ч"] = 'cʹ',
["Щ"] = 'Sʹcʹ', ["щ"] = 'sʹcʹ',
}, {__index = common_letters})
-- Reuses the Old East Slavic word-initial iotation table (same table object).
lang_iotated_initial["zle-ono"] = lang_iotated_initial["orv"]
-- "уо" is collapsed to "у" for this language.
uo_is_u["zle-ono"] = true
-- Old Pskovian
-- Lookups that miss here fall back to the Old Novgorodian table, which in
-- turn falls back to common_letters.
lang_letters["zle-ops"] = setmetatable({ -- In addition to zle-ono above.
["Ж"] = 'Zʹ', ["ж"] = 'zʹ',
["Ѕ"] = 'Dzʹ', ["ѕ"] = 'dzʹ',
["З"] = 'Zʹ', ["з"] = 'zʹ',
["С"] = 'Sʹ', ["с"] = 'sʹ',
["Џ"] = 'Dzʹ', ["џ"] = 'dzʹ',
["Ш"] = 'Sʹ', ["ш"] = 'sʹ',
["Щ"] = 'Sʹk', ["щ"] = 'sʹk',
["Ѣ"] = 'Æ', ["ѣ"] = 'æ',
["Ꙓ"] = 'Jæ', ["ꙓ"] = 'jæ',
}, {__index = lang_letters["zle-ono"]})
-- Word-initial "Ѫ" is iotated too, on top of the Old Novgorodian entries.
lang_iotated_initial["zle-ops"] = setmetatable({ -- In addition to zle-ono above.
["Ѫ"] = 'Ѭ', ["ѫ"] = 'ѭ',
}, {__index = lang_iotated_initial["zle-ono"]})
-- "Ѫ" is also iotated after a vowel or palatalization mark, on top of the
-- common entries.
lang_iotated_after_vowel_or_soft["zle-ops"] = setmetatable({
["Ѫ"] = 'Ѭ', ["ѫ"] = 'ѭ',
}, {__index = common_iotated_after_vowel_or_soft})
-- "уо" is collapsed to "у" for this language.
uo_is_u["zle-ops"] = true
-- Replacement callback for an izhitsa that closes a diphthong: keeps the
-- preceding text and swaps the izhitsa for the consonant "в", capitalised
-- only when the izhitsa itself was the capital form.
local function handle_v(prev, v)
	local consonant = "в"
	if v == "Ѵ" then
		consonant = "В"
	end
	return prev .. consonant
end
-- Replacement callback for the digraph "оу": yields a single "у" whose case
-- follows the "о" that opened the digraph, followed by the accents that were
-- attached to that "о".
local function handle_ou(o, ac)
	local u
	if o == ulower(o) then
		u = "у"
	else
		u = "У"
	end
	return u .. ac
end
-- Replacement callback that marks a rough breathing with the letter "h".
-- base1 and base2 are the one or two word characters preceding the breathing
-- mark (base2 may be empty) and diacritics is whatever non-word material sat
-- between them and the mark.
local function handle_rough_breathing(base1, base2, diacritics)
	local lower2 = ulower(base2)
	-- An empty base2 passes this test too, because an empty pattern always
	-- matches.
	if vowels:match(lower2) then
		local lower1 = ulower(base1)
		if vowels:match(lower1) then
			-- Both characters are vowels (or base2 is empty): put the "h" in
			-- front, taking its case from base1.
			local h = "H"
			if lower1 == base1 then
				h = "h"
			end
			return h .. base1 .. base2 .. diacritics
		end
		-- Only base2 is a vowel: insert the "h" before it, give the "h" the
		-- case of base2 and emit base2 itself in lower case.
		local h = "H"
		if lower2 == base2 then
			h = "h"
		end
		return base1 .. h .. lower2 .. diacritics
	end
	-- base2 is a non-vowel: append the "h" after everything.
	return base1 .. base2 .. diacritics .. "h"
end
-- Transliterates Old Cyrillic text into Latin script.
--   text: the string to transliterate.
--   lang: language code; selects the language-specific letter and iotation
--         tables, with the common tables as the fallback.
--   sc:   script code; when omitted it is detected from the text through
--         Module:languages.
-- Returns the NFC-normalized transliteration, or nil when the script is not
-- "Cyrs". Raises an error when a breathing mark is still present after the
-- rough-breathing pass.
-- The passes below run in a fixed order; each one works on the output of the
-- previous one.
function export.tr(text, lang, sc)
if not sc then
sc = require("Module:languages").getByCode(lang, nil, true):findBestScript(text):getCode()
end
-- Only the Old Cyrillic script is handled by this module.
if sc ~= "Cyrs" then
return nil
end
-- Keep the untouched input for the error message further down.
local input = text
-- Decompose any acute and grave accents.
-- (Everything between those two marks is recomposed with toNFC.)
text = ugsub(toNFD(text), "[^" .. acute .. grave .. "]+", toNFC)
-- Canonicalize any variants.
-- The byte pattern matches one UTF-8 character: a lead byte plus its
-- continuation bytes.
text = text:gsub(".[\128-\191]*", variants)
-- Transliterate the palatalization mark as prime.
text = text:gsub(palatalization, "ʹ")
-- Treat "Ѵ" as the consonant "В" (transliterated "V") in diphthongs that
-- correspond to Ancient Greek "αυ", "ευ" and "ηυ" (equivalent to "аѵ", "еѵ"
-- and "иѵ"). Note that "ιυ" ("іѵ") is not a diphthong, and "ου" ("оѵ") is
-- a long vowel. However, this doesn't apply to "Ѷ", as the diacritic means
-- it must be treated as a vowel.
text = ugsub(text, "([аАеЕиИꙗꙖѥѤ]" .. accent .. ")([ѵѴ])", handle_v)
-- Letter table for this language, falling back to the common one.
local letters = lang_letters[lang] or common_letters
-- Convert "ѿ" to "ѡт" if followed by a non-iotated vowel (including those
-- which iotate only after vowels) or a palatalization mark, and "ѡтъ" in
-- all other cases.
text = ugsub(text, "([ѿѾ])(" .. accent .. ")()", function(ot, ac, loc)
ot = (ot == "Ѿ" and "Ѡ" or "ѡ") .. ac .. "т"
-- Transliterate the character right after the match to see how it begins.
local nxt = toNFD(usub(text, loc, loc):gsub(".[\128-\191]*", letters))
if not umatch(nxt, "^" .. vowel_or_soft) then
ot = ot .. "ъ"
end
return ot
end)
-- Handle any vowels which are iotated at the start of words.
local iotated_initial = lang_iotated_initial[lang] or common_iotated_initial
-- Not possible to input iotated_initial directly, as mw.ustring.gsub
-- doesn't respect metamethods...
-- A nil result from the callback leaves the matched character unchanged.
text = ugsub(text, "%f[%w].", function(m)
return iotated_initial[m]
end)
-- Handle any vowels which are iotated after another vowel or a
-- palatalization mark.
local iotated_after_vowel_or_soft = lang_iotated_after_vowel_or_soft[lang] or common_iotated_after_vowel_or_soft
text = ugsub(text, "()(" .. accent .. ")(.)", function(loc, ac, letter)
local iotated = iotated_after_vowel_or_soft[letter]
if iotated then
-- loc is where the accent run starts; step back to the character before it.
loc = loc - 1
-- At the very start of the text there is no previous character.
local prev = toNFD((loc == 0 and "" or usub(text, loc, loc)):gsub(".[\128-\191]*", letters))
if umatch(prev, vowel_or_soft .. "%W*$") then
return ac .. iotated
end
end
end)
-- Treat "ъі" as "ꙑ", and make "ъ" tense ("ŷ") before "и" or an iotated
-- vowel.
text = ugsub(text, "([Ъъ])(" .. accent .. ")()([иИіІ]?)", function(yer, ac, loc, i)
-- Non-nil only when the transliteration of the following character starts
-- with "i" or "j".
local nxt = toNFD(usub(text, loc, loc):gsub(".[\128-\191]*", letters)):match("^[iIjJ]")
if nxt ~= nil then
-- With "і" the pair collapses to "ꙑ"; otherwise a circumflex is added and
-- the captured "и" (if any) is kept.
return (yer == "Ъ" and "Ꙑ" or "ꙑ") .. ((i == "і" or i == "І") and ac or circumflex .. ac .. i)
end
end)
-- In some languages, treat "уо" ("uo") as "у" ("u").
if uo_is_u[lang] then
-- Not "ꚙ", which is an orthographically doubled "о".
text = ugsub(text, "([уУѵѴѷѶ]" .. accent .. ")[оО]", "%1")
end
-- Treat "оу" ("ou") as "у" ("u") (but not "ꚙ").
text = ugsub(text, "([оО])(" .. accent .. ")[уУѵѴѷѶ]", handle_ou)
-- Substitute any numbers.
-- NOTE(review): pairs() visits entries in unspecified order - presumably the
-- patterns in the numbers data cannot overlap; confirm against that module.
for key, repl in pairs(numbers) do
text = ugsub(text, key, repl)
end
-- Main substitution.
-- Characters without an entry in the letter table are left unchanged.
text = text:gsub(".[\128-\191]*", letters)
-- Handle any rough breathing marks.
-- FIXME: this can't handle various edge cases.
text = ugsub(toNFD(text), "(%w)(%w?)([^%w%s]*)[" .. dasia .. "]", handle_rough_breathing)
-- Any breathing mark still present could not be attached to a letter.
if umatch(text, "[" .. breathing .. "]") then
error("Invalid breathing marks in input " .. mw.dumpObject(input))
end
-- Final substitutions.
text = text:gsub(".[\128-\191]*", final_substitutions)
return toNFC(text)
end
return export