Module:tt-translit: Difference between revisions
Jump to navigation
Jump to search
Created page with "local export = {} local rsubn = mw.ustring.gsub -- version of rsubn() that discards all but the first return value local function rsub(term, foo, bar) local retval = rsubn(term, foo, bar) return retval end -- apply rsub() repeatedly until no change local function rsub_repeatedly(term, foo, bar) while true do local new_term = rsub(term, foo, bar) if new_term == term then return term end term = new_term end end local tt = { ["ү"]="ü",['Ү']='Ü', ["..." |
m 1 revision imported |
||
| (One intermediate revision by one other user not shown) | |||
| Line 21: | Line 21: | ||
local tt = { | local tt = { | ||
[ | ['б']='b',['Б']='B', ['в']='w',['В']='W', ['г']='g',['Г']='G', ['д']='d',['Д']='D', | ||
[ | ['з']='z',['З']='Z', ['й']='y',['Й']='Y', ['к']='k',['К']='K', ['л']='l',['Л']='L', | ||
[ | ['м']='m',['М']='M', ['н']='n',['Н']='N', ['п']='p',['П']='P', ['р']='r',['Р']='R', | ||
['с']='s',['С']='S', ['т']='t',['Т']='T', ['ф']='f',['Ф']='F', ['х']='x',['Х']='X', | |||
['ч']='ç',['Ч']='Ç', ['ш']='ş',['Ш']='Ş', | |||
[ | ['җ']='c',['Җ']='C', ['ң']='ñ',['Ң']='Ñ', ['һ']='h',['Һ']='H', | ||
[ | ['ж']='j',['Ж']='J', ['ц']='ts',['Ц']='Ts', ['щ']='şç',['Щ']='Şç', ['ё']='yo',['Ё']='Yo', | ||
[ | ['а']='a',['А']='A', ['ы']='ı',['Ы']='I', ['о']='o',['О']='O', ['у']='u',['У']='U', | ||
} | ['ә']='ä',['Ә']='Ä', ['э']='e',['Э']='E', ['и']='i',['И']='İ', ['ө']='ö',['Ө']='Ö', ['ү']='ü',['Ү']='Ü', | ||
--['я']='ya',['Я']='Ya', ['е']='ye',['Е']='Ye', ['ю']='yu',['Ю']='Yu', | |||
--['е']='e',['Е']='E', | |||
['ь']='ʹ',['Ь']='ʹ', ['ъ']='ʺ',['Ъ']='ʺ', | |||
['ҡ']='q',['Ҡ']='Q', ['ғ']='ğ',['Ғ']='Ğ', ['Ӹ']='Iy',['ӹ']='ıy', | |||
} | |||
local consonants = 'БВГДЗЙКЛМНПРСТФХЧШҖҢҺбвгдзйклмнпрстфхчшҗңһЖЦЩжцщҠҒҡғ' | |||
local vowels_hard = 'АЫӸОУаыӹоу' | |||
local vowels_soft = 'ӘЭИӨҮәэиөү' | |||
local vowels_iotated = 'ЯЕЮяею' -- ё is only in loans | |||
local consonants_soft2hard = {['К']='Ҡ', ['Г']='Ғ', ['к']='ҡ', ['г']='ғ'} | |||
local vowels_hard2soft = {['А']='Ә', ['Ы']='Э', ['Ӹ']='И', ['О']='Ө', ['У']='Ү', ['а']='ә', ['ы']='э', ['ӹ']='и', ['о']='ө', ['у']='ү'} | |||
local vowels_iotated_expanded_hard = {['Я']='Йа', ['Е']='Йы', ['Ю']='Йу', ['я']='йа', ['е']='йы', ['ю']='йу'} | |||
local vowels_iotated_expanded_soft = {['Я']='Йә', ['Е']='Йэ', ['Ю']='Йү', ['я']='йә', ['е']='йэ', ['ю']='йү'} | |||
local tt_Arab_New = { | |||
-- [[s:mul:Рус мәктәпләре өчен татар теле дәреслеге/13]] | |||
-- XXX: need to investigate the most appropriate Unicode codepoints to use for tt-Arab | |||
['ا']='а', ['ە']='ә', | |||
['ب']='б', ['پ']='п', ['ت']='т', | |||
['ج']='җ', ['چ']='ч', ['ح']='х', | |||
['د']='д', | |||
['ر']='р', ['ز']='з', ['ژ']='ж', | |||
['س']='с', ['ش']='ш', | |||
['ع']='ғ', | |||
['ف']='ф', ['ق']='ҡ', ['ک']='к', ['گ']='г', ['ڭ']='ң', | |||
['ل']='л', | |||
['م']='м', | |||
['ن']='н', | |||
['ۇ']='ө', ['و']='ү', ['ۋ']='в', | |||
['ه']='һ', | |||
['ىُ']='э', ['ی']='и', | |||
['ث']='с', ['خ']='х', ['ذ']='з', ['ص']='с', ['ض']='з', ['ط']='т', ['ظ']='з', ['غ']='ғ', | |||
['ئ']='ь', | |||
['ࢭ']='ъ', | |||
['۱']='1', ['۲']='2', ['۳']='3', ['۴']='4', ['۵']='5', | |||
['۶']='6', ['۷']='7', ['۸']='8', ['۹']='9', ['۰']='0', | |||
['١']='1', ['٢']='2', ['٣']='3', ['٤']='4', ['٥']='5', | |||
['٦']='6', ['٧']='7', ['٨']='8', ['٩']='9', ['٠']='0', | |||
['،']=',', ['؟']='?', | |||
} | |||
-- excluding ә/а. яңа имля has separate letters | |||
--local vowels_soft2hard = {['э']='ы', ['и']='ӹ', ['ө']='о', ['ү']='у'} | |||
-- XXX: keep и for now. less unsightly and more common than ый? can и vs. ый even be predicted accurately? | |||
local vowels_soft2hard = {['э']='ы', ['и']='и', ['ө']='о', ['ү']='у'} | |||
function export.tr(text, lang, sc) | function export.tr(text, lang, sc) | ||
text = rsub( | if sc == 'tt-Arab' then | ||
text, | -- яңа имля. | ||
-- automatic insertion of э/ы would be Cool | |||
-- but maybe we don't have to worry about that | |||
-- since яңалиф also omits them. | |||
-- visualize the continuity between the two. | |||
-- also, insertion would wreak havoc on иске имля | |||
-- quick fixes for иске имля? | |||
text = rsub(text, '^او', 'ئو') | |||
text = rsub(text, '^ای', 'ئی') | |||
text = rsub(text, '^آ', 'ئا') | |||
text = rsub(text, '^ا', 'ئە') | |||
text = rsub(text, '([%p%s])او', '%1ئو') | |||
text = rsub(text, '([%p%s])ای', '%1ئی') | |||
text = rsub(text, '([%p%s])آ', '%1ئا') | |||
text = rsub(text, '([%p%s])ا', '%1ئە') | |||
text = rsub(text, 'ه$', 'ە') | |||
text = rsub(text, 'ه([%p%s' .. mw.ustring.char(0x200C) .. '])', 'ە%1') | |||
text = rsub(text, mw.ustring.char(0x200C), '') -- ZERO WIDTH NON-JOINER | |||
text = rsub(text, '(.)' .. mw.ustring.char(0x0651), '%1%1') -- SHADDA | |||
text = rsub(text, 'ىُ', tt_Arab_New) -- `э/ы` is not atomic in Unicode | |||
text = rsub(text, '.', tt_Arab_New) | |||
text = rsub(text, 'ии([әэөаү])', 'ий%1') | |||
text = rsub(text, 'и([әэөаү])', 'й%1') | |||
text = rsub(text, '([әэөаүи])и', '%1й') | |||
text = rsub(text, 'ү([әэөаи])', 'в%1') | |||
text = rsub(text, '([әэөаиү])ү', '%1в') | |||
text = rsub(text, | |||
'([^%p%s]+)', | |||
function(text) | |||
text = rsub(text, mw.ustring.format('^(ъ?)и([%s])', consonants), '%1й%2') | |||
text = rsub(text, mw.ustring.format('^(ь)([%s])', consonants), '%1э%2') | |||
if mw.ustring.match(text, '[ъаҡғ]') then | |||
text = rsub(text, mw.ustring.format('([%s])', vowels_soft), vowels_soft2hard) | |||
end | |||
text = rsub(text, '^ъ', '') | |||
text = rsub(text, '^ь', '') | |||
return text | |||
end | |||
) | |||
text = rsub(text, '.', tt) | |||
return text | |||
end | |||
-- normalize pure vocalic e | |||
text = rsub(text, | |||
mw.ustring.format('([%s])([Ее])', consonants), | |||
function(consonant, e) | |||
local uniotated = {['Е']='Э', ['е']='э'} | |||
return consonant .. uniotated[e] | |||
end | |||
) | ) | ||
-- | -- simplify handling ый | ||
text = | text = rsub(text, 'Ы[Йй]', 'Ӹ') | ||
text = rsub(text, 'ый', 'ӹ') | |||
text = rsub(text, | -- Russian loan sounds | ||
-- XXX: an idea: identify Russian loans by adding an accent mark? | |||
--text = rsub(text, 'ия', 'ийә') | |||
-- | -- process iotated soft vowels | ||
-- | |||
-- | |||
text = rsub(text, | text = rsub(text, | ||
mw.ustring.format('([%s])([%s]*[%s])', vowels_iotated, consonants, vowels_soft), | |||
function( | function(vowel_iotated, following) | ||
return | return vowels_iotated_expanded_soft[vowel_iotated] .. following | ||
end | end | ||
) | ) | ||
text = rsub(text, | |||
mw.ustring.format('([%s])([%s]*)([Ьь])', vowels_iotated, consonants), | |||
function(vowel_iotated, following, soft_sign) | |||
return vowels_iotated_expanded_soft[vowel_iotated] .. following | |||
end | |||
) | |||
text = rsub_repeatedly(text, | |||
mw.ustring.format('([%s])([%s])', vowels_soft, vowels_iotated), | |||
function(preceding, vowel_iotated) | |||
return preceding .. vowels_iotated_expanded_soft[vowel_iotated] | |||
end | |||
) | |||
-- process iotated hard vowels | |||
text = rsub(text, | |||
mw.ustring.format('([%s])', vowels_iotated), | |||
function(vowel_iotated) | |||
return vowels_iotated_expanded_hard[vowel_iotated] | |||
end | |||
) | |||
-- verbal noun + 3rd person possessive | |||
text = rsub(text, 'үйэ', 'үвэ') | |||
-- q/ğ is indicated by using a hard vowel, even in soft vowel words | |||
text = rsub(text, | |||
mw.ustring.format('([КГкг])([%s]+)([%s])([Ъъ])', vowels_hard, consonants), | |||
function(kg, vowel, following, soft_and_glottal_sign) | |||
-- XXX: presumably this is what ъ means here | |||
return consonants_soft2hard[kg] .. vowels_hard2soft[vowel] .. following .. 'ь' | |||
end | |||
) | |||
text = rsub(text, | |||
mw.ustring.format('([КГкг])([%s]+)([%s]+[%s])', vowels_hard, consonants, vowels_soft), | |||
function(kg, vowel, following) | |||
return consonants_soft2hard[kg] .. vowels_hard2soft[vowel] .. following | |||
end | |||
) | |||
text = rsub(text, | |||
mw.ustring.format('([КГкг])([%s]+)([%s])([Ьь])', vowels_hard, consonants), | |||
function(kg, vowel, following, soft_sign) | |||
return consonants_soft2hard[kg] .. vowels_hard2soft[vowel] .. following | |||
end | |||
) | |||
text = rsub(text, | |||
mw.ustring.format('([%s]?)([КГкг])([%s]?)', vowels_hard, vowels_hard), | |||
function(preceding, kg, following) | |||
return preceding .. (((following ~= '') or (preceding ~= '' and following == '')) and consonants_soft2hard[kg] or kg) .. following | |||
end | |||
) | |||
text = rsub(text, '([КГкг])([Ъъ])', function(kg, hard_sign) return consonants_soft2hard[kg] end) | |||
-- excrescent y/w after i/u | |||
text = rsub_repeatedly(text, '([Ии])([' .. vowels_hard .. vowels_soft .. '])', '%1й%2') | |||
text = rsub_repeatedly(text, '([УҮуү])([' .. vowels_hard .. vowels_soft .. '])', '%1в%2') | |||
-- semivocalic w after vowels | |||
text = rsub(text, '([' .. vowels_hard .. vowels_soft .. '])[УҮуү]', '%1в') | |||
-- glottal stop after vowels | |||
text = rsub(text, '([' .. vowels_hard .. vowels_soft .. '])[Ээ]', '%1ь') | |||
text = rsub(text, | text = rsub(text, '.', tt) | ||
return text | return text | ||
end | end | ||
return export | return export | ||
Latest revision as of 12:44, 21 April 2026
Documentation for this module may be created at Module:tt-translit/doc
local export = {}
local rsubn = mw.ustring.gsub
-- Wrapper around rsubn() that yields only the substituted string,
-- dropping every additional value rsubn() returns.
local function rsub(term, foo, bar)
	-- the extra parentheses truncate the call to its first return value
	return (rsubn(term, foo, bar))
end
-- Keep applying the same rsub() substitution until a pass leaves the
-- string unchanged, then return that stable string.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end
-- Final Cyrillic -> Latin lookup table.
-- export.tr() applies it as the last step of both branches with
-- rsub(text, '.', tt), i.e. one code point at a time; characters that have
-- no entry here are left as they are.
local tt = {
-- consonants
['б']='b',['Б']='B', ['в']='w',['В']='W', ['г']='g',['Г']='G', ['д']='d',['Д']='D',
['з']='z',['З']='Z', ['й']='y',['Й']='Y', ['к']='k',['К']='K', ['л']='l',['Л']='L',
['м']='m',['М']='M', ['н']='n',['Н']='N', ['п']='p',['П']='P', ['р']='r',['Р']='R',
['с']='s',['С']='S', ['т']='t',['Т']='T', ['ф']='f',['Ф']='F', ['х']='x',['Х']='X',
['ч']='ç',['Ч']='Ç', ['ш']='ş',['Ш']='Ş',
['җ']='c',['Җ']='C', ['ң']='ñ',['Ң']='Ñ', ['һ']='h',['Һ']='H',
-- letters that expand to more than one Latin character, plus ё
['ж']='j',['Ж']='J', ['ц']='ts',['Ц']='Ts', ['щ']='şç',['Щ']='Şç', ['ё']='yo',['Ё']='Yo',
-- hard vowels (note: ы/Ы map to dotless ı / plain I)
['а']='a',['А']='A', ['ы']='ı',['Ы']='I', ['о']='o',['О']='O', ['у']='u',['У']='U',
-- soft vowels (note: и/И map to i / dotted İ)
['ә']='ä',['Ә']='Ä', ['э']='e',['Э']='E', ['и']='i',['И']='İ', ['ө']='ö',['Ө']='Ö', ['ү']='ü',['Ү']='Ü',
-- The iotated vowels я/е/ю are deliberately not mapped here: export.tr()
-- expands them into й + vowel before this table is consulted.
--['я']='ya',['Я']='Ya', ['е']='ye',['Е']='Ye', ['ю']='yu',['Ю']='Yu',
--['е']='e',['Е']='E',
-- soft and hard signs become modifier primes
['ь']='ʹ',['Ь']='ʹ', ['ъ']='ʺ',['Ъ']='ʺ',
-- ҡ/ғ are produced internally (consonants_soft2hard, tt_Arab_New) for q/ğ;
-- Ӹ/ӹ are the single-character stand-ins export.tr() substitutes for Ый/ый.
['ҡ']='q',['Ҡ']='Q', ['ғ']='ğ',['Ғ']='Ğ', ['Ӹ']='Iy',['ӹ']='ıy',
}
-- The four strings below are interpolated inside [...] in patterns built by
-- export.tr(), so each one acts as a set of characters, not as a word.

-- Consonant letters, both cases, including the internal ҡ/ғ (q/ğ) markers.
local consonants = 'БВГДЗЙКЛМНПРСТФХЧШҖҢҺбвгдзйклмнпрстфхчшҗңһЖЦЩжцщҠҒҡғ'
-- Hard vowels; Ӹ/ӹ are the stand-ins export.tr() substitutes for Ый/ый.
local vowels_hard = 'АЫӸОУаыӹоу'
-- Soft vowels.
local vowels_soft = 'ӘЭИӨҮәэиөү'
local vowels_iotated = 'ЯЕЮяею' -- ё is only in loans
-- к/г -> ҡ/ғ, which the tt table renders as q/ğ.
local consonants_soft2hard = {['К']='Ҡ', ['Г']='Ғ', ['к']='ҡ', ['г']='ғ'}
-- Each hard vowel mapped to a soft vowel; keys are single characters only.
local vowels_hard2soft = {['А']='Ә', ['Ы']='Э', ['Ӹ']='И', ['О']='Ө', ['У']='Ү', ['а']='ә', ['ы']='э', ['ӹ']='и', ['о']='ө', ['у']='ү'}
-- Iotated vowel -> й + hard vowel (fallback expansion in export.tr()).
local vowels_iotated_expanded_hard = {['Я']='Йа', ['Е']='Йы', ['Ю']='Йу', ['я']='йа', ['е']='йы', ['ю']='йу'}
-- Iotated vowel -> й + soft vowel (used where export.tr() detects a soft context).
local vowels_iotated_expanded_soft = {['Я']='Йә', ['Е']='Йэ', ['Ю']='Йү', ['я']='йә', ['е']='йэ', ['ю']='йү'}
-- Arabic-script -> Cyrillic lookup table used by the 'tt-Arab' branch of
-- export.tr(). The result is an intermediate Cyrillic string that is later
-- run through the tt table. Lookups are done per code point, except for the
-- two-code-point key for э, which export.tr() substitutes in a separate
-- pass first.
local tt_Arab_New = {
-- Reference (Wikisource page; the title means "Tatar language textbook for
-- Russian schools", subpage 13):
-- [[s:mul:Рус мәктәпләре өчен татар теле дәреслеге/13]]
-- XXX: need to investigate the most appropriate Unicode codepoints to use for tt-Arab
['ا']='а', ['ە']='ә',
['ب']='б', ['پ']='п', ['ت']='т',
['ج']='җ', ['چ']='ч', ['ح']='х',
['د']='д',
['ر']='р', ['ز']='з', ['ژ']='ж',
['س']='с', ['ش']='ш',
['ع']='ғ',
['ف']='ф', ['ق']='ҡ', ['ک']='к', ['گ']='г', ['ڭ']='ң',
['ل']='л',
['م']='м',
['ن']='н',
['ۇ']='ө', ['و']='ү', ['ۋ']='в',
['ه']='һ',
-- the first key on the next line is a base letter plus a combining mark
['ىُ']='э', ['ی']='и',
-- further Arabic letters folded onto Cyrillic letters already used above
['ث']='с', ['خ']='х', ['ذ']='з', ['ص']='с', ['ض']='з', ['ط']='т', ['ظ']='з', ['غ']='ғ',
-- mapped to ь / ъ; export.tr() uses these as word-initial markers and strips
-- them again when they begin a word
['ئ']='ь',
['ࢭ']='ъ',
-- two sets of digit glyphs, both mapped to ASCII digits
['۱']='1', ['۲']='2', ['۳']='3', ['۴']='4', ['۵']='5',
['۶']='6', ['۷']='7', ['۸']='8', ['۹']='9', ['۰']='0',
['١']='1', ['٢']='2', ['٣']='3', ['٤']='4', ['٥']='5',
['٦']='6', ['٧']='7', ['٨']='8', ['٩']='9', ['٠']='0',
-- punctuation
['،']=',', ['؟']='?',
}
-- Soft -> hard vowel map applied by the 'tt-Arab' branch of export.tr() to
-- words that contain one of ъ, а, ҡ, ғ.
-- ә/а are excluded: the new Arabic orthography (яңа имля) has separate
-- letters for them.
-- Earlier variant, which mapped и to the ый stand-in ӹ:
--local vowels_soft2hard = {['э']='ы', ['и']='ӹ', ['ө']='о', ['ү']='у'}
-- XXX: keep и for now. less unsightly and more common than ый? can и vs. ый even be predicted accurately?
local vowels_soft2hard = {['э']='ы', ['и']='и', ['ө']='о', ['ү']='у'}
-- Map every character of a run of hard vowels to its soft counterpart.
-- The к/г rules in export.tr() capture one OR MORE hard vowels, while
-- vowels_hard2soft only has single-character keys; looking the whole run up
-- directly yields nil for runs longer than one character and makes the
-- subsequent concatenation raise an error.
local function soften_vowels(vowels)
	return rsub(vowels, '.', vowels_hard2soft)
end

--- Transliterate Tatar text to Latin script.
-- @param text  the string to transliterate
-- @param lang  language code; accepted for interface compatibility, not read here
-- @param sc    script code; 'tt-Arab' selects the Arabic-script branch, any
--              other value is handled by the Cyrillic branch
-- @return the transliterated string
function export.tr(text, lang, sc)
	if sc == 'tt-Arab' then
		-- New Arabic orthography (яңа имля).
		-- Automatically inserting э/ы would be nice, but may be unnecessary:
		-- the Latin alphabet of the time (яңалиф) omits them as well, which
		-- keeps the continuity between the two visible. Insertion would also
		-- wreak havoc on text in the old orthography (иске имля).

		-- Quick fixes for the old orthography (иске имля)?
		-- Respell alif-initial sequences at the very start of the text ...
		text = rsub(text, '^او', 'ئو')
		text = rsub(text, '^ای', 'ئی')
		text = rsub(text, '^آ', 'ئا')
		text = rsub(text, '^ا', 'ئە')
		-- ... and after any punctuation or whitespace character.
		text = rsub(text, '([%p%s])او', '%1ئو')
		text = rsub(text, '([%p%s])ای', '%1ئی')
		text = rsub(text, '([%p%s])آ', '%1ئا')
		text = rsub(text, '([%p%s])ا', '%1ئە')
		-- ه at the end of the text, or before punctuation, whitespace or a
		-- zero-width non-joiner, is respelled as ە.
		text = rsub(text, 'ه$', 'ە')
		text = rsub(text, 'ه([%p%s' .. mw.ustring.char(0x200C) .. '])', 'ە%1')
		text = rsub(text, mw.ustring.char(0x200C), '') -- ZERO WIDTH NON-JOINER
		text = rsub(text, '(.)' .. mw.ustring.char(0x0651), '%1%1') -- SHADDA
		-- Convert to intermediate Cyrillic. The two-code-point key goes first
		-- because the per-character pass below could not match it.
		text = rsub(text, 'ىُ', tt_Arab_New) -- `э/ы` is not atomic in Unicode
		text = rsub(text, '.', tt_Arab_New)
		-- и / ү next to another vowel become й / в.
		text = rsub(text, 'ии([әэөаү])', 'ий%1')
		text = rsub(text, 'и([әэөаү])', 'й%1')
		text = rsub(text, '([әэөаүи])и', '%1й')
		text = rsub(text, 'ү([әэөаи])', 'в%1')
		text = rsub(text, '([әэөаиү])ү', '%1в')
		-- Per-word processing: a word is a maximal run of characters that
		-- are neither punctuation nor whitespace.
		text = rsub(text,
			'([^%p%s]+)',
			function(word)
				-- word-initial и (optionally after ъ) before a consonant -> й
				word = rsub(word, mw.ustring.format('^(ъ?)и([%s])', consonants), '%1й%2')
				-- word-initial ь directly before a consonant gets э inserted
				word = rsub(word, mw.ustring.format('^(ь)([%s])', consonants), '%1э%2')
				-- a word containing ъ, а, ҡ or ғ has its soft vowels hardened
				if mw.ustring.match(word, '[ъаҡғ]') then
					word = rsub(word, mw.ustring.format('([%s])', vowels_soft), vowels_soft2hard)
				end
				-- the word-initial markers themselves are not transliterated
				word = rsub(word, '^ъ', '')
				word = rsub(word, '^ь', '')
				return word
			end
		)
		text = rsub(text, '.', tt)
		return text
	end

	-- normalize pure vocalic e: е directly after a consonant becomes э
	local uniotated = {['Е']='Э', ['е']='э'}
	text = rsub(text,
		mw.ustring.format('([%s])([Ее])', consonants),
		function(consonant, e)
			return consonant .. uniotated[e]
		end
	)
	-- simplify handling ый by collapsing it into a single stand-in character
	text = rsub(text, 'Ы[Йй]', 'Ӹ')
	text = rsub(text, 'ый', 'ӹ')
	-- Russian loan sounds
	-- XXX: an idea: identify Russian loans by adding an accent mark?
	--text = rsub(text, 'ия', 'ийә')

	-- process iotated soft vowels
	-- (a) iotated vowel followed, after any consonants, by a soft vowel
	text = rsub(text,
		mw.ustring.format('([%s])([%s]*[%s])', vowels_iotated, consonants, vowels_soft),
		function(vowel_iotated, following)
			return vowels_iotated_expanded_soft[vowel_iotated] .. following
		end
	)
	-- (b) iotated vowel followed, after any consonants, by a soft sign;
	--     the soft sign is consumed and not written back
	text = rsub(text,
		mw.ustring.format('([%s])([%s]*)([Ьь])', vowels_iotated, consonants),
		function(vowel_iotated, following, soft_sign)
			return vowels_iotated_expanded_soft[vowel_iotated] .. following
		end
	)
	-- (c) iotated vowel directly after a soft vowel; repeated because an
	--     expansion ends in a soft vowel and can enable a further match
	text = rsub_repeatedly(text,
		mw.ustring.format('([%s])([%s])', vowels_soft, vowels_iotated),
		function(preceding, vowel_iotated)
			return preceding .. vowels_iotated_expanded_soft[vowel_iotated]
		end
	)
	-- process iotated hard vowels: every iotated vowel still left is hard
	text = rsub(text,
		mw.ustring.format('([%s])', vowels_iotated),
		function(vowel_iotated)
			return vowels_iotated_expanded_hard[vowel_iotated]
		end
	)
	-- verbal noun + 3rd person possessive
	text = rsub(text, 'үйэ', 'үвэ')

	-- q/ğ is indicated by using a hard vowel, even in soft vowel words.
	-- In the next three rules the vowel capture may hold several characters,
	-- so it is softened with soften_vowels() rather than by a single table
	-- lookup.
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s]+)([%s])([Ъъ])', vowels_hard, consonants),
		function(kg, vowel, following, soft_and_glottal_sign)
			-- XXX: presumably this is what ъ means here
			return consonants_soft2hard[kg] .. soften_vowels(vowel) .. following .. 'ь'
		end
	)
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s]+)([%s]+[%s])', vowels_hard, consonants, vowels_soft),
		function(kg, vowel, following)
			return consonants_soft2hard[kg] .. soften_vowels(vowel) .. following
		end
	)
	text = rsub(text,
		mw.ustring.format('([КГкг])([%s]+)([%s])([Ьь])', vowels_hard, consonants),
		function(kg, vowel, following, soft_sign)
			return consonants_soft2hard[kg] .. soften_vowels(vowel) .. following
		end
	)
	-- к/г adjacent to a hard vowel on either side becomes ҡ/ғ; with no hard
	-- vowel on either side it is left alone
	text = rsub(text,
		mw.ustring.format('([%s]?)([КГкг])([%s]?)', vowels_hard, vowels_hard),
		function(preceding, kg, following)
			if preceding ~= '' or following ~= '' then
				kg = consonants_soft2hard[kg]
			end
			return preceding .. kg .. following
		end
	)
	-- к/г directly followed by a hard sign: ҡ/ғ, and the sign is dropped
	text = rsub(text, '([КГкг])([Ъъ])', function(kg, hard_sign) return consonants_soft2hard[kg] end)

	local any_vowel = '[' .. vowels_hard .. vowels_soft .. ']'
	-- excrescent y/w after i/u
	text = rsub_repeatedly(text, '([Ии])(' .. any_vowel .. ')', '%1й%2')
	text = rsub_repeatedly(text, '([УҮуү])(' .. any_vowel .. ')', '%1в%2')
	-- semivocalic w after vowels
	text = rsub(text, '(' .. any_vowel .. ')[УҮуү]', '%1в')
	-- glottal stop after vowels
	text = rsub(text, '(' .. any_vowel .. ')[Ээ]', '%1ь')

	text = rsub(text, '.', tt)
	return text
end
return export