Module:ar-translit: Difference between revisions

From Linguifex
Jump to navigation Jump to search
Created page with "-- Authors: Benwing, ZxxZxxZ, Atitarev local export = {} local U = mw.ustring.char local rfind = mw.ustring.find local rsubn = mw.ustring.gsub local rmatch = mw.ustring.matc..."
 
No edit summary
Line 3: Line 3:
local export = {}
local export = {}


local U = mw.ustring.char
local m_str_utils = require("Module:string utilities")
local rfind = mw.ustring.find
 
local rsubn = mw.ustring.gsub
local gcodepoint = m_str_utils.gcodepoint
local rmatch = mw.ustring.match
local rfind = m_str_utils.find
local rsplit = mw.text.split
local rsubn = m_str_utils.gsub
local gcodepoint = mw.ustring.gcodepoint
local rmatch = m_str_utils.match
local rsplit = m_str_utils.split
local U = m_str_utils.char
local unpack = unpack or table.unpack -- Lua 5.2 compatibility


-- assigned below
-- assigned below
Line 19: Line 22:
end
end


local zwnj = U(0x200c) -- zero-width non-joiner
local zwnj = U(0x200C) -- zero-width non-joiner
local alif_madda = U(0x622)
local alif_madda = U(0x622)
local alif_hamza_below = U(0x625)
local alif_hamza_below = U(0x625)
Line 38: Line 41:
local dagger_alif = U(0x670)
local dagger_alif = U(0x670)
local alif_waSl = U(0x671)
local alif_waSl = U(0x671)
--local zwj = U(0x200d) -- zero-width joiner
--local zwj = U(0x200D) -- zero-width joiner
local lrm = U(0x200e) -- left-to-right mark
local lrm = U(0x200E) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
local rlm = U(0x200F) -- right-to-left mark
-- Occurs after al- in allaḏī and variants so that we can implement elision of
-- a- after a preceding vowel, after which we remove the marker.
local alladi_marker = U(0xFFF0)


local tt = {
local tt = {
Line 46: Line 52:
["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ",
["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ",
["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š",
["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š",
["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʿ", ["غ"]="ḡ",
["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʕ", ["غ"]="ḡ",
["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n",
["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n",
["ه"]="h",
["ه"]="h",
Line 57: Line 63:
-- [zwj]="", -- ZWJ (zero-width joiner)
-- [zwj]="", -- ZWJ (zero-width joiner)
-- rare letters
-- rare letters
["پ"]="p", ["چ"]="č", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g", ["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g",
["پ"]="p", ["چ"]="č", ["ژ"]="ž", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g",
["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g",
-- semivowels or long vowels, alif, hamza, special letters
-- semivowels or long vowels, alif, hamza, special letters
["ا"]="ā", -- ʾalif
["ا"]="ā", -- ʾalif
-- hamzated letters
-- hamzated letters
["أ"]="ʾ", -- hamza over alif
["أ"]="ʔ", -- hamza over alif
[alif_hamza_below]="ʾ", -- hamza under alif
[alif_hamza_below]="ʔ", -- hamza under alif
["ؤ"]="ʾ", -- hamza over wāw
["ؤ"]="ʔ", -- hamza over wāw
["ئ"]="ʾ", -- hamza over yā
["ئ"]="ʔ", -- hamza over yā
["ء"]="ʾ", -- hamza on the line
["ء"]="ʔ", -- hamza on the line
-- long vowels
-- long vowels
[waaw]="w", --"ū" after ḍamma (u) and not before diacritic
[waaw]="w", --"ū" after ḍamma (u) and not before diacritic
[yaa]="y", --"ī" after kasra (i) and not before diacritic
[yaa]="y", --"ī" after kasra (i) and not before diacritic
[alif_maqSuura]="ā", -- ʾalif maqṣūra
[alif_maqSuura]="ā", -- ʾalif maqṣūra
[alif_madda]="ʾā", -- ʾalif madda
[alif_madda]="ʔā", -- ʾalif madda
[alif_waSl]= "", -- hamzatu l-waṣl
[alif_waSl]= "", -- hamzatu l-waṣl
[dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
[dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
Line 115: Line 122:
local sun_letters_tr = table.concat(ttsun3, "")
local sun_letters_tr = table.concat(ttsun3, "")


local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچڤگڨڧڢںڭأإؤئءةﷲ"
local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچژڤگڨڧڢںڭأإؤئءةﷲ"
-- consonants on the right side; includes alif madda
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ويآ"
local rconsonants = consonants_needing_vowels .. "ويآ"
Line 129: Line 136:
local before_diacritic_checking_subs = {
local before_diacritic_checking_subs = {
------------ transformations prior to checking for diacritics --------------
------------ transformations prior to checking for diacritics --------------
-- random Koranic marks and presentation forms
{U(0x06E1), sukuun}, -- "Small High Dotless Head of Khah" (variant of sukūn)
{U(0x06DA), ""}, -- "Small High Jeem"
{U(0x06DF), ""}, -- "Small High Rounded Zero" (FIXME: correct?)
{U(0x08F0), U(0x64B)}, -- "Open Fathatan"
{U(0x08F1), U(0x64C)}, -- "Open Dammatan"
{U(0x08F2), U(0x64D)}, -- "Open Kasratan"
{U(0x06E4), ""}, -- "Small High Madda" (FIXME: correct?)
{U(0x06D6), ""}, -- "Small High Ligature Sad with Lam with Alef Maksura" (FIXME: there are others we need to do)
{U(0x06E5), "و"},
{U(0x06E6), "ي"},
-- convert llh for allāh into ll+shadda+dagger-alif+h
-- convert llh for allāh into ll+shadda+dagger-alif+h
{"لله", "للّٰه"},
{"لله", "للّٰه"},
Line 136: Line 154:
-- transliteration process inconvenient, so undo it.
-- transliteration process inconvenient, so undo it.
{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"},
{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"},
-- ignore Koranic gemination at beginning of word due to assimilation of preceding consonant
{" ([" .. lconsonants .. "])" .. shadda, " %1"},
-- ignore alif jamīla (otiose alif in 3pl verb forms)
-- ignore alif jamīla (otiose alif in 3pl verb forms)
--    #1: handle ḍamma + wāw + alif (final -ū)
--    #1: handle ḍamma + wāw + alif (final -ū)
Line 181: Line 201:
{alif_waSl .. fatHa .. "?" .. laam, "l-"},
{alif_waSl .. fatHa .. "?" .. laam, "l-"},
-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"),
-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"),
-- so we don't mistakenly double the dash
-- so we don't mistakenly double the dash; insert a special marker here so
{"l%-" .. shadda, "ll"},
-- that we know later to elide the a- after a vowel
{"l%-" .. shadda, "l" .. alladi_marker .. "l"},
-- implement assimilation of sun letters
-- implement assimilation of sun letters
{"l%-[" .. sun_letters .. "]", ttsun2},
{"l%-[" .. sun_letters .. "]", ttsun2},
Line 206: Line 227:


if not force_translit and not has_diacritics(text) then
if not force_translit and not has_diacritics(text) then
require("Module:debug").track("ar-translit/lacking diacritics")
return nil
return nil
end
end
Line 227: Line 249:
-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un.
-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un.
text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
'%1<span style="color: #888888">t</span>%2')
'%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2')
text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
'%1<span style="color: #888888">t</span>%2')
'%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2')
text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1")
text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1")
text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])",
text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])",
'<span style="color: #888888">t</span>%1')
'<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%1')
text = rsub(text, ".", {
text = rsub(text, ".", {
[fatHataan] = '<span style="color: #888888">an</span>',
[fatHataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">an</span>',
[kasrataan] = '<span style="color: #888888">in</span>',
[kasrataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">in</span>',
[Dammataan] = '<span style="color: #888888">un</span>'
[Dammataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">un</span>'
})
})
text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")",
text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")",
function(vowel, space)
function(vowel, space)
vowel_repl = {
vowel_repl = {
[fatHa] = '<span style="color: #888888">a</span> ',
[fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span> ',
[kasra] = '<span style="color: #888888">i</span> ',
[kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span> ',
[Damma] = '<span style="color: #888888">u</span> '
[Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span> '
}
}
return vowel_repl[vowel] .. space
return vowel_repl[vowel] .. space
Line 249: Line 271:
)
)
text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", {
text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", {
[fatHa] = '<span style="color: #888888">a</span>',
[fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span>',
[kasra] = '<span style="color: #888888">i</span>',
[kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span>',
[Damma] = '<span style="color: #888888">u</span>'
[Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span>'
})
})
text = rsub(text, '</span><span style="color: #888888">', "")
text = rsub(text, '</span><span style="color: var(--wikt-palette-grey-8,#888)">', "")
elseif omit_i3raab then -- omit ʾiʿrāb in transliteration
elseif omit_i3raab then -- omit ʾiʿrāb in transliteration
text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "")
text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "")
Line 292: Line 314:
text = rsub(text, "aā", "ā")
text = rsub(text, "aā", "ā")
-- Implement elision of al- after a final vowel. We do this
-- Implement elision of al- after a final vowel. We do this
-- conservatively, only handling elision of the definite article rather
-- conservatively, only handling elision of the definite article and related
-- terms (specifically, relative pronoun الَّذِي (allaḏī) and variants) rather
-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives
-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives
-- or form-VII and above verbal nouns) partly because elision in
-- or form-VII and above verbal nouns) partly because elision in
Line 298: Line 321:
-- elision in case of words written with initial bare alif instead of
-- elision in case of words written with initial bare alif instead of
-- properly with hamzated alif. Possibly we should reconsider.
-- properly with hamzated alif. Possibly we should reconsider.
-- At the very least we currently don't handle elision of الَّذِي (allaḏi)
text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
-- correctly because we special-case it to appear without the hyphen;
-- perhaps we should reconsider that.
text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "]%-)",
"%1%2")
"%1%2")
if gray_i3raab then
if gray_i3raab then
text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "]%-)",
text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
"%1%2")
"%1%2")
end
end
-- Special-case the transliteration of allāh, without the hyphen
-- remove indicator of allaḏī, which has served its purpose
text = rsub(text, alladi_marker, "")
-- Special-case the transliteration of allāh, without the hyphen.
text = rsub(text, "^(a?)l%-lāh", "%1llāh")
text = rsub(text, "^(a?)l%-lāh", "%1llāh")
text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh")
text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh")
-- Compress multiple spaces, which may occur e.g. when removing Koranic diacritics.
text = rsub(text, "(%s)%s+", "%1")


return text
return text
Line 323: Line 347:
-- If you want to catch places without iʿrāb, comment out the next two lines.
-- If you want to catch places without iʿrāb, comment out the next two lines.
{"[" .. lconsonants .. "]$", ""},
{"[" .. lconsonants .. "]$", ""},
{"[" .. lconsonants .. "](" .. space_like_class .. ")", "%1"},
{"[" .. lconsonants .. "]([%)%]}]?" .. space_like_class .. ")", "%1"},
-- remove consonants (or alif) when followed by diacritics
-- remove consonants (or alif) when followed by diacritics
-- must go after removing shadda
-- must go after removing shadda
Line 350: Line 374:
-- declared as local above
-- declared as local above
function has_diacritics(text)
function has_diacritics(text)
local orig_text = text
local count
local count
text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "")
text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "")
Line 357: Line 382:
for _, sub in ipairs(has_diacritics_subs) do
for _, sub in ipairs(has_diacritics_subs) do
text = rsub(text, unpack(sub))
text = rsub(text, unpack(sub))
end
if #text > 0 then
mw.log(("Check for missing diacritics failed; original text '%s', text without diacritics '%s'"):format(
orig_text, text))
end
end
return #text == 0
return #text == 0
Line 407: Line 436:


return export
return export
-- For Vim, so we get 4-space tabs
-- vim: set ts=4 sw=4 noet:

Revision as of 21:10, 11 September 2025


This module will transliterate Arabic language text. It is also used to transliterate West Circassian, Avar, Chechen, Ingush, and East Circassian. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:ar-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Authors: Benwing, ZxxZxxZ, Atitarev

local export = {}

local m_str_utils = require("Module:string utilities")

local gcodepoint = m_str_utils.gcodepoint
local rfind = m_str_utils.find
local rsubn = m_str_utils.gsub
local rmatch = m_str_utils.match
local rsplit = m_str_utils.split
local U = m_str_utils.char
local unpack = unpack or table.unpack -- Lua 5.2 compatibility

-- assigned below
local has_diacritics

-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end

local zwnj = U(0x200C) -- zero-width non-joiner
local alif_madda = U(0x622)
local alif_hamza_below = U(0x625)
local alif = U(0x627)
local taa_marbuuTa = U(0x629)
local laam = U(0x644)
local waaw = U(0x648)
local alif_maqSuura = U(0x649)
local yaa = U(0x64A)
local fatHataan = U(0x64B)
local Dammataan = U(0x64C)
local kasrataan = U(0x64D)
local fatHa = U(0x64E)
local Damma = U(0x64F)
local kasra = U(0x650)
local shadda = U(0x651)
local sukuun = U(0x652)
local dagger_alif = U(0x670)
local alif_waSl = U(0x671)
--local zwj = U(0x200D) -- zero-width joiner
local lrm = U(0x200E) -- left-to-right mark
local rlm = U(0x200F) -- right-to-left mark
-- Occurs after al- in allaḏī and variants so that we can implement elision of
-- a- after a preceding vowel, after which we remove the marker.
local alladi_marker = U(0xFFF0)

local tt = {
	-- consonants
	["ب"]="b", ["ت"]="t", ["ث"]="ṯ", ["ج"]="j", ["ح"]="ḥ", ["خ"]="ḵ",
	["د"]="d", ["ذ"]="ḏ", ["ر"]="r", ["ز"]="z", ["س"]="s", ["ش"]="š",
	["ص"]="ṣ", ["ض"]="ḍ", ["ط"]="ṭ", ["ظ"]="ẓ", ["ع"]="ʕ", ["غ"]="ḡ",
	["ف"]="f", ["ق"]="q", ["ك"]="k", ["ڪ"]="k", ["ل"]="l", ["م"]="m", ["ن"]="n",
	["ه"]="h",
	-- tāʾ marbūṭa (special) - always after a fátḥa (a), silent at the end of
	-- an utterance, "t" in ʾiḍāfa or with pronounced tanwīn. We catch
	-- most instances of tāʾ marbūṭa before we get to this stage.
	[taa_marbuuTa]="t", -- tāʾ marbūṭa = ة
	-- control characters
	[zwnj]="-", -- ZWNJ (zero-width non-joiner)
	-- [zwj]="", -- ZWJ (zero-width joiner)
	-- rare letters
	["پ"]="p", ["چ"]="č", ["ژ"]="ž", ["ڤ"]="v", ["ڥ"]="v", ["گ"]="g",
	["ڨ"]="g", ["ڧ"]="q", ["ڢ"]="f", ["ں"]="n", ["ڭ"]="g",
	-- semivowels or long vowels, alif, hamza, special letters
	["ا"]="ā", -- ʾalif
	-- hamzated letters
	["أ"]="ʔ", -- hamza over alif
	[alif_hamza_below]="ʔ", -- hamza under alif
	["ؤ"]="ʔ", -- hamza over wāw
	["ئ"]="ʔ", -- hamza over yā
	["ء"]="ʔ", -- hamza on the line
	-- long vowels
	[waaw]="w", --"ū" after ḍamma (u) and not before diacritic
	[yaa]="y", --"ī" after kasra (i) and not before diacritic
	[alif_maqSuura]="ā", -- ʾalif maqṣūra
	[alif_madda]="ʔā", -- ʾalif madda
	[alif_waSl]= "", -- hamzatu l-waṣl
	[dagger_alif] = "ā", -- ʾalif xanjariyya = dagger ʾalif (Koranic diacritic)
	-- short vowels, šádda and sukūn
	[fatHataan]="an", -- fatḥatan
	[Dammataan]="un", -- ḍammatan
	[kasrataan]="in", -- kasratan
	[fatHa]="a", -- fatḥa
	[Damma]="u", -- ḍamma
	[kasra]="i", -- kasra
	-- šadda - doubled consonant
	[sukuun]="", --sukūn - no vowel
	-- ligatures
	["ﻻ"]="lā",
	["ﷲ"]="llāh",
	-- taṭwīl
	["ـ"]="", -- taṭwīl, no sound
	-- numerals
	["١"]="1", ["٢"]="2", ["٣"]="3", ["٤"]="4", ["٥"]="5",
	["٦"]="6", ["٧"]="7", ["٨"]="8", ["٩"]="9", ["٠"]="0",
	-- punctuation (leave on separate lines)
	["؟"]="?", -- question mark
	["«"]='“', -- quotation mark
	["»"]='”', -- quotation mark
	["٫"]=".", -- decimal point
	["٬"]=",", -- thousands separator
	["٪"]="%", -- percent sign
	["،"]=",", -- comma
	["؛"]=";" -- semicolon
}

local sun_letters = "تثدذرزسشصضطظلن"
-- For use in implementing sun-letter assimilation of ال (al-)
local ttsun1 = {}
local ttsun2 = {}
local ttsun3 = {}
for cp in gcodepoint(sun_letters) do
	local ch = U(cp)
	ttsun1[ch] = tt[ch]
	ttsun2["l-" .. ch] = tt[ch] .. "-" .. ch
	table.insert(ttsun3, tt[ch])
end
-- For use in implementing elision of al-
local sun_letters_tr = table.concat(ttsun3, "")

local consonants_needing_vowels = "بتثجحخدذرزسشصضطظعغفقكڪلمنهپچژڤگڨڧڢںڭأإؤئءةﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ويآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels .. "وي"
-- Arabic semicolon, comma, question mark; taṭwīl; period, exclamation point,
-- single quote for bold/italic, double quotes for quoted material
local punctuation = "؟،؛" .. "ـ" .. ".!'" .. '"'
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. "]"
local numbers = "١٢٣٤٥٦٧٨٩٠"

local before_diacritic_checking_subs = {
	------------ transformations prior to checking for diacritics --------------
	-- random Koranic marks and presentation forms
	{U(0x06E1), sukuun}, -- "Small High Dotless Head of Khah" (variant of sukūn)
	{U(0x06DA), ""}, -- "Small High Jeem"
	{U(0x06DF), ""}, -- "Small High Rounded Zero" (FIXME: correct?)
	{U(0x08F0), U(0x64B)}, -- "Open Fathatan"
	{U(0x08F1), U(0x64C)}, -- "Open Dammatan"
	{U(0x08F2), U(0x64D)}, -- "Open Kasratan"
	{U(0x06E4), ""}, -- "Small High Madda" (FIXME: correct?)
	{U(0x06D6), ""}, -- "Small High Ligature Sad with Lam with Alef Maksura" (FIXME: there are others we need to do)
	{U(0x06E5), "و"},
	{U(0x06E6), "ي"},
	-- convert llh for allāh into ll+shadda+dagger-alif+h
	{"لله", "للّٰه"},
	-- shadda+short-vowel (including tanwīn vowels, i.e. -an -in -un) gets
	-- replaced with short-vowel+shadda during NFC normalisation, which
	-- MediaWiki does for all Unicode strings; however, it makes the
	-- transliteration process inconvenient, so undo it.
	{"([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. dagger_alif .. "])" .. shadda, shadda .. "%1"},
	-- ignore Koranic gemination at beginning of word due to assimilation of preceding consonant
	{" ([" .. lconsonants .. "])" .. shadda, " %1"},
	-- ignore alif jamīla (otiose alif in 3pl verb forms)
	--     #1: handle ḍamma + wāw + alif (final -ū)
	{Damma .. waaw .. alif, Damma .. waaw},
	--     #2: handle wāw + sukūn + alif (final -w in -aw in defective verbs)
	--     this must go before the generation of w, which removes the waw here.
	{waaw .. sukuun .. alif, waaw .. sukuun},
	-- ignore final alif or alif maqṣūra following fatḥatan (e.g. in accusative
	-- singular or words like عَصًا "stick" or هُدًى "guidance"; this is called
	-- tanwin nasb)
	{fatHataan .. "[" .. alif .. alif_maqSuura .. "]", fatHataan},
	-- same but with the fatḥatan placed over the alif or alif maqṣūra
	-- instead of over the previous letter (considered a misspelling but
	-- common)
	{"[" .. alif .. alif_maqSuura .. "]" .. fatHataan, fatHataan},
	-- tāʾ marbūṭa should always be preceded by fatḥa, alif, alif madda or
	-- dagger alif; infer fatḥa if not
	{"([^" .. fatHa .. alif .. alif_madda .. dagger_alif .. "])" .. taa_marbuuTa, "%1" .. fatHa .. taa_marbuuTa},
	-- similarly for alif between consonants, possibly marked with shadda
	-- (does not apply to initial alif, which is silent when not marked with
	-- hamza, or final alif, which might be pronounced as -an)
	{"([" .. lconsonants .. "]" .. shadda .. "?)" .. alif .. "([" .. rconsonants .. "])",
		"%1" .. fatHa .. alif .. "%2"},
	-- infer fatḥa in case of non-fatḥa + alif/alif-maqṣūra + dagger alif
	{"([^" .. fatHa .. "])([" .. alif .. alif_maqSuura .. "]" .. dagger_alif .. ")", "%1" .. fatHa .. "%2"},
	-- infer kasra in case of hamza-under-alif not + kasra
	{alif_hamza_below .. "([^" .. kasra .. kasrataan .. "])", alif_hamza_below .. kasra .. "%1"},
	-- ignore dagger alif placed over regular alif or alif maqṣūra
	{"([" .. alif .. alif_maqSuura .. "])" .. dagger_alif, "%1"},

	----------- rest of these concern definite article alif-lām ----------
	-- in kasra/ḍamma + alif + lam, make alif into hamzatu l-waṣl, so we
	-- handle cases like بِالتَّوْفِيق (bi-t-tawfīq) correctly
	{"([" .. Damma .. kasra .. "])" .. alif .. laam, "%1" .. alif_waSl .. laam},
	-- al + consonant + shadda (only recognize word-initially if regular alif): remove shadda
	{"^(" .. alif .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
	{"(" .. space_like_class .. alif .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
	{"(" .. alif_waSl .. fatHa .. "?" .. laam .. "[" .. lconsonants .. "])" .. shadda, "%1"},
	-- handle l- hamzatu l-waṣl or word-initial al-
	{"^" .. alif .. fatHa .. "?" .. laam, "al-"},
	{"(" .. space_like_class .. ")" .. alif .. fatHa .. "?" .. laam, "%1al-"},
	-- next one for bi-t-tawfīq
	{"([" .. Damma .. kasra .. "])" .. alif_waSl .. fatHa .. "?" .. laam, "%1-l-"},
	-- next one for remaining hamzatu l-waṣl (at beginning of word)
	{alif_waSl .. fatHa .. "?" .. laam, "l-"},
	-- special casing if the l in al- has a shadda on it (as in الَّذِي "that"),
	-- so we don't mistakenly double the dash; insert a special marker here so
	-- that we know later to elide the a- after a vowel
	{"l%-" .. shadda, "l" .. alladi_marker .. "l"},
	-- implement assimilation of sun letters
	{"l%-[" .. sun_letters .. "]", ttsun2},
}

-- Transliterate the word(s) in TEXT. LANG (the language) and SC (the script)
-- are ignored. OMIT_I3RAAB means leave out final short vowels (ʾiʿrāb).
-- GRAY_I3RAAB means render transliterate short vowels (ʾiʿrāb) in gray.
-- FORCE_TRANSLIT causes even non-vocalized text to be transliterated
-- (normally the function checks for non-vocalized text and returns nil,
-- since such text is ambiguous in transliteration).
function export.tr(text, lang, sc, omit_i3raab, gray_i3raab, force_translit)
	-- make it possible to call this function from a template
	if type(text) == "table" then
		local function f(x) return (x ~= "") and x or nil end
		text, lang, sc, omit_i3raab, force_translit =
			f(text.args[1]), f(text.args[2]), f(text.args[3]), f(text.args[4]), f(text.args[5])
	end

	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = rsub(text, sub[1], sub[2])
	end

	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("ar-translit/lacking diacritics")
		return nil
	end
	
	------------ transformations after checking for diacritics --------------
	-- Replace plain alif with hamzatu l-waṣl when followed by fatḥa/ḍamma/kasra.
	-- Must go after handling of initial al-, which distinguishes alif-fatḥa
	-- from alif w/hamzatu l-waṣl. Must go before generation of ū and ī, which
	-- eliminate the ḍamma/kasra.
	text = rsub(text, alif .. "([" .. fatHa .. Damma .. kasra .. "])", alif_waSl .. "%1")
	-- ḍamma + waw not followed by a diacritic is ū, otherwise w
	text = rsub(text, Damma .. waaw .. "([^" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. shadda .. sukuun .. dagger_alif .. "])", "ū%1")
	text = rsub(text, Damma .. waaw .. "$", "ū")
	-- kasra + yaa not followed by a diacritic (or ū from prev step) is ī, otherwise y
	text = rsub(text, kasra .. yaa .. "([^" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. shadda .. sukuun .. dagger_alif .. "ū])", "ī%1")
	text = rsub(text, kasra .. yaa .. "$", "ī")
	-- convert shadda to double letter.
	text = rsub(text, "(.)" .. shadda, "%1%1")
	if not omit_i3raab and gray_i3raab then -- show ʾiʿrāb grayed in transliteration
		-- decide whether to gray out the t in ﺓ. If word begins with al- or l-, yes.
		-- Otherwise, no if word ends in a/i/u, yes if ends in an/in/un.
		text = rsub(text, "^(a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
			'%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2')
		text = rsub(text, "(" .. space_like_class .. "a?l%-[^%s]+)" .. taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. "])",
			'%1<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%2')
		text = rsub(text, taa_marbuuTa .. "([" .. fatHa .. Damma .. kasra .. "])", "t%1")
		text = rsub(text, taa_marbuuTa .. "([" .. fatHataan .. Dammataan .. kasrataan .. "])",
			'<span style="color: var(--wikt-palette-grey-8,#888)">t</span>%1')
		text = rsub(text, ".", {
			[fatHataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">an</span>',
			[kasrataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">in</span>',
			[Dammataan] = '<span style="color: var(--wikt-palette-grey-8,#888)">un</span>'
		})
		text = rsub(text, "([" .. fatHa .. Damma .. kasra .. "])(" .. space_like_class .. ")",
			function(vowel, space)
				vowel_repl = {
					[fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span> ',
					[kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span> ',
					[Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span> '
				}
				return vowel_repl[vowel] .. space
			end
		)
		text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", {
			[fatHa] = '<span style="color: var(--wikt-palette-grey-8,#888)">a</span>',
			[kasra] = '<span style="color: var(--wikt-palette-grey-8,#888)">i</span>',
			[Damma] = '<span style="color: var(--wikt-palette-grey-8,#888)">u</span>'
		})
		text = rsub(text, '</span><span style="color: var(--wikt-palette-grey-8,#888)">', "")
	elseif omit_i3raab then -- omit ʾiʿrāb in transliteration
		text = rsub(text, "[" .. fatHataan .. Dammataan .. kasrataan .. "]", "")
		text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "](" .. space_like_class .. ")", "%1")
		text = rsub(text, "[" .. fatHa .. Damma .. kasra .. "]$", "")
	end
	-- tāʾ marbūṭa should not be rendered by -t if word-final even when
	-- ʾiʿrāb (desinential inflection) is shown; instead, use (t) before
	-- whitespace, nothing when final; but render final -ﺍﺓ and -ﺁﺓ as -āh,
	-- consistent with Wehr's dictionary
	-- Left-to-right or right-to-left mark at end of text will prevent tāʾ marbūṭa
	-- from being transliterated correctly.
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	text = rsub(text, "([" .. alif .. alif_madda .. "])" .. taa_marbuuTa .. "$", "%1h")
	-- Ignore final tāʾ marbūṭa (it appears as "a" due to the preceding
	-- short vowel). Need to do this after graying or omitting word-final
	-- ʾiʿrāb.
	text = rsub(text, taa_marbuuTa .. "$", "")
	text = rsub(text, taa_marbuuTa .. "(%p)", "%1")
	if not omit_i3raab then -- show ʾiʿrāb in transliteration
		text = rsub(text, taa_marbuuTa .. "(" .. space_like_class .. ")", "(t)%1")
	else
		-- When omitting ʾiʿrāb, show all non-absolutely-final instances of
		-- tāʾ marbūṭa as (t), with trailing ʾiʿrāb omitted.
		text = rsub(text, taa_marbuuTa, "(t)")
	end
	-- tatwīl should be rendered as - at beginning or end of word. It will
	-- be rendered as nothing in the middle of a word (FIXME, do we want
	-- this?)
	text = rsub(text, "^ـ", "-")
	text = rsub(text, "(" .. space_like_class .. ")ـ",
		"%1-")
	text = rsub(text, "ـ$", "-")
	text = rsub(text, "ـ(" .. space_like_class .. ")", "-%1")
	-- Now convert remaining Arabic chars according to table.
	text = rsub(text, ".", tt)
	text = rsub(text, "aā", "ā")
	-- Implement elision of al- after a final vowel. We do this
	-- conservatively, only handling elision of the definite article and related
	-- terms (specifically, relative pronoun الَّذِي (allaḏī) and variants) rather
	-- than elision in other cases of hamzat al-waṣl (e.g. form-I imperatives
	-- or form-VII and above verbal nouns) partly because elision in
	-- these cases isn't so common in MSA and partly to avoid excessive
	-- elision in case of words written with initial bare alif instead of
	-- properly with hamzated alif. Possibly we should reconsider.
	text = rsub(text, "([aiuāīū]'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
		"%1%2")
	if gray_i3raab then
		text = rsub(text, "([aiuāīū]'*</span>'* +'*)a([" .. sun_letters_tr .. "][%-" .. alladi_marker .. "])",
			"%1%2")
	end
	-- remove indicator of allaḏī, which has served its purpose
	text = rsub(text, alladi_marker, "")
	-- Special-case the transliteration of allāh, without the hyphen.
	text = rsub(text, "^(a?)l%-lāh", "%1llāh")
	text = rsub(text, "(" .. space_like_class .. "a?)l%-lāh", "%1llāh")
	-- Compress multiple spaces, which may occur e.g. when removing Koranic diacritics.
	text = rsub(text, "(%s)%s+", "%1")

	return text
end

local has_diacritics_subs = {
	-- FIXME! What about lam-alif ligature?
	-- remove punctuation and shadda
	-- must go before removing final consonants
	{"[" .. punctuation .. shadda .. "]", ""},
	-- Remove consonants at end of word or utterance, so that we're OK with
	-- words lacking iʿrāb (must go before removing other consonants).
	-- If you want to catch places without iʿrāb, comment out the next two lines.
	{"[" .. lconsonants .. "]$", ""},
	{"[" .. lconsonants .. "]([%)%]}]?" .. space_like_class .. ")", "%1"},
	-- remove consonants (or alif) when followed by diacritics
	-- must go after removing shadda
	-- do not remove the diacritics yet because we need them to handle
	-- long-vowel sequences of diacritic + pseudo-consonant
	{"[" .. lconsonants .. alif .. "]([" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. sukuun .. dagger_alif .. "])", "%1"},
	-- the following two must go after removing consonants w/diacritics because
	-- we only want to treat vocalic wāw/yā' in them (we want to have removed
	-- wāw/yā' followed by a diacritic)
	-- remove ḍamma + wāw
	{Damma .. waaw, ""},
	-- remove kasra + yā'
	{kasra .. yaa, ""},
	-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
	{"[" .. fatHataan .. fatHa .. "][" .. alif .. alif_maqSuura .. "]", ""},
	-- remove diacritics
	{"[" .. fatHataan .. Dammataan .. kasrataan .. fatHa .. Damma .. kasra .. sukuun .. dagger_alif .. "]", ""},
	-- remove numbers, hamzatu l-waṣl, alif madda
	{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
	-- remove non-Arabic characters
	{"[^" .. U(0x0600) .. "-" .. U(0x06FF) .. U(0x0750) .. "-" .. U(0x077F) ..
			 U(0x08A0) .. "-" .. U(0x08FF) .. U(0xFB50) .. "-" .. U(0xFDFF) ..
			 U(0xFE70) .. "-" .. U(0xFEFF) .. "]", ""}
}

-- declared as local above
function has_diacritics(text)
	local orig_text = text
	local count
	text, count = rsubn(text, "[" .. lrm .. rlm .. "]", "")
	if count > 0 then
		require("Module:debug").track("ar-translit/lrm or rlm")
	end
	for _, sub in ipairs(has_diacritics_subs) do
		text = rsub(text, unpack(sub))
	end
	if #text > 0 then
		mw.log(("Check for missing diacritics failed; original text '%s', text without diacritics '%s'"):format(
			orig_text, text))
	end
	return #text == 0
end

-- Return true if transliteration TR is an irregular transliteration of
-- ARABIC. Return false if ARABIC can't be transliterated. For purposes of
-- establishing regularity, hyphens are ignored and word-final tāʾ marbūṭa
-- can be transliterated as "(t)", "" or "t".
function export.irregular_translit(arabic, tr)
	if not arabic or arabic == "" or not tr or tr == "" then
		return false
	end
	local regtr = export.tr(arabic)
	if not regtr or regtr == tr then
		return false
	end
	local arwords = rsplit(arabic, " ")
	local regwords = rsplit(regtr, " ")
	local words = rsplit(tr, " ")
	if #regwords ~= #words or #regwords ~= #arwords then
		return true
	end
	for i=1,#regwords do
		local regword = regwords[i]
		local word = words[i]
		local arword = arwords[i]
		-- Resolve final (t) in auto-translit to t, h or nothing
		if rfind(regword, "%(t%)$") then
			regword = rfind(word, "āh$") and rsub(regword, "%(t%)$", "h") or
				rfind(word, "t$") and rsub(regword, "%(t%)$", "t") or
				rsub(regword, "%(t%)$", "")
		end
		-- Resolve clitics + short a + alif-lām, which may get auto-transliterated
		-- to contain long ā, to short a if the manual translit has it; note
		-- that currently in cases with assimilated l, the auto-translit will
		-- fail, so we won't ever get here and don't have to worry about
		-- auto-translit l against manual-translit assimilated char.
		local clitic_chars = "^[وفكل]" -- separate line to avoid L2R display weirdness
		if rfind(arword, clitic_chars .. fatHa .. "?[" .. alif .. alif_waSl .. "]" .. laam) and rfind(word, "^[wfkl]a%-") then
			regword = rsub(regword, "^([wfkl])ā", "%1a")
		end
		-- Ignore hyphens when comparing
		if rsub(regword, "%-", "") ~= rsub(word, "%-", "") then
			return true
		end
	end
	return false
end

return export