Module:IPA: Difference between revisions

No edit summary
No edit summary
 
(11 intermediate revisions by the same user not shown)
Line 3: Line 3:
local force_cat = false -- for testing
local force_cat = false -- for testing


local pages_module = "Module:pages"
local pron_qualifier_module = "Module:pron qualifier"
local pron_qualifier_module = "Module:pron qualifier"
local qualifier_module = "Module:qualifier"
local qualifier_module = "Module:qualifier"
Line 21: Line 22:
local gsub = string.gsub
local gsub = string.gsub
local insert = table.insert
local insert = table.insert
local is_preview = require(pages_module).is_preview
local len = m_str_utils.len
local len = m_str_utils.len
local listToText = mw.text.listToText
local listToText = mw.text.listToText
local match = string.match
local match = string.match
local pattern_escape = m_str_utils.pattern_escape
local sub = string.sub
local sub = string.sub
local u = m_str_utils.char
local u = m_str_utils.char
Line 31: Line 34:


local namespace = mw.title.getCurrentTitle().namespace
local namespace = mw.title.getCurrentTitle().namespace
local is_content_page = namespace == 0 or namespace == 118
local is_content_page = namespace == 0 or namespace == 120


local function process_maybe_split_categories(split_output, categories, prontext, lang, errtext)
local function process_maybe_split_categories(split_output, categories, prontext, lang, errtext)
Line 55: Line 58:
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by
Format a line of one or more IPA pronunciations as {{tl|IPA}} would do it, i.e. with a preceding {"IPA:"} followed by
the word {"key"} linking to an Appendix page describing the language's phonology, and with an added category
the word {"key"} linking to an Appendix page describing the language's phonology, and with an added category
{{cd|<var>lang</var> terms with IPA pronunciation}}. Other than the extra preceding text and category, this is identical
` ``lang`` terms with IPA pronunciation`. Other than the extra preceding text and category, this is identical
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a
to {format_IPA_multiple()}, and the considerations described there in the documentation apply here as well. There is a
single parameter `data`, an object with the following fields:
single parameter `data`, an object with the following fields:
Line 61: Line 64:
   pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
   pronunciations with invalid phonemes; for determining how many syllables the pronunciations have in them, in order to
   add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category
   add a category such as [[:Category:Italian 2-syllable words]] (for certain languages only); for adding a category
   {{cd|<var>lang</var> terms with IPA pronunciation}}; and for determining the proper sort keys for categories. Unlike
   ` ``lang`` terms with IPA pronunciation`; and for determining the proper sort keys for categories. Unlike
   for {format_IPA_multiple()}, `lang` may not be {nil}.
   for {format_IPA_multiple()}, `lang` may not be {nil}.
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}.
* `items`: List of pronunciations, in exactly the same format as for {format_IPA_multiple()}.
Line 115: Line 118:
prefix_text = '<span class="error">' .. err .. '</span>'
prefix_text = '<span class="error">' .. err .. '</span>'
else
else
if hasKey[lang:getCode()] then
prefix_text = "IPA for " .. langname
prefix_text = "Appendix:" .. langname .. " pronunciation"
else
prefix_text = "wikipedia:" .. langname .. " phonology"
end
prefix_text = "[[" .. prefix_text .. "|key]]"
prefix_text = "[[" .. prefix_text .. "|key]]"
end
end


local prefix = "[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>:&#32;"
local prefix = "[[wikt:Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. prefix_text .. ")</sup>:&#32;"


local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw")
local IPAs, categories = export.format_IPA_multiple(lang, items, separator, no_count, "raw")
if is_content_page then
insert(categories, {
cat = langname .. " terms with IPA pronunciation",
sort_key = sort_key
})
end


local prontext = prefix .. IPAs
local prontext = prefix .. IPAs
Line 162: Line 154:
local function determine_repr(pron)
local function determine_repr(pron)
local reconstructed
local reconstructed
 
-- remove initial asterisk before representation marks, used on some Reconstruction pages
-- Temporarily remove any initial asterisk before representation marks,
-- which avoids having to account for it in the data, but set the
-- `reconstructed` flag.
if sub(pron, 1, 1) == "*" then
if sub(pron, 1, 1) == "*" then
reconstructed = true
reconstructed = true
pron = sub(pron, 2)
pron = sub(pron, 2)
end
end
 
local opening = match(pron, "^.[\128-\191]*")
-- Some representation types have aliases for convenience (e.g. "// //" is
local data = m_data.representation_types[opening]
-- an alias for "⫽ ⫽"), and these need to be substituted in before checking
-- for other data.
if data then
local opening, n = match(pron, "^.[\128-\191]*")
local closing = data[2]
local subs_data = m_data.representation_subs[opening]
if data and match(pron, closing .. "$", #opening + 1) then
if subs_data then
return data[1], opening, closing, reconstructed
pron, n = ugsub(pron, subs_data[1], subs_data[2])
-- If the substitution was made, `opening` needs to be changed to the
-- new opening character.
if n ~= 0 then
opening = subs_data[3]
end
end
 
-- Get the type data based on the opening character (if any), and set the
-- representation type if the closing character matches.
local type_data, repr, closing = m_data.representation_types[opening]
if type_data then
closing = type_data[2]
if type_data and match(pron, pattern_escape(closing) .. "$", #opening + 1) then
repr = type_data[1]
end
end
end
end
 
return nil, "", "", reconstructed
-- Default to the empty string.
if not repr then
opening, closing = "", ""
end
 
-- Reattach the asterisk if reconstructed.
if reconstructed then
pron = "*" .. pron
end
 
return pron, repr, opening, closing, reconstructed
end
end


local function hasInvalidSeparators(transcription)
local function hasInvalidSeparators(transcription)
if umatch(transcription, "%.[ˈˌ]") or umatch(transcription, "[ˈˌ][ .]") then
-- Escape certain characters as well as pauses, which have the format "(...)" (with any number of dots), to avoid false positives.
return true
transcription = transcription:gsub(".[\128-\191]*", m_symbols.separator_escapes)
else
:gsub("%(%.+%)", "\3")
return false
:gsub("[()]+", "")
end
return (
transcription:find("..", nil, true) or
transcription:match("%.%f[%z \1\2\3,:;]") or
transcription:match("\1%f[%z \2\3,:;]") or
transcription:match("\2%f[%z \1\3,:;]") or
transcription:match("\3[:;]") or
transcription:match("%f[^%z \1\2\3,]%.")
) and true or false
end
end


--[==[
--[==[
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a
Format a line of one or more bare IPA pronunciations (i.e. without any preceding {"IPA:"} and without adding to a
category {{cd|<var>lang</var> terms with IPA pronunciation}}). Individual pronunciations are formatted using
category ` ``lang`` terms with IPA pronunciation`). Individual pronunciations are formatted using
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations.
{format_IPA()} and are combined with separators, qualifiers, pre-text, post-text, etc. to form a line of pronunciations.
Parameters accepted are:
Parameters accepted are:
Line 246: Line 271:
if namespace == 10 then -- Template
if namespace == 10 then -- Template
insert(items, {pron = "/aɪ piː ˈeɪ/"})
insert(items, {pron = "/aɪ piː ˈeɪ/"})
else
insert(categories, "Pronunciation templates without a pronunciation")
end
end
end
end
Line 331: Line 354:
local langcode = lang:getCode()
local langcode = lang:getCode()
if m_data.langs_to_generate_syllable_count_categories[langcode] then
if m_data.langs_to_generate_syllable_count_categories[langcode] then
local phonemic, phonetic, use_it = split_phonemic_phonetic(item.pron)
local raw_phonemic, phonetic, use_it = split_phonemic_phonetic(item.pron)
local repr = determine_repr(phonemic)
local phonemic, repr = determine_repr(raw_phonemic)
if not phonetic then -- not a '/.../ [...]' combined pronunciation
if not phonetic then -- not a '/.../ [...]' combined pronunciation
if m_data.langs_to_use_phonetic_notation[langcode] then
if m_data.langs_to_use_phonetic_notation[langcode] then
Line 353: Line 376:
end
end
end
end
end
-- The nature of hasInvalidSeparators() is such that we don't have to split a combined '/.../ [...]' spec
-- into its parts in order to process it.
if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then
insert(categories, "English IPA pronunciations with invalid separators")
end
end
end
end
Line 373: Line 390:
may have HTML added surrounding invalid characters so they appear in red.
may have HTML added surrounding invalid characters so they appear in red.
]=]
]=]
local function format_one_IPA(lang, pron, err, categories)
local function format_one_IPA(lang, raw_pron, err, categories)
-- Disallow wikilinks.
-- Disallow wikilinks.
if match(pron, "%[%[.-%]%]") then
if match(raw_pron, "%[%[.-%]%]") then
error("IPA input must not contain wikilinks.")
error("IPA input must not contain wikilinks.")
end
end
pron = decode_entities(pron)
raw_pron = decode_entities(raw_pron)


-- Detect the type of transcription.
-- Detect the type of transcription.
local repr, opening, closing, reconstructed = determine_repr(pron)
local pron, repr, opening, closing, reconstructed = determine_repr(raw_pron)
-- Strip any reconstruction asterisk and representation marks.
-- Strip any reconstruction asterisk and representation marks.
pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1)
pron = sub(pron, #opening + 1 + (reconstructed and 1 or 0), -#closing - 1)
if not repr then
insert(categories, "IPA pronunciations with invalid representation marks")
-- insert(err, "invalid representation marks")
-- Removed because it's annoying when previewing pronunciation pages.
end


if pron == "" then
if repr ~= "orthographic" and lang and lang:getCode() == "en" and hasInvalidSeparators(pron) then
insert(categories, "IPA pronunciations with no pronunciation present")
insert(categories, "English IPA pronunciations with invalid separators")
end
end


Line 404: Line 416:
end
end
insert(result, nonstandard)
insert(result, nonstandard)
insert(categories,
{cat = "IPA pronunciations with obsolete or nonstandard characters", sort_key = nonstandard}
)
end
end


Line 446: Line 455:


if result ~= "" then
if result ~= "" then
local suggestions = {}
if lang then
-- Get the per_lang_valid data, and convert any per-language valid sequences to spaces.
local per_lang_valid = m_symbols.per_lang_valid[lang:getCode()]
if per_lang_valid then
if type(per_lang_valid) == "table" then
for _, pattern in pairs(per_lang_valid) do
result = ugsub(result, pattern, " ")
end
else -- Should be a string.
result = ugsub(result, per_lang_valid, " ")
end
end
end
local suggestions
-- Check for any invalid sequences, excluding anything in the per-language lookup table.
for k, v in pairs(m_symbols.invalid) do
for k, v in pairs(m_symbols.invalid) do
if find(result, k, 1, true) then
if find(result, k, nil, true) then
if not suggestions then
suggestions = {}
end
insert(suggestions, k .. " with " .. v)
insert(suggestions, k .. " with " .. v)
end
end
end
end
if suggestions[1] then
if suggestions and suggestions[1] then
suggestions = listToText(suggestions)
suggestions = listToText(suggestions)
if is_content_page then
if is_content_page then
error("Invalid IPA: replace " .. suggestions)
error("Invalid IPA: replace " .. suggestions)
else
insert(err, "replace " .. suggestions)
end
end
insert(err, "replace " .. suggestions)
end
end
result = ugsub(result, "⁽[".. m_symbols.superscripts .. "]+⁾", "")
-- Convert any valid character sequences to spaces
local per_lang_valid
for _, pattern in pairs(m_symbols.valid) do
if lang then
result = ugsub(result, pattern, " ")
per_lang_valid = m_symbols.per_lang_valid[lang:getCode()]
end
end
per_lang_valid = per_lang_valid or ""
result = ugsub(result, "[" .. m_symbols.valid .. per_lang_valid .. "]", "")
if result ~= "" then
local category = "IPA pronunciations with invalid IPA characters"
if not is_content_page then
category = category .. "/non_mainspace"
end
insert(categories, category)
insert(err, "invalid IPA characters (" .. result .. ")")
end
end
if found_HTML then
insert(categories, "IPA pronunciations with paired HTML tags")
end
end


Line 508: Line 518:
else
else
local phoneme = usub(rest, 1, 1)
local phoneme = usub(rest, 1, 1)
insert(phonemes, "<span style=\"color: red\">" .. phoneme .. "</span>")
insert(phonemes, "<span style=\"color: var(--wikt-palette-red,red)\">" .. phoneme .. "</span>")
rest = usub(rest, 2)
rest = usub(rest, 2)
insert(categories, "IPA pronunciations with invalid phonemes/" .. lang:getCode())
end
end
end
end
Line 551: Line 560:
end
end


if err[1] then
if err[1] and is_preview() then
err = '<span class="previewonly error" style="font-size: small;>&#32;' .. concat(err, ", ") .. "</span>"
err = '<span class="error" style="font-size: small;>&#32;' .. concat(err, ", ") .. "</span>"
else
else
err = ""
err = ""
end
end


return process_maybe_split_categories(split_output, categories, '<span class="IPA">' .. pron .. "</span>", lang,
return process_maybe_split_categories(split_output, categories, '<span class="IPA nowrap">' .. pron .. "</span>", lang,
err)
err)
end
end
Line 564: Line 573:
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to
Format a line of one or more enPR pronunciations as {{tl|enPR}} would do it, i.e. with a preceding {"enPR:"} (linked to
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The
[[Appendix:English pronunciation]]) followed by one or more formatted, comma-separated enPR pronunciations. The
pronunciations are formatted by wrapping them in the {{cd|AHD}} and {{cd|enPR}} CSS classes and adding any left and
pronunciations are formatted by wrapping them in the `AHD` and `enPR` CSS classes and adding any left and
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular
right regular and accent qualifiers. In addition, the overall result is wrapped in any overall left and right regular
and accent qualifiers. There is a single parameter `data`, an object with the following fields:
and accent qualifiers. There is a single parameter `data`, an object with the following fields:
Line 581: Line 590:
]==]
]==]
function export.format_enPR_full(data)
function export.format_enPR_full(data)
local prefix = "[[Appendix:English pronunciation|enPR]]: "
local prefix = "[[wikt:Appendix:English pronunciation|enPR]]: "
local lang = require("Module:languages").getByCode("en")
local lang = require("Module:languages").getByCode("en")
local parts = {}
local parts = {}