Module:bg-pronunciation: difference between revisions
Jump to navigation
Jump to search
Content deleted Content added
m Changed protection settings for "Module:bg-pronunciation" ([Edit=Allow only autopatrollers] (indefinite) [Move=Allow only autopatrollers] (indefinite)) |
changes from User:Kiril kovachev (lots of fixes, see Module talk:bg-pronunciation) |
||
Line 1: | Line 1: | ||
local export = {} |
local export = {} |
||
local substring = mw.ustring.sub |
|||
local rsubn = mw.ustring.gsub |
local rsubn = mw.ustring.gsub |
||
local rmatch = mw.ustring.match |
local rmatch = mw.ustring.match |
||
local rsplit = mw.text.split |
|||
local U = mw.ustring.char |
local U = mw.ustring.char |
||
local lang = require("Module:languages").getByCode("bg") |
|||
local script = require("Module:scripts").getByCode("Cyrl") |
|||
local GRAVE = U(0x300) |
local GRAVE = U(0x300) |
||
Line 12: | Line 16: | ||
local FRONTED = U(0x31F) |
local FRONTED = U(0x31F) |
||
local DOTUNDER = U(0x323) |
local DOTUNDER = U(0x323) |
||
local |
local HYPH = U(0x2027) |
||
local vowels = "aɤɔuɛiɐo" |
|||
local vowels_c = "[" .. vowels .. "]" |
local vowels_c = "[" .. vowels .. "]" |
||
local non_vowels_c = "[^" .. vowels .. "]" |
local non_vowels_c = "[^" .. vowels .. "]" |
||
local cons = "bvɡdʒzjklmnprstfxʃɣʲ" .. TIE |
local cons = "bvɡdʒzjklmnprstfxʃɣʲ" .. TIE |
||
local cons_c = "[" .. cons .. "]" |
local cons_c = "[" .. cons .. "]" |
||
local hcons_c = "[бвгджзйклмнпрстфхшщьчц#БВГДЖЗЙКЛМНПРСТФХШЩЬЧЦ=]" |
|||
local hvowels_c = "[аъоуеияѝюАЪОУЕИЯЍЮ]" |
|||
local accents = PRIMARY .. SECONDARY |
local accents = PRIMARY .. SECONDARY |
||
local accents_c = "[" .. accents .. "]" |
local accents_c = "[" .. accents .. "]" |
||
Line 180: | Line 187: | ||
-------------------- Vowel reduction (in unstressed syllables) --------------- |
-------------------- Vowel reduction (in unstressed syllables) --------------- |
||
local function reduce_vowel(vowel) |
local function reduce_vowel(vowel) |
||
return rsub(vowel, "[aɔɤu]", { ["a"] = " |
return rsub(vowel, "[aɔɤu]", { ["a"] = "ɐ", ["ɔ"] = "o", ["ɤ"] = "ɐ", ["u"] = "o" }) |
||
end |
end |
||
-- FIXME: This needs to be rewritten entirely and moved above stress movement. |
-- FIXME: This needs to be rewritten entirely and moved above stress movement. |
||
-- NOTE: This rule's removal may be a solution for the wrongly-reduced vowels seen in e.g. жар-птица, /ʒɐr-.../ |
|||
-- /a/ directly before the stress is [ɐ]. |
|||
term = rsub(term, "a(" .. non_vowels_c .. "*" .. accents_c .. ")", "ɐ%1") |
term = rsub(term, "a(" .. non_vowels_c .. "*" .. accents_c .. ")", "ɐ%1") |
||
-- Reduce all vowels before the stress, except if the word has no accent at all. (FIXME: This is presumably |
-- Reduce all vowels before the stress, except if the word has no accent at all. (FIXME: This is presumably |
||
-- intended for single-syllable words without accents, but if the word is multisyllabic without accents, |
-- intended for single-syllable words without accents, but if the word is multisyllabic without accents, |
||
-- presumably all vowels should be reduced.) |
-- presumably all vowels should be reduced.) |
||
term = rsub(term, "(#[^#" .. accents .. "]*)(.)", function(a, b) |
term = rsub(term, "(#[^#" .. accents .. "]*)(.)", function(a, b) |
||
if b == "#" then |
if b == "#" then |
||
Line 200: | Line 208: | ||
return a .. reduce_vowel(b) |
return a .. reduce_vowel(b) |
||
end) |
end) |
||
-- /u/ directly before the stress is [u] not [ʊ]. (FIXME: Correct?) |
|||
term = rsub(term, "ʊ(" .. non_vowels_c .. "*" .. accents_c .. ")", "u%1") |
|||
-------------------- Vowel assimilation to adjacent consonants (fronting/raising) --------------- |
-------------------- Vowel assimilation to adjacent consonants (fronting/raising) --------------- |
||
term = rsub_repeatedly(term, "([ʲj])[aɐə](" .. non_vowels_c .. "-[ʲj])", "%1æ%2") |
|||
term = rsub_repeatedly(term, "([ʲj])u(" .. non_vowels_c .. "-[ʲj])", "%1ʉ%2") |
|||
term = rsub(term, "([ʃʒʲj])([aouɤ])", "%1%2" .. FRONTED) |
term = rsub(term, "([ʃʒʲj])([aouɤ])", "%1%2" .. FRONTED) |
||
term = rsub(term, "([ʃʒ])ɛ", "%1e") |
|||
-- Palatalisation |
|||
term = rsub(term, "([kɡxl])([ieɛ])", "%1ʲ%2") |
|||
-- Hard l |
-- Hard l |
||
term = rsub_repeatedly(term, "l([^ |
term = rsub_repeatedly(term, "l([^ʲɛi])", "ɫ%1") |
||
-- Voicing assimilation |
-- Voicing assimilation |
||
Line 228: | Line 228: | ||
-- Reduce consonant clusters |
-- Reduce consonant clusters |
||
term = rsub(term, "([szʃʒ])[td](" .. accents_c .. "?)([tdknml])", "%2%1%3") |
term = rsub(term, "([szʃʒ])[td](" .. accents_c .. "?)([tdknml])", "%2%1%3") |
||
term = rsub(term, "([sʃ])t#", "%1(t)#") |
|||
-- ijC -> iːC, ij# -> iː# |
|||
term = rsub(term, "ij(" .. non_vowels_c .. ")", "iː%1") |
|||
-- Strip hashes |
-- Strip hashes |
||
term = rsub(term, "#", "") |
term = rsub(term, "#", "") |
||
return term |
return term |
||
end |
|||
function export.hyphenate(word) |
|||
-- Source: http://logic.fmi.uni-sofia.bg/hyphenation/hyph-bg.html#hyphenation-rules-between-1983-and-2012 |
|||
-- Also note: the rules from 2012 onward, which encode the modern standard, are entirely |
|||
-- backwards-compatible with the previous standard. Thus our code can generate valid 2012 |
|||
-- hyphenations despite following the older rules. |
|||
---Pre-processing---- |
|||
word = rsub(word, "[" .. GRAVE .. ACUTE .. "]", "") -- Remove accent marks |
|||
-- Treat дж as one single unit; this is bypassed by re-writing it as д.ж |
|||
-- I.e. we write суджук, but над.живея |
|||
word = rsub(word, "дж", "#") |
|||
word = rsub(word, "ДЖ", "=") |
|||
word = rsub(word, "[.]", "") |
|||
----Hyphenation---- |
|||
word = rsub_repeatedly(word, "(" .. hcons_c .. hvowels_c .. ")(" .. hcons_c .. ")(" .. hvowels_c .. ")", "%1" .. HYPH .. "%2%3") -- Single consonants separated by single vowels are hyphenated |
|||
word = rsub_repeatedly(word, "(" .. hvowels_c .. ")([йЙ])(" .. hcons_c .. hcons_c .. hcons_c .. "-)", function(a, b, c) |
|||
return a .. b .. substring(c, 1, 1) .. HYPH .. substring(c, 2, -1) |
|||
end) -- A й followed by two or more consonsants keeps one consonant to the left of the hyphen |
|||
word = rsub_repeatedly(word, "(" .. hvowels_c .. ")([йЙ])(" .. hcons_c .. ")([^" .. HYPH .. "])", "%1%2" .. HYPH .. "%3%4") -- A й preceded by a vowel and followed by one consonant is kept with its vowel |
|||
word = rsub_repeatedly(word, "(" .. hvowels_c .. ")(" .. hcons_c .. hcons_c .. hcons_c .. "-)(" .. hvowels_c .. ")", function(a, b, c) |
|||
return a .. substring(b, 1, 1) .. HYPH .. substring(b, 2, -1) .. c |
|||
end) -- When multiple consonants intervene between a vowel, at least one stays on either side of the vowel |
|||
word = rsub_repeatedly(word, "(" .. hcons_c .. ")%1", function(a) |
|||
return a .. HYPH .. a end) -- Two of the same consonant are hyphenated |
|||
word = rsub_repeatedly(word, "(" .. hvowels_c .. hvowels_c .. hvowels_c .. "-)(" .. hcons_c .. ")", function(a, b) |
|||
return substring(a, 1, -2) .. HYPH .. substring(a, -1, -1) .. b end) -- For sequences of two or more vowels, the final vowel goes after the hyphen and the rest before |
|||
word = rsub(word, "(.)[" .. HYPH .. "]([ьЬ])", HYPH .. "%1%2") -- ь cannot be directly after a hyphen |
|||
word = rsub(word, "([ьЬ])[" .. HYPH .. "](.)", "%2%1" .. HYPH) -- ь cannot be directly before a hyphen |
|||
word = rsub(word, "(.)" .. HYPH .. "(.)$", HYPH .. "%1%2") -- At the beginning of words, merge isolated letters with their following letters |
|||
word = rsub(word, "^(.)" .. HYPH .."(.)", "%1%2" .. HYPH) -- At the end of words, merge isolated letters with their preceding letters |
|||
-- Note: the above is flawed in that it cannot detect isolated letters within the word. |
|||
-- We hope that this is sufficient, and there are no rogue cases in between words. |
|||
----Post-processing---- |
|||
word = rsub(word, "#", "дж") -- Decode back to дж |
|||
word = rsub(word, "=", "ДЖ") |
|||
return word |
|||
end |
|||
local function get_anntext(term, ann) |
|||
if ann == "1" or ann == "y" then |
|||
-- remove secondary stress annotations |
|||
anntext = "'''" .. export.remove_pron_notations(term, true) .. "''': " |
|||
elseif ann then |
|||
anntext = "'''" .. ann .. "''': " |
|||
else |
|||
anntext = "" |
|||
end |
|||
return anntext |
|||
end |
|||
local function format_hyphenation(hyphenation) |
|||
local syllables = rsplit(hyphenation, HYPH) |
|||
return require("Module:hyphenation").format_hyphenations( { |
|||
lang = lang, |
|||
hyphs = { { hyph = syllables } }, |
|||
sc = script, |
|||
caption = "Hyphenation", |
|||
} ) |
|||
end |
|||
function export.show_hyphenation(frame) |
|||
local params = { |
|||
[1] = {}, |
|||
} |
|||
local title = mw.title.getCurrentTitle() |
|||
local args = require("Module:parameters").process(frame:getParent().args, params) |
|||
local term = args[1] or title.nsText == "Template" and "при́мер" or title.text |
|||
local hyphenation = export.hyphenate(term) |
|||
return format_hyphenation(hyphenation) |
|||
end |
end |
||
Line 245: | Line 322: | ||
["ann"] = {}, |
["ann"] = {}, |
||
} |
} |
||
local title = mw.title.getCurrentTitle() |
local title = mw.title.getCurrentTitle() |
||
Line 252: | Line 329: | ||
local ipa = export.toIPA(term, args.endschwa) |
local ipa = export.toIPA(term, args.endschwa) |
||
ipa = "[" .. ipa .. "]" |
ipa = "[" .. ipa .. "]" |
||
ipa = require("Module:IPA").format_IPA_full(require("Module:languages").getByCode("bg"), { { pron = ipa } } ) |
|||
local ipa_text = require("Module:IPA").format_IPA_full(lang, { { pron = ipa } } ) |
|||
local anntext |
|||
local anntext = get_anntext(term, args.ann) |
|||
-- remove secondary stress annotations |
|||
anntext = "'''" .. export.remove_pron_notations(term, true) .. "''': " |
|||
elseif args.ann then |
|||
anntext = "'''" .. args.ann .. "''': " |
|||
else |
|||
anntext = "" |
|||
end |
|||
return anntext .. |
return anntext .. ipa_text |
||
end |
end |
||
Revision as of 21:50, 1 August 2023
- The following documentation is located at Module:bg-pronunciation/documentation. [edit] Categories were auto-generated by Module:module categorization. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module automatically converts Bulgarian orthography to a phonetic transcription in the International Phonetic Alphabet. It also generates hyphenations and syllabifications.
Testcases
16 of 49 tests failed. (refresh)
Text | Expected | Actual | |
---|---|---|---|
Script error during testing: Module:bg-pronunciation/testcases:38: attempt to call field 'hyphenate_total' (a nil value)stack traceback: [C]: in function 'hyphenate_total' Module:bg-pronunciation/testcases:38: in function 'func' Module:UnitTests:295: in function 'iterate' Module:bg-pronunciation/testcases:182: in function <Module:bg-pronunciation/testcases:115> (tail call): ? [C]: in function 'xpcall' Module:UnitTests:369: in function <Module:UnitTests:328> (tail call): ? mw.lua:527: in function <mw.lua:507> [C]: ? [C]: in function 'expandTemplate' mw.lua:333: in function 'expandTemplate' Module:documentation:894: in function 'chunk' mw.lua:527: in function <mw.lua:507> [C]: ? |
Text | Expected | Actual | |
---|---|---|---|
къ́ща (kǎ́šta) | ˈkɤʃtɐ | ˈkɤʃtɐ | |
сгъстя́ се (sgǎstjá se), endschwa=true | zɡɐˈstʲɤ̟ sɛ | zɡɐˈstʲɤ̟ sɛ | |
сгъстя́ се (sgǎstjá se) (respelled сгъстя̣́ се) | zɡɐˈstʲɤ̟ sɛ | zɡɐˈstʲɤ̟ sɛ | |
а̀бдики́ращ (àbdikírašt) | ˌabdiˈkirɐʃt | ˌabdiˈkirɐʃt | |
безшу́мен (bezšúmen) | bɛʃˈʃu̟mɛn | bɛʃˈʃu̟mɛn | |
щастли́в (štastlív) | ʃtɐˈslif | ʃtɐˈslif | |
народността́ (narodnosttá) | nɐrodnoˈsta | nɐrodnoˈsta | |
я (ja) | ja̟ | ja̟ | |
юг (jug) | ju̟k | ju̟k | |
яйце́ (jajcé) | jɐjˈt͡sɛ | jɐjˈt͡sɛ | |
учи́лище (učílište) | oˈt͡ʃiliʃtɛ | oˈt͡ʃiliʃtɛ | |
чорбаджи́я (čorbadžíja) | t͡ʃo̟rbɐˈdʒijɐ | t͡ʃo̟rbɐˈdʒijɐ | |
уби́йца (ubíjca) | oˈbijt͡sɐ | oˈbijt͡sɐ | |
безбра́чие (bezbráčie) | bɛzˈbrat͡ʃiɛ | bɛzˈbrat͡ʃiɛ | |
измра́ (izmrá) (respelled из.мра́) | izˈmra | izˈmra | |
сала́та (saláta) | sɐˈɫatɐ | sɐˈɫatɐ | |
шега́ (šegá) | ʃɛˈɡa | ʃɛˈɡa | |
жена́ (žená) | ʒɛˈna | ʒɛˈna | |
инти́мен (intímen) | inˈtimɛn | inˈtimɛn | |
посо́лство (posólstvo) | poˈsɔɫstvo | poˈsɔɫstvo | |
ъ́гъл (ǎ́gǎl) | ˈɤɡɐɫ | ˈɤɡɐɫ | |
усу́квам (usúkvam) | oˈsukvɐm | oˈsukvɐm | |
ле́ща (léšta) | ˈlɛʃtɐ | ˈlɛʃtɐ | |
липа́ (lipá) | liˈpa | liˈpa | |
океа́н (okeán) | okɛˈan | okɛˈan | |
меки́ца (mekíca) | mɛˈkit͡sɐ | mɛˈkit͡sɐ | |
ла́гер (láger) | ˈɫaɡɛr | ˈɫaɡɛr | |
маги́я (magíja) | mɐˈɡijɐ | mɐˈɡijɐ | |
хем (hem) | xɛm | xɛm | |
химн (himn) | ximn | ximn | |
тулу́п (tulúp) | toˈɫup | toˈɫup | |
жа̀р-пти́ца (žàr-ptíca) | ˌʒa̟r-pˈtit͡sɐ | ˌʒɐr-pˈtit͡sɐ | |
в о́фис (v ófis) | f ˈɔfis | f ˈɔfis | |
във Фра́нция (vǎv Fráncija) | vɐf ˈfrant͡sijɐ | vɤf ˈfrant͡sijɐ | |
ня́колко (njákolko) | ˈnʲa̟koɫko | ˈnʲa̟koɫko | |
в Япо́ния (v Japónija) | f jɐˈpɔnijɐ | f jɐˈpɔnijɐ | |
автоплу́г (avtoplúg) | ɐftoˈpɫuk | ɐftoˈpɫuk | |
ўе́бса́йт | ˈwɛpˈsajt | ŏˈɛpˈsajt | |
ўе́лски | ˈwɛɫski | ŏˈɛɫski | |
ўе́стърн | ˈwɛstɐrn | ŏˈɛstɐrn | |
О́ўен | ˈɔwɛn | ˈɔŏɛn | |
но́ўхаў | ˈnɔwxɐw | ˈnɔŏxɐŏ | |
Джо́ўзеф | ˈdʒɔwzɛf | ˈdʒɔŏzɛf | |
бо́ўлинг | ˈbɔwliŋk | ˈbɔŏliŋk | |
даўнло́ўд | dɐwnˈɫɔwt | dɐŏnˈɫɔŏt | |
ўи́ски | ˈwiski | ŏˈiski | |
ўи́кенд | ˈwikɛnt | ŏˈikɛnt | |
Ўо́рўик | ˈwɔrwik | ŏˈɔrŏik | |
Хе́лоўин | ˈxɛɫowin | ˈxɛɫoŏin |
Text | Expected | Actual | |
---|---|---|---|
Script error during testing: Module:bg-pronunciation/testcases:47: attempt to call field 'syllabify' (a nil value)stack traceback: [C]: in function 'syllabify' Module:bg-pronunciation/testcases:47: in function 'func' Module:UnitTests:295: in function '?' Module:bg-pronunciation/testcases:545: in function <Module:bg-pronunciation/testcases:185> (tail call): ? [C]: in function 'xpcall' Module:UnitTests:369: in function <Module:UnitTests:328> (tail call): ? mw.lua:527: in function <mw.lua:507> [C]: ? [C]: in function 'expandTemplate' mw.lua:333: in function 'expandTemplate' Module:documentation:894: in function 'chunk' mw.lua:527: in function <mw.lua:507> [C]: ? |
References
- Тилков, Димитър, Бояджиев, Тодор, Георгиева, Елена, Пенчев, Йордан, Станков, Валентин (1998) Граматика на съвременния български книжовен език (in Bulgarian), 3rd edition, volume 1, Sofia: ABAGAR
-- Table of functions exported by this module.
local export = {}
-- Short aliases for the Unicode-aware string helpers used throughout the module.
local substring = mw.ustring.sub
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local U = mw.ustring.char
-- Language and script objects passed to the formatting modules.
local lang = require("Module:languages").getByCode("bg")
local script = require("Module:scripts").getByCode("Cyrl")
-- Combining marks accepted in the input spelling.
local GRAVE = U(0x300) -- combining grave accent (mapped to SECONDARY by phonetic_chars_map)
local ACUTE = U(0x301) -- combining acute accent (mapped to PRIMARY by phonetic_chars_map)
-- Characters used in the generated transcription / hyphenation.
local PRIMARY = U(0x2C8) -- IPA primary stress mark
local SECONDARY = U(0x2CC) -- IPA secondary stress mark
local TIE = U(0x361) -- combining tie bar joining the two halves of an affricate
local FRONTED = U(0x31F) -- combining plus sign below, appended to fronted vowels
local DOTUNDER = U(0x323) -- combining dot below; in respellings it marks а/я read as ъ
local HYPH = U(0x2027) -- hyphenation point inserted by export.hyphenate
-- IPA vowel symbols (full and reduced) and the character classes built from them.
local vowels = "aɤɔuɛiɐo"
local vowels_c = "[" .. vowels .. "]"
local non_vowels_c = "[^" .. vowels .. "]"
-- IPA consonant symbols; the palatalisation mark and the tie bar are included so that
-- they are matched as part of a consonant cluster.
local cons = "bvɡdʒzjklmnprstfxʃɣʲ" .. TIE
local cons_c = "[" .. cons .. "]"
-- Cyrillic consonant and vowel letters used by export.hyphenate. "#" and "=" are the
-- placeholders that export.hyphenate substitutes for the digraphs дж and ДЖ.
local hcons_c = "[бвгджзйклмнпрстфхшщьчц#БВГДЖЗЙКЛМНПРСТФХШЩЬЧЦ=]"
local hvowels_c = "[аъоуеияѝюАЪОУЕИЯЍЮ]"
-- The two IPA stress marks, as a plain string and as a character class.
local accents = PRIMARY .. SECONDARY
local accents_c = "[" .. accents .. "]"
-- Single (lowercase, decomposed) input characters and the IPA they map to.
-- export.toIPA applies this table with rsub(term, ".", phonetic_chars_map), so any
-- character without an entry here is left unchanged.
-- ю and я are written with a leading ʲ; export.toIPA later turns that ʲ into j after a
-- vowel or at the start of a word.
-- The combining grave and acute accents are converted to the IPA stress marks.
local phonetic_chars_map = {
["а"] = "a",
["б"] = "b",
["в"] = "v",
["г"] = "ɡ",
["д"] = "d",
["е"] = "ɛ",
["ж"] = "ʒ",
["з"] = "z",
["и"] = "i",
["й"] = "j",
["к"] = "k",
["л"] = "l",
["м"] = "m",
["н"] = "n",
["о"] = "ɔ",
["п"] = "p",
["р"] = "r",
["с"] = "s",
["т"] = "t",
["у"] = "u",
["ф"] = "f",
["х"] = "x",
["ц"] = "t" .. TIE .. "s",
["ч"] = "t" .. TIE .. "ʃ",
["ш"] = "ʃ",
["щ"] = "ʃt",
["ъ"] = "ɤ",
["ь"] = "ʲ",
["ю"] = "ʲu",
["я"] = "ʲa",
[GRAVE] = SECONDARY,
[ACUTE] = PRIMARY
}
-- Voiced obstruent -> voiceless counterpart. Used by the voicing-assimilation step of
-- export.toIPA for obstruents that precede a voiceless obstruent or a word boundary.
local devoicing = {
["b"] = "p", ["d"] = "t", ["ɡ"] = "k",
["z"] = "s", ["ʒ"] = "ʃ",
["v"] = "f"
}
-- Voiceless obstruent -> voiced counterpart. Used by the voicing-assimilation step of
-- export.toIPA for obstruents that precede one of b, d, ɡ, z, ʒ.
local voicing = {
["p"] = "b", ["t"] = "d", ["k"] = "ɡ",
["s"] = "z", ["ʃ"] = "ʒ", ["x"] = "ɣ",
["f"] = "v"
}
-- Prefixes where, if they occur at the beginning of the word and the stress is on the next syllable, we place the
-- syllable division directly after the prefix. For example, the default syllable-breaking algorithm would convert
-- безбра́чие to беˈзбрачие; but because it begins with без-, we convert it to безˈбрачие. Note that we don't (yet?)
-- convert измра́ to изˈмра instead of default измˈра, although we probably should.
--
-- Think twice before putting prefixes like на-, пре- and от- here, because of the existence of над-, пред-, and о-,
-- which are also prefixes.
--
-- The entries are written in the intermediate IPA form produced by phonetic_chars_map, because export.toIPA matches
-- them against the term after the Cyrillic letters have already been mapped.
local prefixes = {"bɛz", "vɤz", "vɤzproiz", "iz", "naiz", "poiz", "prɛvɤz", "proiz", "raz"}
-- Same as rsubn(), but returns only the substituted string and drops the
-- substitution count.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the multiple return values to the first one.
	return (rsubn(term, foo, bar))
end
-- Keep applying the same rsub() substitution until the string stops changing,
-- then return the stable result.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end
-- Strip the respelling-only notations from `text`: every "." and every combining
-- dot below. When `remove_grave` is true, grave accents are removed as well (the
-- text is decomposed, stripped, and recomposed).
function export.remove_pron_notations(text, remove_grave)
	local stripped = rsub(text, "[." .. DOTUNDER .. "]", "")
	if not remove_grave then
		return stripped
	end
	-- Remove grave accents from annotations but maybe not from phonetic respelling
	local decomposed = mw.ustring.toNFD(stripped)
	local recomposed = mw.ustring.toNFC(rsub(decomposed, GRAVE, ""))
	return recomposed
end
-- Convert a Bulgarian term to a phonetic IPA transcription.
--
-- Parameters:
--   term     - the term in Cyrillic, with an acute accent marking primary stress and
--              optionally grave accents marking secondary stress. A "." may be used to
--              force a syllable boundary and a combining dot below on а/я requests the
--              ъ reading. If a table is passed instead (called from a template or a
--              bot), the term is read from term.args[1] and endschwa from
--              term.args.endschwa.
--   endschwa - if truthy, a stressed word-final "a" (optionally followed by "t") is
--              read as ɤ.
--
-- Returns the transcription as a string, without surrounding brackets.
-- Raises an error if the term contains a grave accent but no acute accent.
function export.toIPA(term, endschwa)
	if type(term) == "table" then -- called from a template or a bot
		endschwa = term.args.endschwa
		term = term.args[1]
	end

	local origterm = term

	-- й/Й must be replaced before decomposition: NFD would otherwise split it into
	-- и + combining breve, and the breve has no entry in phonetic_chars_map. The capital
	-- is handled here too because lowercasing only happens on the next line.
	term = rsub(mw.ustring.toNFC(term), "[йЙ]", "j")
	term = mw.ustring.toNFD(mw.ustring.lower(term))

	if term:find(GRAVE) and not term:find(ACUTE) then
		error("Use acute accent, not grave accent, for primary stress: " .. origterm)
	end

	-- allow DOTUNDER to signal same as endschwa=1
	term = rsub(term, "а(" .. accents_c .. "?)" .. DOTUNDER, "ъ%1")
	term = rsub(term, "я(" .. accents_c .. "?)" .. DOTUNDER, "ʲɤ%1")
	term = rsub(term, ".", phonetic_chars_map)

	-- Mark word boundaries
	term = rsub(term, "(%s+)", "#%1#")
	term = "#" .. term .. "#"

	-- Convert verbal and definite endings
	if endschwa then
		term = rsub(term, "a(" .. PRIMARY .. "t?#)", "ɤ%1")
	end

	-- Change ʲ to j after vowels or word-initially
	term = rsub(term, "([" .. vowels .. "#]" .. accents_c .. "?)ʲ", "%1j")

	-------------------- Move stress ---------------

	-- First, move leftwards over the vowel.
	term = rsub(term, "(" .. vowels_c .. ")(" .. accents_c .. ")", "%2%1")
	-- Then, move leftwards over j or soft sign.
	term = rsub(term, "([jʲ])(" .. accents_c .. ")", "%2%1")
	-- Then, move leftwards over a single consonant.
	term = rsub(term, "(" .. cons_c .. ")(" .. accents_c .. ")", "%2%1")
	-- Then, move leftwards over Cl/Cr combinations where C is an obstruent (NOTE: IPA ɡ).
	term = rsub(term, "([bdɡptkxfv]" .. ")(" .. accents_c .. ")([rl])", "%2%1%3")
	-- Then, move leftwards over kv/gv (NOTE: IPA ɡ).
	term = rsub(term, "([kɡ]" .. ")(" .. accents_c .. ")(v)", "%2%1%3")
	-- Then, move leftwards over sC combinations, where C is a stop or resonant (NOTE: IPA ɡ).
	term = rsub(term, "([sz]" .. ")(" .. accents_c .. ")([bdɡptkvlrmn])", "%2%1%3")
	-- Then, move leftwards over affricates not followed by a consonant.
	term = rsub(term, "([td]" .. TIE .. "?)(" .. accents_c .. ")([szʃʒ][" .. vowels .. "ʲ])", "%2%1%3")
	-- If we ended up in the middle of a tied affricate, move to its right.
	term = rsub(term, "(" .. TIE .. ")(" .. accents_c .. ")(" .. cons_c .. ")", "%1%3%2")
	-- Then, move leftwards over any remaining consonants at the beginning of a word.
	term = rsub(term, "#(" .. cons_c .. "*)(" .. accents_c .. ")", "#%2%1")
	-- Then correct for known prefixes.
	for _, prefix in ipairs(prefixes) do
		-- Split the prefix into its trailing consonant cluster and everything before it.
		-- (Declared local so the two pieces do not leak into the global environment.)
		local prefix_prefix, prefix_final_cons = rmatch(prefix, "^(.-)(" .. cons_c .. "*)$")
		if prefix_final_cons then
			-- Check for accent moved too far to the left into a prefix, e.g. безбрачие accented as беˈзбрачие instead
			-- of безˈбрачие
			term = rsub(term, "#(" .. prefix_prefix .. ")(" .. accents_c .. ")(" .. prefix_final_cons .. ")", "#%1%3%2")
		end
	end

	-- Finally, if there is an explicit syllable boundary in the cluster of consonants where the stress is, put it there.
	-- First check for accent to the right of the explicit syllable boundary.
	term = rsub(term, "(" .. cons_c .. "*)%.(" .. cons_c .. "*)(" .. accents_c .. ")(" .. cons_c .. "*)", "%1%3%2%4")
	-- Then check for accent to the left of the explicit syllable boundary.
	term = rsub(term, "(" .. cons_c .. "*)(" .. accents_c .. ")(" .. cons_c .. "*)%.(" .. cons_c .. "*)", "%1%3%2%4")
	-- Finally, remove any remaining syllable boundaries.
	term = rsub(term, "%.", "")

	-------------------- Vowel reduction (in unstressed syllables) ---------------

	-- Replace every a/ɔ/ɤ/u in the given string by its reduced counterpart.
	local function reduce_vowel(vowel)
		return rsub(vowel, "[aɔɤu]", { ["a"] = "ɐ", ["ɔ"] = "o", ["ɤ"] = "ɐ", ["u"] = "o" })
	end

	-- FIXME: This needs to be rewritten entirely and moved above stress movement.
	-- NOTE: This rule's removal may be a solution for the wrongly-reduced vowels seen in e.g. жар-птица, /ʒɐr-.../
	term = rsub(term, "a(" .. non_vowels_c .. "*" .. accents_c .. ")", "ɐ%1")
	-- Reduce all vowels before the stress, except if the word has no accent at all. (FIXME: This is presumably
	-- intended for single-syllable words without accents, but if the word is multisyllabic without accents,
	-- presumably all vowels should be reduced.)
	term = rsub(term, "(#[^#" .. accents .. "]*)(.)", function(a, b)
		if b == "#" then
			return a .. b
		else
			return reduce_vowel(a) .. b
		end
	end)
	-- Reduce all vowels after the accent except the first vowel after the accent mark (which is stressed).
	term = rsub(term, "(" .. accents_c .. "[^aɛiɔuɤ#]*[aɛiɔuɤ])([^#" .. accents .. "]*)", function(a, b)
		return a .. reduce_vowel(b)
	end)

	-------------------- Vowel assimilation to adjacent consonants (fronting/raising) ---------------
	term = rsub(term, "([ʃʒʲj])([aouɤ])", "%1%2" .. FRONTED)

	-- Hard l
	term = rsub_repeatedly(term, "l([^ʲɛi])", "ɫ%1")

	-- Voicing assimilation
	term = rsub(term, "([bdɡzʒv" .. TIE .. "]*)(" .. accents_c .. "?[ptksʃfx#])", function(a, b)
		return rsub(a, ".", devoicing) .. b end)
	term = rsub(term, "([ptksʃfx" .. TIE .. "]*)(" .. accents_c .. "?[bdɡzʒ])", function(a, b)
		return rsub(a, ".", voicing) .. b end)
	term = rsub(term, "n(" .. accents_c .. "?[ɡk]+)", "ŋ%1")
	term = rsub(term, "m(" .. accents_c .. "?[fv]+)", "ɱ%1")

	-- Sibilant assimilation
	term = rsub(term, "[sz](" .. accents_c .. "?[td]?" .. TIE .. "?)([ʃʒ])", "%2%1%2")

	-- Reduce consonant clusters
	term = rsub(term, "([szʃʒ])[td](" .. accents_c .. "?)([tdknml])", "%2%1%3")

	-- Strip hashes
	term = rsub(term, "#", "")

	return term
end
-- Insert hyphenation points (HYPH) into a Cyrillic word and return the result.
-- Combining grave/acute accents and "." are stripped from the input first. The rules
-- below are applied in order and each one works on the output of the previous one.
function export.hyphenate(word)
-- Source: http://logic.fmi.uni-sofia.bg/hyphenation/hyph-bg.html#hyphenation-rules-between-1983-and-2012
-- Also note: the rules from 2012 onward, which encode the modern standard, are entirely
-- backwards-compatible with the previous standard. Thus our code can generate valid 2012
-- hyphenations despite following the older rules.
---Pre-processing----
word = rsub(word, "[" .. GRAVE .. ACUTE .. "]", "") -- Remove accent marks
-- Treat дж as one single unit; this is bypassed by re-writing it as д.ж
-- I.e. we write суджук, but над.живея
-- The digraph is encoded as a single placeholder ("#" / "=", both members of hcons_c)
-- before the "." is removed, so a respelled д.ж stays two separate consonants.
word = rsub(word, "дж", "#")
word = rsub(word, "ДЖ", "=")
word = rsub(word, "[.]", "")
----Hyphenation----
word = rsub_repeatedly(word, "(" .. hcons_c .. hvowels_c .. ")(" .. hcons_c .. ")(" .. hvowels_c .. ")", "%1" .. HYPH .. "%2%3") -- Single consonants separated by single vowels are hyphenated
word = rsub_repeatedly(word, "(" .. hvowels_c .. ")([йЙ])(" .. hcons_c .. hcons_c .. hcons_c .. "-)", function(a, b, c)
return a .. b .. substring(c, 1, 1) .. HYPH .. substring(c, 2, -1)
end) -- A й followed by two or more consonants keeps one consonant to the left of the hyphen
word = rsub_repeatedly(word, "(" .. hvowels_c .. ")([йЙ])(" .. hcons_c .. ")([^" .. HYPH .. "])", "%1%2" .. HYPH .. "%3%4") -- A й preceded by a vowel and followed by one consonant is kept with its vowel
word = rsub_repeatedly(word, "(" .. hvowels_c .. ")(" .. hcons_c .. hcons_c .. hcons_c .. "-)(" .. hvowels_c .. ")", function(a, b, c)
return a .. substring(b, 1, 1) .. HYPH .. substring(b, 2, -1) .. c
end) -- When multiple consonants intervene between two vowels, the first consonant stays with the preceding vowel and the rest go with the following one
word = rsub_repeatedly(word, "(" .. hcons_c .. ")%1", function(a)
return a .. HYPH .. a end) -- Two of the same consonant are hyphenated
word = rsub_repeatedly(word, "(" .. hvowels_c .. hvowels_c .. hvowels_c .. "-)(" .. hcons_c .. ")", function(a, b)
return substring(a, 1, -2) .. HYPH .. substring(a, -1, -1) .. b end) -- For sequences of two or more vowels, the final vowel goes after the hyphen and the rest before
word = rsub(word, "(.)[" .. HYPH .. "]([ьЬ])", HYPH .. "%1%2") -- ь cannot be directly after a hyphen
-- NOTE(review): the replacement below emits the letter after the hyphen (%2) before the ь (%1),
-- i.e. it changes the order of the letters; confirm that this is intended.
word = rsub(word, "([ьЬ])[" .. HYPH .. "](.)", "%2%1" .. HYPH) -- ь cannot be directly before a hyphen
word = rsub(word, "(.)" .. HYPH .. "(.)$", HYPH .. "%1%2") -- At the end of words, merge an isolated final letter with the letter before it (the hyphen moves one letter to the left)
word = rsub(word, "^(.)" .. HYPH .."(.)", "%1%2" .. HYPH) -- At the beginning of words, merge an isolated initial letter with the letter after it (the hyphen moves one letter to the right)
-- Note: the above is flawed in that it cannot detect isolated letters within the word.
-- We hope that this is sufficient, and there are no rogue cases in between words.
----Post-processing----
word = rsub(word, "#", "дж") -- Decode back to дж
word = rsub(word, "=", "ДЖ")
return word
end
-- Build the bold annotation prefix shown before the transcription.
--
-- Parameters:
--   term - the (possibly respelled) term; used only when `ann` is "1" or "y".
--   ann  - the annotation parameter: "1" or "y" to annotate with the term itself
--          (with pronunciation notations and grave accents removed), any other
--          string to use that string verbatim, or nil for no annotation.
--
-- Returns the wikitext "'''<annotation>''': ", or "" when `ann` is nil.
local function get_anntext(term, ann)
	if ann == "1" or ann == "y" then
		-- remove secondary stress annotations
		return "'''" .. export.remove_pron_notations(term, true) .. "''': "
	elseif ann then
		return "'''" .. ann .. "''': "
	end
	-- No annotation requested. (The result is returned directly rather than stored in an
	-- undeclared variable, which previously created a global named `anntext`.)
	return ""
end
-- Turn a string produced by export.hyphenate into formatted output: the string is
-- split at every hyphenation point and the resulting syllable list is handed to
-- Module:hyphenation together with the module's language and script objects.
local function format_hyphenation(hyphenation)
	local data = {
		lang = lang,
		hyphs = { { hyph = rsplit(hyphenation, HYPH) } },
		sc = script,
		caption = "Hyphenation",
	}
	return require("Module:hyphenation").format_hyphenations(data)
end
-- Template entry point: hyphenate the term given as the first template argument and
-- return the formatted hyphenation. Without an argument, the example word is used in
-- the Template namespace and the page title everywhere else.
function export.show_hyphenation(frame)
	local param_spec = {
		[1] = {},
	}
	local title = mw.title.getCurrentTitle()
	local args = require("Module:parameters").process(frame:getParent().args, param_spec)

	local term = args[1]
	if not term then
		if title.nsText == "Template" then
			term = "при́мер"
		else
			term = title.text
		end
	end

	return format_hyphenation(export.hyphenate(term))
end
-- Template entry point: transcribe the term given as the first template argument and
-- return the formatted IPA, preceded by the optional annotation. Without an argument,
-- the example word is used in the Template namespace and the page title everywhere else.
function export.show(frame)
	local param_spec = {
		[1] = {},
		["endschwa"] = { type = "boolean" },
		["ann"] = {},
	}
	local title = mw.title.getCurrentTitle()
	local args = require("Module:parameters").process(frame:getParent().args, param_spec)

	local term = args[1]
	if not term then
		if title.nsText == "Template" then
			term = "при́мер"
		else
			term = title.text
		end
	end

	-- The transcription is phonetic, hence the square brackets.
	local pron = "[" .. export.toIPA(term, args.endschwa) .. "]"
	local ipa_text = require("Module:IPA").format_IPA_full(lang, { { pron = pron } } )
	local annotation = get_anntext(term, args.ann)

	return annotation .. ipa_text
end
return export