Module:bg-pronunciation: difference between revisions

Content deleted Content added

Inline

Revision as of 21:50, 1 August 2023

The following documentation is located at Module:bg-pronunciation/documentation. ^[edit] Categories were auto-generated by Module:module categorization. ^[edit]

Useful links: subpage list • links • transclusions • testcases • sandbox

This module automatically converts Bulgarian orthography to a phonetic transcription in the International Phonetic Alphabet. It also generates hyphenations and syllabifications.

Testcases

16 of 49 tests failed. (refresh)

Text

Expected

Actual

test_hyphenation:

Script error during testing: Module:bg-pronunciation/testcases:38: attempt to call field 'hyphenate_total' (a nil value)

stack traceback:
	[C]: in function 'hyphenate_total'
	Module:bg-pronunciation/testcases:38: in function 'func'
	Module:UnitTests:295: in function 'iterate'
	Module:bg-pronunciation/testcases:182: in function <Module:bg-pronunciation/testcases:115>
	(tail call): ?
	[C]: in function 'xpcall'
	Module:UnitTests:369: in function <Module:UnitTests:328>
	(tail call): ?
	mw.lua:527: in function <mw.lua:507>
	[C]: ?
	[C]: in function 'expandTemplate'
	mw.lua:333: in function 'expandTemplate'
	Module:documentation:894: in function 'chunk'
	mw.lua:527: in function <mw.lua:507>
	[C]: ?

test_ipa:
Text	Expected	Actual
къ́ща (kǎ́šta)	ˈkɤʃtɐ	ˈkɤʃtɐ
сгъстя́ се (sgǎstjá se), endschwa=true	zɡɐˈstʲɤ̟ sɛ	zɡɐˈstʲɤ̟ sɛ
сгъстя́ се (sgǎstjá se) (respelled сгъстя̣́ се)	zɡɐˈstʲɤ̟ sɛ	zɡɐˈstʲɤ̟ sɛ
а̀бдики́ращ (àbdikírašt)	ˌabdiˈkirɐʃt	ˌabdiˈkirɐʃt
безшу́мен (bezšúmen)	bɛʃˈʃu̟mɛn	bɛʃˈʃu̟mɛn
щастли́в (štastlív)	ʃtɐˈslif	ʃtɐˈslif
народността́ (narodnosttá)	nɐrodnoˈsta	nɐrodnoˈsta
я (ja)	ja̟	ja̟
юг (jug)	ju̟k	ju̟k
яйце́ (jajcé)	jɐjˈt͡sɛ	jɐjˈt͡sɛ
учи́лище (učílište)	oˈt͡ʃiliʃtɛ	oˈt͡ʃiliʃtɛ
чорбаджи́я (čorbadžíja)	t͡ʃo̟rbɐˈdʒijɐ	t͡ʃo̟rbɐˈdʒijɐ
уби́йца (ubíjca)	oˈbijt͡sɐ	oˈbijt͡sɐ
безбра́чие (bezbráčie)	bɛzˈbrat͡ʃiɛ	bɛzˈbrat͡ʃiɛ
измра́ (izmrá) (respelled из.мра́)	izˈmra	izˈmra
сала́та (saláta)	sɐˈɫatɐ	sɐˈɫatɐ
шега́ (šegá)	ʃɛˈɡa	ʃɛˈɡa
жена́ (žená)	ʒɛˈna	ʒɛˈna
инти́мен (intímen)	inˈtimɛn	inˈtimɛn
посо́лство (posólstvo)	poˈsɔɫstvo	poˈsɔɫstvo
ъ́гъл (ǎ́gǎl)	ˈɤɡɐɫ	ˈɤɡɐɫ
усу́квам (usúkvam)	oˈsukvɐm	oˈsukvɐm
ле́ща (léšta)	ˈlɛʃtɐ	ˈlɛʃtɐ
липа́ (lipá)	liˈpa	liˈpa
океа́н (okeán)	okɛˈan	okɛˈan
меки́ца (mekíca)	mɛˈkit͡sɐ	mɛˈkit͡sɐ
ла́гер (láger)	ˈɫaɡɛr	ˈɫaɡɛr
маги́я (magíja)	mɐˈɡijɐ	mɐˈɡijɐ
хем (hem)	xɛm	xɛm
химн (himn)	ximn	ximn
тулу́п (tulúp)	toˈɫup	toˈɫup
жа̀р-пти́ца (žàr-ptíca)	ˌʒa̟r-pˈtit͡sɐ	ˌʒɐr-pˈtit͡sɐ
в о́фис (v ófis)	f ˈɔfis	f ˈɔfis
във Фра́нция (vǎv Fráncija)	vɐf ˈfrant͡sijɐ	vɤf ˈfrant͡sijɐ
ня́колко (njákolko)	ˈnʲa̟koɫko	ˈnʲa̟koɫko
в Япо́ния (v Japónija)	f jɐˈpɔnijɐ	f jɐˈpɔnijɐ
автоплу́г (avtoplúg)	ɐftoˈpɫuk	ɐftoˈpɫuk
ўе́бса́йт	ˈwɛpˈsajt	ŏˈɛpˈsajt
ўе́лски	ˈwɛɫski	ŏˈɛɫski
ўе́стърн	ˈwɛstɐrn	ŏˈɛstɐrn
О́ўен	ˈɔwɛn	ˈɔŏɛn
но́ўхаў	ˈnɔwxɐw	ˈnɔŏxɐŏ
Джо́ўзеф	ˈdʒɔwzɛf	ˈdʒɔŏzɛf
бо́ўлинг	ˈbɔwliŋk	ˈbɔŏliŋk
даўнло́ўд	dɐwnˈɫɔwt	dɐŏnˈɫɔŏt
ўи́ски	ˈwiski	ŏˈiski
ўи́кенд	ˈwikɛnt	ŏˈikɛnt
Ўо́рўик	ˈwɔrwik	ŏˈɔrŏik
Хе́лоўин	ˈxɛɫowin	ˈxɛɫoŏin

Text

Expected

Actual

test_syllabification:

Script error during testing: Module:bg-pronunciation/testcases:47: attempt to call field 'syllabify' (a nil value)

stack traceback:
	[C]: in function 'syllabify'
	Module:bg-pronunciation/testcases:47: in function 'func'
	Module:UnitTests:295: in function '?'
	Module:bg-pronunciation/testcases:545: in function <Module:bg-pronunciation/testcases:185>
	(tail call): ?
	[C]: in function 'xpcall'
	Module:UnitTests:369: in function <Module:UnitTests:328>
	(tail call): ?
	mw.lua:527: in function <mw.lua:507>
	[C]: ?
	[C]: in function 'expandTemplate'
	mw.lua:333: in function 'expandTemplate'
	Module:documentation:894: in function 'chunk'
	mw.lua:527: in function <mw.lua:507>
	[C]: ?

References

Тилков, Димитър, Бояджиев, Тодор, Георгиева, Елена, Пенчев, Йордан, Станков, Валентин (1998) Граматика на съвременния български книжовен език (in Bulgarian), 3rd edition, volume 1, Sofia: ABAGAR

-- Table of functions exported by this module (returned at the end of the file).
local export = {}

-- Short aliases for the Unicode-aware string helpers used throughout the module.
local substring = mw.ustring.sub
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local U = mw.ustring.char
-- Language and script objects handed to the formatting modules.
local lang = require("Module:languages").getByCode("bg")
local script = require("Module:scripts").getByCode("Cyrl")

-- Combining diacritics accepted in the input: grave = secondary stress, acute = primary stress.
local GRAVE = U(0x300)
local ACUTE = U(0x301)
-- IPA stress marks emitted in the output.
local PRIMARY = U(0x2C8)
local SECONDARY = U(0x2CC)
-- Combining tie bar used inside affricates (t͡s, t͡ʃ).
local TIE = U(0x361)
-- Combining diacritic appended to a, o, u, ɤ after ʃ, ʒ, ʲ, j in toIPA().
local FRONTED = U(0x31F)
-- Combining dot below; in respellings it marks а/я that should be read as ъ (same effect as endschwa).
local DOTUNDER = U(0x323)
-- Hyphenation point inserted between syllables by hyphenate().
local HYPH = U(0x2027)
-- IPA vowels (both full and reduced) and the corresponding character classes.
local vowels = "aɤɔuɛiɐo"
local vowels_c = "[" .. vowels .. "]"
local non_vowels_c = "[^" .. vowels .. "]"
-- IPA consonant symbols, including the palatalization mark and the tie bar (NOTE: IPA ɡ, not ASCII g).
local cons = "bvɡdʒzjklmnprstfxʃɣʲ" .. TIE
local cons_c = "[" .. cons .. "]"
-- Cyrillic consonant and vowel classes used by hyphenate(). "#" and "=" are included among the consonants because
-- hyphenate() temporarily rewrites дж and ДЖ to those placeholders.
local hcons_c = "[бвгджзйклмнпрстфхшщьчц#БВГДЖЗЙКЛМНПРСТФХШЩЬЧЦ=]"
local hvowels_c = "[аъоуеияѝюАЪОУЕИЯЍЮ]"
-- Character class matching either IPA stress mark.
local accents = PRIMARY .. SECONDARY
local accents_c = "[" .. accents .. "]"

-- single characters that map to IPA sounds
-- toIPA() applies this table to every character of the lowercased, NFD-normalized term; characters without an entry
-- are left unchanged.
local phonetic_chars_map = {
	["а"] = "a",
	["б"] = "b",
	["в"] = "v",
	["г"] = "ɡ",
	["д"] = "d",
	["е"] = "ɛ",
	["ж"] = "ʒ",
	["з"] = "z",
	["и"] = "i",
	["й"] = "j",
	["к"] = "k",
	["л"] = "l",
	["м"] = "m",
	["н"] = "n",
	["о"] = "ɔ",
	["п"] = "p",
	["р"] = "r",
	["с"] = "s",
	["т"] = "t",
	["у"] = "u",
	["ф"] = "f",
	["х"] = "x",
	-- affricates are written with a tie bar
	["ц"] = "t" .. TIE .. "s",
	["ч"] = "t" .. TIE .. "ʃ",
	["ш"] = "ʃ",
	["щ"] = "ʃt",
	["ъ"] = "ɤ",
	-- ь, ю and я all start with ʲ; toIPA() later turns that ʲ into j after a vowel or at the start of a word
	["ь"] = "ʲ",
	["ю"] = "ʲu",
	["я"] = "ʲa",

	-- combining stress accents become IPA stress marks (still placed after the vowel at this point)
	[GRAVE] = SECONDARY,
	[ACUTE] = PRIMARY
}

-- Voiced obstruent -> voiceless counterpart, used for regressive voicing assimilation (NOTE: IPA ɡ).
local devoicing = {
	b = "p",
	d = "t",
	["ɡ"] = "k",
	z = "s",
	["ʒ"] = "ʃ",
	v = "f",
}

-- Voiceless obstruent -> voiced counterpart, used for regressive voicing assimilation (NOTE: IPA ɡ).
local voicing = {
	p = "b",
	t = "d",
	k = "ɡ",
	s = "z",
	["ʃ"] = "ʒ",
	x = "ɣ",
	f = "v",
}


-- Prefixes where, if they occur at the beginning of the word and the stress is on the next syllable, we place the
-- syllable division directly after the prefix. For example, the default syllable-breaking algorithm would convert
-- безбра́чие to беˈзбрачие; but because it begins with без-, we convert it to безˈбрачие. Note that we don't (yet?)
-- convert измра́ to изˈмра instead of default измˈра, although we probably should.
--
-- Think twice before putting prefixes like на-, пре- and от- here, because of the existence of над-, пред-, and о-,
-- which are also prefixes.
--
-- The prefixes are matched against the term BEFORE vowel reduction, so they must be spelled with the unreduced
-- vowels produced by phonetic_chars_map: Cyrillic о is "ɔ" (not "o") and ъ is "ɤ". Spelling поиз-, произ- and
-- възпроиз- with "o" would make them impossible to match.
local prefixes = {
	"bɛz",
	"vɤz",
	"vɤzprɔiz",
	"iz",
	"naiz",
	"pɔiz",
	"prɛvɤz",
	"prɔiz",
	"raz",
}


-- Like rsubn(), but returns only the substituted string and drops the replacement count.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to its first return value.
	return (rsubn(term, foo, bar))
end


-- Keep applying the same rsub() substitution until the string reaches a fixed point, then return it.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end


-- Strip pronunciation-only notation from `text`: explicit syllable dots and dot-below marks are always removed.
-- When `remove_grave` is truthy, grave accents (secondary stress) are removed as well; this is wanted for
-- annotations but not necessarily for phonetic respellings.
function export.remove_pron_notations(text, remove_grave)
	local stripped = rsub(text, "[." .. DOTUNDER .. "]", "")
	if not remove_grave then
		return stripped
	end
	-- Decompose first so that the grave is a separate combining character, then recompose the rest.
	local decomposed = mw.ustring.toNFD(stripped)
	return mw.ustring.toNFC(rsub(decomposed, GRAVE, ""))
end

	
-- Convert a Bulgarian term (Cyrillic, with acute for primary and grave for secondary stress) to a phonetic IPA
-- transcription.
--
-- Parameters:
--   term     - the term to transcribe, or a frame-like table, in which case the term is taken from `term.args[1]`
--              and `endschwa` from `term.args.endschwa`. A "." in the term marks an explicit syllable boundary and
--              a dot below а/я marks that the vowel is read as ъ.
--   endschwa - if truthy, a stressed word-final а (optionally followed by т) is transcribed as ɤ.
--
-- Returns the IPA string, without surrounding brackets.
-- Raises an error if the term contains a grave accent but no acute accent.
function export.toIPA(term, endschwa)
	if type(term) == "table" then -- called from a template or a bot
		endschwa = term.args.endschwa
		term = term.args[1]
	end

	local origterm = term

	-- Protect й from being decomposed into и + breve by the NFD step below. Lowercase BEFORE doing so: otherwise a
	-- capital Й is missed here, gets decomposed, and ends up transcribed as the vowel "i" plus a stray breve.
	term = rsub(mw.ustring.lower(mw.ustring.toNFC(term)), "й", "j")
	term = mw.ustring.toNFD(term)

	if term:find(GRAVE) and not term:find(ACUTE) then
		error("Use acute accent, not grave accent, for primary stress: " .. origterm)
	end

	-- allow DOTUNDER to signal same as endschwa=1
	-- (In NFD the dot below is ordered before the acute/grave, so it directly follows the vowel letter here.)
	term = rsub(term, "а(" .. accents_c .. "?)" .. DOTUNDER, "ъ%1")
	term = rsub(term, "я(" .. accents_c .. "?)" .. DOTUNDER, "ʲɤ%1")
	term = rsub(term, ".", phonetic_chars_map)

	-- Mark word boundaries
	term = rsub(term, "(%s+)", "#%1#")
	term = "#" .. term .. "#"

	-- Convert verbal and definite endings
	if endschwa then
		term = rsub(term, "a(" .. PRIMARY .. "t?#)", "ɤ%1")
	end

	-- Change ʲ to j after vowels or word-initially
	term = rsub(term, "([" .. vowels .. "#]" .. accents_c .. "?)ʲ", "%1j")

	-------------------- Move stress ---------------

	-- First, move leftwards over the vowel.
	term = rsub(term, "(" .. vowels_c .. ")(" .. accents_c .. ")", "%2%1")
	-- Then, move leftwards over j or soft sign.
	term = rsub(term, "([jʲ])(" .. accents_c .. ")", "%2%1")
	-- Then, move leftwards over a single consonant.
	term = rsub(term, "(" .. cons_c .. ")(" .. accents_c .. ")", "%2%1")
	-- Then, move leftwards over Cl/Cr combinations where C is an obstruent (NOTE: IPA ɡ).
	term = rsub(term, "([bdɡptkxfv]" .. ")(" .. accents_c .. ")([rl])", "%2%1%3")
	-- Then, move leftwards over kv/gv (NOTE: IPA ɡ).
	term = rsub(term, "([kɡ]" .. ")(" .. accents_c .. ")(v)", "%2%1%3")
	-- Then, move leftwards over sC combinations, where C is a stop or resonant (NOTE: IPA ɡ).
	term = rsub(term, "([sz]" .. ")(" .. accents_c .. ")([bdɡptkvlrmn])", "%2%1%3")
	-- Then, move leftwards over affricates not followed by a consonant.
	term = rsub(term, "([td]" .. TIE .. "?)(" .. accents_c .. ")([szʃʒ][" .. vowels .. "ʲ])", "%2%1%3")
	-- If we ended up in the middle of a tied affricate, move to its right.
	term = rsub(term, "(" .. TIE .. ")(" .. accents_c .. ")(" .. cons_c .. ")", "%1%3%2")
	-- Then, move leftwards over any remaining consonants at the beginning of a word.
	term = rsub(term, "#(" .. cons_c .. "*)(" .. accents_c .. ")", "#%2%1")
	-- Then correct for known prefixes.
	for _, prefix in ipairs(prefixes) do
		-- Split the prefix into its leading part and its final consonant cluster. These must be locals; assigning
		-- them without `local` would leak them into the global environment.
		local prefix_prefix, prefix_final_cons = rmatch(prefix, "^(.-)(" .. cons_c .. "*)$")
		-- Skip prefixes without a final consonant: an empty "()" group would be a position capture, not a string.
		if prefix_final_cons and prefix_final_cons ~= "" then
			-- Check for accent moved too far to the left into a prefix, e.g. безбрачие accented as беˈзбрачие instead
			-- of безˈбрачие
			term = rsub(term, "#(" .. prefix_prefix .. ")(" .. accents_c .. ")(" .. prefix_final_cons .. ")", "#%1%3%2")
		end
	end
	-- Finally, if there is an explicit syllable boundary in the cluster of consonants where the stress is, put it there.
	-- First check for accent to the right of the explicit syllable boundary.
	term = rsub(term, "(" .. cons_c .. "*)%.(" .. cons_c .. "*)(" .. accents_c .. ")(" .. cons_c .. "*)", "%1%3%2%4")
	-- Then check for accent to the left of the explicit syllable boundary.
	term = rsub(term, "(" .. cons_c .. "*)(" .. accents_c .. ")(" .. cons_c .. "*)%.(" .. cons_c .. "*)", "%1%3%2%4")
	-- Finally, remove any remaining syllable boundaries.
	term = rsub(term, "%.", "")

	-------------------- Vowel reduction (in unstressed syllables) ---------------
	local function reduce_vowel(vowel)
		return rsub(vowel, "[aɔɤu]", { ["a"] = "ɐ", ["ɔ"] = "o", ["ɤ"] = "ɐ", ["u"] = "o" })
	end

	-- FIXME: This needs to be rewritten entirely and moved above stress movement.
	-- NOTE: This rule's removal may be a solution for the wrongly-reduced vowels seen in e.g. жар-птица, /ʒɐr-.../
	term = rsub(term, "a(" .. non_vowels_c .. "*" .. accents_c .. ")", "ɐ%1")
	-- Reduce all vowels before the stress, except if the word has no accent at all. (FIXME: This is presumably
	-- intended for single-syllable words without accents, but if the word is multisyllabic without accents,
	-- presumably all vowels should be reduced.)

	term = rsub(term, "(#[^#" .. accents .. "]*)(.)", function(a, b)
		if b == "#" then
			return a .. b
		else
			return reduce_vowel(a) .. b
		end
	end)
	-- Reduce all vowels after the accent except the first vowel after the accent mark (which is stressed).
	term = rsub(term, "(" .. accents_c .. "[^aɛiɔuɤ#]*[aɛiɔuɤ])([^#" .. accents .. "]*)", function(a, b)
		return a .. reduce_vowel(b)
	end)

	-------------------- Vowel assimilation to adjacent consonants (fronting/raising) ---------------
	term = rsub(term, "([ʃʒʲj])([aouɤ])", "%1%2" .. FRONTED)

	-- Hard l
	term = rsub_repeatedly(term, "l([^ʲɛi])", "ɫ%1")

	-- Voicing assimilation
	term = rsub(term, "([bdɡzʒv" .. TIE .. "]*)(" .. accents_c .. "?[ptksʃfx#])", function(a, b)
		return rsub(a, ".", devoicing) .. b end)
	term = rsub(term, "([ptksʃfx" .. TIE .. "]*)(" .. accents_c .. "?[bdɡzʒ])", function(a, b)
		return rsub(a, ".", voicing) .. b end)
	term = rsub(term, "n(" .. accents_c .. "?[ɡk]+)", "ŋ%1")
	term = rsub(term, "m(" .. accents_c .. "?[fv]+)", "ɱ%1")

	-- Sibilant assimilation
	term = rsub(term, "[sz](" .. accents_c .. "?[td]?" .. TIE .. "?)([ʃʒ])", "%2%1%2")

	-- Reduce consonant clusters
	term = rsub(term, "([szʃʒ])[td](" .. accents_c .. "?)([tdknml])", "%2%1%3")

	-- Strip hashes
	term = rsub(term, "#", "")

	return term
end

-- Insert hyphenation points (HYPH) into a Cyrillic `word` and return the result. Stress accents are removed first.
-- A "." in the input is removed before hyphenating; writing д.ж keeps the two letters from being treated as the
-- single unit дж. The substitutions below are applied in order and later ones rely on the hyphens inserted by
-- earlier ones.
function export.hyphenate(word)
    -- Source: http://logic.fmi.uni-sofia.bg/hyphenation/hyph-bg.html#hyphenation-rules-between-1983-and-2012
    -- Also note: the rules from 2012 onward, which encode the modern standard, are entirely
    -- backwards-compatible with the previous standard. Thus our code can generate valid 2012
    -- hyphenations despite following the older rules.
    ---Pre-processing----
	word = rsub(word, "[" .. GRAVE .. ACUTE .. "]", "") -- Remove accent marks

    -- Treat дж as one single unit; this is bypassed by re-writing it as д.ж
    -- I.e. we write суджук, but над.живея
    -- The placeholders "#" and "=" are part of hcons_c, so the unit behaves like one consonant below.
    word = rsub(word, "дж", "#")
    word = rsub(word, "ДЖ", "=")
    word = rsub(word, "[.]", "")

    ----Hyphenation----
    word = rsub_repeatedly(word, "(" .. hcons_c .. hvowels_c .. ")(" .. hcons_c .. ")(" .. hvowels_c .. ")", "%1" .. HYPH .. "%2%3") -- Single consonants separated by single vowels are hyphenated
    word = rsub_repeatedly(word, "(" .. hvowels_c .. ")([йЙ])(" .. hcons_c .. hcons_c .. hcons_c .. "-)", function(a, b, c)
    	return a .. b .. substring(c, 1, 1) .. HYPH .. substring(c, 2, -1)
    end) -- A й followed by two or more consonants keeps one consonant to the left of the hyphen
    word = rsub_repeatedly(word, "(" .. hvowels_c .. ")([йЙ])(" .. hcons_c .. ")([^" .. HYPH .. "])", "%1%2" .. HYPH .. "%3%4") -- A й preceded by a vowel and followed by one consonant is kept with its vowel
    word = rsub_repeatedly(word, "(" .. hvowels_c .. ")(" .. hcons_c .. hcons_c .. hcons_c .. "-)(" .. hvowels_c .. ")", function(a, b, c)
    	return a .. substring(b, 1, 1) .. HYPH .. substring(b, 2, -1) .. c 
    end) -- When multiple consonants intervene between a vowel, at least one stays on either side of the vowel
    word = rsub_repeatedly(word, "(" .. hcons_c .. ")%1", function(a) 
    	return a .. HYPH .. a end) -- Two of the same consonant are hyphenated
    word = rsub_repeatedly(word, "(" .. hvowels_c .. hvowels_c .. hvowels_c .. "-)(" .. hcons_c .. ")", function(a, b)
    	return substring(a, 1, -2) ..  HYPH .. substring(a, -1, -1) .. b end) -- For sequences of two or more vowels, the final vowel goes after the hyphen and the rest before

    word = rsub(word, "(.)[" .. HYPH .. "]([ьЬ])", HYPH .. "%1%2") -- ь cannot be directly after a hyphen
    -- NOTE(review): the replacement below emits the following letter BEFORE ь ("%2%1"), i.e. it reorders letters
    -- rather than just moving the hyphen; confirm whether "%1%2" was intended.
    word = rsub(word, "([ьЬ])[" .. HYPH .. "](.)", "%2%1" .. HYPH) -- ь cannot be directly before a hyphen
    word = rsub(word, "(.)" .. HYPH .. "(.)$", HYPH .. "%1%2") -- At the end of words, an isolated final letter is joined with the preceding letter (the hyphen moves one letter to the left)
    word = rsub(word, "^(.)" .. HYPH .."(.)", "%1%2" .. HYPH) -- At the beginning of words, an isolated initial letter is joined with the following letter (the hyphen moves one letter to the right)
    -- Note: the above is flawed in that it cannot detect isolated letters within the word.
    -- We hope that this is sufficient, and there are no rogue cases in between words.

    ----Post-processing----
    word = rsub(word, "#", "дж") -- Decode back to дж
	word = rsub(word, "=", "ДЖ")

    return word
end

-- Build the bold annotation prefix shown before the pronunciation.
--   term - the term being transcribed; used as the annotation when `ann` is "1" or "y".
--   ann  - "1"/"y" to annotate with the term itself, any other string to use that string verbatim,
--          or nil/false for no annotation.
-- Returns the wikitext prefix, or "" when no annotation was requested.
local function get_anntext(term, ann)
	-- Return directly instead of assigning to `anntext`, which was an accidental global.
	if ann == "1" or ann == "y" then
		-- remove secondary stress annotations
		return "'''" .. export.remove_pron_notations(term, true) .. "''':&#32;"
	elseif ann then
		return "'''" .. ann .. "''':&#32;"
	end
	return ""
end

-- Split an already-hyphenated word at its hyphenation points and hand the syllables to Module:hyphenation
-- for display under the caption "Hyphenation".
local function format_hyphenation(hyphenation)
	local data = {
		lang = lang,
		hyphs = { { hyph = rsplit(hyphenation, HYPH) } },
		sc = script,
		caption = "Hyphenation",
	}
	return require("Module:hyphenation").format_hyphenations(data)
end

-- Template entry point: hyphenate the term given as the first template argument and return the formatted result.
-- Without an argument, the page title is used, or the sample word при́мер when viewed in the Template namespace.
function export.show_hyphenation(frame)
	local title = mw.title.getCurrentTitle()
	local args = require("Module:parameters").process(frame:getParent().args, {
		[1] = {},
	})

	local term = args[1]
	if not term then
		if title.nsText == "Template" then
			term = "при́мер"
		else
			term = title.text
		end
	end

	return format_hyphenation(export.hyphenate(term))
end

-- Template entry point: transcribe the term given as the first template argument and return the formatted IPA,
-- optionally preceded by an annotation (|ann=). |endschwa= is passed through to toIPA().
-- Without an argument, the page title is used, or the sample word при́мер when viewed in the Template namespace.
function export.show(frame)
	local title = mw.title.getCurrentTitle()
	local args = require("Module:parameters").process(frame:getParent().args, {
		[1] = {},
		["endschwa"] = { type = "boolean" },
		["ann"] = {},
	})

	local term = args[1]
	if not term then
		if title.nsText == "Template" then
			term = "при́мер"
		else
			term = title.text
		end
	end

	-- Phonetic transcriptions are displayed in square brackets.
	local bracketed = "[" .. export.toIPA(term, args.endschwa) .. "]"
	local ipa_text = require("Module:IPA").format_IPA_full(lang, { { pron = bracketed } } )

	return get_anntext(term, args.ann) .. ipa_text
end

-- Hand the table of exported functions back to whoever require()s or #invokes this module.
return export

Module:bg-pronunciation: difference between revisions

Revision as of 21:50, 1 August 2023

Testcases

References

Navigation menu

Search