|
from mishkal import lexicon |
|
import unicodedata |
|
import regex as re |
|
|
|
|
|
def sort_diacritics(match):
    """re.sub callback: return the matched base letter followed by its
    combining marks rearranged into a canonical (sorted) order.

    Expects group(1) = the letter, group(2) = one or more diacritics.
    """
    base, marks = match.group(1), match.group(2)
    canonical_marks = "".join(sorted(marks))
    return base + canonical_marks
|
|
|
|
|
# Regex pattern -> replacement (string or callable) pairs applied in order by
# normalize().  NOTE: \p{L}/\p{M} are Unicode-property classes supported by the
# third-party `regex` module (imported as `re`), not by the stdlib `re`.
NORMALIZE_PATTERNS = {
    # Reorder each letter's combining marks into sorted (canonical) order.
    r"(\p{L})(\p{M}+)": sort_diacritics,
    # Hebrew gershayim -> ASCII double quote.
    "״": '"',
    # Hebrew geresh -> ASCII apostrophe.
    "׳": "'",
}
|
|
|
|
|
def remove_niqqud(text: str):
    """Strip every Hebrew niqqud (vowel-point) character from *text*."""
    niqqud_re = lexicon.HE_NIQQUD_PATTERN
    return re.sub(niqqud_re, "", text)
|
|
|
|
|
def has_niqqud(text: str):
    """Return True if *text* contains at least one Hebrew niqqud character."""
    return bool(re.search(lexicon.HE_NIQQUD_PATTERN, text))
|
|
|
|
|
def normalize(text: str) -> str:
    """
    Normalize Hebrew text for downstream processing.

    Steps, in order:
    1. Unicode NFD decomposition (splits letters from combining marks).
    2. Apply each NORMALIZE_PATTERNS regex substitution (mark reordering,
       geresh/gershayim -> ASCII quotes).
    3. Collapse equivalent niqqud variants via lexicon.NIQQUD_DEDUPLICATE
       (e.g. keep only Patah instead of Kamatz).
    """
    decomposed = unicodedata.normalize("NFD", text)

    for pattern, replacement in NORMALIZE_PATTERNS.items():
        decomposed = re.sub(pattern, replacement, decomposed)

    for variant, canonical in lexicon.NIQQUD_DEDUPLICATE.items():
        decomposed = decomposed.replace(variant, canonical)

    return decomposed
|
|
|
|
|
def post_normalize(phonemes: str):
    """Clean up an IPA phoneme string word by word.

    Per space-separated word, in order: drop leading/trailing glottal stop
    (ʔ), drop a glottal stop right after a leading stress mark, drop
    leading/trailing h (also when preceded by a stress mark), and reduce a
    final "ij" to "i".

    NOTE(review): the "ʔ$" -> "ˈ" and "ˈh$" -> "ˈ" alternatives can never
    match, because the earlier substitutions already removed any trailing
    ʔ / h — presumably leftovers from an older rule order; confirm intent.
    """
    # (pattern, replacement) pairs; application order is significant.
    edits = (
        (r"^ʔ|ʔ$", ""),
        (r"^ˈʔ", "ˈ"),
        (r"ʔ$", "ˈ"),
        (r"^h|h$", ""),
        (r"^ˈh|ˈh$", "ˈ"),
        (r"ij$", "i"),
    )
    cleaned = []
    for token in phonemes.split(" "):
        for pattern, replacement in edits:
            token = re.sub(pattern, replacement, token)
        cleaned.append(token)
    return " ".join(cleaned)
|
|
|
|
|
def get_unicode_names(text: str):
    """Return the official Unicode name of each character in *text*,
    substituting "?" for characters that have no name (e.g. controls)."""
    names = []
    for char in text:
        names.append(unicodedata.name(char, "?"))
    return names
|
|