# Source attribution (stray scraped header, kept as a comment):
# thewh1teagle — latest — commit 3f53a9f
from mishkal import lexicon
import unicodedata
import regex as re
def sort_diacritics(match):
    """Regex replacement callback: return the matched base letter followed by
    its combining marks sorted into a deterministic (code-point) order."""
    base = match.group(1)
    marks = match.group(2)
    return base + "".join(sorted(marks))
# Regex-pattern -> replacement table applied (in insertion order) by normalize()
# after NFD decomposition.
NORMALIZE_PATTERNS = {
    # A letter (\p{L}) followed by one or more combining marks (\p{M}):
    # reorder the marks via sort_diacritics, which sorts them by code point.
    # NOTE(review): the original comment said "make dagesh first", but sorted()
    # orders by code point and dagesh (U+05BC) sorts AFTER most niqqud — confirm
    # the intended mark order.
    r"(\p{L})(\p{M}+)": sort_diacritics,
    # Fold Hebrew gershayim/geresh into ASCII quote characters.
    "״": '"',
    "׳": "'",
}
def remove_niqqud(text: str):
    """Return *text* with every character matching lexicon.HE_NIQQUD_PATTERN removed."""
    without_marks = re.sub(lexicon.HE_NIQQUD_PATTERN, "", text)
    return without_marks
def has_niqqud(text: str):
    """Return True when *text* contains at least one niqqud mark
    (anything matching lexicon.HE_NIQQUD_PATTERN)."""
    found = re.search(lexicon.HE_NIQQUD_PATTERN, text)
    return found is not None
def normalize(text: str) -> str:
    """
    Normalize Hebrew text for phonemization:
    1. Decompose to Unicode NFD (base letters + separate combining marks).
    2. Apply NORMALIZE_PATTERNS in order (diacritic reordering, quote folding).
    3. Collapse duplicate niqqud "sounds" to a single canonical mark via
       lexicon.NIQQUD_DEDUPLICATE (e.g. only Patah instead of Kamatz).
    """
    decomposed = unicodedata.normalize("NFD", text)
    # Pattern-based rewrites (regex keys, string or callable replacements).
    for pattern, replacement in NORMALIZE_PATTERNS.items():
        decomposed = re.sub(pattern, replacement, decomposed)
    # Plain string substitutions deduplicating phonetically identical niqqud.
    for source_mark, canonical_mark in lexicon.NIQQUD_DEDUPLICATE.items():
        decomposed = decomposed.replace(source_mark, canonical_mark)
    return decomposed
def post_normalize(phonemes: str):
    """Per-word cleanup of a space-separated phoneme string: strip edge
    glottal stops (ʔ) and edge 'h' (keeping a leading stress mark ˈ),
    and reduce a final 'ij' to 'i'."""
    # Ordered (pattern, replacement) rules; order matters because later
    # rules see the output of earlier ones.
    edge_rules = (
        # bare glottal stop at word start/end
        (r"^ʔ|ʔ$", ""),
        # stressed glottal stop at word start: keep only the stress mark
        (r"^ˈʔ", "ˈ"),
        # NOTE(review): `ʔ$` is mostly dead after the first rule (only fires
        # on repeated trailing ʔ) and replaces it with a trailing stress
        # mark — confirm intent upstream.
        (r"ʔ$", "ˈ"),
        # 'h' at word start/end
        (r"^h|h$", ""),
        # stressed 'h' at word edge: keep only the stress mark
        (r"^ˈh|ˈh$", "ˈ"),
        # final 'ij' reduces to 'i'
        (r"ij$", "i"),
    )

    def _clean(word: str) -> str:
        for pattern, replacement in edge_rules:
            word = re.sub(pattern, replacement, word)
        return word

    return " ".join(_clean(word) for word in phonemes.split(" "))
def get_unicode_names(text: str):
    """Return the official Unicode name of each character in *text*,
    substituting '?' for characters without an assigned name."""
    names = []
    for ch in text:
        names.append(unicodedata.name(ch, "?"))
    return names