import unicodedata

import regex as re
from mishkal import lexicon

def sort_diacritics(match):
    """Sort the combining marks that follow a letter into a deterministic order."""
    letter = match.group(1)
    # Sorting by code point makes the serialization independent of input order
    diacritics = "".join(sorted(match.group(2)))
    return letter + diacritics

NORMALIZE_PATTERNS = {
    # A letter followed by one or more combining marks (niqqud, dagesh):
    # sort the marks so equivalent inputs serialize identically
    r"(\p{L})(\p{M}+)": sort_diacritics,
    # Map Hebrew gershayim/geresh to ASCII quotes
    "״": '"',
    "׳": "'",
}

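# Illustrative check (hand-verified): the `regex` module's \p{M}+ captures the
# combining marks after a letter, and sort_diacritics orders them by code
# point, so the same marks always serialize the same way however they were typed:
#   re.sub(r"(\p{L})(\p{M}+)", sort_diacritics, "ב\u05BC\u05B7")  # dagesh, patah
#   re.sub(r"(\p{L})(\p{M}+)", sort_diacritics, "ב\u05B7\u05BC")  # patah, dagesh
#   -> both return "ב\u05B7\u05BC" (U+05B7 PATAH sorts before U+05BC DAGESH)
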
def remove_niqqud(text: str) -> str:
    return re.sub(lexicon.HE_NIQQUD_PATTERN, "", text)

def has_niqqud(text: str) -> bool:
    return re.search(lexicon.HE_NIQQUD_PATTERN, text) is not None

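# Illustrative behavior, assuming lexicon.HE_NIQQUD_PATTERN matches the Hebrew
# niqqud combining marks (roughly U+05B0..U+05C7):
#   has_niqqud("אָב")    -> True
#   remove_niqqud("אָב") -> "אב"
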
def normalize(text: str) -> str:
    """
    Normalize Unicode (decompose to NFD),
    deduplicate niqqud (e.g. keep only Patah instead of Kamatz),
    and keep only Hebrew characters / punctuation / IPA.
    """
    # Decompose so letters and their diacritics become separate code points
    text = unicodedata.normalize("NFD", text)
    for pattern, replacement in NORMALIZE_PATTERNS.items():
        text = re.sub(pattern, replacement, text)
    # Normalize niqqud: collapse marks with duplicate phonetic 'sounds'
    # (e.g. keep only Patah)
    for old, new in lexicon.NIQQUD_DEDUPLICATE.items():
        text = text.replace(old, new)
    return text

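# Illustrative example (the deduplication step depends on the contents of
# lexicon.NIQQUD_DEDUPLICATE, assumed here to map only niqqud marks):
#   normalize("חב״ד") -> 'חב"ד'   # gershayim becomes an ASCII double quote
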
def post_normalize(phonemes: str) -> str:
    new_phonemes = []
    for word in phonemes.split(" "):
        # Remove a glottal stop from the start/end of the word; when it
        # carries the stress mark, keep the mark ("ˈʔa" -> "ˈa")
        word = re.sub(r"^ʔ|ʔ$", "", word)
        word = re.sub(r"^ˈʔ", "ˈ", word)
        word = re.sub(r"ˈʔ$", "ˈ", word)
        # Remove h from the start/end of the word, again keeping the stress mark
        word = re.sub(r"^h|h$", "", word)
        word = re.sub(r"^ˈh|ˈh$", "ˈ", word)
        # Collapse a final "ij" to "i"
        word = re.sub(r"ij$", "i", word)
        new_phonemes.append(word)
    return " ".join(new_phonemes)

def get_unicode_names(text: str) -> list[str]:
    return [unicodedata.name(c, "?") for c in text]

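# Example: inspect how a pointed letter decomposes into code points.
#   get_unicode_names("אָ") -> ["HEBREW LETTER ALEF", "HEBREW POINT QAMATS"]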