Spaces:
Build error
Build error
thewh1teagle
committed on
Commit
·
11e61f2
0
Parent(s):
latest
Browse files- README.md +10 -0
- app.py +43 -0
- mishkal/__init__.py +28 -0
- mishkal/data/kamatz_katan.json +5 -0
- mishkal/data/rashej_tevot.json +3 -0
- mishkal/data/symbols.json +4 -0
- mishkal/expander/__init__.py +32 -0
- mishkal/expander/dates.py +60 -0
- mishkal/expander/dictionary.py +78 -0
- mishkal/expander/number_names.py +193 -0
- mishkal/expander/numbers.py +28 -0
- mishkal/expander/time_to_word.py +104 -0
- mishkal/lexicon.py +179 -0
- mishkal/log.py +35 -0
- mishkal/phonemize.py +207 -0
- mishkal/utils.py +63 -0
- requirements.txt +4 -0
README.md
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Phonemize in Hebrew
|
3 |
+
emoji: 🐢
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: "4.44.0"
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
app.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
uv sync
|
3 |
+
uv pip install "gradio>=5.15.0"
|
4 |
+
uv run gradio examples/editor.py
|
5 |
+
"""
|
6 |
+
|
7 |
+
from mishkal import phonemize, normalize
|
8 |
+
import gradio as gr
|
9 |
+
|
10 |
+
default_text = """
|
11 |
+
כָּל עֶרֶב יָאִיר (הַשֵּׁם הַמָּלֵא וּמְקוֹם הָעֲבוֹדָה שֶׁלּוֹ שְׁמוּרִים בַּמַּעֲרֶכֶת) רָץ 20 קִילוֹמֶטֶר. הוּא מְסַפֵּר לִי שֶׁזֶּה מְנַקֶּה לוֹ אֶת הָרֹאשׁ אַחֲרֵי הָעֲבוֹדָה, "שָׁעָה וָחֵצִי בְּלִי עֲבוֹדָה, אִשָּׁה וִילָדִים" כְּמוֹ שֶׁהוּא מַגְדִּיר זֹאת. אֲבָל אַחֲרֵי הַמִּקְלַחַת הוּא מַתְחִיל בְּמָה שֶׁנִּתָּן לְכַנּוֹת הָעֲבוֹדָה הַשְּׁנִיָּה שֶׁלּוֹ: לִמְצֹא לוֹ קוֹלֵגוֹת חֲדָשׁוֹת לָעֲבוֹדָה, כִּי יָאִיר הוּא כַּנִּרְאֶה הַמֶּלֶךְ שֶׁל "חָבֵר מֵבִיא חָבֵר" בְּיִשְׂרָאֵל.
|
12 |
+
"""
|
13 |
+
|
14 |
+
theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])
|
15 |
+
|
16 |
+
|
17 |
+
def on_submit_debug(text: str) -> str:
    """Debug handler: phonemize *text* (keeping punctuation) and append
    the normalized form of the input so both can be inspected at once."""
    ipa = phonemize(text, preserve_punctuation=True)
    normalized = normalize(text)
    return ipa + "\n\nNormalized:\n" + normalized
|
21 |
+
|
22 |
+
|
23 |
+
def on_submit(text: str) -> str:
    # Standard (non-debug) handler: return phonemes only, with
    # punctuation stripped from the output.
    return phonemize(text, preserve_punctuation=False)
|
25 |
+
|
26 |
+
|
27 |
+
with gr.Blocks(theme=theme) as demo:
    # Right-to-left textbox pre-filled with the demo sentence.
    text_input = gr.Textbox(
        value=default_text, label="Text", rtl=True, elem_classes=["input"]
    )
    checkbox = gr.Checkbox(value=False, label="Enable Debug Mode")
    phonemes_output = gr.Textbox(label="Phonemes")
    submit_button = gr.Button("Create")

    # Route to the debug or plain handler depending on the checkbox state.
    submit_button.click(
        fn=lambda text, debug: on_submit_debug(text) if debug else on_submit(text),
        inputs=[text_input, checkbox],
        outputs=[phonemes_output],
    )


if __name__ == "__main__":
    demo.launch()
|
mishkal/__init__.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
High level phonemize functions
|
3 |
+
"""
|
4 |
+
|
5 |
+
from .phonemize import Phonemizer
|
6 |
+
from .utils import normalize # noqa: F401
|
7 |
+
from typing import Callable
|
8 |
+
|
9 |
+
phonemizer = Phonemizer()
|
10 |
+
|
11 |
+
|
12 |
+
def phonemize(
    text: str,
    preserve_punctuation=True,
    preserve_stress=True,
    use_expander=False,
    use_post_normalize=False,  # For TTS
    fallback: Callable[[str], str] = None,
) -> str:
    """
    Phonemize Hebrew *text* via the shared module-level ``Phonemizer``.

    Thin convenience wrapper: all keyword options are forwarded unchanged
    to ``Phonemizer.phonemize``. ``fallback``, when given, is called on
    non-Hebrew words to produce their phonemes.
    """
    return phonemizer.phonemize(
        text,
        preserve_punctuation=preserve_punctuation,
        preserve_stress=preserve_stress,
        fallback=fallback,
        use_expander=use_expander,
        use_post_normalize=use_post_normalize,
    )
|
mishkal/data/kamatz_katan.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"כל": "ˈkol",
|
3 |
+
"רחבי": "roxˈbi",
|
4 |
+
"אמנות": "omaˈnut"
|
5 |
+
}
|
mishkal/data/rashej_tevot.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"צה״ל": "ˈtsahal"
|
3 |
+
}
|
mishkal/data/symbols.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"₪": "ʃeˈkel",
|
3 |
+
"$": "doˈlar"
|
4 |
+
}
|
mishkal/expander/__init__.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Expand dates and numbers into words with niqqud
|
3 |
+
This happens before phonemization
|
4 |
+
"""
|
5 |
+
|
6 |
+
from .numbers import num_to_word
|
7 |
+
from .dates import date_to_word
|
8 |
+
from .time_to_word import time_to_word
|
9 |
+
from .dictionary import Dictionary
|
10 |
+
from mishkal.log import log
|
11 |
+
|
12 |
+
|
13 |
+
class Expander:
    """
    Expand dates, times and numbers in a text into Hebrew words with
    niqqud, after first applying the word dictionary.
    """

    def __init__(self):
        self.dictionary = Dictionary()

    def expand_text(self, text: str):
        """
        Expand each whitespace-separated word of *text*.

        Per word, tries date -> time -> number expansion, stopping at the
        first expander that changes the word. A word whose expansion
        raises is kept unchanged and the error is logged.
        """
        text = self.dictionary.expand_text(text)

        words = []
        for source_word in text.split():
            try:
                word = date_to_word(source_word)
                if word == source_word:
                    word = time_to_word(word)
                if word == source_word:
                    word = num_to_word(word)
                words.append(word)
            except Exception as e:
                # BUG FIX: log source_word, not `word` — `word` is unbound
                # here when date_to_word itself raised, which would turn the
                # error path into a NameError.
                log.error(f"Failed to expand {source_word} with error: {e}")
                words.append(source_word)
        return " ".join(words)
|
mishkal/expander/dates.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
from .numbers import num_to_word
|
3 |
+
|
4 |
+
# Mapping of month names in Hebrew with diacritics (Gregorian months)
|
5 |
+
MONTHS = {
|
6 |
+
1: "יָנוּאָר",
|
7 |
+
2: "פֶבְרוּאָר",
|
8 |
+
3: "מֵרְץ",
|
9 |
+
4: "אֵפְרִיל",
|
10 |
+
5: "מַאי",
|
11 |
+
6: "יוּנִי",
|
12 |
+
7: "יוּלִי",
|
13 |
+
8: "אוֹגֻסְט",
|
14 |
+
9: "סֶפְּטֶמְבֶּר",
|
15 |
+
10: "אוֹקְטוֹבֶּר",
|
16 |
+
11: "נוֹבֶמְבֶּר",
|
17 |
+
12: "דֶּצֶמְבֶּר",
|
18 |
+
}
|
19 |
+
|
20 |
+
# Mapping of day names in Hebrew with diacritics
|
21 |
+
DAYS = {
|
22 |
+
0: "יוֹם רִאשׁוֹן",
|
23 |
+
1: "יוֹם שֵׁנִי",
|
24 |
+
2: "יוֹם שְׁלִישִׁי",
|
25 |
+
3: "יוֹם רֵבִיעִי",
|
26 |
+
4: "יוֹם חֲמִישִׁי",
|
27 |
+
5: "יוֹם שִׁישִׁי",
|
28 |
+
6: "יוֹם שַׁבָּת",
|
29 |
+
}
|
30 |
+
|
31 |
+
|
32 |
+
def date_to_word(word: str, include_day_name=False) -> str:
    """
    Converts a given date string in formats (YYYY-MM-DD, YYYY.MM.DD, YYYY/MM/DD,
    and the day-first variants) to Hebrew date words with diacritics.
    Returns the original word unchanged if it is not a valid date.
    """
    seps = ["-", ".", "/"]
    component_orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
    candidate_formats = [sep.join(order) for order in component_orders for sep in seps]

    for fmt in candidate_formats:
        try:
            parsed = datetime.strptime(word, fmt)
        except ValueError:
            # Not this format — try the next one.
            continue

        # Hebrew day-of-week and month names carry their own diacritics.
        day_name = DAYS[parsed.weekday()]
        month_name = MONTHS[parsed.month]
        day_words = num_to_word(str(parsed.day))
        year_words = num_to_word(str(parsed.year))

        result = f"{day_words} בֵּ{month_name} {year_words}"
        if include_day_name:
            result = f"{day_name}, {result}"
        return result

    return word
|
mishkal/expander/dictionary.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Dictionaries are tab separated key value words
|
3 |
+
"""
|
4 |
+
|
5 |
+
from pathlib import Path
|
6 |
+
import json
|
7 |
+
import re
|
8 |
+
from mishkal.utils import remove_niqqud
|
9 |
+
from mishkal.utils import normalize
|
10 |
+
import unicodedata
|
11 |
+
|
12 |
+
files = Path(__file__).parent.joinpath("../data").glob("*.json")
|
13 |
+
# Sort in reverse order to prioritize the most recent and best
|
14 |
+
order = {"bronze": 1, "silver": 2, "gold": 3}
|
15 |
+
files = sorted(
|
16 |
+
files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
|
17 |
+
)
|
18 |
+
|
19 |
+
|
20 |
+
class Dictionary:
    """
    In-memory lookup table mapping Hebrew words to replacement strings,
    merged from the JSON files discovered by the module-level ``files``
    glob. Used to expand known words before phonemization.
    """

    def __init__(self):
        # word -> replacement, merged from all data files (later files win)
        self.dict = {}
        self.load_dictionaries()

    def load_dictionaries(self):
        """Load every JSON data file and merge it into ``self.dict``."""
        for file in files:
            with open(file, "r", encoding="utf-8") as f:
                dictionary: dict = json.load(f)
                normalized_dictionary = {}

                # normalize niqqud keys so lookups match normalized input
                for k, v in dictionary.items():
                    k = normalize(k)
                    # Ensure not empty
                    if k and v:
                        normalized_dictionary[k] = v
                self.dict.update(normalized_dictionary)

    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
        """
        Replace a run of Hebrew letters/niqqud with its dictionary entry.

        Lookup precedence: NFD-decomposed text as-is, then with niqqud
        stripped, then normalized. Returns the original text if nothing
        matches.
        """
        source: str = match.group(0)
        # decomposite: NFD so niqqud marks become separate codepoints
        source = unicodedata.normalize("NFD", source)
        raw_lookup = self.dict.get(source)

        without_niqqud_lookup = self.dict.get(remove_niqqud(source))
        with_niqqud_lookup = self.dict.get(normalize(source))
        # NOTE(review): the original comment said "compare without niqqud
        # ONLY if source has no niqqud", but the code falls through to the
        # stripped lookup unconditionally — confirm intended precedence.
        if raw_lookup:
            return raw_lookup
        if without_niqqud_lookup:
            return without_niqqud_lookup
        elif with_niqqud_lookup:
            return with_niqqud_lookup
        return source

    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
        """Expand a single whitespace-delimited token via the dictionary."""
        raw_source: str = match.group(0)
        # Pure numbers are handled elsewhere (number expander) — skip.
        if raw_source.isnumeric():
            return raw_source

        raw_lookup = self.dict.get(raw_source)

        # An exact match on the raw token wins outright.
        if raw_lookup:
            return raw_lookup
        # Otherwise search Hebrew-only runs inside the token
        # (letters, regular niqqud, apostrophe, space).
        raw_source = re.sub(
            r"[\u05B0-\u05EB ']+", self.replace_hebrew_only_callback, raw_source
        )
        return raw_source

    def expand_text(self, text: str) -> str:
        """
        Replace every known token in *text* with its dictionary entry.

        TODO: if key doesn't have diacritics expand even diacritized words
        """
        text = re.sub(r"\S+", self.replace_non_whitespace_callback, text)

        return text
|
mishkal/expander/number_names.py
ADDED
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
|
3 |
+
"""
|
4 |
+
|
5 |
+
# TODO: add niqqud hints
|
6 |
+
|
7 |
+
ZERO = {"אפס": "אֶפֶס"}
|
8 |
+
|
9 |
+
|
10 |
+
ONES = {
|
11 |
+
"אחת": "אַחַת",
|
12 |
+
"אחד": "אֶחָד",
|
13 |
+
"ראשונה": "רִאשׁוֹנָה",
|
14 |
+
"ראשון": "רִאשׁוֹן",
|
15 |
+
"ראשונות": "רִאשׁוֹנוֹת",
|
16 |
+
"ראשונים": "רִאשׁוֹנִים",
|
17 |
+
"שתיים": "שְׁתַּיִם",
|
18 |
+
"שניים": "שְׁנַיִם",
|
19 |
+
"שתי": "שְׁתֵּי",
|
20 |
+
"שני": "שְׁנֵי",
|
21 |
+
"שנייה": "שְׁנִיָּה",
|
22 |
+
"שניות": "שְׁנִיּוֹת",
|
23 |
+
"שלוש": "שָׁלוֹשׁ",
|
24 |
+
"שלושה": "שְׁלוֹשָׁה",
|
25 |
+
"שלושת": "שְׁלוֹשֶׁת",
|
26 |
+
"שלישית": "שְׁלִישִׁית",
|
27 |
+
"שלישי": "שְׁלִישִׁי",
|
28 |
+
"שלישיות": "שְׁלִישִׁיּוֹת",
|
29 |
+
"שלישיים": "שְׁלִישִׁיִּים",
|
30 |
+
"ארבע": "אַרְבַּע",
|
31 |
+
"ארבעה": "אַרְבַּעָה",
|
32 |
+
"ארבעת": "אַרְבַּעַת",
|
33 |
+
"רביעית": "רֵבִיעִית",
|
34 |
+
"רביעי": "רֵבִיעִי",
|
35 |
+
"רביעיות": "רֵבִיעִיוֹת",
|
36 |
+
"רביעיים": "רֵבִיעִיִּים",
|
37 |
+
"חמש": "חָמֵשׁ",
|
38 |
+
"חמישה": "חֲמִשָּׁה",
|
39 |
+
"חמשת": "חֲמֵשֶׁת",
|
40 |
+
"חמישית": "חֲמִישִּׁית",
|
41 |
+
"חמישי": "חֲמִישִּׁי",
|
42 |
+
"חמישיות": "חֲמִישִּׁיוֹת",
|
43 |
+
"חמישיים": "חֲמִישִּׁיִּים",
|
44 |
+
"שש": "שֵׁשׁ",
|
45 |
+
"שישה": "שִׁשָּׁה",
|
46 |
+
"ששת": "שֵׁשֶׁת",
|
47 |
+
"שישית": "שִׁשִּׁית",
|
48 |
+
"שישי": "שִׁשִּׁי",
|
49 |
+
"שישיות": "שִׁשִּׁיוֹת",
|
50 |
+
"שישיים": "שִׁשִּׁיִּים",
|
51 |
+
"שבע": "שֶׁבַע",
|
52 |
+
"שבעה": "שִׁבְעָה",
|
53 |
+
"שבעת": "שִׁבְעַת",
|
54 |
+
"שביעית": "שְׁבִיעִית",
|
55 |
+
"שביעי": "שְׁבִיעִי",
|
56 |
+
"שביעיות": "שְׁבִיעִיוֹת",
|
57 |
+
"שביעיים": "שְׁבִיעִיִּים",
|
58 |
+
"שמונה": "שְׁמוֹנֶה",
|
59 |
+
"שמונת": "שְׁמוֹנַת",
|
60 |
+
"שמינית": "שְׁמִינִית",
|
61 |
+
"שמיני": "שְׁמִינִי",
|
62 |
+
"שמיניות": "שְׁמִינִיוֹת",
|
63 |
+
"שמיניים": "שְׁמִינִיִּים",
|
64 |
+
"תשע": "תֵּשַׁע",
|
65 |
+
"תשעה": "תִּשְׁעָה",
|
66 |
+
"תשעת": "תִּשְׁעַת",
|
67 |
+
"תשיעית": "תְּשִׁיעִית",
|
68 |
+
"תשיעי": "תְּשִׁיעִי",
|
69 |
+
"תשיעיות": "תְּשִׁיעִיּוֹת",
|
70 |
+
"תשיעיים": "תְּשִׁיעִיִּים",
|
71 |
+
}
|
72 |
+
|
73 |
+
|
74 |
+
TENS = {
|
75 |
+
"עשר": "עֶשֶׂר",
|
76 |
+
"עשרה": "עֲשָׁרָה",
|
77 |
+
"עשרת": "עֲשֶׁרֶת",
|
78 |
+
"עשירית": "עֲשִׁירִית",
|
79 |
+
"עשירי": "עֲשִׁירִי",
|
80 |
+
"עשיריות": "עֲשִׁירִיוֹת",
|
81 |
+
"עשיריים": "עֲשִׁירִיִּים",
|
82 |
+
"שתים עשרה": "שְׁתֵּים עֶשְׂרֵה",
|
83 |
+
"שנים עשר": "שְׁנֵים עָשָׂר",
|
84 |
+
}
|
85 |
+
|
86 |
+
|
87 |
+
TWENTIES = {
|
88 |
+
"עשרים": "עֶשְׂרִ֫ים",
|
89 |
+
"שלושים": "שְׁלוֹשִׁים",
|
90 |
+
"ארבעים": "אַרְבָּעִים",
|
91 |
+
"חמישים": "חֲמִשִּׁים",
|
92 |
+
"שישים": "שִׁשִּׁים",
|
93 |
+
"שבעים": "שִׁבְעִים",
|
94 |
+
"שמונים": "שְׁמוֹנִים",
|
95 |
+
"תשעים": "תִּשְׁעִים",
|
96 |
+
}
|
97 |
+
|
98 |
+
|
99 |
+
HUNDREDS = {
|
100 |
+
"מאה": "מֵאָה",
|
101 |
+
"מאת": "מֵאַת",
|
102 |
+
"מאתיים": "מָאתַיִם",
|
103 |
+
"מאות": "מֵאוֹת",
|
104 |
+
}
|
105 |
+
|
106 |
+
THOUSANDS = {
|
107 |
+
"אלף": "אֶלֶף",
|
108 |
+
"אלפיים": "אַלְפַּיִם",
|
109 |
+
"אלפים": "אֲלָפִים",
|
110 |
+
"אלפי": "אַלְפִּי",
|
111 |
+
}
|
112 |
+
|
113 |
+
|
114 |
+
# Large-number scale words (million, billion, ...) with niqqud.
# BUG FIX: the value for "אונדסיליון" contained U+FFFD replacement
# characters (mojibake); reconstructed by analogy with its adjacent
# "אונדסיליוני" form.
LARGE = {
    "מיליון": "מִילְיוֹן",
    "מיליוני": "מִילְיוֹנִי",
    "מיליארד": "מִילְיַארְד",
    "מיליארדי": "מִילְיַארְדִּי",
    "טריליון": "טְרִילְיוֹן",
    "טריליוני": "טְרִילְיוֹנִי",
    "קוודריליון": "קוֹוַדְרִילְיוֹן",
    "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
    "קווינטיליון": "קוִוִּנְטִילְיוֹן",
    "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
    "סקסטיליון": "סְקֶסְטִילְיוֹן",
    "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
    "ספטיליון": "סְפֶּטִילְיוֹן",
    "ספטיליוני": "סְפֶּטִילְיוֹנִי",
    "אוקטיליון": "אוֹקְטִילְיוֹן",
    "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
    "נוניליון": "נוּנִילְיוֹן",
    "נוניליוני": "נוּנִילְיוֹנִי",
    "דסיליון": "דֶּסִילְיוֹן",
    "דסיליוני": "דֶּסִילְיוֹנִי",
    "אונדסיליון": "אוּנְדְסִילְיוֹן",
    "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
    "דואודסיליון": "דוּאודְסִילְיוֹן",
    "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
    "טרדסיליון": "טֶרְדְסִילְיוֹן",
    "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
    "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
    "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
    "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
    "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
    "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
    "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
    "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
    "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
    "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
    "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
    "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
    "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
    "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
    "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
}
|
156 |
+
|
157 |
+
|
158 |
+
LETTERS = {
|
159 |
+
"ו": "וֵ",
|
160 |
+
"ה": "הַ",
|
161 |
+
}
|
162 |
+
|
163 |
+
|
164 |
+
CURRENCY = {
|
165 |
+
"שקל": "שֵׁקֶל",
|
166 |
+
"שקלים": "שְׁקָלִים",
|
167 |
+
"אגורה": "אֲגוֹרָה",
|
168 |
+
"אגורות": "אֲגוֹרוֹת",
|
169 |
+
"אירו": "אֵירוֹ",
|
170 |
+
"סנט": "סֵנְט",
|
171 |
+
"סנטים": "סֵנְטִים",
|
172 |
+
"דולר": "דוֹלָר",
|
173 |
+
"דולרים": "דוֹלָרִים",
|
174 |
+
}
|
175 |
+
|
176 |
+
|
177 |
+
POINTS = {
|
178 |
+
"מינוס": "מִינּוּס",
|
179 |
+
"נקודה": "נְקֻדָּה",
|
180 |
+
}
|
181 |
+
|
182 |
+
NUMBER_NAMES = {
|
183 |
+
**CURRENCY,
|
184 |
+
**HUNDREDS,
|
185 |
+
**LARGE,
|
186 |
+
**LETTERS,
|
187 |
+
**ONES,
|
188 |
+
**POINTS,
|
189 |
+
**TENS,
|
190 |
+
**THOUSANDS,
|
191 |
+
**TWENTIES,
|
192 |
+
**ZERO,
|
193 |
+
}
|
mishkal/expander/numbers.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import num2words
|
2 |
+
from .number_names import NUMBER_NAMES
|
3 |
+
import re
|
4 |
+
|
5 |
+
|
6 |
+
def add_diacritics(words: str):
    """
    Add niqqud to each whitespace-separated word of *words* using the
    NUMBER_NAMES lookup table; unknown words are kept unchanged.

    Also handles a one-letter prefix (e.g. Vav/He conjunction) followed
    by a known number word.
    """
    new_words = []
    for word in words.split():
        if NUMBER_NAMES.get(word):
            new_words.append(NUMBER_NAMES[word])
        elif (
            len(word) > 1
            and NUMBER_NAMES.get(word[1:])
            # BUG FIX: guard the prefix lookup too — previously a word whose
            # suffix matched but whose first letter was not in the table
            # (e.g. a Bet prefix; only Vav/He are listed) raised KeyError.
            and NUMBER_NAMES.get(word[0])
        ):
            # With Vav or He prefix
            new_words.append(NUMBER_NAMES[word[0]] + NUMBER_NAMES[word[1:]])
        else:
            new_words.append(word)
    return " ".join(new_words)
|
17 |
+
|
18 |
+
|
19 |
+
def num_to_word(maybe_number: str) -> str:
    """
    Replace every run of digits in *maybe_number* with diacritized Hebrew
    number words; non-digit text is passed through untouched.
    """

    def _digits_to_hebrew(m):
        # num2words spells the digits out in Hebrew; add_diacritics then
        # swaps each word for its niqqud-annotated form.
        spelled_out = num2words.num2words(m.group(), lang="he", ordinal=False)
        return add_diacritics(spelled_out)

    return re.sub(r"\d+", _digits_to_hebrew, maybe_number)
|
mishkal/expander/time_to_word.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Convert time to words
|
3 |
+
TODO: fix zeros eg. 22:00
|
4 |
+
"""
|
5 |
+
|
6 |
+
import re
|
7 |
+
|
8 |
+
PATTERNS = [
|
9 |
+
r"(\d{1,2})([apm]{2})", # AM/PM format
|
10 |
+
r"(\d{1,2}):(\d{2})", # HH:MM format
|
11 |
+
]
|
12 |
+
|
13 |
+
|
14 |
+
def extract_time(match):
    """
    Extract hour and minute from a matched string in HH:MM or AM/PM
    format and return the spoken Hebrew form.

    Returns the original matched text when the format is not recognized.
    """
    original = match.group(0)
    time_str = original.lower().strip()

    # Check for HH:MM format
    m_obj = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if m_obj:
        h = int(m_obj.group(1))
        m = int(m_obj.group(2))
        return f"{convert_to_word(h, m)}"

    # Check for AM/PM format
    m_obj = re.match(r"(\d{1,2})([apm]{2})", time_str)
    if m_obj:
        h = int(m_obj.group(1))
        period = m_obj.group(2)

        # Normalize to 24-hour format
        if period == "am" and h == 12:
            h = 0
        elif period == "pm" and h != 12:
            h += 12
        return f"{convert_to_word(h, 0)}"  # Defaulting to 0 minutes when only hour is provided

    # BUG FIX: the original reassigned the `match` parameter with each
    # re.match call, so reaching this point meant `match` was None and
    # `match.group(0)` raised AttributeError. Return the saved original.
    return original
|
42 |
+
|
43 |
+
|
44 |
+
def convert_to_word(h, m):
    """
    Spell out a clock time (hour *h* 0-23, minute *m* 0-59) in Hebrew
    words with niqqud, on a 12-hour clock.
    """
    hour_words = [
        "אֶפֶס",
        "אַחַת",
        "שְׁנַיִם",  # Will be replaced with "שֵׁנִי" when needed
        "שָׁלוֹשׁ",
        "אַרְבַּע",
        "חָמֵשׁ",
        "שֵׁשׁ",
        "שֶׁבַע",
        "שְׁמוֹנֵה",
        "תֵּשַׁע",
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
    ]

    tens_words = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]

    teens_words = [
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
        "שְׁלוֹשׁ עֶשְׂרֵה",
        "אַרְבַּע עֶשְׂרֵה",
        "חֲמֵשׁ עֶשְׂרֵה",
        "שֵׁשׁ עֶשְׂרֵה",
        "שְׁבַע עֶשְׂרֵה",
        "שְׁמוֹנֶה עֶשְׂרֵה",
        "תְּשַׁע עֶשְׂרֵה",
    ]

    glue = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}

    # Map onto a 12-hour clock: midnight -> 12, afternoon hours wrap.
    if h == 0:
        h = 12
    elif h > 12:
        h -= 12

    hour_part = hour_words[h]

    if m == 0:
        return f"{hour_part}"

    if 1 <= m <= 9:
        # "שתיים" is replaced with "שתי" before a noun.
        unit = glue["shtey"] if m == 2 else hour_words[m]
        return f"{hour_part} {glue['and']}{unit} {glue['minutes']}"

    if 10 <= m <= 19:
        return f"{hour_part} {glue['and']}{teens_words[m - 10]} {glue['minutes']}"

    tens_part = f"{glue['and']}{tens_words[m // 10]}"
    units_part = f"{glue['and']}{hour_words[m % 10]}" if m % 10 != 0 else ""
    return f"{hour_part} {tens_part} {units_part} {glue['minutes']}".strip()
|
101 |
+
|
102 |
+
|
103 |
+
def time_to_word(text: str):
    # Replace every HH:MM / "<hour>am|pm" occurrence in *text* with its
    # spoken Hebrew form via extract_time; other text is left unchanged.
    return re.sub("|".join(PATTERNS), extract_time, text)
|
mishkal/lexicon.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
ASCII IPA transcription of Hebrew consonants and vowels.
|
3 |
+
"""
|
4 |
+
|
5 |
+
# https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
|
6 |
+
HE_CHARS_PATTERN = (
|
7 |
+
r"\b[\u05B0-\u05EA\u05F3\u0027]+\b" # Chars including niqqud, geresh and en_geresh
|
8 |
+
)
|
9 |
+
HE_NIQQUD_PATTERN = r"[\u05B0-\u05C7]"
|
10 |
+
PUNCTUATION = r".,!? "
|
11 |
+
|
12 |
+
# Special
|
13 |
+
GIMEL_OR_ZAIN_WITH_DAGESH = "dʒ"
|
14 |
+
TSADIK_WITH_DAGESH = "tʃ"
|
15 |
+
SHIN_WITH_POINT = "ʃ"
|
16 |
+
SIN_WITH_POINT = "s"
|
17 |
+
STRESS = "\u02c8" # visually looks like '
|
18 |
+
SECONDARY_STRESS = "\u02cc"
|
19 |
+
HET_GNUVA = "ax"
|
20 |
+
W_AS_WALLA = "w"
|
21 |
+
|
22 |
+
GERESH_LETTERS = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}
|
23 |
+
|
24 |
+
LETTERS_NAMES_PHONEMES = {
|
25 |
+
"א": "alef", # Alef, glottal stop
|
26 |
+
"ב": "bet", # Bet
|
27 |
+
"ג": "gimel", # Gimel
|
28 |
+
"ד": "dalet", # Dalet
|
29 |
+
"ה": "hej", # He
|
30 |
+
"ו": "vav", # Vav
|
31 |
+
"ז": "zajin", # Zayin
|
32 |
+
"ח": "xet", # Het
|
33 |
+
"ט": "tet", # Tet
|
34 |
+
"י": "jud", # Yod
|
35 |
+
"ך": "xaf sofit", # Haf sofit
|
36 |
+
"כ": "xaf", # Haf
|
37 |
+
"ל": "lamed", # Lamed
|
38 |
+
"ם": "mem sofit", # Mem Sofit
|
39 |
+
"מ": "mem", # Mem
|
40 |
+
"ן": "nun sofit", # Nun Sofit
|
41 |
+
"נ": "nun", # Nun
|
42 |
+
"ס": "samex", # Samekh
|
43 |
+
"ע": "ajin", # Ayin, glottal stop
|
44 |
+
"פ": "fey", # Fey
|
45 |
+
"ף": "fey sofit", # Fey Sofit
|
46 |
+
"ץ": "tsadik sofit", # Tsadik sofit
|
47 |
+
"צ": "tsadik", # Tsadik
|
48 |
+
"ק": "kuf", # Kuf
|
49 |
+
"ר": "rejiʃ", # Resh
|
50 |
+
"ש": "ʃin", # Shin
|
51 |
+
"ת": "taf", # Taf
|
52 |
+
}
|
53 |
+
|
54 |
+
# Consonants
|
55 |
+
LETTERS_PHONEMES = {
|
56 |
+
"א": "ʔ", # Alef
|
57 |
+
"ב": "v", # Bet
|
58 |
+
"ג": "g", # Gimel
|
59 |
+
"ד": "d", # Dalet
|
60 |
+
"ה": "h", # He
|
61 |
+
"ו": "v", # Vav
|
62 |
+
"ז": "z", # Zayin
|
63 |
+
"ח": "x", # Het
|
64 |
+
"ט": "t", # Tet
|
65 |
+
"י": "j", # Yod
|
66 |
+
"ך": "x", # Haf sofit
|
67 |
+
"כ": "x", # Haf
|
68 |
+
"ל": "l", # Lamed
|
69 |
+
"ם": "m", # Mem Sofit
|
70 |
+
"מ": "m", # Mem
|
71 |
+
"ן": "n", # Nun Sofit
|
72 |
+
"נ": "n", # Nun
|
73 |
+
"ס": "s", # Samekh
|
74 |
+
"ע": "ʔ", # Ayin, only voweled
|
75 |
+
"פ": "f", # Fey
|
76 |
+
"ף": "f", # Fey Sofit
|
77 |
+
"ץ": "ts", # Tsadik sofit
|
78 |
+
"צ": "ts", # Tsadik
|
79 |
+
"ק": "k", # Kuf
|
80 |
+
"ר": "r", # Resh
|
81 |
+
"ש": "ʃ", # Shin
|
82 |
+
"ת": "t", # Taf
|
83 |
+
# Beged Kefet
|
84 |
+
"בּ": "b",
|
85 |
+
"כּ": "k",
|
86 |
+
"פּ": "p",
|
87 |
+
"שׁ": "ʃ",
|
88 |
+
"שׂ": "s",
|
89 |
+
"'": "",
|
90 |
+
}
|
91 |
+
|
92 |
+
# Vowels
|
93 |
+
VOWEL_A = "a"
|
94 |
+
VOWEL_E = "e"
|
95 |
+
VOWEL_I = "i"
|
96 |
+
VOWEL_O = "o"
|
97 |
+
VOWEL_U = "u"
|
98 |
+
|
99 |
+
NIQQUD_PHONEMES = {
|
100 |
+
"\u05b4": "i", # Hiriq
|
101 |
+
"\u05b5": "e", # Tsere
|
102 |
+
"\u05b7": "a", # Patah
|
103 |
+
"\u05b9": "o", # Holam
|
104 |
+
"\u05ba": "o", # Holam haser for vav
|
105 |
+
"\u05bb": "u", # Qubuts
|
106 |
+
"\u05ab": "ˈ", # Stress (Atmaha)
|
107 |
+
"\u05bd": "e", # Shva na
|
108 |
+
}
|
109 |
+
|
110 |
+
SET_LETTER_SYMBOLS = {
|
111 |
+
"\u05b0", # Shva
|
112 |
+
"\u05b4", # Hiriq
|
113 |
+
"\u05b5", # Tsere
|
114 |
+
"\u05b7", # Patah
|
115 |
+
"\u05b9", # Holam
|
116 |
+
"\u05ba", # Holam haser for vav
|
117 |
+
"\u05bb", # Qubuts
|
118 |
+
"\u05bc", # Dagesh
|
119 |
+
"\u05c1", # Shin dot
|
120 |
+
"\u05c2", # Sin dot
|
121 |
+
"'", # Geresh
|
122 |
+
}
|
123 |
+
|
124 |
+
"""
|
125 |
+
We're left with the following niqqud (10):
|
126 |
+
Shva, Hiriq, Tsere, Patah, Holam, Qubuts, Dagesh,
|
127 |
+
Holam haser for vav, Shin dot, Sin dot
|
128 |
+
"""
|
129 |
+
NIQQUD_DEDUPLICATE = {
|
130 |
+
"\u05b1": "\u05b5", # Hataf Segol -> Tsere
|
131 |
+
"\u05b2": "\u05b7", # Hataf Patah -> Patah
|
132 |
+
"\u05b3": "\u05b9", # Hataf Qamats -> Holam
|
133 |
+
"\u05b6": "\u05b5", # Segol -> Tsere
|
134 |
+
# Kamatz -> Patah
|
135 |
+
"\u05b8": "\u05b7", # Qamats -> Patah
|
136 |
+
"\u05c7": "\u05b9", # Qamats Qatan -> Holam
|
137 |
+
"\u05f3": "'", # Hebrew geresh to regular geresh
|
138 |
+
}
|
139 |
+
|
140 |
+
|
141 |
+
SET_OUTPUT_CHARACTERS = set(
|
142 |
+
[
|
143 |
+
*GIMEL_OR_ZAIN_WITH_DAGESH,
|
144 |
+
TSADIK_WITH_DAGESH,
|
145 |
+
SHIN_WITH_POINT,
|
146 |
+
SIN_WITH_POINT,
|
147 |
+
W_AS_WALLA,
|
148 |
+
]
|
149 |
+
+ [STRESS, SECONDARY_STRESS]
|
150 |
+
+ list(LETTERS_PHONEMES.values())
|
151 |
+
+ list(NIQQUD_PHONEMES.values())
|
152 |
+
+ [VOWEL_A, VOWEL_E, VOWEL_I, VOWEL_O, VOWEL_U]
|
153 |
+
+ list(PUNCTUATION)
|
154 |
+
)
|
155 |
+
|
156 |
+
SET_NIQQUD = {
|
157 |
+
# Shva, Hiriq, Tsere, Patah, Holam, Holam haser for vav, Qubuts, Dagesh, Shin dot, Sin dot
|
158 |
+
"\u05b0",
|
159 |
+
"\u05b4",
|
160 |
+
"\u05b5",
|
161 |
+
"\u05b7",
|
162 |
+
"\u05b9",
|
163 |
+
"\u05ba",
|
164 |
+
"\u05bb",
|
165 |
+
"\u05bc",
|
166 |
+
"\u05c1",
|
167 |
+
"\u05c2",
|
168 |
+
# shva na and atmaha
|
169 |
+
"\u05bd", # shva na
|
170 |
+
"\u05ab", # atmaha
|
171 |
+
}
|
172 |
+
SET_LETTERS = set(LETTERS_PHONEMES.keys())
|
173 |
+
SET_PUNCTUATION = set(PUNCTUATION)
|
174 |
+
|
175 |
+
|
176 |
+
# Set for fast lookup
|
177 |
+
SET_INPUT_CHARACTERS = set(
|
178 |
+
list(LETTERS_PHONEMES.keys()) + list(SET_NIQQUD) + list(PUNCTUATION) + ["'"]
|
179 |
+
)
|
mishkal/log.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
import colorlog
|
4 |
+
|
5 |
+
|
6 |
+
def _create_logger():
    """
    Build the package logger with colorized console output.

    The level comes from the LOG_LEVEL environment variable
    (e.g. ``LOG_LEVEL=DEBUG python <script.py>``); defaults to WARNING.
    """
    color_map = {
        "DEBUG": "blue",
        "INFO": "green",
        "WARNING": "yellow",
        "ERROR": "red",
        "CRITICAL": "red",
    }
    formatter = colorlog.ColoredFormatter(
        fmt="%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s",
        log_colors=color_map,
    )
    stream_handler = colorlog.StreamHandler()
    stream_handler.setFormatter(formatter)

    # Resolve the requested level, falling back to WARNING on junk input.
    requested_level = os.getenv("LOG_LEVEL", "WARNING").upper()
    logger = colorlog.getLogger(__package__)
    logger.setLevel(level=getattr(logging, requested_level, logging.WARNING))
    # Emit to stdout/stderr via the colorized stream handler.
    logger.addHandler(stream_handler)
    return logger
|
33 |
+
|
34 |
+
|
35 |
+
log = _create_logger()
|
mishkal/phonemize.py
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
The actual letters phonemization happens here.
|
3 |
+
Phonemes generated based on rules.
|
4 |
+
|
5 |
+
Early rules:
|
6 |
+
1. Niqqud malle vowels
|
7 |
+
2. Dagesh (custom beged kefet)
|
8 |
+
3. Final letter without niqqud
|
9 |
+
4. Final Het gnuva
|
10 |
+
5. Geresh (Gimel, Ttadik, Zain)
|
11 |
+
6. Shva na
|
12 |
+
Revised rules:
|
13 |
+
1. Consonants
|
14 |
+
2. Niqqud
|
15 |
+
|
16 |
+
Reference:
|
17 |
+
- https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
|
18 |
+
- https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
|
19 |
+
- https://en.wikipedia.org/wiki/Help:IPA/Hebrew
|
20 |
+
"""
|
21 |
+
|
22 |
+
from mishkal import lexicon
|
23 |
+
from .expander import Expander
|
24 |
+
from mishkal.utils import normalize, post_normalize
|
25 |
+
from typing import Callable
|
26 |
+
import regex as re
|
27 |
+
|
28 |
+
|
29 |
+
class Phonemizer:
    """Rule-based Hebrew grapheme-to-phoneme (IPA) converter.

    Hebrew words (letters + niqqud) are phonemized letter-by-letter by
    `phonemize_hebrew`; Latin-script words may be delegated to a
    caller-supplied `fallback` function.
    """

    def __init__(self):
        # Expands numbers/dates/dictionary entries into spoken-form words.
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation=True,
        preserve_stress=True,
        use_expander=False,
        use_post_normalize=False,  # For TTS
        fallback: Callable[[str], str] = None,
    ) -> str:
        """Phonemize `text` into an IPA string.

        Args:
            text: Input text (Hebrew with niqqud, possibly mixed with Latin).
            preserve_punctuation: Keep punctuation characters in the output.
            preserve_stress: Keep primary/secondary stress marks.
            use_expander: Expand numbers/dates/etc. before phonemizing.
            use_post_normalize: Apply `post_normalize` to the result (for TTS).
            fallback: Optional g2p callback for Latin-script words; any
                character it emits is added to `lexicon.SET_OUTPUT_CHARACTERS`
                so the final output filter keeps it.

        Returns:
            The phonemized text; characters outside
            `lexicon.SET_OUTPUT_CHARACTERS` are dropped at the end.
        """
        # normalize
        text = normalize(text)
        # TODO: is that enough? what if there's punctuation around? other chars?
        he_pattern = r"[\u05b0-\u05ea\u05ab\u05bd']+"
        fallback_pattern = r"[a-zA-Z]+"

        def fallback_replace_callback(match: re.Match):
            # Replace one Latin-script word via the fallback g2p.
            word = match.group(0)

            if self.expander.dictionary.dict.get(word):
                # Word has a dictionary entry; leave it untouched here.
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            # Whitelist the fallback's characters so the final filter keeps them.
            for c in phonemes:
                lexicon.SET_OUTPUT_CHARACTERS.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(fallback_pattern, fallback_replace_callback, text)
        if use_expander:
            text = self.expander.expand_text(text)
        # NOTE(review): this attribute is never read inside this class, and it
        # is assigned only after `fallback` has already been used above —
        # confirm whether it is needed at all.
        self.fallback = fallback

        def heb_replace_callback(match: re.Match):
            # Phonemize one Hebrew word (a run matched by `he_pattern`).
            word = match.group(0)

            word = normalize(word)
            # Keep only Hebrew letters and niqqud marks.
            word = "".join(
                i for i in word if i in lexicon.SET_LETTERS or i in lexicon.SET_NIQQUD
            )
            # Pair each letter with its trailing combining marks / apostrophe.
            letters = re.findall(r"(\p{L})([\p{M}']*)", word)  # with en_geresh
            phonemes = self.phonemize_hebrew(letters)
            return "".join(phonemes)

        text = re.sub(he_pattern, heb_replace_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(
                i for i in text if i not in [lexicon.STRESS, lexicon.SECONDARY_STRESS]
            )
        if use_post_normalize:
            text = post_normalize(text)
        # Final filter: drop anything outside the known output alphabet.
        text = "".join(i for i in text if i in lexicon.SET_OUTPUT_CHARACTERS)

        return text

    def phonemize_hebrew(self, letters: list[tuple[str, str]]):
        """Apply the letter-level phonemization rules.

        Args:
            letters: (letter, diacritics) pairs as produced by the
                regex in `heb_replace_callback`; `diacritics` may be "".

        Returns:
            A list of phoneme strings (unknown letters yield "" entries).
        """
        phonemes = []
        i = 0

        while i < len(letters):
            cur = letters[i]
            prev = letters[i - 1] if i > 0 else None
            # NOTE(review): `next` shadows the builtin inside this loop.
            next = letters[i + 1] if i < len(letters) - 1 else None
            skip_diacritics = False
            skip_consonants = False
            # revised rules

            # "yalla" (יַאלְלָה): lamed with shva followed by another lamed is silent
            if cur[0] == "ל" and cur[1] == "\u05b0" and next and next[0] == "ל":
                skip_diacritics = True
                skip_consonants = True

            # Word-initial vav+patah before bare alef -> "wa"
            if (
                cur[0] == "ו"
                and not prev
                and next
                and not next[1]
                and cur[0] + cur[1] == "וַא"
            ):
                # NOTE(review): advances one position but then falls through to
                # the remaining rules with the same `cur` — confirm intended.
                i += 1
                phonemes.append("wa")

            # Alef without niqqud mid-word is silent
            if cur[0] == "א" and not cur[1] and prev:
                skip_consonants = True

            # TODO ?
            # Yod without niqqud before another letter: treated as silent
            if cur[0] == "י" and next and not cur[1]:
                skip_consonants = True

            # Shin with a sin dot is pronounced "s"
            if cur[0] == "ש" and "\u05c2" in cur[1]:
                phonemes.append("s")
                skip_consonants = True

            # shin without niqqud after sin = sin
            if cur[0] == "ש" and not cur[1] and prev and "\u05c2" in prev[1]:
                phonemes.append("s")
                skip_consonants = True

            if not next and cur[0] == "ח":
                # Final Het gnuva
                phonemes.append("ax")
                skip_diacritics = True
                skip_consonants = True

            # Letter carrying an apostrophe (geresh) with a special phoneme
            if cur and "'" in cur[1] and cur[0] in lexicon.GERESH_LETTERS:
                if cur[0] == "ת":
                    phonemes.append(lexicon.GERESH_LETTERS.get(cur[0], ""))
                    skip_diacritics = True
                    skip_consonants = True
                else:
                    # Geresh
                    phonemes.append(lexicon.GERESH_LETTERS.get(cur[0], ""))
                    skip_consonants = True

            elif (
                "\u05bc" in cur[1] and cur[0] + "\u05bc" in lexicon.LETTERS_PHONEMES
            ):  # dagesh
                phonemes.append(lexicon.LETTERS_PHONEMES.get(cur[0] + "\u05bc", ""))
                skip_consonants = True
            elif cur[0] == "ו":
                skip_consonants = True
                if next and next[0] == "ו":
                    # patah and next[1] empty
                    if cur[1] == "\u05b7" and not next[1]:
                        phonemes.append("w")
                        i += 2
                    else:
                        # double vav
                        phonemes.append("wo")
                        skip_diacritics = True
                else:
                    # Single vav

                    # Vav with Patah
                    if "\u05b7" in cur[1]:
                        phonemes.append("va")

                    # Holam haser
                    elif "\u05b9" in cur[1]:
                        phonemes.append("o")
                    # Shuruk / Kubutz
                    elif "\u05bb" in cur[1] or "\u05bc" in cur[1]:
                        phonemes.append("u")
                    # Vav with Shva in start
                    elif "\u05b0" in cur[1] and not prev:
                        phonemes.append("ve")
                    # Hirik
                    elif "\u05b4" in cur[1]:
                        phonemes.append("vi")
                    else:
                        phonemes.append("v")
                    skip_diacritics = True

            if not skip_consonants:
                phonemes.append(lexicon.LETTERS_PHONEMES.get(cur[0], ""))
            niqqud_phonemes = (
                [lexicon.NIQQUD_PHONEMES.get(niqqud, "") for niqqud in cur[1]]
                if not skip_diacritics
                else []
            )

            if "\u05ab" in cur[1] and phonemes:
                # Ensure ATMAHA is before the letter (before the last phoneme added)
                # NOTE(review): if skip_diacritics is True here, niqqud_phonemes
                # is empty and .remove() raises ValueError — confirm the accent
                # can never co-occur with a skip_diacritics rule.
                niqqud_phonemes.remove(lexicon.NIQQUD_PHONEMES["\u05ab"])
                phonemes = (
                    phonemes[:-1] + [lexicon.NIQQUD_PHONEMES["\u05ab"]] + [phonemes[-1]]
                )

            phonemes.extend(niqqud_phonemes)
            i += 1
        return phonemes
|
mishkal/utils.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mishkal import lexicon
|
2 |
+
import unicodedata
|
3 |
+
import regex as re
|
4 |
+
|
5 |
+
|
6 |
+
def sort_diacritics(match):
|
7 |
+
letter = match.group(1)
|
8 |
+
diacritics = "".join(sorted(match.group(2))) # Sort diacritics
|
9 |
+
return letter + diacritics
|
10 |
+
|
11 |
+
|
12 |
+
# Substitutions applied in order by normalize(). Keys are regex patterns,
# values are replacement strings or re.sub callbacks.
NORMALIZE_PATTERNS = {
    # Letter followed by combining marks: sort the marks into a canonical
    # order (so e.g. the dagesh always lands in the same position).
    r"(\p{L})(\p{M}+)": sort_diacritics,
    # Hebrew gershayim -> ASCII double quote
    "״": '"',
    # Hebrew geresh -> ASCII apostrophe
    "׳": "'",
}
|
18 |
+
|
19 |
+
|
20 |
+
def remove_niqqud(text: str):
    """Return *text* with every Hebrew niqqud mark stripped out."""
    stripped = re.sub(lexicon.HE_NIQQUD_PATTERN, "", text)
    return stripped
|
22 |
+
|
23 |
+
|
24 |
+
def has_niqqud(text: str):
    """Return True when *text* contains at least one Hebrew niqqud mark."""
    return bool(re.search(lexicon.HE_NIQQUD_PATTERN, text))
|
26 |
+
|
27 |
+
|
28 |
+
def normalize(text: str) -> str:
    """Canonicalize Hebrew text prior to phonemization.

    Decomposes to NFD, applies the NORMALIZE_PATTERNS substitutions
    (diacritic sorting, quote-mark normalization), then collapses niqqud
    variants that share a sound (e.g. keeping only Patah for Kamatz).
    """
    decomposed = unicodedata.normalize("NFD", text)
    for pattern, replacement in NORMALIZE_PATTERNS.items():
        decomposed = re.sub(pattern, replacement, decomposed)
    # Deduplicate phonetically identical niqqud.
    for variant, canonical in lexicon.NIQQUD_DEDUPLICATE.items():
        decomposed = decomposed.replace(variant, canonical)
    return decomposed
|
44 |
+
|
45 |
+
|
46 |
+
def post_normalize(phonemes: str):
    """Clean up an IPA phoneme string word-by-word (intended for TTS).

    Per word: strip glottal stops (ʔ) at the word edges (keeping a
    leading stress mark), drop a trailing silent 'h', and collapse the
    sequence 'ij' to plain 'i'.
    """

    def _clean(word: str) -> str:
        for pattern, repl in (
            (r"ʔ$", ""),    # glottal stop at end
            (r"^ʔ", ""),    # glottal stop at start
            (r"^ˈʔ", "ˈ"),  # glottal stop after leading stress mark
            (r"h$", ""),    # trailing h is silent
            (r"ˈh$", "ˈ"),
            (r"ij", "i"),   # 'ij' sounds like plain 'i'
        ):
            word = re.sub(pattern, repl, word)
        return word

    return " ".join(_clean(word) for word in phonemes.split(" "))
|
60 |
+
|
61 |
+
|
62 |
+
def get_unicode_names(text: str):
    """Return the official Unicode name of each character ("?" if unnamed)."""
    return list(map(lambda ch: unicodedata.name(ch, "?"), text))
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=5.15.0
|
2 |
+
num2words
|
3 |
+
colorlog
|
4 |
+
regex
|