Spaces:

thewh1teagle
/

phonemize-in-hebrew

Running

App Files Files Community

thewh1teagle committed 26 days ago

Commit

308923b

0 Parent(s):

latest

Browse files

Files changed (20) hide show

README.md +10 -0
app.py +45 -0
mishkal/__init__.py +32 -0
mishkal/data/kamatz_katan.json +5 -0
mishkal/data/rashej_tevot.json +3 -0
mishkal/data/special.json +4 -0
mishkal/data/symbols.json +4 -0
mishkal/expander/__init__.py +32 -0
mishkal/expander/dates.py +60 -0
mishkal/expander/dictionary.py +78 -0
mishkal/expander/number_names.py +193 -0
mishkal/expander/numbers.py +28 -0
mishkal/expander/time_to_word.py +104 -0
mishkal/hebrew.py +188 -0
mishkal/lexicon.py +92 -0
mishkal/log.py +35 -0
mishkal/phonemize.py +99 -0
mishkal/utils.py +109 -0
mishkal/variants.py +12 -0
requirements.txt +4 -0

README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+title: Phonemize in Hebrew
+emoji: 🐢
+colorFrom: red
+colorTo: green
+sdk: gradio
+sdk_version: "4.44.0"
+app_file: app.py
+pinned: false
+---

app.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""
+uv sync
+uv pip install "gradio>=5.15.0"
+uv run gradio examples/editor.py
+"""
+from mishkal import phonemize, normalize
+import gradio as gr
+default_text = """
+כָּל עֶ֫רֶב יָאִ֫יר (הַשֵּׁ֫ם הַמָּלֵ֫א וּמֽק֫וֹם הָעֲבוֹדָ֫ה שֶׁלּ֫וֹ שְׁמוּרִ֫ים בַּמַּעֲרֶ֫כֶת) רָץ 20 קִילוֹמֶ֫טֶר. הוּא מֽסַפֵּ֫ר לִי שֶׁזֶּ֫ה מֽנַקֶּ֫ה לוֹ אֶת הָרֹ֫אשׁ אַחֲרֵ֫י הָעֲבוֹדָ֫ה, "שָׁעָ֫ה וָחֵ֫צִי בְּלִ֫י עֲבוֹדָ֫ה, אִשָּׁ֫ה וִילָדִ֫ים" כְּמ֫וֹ שֶׁה֫וּא מַגְדִּ֫יר זֹאת. אֲבָ֫ל אַחֲרֵ֫י הַמִּקְלַ֫חַת הוּא מַתְחִ֫יל בּֽמָ֫ה שֶׁנִּתָּ֫ן לֽכַנּ֫וֹת הָעֲבוֹדָ֫ה הַשְּׁנִיָּ֫ה שֶׁלּ֫וֹ: לִמְצֹ֫א לוֹ קוֹלֵ֫גוֹת חֲדָשׁ֫וֹת לָעֲבוֹדָ֫ה, כִּי יָאִ֫יר הוּא כַּנִּרְאֶ֫ה הַמֶּ֫לֶךְ שֶׁל "חָבֵ֫ר מֵבִ֫יא חָבֵ֫ר" בּֽיִשְׂרָאֵ֫ל.
+דֻּגְמָא מַגְנִיבָה: [אנציקלופדיה](/ʔantsikloˈpedja/)
+"""
+theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])
+def on_submit_debug(text: str, predict_stress) -> str:
+    phonemes = phonemize(text, preserve_punctuation=True, predict_stress=predict_stress)
+    normalized_text = normalize(text)
+    return phonemes + "\n\nNormalized:\n" + normalized_text
+def on_submit(text: str, predict_stress) -> str:
+    return phonemize(text, preserve_punctuation=False, predict_stress=predict_stress)
+# Demo UI: a text input, two option checkboxes, an output box and a submit button
+with gr.Blocks(theme=theme) as demo:
+    text_input = gr.Textbox(
+        value=default_text, label="Text", rtl=True, elem_classes=["input"]
+    )
+    debug_checkbox = gr.Checkbox(value=False, label="Enable Debug Mode")
+    predict_stress_checkbox = gr.Checkbox(value=False, label="Predict Stress")
+    phonemes_output = gr.Textbox(label="Phonemes")
+    submit_button = gr.Button("Create")
+    # The debug checkbox selects on_submit_debug; otherwise on_submit is used
+    submit_button.click(
+        fn=lambda text, debug, stress: on_submit_debug(text, stress) if debug else on_submit(text, stress),
+        inputs=[text_input, debug_checkbox, predict_stress_checkbox],
+        outputs=[phonemes_output],
+    )
+# Launch the app only when this file is run as a script
+if __name__ == "__main__":
+    demo.launch()

mishkal/__init__.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""
+High level phonemize functions
+"""
+from .phonemize import Phonemizer
+from .utils import normalize  # noqa: F401
+from typing import Callable
+phonemizer = Phonemizer()
+def phonemize(
+    text: str,
+    preserve_punctuation=True,
+    preserve_stress=True,
+    use_expander=True,
+    use_post_normalize=True,  # For TTS
+    predict_stress=True,
+    predict_shva_nah=True,
+    fallback: Callable[[str], str] = None,
+) -> str:
+    phonemes = phonemizer.phonemize(
+        text,
+        preserve_punctuation=preserve_punctuation,
+        preserve_stress=preserve_stress,
+        fallback=fallback,
+        use_expander=use_expander,
+        use_post_normalize=use_post_normalize,
+        predict_stress=predict_stress,
+        predict_shva_nah=predict_shva_nah,
+    )
+    return phonemes

mishkal/data/kamatz_katan.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "כל": "ˈkol",
+    "רחבי": "roxˈbi",
+    "אמנות": "omaˈnut"
+}

mishkal/data/rashej_tevot.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+    "צה״ל": "ˈtsahal"
+}

mishkal/data/special.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "וַאלְלָה": "wala",
+    "וַסַבִּי": "wasabi"
+}

mishkal/data/symbols.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "₪": "ʃeˈkel",
+    "$": "doˈlar"
+}

mishkal/expander/__init__.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""
+Expand dates and numbers into words with nikud
+This happens before phonemization
+"""
+from .numbers import num_to_word
+from .dates import date_to_word
+from .time_to_word import time_to_word
+from .dictionary import Dictionary
+from mishkal.log import log
+class Expander:
+    def __init__(self):
+        self.dictionary = Dictionary()
+    def expand_text(self, text: str):
+        text = self.dictionary.expand_text(text)
+        words = []
+        for source_word in text.split():
+            try:
+                word = date_to_word(source_word)
+                if word == source_word:
+                    word = time_to_word(word)
+                if word == source_word:
+                    word = num_to_word(word)
+                words.append(word)
+            except Exception as e:
+                log.error(f"Failed to expand {word} with error: {e}")
+                words.append(source_word)
+        return " ".join(words)

mishkal/expander/dates.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from datetime import datetime
+from .numbers import num_to_word
+# Mapping of month names in Hebrew with diacritics (Gregorian months)
+MONTHS = {
+    1: "יָנוּאָר",
+    2: "פֶבְרוּאָר",
+    3: "מֵרְץ",
+    4: "אֵפְרִיל",
+    5: "מַאי",
+    6: "יוּנִי",
+    7: "יוּלִי",
+    8: "אוֹגֻסְט",
+    9: "סֶפְּטֶמְבֶּר",
+    10: "אוֹקְטוֹבֶּר",
+    11: "נוֹבֶמְבֶּר",
+    12: "דֶּצֶמְבֶּר",
+}
+# Mapping of day names in Hebrew with diacritics
+DAYS = {
+    0: "יוֹם רִאשׁוֹן",
+    1: "יוֹם שֵׁנִי",
+    2: "יוֹם שְׁלִישִׁי",
+    3: "יוֹם רֵבִיעִי",
+    4: "יוֹם חֲמִישִׁי",
+    5: "יוֹם שִׁישִׁי",
+    6: "יוֹם שַׁבָּת",
+}
+def date_to_word(word: str, include_day_name=False) -> str:
+    """
+    Converts a given date string in formats (YYYY-MM-DD, YYYY.MM.DD, YYYY/MM/DD) to Hebrew date format with diacritics.
+    Returns the original word if it's not a valid date.
+    """
+    separators = ["-", ".", "/"]
+    orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
+    date_formats = [sep.join(order) for order in orders for sep in separators]
+    for date_format in date_formats:
+        try:
+            # Try parsing the word with each date format
+            date_obj = datetime.strptime(word, date_format)
+            # Get the Hebrew day name with diacritics
+            day_name = DAYS[date_obj.weekday()]
+            # Convert month to Hebrew name with diacritics
+            month_name = MONTHS[date_obj.month]
+            day = num_to_word(str(date_obj.day))
+            year = num_to_word(str(date_obj.year))
+            text = f"{day} בֵּ{month_name} {year}"
+            if include_day_name:
+                text = f"{day_name}, {text}"
+            return text
+        except ValueError:
+            continue
+    return word

mishkal/expander/dictionary.py ADDED Viewed

	@@ -0,0 +1,78 @@

+"""
+Dictionaries are JSON files that map source words to their replacement text
+"""
+from pathlib import Path
+import json
+import re
+from mishkal.utils import remove_nikud
+from mishkal.utils import normalize
+import unicodedata
+files = Path(__file__).parent.joinpath("../data").glob("*.json")
+# Sort in reverse order to prioritize the most recent and best
+order = {"bronze": 1, "silver": 2, "gold": 3}
+files = sorted(
+    files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
+)
+class Dictionary:
+    def __init__(self):
+        self.dict = {}
+        self.load_dictionaries()
+    def load_dictionaries(self):
+        for file in files:
+            with open(file, "r", encoding="utf-8") as f:
+                dictionary: dict = json.load(f)
+                normalized_dictionary = {}
+                # normalize nikud keys
+                for k, v in dictionary.items():
+                    k = normalize(k)
+                    # Ensure not empty
+                    if k and v:
+                        normalized_dictionary[k] = v
+                self.dict.update(normalized_dictionary)
+    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
+        source: str = match.group(0)
+        # decomposite
+        source = unicodedata.normalize("NFD", source)
+        raw_lookup = self.dict.get(source)
+        without_nikud_lookup = self.dict.get(remove_nikud(source))
+        with_nikud_lookup = self.dict.get(normalize(source))
+        # Compare without nikud ONLY if source has no nikud
+        if raw_lookup:
+            return raw_lookup
+        if without_nikud_lookup:
+            return without_nikud_lookup
+        elif with_nikud_lookup:
+            return with_nikud_lookup
+        return source
+    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
+        raw_source: str = match.group(0)
+        if raw_source.isnumeric():
+            return raw_source
+        raw_lookup = self.dict.get(raw_source)
+        # Compare without nikud ONLY if source has no nikud
+        if raw_lookup:
+            return raw_lookup
+        # search by only ', space, regular nikud, alphabet
+        raw_source = re.sub(
+            r"[\u05B0-\u05EB ']+", self.replace_hebrew_only_callback, raw_source
+        )
+        return raw_source
+    def expand_text(self, text: str) -> str:
+        """
+        TODO: if key doesn't have diacritics expand even diacritized words
+        """
+        text = re.sub(r"\S+", self.replace_non_whitespace_callback, text)
+        return text

mishkal/expander/number_names.py ADDED Viewed

	@@ -0,0 +1,193 @@

+"""
+See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
+"""
+# TODO: add nikud hints
+ZERO = {"אפס": "אֶפֶס"}
+ONES = {
+    "אחת": "אַחַת",
+    "אחד": "אֶחָד",
+    "ראשונה": "רִאשׁוֹנָה",
+    "ראשון": "רִאשׁוֹן",
+    "ראשונות": "רִאשׁוֹנוֹת",
+    "ראשונים": "רִאשׁוֹנִים",
+    "שתיים": "שְׁתַּיִם",
+    "שניים": "שְׁנַיִם",
+    "שתי": "שְׁתֵּי",
+    "שני": "שְׁנֵי",
+    "שנייה": "שְׁנִיָּה",
+    "שניות": "שְׁנִיּוֹת",
+    "שלוש": "שָׁלוֹשׁ",
+    "שלושה": "שְׁלוֹשָׁה",
+    "שלושת": "שְׁלוֹשֶׁת",
+    "שלישית": "שְׁלִישִׁית",
+    "שלישי": "שְׁלִישִׁי",
+    "שלישיות": "שְׁלִישִׁיּוֹת",
+    "שלישיים": "שְׁלִישִׁיִּים",
+    "ארבע": "אַרְבַּע",
+    "ארבעה": "אַרְבַּעָה",
+    "ארבעת": "אַרְבַּעַת",
+    "רביעית": "רֵבִיעִית",
+    "רביעי": "רֵבִיעִי",
+    "רביעיות": "רֵבִיעִיוֹת",
+    "רביעיים": "רֵבִיעִיִּים",
+    "חמש": "חָמֵשׁ",
+    "חמישה": "חֲמִשָּׁה",
+    "חמשת": "חֲמֵשֶׁת",
+    "חמישית": "חֲמִישִּׁית",
+    "חמישי": "חֲמִישִּׁי",
+    "חמישיות": "חֲמִישִּׁיוֹת",
+    "חמישיים": "חֲמִישִּׁיִּים",
+    "שש": "שֵׁשׁ",
+    "שישה": "שִׁשָּׁה",
+    "ששת": "שֵׁשֶׁת",
+    "שישית": "שִׁשִּׁית",
+    "שישי": "שִׁשִּׁי",
+    "שישיות": "שִׁשִּׁיוֹת",
+    "שישיים": "שִׁשִּׁיִּים",
+    "שבע": "שֶׁבַע",
+    "שבעה": "שִׁבְעָה",
+    "שבעת": "שִׁבְעַת",
+    "שביעית": "שְׁבִיעִית",
+    "שביעי": "שְׁבִיעִי",
+    "שביעיות": "שְׁבִיעִיוֹת",
+    "שביעיים": "שְׁבִיעִיִּים",
+    "שמונה": "שְׁמוֹנֶה",
+    "שמונת": "שְׁמוֹנַת",
+    "שמינית": "שְׁמִינִית",
+    "שמיני": "שְׁמִינִי",
+    "שמיניות": "שְׁמִינִיוֹת",
+    "שמיניים": "שְׁמִינִיִּים",
+    "תשע": "תֵּשַׁע",
+    "תשעה": "תִּשְׁעָה",
+    "תשעת": "תִּשְׁעַת",
+    "תשיעית": "תְּשִׁיעִית",
+    "תשיעי": "תְּשִׁיעִי",
+    "תשיעיות": "תְּשִׁיעִיּוֹת",
+    "תשיעיים": "תְּשִׁיעִיִּים",
+}
+TENS = {
+    "עשר": "עֶשֶׂר",
+    "עשרה": "עֲשָׁרָה",
+    "עשרת": "עֲשֶׁרֶת",
+    "עשירית": "עֲשִׁירִית",
+    "עשירי": "עֲשִׁירִי",
+    "עשיריות": "עֲשִׁירִיוֹת",
+    "עשיריים": "עֲשִׁירִיִּים",
+    "שתים עשרה": "שְׁתֵּים עֶשְׂרֵה",
+    "שנים עשר": "שְׁנֵים עָשָׂר",
+}
+TWENTIES = {
+    "עשרים": "עֶשְׂרִ֫ים",
+    "שלושים": "שְׁלוֹשִׁים",
+    "ארבעים": "אַרְבָּעִים",
+    "חמישים": "חֲמִשִּׁים",
+    "שישים": "שִׁשִּׁים",
+    "שבעים": "שִׁבְעִים",
+    "שמונים": "שְׁמוֹנִים",
+    "תשעים": "תִּשְׁעִים",
+}
+HUNDREDS = {
+    "מאה": "מֵאָה",
+    "מאת": "מֵאַת",
+    "מאתיים": "מָאתַיִם",
+    "מאות": "מֵאוֹת",
+}
+THOUSANDS = {
+    "אלף": "אֶלֶף",
+    "אלפיים": "אַלְפַּיִם",
+    "אלפים": "אֲלָפִים",
+    "אלפי": "אַלְפִּי",
+}
+LARGE = {
+    "מיליון": "מִילְיוֹן",
+    "מיליוני": "מִילְיוֹנִי",
+    "מיליארד": "מִילְיַארְד",
+    "מיליארדי": "מִילְיַארְדִּי",
+    "טריליון": "טְרִילְיוֹן",
+    "טריליוני": "טְרִילְיוֹנִי",
+    "קוודריליון": "קוֹוַדְרִילְיוֹן",
+    "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
+    "קווינטיליון": "קוִוִּנְטִילְיוֹן",
+    "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
+    "סקסטיליון": "סְקֶסְטִילְיוֹן",
+    "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
+    "ספטיליון": "סְפֶּטִילְיוֹן",
+    "ספטיליוני": "סְפֶּטִילְיוֹנִי",
+    "אוקטיליון": "אוֹקְטִילְיוֹן",
+    "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
+    "נוניליון": "נוּנִילְיוֹן",
+    "נוניליוני": "נוּנִילְיוֹנִי",
+    "דסיליון": "דֶּסִילְיוֹן",
+    "דסיליוני": "דֶּסִילְיוֹנִי",
+    "אונדסיליון": "אוּנְדְסִילְיוֹן",
+    "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
+    "דואודסיליון": "דוּאודְסִילְיוֹן",
+    "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
+    "טרדסיליון": "טֶרְדְסִילְיוֹן",
+    "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
+    "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
+    "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
+    "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
+    "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
+    "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
+    "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
+    "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
+    "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
+    "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
+    "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
+    "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
+    "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
+    "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
+    "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
+}
+LETTERS = {
+    "ו": "וֵ",
+    "ה": "הַ",
+}
+CURRENCY = {
+    "שקל": "שֵׁקֶל",
+    "שקלים": "שְׁקָלִים",
+    "אגורה": "אֲגוֹרָה",
+    "אגורות": "אֲגוֹרוֹת",
+    "אירו": "אֵירוֹ",
+    "סנט": "סֵנְט",
+    "סנטים": "סֵנְטִים",
+    "דולר": "דוֹלָר",
+    "דולרים": "דוֹלָרִים",
+}
+POINTS = {
+    "מינוס": "מִינּוּס",
+    "נקודה": "נְקֻדָּה",
+}
+NUMBER_NAMES = {
+    **CURRENCY,
+    **HUNDREDS,
+    **LARGE,
+    **LETTERS,
+    **ONES,
+    **POINTS,
+    **TENS,
+    **THOUSANDS,
+    **TWENTIES,
+    **ZERO,
+}

mishkal/expander/numbers.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import num2words
+from .number_names import NUMBER_NAMES
+import re
+def add_diacritics(words: str):
+    new_words = []
+    for word in words.split():
+        if NUMBER_NAMES.get(word):
+            new_words.append(NUMBER_NAMES[word])
+        elif NUMBER_NAMES.get(word[1:]):
+            # With Vav or Bet
+            new_words.append(NUMBER_NAMES[word[0]] + NUMBER_NAMES[word[1:]])
+        else:
+            new_words.append(word)
+    return " ".join(new_words)
+def num_to_word(maybe_number: str) -> str:
+    def replace_number(match):
+        num = match.group()
+        words = num2words.num2words(num, lang="he", ordinal=False)
+        return add_diacritics(words)
+    # Replace all whole numbers in the string
+    result = re.sub(r"\d+", replace_number, maybe_number)
+    return result

mishkal/expander/time_to_word.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""
+Convert time to words
+TODO: fix zeros eg. 22:00
+"""
+import re
+PATTERNS = [
+    r"(\d{1,2})([apm]{2})",  # AM/PM format
+    r"(\d{1,2}):(\d{2})",  # HH:MM format
+]
+def extract_time(match):
+    """
+    Extract hour and minute from a string in HH:MM or AM/PM format
+    and return as integers.
+    """
+    time_str = match.group(0).lower().strip()
+    # Check for HH:MM format
+    match = re.match(r"(\d{1,2}):(\d{2})", time_str)
+    if match:
+        h = int(match.group(1))
+        m = int(match.group(2))
+        return f"{convert_to_word(h, m)}"
+    # Check for AM/PM format
+    match = re.match(r"(\d{1,2})([apm]{2})", time_str)
+    if match:
+        h = int(match.group(1))
+        period = match.group(2)
+        # Normalize to 24-hour format
+        if period == "am" and h == 12:
+            h = 0
+        elif period == "pm" and h != 12:
+            h += 12
+        return f"{convert_to_word(h, 0)}"  # Defaulting to 0 minutes when only hour is provided
+    return match.group(0)  # Return original text if the format is not recognized
+def convert_to_word(h, m):
+    hours = [
+        "אֶפֶס",
+        "אַחַת",
+        "שְׁנַיִם",  # Will be replaced with "שֵׁנִי" when needed
+        "שָׁלוֹשׁ",
+        "אַרְבַּע",
+        "חָמֵשׁ",
+        "שֵׁשׁ",
+        "שֶׁבַע",
+        "שְׁמוֹנֵה",
+        "תֵּשַׁע",
+        "עֵשֵׂר",
+        "אַחַת עֶשְׂרֵה",
+        "שְׁתֵּים עֶשְׂרֵה",
+    ]
+    tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]
+    ten_to_twenty = [
+        "עֵשֵׂר",
+        "אַחַת עֶשְׂרֵה",
+        "שְׁתֵּים עֶשְׂרֵה",
+        "שְׁלוֹשׁ עֶשְׂרֵה",
+        "אַרְבַּע עֶשְׂרֵה",
+        "חֲמֵשׁ עֶשְׂרֵה",
+        "שֵׁשׁ עֶשְׂרֵה",
+        "שְׁבַע עֶשְׂרֵה",
+        "שְׁמוֹנֶה עֶשְׂרֵה",
+        "תְּשַׁע עֶשְׂרֵה",
+    ]
+    vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}
+    # Convert 0 hours to 12 (midnight)
+    if h == 0:
+        h = 12
+    elif h > 12:
+        h -= 12
+    if m == 0:
+        return f"{hours[h]}"
+    elif 1 <= m <= 9:
+        minute_word = (
+            vocab["shtey"] if m == 2 else hours[m]
+        )  # Replace "שניים" with "שני"
+        return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"
+    elif 10 <= m <= 19:
+        return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"
+    else:
+        tens_part = f"{vocab['and']}{tens[m // 10]}"
+        units_part = f"{vocab['and']}{hours[m % 10]}" if m % 10 != 0 else ""
+        return f"{hours[h]} {tens_part} {units_part} {vocab['minutes']}".strip()
+def time_to_word(text: str):
+    return re.sub("|".join(PATTERNS), extract_time, text)

mishkal/hebrew.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""
+Hebrew Phonemizer
+Rules implemented:
+1. Consonant handling (including special cases)
+2. Nikud (vowel) processing
+3. Dagesh handling
+4. Geresh handling
+5. Shva na prediction
+6. Special letter combinations
+Reference:
+- https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
+- https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט/
+- https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
+- https://en.wikipedia.org/wiki/Help:IPA/Hebrew
+- https://he.wikipedia.org/wiki/הברה
+"""
+from mishkal.variants import Letter
+from mishkal import lexicon
+import re
+SHVA = "\u05b0"
+SIN = "\u05c2"
+PATAH = '\u05b7'
+KAMATZ = '\u05b8'
+HATAF_KAMATZ = '\u05b3'
+DAGESH = "\u05bc"
+HOLAM = "\u05b9"
+HIRIK = "\u05b4"
+PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
+KUBUTS = "\u05bb"
+TSERE = "\u05b5"
+def phonemize_hebrew(letters: list[Letter], predict_shva_na: bool) -> list[str]:
+    phonemes = []
+    i = 0
+    while i < len(letters):
+        cur = letters[i]
+        prev = letters[i - 1] if i > 0 else None
+        next = letters[i + 1] if i < len(letters) - 1 else None
+        next_phonemes, skip_offset = letter_to_phonemes(cur, prev, next, predict_shva_na)
+        phonemes.extend(next_phonemes)
+        i += skip_offset + 1
+    return phonemes
+def letter_to_phonemes(cur: Letter, prev: Letter | None, next: Letter | None, predict_shva_na: bool):
+    """
+    Produce the phonemes for a single letter, given its neighbours.
+    cur is the letter being converted; prev and next are the adjacent letters,
+    or None at the start / end of the word. predict_shva_na enables the
+    vocal-shva heuristics below.
+    Returns (phonemes, skip_offset): the list of phoneme strings for this
+    letter and the number of following letters the caller should skip.
+    """
+    cur_phonemes = []
+    # skip_diacritics: do not emit the generic nikud phonemes for this letter
+    skip_diacritics = False
+    # skip_constants: do not emit the default consonant from LETTERS_PHONEMES
+    skip_constants = False
+    skip_offset = 0
+    # revised rules
+    # Lamed with shva directly before another lamed (e.g. יַאלְלָה) emits nothing
+    if cur.char == "ל" and cur.diac == SHVA and next and next.char == "ל":
+        skip_diacritics = True
+        skip_constants = True
+    # NOTE(review): cur.char is a single letter and get_letters puts only marks /
+    # apostrophes in diac, so the comparison with a string ending in alef looks
+    # like it can never be true - confirm the intended condition
+    if (
+        cur.char == "ו"
+        and not prev
+        and next
+        and not next.diac
+        and cur.char + cur.diac == "וַא"
+    ):
+        skip_offset += 1
+        cur_phonemes.append("wa")
+    # Alef without diacritics inside a word is silent unless followed by vav
+    if cur.char == "א" and not cur.diac and prev:
+        if next and next.char != 'ו':
+            skip_constants = True
+    # TODO ?
+    # Yod without diacritics in the middle of a word is dropped, except after alef with tsere
+    if cur.char == "י" and next and not cur.diac and prev and prev.char + prev.diac != 'אֵ':
+        skip_constants = True
+    if cur.char == "ש" and SIN in cur.diac:
+        cur_phonemes.append("s")
+        skip_constants = True
+    # shin without nikud after sin = sin
+    if cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
+        cur_phonemes.append("s")
+        skip_constants = True
+    if not next and cur.char == "ח" and PATAH in cur.diac:
+        # Final Het gnuva
+        cur_phonemes.append("ax")
+        skip_diacritics = True
+        skip_constants = True
+    # Exactly one of the following consonant rules applies: geresh, dagesh or vav
+    if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
+        if cur.char == "ת":
+            # Tav with geresh: the GERESH_PHONEMES entry replaces the diacritics too
+            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
+            skip_diacritics = True
+            skip_constants = True
+        else:
+            # Geresh
+            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
+            skip_constants = True
+    elif (
+        DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES
+    ):  # dagesh
+        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
+        skip_constants = True
+    elif cur.char == "ו":
+        skip_constants = True
+        if next and next.char == "ו" and next.diac == cur.diac:
+            # NOTE(review): this branch requires next.diac == cur.diac, so the first
+            # test below (patah on cur, nothing on next) and the final else look
+            # unreachable - confirm
+            # patah and next.diac empty
+            if re.search(PATAH_LIKE_PATTERN, cur.diac) and not next.diac:
+                cur_phonemes.append("w")
+                skip_diacritics = True
+                skip_offset += 1
+            elif cur.diac == next.diac:
+                # double vav
+                cur_phonemes.append("wo")
+                skip_diacritics = True
+                skip_offset += 1
+            else:
+                # TODO ?
+                # skip_consonants = False
+                skip_diacritics = False
+        else:
+            # Single vav
+            # Vav with Patah
+            if re.search(PATAH_LIKE_PATTERN, cur.diac):
+                cur_phonemes.append("va")
+            # Holam haser
+            elif HOLAM in cur.diac:
+                cur_phonemes.append("o")
+            # Shuruk / Kubutz
+            elif KUBUTS in cur.diac or DAGESH in cur.diac:
+                cur_phonemes.append("u")
+            # Vav with Shva in start
+            elif SHVA in cur.diac and not prev:
+                cur_phonemes.append("ve")
+            # Hirik
+            elif HIRIK in cur.diac:
+                cur_phonemes.append("vi")
+            # Tsere
+            elif TSERE in cur.diac:
+                cur_phonemes.append("ve")
+            else:
+                cur_phonemes.append("v")
+            skip_diacritics = True
+    # Default consonant, unless one of the rules above already handled it
+    if not skip_constants:
+        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))
+    if predict_shva_na and SHVA in cur.diac and not skip_diacritics and lexicon.SHVA_NA_DIACRITIC not in cur.diac:
+        # shva na prediction
+        if not prev:
+            # Word-initial shva is voiced only for the letters listed here
+            if cur.char in 'למנרי' or cur.char in 'אהע' or cur.char in 'וכלב':
+                cur_phonemes.append("e")
+                skip_diacritics = True
+        else:
+            # Shva before an identical letter, or following another shva
+            if next and next.char == cur.char:
+                cur_phonemes.append("e")
+                skip_diacritics = True
+            elif prev and SHVA in prev.diac and cur_phonemes[-1] != 'e':
+                cur_phonemes.append("e")
+                skip_diacritics = True
+    # Kamatz directly before a hataf kamatz is read as "o"
+    if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
+        cur_phonemes.append('o')
+        skip_diacritics = True
+    # Generic nikud phonemes, one lookup per diacritic mark
+    nikud_phonemes = (
+        [lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.diac]
+        if not skip_diacritics
+        else []
+    )
+    cur_phonemes.extend(nikud_phonemes)
+    # Ensure the stress is at the beginning of the syllable
+    # (stable sort: the stress mark moves to the front, the rest keep their order)
+    cur_phonemes.sort(key=lambda x: x != 'ˈ')
+    # NOTE(review): membership is tested per character, so a multi-character
+    # phoneme survives only if each of its characters is itself in SET_PHONEMES
+    cur_phonemes = [p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)]
+    return cur_phonemes, skip_offset

mishkal/lexicon.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""
+ASCII IPA transcription of Hebrew consonants and vowels.
+"""
+# https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
+MILHEL_PATTERNS = ['יים', 'וע', 'טו', "דיה"] # Used for stress prediction
+HE_PATTERN = r'[\u05b0-\u05ea\u05ab\u05bd\'"]+'
+HE_NIKUD_PATTERN = r"[\u05B0-\u05C7]"
+PUNCTUATION = r".,!? "
+STRESS = "\u02c8"  # visually looks like '
+GERESH_PHONEMES = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}
+# Consonants
+LETTERS_PHONEMES = {
+    "א": "ʔ",  # Alef
+    "ב": "v",  # Bet
+    "ג": "g",  # Gimel
+    "ד": "d",  # Dalet
+    "ה": "h",  # He
+    "ו": "v",  # Vav
+    "ז": "z",  # Zayin
+    "ח": "x",  # Het
+    "ט": "t",  # Tet
+    "י": "j",  # Yod
+    "ך": "x",  # Haf sofit
+    "כ": "x",  # Haf
+    "ל": "l",  # Lamed
+    "ם": "m",  # Mem Sofit
+    "מ": "m",  # Mem
+    "ן": "n",  # Nun Sofit
+    "נ": "n",  # Nun
+    "ס": "s",  # Samekh
+    "ע": "ʔ",  # Ayin, only voweled
+    "פ": "f",  # Fey
+    "ף": "f",  # Fey Sofit
+    "ץ": "ts",  # Tsadik sofit
+    "צ": "ts",  # Tsadik
+    "ק": "k",  # Kuf
+    "ר": "r",  # Resh
+    "ש": "ʃ",  # Shin
+    "ת": "t",  # Taf
+    # Beged Kefet
+    "בּ": "b",
+    "כּ": "k",
+    "פּ": "p",
+    "שׁ": "ʃ",
+    "שׂ": "s",
+    "'": "",
+}
+SHVA_NA_DIACRITIC = "\u05bd"
+ATAMAHA_DIACRITIC = "\u05ab"
+NIKUD_PHONEMES = {
+    "\u05b4": "i",  # Hiriq
+    "\u05b1": "e",  # Hataf segol
+    "\u05b5": "e",  # Tsere
+    "\u05b6": "e",  # Segol
+    "\u05b2": "a",  # Hataf Patah
+    "\u05b7": "a",  # Patah
+    "\u05c7": "o",  # Kamatz katan
+    "\u05b9": "o",  # Holam
+    "\u05ba": "o",  # Holam haser for vav
+    "\u05bb": "u",  # Qubuts
+    "\u05b3": 'o', # Hataf qamats
+    "\u05b8": "a", # Kamataz
+    ATAMAHA_DIACRITIC: "ˈ",  # Stress (Atmaha)
+    SHVA_NA_DIACRITIC: "e",  # Shva na
+}
+# Deprecated
+DEDUPLICATE = {
+    # "\u05b1": "\u05b5",  # Hataf Segol -> Tsere
+    # "\u05b2": "\u05b7",  # Hataf Patah -> Patah
+    # "\u05b3": "\u05b9",  # Hataf Qamats -> Holam
+    # "\u05b6": "\u05b5",  # Segol -> Tsere
+    # Kamatz -> Patah
+    # "\u05b8": "\u05b7",  # Qamats -> Patah
+    # "\u05c7": "\u05b9",  # Qamats Qatan -> Holam
+    "\u05f3": "'",  # Hebrew geresh to regular geresh
+}
+SET_PHONEMES = set(sorted({
+    *NIKUD_PHONEMES.values(),
+    *LETTERS_PHONEMES.values(),
+    *GERESH_PHONEMES.values()
+}))

mishkal/log.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import logging
+import os
+import colorlog
+def _create_logger():
+    """
+    Create a logger with colorized output
+    Usage: LOG_LEVEL=DEBUG python <script.py>
+    """
+    handler = colorlog.StreamHandler()
+    fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
+    handler.setFormatter(
+        colorlog.ColoredFormatter(
+            fmt=fmt,
+            log_colors={
+                "DEBUG": "blue",
+                "INFO": "green",
+                "WARNING": "yellow",
+                "ERROR": "red",
+                "CRITICAL": "red",
+            },
+        )
+    )
+    # Get log level from LOG_LEVEL environment variable
+    log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
+    logger = colorlog.getLogger(__package__)
+    logger.setLevel(level=getattr(logging, log_level, logging.WARNING))
+    # Setup logging to stdout
+    logger.addHandler(handler)
+    return logger
+log = _create_logger()

mishkal/phonemize.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from mishkal import lexicon
+from mishkal.variants import Letter
+from .expander import Expander
+from mishkal.utils import get_letters, normalize, post_normalize, has_vowel, has_constant, remove_nikud, get_syllables
+from typing import Callable
+import regex as re
+from mishkal.hebrew import phonemize_hebrew
+ADDITIONAL_PHONEMES = set() # When using fallback
+class Phonemizer:
+    """Turn Hebrew text with nikud into a phoneme string."""
+    # TODO: is that enough? what if there's punctuation around? other chars?
+    # Runs of ASCII letters; each run is handed to the optional fallback callable
+    fallback_pattern = r"[a-zA-Z]+"
+    def __init__(self):
+        self.expander = Expander()
+    def phonemize(
+        self,
+        text: str,
+        preserve_punctuation=True,
+        preserve_stress=True,
+        use_expander=False,
+        use_post_normalize=False,  # For TTS
+        predict_stress=False,
+        predict_shva_nah=False,
+        fallback: Callable[[str], str] | None = None,
+    ) -> str:
+        """
+        Phonemize *text* and return the resulting string.
+        Steps, in order: normalize; run *fallback* on ASCII-letter runs (if given);
+        expand the text (if use_expander); phonemize every run matching
+        lexicon.HE_PATTERN; replace [text](/phonemes/) spans with their phonemes;
+        optionally strip punctuation and stress marks; with use_post_normalize,
+        keep only known phoneme characters and spaces.
+        """
+        # normalize
+        text = normalize(text)
+        def fallback_replace_callback(match: re.Match):
+            """Replace an ASCII-letter run with the fallback's phonemes."""
+            word = match.group(0)
+            if self.expander.dictionary.dict.get(word):
+                # skip
+                # TODO: better API
+                return word
+            phonemes = fallback(word).strip()
+            # TODO: check that it has only IPA?!
+            # Remember the characters so the final filter does not drop them
+            for c in phonemes:
+                ADDITIONAL_PHONEMES.add(c)
+            return phonemes
+        if fallback is not None:
+            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)
+        if use_expander:
+            text = self.expander.expand_text(text)
+        def heb_replace_callback(match: re.Match):
+            """Phonemize one Hebrew word and optionally add a predicted stress mark."""
+            word = match.group(0)
+            word = normalize(word)
+            letters: list[Letter] = get_letters(word)
+            phonemes: list[str] = phonemize_hebrew(letters, predict_shva_na=predict_shva_nah)
+            syllables = get_syllables(phonemes)
+            phonemes_text = ''.join(phonemes)
+            # Predict stress only for multi-syllable words that carry no stress mark yet
+            if predict_stress and lexicon.STRESS not in phonemes_text and syllables and len(syllables) > 1:
+                if any(remove_nikud(word).endswith(i) for i in lexicon.MILHEL_PATTERNS) or phonemes_text.endswith('ax'):
+                    # insert lexicon.STRESS in the first character of syllables[-2]
+                    syllables[-2] = lexicon.STRESS + syllables[-2]
+                else:
+                    # insert in syllables[-1]
+                    syllables[-1] = lexicon.STRESS + syllables[-1]
+            phonemes = ''.join(syllables)
+            if use_post_normalize:
+                phonemes = post_normalize(phonemes)
+            return phonemes
+        text = re.sub(lexicon.HE_PATTERN, heb_replace_callback, text)
+        def hyper_phonemes_callback(match: re.Match):
+            """
+            Expand hyper phonemes into normal phonemes
+            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
+            """
+            matched_phonemes = match.group(2)
+            # Remember the characters so the final filter does not drop them
+            for c in matched_phonemes:
+                ADDITIONAL_PHONEMES.add(c)
+            return matched_phonemes  # The phoneme is in the second group
+        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)
+        if not preserve_punctuation:
+            # Spaces are listed in lexicon.PUNCTUATION but are always kept
+            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
+        if not preserve_stress:
+            text = "".join(
+                i for i in text if i not in [lexicon.STRESS]
+            )
+        if use_post_normalize:
+            # Keep only known phoneme characters, fallback / hyper-phoneme characters and spaces
+            text = ''.join(i for i in text if i in lexicon.SET_PHONEMES or i in ADDITIONAL_PHONEMES or i == ' ')
+        return text

mishkal/utils.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from mishkal import lexicon
+import unicodedata
+import regex as re
+from mishkal.variants import Letter
+import mishkal
+def sort_diacritics(match):
+    letter = match.group(1)
+    diacritics = "".join(sorted(match.group(2)))  # Sort diacritics
+    return letter + diacritics
+# Regex pattern -> replacement (string or callable), applied in order by normalize()
+NORMALIZE_PATTERNS = {
+    # Sort diacritics
+    r"(\p{L})(\p{M}+)": sort_diacritics,
+    "״": '"', # Hebrew gershayim to ASCII double quote
+    "׳": "'", # Hebrew geresh to ASCII apostrophe
+}
+def remove_nikud(text: str):
+    return re.sub(lexicon.HE_NIKUD_PATTERN, "", text)
+def has_nikud(text: str):
+    return re.search(lexicon.HE_NIKUD_PATTERN, text) is not None
+def normalize(text: str) -> str:
+    """
+    Normalize unicode (decomposite)
+    Keep only Hebrew characters / punctuation / IPA
+    Sort diacritics
+    """
+    # Decompose text
+    text = unicodedata.normalize("NFD", text)
+    for k, v in NORMALIZE_PATTERNS.items():
+        text = re.sub(k, v, text)
+    for k, v in lexicon.DEDUPLICATE.items():
+        text = re.sub(k, v, text)
+    return text
+def post_normalize(phonemes: str):
+    new_phonemes = []
+    for word in phonemes.split(" "):
+        # remove glottal stop from end
+        word = re.sub(r"ʔ$", "", word)
+        # remove h from end
+        word = re.sub(r"h$", "", word)
+        word = re.sub(r"ˈh$", "", word)
+        # remove j followed by a i
+        word = re.sub(r"ij", "i", word)
+        new_phonemes.append(word)
+    phonemes = " ".join(new_phonemes)
+    return phonemes
+def get_letters(word: str):
+    letters: list[tuple[str, str]] = re.findall(r"(\p{L})([\p{M}']*)", word)  # with en_geresh
+    letters: list[Letter] = [Letter(i[0], i[1]) for i in letters]
+    return letters
+def get_unicode_names(text: str):
+    return [unicodedata.name(c, "?") for c in text]
+def has_vowel(s: iter):
+    return any(i in s for i in 'aeiou')
+def has_constant(s: iter):
+    return any(i not in 'aeiou' for i in s)
+def get_syllables(phonemes: list[str]) -> list[str]:
+    syllables = []
+    cur_syllable = ''
+    i = 0
+    while i < len(phonemes):
+        # Add current phoneme to the syllable
+        cur_syllable += phonemes[i]
+        # If we have a vowel in the current syllable
+        if has_vowel(cur_syllable):
+            # If there's a next phoneme that's a consonant followed by a vowel-containing phoneme
+            if i+2 < len(phonemes) and not has_vowel(phonemes[i+1]) and has_vowel(phonemes[i+2]):
+                # End the current syllable and start a new one
+                syllables.append(cur_syllable)
+                cur_syllable = ''
+            # If we're at the end or next phoneme has a vowel
+            elif i+1 >= len(phonemes) or has_vowel(phonemes[i+1]):
+                # End the current syllable
+                syllables.append(cur_syllable)
+                cur_syllable = ''
+        i += 1
+    # Add any remaining syllable
+    if cur_syllable:
+        syllables.append(cur_syllable)
+    # Iterate over syllables and move any syllable ending with lexicon.STRESS to the next one
+    for i in range(len(syllables) - 1):  # Ensure we're not at the last syllable
+        if syllables[i].endswith(lexicon.STRESS):
+            syllables[i+1] = lexicon.STRESS + syllables[i+1]  # Move stress to next syllable
+            syllables[i] = syllables[i][:-len(lexicon.STRESS)]  # Remove stress from current syllable
+    return syllables

mishkal/variants.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import mishkal
+class Letter:
+    def __init__(self, char: str, diac: list[str]):
+        self.char = mishkal.normalize(char)
+        self.diac = mishkal.normalize(diac)
+    def __repr__(self):
+        return f"[Letter] {self.char}{''.join(self.diac)}"
+    def __eq__(self, value: 'Letter'):
+        return value.diac == self.diac and value.char == self.char

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio>=5.15.0
+num2words
+colorlog
+regex