thewh1teagle committed · Commit 2d875df · 0 Parent(s)
README.md ADDED
@@ -0,0 +1,10 @@
1
+ ---
2
+ title: Phonemize in Hebrew
3
+ emoji: 🐢
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
app.py ADDED
@@ -0,0 +1,45 @@
1
+ """
2
+ uv sync
3
+ uv pip install "gradio>=5.15.0"
4
+ uv run gradio examples/editor.py
5
+ """
6
+
7
+ from mishkal import phonemize, normalize
8
+ import gradio as gr
9
+
10
+ default_text = """
11
+ כָּל עֶ֫רֶב יָאִ֫יר (הַשֵּׁ֫ם הַמָּלֵ֫א וּמֽק֫וֹם הָעֲבוֹדָ֫ה שֶׁלּ֫וֹ שְׁמוּרִ֫ים בַּמַּעֲרֶ֫כֶת) רָץ 20 קִילוֹמֶ֫טֶר. הוּא מֽסַפֵּ֫ר לִי שֶׁזֶּ֫ה מֽנַקֶּ֫ה לוֹ אֶת הָרֹ֫אשׁ אַחֲרֵ֫י הָעֲבוֹדָ֫ה, "שָׁעָ֫ה וָחֵ֫צִי בְּלִ֫י עֲבוֹדָ֫ה, אִשָּׁ֫ה וִילָדִ֫ים" כְּמ֫וֹ שֶׁה֫וּא מַגְדִּ֫יר זֹאת. אֲבָ֫ל אַחֲרֵ֫י הַמִּקְלַ֫חַת הוּא מַתְחִ֫יל בּֽמָ֫ה שֶׁנִּתָּ֫ן לֽכַנּ֫וֹת הָעֲבוֹדָ֫ה הַשְּׁנִיָּ֫ה שֶׁלּ֫וֹ: לִמְצֹ֫א לוֹ קוֹלֵ֫גוֹת חֲדָשׁ֫וֹת לָעֲבוֹדָ֫ה, כִּי יָאִ֫יר הוּא כַּנִּרְאֶ֫ה הַמֶּ֫לֶךְ שֶׁל "חָבֵ֫ר מֵבִ֫יא חָבֵ֫ר" בּֽיִשְׂרָאֵ֫ל.
12
+ דֻּגְמָא מַגְנִיבָה: [אנציקלופדיה](/ʔantsikloˈpedja/)
13
+ """
14
+
15
+ theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])
16
+
17
+
18
+ def on_submit_debug(text: str, predict_stress) -> str:
19
+ phonemes = phonemize(text, preserve_punctuation=True, predict_stress=predict_stress)
20
+ normalized_text = normalize(text)
21
+ return phonemes + "\n\nNormalized:\n" + normalized_text
22
+
23
+
24
+ def on_submit(text: str, predict_stress) -> str:
25
+ return phonemize(text, preserve_punctuation=False, predict_stress=predict_stress)
26
+
27
+
28
+ with gr.Blocks(theme=theme) as demo:
29
+ text_input = gr.Textbox(
30
+ value=default_text, label="Text", rtl=True, elem_classes=["input"]
31
+ )
32
+ debug_checkbox = gr.Checkbox(value=False, label="Enable Debug Mode")
33
+ predict_stress_checkbox = gr.Checkbox(value=False, label="Predict Stress")
34
+ phonemes_output = gr.Textbox(label="Phonemes")
35
+ submit_button = gr.Button("Create")
36
+
37
+ submit_button.click(
38
+ fn=lambda text, debug, stress: on_submit_debug(text, stress) if debug else on_submit(text, stress),
39
+ inputs=[text_input, debug_checkbox, predict_stress_checkbox],
40
+ outputs=[phonemes_output],
41
+ )
42
+
43
+
44
+ if __name__ == "__main__":
45
+ demo.launch()
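The bracketed span in default_text above, [אנציקלופדיה](/ʔantsikloˈpedja/), uses the phonemizer's hyper-phoneme markup, which injects the IPA between the slashes verbatim. A minimal sketch of exercising it outside the Gradio UI; the surrounding sentence is illustrative and not from the repository:

```python
from mishkal import phonemize

# [text](/ipa/) spans bypass the letter rules and pass the given IPA straight through,
# so proper names can be pinned to an exact pronunciation.
print(phonemize("זאת [אנציקלופדיה](/ʔantsikloˈpedja/) טובה"))
```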
mishkal/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ """
2
+ High level phonemize functions
3
+ """
4
+
5
+ from .phonemize import Phonemizer
6
+ from .utils import normalize # noqa: F401
7
+ from typing import Callable
8
+
9
+ phonemizer = Phonemizer()
10
+
11
+
12
+ def phonemize(
13
+ text: str,
14
+ preserve_punctuation=True,
15
+ preserve_stress=True,
16
+ use_expander=True,
17
+ use_post_normalize=True, # For TTS
18
+ predict_stress=True,
19
+ predict_shva_nah=True,
20
+ fallback: Callable[[str], str] = None,
21
+ ) -> str:
22
+ phonemes = phonemizer.phonemize(
23
+ text,
24
+ preserve_punctuation=preserve_punctuation,
25
+ preserve_stress=preserve_stress,
26
+ fallback=fallback,
27
+ use_expander=use_expander,
28
+ use_post_normalize=use_post_normalize,
29
+ predict_stress=predict_stress,
30
+ predict_shva_nah=predict_shva_nah,
31
+ )
32
+ return phonemes
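For orientation, a minimal usage sketch of this high-level wrapper; the sample string is illustrative, not taken from the repository:

```python
from mishkal import phonemize, normalize

text = "שָׁלוֹם עוֹלָם"  # illustrative niqqud-annotated input
print(normalize(text))  # NFD-decomposed text with deduplicated niqqud
print(phonemize(text, predict_stress=False, preserve_punctuation=True))
```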
mishkal/data/kamatz_katan.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "כל": "ˈkol",
3
+ "רחבי": "roxˈbi",
4
+ "אמנות": "omaˈnut"
5
+ }
mishkal/data/rashej_tevot.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "צה״ל": "ˈtsahal"
3
+ }
mishkal/data/special.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "וַאלְלָה": "wala",
3
+ "וַסַבִּי": "wasabi"
4
+ }
mishkal/data/symbols.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "₪": "ʃeˈkel",
3
+ "$": "doˈlar"
4
+ }
mishkal/expander/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ """
2
+ Expand dates and numbers into words with niqqud
3
+ This happens before phonemization
4
+ """
5
+
6
+ from .numbers import num_to_word
7
+ from .dates import date_to_word
8
+ from .time_to_word import time_to_word
9
+ from .dictionary import Dictionary
10
+ from mishkal.log import log
11
+
12
+
13
+ class Expander:
14
+ def __init__(self):
15
+ self.dictionary = Dictionary()
16
+
17
+ def expand_text(self, text: str):
18
+ text = self.dictionary.expand_text(text)
19
+
20
+ words = []
21
+ for source_word in text.split():
22
+ try:
23
+ word = date_to_word(source_word)
24
+ if word == source_word:
25
+ word = time_to_word(word)
26
+ if word == source_word:
27
+ word = num_to_word(word)
28
+ words.append(word)
29
+ except Exception as e:
30
+ log.error(f"Failed to expand {source_word} with error: {e}")
31
+ words.append(source_word)
32
+ return " ".join(words)
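A short sketch of driving the expander directly; the sentence below is an invented example:

```python
from mishkal.expander import Expander

expander = Expander()
# Dates, times and bare digits are rewritten as diacritized Hebrew words;
# tokens the sub-expanders do not recognize pass through unchanged.
print(expander.expand_text("הפגישה בתאריך 2024-03-01 בשעה 14:30 עלתה 250 שקל"))
```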
mishkal/expander/dates.py ADDED
@@ -0,0 +1,60 @@
1
+ from datetime import datetime
2
+ from .numbers import num_to_word
3
+
4
+ # Mapping of month names in Hebrew with diacritics (Gregorian months)
5
+ MONTHS = {
6
+ 1: "יָנוּאָר",
7
+ 2: "פֶבְרוּאָר",
8
+ 3: "מֵרְץ",
9
+ 4: "אֵפְרִיל",
10
+ 5: "מַאי",
11
+ 6: "יוּנִי",
12
+ 7: "יוּלִי",
13
+ 8: "אוֹגֻסְט",
14
+ 9: "סֶפְּטֶמְבֶּר",
15
+ 10: "אוֹקְטוֹבֶּר",
16
+ 11: "נוֹבֶמְבֶּר",
17
+ 12: "דֶּצֶמְבֶּר",
18
+ }
19
+
20
+ # Mapping of day names in Hebrew with diacritics
21
+ DAYS = {
22
+ 0: "יוֹם רִאשׁוֹן",
23
+ 1: "יוֹם שֵׁנִי",
24
+ 2: "יוֹם שְׁלִישִׁי",
25
+ 3: "יוֹם רֵבִיעִי",
26
+ 4: "יוֹם חֲמִישִׁי",
27
+ 5: "יוֹם שִׁישִׁי",
28
+ 6: "יוֹם שַׁבָּת",
29
+ }
30
+
31
+
32
+ def date_to_word(word: str, include_day_name=False) -> str:
33
+ """
34
+ Converts a given date string in formats (YYYY-MM-DD, YYYY.MM.DD, YYYY/MM/DD) to Hebrew date format with diacritics.
35
+ Returns the original word if it's not a valid date.
36
+ """
37
+ separators = ["-", ".", "/"]
38
+ orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
39
+ date_formats = [sep.join(order) for order in orders for sep in separators]
40
+
41
+ for date_format in date_formats:
42
+ try:
43
+ # Try parsing the word with each date format
44
+ date_obj = datetime.strptime(word, date_format)
45
+
46
+ # Get the Hebrew day name with diacritics
47
+ day_name = DAYS[date_obj.weekday()]
48
+
49
+ # Convert month to Hebrew name with diacritics
50
+ month_name = MONTHS[date_obj.month]
51
+ day = num_to_word(str(date_obj.day))
52
+ year = num_to_word(str(date_obj.year))
53
+
54
+ text = f"{day} בֵּ{month_name} {year}"
55
+ if include_day_name:
56
+ text = f"{day_name}, {text}"
57
+ return text
58
+ except ValueError:
59
+ continue
60
+ return word
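A quick sketch of the accepted formats and the fallback behaviour:

```python
from mishkal.expander.dates import date_to_word

print(date_to_word("2024-03-01"))                         # day, Hebrew month name and year, diacritized
print(date_to_word("01/03/2024", include_day_name=True))  # prefixed with the Hebrew weekday
print(date_to_word("not-a-date"))                         # unparseable input is returned unchanged
```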
mishkal/expander/dictionary.py ADDED
@@ -0,0 +1,78 @@
1
+ """
2
+ Dictionaries are JSON files in mishkal/data mapping source words to replacements
3
+ """
4
+
5
+ from pathlib import Path
6
+ import json
7
+ import re
8
+ from mishkal.utils import remove_niqqud
9
+ from mishkal.utils import normalize
10
+ import unicodedata
11
+
12
+ files = Path(__file__).parent.joinpath("../data").glob("*.json")
13
+ # Load lower-quality dictionaries first (bronze < silver < gold) so later updates override them
14
+ order = {"bronze": 1, "silver": 2, "gold": 3}
15
+ files = sorted(
16
+ files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
17
+ )
18
+
19
+
20
+ class Dictionary:
21
+ def __init__(self):
22
+ self.dict = {}
23
+ self.load_dictionaries()
24
+
25
+ def load_dictionaries(self):
26
+ for file in files:
27
+ with open(file, "r", encoding="utf-8") as f:
28
+ dictionary: dict = json.load(f)
29
+ normalized_dictionary = {}
30
+
31
+ # normalize niqqud keys
32
+ for k, v in dictionary.items():
33
+ k = normalize(k)
34
+ # Ensure not empty
35
+ if k and v:
36
+ normalized_dictionary[k] = v
37
+ self.dict.update(normalized_dictionary)
38
+
39
+ def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
40
+ source: str = match.group(0)
41
+ # decomposite
42
+ source = unicodedata.normalize("NFD", source)
43
+ raw_lookup = self.dict.get(source)
44
+
45
+ without_niqqud_lookup = self.dict.get(remove_niqqud(source))
46
+ with_niqqud_lookup = self.dict.get(normalize(source))
47
+ # Compare without niqqud ONLY if source has no niqqud
48
+ if raw_lookup:
49
+ return raw_lookup
50
+ if without_niqqud_lookup:
51
+ return without_niqqud_lookup
52
+ elif with_niqqud_lookup:
53
+ return with_niqqud_lookup
54
+ return source
55
+
56
+ def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
57
+ raw_source: str = match.group(0)
58
+ if raw_source.isnumeric():
59
+ return raw_source
60
+
61
+ raw_lookup = self.dict.get(raw_source)
62
+
63
+ # Compare without niqqud ONLY if source has no niqqud
64
+ if raw_lookup:
65
+ return raw_lookup
66
+ # search by only ', space, regular niqqud, alphabet
67
+ raw_source = re.sub(
68
+ r"[\u05B0-\u05EB ']+", self.replace_hebrew_only_callback, raw_source
69
+ )
70
+ return raw_source
71
+
72
+ def expand_text(self, text: str) -> str:
73
+ """
74
+ TODO: if key doesn't have diacritics expand even diacritized words
75
+ """
76
+ text = re.sub(r"\S+", self.replace_non_whitespace_callback, text)
77
+
78
+ return text
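A small sketch of the lookup order, assuming the JSON dictionaries above are on disk; the expected value comes from data/rashej_tevot.json:

```python
from mishkal.expander.dictionary import Dictionary
from mishkal.utils import normalize

d = Dictionary()
# Keys are stored normalized (NFD, deduplicated niqqud), so normalize the query the same way;
# lookups try the raw token first, then the niqqud-stripped and normalized forms.
print(d.expand_text(normalize("צה״ל")))  # expected: ˈtsahal
```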
mishkal/expander/number_names.py ADDED
@@ -0,0 +1,193 @@
1
+ """
2
+ See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
3
+ """
4
+
5
+ # TODO: add niqqud hints
6
+
7
+ ZERO = {"אפס": "אֶפֶס"}
8
+
9
+
10
+ ONES = {
11
+ "אחת": "אַחַת",
12
+ "אחד": "אֶחָד",
13
+ "ראשונה": "רִאשׁוֹנָה",
14
+ "ראשון": "רִאשׁוֹן",
15
+ "ראשונות": "רִאשׁוֹנוֹת",
16
+ "ראשונים": "רִאשׁוֹנִים",
17
+ "שתיים": "שְׁתַּיִם",
18
+ "שניים": "שְׁנַיִם",
19
+ "שתי": "שְׁתֵּי",
20
+ "שני": "שְׁנֵי",
21
+ "שנייה": "שְׁנִיָּה",
22
+ "שניות": "שְׁנִיּוֹת",
23
+ "שלוש": "שָׁלוֹשׁ",
24
+ "שלושה": "שְׁלוֹשָׁה",
25
+ "שלושת": "שְׁלוֹשֶׁת",
26
+ "שלישית": "שְׁלִישִׁית",
27
+ "שלישי": "שְׁלִישִׁי",
28
+ "שלישיות": "שְׁלִישִׁיּוֹת",
29
+ "שלישיים": "שְׁלִישִׁיִּים",
30
+ "ארבע": "אַרְבַּע",
31
+ "ארבעה": "אַרְבַּעָה",
32
+ "ארבעת": "אַרְבַּעַת",
33
+ "רביעית": "רֵבִיעִית",
34
+ "רביעי": "רֵבִיעִי",
35
+ "רביעיות": "רֵבִיעִיוֹת",
36
+ "רביעיים": "רֵבִיעִיִּים",
37
+ "חמש": "חָמֵשׁ",
38
+ "חמישה": "חֲמִשָּׁה",
39
+ "חמשת": "חֲמֵשֶׁת",
40
+ "חמישית": "חֲמִישִּׁית",
41
+ "חמישי": "חֲמִישִּׁי",
42
+ "חמישיות": "חֲמִישִּׁיוֹת",
43
+ "חמישיים": "חֲמִישִּׁיִּים",
44
+ "שש": "שֵׁשׁ",
45
+ "שישה": "שִׁשָּׁה",
46
+ "ששת": "שֵׁשֶׁת",
47
+ "שישית": "שִׁשִּׁית",
48
+ "שישי": "שִׁשִּׁי",
49
+ "שישיות": "שִׁשִּׁיוֹת",
50
+ "שישיים": "שִׁשִּׁיִּים",
51
+ "שבע": "שֶׁבַע",
52
+ "שבעה": "שִׁבְעָה",
53
+ "שבעת": "שִׁבְעַת",
54
+ "שביעית": "שְׁבִיעִית",
55
+ "שביעי": "שְׁבִיעִי",
56
+ "שביעיות": "שְׁבִיעִיוֹת",
57
+ "שביעיים": "שְׁבִיעִיִּים",
58
+ "שמונה": "שְׁמוֹנֶה",
59
+ "שמונת": "שְׁמוֹנַת",
60
+ "שמינית": "שְׁמִינִית",
61
+ "שמיני": "שְׁמִינִי",
62
+ "שמיניות": "שְׁמִינִיוֹת",
63
+ "שמיניים": "שְׁמִינִיִּים",
64
+ "תשע": "תֵּשַׁע",
65
+ "תשעה": "תִּשְׁעָה",
66
+ "תשעת": "תִּשְׁעַת",
67
+ "תשיעית": "תְּשִׁיעִית",
68
+ "תשיעי": "תְּשִׁיעִי",
69
+ "תשיעיות": "תְּשִׁיעִיּוֹת",
70
+ "תשיעיים": "תְּשִׁיעִיִּים",
71
+ }
72
+
73
+
74
+ TENS = {
75
+ "עשר": "עֶשֶׂר",
76
+ "עשרה": "עֲשָׁרָה",
77
+ "עשרת": "עֲשֶׁרֶת",
78
+ "עשירית": "עֲשִׁירִית",
79
+ "עשירי": "עֲשִׁירִי",
80
+ "עשיריות": "עֲשִׁירִיוֹת",
81
+ "עשיריים": "עֲשִׁירִיִּים",
82
+ "שתים עשרה": "שְׁתֵּים עֶשְׂרֵה",
83
+ "שנים עשר": "שְׁנֵים עָשָׂר",
84
+ }
85
+
86
+
87
+ TWENTIES = {
88
+ "עשרים": "עֶשְׂרִ֫ים",
89
+ "שלושים": "שְׁלוֹשִׁים",
90
+ "ארבעים": "אַרְבָּעִים",
91
+ "חמישים": "חֲמִשִּׁים",
92
+ "שישים": "שִׁשִּׁים",
93
+ "שבעים": "שִׁבְעִים",
94
+ "שמונים": "שְׁמוֹנִים",
95
+ "תשעים": "תִּשְׁעִים",
96
+ }
97
+
98
+
99
+ HUNDREDS = {
100
+ "מאה": "מֵאָה",
101
+ "מאת": "מֵאַת",
102
+ "מאתיים": "מָאתַיִם",
103
+ "מאות": "מֵאוֹת",
104
+ }
105
+
106
+ THOUSANDS = {
107
+ "אלף": "אֶלֶף",
108
+ "אלפיים": "אַלְפַּיִם",
109
+ "אלפים": "אֲלָפִים",
110
+ "אלפי": "אַלְפִּי",
111
+ }
112
+
113
+
114
+ LARGE = {
115
+ "מיליון": "מִילְיוֹן",
116
+ "מיליוני": "מִילְיוֹנִי",
117
+ "מיליארד": "מִילְיַארְד",
118
+ "מיליארדי": "מִילְיַארְדִּי",
119
+ "טריליון": "טְרִילְיוֹן",
120
+ "טריליוני": "טְרִילְיוֹנִי",
121
+ "קוודריליון": "קוֹוַדְרִילְיוֹן",
122
+ "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
123
+ "קווינטיליון": "קוִוִּנְטִילְיוֹן",
124
+ "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
125
+ "סקסטיליון": "סְקֶסְטִילְיוֹן",
126
+ "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
127
+ "ספטיליון": "סְפֶּטִילְיוֹן",
128
+ "ספטיליוני": "סְפֶּטִילְיוֹנִי",
129
+ "אוקטיליון": "אוֹקְטִילְיוֹן",
130
+ "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
131
+ "נוניליון": "נוּנִילְיוֹן",
132
+ "נוניליוני": "נוּנִילְיוֹנִי",
133
+ "דסיליון": "דֶּסִילְיוֹן",
134
+ "דסיליוני": "דֶּסִילְיוֹנִי",
135
+ "אונדסיליון": "אוּנְדְסִילְיוֹן",
136
+ "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
137
+ "דואודסיליון": "דוּאודְסִילְיוֹן",
138
+ "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
139
+ "טרדסיליון": "טֶרְדְסִילְיוֹן",
140
+ "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
141
+ "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
142
+ "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
143
+ "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
144
+ "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
145
+ "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
146
+ "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
147
+ "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
148
+ "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
149
+ "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
150
+ "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
151
+ "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
152
+ "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
153
+ "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
154
+ "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
155
+ }
156
+
157
+
158
+ LETTERS = {
159
+ "ו": "וֵ",
160
+ "ה": "הַ",
161
+ }
162
+
163
+
164
+ CURRENCY = {
165
+ "שקל": "שֵׁקֶל",
166
+ "שקלים": "שְׁקָלִים",
167
+ "אגורה": "אֲגוֹרָה",
168
+ "אגורות": "אֲגוֹרוֹת",
169
+ "אירו": "אֵירוֹ",
170
+ "סנט": "סֵנְט",
171
+ "סנטים": "סֵנְטִים",
172
+ "דולר": "דוֹלָר",
173
+ "דולרים": "דוֹלָרִים",
174
+ }
175
+
176
+
177
+ POINTS = {
178
+ "מינוס": "מִינּוּס",
179
+ "נקודה": "נְקֻדָּה",
180
+ }
181
+
182
+ NUMBER_NAMES = {
183
+ **CURRENCY,
184
+ **HUNDREDS,
185
+ **LARGE,
186
+ **LETTERS,
187
+ **ONES,
188
+ **POINTS,
189
+ **TENS,
190
+ **THOUSANDS,
191
+ **TWENTIES,
192
+ **ZERO,
193
+ }
mishkal/expander/numbers.py ADDED
@@ -0,0 +1,28 @@
1
+ import num2words
2
+ from .number_names import NUMBER_NAMES
3
+ import re
4
+
5
+
6
+ def add_diacritics(words: str):
7
+ new_words = []
8
+ for word in words.split():
9
+ if NUMBER_NAMES.get(word):
10
+ new_words.append(NUMBER_NAMES[word])
11
+ elif NUMBER_NAMES.get(word[1:]):
12
+ # With Vav or Bet
13
+ new_words.append(NUMBER_NAMES[word[0]] + NUMBER_NAMES[word[1:]])
14
+ else:
15
+ new_words.append(word)
16
+ return " ".join(new_words)
17
+
18
+
19
+ def num_to_word(maybe_number: str) -> str:
20
+ def replace_number(match):
21
+ num = match.group()
22
+ words = num2words.num2words(num, lang="he", ordinal=False)
23
+ return add_diacritics(words)
24
+
25
+ # Replace all whole numbers in the string
26
+ result = re.sub(r"\d+", replace_number, maybe_number)
27
+
28
+ return result
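A minimal sketch of the number expansion; the sentences are invented examples:

```python
from mishkal.expander.numbers import num_to_word

# Digits are expanded through num2words (lang="he") and each resulting word is
# re-diacritized via NUMBER_NAMES; a leading ו/ה prefix is looked up separately.
print(num_to_word("יש לי 3 כלבים"))
print(num_to_word("250 שקלים"))
```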
mishkal/expander/time_to_word.py ADDED
@@ -0,0 +1,104 @@
1
+ """
2
+ Convert time to words
3
+ TODO: fix zeros eg. 22:00
4
+ """
5
+
6
+ import re
7
+
8
+ PATTERNS = [
9
+ r"(\d{1,2})([apm]{2})", # AM/PM format
10
+ r"(\d{1,2}):(\d{2})", # HH:MM format
11
+ ]
12
+
13
+
14
+ def extract_time(match):
15
+ """
16
+ Extract hour and minute from a string in HH:MM or AM/PM format
17
+ and return as integers.
18
+ """
19
+ time_str = match.group(0).lower().strip()
20
+
21
+ # Check for HH:MM format
22
+ match = re.match(r"(\d{1,2}):(\d{2})", time_str)
23
+ if match:
24
+ h = int(match.group(1))
25
+ m = int(match.group(2))
26
+ return f"{convert_to_word(h, m)}"
27
+
28
+ # Check for AM/PM format
29
+ match = re.match(r"(\d{1,2})([apm]{2})", time_str)
30
+ if match:
31
+ h = int(match.group(1))
32
+ period = match.group(2)
33
+
34
+ # Normalize to 24-hour format
35
+ if period == "am" and h == 12:
36
+ h = 0
37
+ elif period == "pm" and h != 12:
38
+ h += 12
39
+ return f"{convert_to_word(h, 0)}" # Defaulting to 0 minutes when only hour is provided
40
+
41
+ return time_str  # Return the original time text if the format is not recognized
42
+
43
+
44
+ def convert_to_word(h, m):
45
+ hours = [
46
+ "אֶפֶס",
47
+ "אַחַת",
48
+ "שְׁנַיִם", # Will be replaced with "שֵׁנִי" when needed
49
+ "שָׁלוֹשׁ",
50
+ "אַרְבַּע",
51
+ "חָמֵשׁ",
52
+ "שֵׁשׁ",
53
+ "שֶׁבַע",
54
+ "שְׁמוֹנֵה",
55
+ "תֵּשַׁע",
56
+ "עֵשֵׂר",
57
+ "אַחַת עֶשְׂרֵה",
58
+ "שְׁתֵּים עֶשְׂרֵה",
59
+ ]
60
+
61
+ tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]
62
+
63
+ ten_to_twenty = [
64
+ "עֵשֵׂר",
65
+ "אַחַת עֶשְׂרֵה",
66
+ "שְׁתֵּים עֶשְׂרֵה",
67
+ "שְׁלוֹשׁ עֶשְׂרֵה",
68
+ "אַרְבַּע עֶשְׂרֵה",
69
+ "חֲמֵשׁ עֶשְׂרֵה",
70
+ "שֵׁשׁ עֶשְׂרֵה",
71
+ "שְׁבַע עֶשְׂרֵה",
72
+ "שְׁמוֹנֶה עֶשְׂרֵה",
73
+ "תְּשַׁע עֶשְׂרֵה",
74
+ ]
75
+
76
+ vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}
77
+
78
+ # Convert 0 hours to 12 (midnight)
79
+ if h == 0:
80
+ h = 12
81
+
82
+ elif h > 12:
83
+ h -= 12
84
+
85
+ if m == 0:
86
+ return f"{hours[h]}"
87
+
88
+ elif 1 <= m <= 9:
89
+ minute_word = (
90
+ vocab["shtey"] if m == 2 else hours[m]
91
+ ) # Replace "שניים" with "שני"
92
+ return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"
93
+
94
+ elif 10 <= m <= 19:
95
+ return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"
96
+
97
+ else:
98
+ tens_part = f"{vocab['and']}{tens[m // 10]}"
99
+ units_part = f"{vocab['and']}{hours[m % 10]}" if m % 10 != 0 else ""
100
+ return f"{hours[h]} {tens_part} {units_part} {vocab['minutes']}".strip()
101
+
102
+
103
+ def time_to_word(text: str):
104
+ return re.sub("|".join(PATTERNS), extract_time, text)
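A short sketch of the two supported time formats; the inputs are invented:

```python
from mishkal.expander.time_to_word import time_to_word

print(time_to_word("ניפגש ב 14:30"))  # HH:MM, hour and minutes as diacritized words
print(time_to_word("8pm"))            # AM/PM, normalized to a 12-hour word form
```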
mishkal/lexicon.py ADDED
@@ -0,0 +1,189 @@
1
+ """
2
+ IPA transcription of Hebrew consonants and vowels.
3
+ """
4
+
5
+ # https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
6
+ HE_CHARS_PATTERN = (
7
+ r"\b[\u05B0-\u05EA\u05F3\u0027]+\b" # Chars including niqqud, geresh and en_geresh
8
+ )
9
+ HE_NIQQUD_PATTERN = r"[\u05B0-\u05C7]"
10
+ PUNCTUATION = r".,!? "
11
+
12
+ # Special
13
+ GIMEL_OR_ZAIN_WITH_DAGESH = "dʒ"
14
+ TSADIK_WITH_DAGESH = "tʃ"
15
+ SHIN_WITH_POINT = "ʃ"
16
+ SIN_WITH_POINT = "s"
17
+ STRESS = "\u02c8" # visually looks like '
18
+ HET_GNUVA = "ax"
19
+ W_AS_WALLA = "w"
20
+
21
+ GERESH_LETTERS = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}
22
+
23
+ LETTERS_NAMES_PHONEMES = {
24
+ "א": "alef", # Alef, glottal stop
25
+ "ב": "bet", # Bet
26
+ "ג": "gimel", # Gimel
27
+ "ד": "dalet", # Dalet
28
+ "ה": "hej", # He
29
+ "ו": "vav", # Vav
30
+ "ז": "zajin", # Zayin
31
+ "ח": "xet", # Het
32
+ "ט": "tet", # Tet
33
+ "י": "jud", # Yod
34
+ "ך": "xaf sofit", # Haf sofit
35
+ "כ": "xaf", # Haf
36
+ "ל": "lamed", # Lamed
37
+ "ם": "mem sofit", # Mem Sofit
38
+ "מ": "mem", # Mem
39
+ "ן": "nun sofit", # Nun Sofit
40
+ "נ": "nun", # Nun
41
+ "ס": "samex", # Samekh
42
+ "ע": "ajin", # Ayin, glottal stop
43
+ "פ": "fey", # Fey
44
+ "ף": "fey sofit", # Fey Sofit
45
+ "ץ": "tsadik sofit", # Tsadik sofit
46
+ "צ": "tsadik", # Tsadik
47
+ "ק": "kuf", # Kuf
48
+ "ר": "rejiʃ", # Resh
49
+ "ש": "ʃin", # Shin
50
+ "ת": "taf", # Taf
51
+ }
52
+
53
+ # Consonants
54
+ LETTERS_PHONEMES = {
55
+ "א": "ʔ", # Alef
56
+ "ב": "v", # Bet
57
+ "ג": "g", # Gimel
58
+ "ד": "d", # Dalet
59
+ "ה": "h", # He
60
+ "ו": "v", # Vav
61
+ "ז": "z", # Zayin
62
+ "ח": "x", # Het
63
+ "ט": "t", # Tet
64
+ "י": "j", # Yod
65
+ "ך": "x", # Haf sofit
66
+ "כ": "x", # Haf
67
+ "ל": "l", # Lamed
68
+ "ם": "m", # Mem Sofit
69
+ "מ": "m", # Mem
70
+ "ן": "n", # Nun Sofit
71
+ "נ": "n", # Nun
72
+ "ס": "s", # Samekh
73
+ "ע": "ʔ", # Ayin, only voweled
74
+ "פ": "f", # Fey
75
+ "ף": "f", # Fey Sofit
76
+ "ץ": "ts", # Tsadik sofit
77
+ "צ": "ts", # Tsadik
78
+ "ק": "k", # Kuf
79
+ "ר": "r", # Resh
80
+ "ש": "ʃ", # Shin
81
+ "ת": "t", # Taf
82
+ # Beged Kefet
83
+ "בּ": "b",
84
+ "כּ": "k",
85
+ "פּ": "p",
86
+ "שׁ": "ʃ",
87
+ "שׂ": "s",
88
+ "'": "",
89
+ }
90
+
91
+ # Vowels
92
+ VOWEL_A = "a"
93
+ VOWEL_E = "e"
94
+ VOWEL_I = "i"
95
+ VOWEL_O = "o"
96
+ VOWEL_U = "u"
97
+
98
+ SHVA_NA_DIACRITIC = "\u05bd"
99
+ ATAMAHA_DIACRITIC = "\u05ab"
100
+
101
+ NIQQUD_PHONEMES = {
102
+ "\u05b4": "i", # Hiriq
103
+ "\u05b5": "e", # Tsere
104
+ "\u05b7": "a", # Patah
105
+ "\u05b9": "o", # Holam
106
+ "\u05ba": "o", # Holam haser for vav
107
+ "\u05bb": "u", # Qubuts
108
+
109
+ "\u05b3": 'o', # Hataf qamats
110
+ "\u05b8": "a", # Kamatz
111
+
112
+ ATAMAHA_DIACRITIC: "ˈ", # Stress (Atmaha)
113
+ SHVA_NA_DIACRITIC: "e", # Shva na
114
+ }
115
+
116
+ SET_LETTER_SYMBOLS = {
117
+ "\u05b0", # Shva
118
+ "\u05b4", # Hiriq
119
+ "\u05b5", # Tsere
120
+ "\u05b7", # Patah
121
+ "\u05b9", # Holam
122
+ "\u05ba", # Holam haser for vav
123
+ "\u05bb", # Qubuts
124
+ "\u05bc", # Dagesh
125
+ "\u05c1", # Shin dot
126
+ "\u05b3", # Hataf qamats
127
+ "\u05b8", # Kamatz
128
+ "\u05c2", # Sin dot
129
+ "'", # Geresh
130
+ }
131
+
132
+ """
133
+ We're left with the following niqqud (10):
134
+ Shva, Hiriq, Tsere, Patah, Holam, Qubuts, Dagesh,
135
+ Holam haser for vav, Shin dot, Sin dot
136
+ """
137
+ NIQQUD_DEDUPLICATE = {
138
+ "\u05b1": "\u05b5", # Hataf Segol -> Tsere
139
+ "\u05b2": "\u05b7", # Hataf Patah -> Patah
140
+ # "\u05b3": "\u05b9", # Hataf Qamats -> Holam
141
+ "\u05b6": "\u05b5", # Segol -> Tsere
142
+ # Kamatz -> Patah
143
+ # "\u05b8": "\u05b7", # Qamats -> Patah
144
+ "\u05c7": "\u05b9", # Qamats Qatan -> Holam
145
+ "\u05f3": "'", # Hebrew geresh to regular geresh
146
+ }
147
+
148
+
149
+ SET_OUTPUT_CHARACTERS = set(
150
+ [
151
+ *GIMEL_OR_ZAIN_WITH_DAGESH,
152
+ TSADIK_WITH_DAGESH,
153
+ SHIN_WITH_POINT,
154
+ SIN_WITH_POINT,
155
+ W_AS_WALLA,
156
+ ]
157
+ + [STRESS]
158
+ + list(LETTERS_PHONEMES.values())
159
+ + list(NIQQUD_PHONEMES.values())
160
+ + [VOWEL_A, VOWEL_E, VOWEL_I, VOWEL_O, VOWEL_U]
161
+ + list(PUNCTUATION)
162
+ )
163
+
164
+ SET_NIQQUD = {
165
+ # Shva, Hiriq, Tsere, Patah, Holam, Holam haser for vav, Qubuts, Dagesh, Shin dot, Sin dot
166
+ "\u05b0", # Shva
167
+ "\u05b4", # Hiriq
168
+ "\u05b5", # Tsere
169
+ "\u05b7", # Patah
170
+ "\u05b9", # Holam
171
+ "\u05ba", # Holam for vav
172
+ "\u05bb", # Qubuts
173
+ "\u05bc", # Dagesh
174
+ "\u05c1", # Shin
175
+ "\u05c2", # Sin
176
+ "\u05b3", # Hataf qamats
177
+ "\u05b8", # Kamatz
178
+ # shva na and atmaha
179
+ "\u05bd", # shva na
180
+ "\u05ab", # atmaha
181
+ }
182
+ SET_LETTERS = set(LETTERS_PHONEMES.keys())
183
+ SET_PUNCTUATION = set(PUNCTUATION)
184
+
185
+
186
+ # Set for fast lookup
187
+ SET_INPUT_CHARACTERS = set(
188
+ list(LETTERS_PHONEMES.keys()) + list(SET_NIQQUD) + list(PUNCTUATION) + ["'"]
189
+ )
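A few lookups showing how the tables above are meant to be consumed (a sketch, not part of the module):

```python
from mishkal import lexicon

print(lexicon.LETTERS_PHONEMES["ב"])             # v (plain Bet)
print(lexicon.LETTERS_PHONEMES["ב" + "\u05bc"])  # b (Bet with dagesh, keyed as letter + U+05BC)
print(lexicon.NIQQUD_PHONEMES["\u05b4"])         # i (Hiriq)
```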
mishkal/log.py ADDED
@@ -0,0 +1,35 @@
1
+ import logging
2
+ import os
3
+ import colorlog
4
+
5
+
6
+ def _create_logger():
7
+ """
8
+ Create a logger with colorized output
9
+ Usage: LOG_LEVEL=DEBUG python <script.py>
10
+ """
11
+
12
+ handler = colorlog.StreamHandler()
13
+ fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
14
+ handler.setFormatter(
15
+ colorlog.ColoredFormatter(
16
+ fmt=fmt,
17
+ log_colors={
18
+ "DEBUG": "blue",
19
+ "INFO": "green",
20
+ "WARNING": "yellow",
21
+ "ERROR": "red",
22
+ "CRITICAL": "red",
23
+ },
24
+ )
25
+ )
26
+ # Get log level from LOG_LEVEL environment variable
27
+ log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
28
+ logger = colorlog.getLogger(__package__)
29
+ logger.setLevel(level=getattr(logging, log_level, logging.WARNING))
30
+ # Setup logging to stdout
31
+ logger.addHandler(handler)
32
+ return logger
33
+
34
+
35
+ log = _create_logger()
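A small sketch of controlling verbosity via the LOG_LEVEL environment variable mentioned in the docstring; setting it in-process only takes effect before the module is first imported:

```python
import os
os.environ.setdefault("LOG_LEVEL", "DEBUG")  # must happen before mishkal.log is imported

from mishkal.log import log
log.debug("expanding token 14:30")
log.warning("unknown character skipped")
```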
mishkal/phonemize.py ADDED
@@ -0,0 +1,310 @@
1
+ """
2
+ The actual letters phonemization happens here.
3
+ Phonemes generated based on rules.
4
+
5
+ Early rules:
6
+ 1. Niqqud malle vowels
7
+ 2. Dagesh (custom beged kefet)
8
+ 3. Final letter without niqqud
9
+ 4. Final Het gnuva
10
+ 5. Geresh (Gimel, Ttadik, Zain)
11
+ 6. Shva na
12
+ Revised rules:
13
+ 1. Consonants
14
+ 2. Niqqud
15
+
16
+ Reference:
17
+ - https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
18
+ - https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט/
19
+ - https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
20
+ - https://en.wikipedia.org/wiki/Help:IPA/Hebrew
21
+ - https://he.wikipedia.org/wiki/הברה
22
+ """
23
+
24
+ from mishkal import lexicon
25
+ from .expander import Expander
26
+ from mishkal.utils import normalize, post_normalize, has_vowel, has_constant, remove_niqqud
27
+ from typing import Callable
28
+ import regex as re
29
+ from mishkal.variants import Letter, Syllable
30
+
31
+ class Phonemizer:
32
+ def __init__(self):
33
+ self.expander = Expander()
34
+
35
+ def phonemize(
36
+ self,
37
+ text: str,
38
+ preserve_punctuation=True,
39
+ preserve_stress=True,
40
+ use_expander=False,
41
+ use_post_normalize=False, # For TTS
42
+ predict_stress=False,
43
+ predict_shva_nah=False,
44
+ fallback: Callable[[str], str] = None,
45
+ ) -> str:
46
+ # normalize
47
+ text = normalize(text)
48
+
49
+
50
+ # TODO: is that enough? what if there's punctuation around? other chars?
51
+ he_pattern = r"[\u05b0-\u05ea\u05ab\u05bd']+"
52
+ fallback_pattern = r"[a-zA-Z]+"
53
+
54
+ def fallback_replace_callback(match: re.Match):
55
+ word = match.group(0)
56
+
57
+ if self.expander.dictionary.dict.get(word):
58
+ # skip
59
+ # TODO: better API
60
+ return word
61
+ phonemes = fallback(word).strip()
62
+ # TODO: check that it has only IPA?!
63
+ for c in phonemes:
64
+ lexicon.SET_OUTPUT_CHARACTERS.add(c)
65
+ return phonemes
66
+
67
+ if fallback is not None:
68
+ text = re.sub(fallback_pattern, fallback_replace_callback, text)
69
+ if use_expander:
70
+ text = self.expander.expand_text(text)
71
+ self.fallback = fallback
72
+
73
+ def heb_replace_callback(match: re.Match):
74
+ word = match.group(0)
75
+
76
+ word = normalize(word)
77
+ word = "".join(
78
+ i for i in word if i in lexicon.SET_LETTERS or i in lexicon.SET_NIQQUD
79
+ )
80
+ letters: list[tuple[str, str]] = re.findall(r"(\p{L})([\p{M}']*)", word) # with en_geresh
81
+ letters: list[Letter] = [Letter(i[0], i[1]) for i in letters]
82
+ syllables: list[Syllable] = self.phonemize_hebrew(letters, predict_shva_na=predict_shva_nah)
83
+ phonemes = "".join(syllable.phones for syllable in syllables)
84
+ if use_post_normalize:
85
+ phonemes = post_normalize(phonemes)
86
+
87
+
88
+ if predict_stress and lexicon.STRESS not in phonemes:
89
+ stressed = []
90
+
91
+ is_milra = True
92
+
93
+ milhel_patterns = ['יים', 'וע', 'טו']
94
+ if syllables and any(remove_niqqud(syllables[-1].chars).endswith(i) for i in milhel_patterns):
95
+ is_milra = False
96
+
97
+ # Iterate through each syllable
98
+ for idx, syllable in enumerate(syllables):
99
+ # If it's the last syllable, add stress
100
+ if not is_milra and idx == len(syllables) - 2:
101
+ stressed.append(f'ˈ{syllable.phones}')
102
+ elif is_milra and idx == len(syllables) - 1:
103
+ stressed.append(f'ˈ{syllable.phones}')
104
+ else:
105
+ stressed.append(syllable.phones)
106
+ phonemes = "".join(stressed)
107
+ phonemes = post_normalize(phonemes)
108
+
109
+ return phonemes
110
+
111
+
112
+ text = re.sub(he_pattern, heb_replace_callback, text)
113
+
114
+ if not preserve_punctuation:
115
+ text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
116
+ if not preserve_stress:
117
+ text = "".join(
118
+ i for i in text if i not in [lexicon.STRESS]
119
+ )
120
+
121
+ def expand_hyper_phonemes(text: str):
122
+ """
123
+ Expand hyper phonemes into normal phonemes
124
+ eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
125
+ """
126
+ def hyper_phonemes_callback(match: re.Match):
127
+ matched_phonemes = match.group(2)
128
+ for c in matched_phonemes:
129
+ lexicon.SET_OUTPUT_CHARACTERS.add(c)
130
+ return matched_phonemes # The phoneme is in the second group
131
+
132
+ text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)
133
+ return text
134
+
135
+ text = expand_hyper_phonemes(text)
136
+ text = "".join(i for i in text if i in lexicon.SET_OUTPUT_CHARACTERS)
137
+
138
+ return text
139
+
140
+ def phonemize_hebrew(self, letters: list[Letter], predict_shva_na: bool) -> list[Syllable]:
141
+ phonemes = []
142
+ i = 0
143
+
144
+
145
+ syllables = []
146
+ cur_syllable = Syllable('', '')
147
+ while i < len(letters):
148
+
149
+ cur = letters[i]
150
+ prev = letters[i - 1] if i > 0 else None
151
+ next = letters[i + 1] if i < len(letters) - 1 else None
152
+ cur_phonemes = []
153
+ skip_diacritics = False
154
+ skip_constants = False
155
+ skip_offset = 0
156
+ # revised rules
157
+
158
+ # יַאלְלָה
159
+ if cur.char == "ל" and cur.diac == "\u05b0" and next and next.char == "ל":
160
+ skip_diacritics = True
161
+ skip_constants = True
162
+
163
+ if (
164
+ cur.char == "ו"
165
+ and not prev
166
+ and next
167
+ and not next.diac
168
+ and cur.char + cur.diac == "וַא"
169
+ ):
170
+ skip_offset += 1
171
+ cur_phonemes.append("wa")
172
+
173
+ if cur.char == "א" and not cur.diac and prev:
174
+ if next and next.char != 'ו':
175
+ skip_constants = True
176
+
177
+ # TODO ?
178
+ if cur.char == "י" and next and not cur.diac and prev.char + prev.diac != 'אֵ':
179
+ skip_constants = True
180
+
181
+ if cur.char == "ש" and "\u05c2" in cur.diac:
182
+ cur_phonemes.append("s")
183
+ skip_constants = True
184
+
185
+ # shin without niqqud after sin = sin
186
+ if cur.char == "ש" and not cur.diac and prev and "\u05c2" in prev.diac:
187
+ cur_phonemes.append("s")
188
+ skip_constants = True
189
+
190
+ if not next and cur.char == "ח" and '\u05b7' in cur.diac:
191
+ # Final Het gnuva
192
+ cur_phonemes.append("ax")
193
+ skip_diacritics = True
194
+ skip_constants = True
195
+
196
+ if cur and "'" in cur.diac and cur.char in lexicon.GERESH_LETTERS:
197
+ if cur.char == "ת":
198
+ cur_phonemes.append(lexicon.GERESH_LETTERS.get(cur.char, ""))
199
+ skip_diacritics = True
200
+ skip_constants = True
201
+ else:
202
+ # Geresh
203
+ cur_phonemes.append(lexicon.GERESH_LETTERS.get(cur.char, ""))
204
+ skip_constants = True
205
+
206
+ elif (
207
+ "\u05bc" in cur.diac and cur.char + "\u05bc" in lexicon.LETTERS_PHONEMES
208
+ ): # dagesh
209
+ cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + "\u05bc", ""))
210
+ skip_constants = True
211
+ elif cur.char == "ו":
212
+ skip_constants = True
213
+ if next and next.char == "ו" and next.diac == cur.diac:
214
+ # patah and next.diac empty
215
+ if cur.diac in ["\u05b7", "\u05b8"] and not next.diac:
216
+ cur_phonemes.append("w")
217
+ skip_diacritics = True
218
+ skip_offset += 1
219
+ elif cur.diac == next.diac:
220
+ # double vav
221
+ cur_phonemes.append("wo")
222
+ skip_diacritics = True
223
+ skip_offset += 1
224
+ else:
225
+ # TODO ?
226
+ # skip_consonants = False
227
+ skip_diacritics = False
228
+ else:
229
+ # Single vav
230
+
231
+ # Vav with Patah
232
+ if re.search("[\u05b7-\u05b8]", cur.diac):
233
+ cur_phonemes.append("va")
234
+
235
+ # Holam haser
236
+ elif "\u05b9" in cur.diac:
237
+ cur_phonemes.append("o")
238
+ # Shuruk / Kubutz
239
+ elif "\u05bb" in cur.diac or "\u05bc" in cur.diac:
240
+ cur_phonemes.append("u")
241
+ # Vav with Shva in start
242
+ elif "\u05b0" in cur.diac and not prev:
243
+ cur_phonemes.append("ve")
244
+ # Hirik
245
+ elif "\u05b4" in cur.diac:
246
+ cur_phonemes.append("vi")
247
+ # Tsere
248
+ elif "\u05b5" in cur.diac:
249
+ cur_phonemes.append("ve")
250
+
251
+ else:
252
+ cur_phonemes.append("v")
253
+ skip_diacritics = True
254
+
255
+ if not skip_constants:
256
+ cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))
257
+
258
+ if predict_shva_na and '\u05b0' in cur.diac and not skip_diacritics and lexicon.SHVA_NA_DIACRITIC not in cur.diac:
259
+ # shva na prediction
260
+ if not prev:
261
+ if cur.char in 'למנרי' or cur.char in 'אהע' or cur.char in 'וכלב':
262
+ cur_phonemes.append("e")
263
+ skip_diacritics = True
264
+ else:
265
+ if next and next.char == cur.char:
266
+ cur_phonemes.append("e")
267
+ skip_diacritics = True
268
+ elif prev and '\u05b0' in prev.diac and phonemes[-1] != 'e':
269
+ cur_phonemes.append("e")
270
+ skip_diacritics = True
271
+
272
+ if '\u05b8' in cur.diac and next and '\u05b3' in next.diac:
273
+ cur_phonemes.append('o')
274
+ skip_diacritics = True
275
+
276
+ niqqud_phonemes = (
277
+ [lexicon.NIQQUD_PHONEMES.get(niqqud, "") for niqqud in cur.diac]
278
+ if not skip_diacritics
279
+ else []
280
+ )
281
+
282
+ cur_phonemes.extend(niqqud_phonemes)
283
+ # Ensure the stress is at the beginning of the syllable
284
+ cur_phonemes.sort(key=lambda x: x != 'ˈ')
285
+ phonemes.extend(cur_phonemes)
286
+
287
+
288
+ if not next:
289
+ cur_syllable.chars += cur.char + cur.diac
290
+ cur_syllable.phones += ''.join(cur_phonemes)
291
+ syllables.append(cur_syllable)
292
+ elif not prev:
293
+ cur_syllable = Syllable(cur.char + cur.diac, ''.join(cur_phonemes))
294
+
295
+ elif len(re.findall('[א-ת]', cur_syllable.chars)) >= 2 and has_vowel(cur_syllable.phones) and cur.diac:
296
+ syllables.append(cur_syllable)
297
+ cur_syllable = Syllable(cur.char + cur.diac, ''.join(cur_phonemes))
298
+
299
+ elif not has_vowel(cur_phonemes):
300
+ cur_syllable.chars += cur.char + cur.diac
301
+ cur_syllable.phones += ''.join(cur_phonemes)
302
+
303
+ elif not has_vowel(cur_syllable.phones):
304
+ cur_syllable.chars += cur.char + cur.diac
305
+ cur_syllable.phones += ''.join(cur_phonemes)
306
+ else:
307
+ syllables.append(cur_syllable)
308
+ cur_syllable = Syllable(cur.char + cur.diac, ''.join(cur_phonemes))
309
+ i += skip_offset + 1
310
+ return syllables
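A sketch of calling the Phonemizer directly with a fallback for Latin-script words; the lambda is a stand-in, not a real grapheme-to-phoneme model:

```python
from mishkal.phonemize import Phonemizer

phonemizer = Phonemizer()
# Latin-script words are routed through the optional `fallback` callable.
result = phonemizer.phonemize(
    "שָׁלוֹם hello",
    use_post_normalize=True,
    predict_stress=True,
    fallback=lambda word: word.lower(),  # toy fallback; a real one would return IPA
)
print(result)
```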
mishkal/utils.py ADDED
@@ -0,0 +1,68 @@
1
+ from mishkal import lexicon
2
+ import unicodedata
3
+ import regex as re
4
+
5
+
6
+ def sort_diacritics(match):
7
+ letter = match.group(1)
8
+ diacritics = "".join(sorted(match.group(2))) # Sort diacritics
9
+ return letter + diacritics
10
+
11
+
12
+ NORMALIZE_PATTERNS = {
13
+ # A letter followed by one or more combining marks: sort the marks into a canonical order
14
+ r"(\p{L})(\p{M}+)": sort_diacritics,
15
+ "״": '"',
16
+ "׳": "'",
17
+ }
18
+
19
+
20
+ def remove_niqqud(text: str):
21
+ return re.sub(lexicon.HE_NIQQUD_PATTERN, "", text)
22
+
23
+
24
+ def has_niqqud(text: str):
25
+ return re.search(lexicon.HE_NIQQUD_PATTERN, text) is not None
26
+
27
+
28
+ def normalize(text: str) -> str:
29
+ """
30
+ Normalize unicode (decomposite)
31
+ Deduplicate niqqud (eg. only Patah instead of Kamatz)
32
+ Keep only Hebrew characters / punctuation / IPA
33
+ Sort diacritics
34
+ """
35
+
36
+ # Decompose text
37
+ text = unicodedata.normalize("NFD", text)
38
+ for k, v in NORMALIZE_PATTERNS.items():
39
+ text = re.sub(k, v, text)
40
+ # Normalize niqqud, remove duplicate phonetics 'sounds' (eg. only Patah)
41
+ for k, v in lexicon.NIQQUD_DEDUPLICATE.items():
42
+ text = text.replace(k, v)
43
+ return text
44
+
45
+
46
+ def post_normalize(phonemes: str):
47
+ new_phonemes = []
48
+ for word in phonemes.split(" "):
49
+ # remove glottal stop from end
50
+ word = re.sub(r"ʔ$", "", word)
51
+ # remove h from end
52
+ word = re.sub(r"h$", "", word)
53
+ word = re.sub(r"ˈh$", "", word)
54
+ # drop j after i ("ij" -> "i")
55
+ word = re.sub(r"ij", "i", word)
56
+ new_phonemes.append(word)
57
+ return " ".join(new_phonemes)
58
+
59
+
60
+ def get_unicode_names(text: str):
61
+ return [unicodedata.name(c, "?") for c in text]
62
+
63
+ def has_vowel(s) -> bool:
64
+ return any(i in s for i in 'aeiou')
65
+
66
+ def has_constant(s) -> bool:
67
+ return any(i not in 'aeiou' for i in s)
68
+
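A couple of illustrative calls into these helpers:

```python
from mishkal.utils import normalize, remove_niqqud, post_normalize

word = "שָׁלוֹם"
print(remove_niqqud(normalize(word)))  # bare letters only: שלום
print(post_normalize("ʃalomh"))        # a word-final h is stripped for TTS-friendly output
```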
mishkal/variants.py ADDED
@@ -0,0 +1,15 @@
1
+ class Letter:
2
+ def __init__(self, char: str, diac: list[str]):
3
+ self.char = char
4
+ self.diac = diac
5
+
6
+ def __repr__(self):
7
+ return f"{self.char}{''.join(self.diac)}"
8
+
9
+ class Syllable:
10
+ def __init__(self, chars, phones):
11
+ self.chars = chars
12
+ self.phones = phones
13
+
14
+ def __repr__(self):
15
+ return f'{self.chars}: {self.phones}'
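These two containers are what phonemize_hebrew builds up; a tiny construction sketch:

```python
from mishkal.variants import Letter, Syllable

lamed = Letter("ל", "\u05b8")                 # a base letter plus its combining diacritics
syllable = Syllable(chars="לָ", phones="la")  # grapheme span paired with its phoneme string
print(lamed, syllable)
```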
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ gradio>=5.15.0
2
+ num2words
3
+ colorlog
4
+ regex