thewh1teagle commited on
Commit
96c9fde
·
0 Parent(s):
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Phonemize in Hebrew
3
+ emoji: 🐢
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ uv sync
3
+ uv pip install "gradio>=5.15.0"
4
+ uv run gradio app.py
5
+ """
6
+
7
+ from mishkal import phonemize, normalize
8
+ import gradio as gr
9
+
10
+ default_text = """
11
+ כָּל עֶרֶב יָאִיר (הַשֵּׁם הַמָּלֵא וּמְקוֹם הָעֲבוֹדָה שֶׁלּוֹ שְׁמוּרִים בַּמַּעֲרֶכֶת) רָץ 20 קִילוֹמֶטֶר. הוּא מְסַפֵּר לִי שֶׁזֶּה מְנַקֶּה לוֹ אֶת הָרֹאשׁ אַחֲרֵי הָעֲבוֹדָה, "שָׁעָה וָחֵצִי בְּלִי עֲבוֹדָה, אִשָּׁה וִילָדִים" כְּמוֹ שֶׁהוּא מַגְדִּיר זֹאת. אֲבָל אַחֲרֵי הַמִּקְלַחַת הוּא מַתְחִיל בְּמָה שֶׁנִּתָּן לְכַנּוֹת הָעֲבוֹדָה הַשְּׁנִיָּה שֶׁלּוֹ: לִמְצֹא לוֹ קוֹלֵגוֹת חֲדָשׁוֹת לָעֲבוֹדָה, כִּי יָאִיר הוּא כַּנִּרְאֶה הַמֶּלֶךְ שֶׁל "חָבֵר מֵבִיא חָבֵר" בְּיִשְׂרָאֵל.
12
+ """
13
+
14
+ theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])
15
+
16
+
17
def on_submit_debug(text: str) -> str:
    """Return the phonemes of *text* followed by its normalized form.

    Punctuation is preserved in the phonemes. The two sections are separated
    by a blank line and a ``Normalized:`` heading.
    """
    sections = (
        phonemize(text, preserve_punctuation=True),
        normalize(text),
    )
    return "\n\nNormalized:\n".join(sections)
21
+
22
+
23
def on_submit(text: str) -> str:
    """Return the phonemes of *text* with punctuation stripped."""
    phonemes = phonemize(text, preserve_punctuation=False)
    return phonemes
25
+
26
+
27
# Build the Gradio UI: a right-to-left text input, a debug toggle, an output
# box for the phonemes and a button that triggers the conversion.
with gr.Blocks(theme=theme) as demo:
    text_input = gr.Textbox(
        value=default_text, label="Text", rtl=True, elem_classes=["input"]
    )
    checkbox = gr.Checkbox(value=False, label="Enable Debug Mode")
    phonemes_output = gr.Textbox(label="Phonemes")
    submit_button = gr.Button("Create")

    # The checkbox value selects the handler: the debug handler also appends
    # the normalized text, the regular one returns the phonemes only.
    submit_button.click(
        fn=lambda text, debug: on_submit_debug(text) if debug else on_submit(text),
        inputs=[text_input, checkbox],
        outputs=[phonemes_output],
    )


# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
mishkal/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ High level phonemize functions
3
+ """
4
+
5
+ from .phonemize import Phonemizer
6
+ from .utils import normalize # noqa: F401
7
+ from typing import Callable
8
+
9
+ phonemizer = Phonemizer()
10
+
11
+
12
+ def phonemize(
13
+ text: str,
14
+ preserve_punctuation=True,
15
+ preserve_stress=True,
16
+ fallback: Callable[[str], str] = None,
17
+ ) -> str:
18
+ phonemes = phonemizer.phonemize(
19
+ text,
20
+ preserve_punctuation=preserve_punctuation,
21
+ preserve_stress=preserve_stress,
22
+ fallback=fallback,
23
+ )
24
+ return phonemes
mishkal/data/kamatz_katan.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "כל": "kol"
3
+ }
mishkal/data/rashej_tevot.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "צה״ל": "ˈtsahal"
3
+ }
mishkal/data/symbols.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "₪": "ʃeˈkel",
3
+ "$": "doˈlar"
4
+ }
mishkal/expander/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Expand dates and numbers into words with niqqud
3
+ This happens before phonemization
4
+ """
5
+
6
+ from .numbers import num_to_word
7
+ from .dates import date_to_word
8
+ from .time_to_word import time_to_word
9
+ from .dictionary import Dictionary
10
+ from mishkal.log import log
11
+
12
+
13
+ class Expander:
14
+ def __init__(self):
15
+ self.dictionary = Dictionary()
16
+
17
+ def expand_text(self, text: str):
18
+ text = self.dictionary.expand_text(text)
19
+
20
+ words = []
21
+ for source_word in text.split():
22
+ try:
23
+ word = date_to_word(source_word)
24
+ if word == source_word:
25
+ word = time_to_word(word)
26
+ if word == source_word:
27
+ word = num_to_word(word)
28
+ words.append(word)
29
+ except Exception as e:
30
+ log.error(f"Failed to expand {word} with error: {e}")
31
+ words.append(source_word)
32
+ return " ".join(words)
mishkal/expander/dates.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from .numbers import num_to_word
3
+
4
+ # Mapping of month names in Hebrew with diacritics (Gregorian months)
5
+ MONTHS = {
6
+ 1: "יָנוּאָר",
7
+ 2: "פֶבְרוּאָר",
8
+ 3: "מֵרְץ",
9
+ 4: "אֵפְרִיל",
10
+ 5: "מַאי",
11
+ 6: "יוּנִי",
12
+ 7: "יוּלִי",
13
+ 8: "אוֹגֻסְט",
14
+ 9: "סֶפְּטֶמְבֶּר",
15
+ 10: "אוֹקְטוֹבֶּר",
16
+ 11: "נוֹבֶמְבֶּר",
17
+ 12: "דֶּצֶמְבֶּר",
18
+ }
19
+
20
+ # Mapping of day names in Hebrew with diacritics
21
+ DAYS = {
22
+ 0: "יוֹם רִאשׁוֹן",
23
+ 1: "יוֹם שֵׁנִי",
24
+ 2: "יוֹם שְׁלִישִׁי",
25
+ 3: "יוֹם רֵבִיעִי",
26
+ 4: "יוֹם חֲמִישִׁי",
27
+ 5: "יוֹם שִׁישִׁי",
28
+ 6: "יוֹם שַׁבָּת",
29
+ }
30
+
31
+
32
def date_to_word(word: str, include_day_name=False) -> str:
    """Convert a numeric date string into Hebrew words with diacritics.

    Accepted layouts are year-first (YYYY-MM-DD) and day-first (DD-MM-YYYY),
    each with ``-``, ``.`` or ``/`` as the separator.

    Args:
        word: Candidate date string.
        include_day_name: When true, prefix the result with the weekday name.

    Returns:
        The spelled-out date, or *word* unchanged when it does not parse as a
        date in any accepted layout.
    """
    separators = ["-", ".", "/"]
    orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
    date_formats = [sep.join(order) for order in orders for sep in separators]

    for date_format in date_formats:
        try:
            # Try parsing the word with each date format
            date_obj = datetime.strptime(word, date_format)

            # DAYS is Sunday-first (0 = Yom Rishon) while weekday() is
            # Monday-first (0 = Monday), so shift by one day.
            day_name = DAYS[(date_obj.weekday() + 1) % 7]

            # Convert month to Hebrew name with diacritics
            month_name = MONTHS[date_obj.month]
            day = num_to_word(str(date_obj.day))
            year = num_to_word(str(date_obj.year))

            text = f"{day} בֵּ{month_name} {year}"
            if include_day_name:
                text = f"{day_name}, {text}"
            return text
        except ValueError:
            continue
    return word
mishkal/expander/dictionary.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dictionaries are JSON files that map a source word to its replacement
3
+ """
4
+
5
+ from pathlib import Path
6
+ import json
7
+ import re
8
+ from mishkal.utils import remove_niqqud
9
+ from mishkal.utils import normalize
10
+ import unicodedata
11
+
# Every JSON file in the sibling ``data`` directory is a dictionary source.
files = Path(__file__).parent.joinpath("../data").glob("*.json")
# Sort ascending by quality tier, taken from the file stem: files naming no
# tier rank 0, then bronze, silver, gold. Dictionary.load_dictionaries merges
# the files in this order with dict.update, so entries from files later in
# the list overwrite entries from earlier ones.
order = {"bronze": 1, "silver": 2, "gold": 3}
files = sorted(
    files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
)
18
+
19
+
class Dictionary:
    """Word-replacement table loaded from the JSON files listed in ``files``."""

    def __init__(self):
        self.dict = {}
        self.load_dictionaries()

    def load_dictionaries(self):
        """Merge every dictionary file into ``self.dict``.

        Keys are passed through ``normalize``; entries whose normalized key or
        whose value is empty are dropped.
        """
        for path in files:
            with open(path, "r", encoding="utf-8") as handle:
                entries: dict = json.load(handle)
            cleaned = {}
            for raw_key, value in entries.items():
                key = normalize(raw_key)
                if not key or not value:
                    continue
                cleaned[key] = value
            self.dict.update(cleaned)

    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
        """Look up a matched Hebrew run and return its replacement.

        The NFD-decomposed text is tried first, then the text without niqqud,
        then the normalized text. The decomposed text is returned when none of
        the lookups yields a non-empty value.
        """
        decomposed: str = unicodedata.normalize("NFD", match.group(0))
        candidates = (
            decomposed,
            remove_niqqud(decomposed),
            normalize(decomposed),
        )
        for key in candidates:
            replacement = self.dict.get(key)
            if replacement:
                return replacement
        return decomposed

    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
        """Replace one whitespace-delimited token.

        Numeric tokens are returned untouched. Otherwise the whole token is
        looked up; if that misses, each run of Hebrew letters, niqqud, spaces
        and apostrophes inside the token is looked up on its own.
        """
        token: str = match.group(0)
        if token.isnumeric():
            return token

        direct_hit = self.dict.get(token)
        if direct_hit:
            return direct_hit

        return re.sub(
            r"[\u05B0-\u05EB ']+", self.replace_hebrew_only_callback, token
        )

    def expand_text(self, text: str) -> str:
        """
        Apply the dictionary to every non-whitespace token of *text*.

        TODO: if key doesn't have diacritics expand even diacritized words
        """
        return re.sub(r"\S+", self.replace_non_whitespace_callback, text)
mishkal/expander/number_names.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
3
+ """
4
+
5
+ ZERO = {"אפס": "אֶפֶס"}
6
+
7
+
8
+ ONES = {
9
+ "אחת": "אַחַת",
10
+ "אחד": "אֶחָד",
11
+ "ראשונה": "רִאשׁוֹנָה",
12
+ "ראשון": "רִאשׁוֹן",
13
+ "ראשונות": "רִאשׁוֹנוֹת",
14
+ "ראשונים": "רִאשׁוֹנִים",
15
+ "שתיים": "שְׁתַּיִם",
16
+ "שניים": "שְׁנַיִם",
17
+ "שתי": "שְׁתֵּי",
18
+ "שני": "שְׁנֵי",
19
+ "שנייה": "שְׁנִיָּה",
20
+ "שניות": "שְׁנִיּוֹת",
21
+ "שלוש": "שָׁלוֹשׁ",
22
+ "שלושה": "שְׁלוֹשָׁה",
23
+ "שלושת": "שְׁלוֹשֶׁת",
24
+ "שלישית": "שְׁלִישִׁית",
25
+ "שלישי": "שְׁלִישִׁי",
26
+ "שלישיות": "שְׁלִישִׁיּוֹת",
27
+ "שלישיים": "שְׁלִישִׁיִּים",
28
+ "ארבע": "אַרְבַּע",
29
+ "ארבעה": "אַרְבַּעָה",
30
+ "ארבעת": "אַרְבַּעַת",
31
+ "רביעית": "רֵבִיעִית",
32
+ "רביעי": "רֵבִיעִי",
33
+ "רביעיות": "רֵבִיעִיוֹת",
34
+ "רביעיים": "רֵבִיעִיִּים",
35
+ "חמש": "חָמֵשׁ",
36
+ "חמישה": "חֲמִשָּׁה",
37
+ "חמשת": "חֲמֵשֶׁת",
38
+ "חמישית": "חֲמִישִּׁית",
39
+ "חמישי": "חֲמִישִּׁי",
40
+ "חמישיות": "חֲמִישִּׁיוֹת",
41
+ "חמישיים": "חֲמִישִּׁיִּים",
42
+ "שש": "שֵׁשׁ",
43
+ "שישה": "שִׁשָּׁה",
44
+ "ששת": "שֵׁשֶׁת",
45
+ "שישית": "שִׁשִּׁית",
46
+ "שישי": "שִׁשִּׁי",
47
+ "שישיות": "שִׁשִּׁיוֹת",
48
+ "שישיים": "שִׁשִּׁיִּים",
49
+ "שבע": "שֶׁבַע",
50
+ "שבעה": "שִׁבְעָה",
51
+ "שבעת": "שִׁבְעַת",
52
+ "שביעית": "שְׁבִיעִית",
53
+ "שביעי": "שְׁבִיעִי",
54
+ "שביעיות": "שְׁבִיעִיוֹת",
55
+ "שביעיים": "שְׁבִיעִיִּים",
56
+ "שמונה": "שְׁמוֹנֶה",
57
+ "שמונת": "שְׁמוֹנַת",
58
+ "שמינית": "שְׁמִינִית",
59
+ "שמיני": "שְׁמִינִי",
60
+ "שמיניות": "שְׁמִינִיוֹת",
61
+ "שמיניים": "שְׁמִינִיִּים",
62
+ "תשע": "תֵּשַׁע",
63
+ "תשעה": "תִּשְׁעָה",
64
+ "תשעת": "תִּשְׁעַת",
65
+ "תשיעית": "תְּשִׁיעִית",
66
+ "תשיעי": "תְּשִׁיעִי",
67
+ "תשיעיות": "תְּשִׁיעִיּוֹת",
68
+ "תשיעיים": "תְּשִׁיעִיִּים",
69
+ }
70
+
71
+
72
+ TENS = {
73
+ "עשר": "עֶשֶׂר",
74
+ "עשרה": "עֲשָׁרָה",
75
+ "עשרת": "עֲשֶׁרֶת",
76
+ "עשירית": "עֲשִׁירִית",
77
+ "עשירי": "עֲשִׁירִי",
78
+ "עשיריות": "עֲשִׁירִיוֹת",
79
+ "עשיריים": "עֲשִׁירִיִּים",
80
+ "שתים עשרה": "שְׁתֵּים עֶשְׂרֵה",
81
+ "שנים עשר": "שְׁנֵים עָשָׂר",
82
+ }
83
+
84
+
85
+ TWENTIES = {
86
+ "עשרים": "עֶשְׂרִים",
87
+ "שלושים": "שְׁלוֹשִׁים",
88
+ "ארבעים": "אַרְבָּעִים",
89
+ "חמישים": "חֲמִשִּׁים",
90
+ "שישים": "שִׁשִּׁים",
91
+ "שבעים": "שִׁבְעִים",
92
+ "שמונים": "שְׁמוֹנִים",
93
+ "תשעים": "תִּשְׁעִים",
94
+ }
95
+
96
+
97
+ HUNDREDS = {
98
+ "מאה": "מֵאָה",
99
+ "מאת": "מֵאַת",
100
+ "מאתיים": "מָאתַיִם",
101
+ "מאות": "מֵאוֹת",
102
+ }
103
+
104
+ THOUSANDS = {
105
+ "אלף": "אֶלֶף",
106
+ "אלפיים": "אַלְפַּיִם",
107
+ "אלפים": "אֲלָפִים",
108
+ "אלפי": "אַלְפִּי",
109
+ }
110
+
111
+
112
+ LARGE = {
113
+ "מיליון": "מִילְיוֹן",
114
+ "מיליוני": "מִילְיוֹנִי",
115
+ "מיליארד": "מִילְיַארְד",
116
+ "מיליארדי": "מִילְיַארְדִּי",
117
+ "טריליון": "טְרִילְיוֹן",
118
+ "טריליוני": "טְרִילְיוֹנִי",
119
+ "קוודריליון": "קוֹוַדְרִילְיוֹן",
120
+ "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
121
+ "קווינטיליון": "קוִוִּנְטִילְיוֹן",
122
+ "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
123
+ "סקסטיליון": "סְקֶסְטִילְיוֹן",
124
+ "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
125
+ "ספטיליון": "סְפֶּטִילְיוֹן",
126
+ "ספטיליוני": "סְפֶּטִילְיוֹנִי",
127
+ "אוקטיליון": "אוֹקְטִילְיוֹן",
128
+ "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
129
+ "נוניליון": "נוּנִילְיוֹן",
130
+ "נוניליוני": "נוּנִילְיוֹנִי",
131
+ "דסיליון": "דֶּסִילְיוֹן",
132
+ "דסיליוני": "דֶּסִילְיוֹנִי",
133
+ "אונדסיליון": "אוּנְדְסִילְיוֹן",
134
+ "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
135
+ "דואודסיליון": "דוּאודְסִילְיוֹן",
136
+ "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
137
+ "טרדסיליון": "טֶרְדְסִילְיוֹן",
138
+ "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
139
+ "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
140
+ "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
141
+ "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
142
+ "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
143
+ "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
144
+ "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
145
+ "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
146
+ "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
147
+ "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
148
+ "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
149
+ "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
150
+ "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
151
+ "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
152
+ "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
153
+ }
154
+
155
+
156
+ LETTERS = {
157
+ "ו": "וֵ",
158
+ "ה": "הַ",
159
+ }
160
+
161
+
162
+ CURRENCY = {
163
+ "שקל": "שֵׁקֶל",
164
+ "שקלים": "שְׁקָלִים",
165
+ "אגורה": "אֲגוֹרָה",
166
+ "אגורות": "אֲגוֹרוֹת",
167
+ "אירו": "אֵירוֹ",
168
+ "סנט": "סֵנְט",
169
+ "סנטים": "סֵנְטִים",
170
+ "דולר": "דוֹלָר",
171
+ "דולרים": "דוֹלָרִים",
172
+ }
173
+
174
+
175
+ POINTS = {
176
+ "מינוס": "מִינּוּס",
177
+ "נקודה": "נְקֻדָּה",
178
+ }
179
+
180
+ NUMBER_NAMES = {
181
+ **CURRENCY,
182
+ **HUNDREDS,
183
+ **LARGE,
184
+ **LETTERS,
185
+ **ONES,
186
+ **POINTS,
187
+ **TENS,
188
+ **THOUSANDS,
189
+ **TWENTIES,
190
+ **ZERO,
191
+ }
mishkal/expander/numbers.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import num2words
2
+ from .number_names import NUMBER_NAMES
3
+ import re
4
+
5
+
6
def add_diacritics(words: str):
    """Replace each number word in *words* with its diacritized spelling.

    A word is looked up in ``NUMBER_NAMES`` as a whole first. Failing that, it
    is split into a one-letter prefix and a stem, and both parts must be
    known. Words that cannot be resolved either way are kept unchanged.

    Returns:
        The processed words joined by single spaces.
    """
    new_words = []
    for word in words.split():
        whole = NUMBER_NAMES.get(word)
        if whole:
            new_words.append(whole)
            continue
        # Prefixed form: require BOTH parts to be known. Indexing the prefix
        # directly raised KeyError for any prefix letter missing from the table.
        prefix = NUMBER_NAMES.get(word[:1])
        stem = NUMBER_NAMES.get(word[1:])
        if prefix and stem:
            new_words.append(prefix + stem)
        else:
            new_words.append(word)
    return " ".join(new_words)
17
+
18
+
19
def num_to_word(maybe_number: str) -> str:
    """Spell out every run of digits in *maybe_number* as Hebrew words.

    Each digit run is converted with ``num2words`` (Hebrew, cardinal) and then
    passed through ``add_diacritics``. Text without digits is returned
    unchanged.
    """

    def spell_out(digits_match):
        plain_words = num2words.num2words(
            digits_match.group(), lang="he", ordinal=False
        )
        return add_diacritics(plain_words)

    return re.sub(r"\d+", spell_out, maybe_number)
mishkal/expander/time_to_word.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Convert time to words
3
+ TODO: fix zeros eg. 22:00
4
+ """
5
+
6
+ import re
7
+
8
+ PATTERNS = [
9
+ r"(\d{1,2})([apm]{2})", # AM/PM format
10
+ r"(\d{1,2}):(\d{2})", # HH:MM format
11
+ ]
12
+
13
+
def extract_time(match):
    """
    Convert a regex match holding a time (HH:MM or H[am|pm]) into words.

    Returns the result of ``convert_to_word`` for a valid time. The matched
    text is returned unchanged when it is not a usable time: an hour above 24
    or minutes above 59 in HH:MM form, a suffix other than "am"/"pm", or an
    hour above 12 in am/pm form.
    """
    original = match.group(0)
    time_str = original.lower().strip()

    # Check for HH:MM format
    clock = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if clock:
        h = int(clock.group(1))
        m = int(clock.group(2))
        # Out-of-range values would index past the word tables.
        if h > 24 or m > 59:
            return original
        return f"{convert_to_word(h, m)}"

    # Check for AM/PM format
    meridiem = re.match(r"(\d{1,2})([apm]{2})", time_str)
    if meridiem:
        h = int(meridiem.group(1))
        period = meridiem.group(2)

        # The pattern also accepts suffixes such as "mm" or "pa"; those are
        # not times (e.g. "5mm") and must be left alone.
        if period not in ("am", "pm") or h > 12:
            return original

        # Normalize to 24-hour format
        if period == "am" and h == 12:
            h = 0
        elif period == "pm" and h != 12:
            h += 12
        return f"{convert_to_word(h, 0)}"  # Defaulting to 0 minutes when only hour is provided

    # Keep the original text if the format is not recognized. `match` must not
    # be rebound above, otherwise this would dereference None.
    return original
42
+
43
+
def convert_to_word(h, m):
    """
    Spell out a clock time in Hebrew words with diacritics.

    Args:
        h: Hour; 0 is read as 12 and values above 12 are reduced by 12.
        m: Minutes.

    Returns:
        The hour word alone when ``m`` is 0, otherwise the hour word followed
        by the minutes and the word for "minutes", separated by single spaces.
    """
    hours = [
        "אֶפֶס",
        "אַחַת",
        "שְׁנַיִם",  # Swapped for the "shtey" form for exactly two minutes
        "שָׁלוֹשׁ",
        "אַרְבַּע",
        "חָמֵשׁ",
        "שֵׁשׁ",
        "שֶׁבַע",
        "שְׁמוֹנֵה",
        "תֵּשַׁע",
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
    ]

    tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]

    ten_to_twenty = [
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
        "שְׁלוֹשׁ עֶשְׂרֵה",
        "אַרְבַּע עֶשְׂרֵה",
        "חֲמֵשׁ עֶשְׂרֵה",
        "שֵׁשׁ עֶשְׂרֵה",
        "שְׁבַע עֶשְׂרֵה",
        "שְׁמוֹנֶה עֶשְׂרֵה",
        "תְּשַׁע עֶשְׂרֵה",
    ]

    vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}

    # Convert 0 hours to 12 (midnight)
    if h == 0:
        h = 12

    elif h > 12:
        h -= 12

    if m == 0:
        return f"{hours[h]}"

    elif 1 <= m <= 9:
        # Exactly two minutes uses the "shtey" form instead of hours[2]
        minute_word = vocab["shtey"] if m == 2 else hours[m]
        return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"

    elif 10 <= m <= 19:
        return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"

    else:
        tens_part = f"{vocab['and']}{tens[m // 10]}"
        units_part = f"{vocab['and']}{hours[m % 10]}" if m % 10 != 0 else ""
        # Join only the non-empty parts: interpolating an empty units part
        # left a double space for round minutes such as 20 or 30.
        parts = [hours[h], tens_part, units_part, vocab["minutes"]]
        return " ".join(part for part in parts if part)
101
+
102
+
def time_to_word(text: str):
    """Replace every time found in *text* (any of ``PATTERNS``) with words."""
    combined_pattern = "|".join(PATTERNS)
    return re.sub(combined_pattern, extract_time, text)
mishkal/lexicon.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ASCII IPA transcription of Hebrew consonants and vowels.
3
+ """
4
+
5
+ # https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
6
+ HE_CHARS_PATTERN = (
7
+ r"\b[\u05B0-\u05EA\u05F3\u0027]+\b" # Chars including niqqud, geresh and en_geresh
8
+ )
9
+ HE_NIQQUD_PATTERN = r"[\u05B0-\u05C7]"
10
+ PUNCTUATION = r".,!? "
11
+
12
+ # Special
13
+ GIMEL_OR_ZAIN_WITH_DAGESH = "dʒ"
14
+ TSADIK_WITH_DAGESH = "tʃ"
15
+ SHIN_WITH_POINT = "ʃ"
16
+ SIN_WITH_POINT = "s"
17
+ STRESS = "\u02c8" # visually looks like '
18
+ SECONDARY_STRESS = "\u02cc"
19
+ HET_GNUVA = "ax"
20
+ W_AS_WALLA = "w"
21
+
22
+ LETTERS_NAMES_PHONEMES = {
23
+ "א": "alef", # Alef, glottal stop
24
+ "ב": "bet", # Bet
25
+ "ג": "gimel", # Gimel
26
+ "ד": "dalet", # Dalet
27
+ "ה": "hej", # He
28
+ "ו": "vav", # Vav
29
+ "ז": "zajin", # Zayin
30
+ "ח": "xet", # Het
31
+ "ט": "tet", # Tet
32
+ "י": "jud", # Yod
33
+ "ך": "xaf sofit", # Haf sofit
34
+ "כ": "xaf", # Haf
35
+ "ל": "lamed", # Lamed
36
+ "ם": "mem sofit", # Mem Sofit
37
+ "מ": "mem", # Mem
38
+ "ן": "nun sofit", # Nun Sofit
39
+ "נ": "nun", # Nun
40
+ "ס": "samex", # Samekh
41
+ "ע": "ajin", # Ayin, glottal stop
42
+ "פ": "fey", # Fey
43
+ "ף": "fey sofit", # Fey Sofit
44
+ "ץ": "tsadik sofit", # Tsadik sofit
45
+ "צ": "tsadik", # Tsadik
46
+ "ק": "kuf", # Kuf
47
+ "ר": "rejiʃ", # Resh
48
+ "ש": "ʃin", # Shin
49
+ "ת": "taf", # Taf
50
+ }
51
+
52
+ # Consonants
53
+ LETTERS_PHONEMES = {
54
+ "א": "ʔ", # Alef
55
+ "ב": "v", # Bet
56
+ "ג": "g", # Gimel
57
+ "ד": "d", # Dalet
58
+ "ה": "h", # He
59
+ "ו": "v", # Vav
60
+ "ז": "z", # Zayin
61
+ "ח": "x", # Het
62
+ "ט": "t", # Tet
63
+ "י": "j", # Yod
64
+ "ך": "x", # Haf sofit
65
+ "כ": "x", # Haf
66
+ "ל": "l", # Lamed
67
+ "ם": "m", # Mem Sofit
68
+ "מ": "m", # Mem
69
+ "ן": "n", # Nun Sofit
70
+ "נ": "n", # Nun
71
+ "ס": "s", # Samekh
72
+ "ע": "ʔ", # Ayin, only voweled
73
+ "פ": "f", # Fey
74
+ "ף": "f", # Fey Sofit
75
+ "ץ": "ts", # Tsadik sofit
76
+ "צ": "ts", # Tsadik
77
+ "ק": "k", # Kuf
78
+ "ר": "r", # Resh
79
+ "ש": "ʃ", # Shin
80
+ "ת": "t", # Taf
81
+ # Beged Kefet
82
+ "בּ": "b",
83
+ "כּ": "k",
84
+ "פּ": "p",
85
+ "שׁ": "ʃ",
86
+ "שׂ": "s",
87
+
88
+ }
89
+
90
+ # Vowels
91
+ VOWEL_A = "a"
92
+ VOWEL_E = "e"
93
+ VOWEL_I = "i"
94
+ VOWEL_O = "o"
95
+ VOWEL_U = "u"
96
+
97
+ NIQQUD_PHONEMES = {
98
+ "\u05b4": "i", # Hiriq
99
+ "\u05b5": "e", # Tsere
100
+ "\u05b7": "a", # Patah
101
+ "\u05b9": "o", # Holam
102
+ "\u05ba": "o", # Holam haser for vav
103
+ "\u05bb": "u", # Qubuts
104
+ "\u05ab": "ˈ", # Stress (Atmaha)
105
+ "\u05bd": "e" # Shva na
106
+ }
107
+
108
+ SET_LETTER_SYMBOLS = {
109
+ "\u05b0", # Shva
110
+ "\u05b4", # Hiriq
111
+ "\u05b5", # Tsere
112
+ "\u05b7", # Patah
113
+ "\u05b9", # Holam
114
+ "\u05ba", # Holam haser for vav
115
+ "\u05bb", # Qubuts
116
+ "\u05bc", # Dagesh
117
+ "\u05c1", # Shin dot
118
+ "\u05c2", # Sin dot
119
+ "'", # Geresh
120
+ }
121
+
122
+ """
123
+ We're left with the following niqqud (10):
124
+ Shva, Hiriq, Tsere, Patah, Holam, Qubuts, Dagesh,
125
+ Holam haser for vav, Shin dot, Sin dot
126
+ """
127
+ NIQQUD_DEDUPLICATE = {
128
+ "\u05b1": "\u05b5", # Hataf Segol -> Tsere
129
+ "\u05b2": "\u05b7", # Hataf Patah -> Patah
130
+ "\u05b3": "\u05b9", # Hataf Qamats -> Holam
131
+ "\u05b6": "\u05b5", # Segol -> Tsere
132
+ # Kamatz -> Patah
133
+ "\u05b8": "\u05b7", # Qamats -> Patah
134
+ "\u05c7": "\u05b9", # Qamats Qatan -> Holam
135
+ "\u05f3": "'", # Hebrew geresh to regular geresh
136
+ }
137
+
138
+
139
+ SET_OUTPUT_CHARACTERS = set(
140
+ [*GIMEL_OR_ZAIN_WITH_DAGESH, TSADIK_WITH_DAGESH, SHIN_WITH_POINT, SIN_WITH_POINT]
141
+ + [STRESS, SECONDARY_STRESS]
142
+ + list(LETTERS_PHONEMES.values())
143
+ + list(NIQQUD_PHONEMES.values())
144
+ + [VOWEL_A, VOWEL_E, VOWEL_I, VOWEL_O, VOWEL_U]
145
+ + list(PUNCTUATION)
146
+ )
147
+
148
+ SET_NIQQUD = {
149
+ # Shva, Hiriq, Tsere, Patah, Holam, Holam haser for vav, Qubuts, Dagesh, Shin dot, Sin dot
150
+ "\u05b0",
151
+ "\u05b4",
152
+ "\u05b5",
153
+ "\u05b7",
154
+ "\u05b9",
155
+ "\u05ba",
156
+ "\u05bb",
157
+ "\u05bc",
158
+ "\u05c1",
159
+ "\u05c2",
160
+
161
+ # shva na and atmaha
162
+ '\u05bd',# shva na
163
+ '\u05ab' # atmaha
164
+ }
165
+ SET_LETTERS = set(LETTERS_PHONEMES.keys())
166
+ SET_PUNCTUATION = set(PUNCTUATION)
167
+
168
+
169
+ # Set for fast lookup
170
+ SET_INPUT_CHARACTERS = set(
171
+ list(LETTERS_PHONEMES.keys()) + list(SET_NIQQUD) + list(PUNCTUATION) + ["'"]
172
+ )
mishkal/log.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import colorlog
4
+
5
+
6
def _create_logger():
    """
    Build the package logger with a colorized stream handler.

    The level is read from the LOG_LEVEL environment variable and falls back
    to WARNING when the variable is unset or does not name an attribute of
    the ``logging`` module.
    Usage: LOG_LEVEL=DEBUG python <script.py>
    """
    palette = {
        "DEBUG": "blue",
        "INFO": "green",
        "WARNING": "yellow",
        "ERROR": "red",
        "CRITICAL": "red",
    }
    formatter = colorlog.ColoredFormatter(
        fmt="%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s",
        log_colors=palette,
    )
    stream_handler = colorlog.StreamHandler()
    stream_handler.setFormatter(formatter)

    # Resolve the requested level name against the logging module
    requested = os.getenv("LOG_LEVEL", "WARNING").upper()
    level = getattr(logging, requested, logging.WARNING)

    package_logger = colorlog.getLogger(__package__)
    package_logger.setLevel(level=level)
    # NOTE(review): no stream is passed to the handler, so output goes to the
    # handler's default stream - confirm whether stdout was intended.
    package_logger.addHandler(stream_handler)
    return package_logger
33
+
34
+
35
+ log = _create_logger()
mishkal/phonemize.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The actual letters phonemization happens here.
3
+ Phonemes generated based on rules.
4
+
5
+ Early rules:
6
+ 1. Niqqud malle vowels
7
+ 2. Dagesh (custom beged kefet)
8
+ 3. Final letter without niqqud
9
+ 4. Final Het gnuva
10
+ 5. Geresh (Gimel, Tsadik, Zain)
11
+ 6. Shva nax and na
12
+ Revised rules:
13
+ 1. Consonants
14
+ 2. Niqqud
15
+
16
+ Reference:
17
+ - https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
18
+ - https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
19
+ - https://en.wikipedia.org/wiki/Help:IPA/Hebrew
20
+ """
21
+
22
+ from mishkal import lexicon
23
+ from .expander import Expander
24
+ from mishkal.utils import get_unicode_names, normalize, post_normalize
25
+ from typing import Callable
26
+ import regex as re
27
+
28
class Phonemizer:
    """Rule-based converter from Hebrew text with niqqud to phonemes."""

    def __init__(self):
        # Expands dictionary entries, dates, times and numbers before the
        # letters are phonemized.
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation=True,
        preserve_stress=True,
        fallback: Callable[[str], str] = None,
    ) -> str:
        """Convert *text* to phonemes.

        Steps, in order: normalize the text, run *fallback* over Latin-letter
        words (when given), expand the text, phonemize every Hebrew run,
        optionally drop punctuation and stress marks, post-normalize, and
        finally drop every character not in ``lexicon.SET_OUTPUT_CHARACTERS``.

        Args:
            text: Input text.
            preserve_punctuation: When false, characters from
                ``lexicon.PUNCTUATION`` other than the space are removed.
            preserve_stress: When false, primary and secondary stress marks
                are removed.
            fallback: Optional callable applied to each run of ASCII letters;
                it receives the word and returns its phonemes.

        Returns:
            The phoneme string.
        """
        # normalize
        text = normalize(text)
        # TODO: is that enough? what if there's punctuation around? other chars?
        he_pattern = r"[\u05b0-\u05ea\u05ab\u05bd]+"
        fallback_pattern = r"[a-zA-Z]+"

        def fallback_replace_callback(match: re.Match):
            word = match.group(0)
            if self.expander.dictionary.dict.get(word):
                # skip
                # TODO: better API
                # The word has a dictionary entry: leave it for the expander.
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            # Whitelist the fallback's characters so the final output filter
            # keeps them. This mutates the shared lexicon set.
            for c in phonemes:
                lexicon.SET_OUTPUT_CHARACTERS.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(fallback_pattern, fallback_replace_callback, text)
        text = self.expander.expand_text(text)
        self.fallback = fallback

        def heb_replace_callback(match: re.Match):
            word = match.group(0)

            word = normalize(word)
            # Keep only known letters and niqqud
            word = "".join(
                i for i in word if i in lexicon.SET_LETTERS or i in lexicon.SET_NIQQUD
            )
            # Pair every letter with the combining marks that follow it
            letters = re.findall(r'(\p{L})(\p{M}*)', word)
            phonemes = self.phonemize_hebrew(letters)
            return "".join(phonemes)

        text = re.sub(he_pattern, heb_replace_callback, text)

        if not preserve_punctuation:
            # Spaces are kept even though they are listed in PUNCTUATION
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(
                i for i in text if i not in [lexicon.STRESS, lexicon.SECONDARY_STRESS]
            )
        text = post_normalize(text)
        # Drop anything that is not a known output character
        text = "".join(i for i in text if i in lexicon.SET_OUTPUT_CHARACTERS)

        return text

    def phonemize_hebrew(self, letters: list[tuple[str, str]]):
        """Map (letter, marks) pairs to a flat list of phoneme strings.

        Args:
            letters: Pairs of a base letter and the string of combining marks
                attached to it, as produced by ``phonemize``.

        Returns:
            The phonemes in order; joining them gives the phonemized word.
        """

        phonemes = []
        i = 0
        while i < len(letters):
            cur = letters[i]
            # prev = letters[i - 1] if i > 0 else None
            # next = letters[i + 1] if i < len(letters) - 1 else None
            # revised rules


            if '\u05bc' in cur[1] and cur[0] + '\u05bc' in lexicon.LETTERS_PHONEMES: # dagesh
                phonemes.append(lexicon.LETTERS_PHONEMES.get(cur[0] + '\u05bc', ''))
            elif cur[0] == 'ו':
                # Vav contributes no consonant here; only its marks are used
                pass
            else:
                phonemes.append(lexicon.LETTERS_PHONEMES.get(cur[0], ""))
            # Marks without an entry in NIQQUD_PHONEMES contribute ""
            niqqud_phonemes = [lexicon.NIQQUD_PHONEMES.get(niqqud, "") for niqqud in cur[1]]

            if '\u05AB' in cur[1] and phonemes:
                # Ensure ATMAHA is before the letter (before the last phoneme added)
                niqqud_phonemes.remove(lexicon.NIQQUD_PHONEMES['\u05AB'])
                phonemes = phonemes[:-1] + [lexicon.NIQQUD_PHONEMES['\u05AB']] + [phonemes[-1]]

            phonemes.extend(niqqud_phonemes)
            i += 1
        return phonemes
mishkal/utils.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mishkal import lexicon
2
+ import unicodedata
3
+ import regex as re
4
+
def sort_diacritics(match):
    """Return group 1 of *match* followed by the characters of group 2, sorted.

    Used as a ``re.sub`` callback so that the marks attached to a letter end
    up in code-point order.
    """
    base, marks = match.group(1), match.group(2)
    ordered_marks = sorted(marks)  # Sort diacritics
    return base + "".join(ordered_marks)
9
+
# Substitutions applied by normalize(), in insertion order. Each key is a
# regex pattern; each value is a replacement string or a callback.
NORMALIZE_PATTERNS = {
    # A letter followed by one or more combining marks: sort the marks by
    # code point so equivalent spellings compare equal.
    r"(\p{L})(\p{M}+)": sort_diacritics,
    # Hebrew gershayim -> ASCII double quote
    '״': '"'
}
15
+
16
+
def remove_niqqud(text: str):
    """Return *text* with every match of ``lexicon.HE_NIQQUD_PATTERN`` removed."""
    stripped = re.sub(lexicon.HE_NIQQUD_PATTERN, "", text)
    return stripped
19
+
20
+
def has_niqqud(text: str):
    """Return True when *text* contains a match of ``lexicon.HE_NIQQUD_PATTERN``."""
    first_mark = re.search(lexicon.HE_NIQQUD_PATTERN, text)
    return first_mark is not None
23
+
24
+
def normalize(text: str) -> str:
    """
    Bring *text* into the canonical form used by the phonemizer.

    1. Decompose unicode (NFD)
    2. Apply every substitution in NORMALIZE_PATTERNS
    3. Deduplicate niqqud (eg. only Patah instead of Kamatz)
    """
    normalized = unicodedata.normalize("NFD", text)

    for pattern, replacement in NORMALIZE_PATTERNS.items():
        normalized = re.sub(pattern, replacement, normalized)

    # Collapse marks that sound the same onto a single representative
    for duplicate, canonical in lexicon.NIQQUD_DEDUPLICATE.items():
        normalized = normalized.replace(duplicate, canonical)

    return normalized
39
+
def post_normalize(phonemes: str):
    """Clean up word edges in a space-separated phoneme string.

    Each word (split on single spaces, so empty words are preserved) goes
    through the rules below in order. They strip a glottal stop or ``h`` at
    the start/end of a word and shorten a final ``ij`` to ``i``.
    """
    rules = (
        # remove glottal stop from start/end
        (r'^ʔ|ʔ$', ''),
        (r'^ˈʔ', 'ˈ'),
        # NOTE(review): this turns a remaining final glottal stop into a
        # stress mark - looks unintended, confirm against the rule above.
        (r'ʔ$', 'ˈ'),
        # remove h from start/end
        (r'^h|h$', ''),
        (r'^ˈh|ˈh$', 'ˈ'),
        (r'ij$', 'i'),
    )

    def clean(word):
        for pattern, replacement in rules:
            word = re.sub(pattern, replacement, word)
        return word

    return ' '.join(clean(word) for word in phonemes.split(' '))
53
+
def get_unicode_names(text: str):
    """Return the Unicode name of every character in *text* ("?" if unnamed)."""
    names = []
    for char in text:
        names.append(unicodedata.name(char, "?"))
    return names
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=5.15.0
2
+ num2words
3
+ colorlog
4
+ regex