thewh1teagle committed on
Commit
a32df36
·
0 Parent(s):
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Phonemize in Hebrew
3
+ emoji: 🐢
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ uv sync
3
+ uv pip install "gradio>=5.15.0"
4
+ uv run gradio examples/editor.py
5
+ """
6
+
7
+ from mishkal import phonemize, normalize
8
+ import gradio as gr
9
+
10
+ default_text = """
11
+ כָּל עֶ֫רֶב יָאִ֫יר (הַשֵּׁ֫ם הַמָּלֵ֫א וּמֽק֫וֹם הָעֲבוֹדָ֫ה שֶׁלּ֫וֹ שְׁמוּרִ֫ים בַּמַּעֲרֶ֫כֶת) רָץ 20 קִילוֹמֶ֫טֶר. הוּא מֽסַפֵּ֫ר לִי שֶׁזֶּ֫ה מֽנַקֶּ֫ה לוֹ אֶת הָרֹ֫אשׁ אַחֲרֵ֫י הָעֲבוֹדָ֫ה, "שָׁעָ֫ה וָחֵ֫צִי בְּלִ֫י עֲבוֹדָ֫ה, אִשָּׁ֫ה וִילָדִ֫ים" כְּמ֫וֹ שֶׁה֫וּא מַגְדִּ֫יר זֹאת. אֲבָ֫ל אַחֲרֵ֫י הַמִּקְלַ֫חַת הוּא מַתְחִ֫יל בּֽמָ֫ה שֶׁנִּתָּ֫ן לֽכַנּ֫וֹת הָעֲבוֹדָ֫ה הַשְּׁנִיָּ֫ה שֶׁלּ֫וֹ: לִמְצֹ֫א לוֹ קוֹלֵ֫גוֹת חֲדָשׁ֫וֹת לָעֲבוֹדָ֫ה, כִּי יָאִ֫יר הוּא כַּנִּרְאֶ֫ה הַמֶּ֫לֶךְ שֶׁל "חָבֵ֫ר מֵבִ֫יא חָבֵ֫ר" בּֽיִשְׂרָאֵ֫ל.
12
+ דֻּגְמָא מַגְנִיבָה: [אנציקלופדיה](/ʔantsiklopˈedja/)
13
+ """
14
+
15
+ theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])
16
+
17
+
18
def on_submit_debug(text: str, predict_stress) -> str:
    """Phonemize *text* (punctuation kept) and append its normalized form.

    The result is the phoneme string, then a ``Normalized:`` header, then the
    output of ``normalize(text)``.
    """
    sections = (
        phonemize(text, preserve_punctuation=True, predict_stress=predict_stress),
        normalize(text),
    )
    return "\n\nNormalized:\n".join(sections)
22
+
23
+
24
def on_submit(text: str, predict_stress) -> str:
    """Phonemize *text* with punctuation stripped from the result."""
    options = {"preserve_punctuation": False, "predict_stress": predict_stress}
    return phonemize(text, **options)
26
+
27
+
28
def _run_phonemizer(text, debug, stress):
    """Route a button click to the debug or the plain handler."""
    handler = on_submit_debug if debug else on_submit
    return handler(text, stress)


with gr.Blocks(theme=theme) as demo:
    # Input box, pre-filled with the sample text and rendered right-to-left
    text_input = gr.Textbox(
        value=default_text,
        label="Text",
        rtl=True,
        elem_classes=["input"],
    )
    # Both options start switched off
    debug_checkbox = gr.Checkbox(value=False, label="Enable Debug Mode")
    predict_stress_checkbox = gr.Checkbox(value=False, label="Predict Stress")
    phonemes_output = gr.Textbox(label="Phonemes")
    submit_button = gr.Button("Create")

    # The input order here must match the parameter order of _run_phonemizer
    submit_button.click(
        fn=_run_phonemizer,
        inputs=[text_input, debug_checkbox, predict_stress_checkbox],
        outputs=[phonemes_output],
    )


if __name__ == "__main__":
    demo.launch()
mishkal/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ High level phonemize functions
3
+ """
4
+
5
+ from .phonemize import Phonemizer
6
+ from .utils import normalize # noqa: F401
7
+ from typing import Callable
8
+
9
+ phonemizer = Phonemizer()
10
+
11
+
12
def phonemize(
    text: str,
    preserve_punctuation=True,
    preserve_stress=True,
    use_expander=True,
    use_post_normalize=True,  # For TTS
    predict_stress=True,
    predict_shva_nah=True,
    fallback: Callable[[str], str] = None,
) -> str:
    """
    Phonemize *text* with the module-level ``phonemizer`` instance.

    Every option is forwarded unchanged to ``Phonemizer.phonemize``; this
    wrapper only supplies its own defaults (all switches on, no fallback).
    """
    options = {
        "preserve_punctuation": preserve_punctuation,
        "preserve_stress": preserve_stress,
        "fallback": fallback,
        "use_expander": use_expander,
        "use_post_normalize": use_post_normalize,
        "predict_stress": predict_stress,
        "predict_shva_nah": predict_shva_nah,
    }
    return phonemizer.phonemize(text, **options)
mishkal/data/kamatz_katan.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "כל": "kˈol",
3
+ "רחבי": "roxbˈi",
4
+ "אמנות": "omanˈut"
5
+ }
mishkal/data/rashej_tevot.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "צה״ל": "tsˈahal"
3
+ }
mishkal/data/special.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "וַאלְלָה": "wˈala",
3
+ "וַסַבִּי": "wasˈabi",
4
+ "פִּינְגְּוִין": "pinguwˈin",
5
+ "ווַטְסְאַפּ": "wˈatsʔap"
6
+ }
mishkal/data/symbols.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "₪": "ʃekˈel",
3
+ "$": "dolˈar"
4
+ }
mishkal/expander/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Expand dates and numbers into words with nikud
3
+ This happens before phonemization
4
+ """
5
+
6
+ from .numbers import num_to_word
7
+ from .dates import date_to_word
8
+ from .time_to_word import time_to_word
9
+ from .dictionary import Dictionary
10
+ from mishkal.log import log
11
+
12
+
13
class Expander:
    """Expand dictionary entries, dates, times and numbers into words."""

    def __init__(self):
        self.dictionary = Dictionary()

    def expand_text(self, text: str):
        """
        Expand *text* word by word and return the words re-joined by one space.

        The dictionary is applied to the whole text first. Each
        whitespace-separated word is then tried as a date, then as a time,
        then as a number; a later step runs only while the word is still
        unchanged. A word whose expansion raises is logged and kept as is.
        """
        text = self.dictionary.expand_text(text)

        words = []
        for source_word in text.split():
            try:
                word = date_to_word(source_word)
                if word == source_word:
                    word = time_to_word(word)
                if word == source_word:
                    word = num_to_word(word)
            except Exception as e:
                # Best effort: report the word that failed. `word` may be
                # unbound or left over from the previous iteration here, so
                # the message must use `source_word`.
                log.error(f"Failed to expand {source_word} with error: {e}")
                word = source_word
            words.append(word)
        return " ".join(words)
mishkal/expander/dates.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from .numbers import num_to_word
3
+
4
+ # Mapping of month names in Hebrew with diacritics (Gregorian months)
5
+ MONTHS = {
6
+ 1: "יָנוּאָר",
7
+ 2: "פֶבְרוּאָר",
8
+ 3: "מֵרְץ",
9
+ 4: "אֵפְרִיל",
10
+ 5: "מַאי",
11
+ 6: "יוּנִי",
12
+ 7: "יוּלִי",
13
+ 8: "אוֹגֻסְט",
14
+ 9: "סֶפְּטֶמְבֶּר",
15
+ 10: "אוֹקְטוֹבֶּר",
16
+ 11: "נוֹבֶמְבֶּר",
17
+ 12: "דֶּצֶמְבֶּר",
18
+ }
19
+
20
+ # Mapping of day names in Hebrew with diacritics
21
+ DAYS = {
22
+ 0: "יוֹם רִאשׁוֹן",
23
+ 1: "יוֹם שֵׁנִי",
24
+ 2: "יוֹם שְׁלִישִׁי",
25
+ 3: "יוֹם רֵבִיעִי",
26
+ 4: "יוֹם חֲמִישִׁי",
27
+ 5: "יוֹם שִׁישִׁי",
28
+ 6: "יוֹם שַׁבָּת",
29
+ }
30
+
31
+
32
def date_to_word(word: str, include_day_name=False) -> str:
    """
    Convert a numeric date string to Hebrew words with diacritics.

    Accepted layouts are year-month-day and day-month-year, each with "-",
    "." or "/" as the separator (e.g. 2024-01-31, 31.01.2024, 31/01/2024).

    Args:
        word: The candidate date string.
        include_day_name: When true, prefix the result with the Hebrew name
            of the weekday followed by a comma.

    Returns:
        "<day> <month> <year>" in Hebrew words, or *word* unchanged when it
        matches none of the accepted layouts.
    """
    separators = ["-", ".", "/"]
    orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
    date_formats = [sep.join(order) for order in orders for sep in separators]

    for date_format in date_formats:
        try:
            # Try parsing the word with each date format
            date_obj = datetime.strptime(word, date_format)

            # Convert month to Hebrew name with diacritics
            month_name = MONTHS[date_obj.month]
            day = num_to_word(str(date_obj.day))
            year = num_to_word(str(date_obj.year))

            text = f"{day} בֵּ{month_name} {year}"
            if include_day_name:
                # datetime.weekday() counts from Monday (0) while DAYS is
                # keyed from Sunday (0), so shift by one day.
                day_name = DAYS[(date_obj.weekday() + 1) % 7]
                text = f"{day_name}, {text}"
            return text
        except ValueError:
            continue
    return word
mishkal/expander/dictionary.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dictionaries are JSON files that map a source word to its replacement text
3
+ """
4
+
5
+ from pathlib import Path
6
+ import json
7
+ import re
8
+ from mishkal.utils import remove_nikud
9
+ from mishkal.utils import normalize
10
+ import unicodedata
11
+
12
+ files = Path(__file__).parent.joinpath("../data").glob("*.json")
13
+ # Sort ascending by tier (unranked, bronze, silver, gold): files loaded later override earlier ones, so the best tier wins
14
+ order = {"bronze": 1, "silver": 2, "gold": 3}
15
+ files = sorted(
16
+ files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
17
+ )
18
+
19
+
20
class Dictionary:
    """Word-replacement table built from the JSON files listed in ``files``."""

    def __init__(self):
        self.dict = {}
        self.load_dictionaries()

    def load_dictionaries(self):
        """Merge every file in ``files`` into ``self.dict``, in order.

        Keys are passed through ``normalize``; entries whose key or value is
        empty are dropped. Later files overwrite earlier ones.
        """
        for path in files:
            with open(path, "r", encoding="utf-8") as handle:
                entries: dict = json.load(handle)
                cleaned = {}
                for raw_key, value in entries.items():
                    key = normalize(raw_key)
                    if key and value:
                        cleaned[key] = value
                self.dict.update(cleaned)

    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
        """Look up a Hebrew run in the table and return its replacement.

        The run is NFD-decomposed, then tried as is, without nikud, and in
        normalized form, in that order. The first non-empty hit wins;
        otherwise the decomposed run itself is returned.
        """
        decomposed: str = unicodedata.normalize("NFD", match.group(0))
        candidates = (decomposed, remove_nikud(decomposed), normalize(decomposed))
        for key in candidates:
            hit = self.dict.get(key)
            if hit:
                return hit
        return decomposed

    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
        """Replace one whitespace-delimited token.

        Numeric tokens are returned untouched. A token found verbatim in the
        table is replaced as a whole; otherwise each run of Hebrew
        letters/nikud/apostrophes inside it is looked up separately.
        """
        token: str = match.group(0)
        if token.isnumeric():
            return token

        exact = self.dict.get(token)
        if exact:
            return exact

        # search by only ', space, regular nikud, alphabet
        return re.sub(r"[\u05B0-\u05EB ']+", self.replace_hebrew_only_callback, token)

    def expand_text(self, text: str) -> str:
        """
        Apply the table to every non-whitespace token of *text*.

        TODO: if key doesn't have diacritics expand even diacritized words
        """
        return re.sub(r"\S+", self.replace_non_whitespace_callback, text)
mishkal/expander/number_names.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
3
+ """
4
+
5
+ # TODO: add nikud hints
6
+
7
+ ZERO = {"אפס": "אֶפֶס"}
8
+
9
+
10
+ ONES = {
11
+ "אחת": "אַחַת",
12
+ "אחד": "אֶחָד",
13
+ "ראשונה": "רִאשׁוֹנָה",
14
+ "ראשון": "רִאשׁוֹן",
15
+ "ראשונות": "רִאשׁוֹנוֹת",
16
+ "ראשונים": "רִאשׁוֹנִים",
17
+ "שתיים": "שְׁתַּיִם",
18
+ "שניים": "שְׁנַיִם",
19
+ "שתי": "שְׁתֵּי",
20
+ "שני": "שְׁנֵי",
21
+ "שנייה": "שְׁנִיָּה",
22
+ "שניות": "שְׁנִיּוֹת",
23
+ "שלוש": "שָׁלוֹשׁ",
24
+ "שלושה": "שְׁלוֹשָׁה",
25
+ "שלושת": "שְׁלוֹשֶׁת",
26
+ "שלישית": "שְׁלִישִׁית",
27
+ "שלישי": "שְׁלִישִׁי",
28
+ "שלישיות": "שְׁלִישִׁיּוֹת",
29
+ "שלישיים": "שְׁלִישִׁיִּים",
30
+ "ארבע": "אַרְבַּע",
31
+ "ארבעה": "אַרְבַּעָה",
32
+ "ארבעת": "אַרְבַּעַת",
33
+ "רביעית": "רֵבִיעִית",
34
+ "רביעי": "רֵבִיעִי",
35
+ "רביעיות": "רֵבִיעִיוֹת",
36
+ "רביעיים": "רֵבִיעִיִּים",
37
+ "חמש": "חָמֵשׁ",
38
+ "חמישה": "חֲמִשָּׁה",
39
+ "חמשת": "חֲמֵשֶׁת",
40
+ "חמישית": "חֲמִישִּׁית",
41
+ "חמישי": "חֲמִישִּׁי",
42
+ "חמישיות": "חֲמִישִּׁיוֹת",
43
+ "חמישיים": "חֲמִישִּׁיִּים",
44
+ "שש": "שֵׁשׁ",
45
+ "שישה": "שִׁשָּׁה",
46
+ "ששת": "שֵׁשֶׁת",
47
+ "שישית": "שִׁשִּׁית",
48
+ "שישי": "שִׁשִּׁי",
49
+ "שישיות": "שִׁשִּׁיוֹת",
50
+ "שישיים": "שִׁשִּׁיִּים",
51
+ "שבע": "שֶׁבַע",
52
+ "שבעה": "שִׁבְעָה",
53
+ "שבעת": "שִׁבְעַת",
54
+ "שביעית": "שְׁבִיעִית",
55
+ "שביעי": "שְׁבִיעִי",
56
+ "שביעיות": "שְׁבִיעִיוֹת",
57
+ "שביעיים": "שְׁבִיעִיִּים",
58
+ "שמונה": "שְׁמוֹנֶה",
59
+ "שמונת": "שְׁמוֹנַת",
60
+ "שמינית": "שְׁמִינִית",
61
+ "שמיני": "שְׁמִינִי",
62
+ "שמיניות": "שְׁמִינִיוֹת",
63
+ "שמיניים": "שְׁמִינִיִּים",
64
+ "תשע": "תֵּשַׁע",
65
+ "תשעה": "תִּשְׁעָה",
66
+ "תשעת": "תִּשְׁעַת",
67
+ "תשיעית": "תְּשִׁיעִית",
68
+ "תשיעי": "תְּשִׁיעִי",
69
+ "תשיעיות": "תְּשִׁיעִיּוֹת",
70
+ "תשיעיים": "תְּשִׁיעִיִּים",
71
+ }
72
+
73
+
74
+ TENS = {
75
+ "עשר": "עֶשֶׂר",
76
+ "עשרה": "עֲשָׁרָה",
77
+ "עשרת": "עֲשֶׁרֶת",
78
+ "עשירית": "עֲשִׁירִית",
79
+ "עשירי": "עֲשִׁירִי",
80
+ "עשיריות": "עֲשִׁירִיוֹת",
81
+ "עשיריים": "עֲשִׁירִיִּים",
82
+ "שתים עשרה": "שְׁתֵּים עֶשְׂרֵה",
83
+ "שנים עשר": "שְׁנֵים עָשָׂר",
84
+ }
85
+
86
+
87
+ TWENTIES = {
88
+ "עשרים": "עֶשְׂרִ֫ים",
89
+ "שלושים": "שְׁלוֹשִׁים",
90
+ "ארבעים": "אַרְבָּעִים",
91
+ "חמישים": "חֲמִשִּׁים",
92
+ "שישים": "שִׁשִּׁים",
93
+ "שבעים": "שִׁבְעִים",
94
+ "שמונים": "שְׁמוֹנִים",
95
+ "תשעים": "תִּשְׁעִים",
96
+ }
97
+
98
+
99
+ HUNDREDS = {
100
+ "מאה": "מֵאָה",
101
+ "מאת": "מֵאַת",
102
+ "מאתיים": "מָאתַיִם",
103
+ "מאות": "מֵאוֹת",
104
+ }
105
+
106
+ THOUSANDS = {
107
+ "אלף": "אֶלֶף",
108
+ "אלפיים": "אַלְפַּיִם",
109
+ "אלפים": "אֲלָפִים",
110
+ "אלפי": "אַלְפִּי",
111
+ }
112
+
113
+
114
+ LARGE = {
115
+ "מיליון": "מִילְיוֹן",
116
+ "מיליוני": "מִילְיוֹנִי",
117
+ "מיליארד": "מִילְיַארְד",
118
+ "מיליארדי": "מִילְיַארְדִּי",
119
+ "טריליון": "טְרִילְיוֹן",
120
+ "טריליוני": "טְרִילְיוֹנִי",
121
+ "קוודריליון": "קוֹוַדְרִילְיוֹן",
122
+ "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
123
+ "קווינטיליון": "קוִוִּנְטִילְיוֹן",
124
+ "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
125
+ "סקסטיליון": "סְקֶסְטִילְיוֹן",
126
+ "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
127
+ "ספטיליון": "סְפֶּטִילְיוֹן",
128
+ "ספטיליוני": "סְפֶּטִילְיוֹנִי",
129
+ "אוקטיליון": "אוֹקְטִילְיוֹן",
130
+ "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
131
+ "נוניליון": "נוּנִילְיוֹן",
132
+ "נוניליוני": "נוּנִילְיוֹנִי",
133
+ "דסיליון": "דֶּסִילְיוֹן",
134
+ "דסיליוני": "דֶּסִילְיוֹנִי",
135
+ "אונדסיליון": "אוּנְדְסִילְיוֹן",
136
+ "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
137
+ "דואודסיליון": "דוּאודְסִילְיוֹן",
138
+ "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
139
+ "טרדסיליון": "טֶרְדְסִילְיוֹן",
140
+ "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
141
+ "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
142
+ "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
143
+ "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
144
+ "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
145
+ "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
146
+ "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
147
+ "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
148
+ "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
149
+ "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
150
+ "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
151
+ "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
152
+ "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
153
+ "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
154
+ "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
155
+ }
156
+
157
+
158
+ LETTERS = {
159
+ "ו": "וֵ",
160
+ "ה": "הַ",
161
+ }
162
+
163
+
164
+ CURRENCY = {
165
+ "שקל": "שֵׁקֶל",
166
+ "שקלים": "שְׁקָלִים",
167
+ "אגורה": "אֲגוֹרָה",
168
+ "אגורות": "אֲגוֹרוֹת",
169
+ "אירו": "אֵירוֹ",
170
+ "סנט": "סֵנְט",
171
+ "סנטים": "סֵנְטִים",
172
+ "דולר": "דוֹלָר",
173
+ "דולרים": "דוֹלָרִים",
174
+ }
175
+
176
+
177
+ POINTS = {
178
+ "מינוס": "מִינּוּס",
179
+ "נקודה": "נְקֻדָּה",
180
+ }
181
+
182
+ NUMBER_NAMES = {
183
+ **CURRENCY,
184
+ **HUNDREDS,
185
+ **LARGE,
186
+ **LETTERS,
187
+ **ONES,
188
+ **POINTS,
189
+ **TENS,
190
+ **THOUSANDS,
191
+ **TWENTIES,
192
+ **ZERO,
193
+ }
mishkal/expander/numbers.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import num2words
2
+ from .number_names import NUMBER_NAMES
3
+ import re
4
+
5
+
6
def add_diacritics(words: str, number_names=None):
    """
    Replace each word of *words* with its vocalized spelling.

    Args:
        words: Whitespace-separated words (e.g. num2words output).
        number_names: Optional mapping from a bare word to its vocalized
            form. Defaults to ``NUMBER_NAMES``.

    Returns:
        The words joined by single spaces. A word is replaced when it is in
        the mapping, or when the word minus its first letter is. In the second
        case the first letter is treated as a prefix and is vocalized too if
        the mapping has it. Anything else is kept unchanged.
    """
    names = NUMBER_NAMES if number_names is None else number_names
    new_words = []
    for word in words.split():
        whole = names.get(word)
        if whole:
            new_words.append(whole)
            continue

        stem = names.get(word[1:])
        if stem:
            # One-letter prefix (e.g. Vav). Keep the bare letter when the
            # mapping has no vocalized form for it instead of raising KeyError.
            prefix = word[0]
            new_words.append(names.get(prefix, prefix) + stem)
        else:
            new_words.append(word)
    return " ".join(new_words)
17
+
18
+
19
def num_to_word(maybe_number: str) -> str:
    """Spell out every run of digits in *maybe_number* as vocalized Hebrew.

    Each digit run is converted with num2words (Hebrew, cardinal) and then
    passed through ``add_diacritics``; all other text is left as it was.
    """
    return re.sub(
        r"\d+",
        lambda found: add_diacritics(
            num2words.num2words(found.group(), lang="he", ordinal=False)
        ),
        maybe_number,
    )
mishkal/expander/time_to_word.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Convert time to words
3
+ TODO: fix zeros eg. 22:00
4
+ """
5
+
6
+ import re
7
+
8
+ PATTERNS = [
9
+ r"(\d{1,2})([apm]{2})", # AM/PM format
10
+ r"(\d{1,2}):(\d{2})", # HH:MM format
11
+ ]
12
+
13
+
14
def extract_time(match):
    """
    Turn one regex match of a time expression into Hebrew words.

    Args:
        match: A match whose full text is either ``HH:MM`` or an hour
            followed by ``am``/``pm``.

    Returns:
        The spoken form produced by ``convert_to_word``, or the matched text
        unchanged when it is not a valid time (hour above 24, minutes above
        59, an hour above 12 with am/pm, or a suffix such as ``mm`` that the
        loose ``[apm]{2}`` pattern lets through).
    """
    # Keep the matched text: the name `match` must not be rebound below,
    # otherwise the final fallback would call .group() on None.
    original = match.group(0)
    time_str = original.lower().strip()

    # Check for HH:MM format
    clock = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if clock:
        h = int(clock.group(1))
        m = int(clock.group(2))
        if h > 24 or m > 59:
            return original
        return f"{convert_to_word(h, m)}"

    # Check for AM/PM format
    meridiem = re.match(r"(\d{1,2})([apm]{2})", time_str)
    if meridiem:
        h = int(meridiem.group(1))
        period = meridiem.group(2)
        if period not in ("am", "pm") or h > 12:
            return original

        # Normalize to 24-hour format
        if period == "am" and h == 12:
            h = 0
        elif period == "pm" and h != 12:
            h += 12
        # Only the hour is given, so the minutes default to 0
        return f"{convert_to_word(h, 0)}"

    return original  # Return original text if the format is not recognized
42
+
43
+
44
def convert_to_word(h, m):
    """
    Spell out a clock time in Hebrew words with diacritics.

    Args:
        h: Hour, 0-24. Hours are read on a 12-hour clock (0 and 24 both
            read as twelve, 13 as one, and so on).
        m: Minutes, 0-59.

    Returns:
        The hour alone when *m* is 0, otherwise the hour followed by the
        minutes and the word for "minutes", separated by single spaces.

    Raises:
        ValueError: If *h* or *m* is outside the ranges above.
    """
    hours = [
        "אֶפֶס",
        "אַחַת",
        "שְׁנַיִם",  # Replaced by vocab["shtey"] for exactly two minutes
        "שָׁלוֹשׁ",
        "אַרְבַּע",
        "חָמֵשׁ",
        "שֵׁשׁ",
        "שֶׁבַע",
        "שְׁמוֹנֵה",
        "תֵּשַׁע",
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
    ]

    tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]

    ten_to_twenty = [
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
        "שְׁלוֹשׁ עֶשְׂרֵה",
        "אַרְבַּע עֶשְׂרֵה",
        "חֲמֵשׁ עֶשְׂרֵה",
        "שֵׁשׁ עֶשְׂרֵה",
        "שְׁבַע עֶשְׂרֵה",
        "שְׁמוֹנֶה עֶשְׂרֵה",
        "תְּשַׁע עֶשְׂרֵה",
    ]

    vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}

    # Reject values the tables cannot represent instead of failing with an
    # IndexError (or silently wrapping around on negative numbers).
    if not 0 <= h <= 24 or not 0 <= m <= 59:
        raise ValueError(f"time out of range: {h}:{m}")

    # Convert 0 hours to 12 (midnight) and fold 13-24 onto 1-12
    if h == 0:
        h = 12
    elif h > 12:
        h -= 12

    if m == 0:
        return f"{hours[h]}"

    if 1 <= m <= 9:
        # Two minutes uses the dedicated "shtey" form instead of hours[2]
        minute_word = vocab["shtey"] if m == 2 else hours[m]
        return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"

    if 10 <= m <= 19:
        return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"

    # 20-59: tens, then the unit only when it is non-zero. Joining the parts
    # avoids the double space an empty unit would otherwise leave behind.
    parts = [hours[h], f"{vocab['and']}{tens[m // 10]}"]
    if m % 10 != 0:
        parts.append(f"{vocab['and']}{hours[m % 10]}")
    parts.append(vocab["minutes"])
    return " ".join(parts)
101
+
102
+
103
def time_to_word(text: str):
    """Replace every substring of *text* matching one of ``PATTERNS`` with the result of ``extract_time``."""
    combined_pattern = "|".join(PATTERNS)
    return re.sub(combined_pattern, extract_time, text)
mishkal/hebrew.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hebrew Phonemizer
3
+
4
+ Rules implemented:
5
+ 1. Consonant handling (including special cases)
6
+ 2. Nikud (vowel) processing
7
+ 3. Dagesh handling
8
+ 4. Geresh handling
9
+ 5. Shva na prediction
10
+ 6. Special letter combinations
11
+
12
+ Reference:
13
+ - https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
14
+ - https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט/
15
+ - https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
16
+ - https://en.wikipedia.org/wiki/Help:IPA/Hebrew
17
+ - https://he.wikipedia.org/wiki/הברה
18
+ """
19
+
20
+ from mishkal.variants import Letter
21
+ from mishkal import lexicon
22
+ import re
23
+ from mishkal.utils import sort_stress
24
+
25
+ SHVA = "\u05b0"
26
+ SIN = "\u05c2"
27
+ PATAH = '\u05b7'
28
+ KAMATZ = '\u05b8'
29
+ HATAF_KAMATZ = '\u05b3'
30
+ DAGESH = "\u05bc"
31
+ HOLAM = "\u05b9"
32
+ HIRIK = "\u05b4"
33
+ PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
34
+ KUBUTS = "\u05bb"
35
+ TSERE = "\u05b5"
36
+
37
def phonemize_hebrew(letters: list[Letter], predict_shva_na: bool) -> list[str]:
    """
    Convert a sequence of letters into a flat list of phonemes.

    Each letter is handed to ``letter_to_phonemes`` together with its left
    and right neighbours (``None`` at the edges). That call also reports how
    many following letters it already consumed, and those are skipped.
    """
    collected = []
    total = len(letters)
    index = 0

    while index < total:
        before = letters[index - 1] if index > 0 else None
        after = letters[index + 1] if index < total - 1 else None

        produced, consumed_extra = letter_to_phonemes(
            letters[index], before, after, predict_shva_na
        )
        collected.extend(produced)
        index += consumed_extra + 1

    return collected
51
+
52
+
53
+ def letter_to_phonemes(cur: Letter, prev: Letter | None, next: Letter | None, predict_shva_na: bool):
54
+ cur_phonemes = []
55
+ skip_diacritics = False
56
+ skip_constants = False
57
+ skip_offset = 0
58
+ # revised rules
59
+
60
+ # יַאלְלָה
61
+ if cur.char == "ל" and cur.diac == SHVA and next and next.char == "ל":
62
+ skip_diacritics = True
63
+ skip_constants = True
64
+
65
+ if (
66
+ cur.char == "ו"
67
+ and not prev
68
+ and next
69
+ and not next.diac
70
+ and cur.char + cur.diac == "וַא"
71
+ ):
72
+ skip_offset += 1
73
+ cur_phonemes.append("wa")
74
+
75
+ if cur.char == "א" and not cur.diac and prev:
76
+ if next and next.char != 'ו':
77
+ skip_constants = True
78
+
79
+ # TODO ?
80
+ if cur.char == "י" and next and not cur.diac and prev and prev.char + prev.diac != 'אֵ':
81
+ skip_constants = True
82
+
83
+ if cur.char == "ש" and SIN in cur.diac:
84
+ cur_phonemes.append("s")
85
+ skip_constants = True
86
+
87
+ # shin without nikud after sin = sin
88
+ if cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
89
+ cur_phonemes.append("s")
90
+ skip_constants = True
91
+
92
+ if not next and cur.char == "ח" and PATAH in cur.diac:
93
+ # Final Het gnuva
94
+ cur_phonemes.append("ax")
95
+ skip_diacritics = True
96
+ skip_constants = True
97
+
98
+ if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
99
+ if cur.char == "ת":
100
+ cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
101
+ skip_diacritics = True
102
+ skip_constants = True
103
+ else:
104
+ # Geresh
105
+ cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
106
+ skip_constants = True
107
+
108
+ elif (
109
+ DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES
110
+ ): # dagesh
111
+ cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
112
+ skip_constants = True
113
+ elif cur.char == "ו":
114
+ skip_constants = True
115
+ if next and next.char == "ו" and next.diac == cur.diac:
116
+ # patah and next.diac empty
117
+ if re.search(PATAH_LIKE_PATTERN, cur.diac) and not next.diac:
118
+ cur_phonemes.append("w")
119
+ skip_diacritics = True
120
+ skip_offset += 1
121
+ elif cur.diac == next.diac:
122
+ # double vav
123
+ cur_phonemes.append("wo")
124
+ skip_diacritics = True
125
+ skip_offset += 1
126
+ else:
127
+ # TODO ?
128
+ # skip_consonants = False
129
+ skip_diacritics = False
130
+ else:
131
+ # Single vav
132
+
133
+ # Vav with Patah
134
+ if re.search(PATAH_LIKE_PATTERN, cur.diac):
135
+ cur_phonemes.append("va")
136
+
137
+ # Holam haser
138
+ elif HOLAM in cur.diac:
139
+ cur_phonemes.append("o")
140
+ # Shuruk / Kubutz
141
+ elif KUBUTS in cur.diac or DAGESH in cur.diac:
142
+ cur_phonemes.append("u")
143
+ # Vav with Shva in start
144
+ elif SHVA in cur.diac and not prev:
145
+ cur_phonemes.append("ve")
146
+ # Hirik
147
+ elif HIRIK in cur.diac:
148
+ cur_phonemes.append("vi")
149
+ # Tsere
150
+ elif TSERE in cur.diac:
151
+ cur_phonemes.append("ve")
152
+
153
+ else:
154
+ cur_phonemes.append("v")
155
+ skip_diacritics = True
156
+
157
+ if not skip_constants:
158
+ cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))
159
+
160
+ if predict_shva_na and SHVA in cur.diac and not skip_diacritics and lexicon.SHVA_NA_DIACRITIC not in cur.diac:
161
+ # shva na prediction
162
+ if not prev:
163
+ if cur.char in 'למנרי' or cur.char in 'אהע' or cur.char in 'וכלב':
164
+ cur_phonemes.append("e")
165
+ skip_diacritics = True
166
+ else:
167
+ if next and next.char == cur.char:
168
+ cur_phonemes.append("e")
169
+ skip_diacritics = True
170
+ elif prev and SHVA in prev.diac and cur_phonemes[-1] != 'e':
171
+ cur_phonemes.append("e")
172
+ skip_diacritics = True
173
+
174
+ if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
175
+ cur_phonemes.append('o')
176
+ skip_diacritics = True
177
+
178
+
179
+
180
+ nikud_phonemes = (
181
+ [lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.diac]
182
+ if not skip_diacritics
183
+ else []
184
+ )
185
+ cur_phonemes.extend(nikud_phonemes)
186
+ # Ensure the stress is at the beginning of the syllable
187
+ cur_phonemes = sort_stress(cur_phonemes)
188
+ cur_phonemes = [p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)]
189
+ return cur_phonemes, skip_offset
mishkal/lexicon.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ASCII IPA transcription of Hebrew consonants and vowels.
3
+ """
4
+
5
+ # https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
6
+
7
+ MILHEL_PATTERNS = ['יים', 'וע', 'טו', "דיה"] # Used for stress prediction
8
+
9
+ HE_PATTERN = r'[\u05b0-\u05ea\u05ab\u05bd\'"]+'
10
+ HE_NIKUD_PATTERN = r"[\u05B0-\u05C7]"
11
+ PUNCTUATION = r".,!? "
12
+ STRESS = "\u02c8" # visually looks like '
13
+
14
+ GERESH_PHONEMES = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}
15
+ SPECIAL_PHONEMES = ['w']
16
+
17
+ # Consonants
18
+ LETTERS_PHONEMES = {
19
+ "א": "ʔ", # Alef
20
+ "ב": "v", # Bet
21
+ "ג": "g", # Gimel
22
+ "ד": "d", # Dalet
23
+ "ה": "h", # He
24
+ "ו": "v", # Vav
25
+ "ז": "z", # Zayin
26
+ "ח": "x", # Het
27
+ "ט": "t", # Tet
28
+ "י": "j", # Yod
29
+ "ך": "x", # Haf sofit
30
+ "כ": "x", # Haf
31
+ "ל": "l", # Lamed
32
+ "ם": "m", # Mem Sofit
33
+ "מ": "m", # Mem
34
+ "ן": "n", # Nun Sofit
35
+ "נ": "n", # Nun
36
+ "ס": "s", # Samekh
37
+ "ע": "ʔ", # Ayin, only voweled
38
+ "פ": "f", # Fey
39
+ "ף": "f", # Fey Sofit
40
+ "ץ": "ts", # Tsadik sofit
41
+ "צ": "ts", # Tsadik
42
+ "ק": "k", # Kuf
43
+ "ר": "r", # Resh
44
+ "ש": "ʃ", # Shin
45
+ "ת": "t", # Taf
46
+ # Beged Kefet
47
+ "בּ": "b",
48
+ "כּ": "k",
49
+ "פּ": "p",
50
+ "שׁ": "ʃ",
51
+ "שׂ": "s",
52
+ "'": "",
53
+ }
54
+
55
+ SHVA_NA_DIACRITIC = "\u05bd"
56
+ ATAMAHA_DIACRITIC = "\u05ab"
57
+
58
+ NIKUD_PHONEMES = {
59
+ "\u05b4": "i", # Hiriq
60
+ "\u05b1": "e", # Hataf segol
61
+ "\u05b5": "e", # Tsere
62
+ "\u05b6": "e", # Segol
63
+ "\u05b2": "a", # Hataf Patah
64
+ "\u05b7": "a", # Patah
65
+ "\u05c7": "o", # Kamatz katan
66
+ "\u05b9": "o", # Holam
67
+ "\u05ba": "o", # Holam haser for vav
68
+ "\u05bb": "u", # Qubuts
69
+
70
+ "\u05b3": 'o', # Hataf qamats
71
+ "\u05b8": "a", # Kamataz
72
+
73
+ ATAMAHA_DIACRITIC: "ˈ", # Stress (Atmaha)
74
+ SHVA_NA_DIACRITIC: "e", # Shva na
75
+ }
76
+
77
+ # Deprecated
78
+ DEDUPLICATE = {
79
+ # "\u05b1": "\u05b5", # Hataf Segol -> Tsere
80
+ # "\u05b2": "\u05b7", # Hataf Patah -> Patah
81
+ # "\u05b3": "\u05b9", # Hataf Qamats -> Holam
82
+ # "\u05b6": "\u05b5", # Segol -> Tsere
83
+ # Kamatz -> Patah
84
+ # "\u05b8": "\u05b7", # Qamats -> Patah
85
+ # "\u05c7": "\u05b9", # Qamats Qatan -> Holam
86
+ "\u05f3": "'", # Hebrew geresh to regular geresh
87
+ }
88
+
89
# Every phoneme string that appears as a value in the lookup tables above,
# plus the special phonemes.
SET_PHONEMES = {
    *NIKUD_PHONEMES.values(),
    *LETTERS_PHONEMES.values(),
    *GERESH_PHONEMES.values(),
    *SPECIAL_PHONEMES,
}
mishkal/log.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import colorlog
4
+
5
+
6
def _create_logger():
    """
    Create the package logger with colorized output.

    The level comes from the LOG_LEVEL environment variable (matched
    case-insensitively) and falls back to WARNING when it is unset or does
    not name a numeric logging level.

    Usage: LOG_LEVEL=DEBUG python <script.py>
    """

    handler = colorlog.StreamHandler()
    fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
    handler.setFormatter(
        colorlog.ColoredFormatter(
            fmt=fmt,
            log_colors={
                "DEBUG": "blue",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red",
            },
        )
    )
    # Get log level from LOG_LEVEL environment variable
    log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
    level = getattr(logging, log_level, None)
    # The logging module has many upper-case attributes that are not levels
    # (e.g. BASIC_FORMAT); passing one to setLevel would raise at import time.
    if not isinstance(level, int):
        level = logging.WARNING
    logger = colorlog.getLogger(__package__)
    logger.setLevel(level=level)
    # Attach the colorized stream handler
    logger.addHandler(handler)
    return logger
33
+
34
+
35
+ log = _create_logger()
mishkal/phonemize.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mishkal import lexicon
2
+ from mishkal.variants import Letter
3
+ from .expander import Expander
4
+ from mishkal.utils import get_letters, normalize, post_normalize, has_vowel, has_constant, remove_nikud, get_syllables, sort_stress
5
+ from typing import Callable
6
+ import regex as re
7
+ from mishkal.hebrew import phonemize_hebrew
8
+
9
+ ADDITIONAL_PHONEMES = set() # When using fallback
10
+
11
class Phonemizer:
    """
    Text-to-phoneme pipeline: normalization, optional fallback for Latin
    words, optional expansion, Hebrew phonemization and hyper-phoneme
    substitution.
    """

    # TODO: is that enough? what if there's punctuation around? other chars?
    # Runs of Latin letters; each run is handed to the optional `fallback`.
    fallback_pattern = r"[a-zA-Z]+"

    def __init__(self):
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation=True,
        preserve_stress=True,
        use_expander=False,
        use_post_normalize=False,  # For TTS
        predict_stress=False,
        predict_shva_nah=False,
        fallback: Callable[[str], str] = None,
    ) -> str | list[str]:
        """
        Convert *text* to phonemes.

        Args:
            text: Input text.
            preserve_punctuation: When false, characters listed in
                ``lexicon.PUNCTUATION`` (other than the space) are removed
                from the result.
            preserve_stress: When false, ``lexicon.STRESS`` marks are removed
                from the result.
            use_expander: When true, run ``self.expander.expand_text`` before
                phonemizing.
            use_post_normalize: When true, apply ``post_normalize`` to each
                Hebrew word's phonemes and finally drop every character that
                is not a known phoneme, an additional (fallback or
                hyper-phoneme) character, a space or punctuation.
            predict_stress: When true, add a stress mark to words that have
                none.
            predict_shva_nah: Forwarded to ``phonemize_hebrew`` as
                ``predict_shva_na``.
            fallback: Optional callable applied to each run of Latin letters
                that is not a key of the expander dictionary.

        Returns:
            The phonemized text.
        """
        # normalize
        text = normalize(text)

        def fallback_replace_callback(match: re.Match):
            word = match.group(0)

            if self.expander.dictionary.dict.get(word):
                # skip
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            # Remember every character the fallback produced so the final
            # use_post_normalize filter does not strip it.
            for c in phonemes:
                ADDITIONAL_PHONEMES.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)

        if use_expander:
            text = self.expander.expand_text(text)

        def heb_replace_callback(match: re.Match, original_text: str):
            word = match.group(0)
            start_offset = match.start()
            if start_offset > 0 and original_text[start_offset - 1] == '[':
                # Skip if it starts with [ as it's used for hyper phonemes
                return word

            word = normalize(word)
            letters: list[Letter] = get_letters(word)
            phonemes: list[str] = phonemize_hebrew(letters, predict_shva_na=predict_shva_nah)
            syllables = get_syllables(phonemes)

            phonemes_text = ''.join(phonemes)
            # Only predict stress when the word carries no stress mark yet
            if predict_stress and lexicon.STRESS not in phonemes_text and syllables:
                if len(syllables) == 1:
                    syllables[-1] = lexicon.STRESS + syllables[-1]
                    syllables[-1] = ''.join(sort_stress(syllables[-1]))
                elif any(remove_nikud(word).endswith(i) for i in lexicon.MILHEL_PATTERNS) or phonemes_text.endswith('ax'):
                    # insert lexicon.STRESS in the first character of syllables[-2]
                    syllables[-2] = lexicon.STRESS + syllables[-2]
                    syllables[-2] = ''.join(sort_stress(syllables[-2]))
                else:
                    # insert in syllables[-1]
                    syllables[-1] = lexicon.STRESS + syllables[-1]
                    syllables[-1] = ''.join(sort_stress(syllables[-1]))

            phonemes = ''.join(syllables)
            if use_post_normalize:
                phonemes = post_normalize(phonemes)

            return phonemes

        # The lambda reads `text` while re.sub is still running, i.e. the text
        # as it was before this substitution, so match offsets line up with it.
        text = re.sub(lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text)

        def hyper_phonemes_callback(match: re.Match):
            """
            Expand hyper phonemes into normal phonemes
            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
            """
            matched_phonemes = match.group(2)
            # Whitelist these characters for the use_post_normalize filter
            for c in matched_phonemes:
                ADDITIONAL_PHONEMES.add(c)
            return matched_phonemes  # The phoneme is in the second group

        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)

        if not preserve_punctuation:
            # Spaces are listed in PUNCTUATION but are always kept
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(
                i for i in text if i not in [lexicon.STRESS]
            )
        if use_post_normalize:
            # Keep only known phonemes, whitelisted extras, spaces and punctuation
            text = ''.join(i for i in text if i in lexicon.SET_PHONEMES or i in ADDITIONAL_PHONEMES or i == ' ' or i in lexicon.PUNCTUATION)
        return text
mishkal/utils.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mishkal import lexicon
2
+ import unicodedata
3
+ import regex as re
4
+ from mishkal.variants import Letter
5
+ import mishkal
6
+
7
def sort_diacritics(match):
    """Regex replacement callback that puts a letter's marks in sorted order.

    Group 1 of *match* is the base letter and group 2 is the run of marks
    following it. The letter is returned unchanged, followed by the marks
    sorted with the default string ordering.
    """
    base, marks = match.group(1), match.group(2)
    ordered_marks = sorted(marks)
    return base + "".join(ordered_marks)
11
+
12
+
13
# Regex pattern -> replacement (string or callback), applied in order by normalize()
NORMALIZE_PATTERNS = {
    # Sort diacritics
    r"(\p{L})(\p{M}+)": sort_diacritics,
    "״": '"',  # Hebrew gershayim (U+05F4) to ASCII double quote
    "׳": "'",  # Hebrew geresh (U+05F3) to ASCII apostrophe
}
19
+
20
def remove_nikud(text: str):
    """Return *text* with every match of ``lexicon.HE_NIKUD_PATTERN`` deleted."""
    nikud_pattern = lexicon.HE_NIKUD_PATTERN
    stripped = re.sub(nikud_pattern, "", text)
    return stripped
22
+
23
+
24
def has_nikud(text: str):
    """Return True when ``lexicon.HE_NIKUD_PATTERN`` matches anywhere in *text*."""
    found = re.search(lexicon.HE_NIKUD_PATTERN, text)
    if found is None:
        return False
    return True
26
+
27
+
28
def normalize(text: str) -> str:
    """
    Bring *text* into the canonical form used before phonemization.

    Steps, in order:
    1. Unicode-decompose the text (NFD).
    2. Apply every substitution in ``NORMALIZE_PATTERNS``.
    3. Apply every substitution in ``lexicon.DEDUPLICATE``.
    """
    # Decompose first so the substitutions below see the NFD form
    decomposed = unicodedata.normalize("NFD", text)

    substitutions = [*NORMALIZE_PATTERNS.items(), *lexicon.DEDUPLICATE.items()]
    for pattern, replacement in substitutions:
        decomposed = re.sub(pattern, replacement, decomposed)
    return decomposed
42
+
43
+
44
def post_normalize(phonemes: str) -> str:
    """
    Clean up word endings in a space-separated phoneme string.

    For every word (split on a single space, so runs of spaces are kept):
    - a final glottal stop ``ʔ`` is removed
    - a final ``h`` is removed, together with a stress mark ``ˈ`` directly
      before it
    - a final ``ij`` becomes ``i``

    Returns the words joined back together with single spaces.
    """
    cleaned_words = []
    for word in phonemes.split(" "):
        # remove glottal stop from end
        word = re.sub(r"ʔ$", "", word)
        # remove h from end, including a stress mark right before it.
        # Doing this in one pattern matters: stripping the bare h first would
        # leave a dangling stress mark that a later "ˈh$" can no longer match.
        word = re.sub(r"ˈ?h$", "", word)
        # remove a final j that follows an i
        word = re.sub(r"ij$", "i", word)
        cleaned_words.append(word)
    return " ".join(cleaned_words)
57
+
58
def get_letters(word: str):
    """Split *word* into a list of ``Letter`` objects.

    Each item pairs one letter with the (possibly empty) run of combining
    marks and ASCII apostrophes that immediately follows it.
    """
    # Group 1: the letter. Group 2: its marks, with en_geresh (') allowed.
    pairs = re.findall(r"(\p{L})([\p{M}']*)", word)
    return [Letter(char, diac) for char, diac in pairs]
62
+
63
def get_unicode_names(text: str):
    """Return the Unicode name of every character in *text*, in order.

    Characters that have no name are reported as ``"?"``.
    """
    fallback = "?"
    return [unicodedata.name(char, fallback) for char in text]
65
+
66
def has_vowel(s: iter):
    """Return True when at least one of ``a e i o u`` is contained in *s*.

    Containment uses ``in``: a substring test when *s* is a string, an
    element test when *s* is a list.
    """
    for vowel in 'aeiou':
        if vowel in s:
            return True
    return False
68
+
69
def has_constant(s: iter):
    """Return True when some item of *s* is not found in ``'aeiou'``.

    Each item is tested with ``in`` against the string ``'aeiou'``, so a
    multi-character item is checked as a substring.
    """
    return not all(item in 'aeiou' for item in s)
71
+
72
+
73
+
74
def get_syllables(phonemes: list[str]) -> list[str]:
    """Group a sequence of phonemes into syllable strings.

    Phonemes are accumulated until the pending chunk contains a vowel. The
    chunk is then closed when any of these holds:
    - the next phoneme has no vowel and the one after it does
    - there is no next phoneme
    - the next phoneme contains a vowel
    Anything still pending at the end becomes a final syllable.

    Afterwards, a ``lexicon.STRESS`` mark found at the end of a syllable is
    moved to the start of the following syllable.
    """
    total = len(phonemes)
    syllables = []
    pending = ''

    for idx, phoneme in enumerate(phonemes):
        pending += phoneme

        # Nothing can be closed until the chunk has a vowel
        if not has_vowel(pending):
            continue

        # Next is vowel-less and is followed by a vowel-bearing phoneme
        onset_follows = (
            idx + 2 < total
            and not has_vowel(phonemes[idx + 1])
            and has_vowel(phonemes[idx + 2])
        )
        if onset_follows or idx + 1 >= total or has_vowel(phonemes[idx + 1]):
            syllables.append(pending)
            pending = ''

    # Keep whatever is left over
    if pending:
        syllables.append(pending)

    # A stress mark trailing a syllable belongs to the syllable after it
    for idx in range(1, len(syllables)):
        previous = syllables[idx - 1]
        if not previous.endswith(lexicon.STRESS):
            continue
        syllables[idx] = lexicon.STRESS + syllables[idx]
        syllables[idx - 1] = previous[:-len(lexicon.STRESS)]

    return syllables
110
+
111
+
112
def sort_stress(phonemes: list[str]):
    """Place the stress mark directly before the first vowel.

    When *phonemes* contains no ``ˈ`` it is returned unchanged. Otherwise
    all ``ˈ`` items are dropped and a single ``ˈ`` is inserted before the
    first item found in ``'aeiou'`` (or at the end when there is none),
    and the result is returned as a new list.
    """
    stress = 'ˈ'
    if stress not in phonemes:
        return phonemes

    without_stress = [p for p in phonemes if p != stress]

    # Default to the end; overridden by the first vowel position
    insert_pos = len(without_stress)
    for pos, p in enumerate(without_stress):
        if p in 'aeiou':
            insert_pos = pos
            break

    without_stress.insert(insert_pos, stress)
    return without_stress
mishkal/variants.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mishkal
2
+
3
class Letter:
    """A single character together with the diacritics attached to it."""

    def __init__(self, char: str, diac: str):
        # Both parts are stored after passing through mishkal.normalize,
        # which operates on strings.
        self.char = mishkal.normalize(char)
        self.diac = mishkal.normalize(diac)

    def __repr__(self):
        return f"[Letter] {self.char}{''.join(self.diac)}"

    def __eq__(self, value: object):
        # Returning NotImplemented lets Python fall back to its default
        # comparison instead of raising AttributeError on foreign objects.
        if not isinstance(value, Letter):
            return NotImplemented
        return value.diac == self.diac and value.char == self.char
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=5.15.0
2
+ num2words
3
+ colorlog
4
+ regex