thewh1teagle committed on
Commit
308923b
·
0 Parent(s):
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Phonemize in Hebrew
3
+ emoji: 🐢
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ uv sync
3
+ uv pip install "gradio>=5.15.0"
4
+ uv run gradio app.py
5
+ """
6
+
7
+ from mishkal import phonemize, normalize
8
+ import gradio as gr
9
+
10
+ default_text = """
11
+ כָּל עֶ֫רֶב יָאִ֫יר (הַשֵּׁ֫ם הַמָּלֵ֫א וּמֽק֫וֹם הָעֲבוֹדָ֫ה שֶׁלּ֫וֹ שְׁמוּרִ֫ים בַּמַּעֲרֶ֫כֶת) רָץ 20 קִילוֹמֶ֫טֶר. הוּא מֽסַפֵּ֫ר לִי שֶׁזֶּ֫ה מֽנַקֶּ֫ה לוֹ אֶת הָרֹ֫אשׁ אַחֲרֵ֫י הָעֲבוֹדָ֫ה, "שָׁעָ֫ה וָחֵ֫צִי בְּלִ֫י עֲבוֹדָ֫ה, אִשָּׁ֫ה וִילָדִ֫ים" כְּמ֫וֹ שֶׁה֫וּא מַגְדִּ֫יר זֹאת. אֲבָ֫ל אַחֲרֵ֫י הַמִּקְלַ֫חַת הוּא מַתְחִ֫יל בּֽמָ֫ה שֶׁנִּתָּ֫ן לֽכַנּ֫וֹת הָעֲבוֹדָ֫ה הַשְּׁנִיָּ֫ה שֶׁלּ֫וֹ: לִמְצֹ֫א לוֹ קוֹלֵ֫גוֹת חֲדָשׁ֫וֹת לָעֲבוֹדָ֫ה, כִּי יָאִ֫יר הוּא כַּנִּרְאֶ֫ה הַמֶּ֫לֶךְ שֶׁל "חָבֵ֫ר מֵבִ֫יא חָבֵ֫ר" בּֽיִשְׂרָאֵ֫ל.
12
+ דֻּגְמָא מַגְנִיבָה: [אנציקלופדיה](/ʔantsikloˈpedja/)
13
+ """
14
+
15
+ theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])
16
+
17
+
18
def on_submit_debug(text: str, predict_stress) -> str:
    """Return the phonemes of *text* (punctuation kept) followed by its normalized form."""
    phonemes = phonemize(
        text,
        preserve_punctuation=True,
        predict_stress=predict_stress,
    )
    normalized_text = normalize(text)
    # Same layout as before: phonemes, blank line, "Normalized:" header, normalized text
    return "\n\nNormalized:\n".join((phonemes, normalized_text))
22
+
23
+
24
def on_submit(text: str, predict_stress) -> str:
    """Return the phonemes of *text* with punctuation stripped."""
    options = {"preserve_punctuation": False, "predict_stress": predict_stress}
    return phonemize(text, **options)
26
+
27
+
28
# Build the demo UI: one text input, two option checkboxes, an output box and a button.
with gr.Blocks(theme=theme) as demo:
    text_input = gr.Textbox(
        value=default_text, label="Text", rtl=True, elem_classes=["input"]
    )
    debug_checkbox = gr.Checkbox(value=False, label="Enable Debug Mode")
    predict_stress_checkbox = gr.Checkbox(value=False, label="Predict Stress")
    phonemes_output = gr.Textbox(label="Phonemes")
    submit_button = gr.Button("Create")

    # The lambda receives the component values in the order given in `inputs`
    # and routes to the debug or the plain handler depending on the checkbox.
    submit_button.click(
        fn=lambda text, debug, stress: on_submit_debug(text, stress) if debug else on_submit(text, stress),
        inputs=[text_input, debug_checkbox, predict_stress_checkbox],
        outputs=[phonemes_output],
    )


# Start the server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
mishkal/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ High level phonemize functions
3
+ """
4
+
5
+ from .phonemize import Phonemizer
6
+ from .utils import normalize # noqa: F401
7
+ from typing import Callable
8
+
9
+ phonemizer = Phonemizer()
10
+
11
+
12
def phonemize(
    text: str,
    preserve_punctuation=True,
    preserve_stress=True,
    use_expander=True,
    use_post_normalize=True,  # For TTS
    predict_stress=True,
    predict_shva_nah=True,
    fallback: Callable[[str], str] = None,
) -> str:
    """
    Phonemize *text* with the module-level ``phonemizer`` instance.

    Every option is forwarded unchanged to ``Phonemizer.phonemize``; only the
    defaults differ (all features are switched on here).
    """
    options = {
        "preserve_punctuation": preserve_punctuation,
        "preserve_stress": preserve_stress,
        "fallback": fallback,
        "use_expander": use_expander,
        "use_post_normalize": use_post_normalize,
        "predict_stress": predict_stress,
        "predict_shva_nah": predict_shva_nah,
    }
    return phonemizer.phonemize(text, **options)
mishkal/data/kamatz_katan.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "כל": "ˈkol",
3
+ "רחבי": "roxˈbi",
4
+ "אמנות": "omaˈnut"
5
+ }
mishkal/data/rashej_tevot.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "צה״ל": "ˈtsahal"
3
+ }
mishkal/data/special.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "וַאלְלָה": "wala",
3
+ "וַסַבִּי": "wasabi"
4
+ }
mishkal/data/symbols.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "₪": "ʃeˈkel",
3
+ "$": "doˈlar"
4
+ }
mishkal/expander/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Expand dates and numbers into words with nikud
3
+ This happens before phonemization
4
+ """
5
+
6
+ from .numbers import num_to_word
7
+ from .dates import date_to_word
8
+ from .time_to_word import time_to_word
9
+ from .dictionary import Dictionary
10
+ from mishkal.log import log
11
+
12
+
13
class Expander:
    """Rewrite dictionary entries, dates, times and numbers as spelled-out words."""

    def __init__(self):
        self.dictionary = Dictionary()

    def expand_text(self, text: str):
        """
        Expand *text* word by word and return the result joined by single spaces.

        The dictionary is applied to the whole text first. Each whitespace
        separated word is then tried as a date, then as a time, then as a
        number; the first converter that changes the word wins. A word whose
        expansion raises is logged and kept as it was (best effort).
        """
        text = self.dictionary.expand_text(text)

        words = []
        for source_word in text.split():
            try:
                word = date_to_word(source_word)
                if word == source_word:
                    word = time_to_word(word)
                if word == source_word:
                    word = num_to_word(word)
            except Exception as e:
                # Log the input word: `word` may be unbound (first converter
                # failed) or left over from a previous iteration.
                log.error(f"Failed to expand {source_word} with error: {e}")
                word = source_word
            words.append(word)
        return " ".join(words)
mishkal/expander/dates.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from .numbers import num_to_word
3
+
4
+ # Mapping of month names in Hebrew with diacritics (Gregorian months)
5
+ MONTHS = {
6
+ 1: "יָנוּאָר",
7
+ 2: "פֶבְרוּאָר",
8
+ 3: "מֵרְץ",
9
+ 4: "אֵפְרִיל",
10
+ 5: "מַאי",
11
+ 6: "יוּנִי",
12
+ 7: "יוּלִי",
13
+ 8: "אוֹגֻסְט",
14
+ 9: "סֶפְּטֶמְבֶּר",
15
+ 10: "אוֹקְטוֹבֶּר",
16
+ 11: "נוֹבֶמְבֶּר",
17
+ 12: "דֶּצֶמְבֶּר",
18
+ }
19
+
20
+ # Mapping of day names in Hebrew with diacritics
21
+ DAYS = {
22
+ 0: "יוֹם רִאשׁוֹן",
23
+ 1: "יוֹם שֵׁנִי",
24
+ 2: "יוֹם שְׁלִישִׁי",
25
+ 3: "יוֹם רֵבִיעִי",
26
+ 4: "יוֹם חֲמִישִׁי",
27
+ 5: "יוֹם שִׁישִׁי",
28
+ 6: "יוֹם שַׁבָּת",
29
+ }
30
+
31
+
32
def date_to_word(word: str, include_day_name=False) -> str:
    """
    Spell out a numeric date as Hebrew words with diacritics.

    Accepted layouts are year-month-day and day-month-year, each written with
    "-", "." or "/" as the separator (e.g. 2024-01-31 or 31.01.2024).
    When *include_day_name* is true the weekday name is prepended.
    Returns the original word if it matches none of the layouts or is not a
    real calendar date.
    """
    date_formats = [
        sep.join(order)
        for order in (("%Y", "%m", "%d"), ("%d", "%m", "%Y"))
        for sep in ("-", ".", "/")
    ]

    for date_format in date_formats:
        # Only the parse decides whether this layout applies; keeping the try
        # this narrow stops a failure in the wording code from being mistaken
        # for "not a date".
        try:
            date_obj = datetime.strptime(word, date_format)
        except ValueError:
            continue

        day = num_to_word(str(date_obj.day))
        month_name = MONTHS[date_obj.month]
        year = num_to_word(str(date_obj.year))

        text = f"{day} בֵּ{month_name} {year}"
        if include_day_name:
            # DAYS is Sunday-first (0 = Sunday) while weekday() is
            # Monday-first (0 = Monday), so shift by one day.
            day_name = DAYS[(date_obj.weekday() + 1) % 7]
            text = f"{day_name}, {text}"
        return text
    return word
mishkal/expander/dictionary.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dictionaries are JSON files mapping a word to its replacement
3
+ """
4
+
5
+ from pathlib import Path
6
+ import json
7
+ import re
8
+ from mishkal.utils import remove_nikud
9
+ from mishkal.utils import normalize
10
+ import unicodedata
11
+
12
+ files = Path(__file__).parent.joinpath("../data").glob("*.json")
13
+ # Sort in reverse order to prioritize the most recent and best
14
+ order = {"bronze": 1, "silver": 2, "gold": 3}
15
+ files = sorted(
16
+ files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
17
+ )
18
+
19
+
20
class Dictionary:
    """Word replacement table merged from the JSON files listed in ``files``."""

    def __init__(self):
        self.dict = {}
        self.load_dictionaries()

    def load_dictionaries(self):
        """Merge every dictionary file into ``self.dict``; later files override earlier ones."""
        for file in files:
            with open(file, "r", encoding="utf-8") as f:
                dictionary: dict = json.load(f)

            # Keys are stored in normalized form; entries with an empty key
            # or an empty value are dropped.
            normalized_dictionary = {}
            for k, v in dictionary.items():
                k = normalize(k)
                if k and v:
                    normalized_dictionary[k] = v
            self.dict.update(normalized_dictionary)

    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
        """
        Regex callback: return the dictionary value for a Hebrew fragment.

        Lookups are tried in this order and the first non-empty hit wins:
        the NFD-decomposed fragment, the fragment with nikud removed, the
        normalized fragment. Note that the nikud-less lookup is attempted even
        when the fragment carries nikud. Without a hit the decomposed fragment
        is returned.
        """
        source: str = unicodedata.normalize("NFD", match.group(0))
        # `or` short-circuits, so the later (costlier) keys are only computed
        # when the earlier lookups miss.
        replacement = (
            self.dict.get(source)
            or self.dict.get(remove_nikud(source))
            or self.dict.get(normalize(source))
        )
        return replacement or source

    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
        """
        Regex callback: expand one whitespace-delimited token.

        Purely numeric tokens are returned untouched. A token that is itself a
        dictionary key is replaced as a whole; otherwise each run of Hebrew
        letters / nikud / apostrophes inside it is looked up separately.
        """
        raw_source: str = match.group(0)
        if raw_source.isnumeric():
            return raw_source

        raw_lookup = self.dict.get(raw_source)
        if raw_lookup:
            return raw_lookup

        # search by only ', space, regular nikud, alphabet
        return re.sub(
            r"[\u05B0-\u05EB ']+", self.replace_hebrew_only_callback, raw_source
        )

    def expand_text(self, text: str) -> str:
        """
        Replace every token of *text* that has a dictionary entry; whitespace is preserved.

        TODO: if key doesn't have diacritics expand even diacritized words
        """
        return re.sub(r"\S+", self.replace_non_whitespace_callback, text)
mishkal/expander/number_names.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
3
+ """
4
+
5
+ # TODO: add nikud hints
6
+
7
+ ZERO = {"אפס": "אֶפֶס"}
8
+
9
+
10
+ ONES = {
11
+ "אחת": "אַחַת",
12
+ "אחד": "אֶחָד",
13
+ "ראשונה": "רִאשׁוֹנָה",
14
+ "ראשון": "רִאשׁוֹן",
15
+ "ראשונות": "רִאשׁוֹנוֹת",
16
+ "ראשונים": "רִאשׁוֹנִים",
17
+ "שתיים": "שְׁתַּיִם",
18
+ "שניים": "שְׁנַיִם",
19
+ "שתי": "שְׁתֵּי",
20
+ "שני": "שְׁנֵי",
21
+ "שנייה": "שְׁנִיָּה",
22
+ "שניות": "שְׁנִיּוֹת",
23
+ "שלוש": "שָׁלוֹשׁ",
24
+ "שלושה": "שְׁלוֹשָׁה",
25
+ "שלושת": "שְׁלוֹשֶׁת",
26
+ "שלישית": "שְׁלִישִׁית",
27
+ "שלישי": "שְׁלִישִׁי",
28
+ "שלישיות": "שְׁלִישִׁיּוֹת",
29
+ "שלישיים": "שְׁלִישִׁיִּים",
30
+ "ארבע": "אַרְבַּע",
31
+ "ארבעה": "אַרְבַּעָה",
32
+ "ארבעת": "אַרְבַּעַת",
33
+ "רביעית": "רֵבִיעִית",
34
+ "רביעי": "רֵבִיעִי",
35
+ "רביעיות": "רֵבִיעִיוֹת",
36
+ "רביעיים": "רֵבִיעִיִּים",
37
+ "חמש": "חָמֵשׁ",
38
+ "חמישה": "חֲמִשָּׁה",
39
+ "חמשת": "חֲמֵשֶׁת",
40
+ "חמישית": "חֲמִישִּׁית",
41
+ "חמישי": "חֲמִישִּׁי",
42
+ "חמישיות": "חֲמִישִּׁיוֹת",
43
+ "חמישיים": "חֲמִישִּׁיִּים",
44
+ "שש": "שֵׁשׁ",
45
+ "שישה": "שִׁשָּׁה",
46
+ "ששת": "שֵׁשֶׁת",
47
+ "שישית": "שִׁשִּׁית",
48
+ "שישי": "שִׁשִּׁי",
49
+ "שישיות": "שִׁשִּׁיוֹת",
50
+ "שישיים": "שִׁשִּׁיִּים",
51
+ "שבע": "שֶׁבַע",
52
+ "שבעה": "שִׁבְעָה",
53
+ "שבעת": "שִׁבְעַת",
54
+ "שביעית": "שְׁבִיעִית",
55
+ "שביעי": "שְׁבִיעִי",
56
+ "שביעיות": "שְׁבִיעִיוֹת",
57
+ "שביעיים": "שְׁבִיעִיִּים",
58
+ "שמונה": "שְׁמוֹנֶה",
59
+ "שמונת": "שְׁמוֹנַת",
60
+ "שמינית": "שְׁמִינִית",
61
+ "שמיני": "שְׁמִינִי",
62
+ "שמיניות": "שְׁמִינִיוֹת",
63
+ "שמיניים": "שְׁמִינִיִּים",
64
+ "תשע": "תֵּשַׁע",
65
+ "תשעה": "תִּשְׁעָה",
66
+ "תשעת": "תִּשְׁעַת",
67
+ "תשיעית": "תְּשִׁיעִית",
68
+ "תשיעי": "תְּשִׁיעִי",
69
+ "תשיעיות": "תְּשִׁיעִיּוֹת",
70
+ "תשיעיים": "תְּשִׁיעִיִּים",
71
+ }
72
+
73
+
74
# Forms of "ten". The letter ש in this root is a sin (dot on the left, U+05C2),
# pronounced "s"; a shin dot (U+05C1) would be read as "ʃ".
TENS = {
    "עשר": "עֶשֶׂר",
    "עשרה": "עֲשָׂרָה",
    "עשרת": "עֲשֶׂרֶת",
    "עשירית": "עֲשִׂירִית",
    "עשירי": "עֲשִׂירִי",
    "עשיריות": "עֲשִׂירִיוֹת",
    "עשיריים": "עֲשִׂירִיִּים",
    "שתים עשרה": "שְׁתֵּים עֶשְׂרֵה",
    "שנים עשר": "שְׁנֵים עָשָׂר",
}
85
+
86
+
87
+ TWENTIES = {
88
+ "עשרים": "עֶשְׂרִ֫ים",
89
+ "שלושים": "שְׁלוֹשִׁים",
90
+ "ארבעים": "אַרְבָּעִים",
91
+ "חמישים": "חֲמִשִּׁים",
92
+ "שישים": "שִׁשִּׁים",
93
+ "שבעים": "שִׁבְעִים",
94
+ "שמונים": "שְׁמוֹנִים",
95
+ "תשעים": "תִּשְׁעִים",
96
+ }
97
+
98
+
99
+ HUNDREDS = {
100
+ "מאה": "מֵאָה",
101
+ "מאת": "מֵאַת",
102
+ "מאתיים": "מָאתַיִם",
103
+ "מאות": "מֵאוֹת",
104
+ }
105
+
106
+ THOUSANDS = {
107
+ "אלף": "אֶלֶף",
108
+ "אלפיים": "אַלְפַּיִם",
109
+ "אלפים": "אֲלָפִים",
110
+ "אלפי": "אַלְפִּי",
111
+ }
112
+
113
+
114
+ LARGE = {
115
+ "מיליון": "מִילְיוֹן",
116
+ "מיליוני": "מִילְיוֹנִי",
117
+ "מיליארד": "מִילְיַארְד",
118
+ "מיליארדי": "מִילְיַארְדִּי",
119
+ "טריליון": "טְרִילְיוֹן",
120
+ "טריליוני": "טְרִילְיוֹנִי",
121
+ "קוודריליון": "קוֹוַדְרִילְיוֹן",
122
+ "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
123
+ "קווינטיליון": "קוִוִּנְטִילְיוֹן",
124
+ "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
125
+ "סקסטיליון": "סְקֶסְטִילְיוֹן",
126
+ "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
127
+ "ספטיליון": "סְפֶּטִילְיוֹן",
128
+ "ספטיליוני": "סְפֶּטִילְיוֹנִי",
129
+ "אוקטיליון": "אוֹקְטִילְיוֹן",
130
+ "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
131
+ "נוניליון": "נוּנִילְיוֹן",
132
+ "נוניליוני": "נוּנִילְיוֹנִי",
133
+ "דסיליון": "דֶּסִילְיוֹן",
134
+ "דסיליוני": "דֶּסִילְיוֹנִי",
135
+ "אונדסיליון": "אוּנְדְסִילְיוֹן",
136
+ "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
137
+ "דואודסיליון": "דוּאודְסִילְיוֹן",
138
+ "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
139
+ "טרדסיליון": "טֶרְדְסִילְיוֹן",
140
+ "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
141
+ "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
142
+ "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
143
+ "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
144
+ "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
145
+ "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
146
+ "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
147
+ "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
148
+ "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
149
+ "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
150
+ "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
151
+ "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
152
+ "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
153
+ "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
154
+ "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
155
+ }
156
+
157
+
158
+ LETTERS = {
159
+ "ו": "וֵ",
160
+ "ה": "הַ",
161
+ }
162
+
163
+
164
+ CURRENCY = {
165
+ "שקל": "שֵׁקֶל",
166
+ "שקלים": "שְׁקָלִים",
167
+ "אגורה": "אֲגוֹרָה",
168
+ "אגורות": "אֲגוֹרוֹת",
169
+ "אירו": "אֵירוֹ",
170
+ "סנט": "סֵנְט",
171
+ "סנטים": "סֵנְטִים",
172
+ "דולר": "דוֹלָר",
173
+ "דולרים": "דוֹלָרִים",
174
+ }
175
+
176
+
177
+ POINTS = {
178
+ "מינוס": "מִינּוּס",
179
+ "נקודה": "נְקֻדָּה",
180
+ }
181
+
182
+ NUMBER_NAMES = {
183
+ **CURRENCY,
184
+ **HUNDREDS,
185
+ **LARGE,
186
+ **LETTERS,
187
+ **ONES,
188
+ **POINTS,
189
+ **TENS,
190
+ **THOUSANDS,
191
+ **TWENTIES,
192
+ **ZERO,
193
+ }
mishkal/expander/numbers.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import num2words
2
+ from .number_names import NUMBER_NAMES
3
+ import re
4
+
5
+
6
def add_diacritics(words: str):
    """
    Replace each whitespace-separated word with its pointed form from NUMBER_NAMES.

    A word that is not in the table but whose remainder after the first letter
    is (a one-letter prefix such as vav) gets the pointed prefix plus the
    pointed remainder. Anything else is kept as written. The result is joined
    with single spaces.
    """
    new_words = []
    for word in words.split():
        pointed = NUMBER_NAMES.get(word)
        if pointed:
            new_words.append(pointed)
            continue

        prefix, stem = word[:1], word[1:]
        pointed_stem = NUMBER_NAMES.get(stem)
        if pointed_stem:
            # A prefix letter with no entry of its own is kept unpointed
            # instead of raising KeyError.
            new_words.append(NUMBER_NAMES.get(prefix, prefix) + pointed_stem)
        else:
            new_words.append(word)
    return " ".join(new_words)
17
+
18
+
19
def num_to_word(maybe_number: str) -> str:
    """
    Replace every run of digits in *maybe_number* with pointed Hebrew number words.

    Text without digits is returned unchanged.
    """
    return re.sub(
        r"\d+",
        lambda digits: add_diacritics(
            num2words.num2words(digits.group(), lang="he", ordinal=False)
        ),
        maybe_number,
    )
mishkal/expander/time_to_word.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Convert time to words
3
+ TODO: fix zeros eg. 22:00
4
+ """
5
+
6
+ import re
7
+
8
# The lookarounds keep a match from starting or ending inside a longer run of
# digits/letters (e.g. "123:456"), and only the real suffixes "am"/"pm" are
# accepted (not "mm", "pp", ...).
PATTERNS = [
    r"(?<!\d)(\d{1,2})(am|pm)(?![a-z])",  # AM/PM format, e.g. 5pm
    r"(?<!\d)(\d{1,2}):(\d{2})(?!\d)",  # HH:MM format, e.g. 22:30
]
12
+
13
+
14
def extract_time(match):
    """
    Regex-substitution callback: turn a matched clock time into Hebrew words.

    Understands ``HH:MM`` and ``<hour>am`` / ``<hour>pm`` (minutes default to
    zero). If the matched text is not a plausible time (hour or minute out of
    range, unknown suffix) the matched text is returned unchanged.
    """
    # Keep the original text: `match` must not be rebound, it is the fallback.
    original = match.group(0)
    time_str = original.lower().strip()

    # Check for HH:MM format
    clock = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if clock:
        h = int(clock.group(1))
        m = int(clock.group(2))
        if h > 24 or m > 59:
            return original
        return f"{convert_to_word(h, m)}"

    # Check for AM/PM format
    meridiem = re.match(r"(\d{1,2})(am|pm)", time_str)
    if meridiem:
        h = int(meridiem.group(1))
        period = meridiem.group(2)
        if not 1 <= h <= 12:
            return original

        # Normalize to 24-hour format
        if period == "am" and h == 12:
            h = 0
        elif period == "pm" and h != 12:
            h += 12
        return f"{convert_to_word(h, 0)}"  # Only the hour is given, so minutes are 0

    return original  # Format not recognized
42
+
43
+
44
def convert_to_word(h, m):
    """
    Spell out the clock time *h*:*m* in pointed Hebrew, using 12-hour wording.

    Hours 0 and 24 are worded as twelve and 13-23 as 1-11. With zero minutes
    only the hour is returned; otherwise the result is
    "<hour> and-<minutes> minutes". Words are separated by single spaces.

    Raises:
        ValueError: if *h* is outside 0-24 or *m* is outside 0-59.
    """
    if not (0 <= h <= 24 and 0 <= m <= 59):
        raise ValueError(f"Invalid time: {h}:{m}")

    # Feminine numerals; index 0 is only reachable as a minutes digit.
    hours = [
        "אֶפֶס",
        "אַחַת",
        "שְׁתַּיִם",
        "שָׁלוֹשׁ",
        "אַרְבַּע",
        "חָמֵשׁ",
        "שֵׁשׁ",
        "שֶׁבַע",
        "שְׁמוֹנֵה",
        "תֵּשַׁע",
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
    ]

    tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]

    ten_to_twenty = [
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
        "שְׁלוֹשׁ עֶשְׂרֵה",
        "אַרְבַּע עֶשְׂרֵה",
        "חֲמֵשׁ עֶשְׂרֵה",
        "שֵׁשׁ עֶשְׂרֵה",
        "שְׁבַע עֶשְׂרֵה",
        "שְׁמוֹנֶה עֶשְׂרֵה",
        "תְּשַׁע עֶשְׂרֵה",
    ]

    vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}

    # Fold the 24-hour value onto the 12-hour dial (0 and 24 -> 12)
    if h == 0:
        h = 12
    elif h > 12:
        h -= 12

    if m == 0:
        return hours[h]

    if m <= 9:
        # "two minutes" takes the construct form "shtey"
        minute_words = [vocab["and"] + (vocab["shtey"] if m == 2 else hours[m])]
    elif m <= 19:
        minute_words = [vocab["and"] + ten_to_twenty[m - 10]]
    else:
        minute_words = [vocab["and"] + tens[m // 10]]
        if m % 10:
            minute_words.append(vocab["and"] + hours[m % 10])

    # Joining a list keeps exactly one space between parts, also when the
    # units part is absent (round tens).
    return " ".join([hours[h], *minute_words, vocab["minutes"]])
101
+
102
+
103
def time_to_word(text: str):
    """Replace every substring of *text* that matches one of PATTERNS using extract_time."""
    combined_pattern = "|".join(PATTERNS)
    return re.sub(combined_pattern, extract_time, text)
mishkal/hebrew.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hebrew Phonemizer
3
+
4
+ Rules implemented:
5
+ 1. Consonant handling (including special cases)
6
+ 2. Nikud (vowel) processing
7
+ 3. Dagesh handling
8
+ 4. Geresh handling
9
+ 5. Shva na prediction
10
+ 6. Special letter combinations
11
+
12
+ Reference:
13
+ - https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
14
+ - https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט/
15
+ - https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
16
+ - https://en.wikipedia.org/wiki/Help:IPA/Hebrew
17
+ - https://he.wikipedia.org/wiki/הברה
18
+ """
19
+
20
+ from mishkal.variants import Letter
21
+ from mishkal import lexicon
22
+ import re
23
+
24
+ SHVA = "\u05b0"
25
+ SIN = "\u05c2"
26
+ PATAH = '\u05b7'
27
+ KAMATZ = '\u05b8'
28
+ HATAF_KAMATZ = '\u05b3'
29
+ DAGESH = "\u05bc"
30
+ HOLAM = "\u05b9"
31
+ HIRIK = "\u05b4"
32
+ PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
33
+ KUBUTS = "\u05bb"
34
+ TSERE = "\u05b5"
35
+
36
def phonemize_hebrew(letters: list[Letter], predict_shva_na: bool) -> list[str]:
    """
    Convert the letters of one word into a flat list of phoneme strings.

    Each letter is passed to ``letter_to_phonemes`` together with its left and
    right neighbours (``None`` at the word edges). That call also returns how
    many following letters it consumed, and those letters are skipped.
    """
    phonemes = []
    i = 0

    while i < len(letters):
        cur = letters[i]
        prev_letter = letters[i - 1] if i > 0 else None
        # Named next_letter so the builtin next() is not shadowed
        next_letter = letters[i + 1] if i < len(letters) - 1 else None

        letter_phonemes, skip_offset = letter_to_phonemes(
            cur, prev_letter, next_letter, predict_shva_na
        )
        phonemes.extend(letter_phonemes)
        i += skip_offset + 1

    return phonemes
50
+
51
+
52
+ def letter_to_phonemes(cur: Letter, prev: Letter | None, next: Letter | None, predict_shva_na: bool):
53
+ cur_phonemes = []
54
+ skip_diacritics = False
55
+ skip_constants = False
56
+ skip_offset = 0
57
+ # revised rules
58
+
59
+ # יַאלְלָה
60
+ if cur.char == "ל" and cur.diac == SHVA and next and next.char == "ל":
61
+ skip_diacritics = True
62
+ skip_constants = True
63
+
64
+ if (
65
+ cur.char == "ו"
66
+ and not prev
67
+ and next
68
+ and not next.diac
69
+ and cur.char + cur.diac == "וַא"
70
+ ):
71
+ skip_offset += 1
72
+ cur_phonemes.append("wa")
73
+
74
+ if cur.char == "א" and not cur.diac and prev:
75
+ if next and next.char != 'ו':
76
+ skip_constants = True
77
+
78
+ # TODO ?
79
+ if cur.char == "י" and next and not cur.diac and prev and prev.char + prev.diac != 'אֵ':
80
+ skip_constants = True
81
+
82
+ if cur.char == "ש" and SIN in cur.diac:
83
+ cur_phonemes.append("s")
84
+ skip_constants = True
85
+
86
+ # shin without nikud after sin = sin
87
+ if cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
88
+ cur_phonemes.append("s")
89
+ skip_constants = True
90
+
91
+ if not next and cur.char == "ח" and PATAH in cur.diac:
92
+ # Final Het gnuva
93
+ cur_phonemes.append("ax")
94
+ skip_diacritics = True
95
+ skip_constants = True
96
+
97
+ if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
98
+ if cur.char == "ת":
99
+ cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
100
+ skip_diacritics = True
101
+ skip_constants = True
102
+ else:
103
+ # Geresh
104
+ cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
105
+ skip_constants = True
106
+
107
+ elif (
108
+ DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES
109
+ ): # dagesh
110
+ cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
111
+ skip_constants = True
112
+ elif cur.char == "ו":
113
+ skip_constants = True
114
+ if next and next.char == "ו" and next.diac == cur.diac:
115
+ # patah and next.diac empty
116
+ if re.search(PATAH_LIKE_PATTERN, cur.diac) and not next.diac:
117
+ cur_phonemes.append("w")
118
+ skip_diacritics = True
119
+ skip_offset += 1
120
+ elif cur.diac == next.diac:
121
+ # double vav
122
+ cur_phonemes.append("wo")
123
+ skip_diacritics = True
124
+ skip_offset += 1
125
+ else:
126
+ # TODO ?
127
+ # skip_consonants = False
128
+ skip_diacritics = False
129
+ else:
130
+ # Single vav
131
+
132
+ # Vav with Patah
133
+ if re.search(PATAH_LIKE_PATTERN, cur.diac):
134
+ cur_phonemes.append("va")
135
+
136
+ # Holam haser
137
+ elif HOLAM in cur.diac:
138
+ cur_phonemes.append("o")
139
+ # Shuruk / Kubutz
140
+ elif KUBUTS in cur.diac or DAGESH in cur.diac:
141
+ cur_phonemes.append("u")
142
+ # Vav with Shva in start
143
+ elif SHVA in cur.diac and not prev:
144
+ cur_phonemes.append("ve")
145
+ # Hirik
146
+ elif HIRIK in cur.diac:
147
+ cur_phonemes.append("vi")
148
+ # Tsere
149
+ elif TSERE in cur.diac:
150
+ cur_phonemes.append("ve")
151
+
152
+ else:
153
+ cur_phonemes.append("v")
154
+ skip_diacritics = True
155
+
156
+ if not skip_constants:
157
+ cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))
158
+
159
+ if predict_shva_na and SHVA in cur.diac and not skip_diacritics and lexicon.SHVA_NA_DIACRITIC not in cur.diac:
160
+ # shva na prediction
161
+ if not prev:
162
+ if cur.char in 'למנרי' or cur.char in 'אהע' or cur.char in 'וכלב':
163
+ cur_phonemes.append("e")
164
+ skip_diacritics = True
165
+ else:
166
+ if next and next.char == cur.char:
167
+ cur_phonemes.append("e")
168
+ skip_diacritics = True
169
+ elif prev and SHVA in prev.diac and cur_phonemes[-1] != 'e':
170
+ cur_phonemes.append("e")
171
+ skip_diacritics = True
172
+
173
+ if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
174
+ cur_phonemes.append('o')
175
+ skip_diacritics = True
176
+
177
+
178
+
179
+ nikud_phonemes = (
180
+ [lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.diac]
181
+ if not skip_diacritics
182
+ else []
183
+ )
184
+ cur_phonemes.extend(nikud_phonemes)
185
+ # Ensure the stress is at the beginning of the syllable
186
+ cur_phonemes.sort(key=lambda x: x != 'ˈ')
187
+ cur_phonemes = [p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)]
188
+ return cur_phonemes, skip_offset
mishkal/lexicon.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ IPA transcription of Hebrew consonants and vowels.
3
+ """
4
+
5
+ # https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
6
+
7
+ MILHEL_PATTERNS = ['יים', 'וע', 'טו', "דיה"] # Used for stress prediction
8
+
9
+ HE_PATTERN = r'[\u05b0-\u05ea\u05ab\u05bd\'"]+'
10
+ HE_NIKUD_PATTERN = r"[\u05B0-\u05C7]"
11
+ PUNCTUATION = r".,!? "
12
+ STRESS = "\u02c8" # visually looks like '
13
+
14
+ GERESH_PHONEMES = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}
15
+
16
+ # Consonants
17
+ LETTERS_PHONEMES = {
18
+ "א": "ʔ", # Alef
19
+ "ב": "v", # Bet
20
+ "ג": "g", # Gimel
21
+ "ד": "d", # Dalet
22
+ "ה": "h", # He
23
+ "ו": "v", # Vav
24
+ "ז": "z", # Zayin
25
+ "ח": "x", # Het
26
+ "ט": "t", # Tet
27
+ "י": "j", # Yod
28
+ "ך": "x", # Haf sofit
29
+ "כ": "x", # Haf
30
+ "ל": "l", # Lamed
31
+ "ם": "m", # Mem Sofit
32
+ "מ": "m", # Mem
33
+ "ן": "n", # Nun Sofit
34
+ "נ": "n", # Nun
35
+ "ס": "s", # Samekh
36
+ "ע": "ʔ", # Ayin, only voweled
37
+ "פ": "f", # Fey
38
+ "ף": "f", # Fey Sofit
39
+ "ץ": "ts", # Tsadik sofit
40
+ "צ": "ts", # Tsadik
41
+ "ק": "k", # Kuf
42
+ "ר": "r", # Resh
43
+ "ש": "ʃ", # Shin
44
+ "ת": "t", # Taf
45
+ # Beged Kefet
46
+ "בּ": "b",
47
+ "כּ": "k",
48
+ "פּ": "p",
49
+ "שׁ": "ʃ",
50
+ "שׂ": "s",
51
+ "'": "",
52
+ }
53
+
54
+ SHVA_NA_DIACRITIC = "\u05bd"
55
+ ATAMAHA_DIACRITIC = "\u05ab"
56
+
57
+ NIKUD_PHONEMES = {
58
+ "\u05b4": "i", # Hiriq
59
+ "\u05b1": "e", # Hataf segol
60
+ "\u05b5": "e", # Tsere
61
+ "\u05b6": "e", # Segol
62
+ "\u05b2": "a", # Hataf Patah
63
+ "\u05b7": "a", # Patah
64
+ "\u05c7": "o", # Kamatz katan
65
+ "\u05b9": "o", # Holam
66
+ "\u05ba": "o", # Holam haser for vav
67
+ "\u05bb": "u", # Qubuts
68
+
69
+ "\u05b3": 'o', # Hataf qamats
70
+ "\u05b8": "a", # Kamataz
71
+
72
+ ATAMAHA_DIACRITIC: "ˈ", # Stress (Atmaha)
73
+ SHVA_NA_DIACRITIC: "e", # Shva na
74
+ }
75
+
76
+ # Deprecated
77
+ DEDUPLICATE = {
78
+ # "\u05b1": "\u05b5", # Hataf Segol -> Tsere
79
+ # "\u05b2": "\u05b7", # Hataf Patah -> Patah
80
+ # "\u05b3": "\u05b9", # Hataf Qamats -> Holam
81
+ # "\u05b6": "\u05b5", # Segol -> Tsere
82
+ # Kamatz -> Patah
83
+ # "\u05b8": "\u05b7", # Qamats -> Patah
84
+ # "\u05c7": "\u05b9", # Qamats Qatan -> Holam
85
+ "\u05f3": "'", # Hebrew geresh to regular geresh
86
+ }
87
+
88
# Every phoneme string that any of the lookup tables above can produce.
# A set literal is enough: sorting before building a set has no effect.
SET_PHONEMES = {
    *NIKUD_PHONEMES.values(),
    *LETTERS_PHONEMES.values(),
    *GERESH_PHONEMES.values(),
}
mishkal/log.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import colorlog
4
+
5
+
6
def _create_logger():
    """
    Create the package logger with colorized output.

    The level is read from the LOG_LEVEL environment variable (case
    insensitive) and falls back to WARNING when it is unset or does not name
    a logging level.
    Usage: LOG_LEVEL=DEBUG python <script.py>
    """

    handler = colorlog.StreamHandler()
    fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
    handler.setFormatter(
        colorlog.ColoredFormatter(
            fmt=fmt,
            log_colors={
                "DEBUG": "blue",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red",
            },
        )
    )
    # Get log level from LOG_LEVEL environment variable
    log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
    level = getattr(logging, log_level, None)
    # getattr can also return a non-level attribute of the logging module
    # (e.g. a format string); only integers are valid levels.
    if not isinstance(level, int):
        level = logging.WARNING
    logger = colorlog.getLogger(__package__)
    logger.setLevel(level=level)
    # Emit through the colorized handler (on the handler's default stream)
    logger.addHandler(handler)
    return logger
33
+
34
+
35
+ log = _create_logger()
mishkal/phonemize.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mishkal import lexicon
2
+ from mishkal.variants import Letter
3
+ from .expander import Expander
4
+ from mishkal.utils import get_letters, normalize, post_normalize, has_vowel, has_constant, remove_nikud, get_syllables
5
+ from typing import Callable
6
+ import regex as re
7
+ from mishkal.hebrew import phonemize_hebrew
8
+
9
+ ADDITIONAL_PHONEMES = set() # When using fallback
10
+
11
class Phonemizer:
    """Turn Hebrew text with nikud into a phoneme string."""

    # TODO: is that enough? what if there's punctuation around? other chars?
    fallback_pattern = r"[a-zA-Z]+"

    def __init__(self):
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation=True,
        preserve_stress=True,
        use_expander=False,
        use_post_normalize=False,  # For TTS
        predict_stress=False,
        predict_shva_nah=False,
        fallback: Callable[[str], str] | None = None,
    ) -> str:
        """
        Phonemize *text* and return the phoneme string.

        Pipeline: normalize -> optional *fallback* for Latin-letter words ->
        optional expander -> per-word Hebrew phonemization -> hyper-phoneme
        links ``[word](/phonemes/)`` replaced by their phonemes -> output
        filters.

        Args:
            text: Input text.
            preserve_punctuation: Keep the characters in ``lexicon.PUNCTUATION``.
            preserve_stress: Keep the stress mark ``lexicon.STRESS``.
            use_expander: Run ``self.expander.expand_text`` before phonemizing.
            use_post_normalize: Apply ``post_normalize`` to every word and drop
                every character that is not a known phoneme, a space, or
                (when *preserve_punctuation* is set) punctuation.
            predict_stress: Add a stress mark to multi-syllable words that have none.
            predict_shva_nah: Forwarded to ``phonemize_hebrew`` as ``predict_shva_na``.
            fallback: Called with each ``[a-zA-Z]+`` word that is not a
                dictionary key; its stripped result replaces the word.

        Note:
            Characters produced by *fallback* or by hyper-phoneme links are
            added to the module-level ``ADDITIONAL_PHONEMES`` set, which is
            shared by all calls.
        """
        # normalize
        text = normalize(text)

        def fallback_replace_callback(match: re.Match):
            word = match.group(0)

            if self.expander.dictionary.dict.get(word):
                # Leave dictionary words for the expander
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            ADDITIONAL_PHONEMES.update(phonemes)
            return phonemes

        if fallback is not None:
            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)

        if use_expander:
            text = self.expander.expand_text(text)

        def heb_replace_callback(match: re.Match):
            word = normalize(match.group(0))
            letters: list[Letter] = get_letters(word)
            phonemes: list[str] = phonemize_hebrew(letters, predict_shva_na=predict_shva_nah)
            syllables = get_syllables(phonemes)

            phonemes_text = "".join(phonemes)
            if predict_stress and lexicon.STRESS not in phonemes_text and syllables and len(syllables) > 1:
                penultimate = remove_nikud(word).endswith(
                    tuple(lexicon.MILHEL_PATTERNS)
                ) or phonemes_text.endswith("ax")
                # Stress goes in front of the second-to-last syllable for the
                # listed endings, otherwise in front of the last one
                target = -2 if penultimate else -1
                syllables[target] = lexicon.STRESS + syllables[target]

            phonemes = "".join(syllables)
            if use_post_normalize:
                phonemes = post_normalize(phonemes)

            return phonemes

        text = re.sub(lexicon.HE_PATTERN, heb_replace_callback, text)

        def hyper_phonemes_callback(match: re.Match):
            """
            Expand hyper phonemes into normal phonemes
            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
            """
            matched_phonemes = match.group(2)  # The phoneme is in the second group
            ADDITIONAL_PHONEMES.update(matched_phonemes)
            return matched_phonemes

        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(i for i in text if i not in [lexicon.STRESS])
        if use_post_normalize:
            # Punctuation is not a phoneme; allow it explicitly so that
            # preserve_punctuation=True is honoured here as well.
            kept_punctuation = lexicon.PUNCTUATION if preserve_punctuation else " "
            text = "".join(
                i
                for i in text
                if i in lexicon.SET_PHONEMES
                or i in ADDITIONAL_PHONEMES
                or i in kept_punctuation
            )

        return text
mishkal/utils.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mishkal import lexicon
2
+ import unicodedata
3
+ import regex as re
4
+ from mishkal.variants import Letter
5
+ import mishkal
6
+
7
def sort_diacritics(match):
    """Regex substitution callback that orders the marks following a letter.

    Group 1 of ``match`` is returned unchanged; the characters of group 2 are
    put in ``sorted`` (code point) order and appended to it.
    """
    base, marks = match.group(1), match.group(2)
    return base + "".join(sorted(marks))
11
+
12
+
13
# Substitutions applied in insertion order by normalize(); each key is a regex
# pattern and each value is a replacement string or callback for re.sub.
NORMALIZE_PATTERNS = {
    # A letter followed by one or more combining marks: sort the marks
    r"(\p{L})(\p{M}+)": sort_diacritics,
    # Hebrew gershayim punctuation -> ASCII double quote
    "״": '"',
    # Hebrew geresh punctuation -> ASCII apostrophe
    "׳": "'",
}
19
+
20
def remove_nikud(text: str):
    """Return ``text`` with every match of ``lexicon.HE_NIKUD_PATTERN`` removed."""
    stripped = re.sub(lexicon.HE_NIKUD_PATTERN, "", text)
    return stripped
22
+
23
+
24
def has_nikud(text: str):
    """Tell whether ``lexicon.HE_NIKUD_PATTERN`` matches anywhere in ``text``."""
    found = re.search(lexicon.HE_NIKUD_PATTERN, text)
    return found is not None
26
+
27
+
28
def normalize(text: str) -> str:
    """
    Bring text into a canonical form.

    Steps, in order:
    1. Decompose the text to Unicode NFD
    2. Apply every substitution in NORMALIZE_PATTERNS
    3. Apply every substitution in lexicon.DEDUPLICATE
    """

    result = unicodedata.normalize("NFD", text)
    # Both tables map a regex pattern to its replacement (string or callback).
    for table in (NORMALIZE_PATTERNS, lexicon.DEDUPLICATE):
        for pattern, replacement in table.items():
            result = re.sub(pattern, replacement, result)
    return result
42
+
43
+
44
def post_normalize(phonemes: str) -> str:
    """Clean up each word of a space-separated phoneme string.

    Every word (split on a single space) is processed independently:

    * a trailing glottal stop ``ʔ`` is removed;
    * a trailing ``h`` is removed, together with a stress mark ``ˈ`` placed
      directly before it;
    * every ``ij`` is collapsed to ``i``.

    Args:
        phonemes: Phoneme text whose words are separated by single spaces.

    Returns:
        The cleaned phoneme text, with the same word separators.
    """
    cleaned = []
    for word in phonemes.split(" "):
        # remove glottal stop from end
        word = re.sub(r"ʔ$", "", word)
        # remove h from end, including a stress mark right before it.
        # This has to be one pattern: stripping a bare "h" first would turn
        # "...ˈh" into "...ˈ" and leave a dangling stress mark behind.
        word = re.sub(r"ˈ?h$", "", word)
        # remove j that directly follows an i
        word = re.sub(r"ij", "i", word)
        cleaned.append(word)
    return " ".join(cleaned)
57
+
58
def get_letters(word: str):
    """Split ``word`` into a list of Letter objects, one per base letter.

    Each Unicode letter is paired with the (possibly empty) run of combining
    marks and ASCII apostrophes that immediately follows it.
    """
    # The apostrophe is part of the character class so that an ASCII geresh
    # stays attached to the letter it follows.
    pairs: list[tuple[str, str]] = re.findall(r"(\p{L})([\p{M}']*)", word)
    return [Letter(char, marks) for char, marks in pairs]
62
+
63
def get_unicode_names(text: str):
    """Return the Unicode name of each character in ``text``.

    Characters without a name in the Unicode database are reported as ``"?"``.
    """
    names = [unicodedata.name(char, "?") for char in text]
    return names
65
+
66
def has_vowel(s: iter):
    """Return True when at least one of ``a e i o u`` is contained in ``s``."""
    vowels = 'aeiou'
    return any(vowel in s for vowel in vowels)
68
+
69
def has_constant(s: iter):
    """Return True when ``s`` has at least one element outside ``a e i o u``.

    Any non-vowel element counts, not only consonant letters. The name
    presumably means "has consonant".
    """
    return not all(item in 'aeiou' for item in s)
71
+
72
+
73
+
74
def get_syllables(phonemes: list[str]) -> list[str]:
    """Group a word's phonemes into syllable strings.

    Phonemes are accumulated left to right. Once the accumulated text
    contains a vowel, the syllable is closed when any of these holds:

    * the next phoneme has no vowel and the one after it has a vowel;
    * the current phoneme is the last one;
    * the next phoneme has a vowel.

    Leftover phonemes that never closed a syllable are returned as a final
    entry. Afterwards, a stress mark found at the end of a syllable is moved
    to the start of the following syllable.
    """
    total = len(phonemes)
    result = []
    pending = ''

    for idx, phoneme in enumerate(phonemes):
        pending += phoneme

        # A syllable can only be closed once it holds a vowel
        if not has_vowel(pending):
            continue

        is_last = idx + 1 >= total
        # Next phoneme has no vowel and the one after it has a vowel
        onset_follows = (
            idx + 2 < total
            and not has_vowel(phonemes[idx + 1])
            and has_vowel(phonemes[idx + 2])
        )
        if onset_follows or is_last or has_vowel(phonemes[idx + 1]):
            result.append(pending)
            pending = ''

    # Keep whatever is left over as a final entry
    if pending:
        result.append(pending)

    # A stress mark at the end of a syllable belongs to the next syllable.
    # The last syllable is skipped because nothing follows it.
    stress = lexicon.STRESS
    for pos in range(len(result) - 1):
        if result[pos].endswith(stress):
            result[pos + 1] = stress + result[pos + 1]
            result[pos] = result[pos][:-len(stress)]

    return result
mishkal/variants.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mishkal
2
+
3
class Letter:
    """A base character together with the diacritics attached to it.

    Attributes:
        char: The base character, after ``mishkal.normalize``.
        diac: The diacritics string, after ``mishkal.normalize``.
    """

    def __init__(self, char: str, diac: str):
        """Store the normalized base character and diacritics.

        Args:
            char: The base character.
            diac: The diacritics that follow ``char`` (may be empty).
        """
        self.char = mishkal.normalize(char)
        self.diac = mishkal.normalize(diac)

    def __repr__(self):
        return f"[Letter] {self.char}{''.join(self.diac)}"

    def __eq__(self, value: object):
        """Two letters are equal when both ``char`` and ``diac`` match."""
        # Let Python fall back to its default handling for foreign types
        # instead of raising AttributeError on e.g. ``letter == None``.
        if not isinstance(value, Letter):
            return NotImplemented
        return value.diac == self.diac and value.char == self.char

    # Overriding __eq__ without __hash__ already makes instances unhashable;
    # spelled out here so the behavior is visible.
    __hash__ = None
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=5.15.0
2
+ num2words
3
+ colorlog
4
+ regex