thewh1teagle commited on
Commit
11e61f2
·
0 Parent(s):
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Phonemize in Hebrew
3
+ emoji: 🐢
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ uv sync
3
+ uv pip install "gradio>=5.15.0"
4
+ uv run gradio examples/editor.py
5
+ """
6
+
7
+ from mishkal import phonemize, normalize
8
+ import gradio as gr
9
+
10
+ default_text = """
11
+ כָּל עֶרֶב יָאִיר (הַשֵּׁם הַמָּלֵא וּמְקוֹם הָעֲבוֹדָה שֶׁלּוֹ שְׁמוּרִים בַּמַּעֲרֶכֶת) רָץ 20 קִילוֹמֶטֶר. הוּא מְסַפֵּר לִי שֶׁזֶּה מְנַקֶּה לוֹ אֶת הָרֹאשׁ אַחֲרֵי הָעֲבוֹדָה, "שָׁעָה וָחֵצִי בְּלִי עֲבוֹדָה, אִשָּׁה וִילָדִים" כְּמוֹ שֶׁהוּא מַגְדִּיר זֹאת. אֲבָל אַחֲרֵי הַמִּקְלַחַת הוּא מַתְחִיל בְּמָה שֶׁנִּתָּן לְכַנּוֹת הָעֲבוֹדָה הַשְּׁנִיָּה שֶׁלּוֹ: לִמְצֹא לוֹ קוֹלֵגוֹת חֲדָשׁוֹת לָעֲבוֹדָה, כִּי יָאִיר הוּא כַּנִּרְאֶה הַמֶּלֶךְ שֶׁל "חָבֵר מֵבִיא חָבֵר" בְּיִשְׂרָאֵל.
12
+ """
13
+
14
+ theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Roboto")])
15
+
16
+
17
def on_submit_debug(text: str) -> str:
    """
    Phonemize *text* (keeping punctuation) and append its normalized form.

    The result is the phonemes, a blank line, a "Normalized:" header and
    then the output of ``normalize(text)``.
    """
    pieces = [
        phonemize(text, preserve_punctuation=True),
        "\n\nNormalized:\n",
        normalize(text),
    ]
    return "".join(pieces)
21
+
22
+
23
def on_submit(text: str) -> str:
    """Phonemize *text* with punctuation stripped from the output."""
    phonemes = phonemize(text, preserve_punctuation=False)
    return phonemes
25
+
26
+
27
# Build the Gradio UI: a right-to-left text box pre-filled with the sample
# text, a debug checkbox, an output box and a submit button.
with gr.Blocks(theme=theme) as demo:
    text_input = gr.Textbox(
        value=default_text, label="Text", rtl=True, elem_classes=["input"]
    )
    checkbox = gr.Checkbox(value=False, label="Enable Debug Mode")
    phonemes_output = gr.Textbox(label="Phonemes")
    submit_button = gr.Button("Create")

    # The checkbox selects the handler: debug mode also shows the normalized
    # input, regular mode returns phonemes without punctuation.
    submit_button.click(
        fn=lambda text, debug: on_submit_debug(text) if debug else on_submit(text),
        inputs=[text_input, checkbox],
        outputs=[phonemes_output],
    )


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
mishkal/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ High level phonemize functions
3
+ """
4
+
5
+ from .phonemize import Phonemizer
6
+ from .utils import normalize # noqa: F401
7
+ from typing import Callable
8
+
9
+ phonemizer = Phonemizer()
10
+
11
+
12
def phonemize(
    text: str,
    preserve_punctuation=True,
    preserve_stress=True,
    use_expander=False,
    use_post_normalize=False,  # For TTS
    fallback: Callable[[str], str] = None,
) -> str:
    """
    Convert *text* to phonemes using the module-level ``phonemizer``.

    Every option is forwarded unchanged to ``Phonemizer.phonemize``;
    ``fallback`` is an optional callable applied to Latin-letter words.
    """
    options = {
        "preserve_punctuation": preserve_punctuation,
        "preserve_stress": preserve_stress,
        "fallback": fallback,
        "use_expander": use_expander,
        "use_post_normalize": use_post_normalize,
    }
    return phonemizer.phonemize(text, **options)
mishkal/data/kamatz_katan.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "כל": "ˈkol",
3
+ "רחבי": "roxˈbi",
4
+ "אמנות": "omaˈnut"
5
+ }
mishkal/data/rashej_tevot.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "צה״ל": "ˈtsahal"
3
+ }
mishkal/data/symbols.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "₪": "ʃeˈkel",
3
+ "$": "doˈlar"
4
+ }
mishkal/expander/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Expand dates and numbers into words with niqqud
3
+ This happens before phonemization
4
+ """
5
+
6
+ from .numbers import num_to_word
7
+ from .dates import date_to_word
8
+ from .time_to_word import time_to_word
9
+ from .dictionary import Dictionary
10
+ from mishkal.log import log
11
+
12
+
13
class Expander:
    """
    Expand dictionary entries, dates, times and numbers into words.

    Expansion runs word by word on whitespace-separated tokens, so the
    returned text always has single spaces between words.
    """

    def __init__(self):
        self.dictionary = Dictionary()

    def expand_text(self, text: str) -> str:
        """
        Return *text* with every expandable token replaced.

        Each token is tried as a date first, then as a time, then as a
        number; the first expander that changes the token wins. A token
        whose expansion raises is logged and kept unchanged.
        """
        text = self.dictionary.expand_text(text)

        words = []
        for source_word in text.split():
            try:
                word = date_to_word(source_word)
                if word == source_word:
                    word = time_to_word(word)
                if word == source_word:
                    word = num_to_word(word)
                words.append(word)
            except Exception as e:
                # Log the token that actually failed. The expanded `word` may
                # be unbound here (failure on the first token) or left over
                # from the previous iteration.
                log.error(f"Failed to expand {source_word} with error: {e}")
                words.append(source_word)
        return " ".join(words)
mishkal/expander/dates.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from .numbers import num_to_word
3
+
4
+ # Mapping of month names in Hebrew with diacritics (Gregorian months)
5
+ MONTHS = {
6
+ 1: "יָנוּאָר",
7
+ 2: "פֶבְרוּאָר",
8
+ 3: "מֵרְץ",
9
+ 4: "אֵפְרִיל",
10
+ 5: "מַאי",
11
+ 6: "יוּנִי",
12
+ 7: "יוּלִי",
13
+ 8: "אוֹגֻסְט",
14
+ 9: "סֶפְּטֶמְבֶּר",
15
+ 10: "אוֹקְטוֹבֶּר",
16
+ 11: "נוֹבֶמְבֶּר",
17
+ 12: "דֶּצֶמְבֶּר",
18
+ }
19
+
20
+ # Mapping of day names in Hebrew with diacritics
21
+ DAYS = {
22
+ 0: "יוֹם רִאשׁוֹן",
23
+ 1: "יוֹם שֵׁנִי",
24
+ 2: "יוֹם שְׁלִישִׁי",
25
+ 3: "יוֹם רֵבִיעִי",
26
+ 4: "יוֹם חֲמִישִׁי",
27
+ 5: "יוֹם שִׁישִׁי",
28
+ 6: "יוֹם שַׁבָּת",
29
+ }
30
+
31
+
32
def date_to_word(word: str, include_day_name=False) -> str:
    """
    Convert a numeric date string to Hebrew words with diacritics.

    Accepted layouts are year-month-day and day-month-year, each separated
    by "-", "." or "/" (for example 2024-05-01 or 01/05/2024).

    Args:
        word: The candidate date string.
        include_day_name: When true, prefix the result with the weekday name.

    Returns:
        The spelled-out date, or *word* unchanged if it is not a valid date
        in one of the accepted layouts.
    """
    separators = ["-", ".", "/"]
    orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
    date_formats = [sep.join(order) for order in orders for sep in separators]

    for date_format in date_formats:
        try:
            # Only the parse is expected to fail; keep the try body minimal.
            date_obj = datetime.strptime(word, date_format)
        except ValueError:
            continue

        # Convert month to Hebrew name with diacritics
        month_name = MONTHS[date_obj.month]
        day = num_to_word(str(date_obj.day))
        year = num_to_word(str(date_obj.year))

        text = f"{day} בֵּ{month_name} {year}"
        if include_day_name:
            # datetime.weekday() counts from Monday = 0, while DAYS is keyed
            # from Sunday = 0, so shift by one day.
            day_name = DAYS[(date_obj.weekday() + 1) % 7]
            text = f"{day_name}, {text}"
        return text
    return word
mishkal/expander/dictionary.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dictionaries are JSON files that map a source word to its replacement
3
+ """
4
+
5
+ from pathlib import Path
6
+ import json
7
+ import re
8
+ from mishkal.utils import remove_niqqud
9
+ from mishkal.utils import normalize
10
+ import unicodedata
11
+
12
+ files = Path(__file__).parent.joinpath("../data").glob("*.json")
13
+ # Sort in reverse order to prioritize the most recent and best
14
+ order = {"bronze": 1, "silver": 2, "gold": 3}
15
+ files = sorted(
16
+ files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
17
+ )
18
+
19
+
20
class Dictionary:
    """
    In-memory word replacement table built from the JSON data files.

    Keys are stored in normalized form; lookups try progressively looser
    spellings of the input word.
    """

    def __init__(self):
        self.dict = {}
        self.load_dictionaries()

    def load_dictionaries(self):
        """Merge every data file into ``self.dict``; later files override earlier ones."""
        for path in files:
            with open(path, "r", encoding="utf-8") as handle:
                entries: dict = json.load(handle)

            # Normalize the keys and drop entries with an empty key or value.
            pairs = ((normalize(key), value) for key, value in entries.items())
            self.dict.update({key: value for key, value in pairs if key and value})

    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
        """
        Look up the Hebrew part of a token.

        Candidates are tried in order: the NFD-decomposed text, the text
        with niqqud removed, then the fully normalized text. The decomposed
        text is returned when none of them has an entry.
        """
        source: str = unicodedata.normalize("NFD", match.group(0))
        candidates = (source, remove_niqqud(source), normalize(source))
        for candidate in candidates:
            replacement = self.dict.get(candidate)
            if replacement:
                return replacement
        return source

    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
        """
        Replace one whitespace-delimited token.

        Numeric tokens are returned untouched. Otherwise an exact entry wins;
        failing that, each run of Hebrew letters, niqqud, apostrophes and
        spaces inside the token is looked up on its own.
        """
        token: str = match.group(0)
        if token.isnumeric():
            return token

        exact = self.dict.get(token)
        if exact:
            return exact
        # search by only ', space, regular niqqud, alphabet
        return re.sub(r"[\u05B0-\u05EB ']+", self.replace_hebrew_only_callback, token)

    def expand_text(self, text: str) -> str:
        """
        Apply the dictionary to every non-whitespace token of *text*.

        TODO: if key doesn't have diacritics expand even diacritized words
        """
        return re.sub(r"\S+", self.replace_non_whitespace_callback, text)
mishkal/expander/number_names.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
3
+ """
4
+
5
+ # TODO: add niqqud hints
6
+
7
+ ZERO = {"אפס": "אֶפֶס"}
8
+
9
+
10
+ ONES = {
11
+ "אחת": "אַחַת",
12
+ "אחד": "אֶחָד",
13
+ "ראשונה": "רִאשׁוֹנָה",
14
+ "ראשון": "רִאשׁוֹן",
15
+ "ראשונות": "רִאשׁוֹנוֹת",
16
+ "ראשונים": "רִאשׁוֹנִים",
17
+ "שתיים": "שְׁתַּיִם",
18
+ "שניים": "שְׁנַיִם",
19
+ "שתי": "שְׁתֵּי",
20
+ "שני": "שְׁנֵי",
21
+ "שנייה": "שְׁנִיָּה",
22
+ "שניות": "שְׁנִיּוֹת",
23
+ "שלוש": "שָׁלוֹשׁ",
24
+ "שלושה": "שְׁלוֹשָׁה",
25
+ "שלושת": "שְׁלוֹשֶׁת",
26
+ "שלישית": "שְׁלִישִׁית",
27
+ "שלישי": "שְׁלִישִׁי",
28
+ "שלישיות": "שְׁלִישִׁיּוֹת",
29
+ "שלישיים": "שְׁלִישִׁיִּים",
30
+ "ארבע": "אַרְבַּע",
31
+ "ארבעה": "אַרְבַּעָה",
32
+ "ארבעת": "אַרְבַּעַת",
33
+ "רביעית": "רֵבִיעִית",
34
+ "רביעי": "רֵבִיעִי",
35
+ "רביעיות": "רֵבִיעִיוֹת",
36
+ "רביעיים": "רֵבִיעִיִּים",
37
+ "חמש": "חָמֵשׁ",
38
+ "חמישה": "חֲמִשָּׁה",
39
+ "חמשת": "חֲמֵשֶׁת",
40
+ "חמישית": "חֲמִישִּׁית",
41
+ "חמישי": "חֲמִישִּׁי",
42
+ "חמישיות": "חֲמִישִּׁיוֹת",
43
+ "חמישיים": "חֲמִישִּׁיִּים",
44
+ "שש": "שֵׁשׁ",
45
+ "שישה": "שִׁשָּׁה",
46
+ "ששת": "שֵׁשֶׁת",
47
+ "שישית": "שִׁשִּׁית",
48
+ "שישי": "שִׁשִּׁי",
49
+ "שישיות": "שִׁשִּׁיוֹת",
50
+ "שישיים": "שִׁשִּׁיִּים",
51
+ "שבע": "שֶׁבַע",
52
+ "שבעה": "שִׁבְעָה",
53
+ "שבעת": "שִׁבְעַת",
54
+ "שביעית": "שְׁבִיעִית",
55
+ "שביעי": "שְׁבִיעִי",
56
+ "שביעיות": "שְׁבִיעִיוֹת",
57
+ "שביעיים": "שְׁבִיעִיִּים",
58
+ "שמונה": "שְׁמוֹנֶה",
59
+ "שמונת": "שְׁמוֹנַת",
60
+ "שמינית": "שְׁמִינִית",
61
+ "שמיני": "שְׁמִינִי",
62
+ "שמיניות": "שְׁמִינִיוֹת",
63
+ "שמיניים": "שְׁמִינִיִּים",
64
+ "תשע": "תֵּשַׁע",
65
+ "תשעה": "תִּשְׁעָה",
66
+ "תשעת": "תִּשְׁעַת",
67
+ "תשיעית": "תְּשִׁיעִית",
68
+ "תשיעי": "תְּשִׁיעִי",
69
+ "תשיעיות": "תְּשִׁיעִיּוֹת",
70
+ "תשיעיים": "תְּשִׁיעִיִּים",
71
+ }
72
+
73
+
74
+ TENS = {
75
+ "עשר": "עֶשֶׂר",
76
+ "עשרה": "עֲשָׁרָה",
77
+ "עשרת": "עֲשֶׁרֶת",
78
+ "עשירית": "עֲשִׁירִית",
79
+ "עשירי": "עֲשִׁירִי",
80
+ "עשיריות": "עֲשִׁירִיוֹת",
81
+ "עשיריים": "עֲשִׁירִיִּים",
82
+ "שתים עשרה": "שְׁתֵּים עֶשְׂרֵה",
83
+ "שנים עשר": "שְׁנֵים עָשָׂר",
84
+ }
85
+
86
+
87
+ TWENTIES = {
88
+ "עשרים": "עֶשְׂרִ֫ים",
89
+ "שלושים": "שְׁלוֹשִׁים",
90
+ "ארבעים": "אַרְבָּעִים",
91
+ "חמישים": "חֲמִשִּׁים",
92
+ "שישים": "שִׁשִּׁים",
93
+ "שבעים": "שִׁבְעִים",
94
+ "שמונים": "שְׁמוֹנִים",
95
+ "תשעים": "תִּשְׁעִים",
96
+ }
97
+
98
+
99
+ HUNDREDS = {
100
+ "מאה": "מֵאָה",
101
+ "מאת": "מֵאַת",
102
+ "מאתיים": "מָאתַיִם",
103
+ "מאות": "מֵאוֹת",
104
+ }
105
+
106
+ THOUSANDS = {
107
+ "אלף": "אֶלֶף",
108
+ "אלפיים": "אַלְפַּיִם",
109
+ "אלפים": "אֲלָפִים",
110
+ "אלפי": "אַלְפִּי",
111
+ }
112
+
113
+
114
+ LARGE = {
115
+ "מיליון": "מִילְיוֹן",
116
+ "מיליוני": "מִילְיוֹנִי",
117
+ "מיליארד": "מִילְיַארְד",
118
+ "מיליארדי": "מִילְיַארְדִּי",
119
+ "טריליון": "טְרִילְיוֹן",
120
+ "טריליוני": "טְרִילְיוֹנִי",
121
+ "קוודריליון": "קוֹוַדְרִילְיוֹן",
122
+ "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
123
+ "קווינטיליון": "קוִוִּנְטִילְיוֹן",
124
+ "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
125
+ "סקסטיליון": "סְקֶסְטִילְיוֹן",
126
+ "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
127
+ "ספטיליון": "סְפֶּטִילְיוֹן",
128
+ "ספטיליוני": "סְפֶּטִילְיוֹנִי",
129
+ "אוקטיליון": "אוֹקְטִילְיוֹן",
130
+ "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
131
+ "נוניליון": "נוּנִילְיוֹן",
132
+ "נוניליוני": "נוּנִילְיוֹנִי",
133
+ "דסיליון": "דֶּסִילְיוֹן",
134
+ "דסיליוני": "דֶּסִילְיוֹנִי",
135
+ "אונדסיליון": "אוּנְדְסִילְיוֹן",
136
+ "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
137
+ "דואודסיליון": "דוּאודְסִילְיוֹן",
138
+ "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
139
+ "טרדסיליון": "טֶרְדְסִילְיוֹן",
140
+ "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
141
+ "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
142
+ "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
143
+ "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
144
+ "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
145
+ "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
146
+ "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
147
+ "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
148
+ "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
149
+ "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
150
+ "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
151
+ "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
152
+ "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
153
+ "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
154
+ "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
155
+ }
156
+
157
+
158
+ LETTERS = {
159
+ "ו": "וֵ",
160
+ "ה": "הַ",
161
+ }
162
+
163
+
164
+ CURRENCY = {
165
+ "שקל": "שֵׁקֶל",
166
+ "שקלים": "שְׁקָלִים",
167
+ "אגורה": "אֲגוֹרָה",
168
+ "אגורות": "אֲגוֹרוֹת",
169
+ "אירו": "אֵירוֹ",
170
+ "סנט": "סֵנְט",
171
+ "סנטים": "סֵנְטִים",
172
+ "דולר": "דוֹלָר",
173
+ "דולרים": "דוֹלָרִים",
174
+ }
175
+
176
+
177
+ POINTS = {
178
+ "מינוס": "מִינּוּס",
179
+ "נקודה": "נְקֻדָּה",
180
+ }
181
+
182
+ NUMBER_NAMES = {
183
+ **CURRENCY,
184
+ **HUNDREDS,
185
+ **LARGE,
186
+ **LETTERS,
187
+ **ONES,
188
+ **POINTS,
189
+ **TENS,
190
+ **THOUSANDS,
191
+ **TWENTIES,
192
+ **ZERO,
193
+ }
mishkal/expander/numbers.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import num2words
2
+ from .number_names import NUMBER_NAMES
3
+ import re
4
+
5
+
6
def add_diacritics(words: str):
    """
    Add niqqud to each whitespace-separated number word in *words*.

    A word found in ``NUMBER_NAMES`` is replaced by its diacritized form.
    A word whose remainder after the first letter is a known number name is
    treated as a one-letter prefix plus that name. Anything else is kept
    unchanged.
    """
    new_words = []
    for word in words.split():
        if NUMBER_NAMES.get(word):
            new_words.append(NUMBER_NAMES[word])
        elif NUMBER_NAMES.get(word[1:]):
            # One-letter prefix (e.g. Vav). Prefixes without a diacritized
            # form in the table are kept as written instead of raising
            # KeyError.
            prefix = NUMBER_NAMES.get(word[0], word[0])
            new_words.append(prefix + NUMBER_NAMES[word[1:]])
        else:
            new_words.append(word)
    return " ".join(new_words)
17
+
18
+
19
def num_to_word(maybe_number: str) -> str:
    """
    Spell out every run of digits in *maybe_number* as Hebrew words.

    Each digit run is converted with ``num2words`` (Hebrew, cardinal) and
    passed through ``add_diacritics``; the rest of the string is untouched.
    """

    def _spell_out(digits):
        spelled = num2words.num2words(digits.group(), lang="he", ordinal=False)
        return add_diacritics(spelled)

    # Replace all whole numbers in the string
    return re.sub(r"\d+", _spell_out, maybe_number)
mishkal/expander/time_to_word.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Convert time to words
3
+ TODO: fix zeros eg. 22:00
4
+ """
5
+
6
+ import re
7
+
8
# Regular expressions for the time formats that get expanded.
# The suffix must be exactly "am" or "pm"; a character class such as
# [apm]{2} would also accept "mm" and turn "5mm" into a time.
PATTERNS = [
    r"(\d{1,2})(am|pm)",  # AM/PM format
    r"(\d{1,2}):(\d{2})",  # HH:MM format
]
12
+
13
+
14
def extract_time(match):
    """
    Convert a matched time (HH:MM or <hour>am / <hour>pm) to Hebrew words.

    Args:
        match: A regex match whose full text is the candidate time.

    Returns:
        The spelled-out time, or the matched text unchanged when it is not
        a recognizable time (unknown suffix, hour or minute out of range).
    """
    # Keep the matched text: the local match objects below must not shadow
    # it, otherwise the fallback at the end has nothing to return.
    original = match.group(0)
    time_str = original.lower().strip()

    # Check for HH:MM format
    clock = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if clock:
        h = int(clock.group(1))
        m = int(clock.group(2))
        if h > 24 or m > 59:
            return original
        return f"{convert_to_word(h, m)}"

    # Check for AM/PM format
    meridiem = re.match(r"(\d{1,2})(am|pm)", time_str)
    if meridiem:
        h = int(meridiem.group(1))
        period = meridiem.group(2)
        if h > 12:
            return original

        # Normalize to 24-hour format
        if period == "am" and h == 12:
            h = 0
        elif period == "pm" and h != 12:
            h += 12
        # Defaulting to 0 minutes when only hour is provided
        return f"{convert_to_word(h, 0)}"

    # Return original text if the format is not recognized
    return original
42
+
43
+
44
def convert_to_word(h, m):
    """
    Spell out a clock time in Hebrew with diacritics.

    The hour is read on a 12-hour clock (0 and 24 are read as twelve).
    Zero minutes yields the hour alone; otherwise the minutes follow the
    hour, joined with "and" and ending with the word for "minutes".

    Args:
        h: Hour, 0 to 24.
        m: Minute, 0 to 59.

    Raises:
        ValueError: If *h* or *m* is outside the ranges above.
    """
    if not (0 <= h <= 24 and 0 <= m <= 59):
        raise ValueError(f"time out of range: {h}:{m}")

    hours = [
        "אֶפֶס",
        "אַחַת",
        "שְׁנַיִם",  # Swapped for the "shtey" form below when it counts minutes
        "שָׁלוֹשׁ",
        "אַרְבַּע",
        "חָמֵשׁ",
        "שֵׁשׁ",
        "שֶׁבַע",
        "שְׁמוֹנֵה",
        "תֵּשַׁע",
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
    ]

    tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]

    ten_to_twenty = [
        "עֵשֵׂר",
        "אַחַת עֶשְׂרֵה",
        "שְׁתֵּים עֶשְׂרֵה",
        "שְׁלוֹשׁ עֶשְׂרֵה",
        "אַרְבַּע עֶשְׂרֵה",
        "חֲמֵשׁ עֶשְׂרֵה",
        "שֵׁשׁ עֶשְׂרֵה",
        "שְׁבַע עֶשְׂרֵה",
        "שְׁמוֹנֶה עֶשְׂרֵה",
        "תְּשַׁע עֶשְׂרֵה",
    ]

    vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}

    # Convert 0 hours to 12 (midnight) and fold afternoon hours to 1-12
    if h == 0:
        h = 12
    elif h > 12:
        h -= 12

    hour_word = hours[h]
    if m == 0:
        return hour_word

    if m <= 9:
        # Two minutes uses the "shtey" form instead of the plain number
        minute_word = vocab["shtey"] if m == 2 else hours[m]
        minute_parts = [f"{vocab['and']}{minute_word}"]
    elif m <= 19:
        minute_parts = [f"{vocab['and']}{ten_to_twenty[m - 10]}"]
    else:
        minute_parts = [f"{vocab['and']}{tens[m // 10]}"]
        if m % 10 != 0:
            minute_parts.append(f"{vocab['and']}{hours[m % 10]}")

    # Join only the parts that exist so round tens (e.g. :20) do not leave a
    # double space where the units would have been.
    return " ".join([hour_word, *minute_parts, vocab["minutes"]])
101
+
102
+
103
def time_to_word(text: str):
    """Replace every substring of *text* matching one of ``PATTERNS`` via ``extract_time``."""
    any_time = "|".join(PATTERNS)
    return re.sub(any_time, extract_time, text)
mishkal/lexicon.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ASCII IPA transcription of Hebrew consonants and vowels.
3
+ """
4
+
5
+ # https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
6
+ HE_CHARS_PATTERN = (
7
+ r"\b[\u05B0-\u05EA\u05F3\u0027]+\b" # Chars including niqqud, geresh and en_geresh
8
+ )
9
+ HE_NIQQUD_PATTERN = r"[\u05B0-\u05C7]"
10
+ PUNCTUATION = r".,!? "
11
+
12
+ # Special
13
+ GIMEL_OR_ZAIN_WITH_DAGESH = "dʒ"
14
+ TSADIK_WITH_DAGESH = "tʃ"
15
+ SHIN_WITH_POINT = "ʃ"
16
+ SIN_WITH_POINT = "s"
17
+ STRESS = "\u02c8" # visually looks like '
18
+ SECONDARY_STRESS = "\u02cc"
19
+ HET_GNUVA = "ax"
20
+ W_AS_WALLA = "w"
21
+
22
+ GERESH_LETTERS = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}
23
+
24
+ LETTERS_NAMES_PHONEMES = {
25
+ "א": "alef", # Alef, glottal stop
26
+ "ב": "bet", # Bet
27
+ "ג": "gimel", # Gimel
28
+ "ד": "dalet", # Dalet
29
+ "ה": "hej", # He
30
+ "ו": "vav", # Vav
31
+ "ז": "zajin", # Zayin
32
+ "ח": "xet", # Het
33
+ "ט": "tet", # Tet
34
+ "י": "jud", # Yod
35
+ "ך": "xaf sofit", # Haf sofit
36
+ "כ": "xaf", # Haf
37
+ "ל": "lamed", # Lamed
38
+ "ם": "mem sofit", # Mem Sofit
39
+ "מ": "mem", # Mem
40
+ "ן": "nun sofit", # Nun Sofit
41
+ "נ": "nun", # Nun
42
+ "ס": "samex", # Samekh
43
+ "ע": "ajin", # Ayin, glottal stop
44
+ "פ": "fey", # Fey
45
+ "ף": "fey sofit", # Fey Sofit
46
+ "ץ": "tsadik sofit", # Tsadik sofit
47
+ "צ": "tsadik", # Tsadik
48
+ "ק": "kuf", # Kuf
49
+ "ר": "rejiʃ", # Resh
50
+ "ש": "ʃin", # Shin
51
+ "ת": "taf", # Taf
52
+ }
53
+
54
+ # Consonants
55
+ LETTERS_PHONEMES = {
56
+ "א": "ʔ", # Alef
57
+ "ב": "v", # Bet
58
+ "ג": "g", # Gimel
59
+ "ד": "d", # Dalet
60
+ "ה": "h", # He
61
+ "ו": "v", # Vav
62
+ "ז": "z", # Zayin
63
+ "ח": "x", # Het
64
+ "ט": "t", # Tet
65
+ "י": "j", # Yod
66
+ "ך": "x", # Haf sofit
67
+ "כ": "x", # Haf
68
+ "ל": "l", # Lamed
69
+ "ם": "m", # Mem Sofit
70
+ "מ": "m", # Mem
71
+ "ן": "n", # Nun Sofit
72
+ "נ": "n", # Nun
73
+ "ס": "s", # Samekh
74
+ "ע": "ʔ", # Ayin, only voweled
75
+ "פ": "f", # Fey
76
+ "ף": "f", # Fey Sofit
77
+ "ץ": "ts", # Tsadik sofit
78
+ "צ": "ts", # Tsadik
79
+ "ק": "k", # Kuf
80
+ "ר": "r", # Resh
81
+ "ש": "ʃ", # Shin
82
+ "ת": "t", # Taf
83
+ # Beged Kefet
84
+ "בּ": "b",
85
+ "כּ": "k",
86
+ "פּ": "p",
87
+ "שׁ": "ʃ",
88
+ "שׂ": "s",
89
+ "'": "",
90
+ }
91
+
92
+ # Vowels
93
+ VOWEL_A = "a"
94
+ VOWEL_E = "e"
95
+ VOWEL_I = "i"
96
+ VOWEL_O = "o"
97
+ VOWEL_U = "u"
98
+
99
+ NIQQUD_PHONEMES = {
100
+ "\u05b4": "i", # Hiriq
101
+ "\u05b5": "e", # Tsere
102
+ "\u05b7": "a", # Patah
103
+ "\u05b9": "o", # Holam
104
+ "\u05ba": "o", # Holam haser for vav
105
+ "\u05bb": "u", # Qubuts
106
+ "\u05ab": "ˈ", # Stress (Atmaha)
107
+ "\u05bd": "e", # Shva na
108
+ }
109
+
110
+ SET_LETTER_SYMBOLS = {
111
+ "\u05b0", # Shva
112
+ "\u05b4", # Hiriq
113
+ "\u05b5", # Tsere
114
+ "\u05b7", # Patah
115
+ "\u05b9", # Holam
116
+ "\u05ba", # Holam haser for vav
117
+ "\u05bb", # Qubuts
118
+ "\u05bc", # Dagesh
119
+ "\u05c1", # Shin dot
120
+ "\u05c2", # Sin dot
121
+ "'", # Geresh
122
+ }
123
+
124
+ """
125
+ We're left with the following niqqud (10):
126
+ Shva, Hiriq, Tsere, Patah, Holam, Qubuts, Dagesh,
127
+ Holam haser for vav, Shin dot, Sin dot
128
+ """
129
+ NIQQUD_DEDUPLICATE = {
130
+ "\u05b1": "\u05b5", # Hataf Segol -> Tsere
131
+ "\u05b2": "\u05b7", # Hataf Patah -> Patah
132
+ "\u05b3": "\u05b9", # Hataf Qamats -> Holam
133
+ "\u05b6": "\u05b5", # Segol -> Tsere
134
+ # Kamatz -> Patah
135
+ "\u05b8": "\u05b7", # Qamats -> Patah
136
+ "\u05c7": "\u05b9", # Qamats Qatan -> Holam
137
+ "\u05f3": "'", # Hebrew geresh to regular geresh
138
+ }
139
+
140
+
141
+ SET_OUTPUT_CHARACTERS = set(
142
+ [
143
+ *GIMEL_OR_ZAIN_WITH_DAGESH,
144
+ TSADIK_WITH_DAGESH,
145
+ SHIN_WITH_POINT,
146
+ SIN_WITH_POINT,
147
+ W_AS_WALLA,
148
+ ]
149
+ + [STRESS, SECONDARY_STRESS]
150
+ + list(LETTERS_PHONEMES.values())
151
+ + list(NIQQUD_PHONEMES.values())
152
+ + [VOWEL_A, VOWEL_E, VOWEL_I, VOWEL_O, VOWEL_U]
153
+ + list(PUNCTUATION)
154
+ )
155
+
156
+ SET_NIQQUD = {
157
+ # Shva, Hiriq, Tsere, Patah, Holam, Holam haser for vav, Qubuts, Dagesh, Shin dot, Sin dot
158
+ "\u05b0",
159
+ "\u05b4",
160
+ "\u05b5",
161
+ "\u05b7",
162
+ "\u05b9",
163
+ "\u05ba",
164
+ "\u05bb",
165
+ "\u05bc",
166
+ "\u05c1",
167
+ "\u05c2",
168
+ # shva na and atmaha
169
+ "\u05bd", # shva na
170
+ "\u05ab", # atmaha
171
+ }
172
+ SET_LETTERS = set(LETTERS_PHONEMES.keys())
173
+ SET_PUNCTUATION = set(PUNCTUATION)
174
+
175
+
176
+ # Set for fast lookup
177
+ SET_INPUT_CHARACTERS = set(
178
+ list(LETTERS_PHONEMES.keys()) + list(SET_NIQQUD) + list(PUNCTUATION) + ["'"]
179
+ )
mishkal/log.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import colorlog
4
+
5
+
6
def _create_logger():
    """
    Build the package logger with colorized, level-tagged output.

    The level is read from the LOG_LEVEL environment variable and falls
    back to WARNING when the variable is unset or names no attribute of
    the ``logging`` module.
    Usage: LOG_LEVEL=DEBUG python <script.py>
    """
    level_colors = {
        "DEBUG": "blue",
        "INFO": "green",
        "WARNING": "yellow",
        "ERROR": "red",
        "CRITICAL": "red",
    }
    formatter = colorlog.ColoredFormatter(
        fmt="%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s",
        log_colors=level_colors,
    )
    stream_handler = colorlog.StreamHandler()
    stream_handler.setFormatter(formatter)

    # Resolve the requested level name to a logging constant
    requested = os.getenv("LOG_LEVEL", "WARNING").upper()
    level = getattr(logging, requested, logging.WARNING)

    package_logger = colorlog.getLogger(__package__)
    package_logger.setLevel(level=level)
    package_logger.addHandler(stream_handler)
    return package_logger
33
+
34
+
35
+ log = _create_logger()
mishkal/phonemize.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The actual letters phonemization happens here.
3
+ Phonemes generated based on rules.
4
+
5
+ Early rules:
6
+ 1. Niqqud malle vowels
7
+ 2. Dagesh (custom beged kefet)
8
+ 3. Final letter without niqqud
9
+ 4. Final Het gnuva
10
+ 5. Geresh (Gimel, Tsadik, Zain)
11
+ 6. Shva na
12
+ Revised rules:
13
+ 1. Consonants
14
+ 2. Niqqud
15
+
16
+ Reference:
17
+ - https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
18
+ - https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
19
+ - https://en.wikipedia.org/wiki/Help:IPA/Hebrew
20
+ """
21
+
22
+ from mishkal import lexicon
23
+ from .expander import Expander
24
+ from mishkal.utils import normalize, post_normalize
25
+ from typing import Callable
26
+ import regex as re
27
+
28
+
29
class Phonemizer:
    """Rule-based converter from Hebrew text with niqqud to phonemes."""

    def __init__(self):
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation=True,
        preserve_stress=True,
        use_expander=False,
        use_post_normalize=False,  # For TTS
        fallback: Callable[[str], str] = None,
    ) -> str:
        """
        Convert *text* to a phoneme string.

        Args:
            text: Input text; it is normalized before anything else.
            preserve_punctuation: When false, characters in
                ``lexicon.PUNCTUATION`` other than the space are removed.
            preserve_stress: When false, primary and secondary stress marks
                are removed.
            use_expander: When true, the text is run through the expander
                before phonemization.
            use_post_normalize: When true, ``post_normalize`` is applied to
                the phonemes.
            fallback: Optional callable applied to each run of Latin letters
                that has no dictionary entry; its result replaces the run.

        Returns:
            The phonemes, restricted to characters in
            ``lexicon.SET_OUTPUT_CHARACTERS``.
        """
        # normalize
        text = normalize(text)
        # TODO: is that enough? what if there's punctuation around? other chars?
        he_pattern = r"[\u05b0-\u05ea\u05ab\u05bd']+"
        fallback_pattern = r"[a-zA-Z]+"

        def fallback_replace_callback(match: re.Match):
            word = match.group(0)

            if self.expander.dictionary.dict.get(word):
                # skip
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            # Whitelist whatever the fallback produced so the final output
            # filter keeps it. This mutates the shared lexicon set.
            for c in phonemes:
                lexicon.SET_OUTPUT_CHARACTERS.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(fallback_pattern, fallback_replace_callback, text)
        if use_expander:
            text = self.expander.expand_text(text)
        self.fallback = fallback

        def heb_replace_callback(match: re.Match):
            word = match.group(0)

            word = normalize(word)
            word = "".join(
                i for i in word if i in lexicon.SET_LETTERS or i in lexicon.SET_NIQQUD
            )
            # Split into (letter, marks) pairs; marks may include an apostrophe
            letters = re.findall(r"(\p{L})([\p{M}']*)", word)  # with en_geresh
            phonemes = self.phonemize_hebrew(letters)
            return "".join(phonemes)

        text = re.sub(he_pattern, heb_replace_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(
                i for i in text if i not in [lexicon.STRESS, lexicon.SECONDARY_STRESS]
            )
        if use_post_normalize:
            text = post_normalize(text)
        text = "".join(i for i in text if i in lexicon.SET_OUTPUT_CHARACTERS)

        return text

    def phonemize_hebrew(self, letters: list[tuple[str, str]]):
        """
        Convert one word, given as (letter, diacritics) pairs, to phonemes.

        Args:
            letters: Pairs of a base letter and the string of marks attached
                to it, as produced by the regex in ``phonemize``.

        Returns:
            A list of phoneme strings in output order.
        """
        phonemes = []
        i = 0

        while i < len(letters):
            cur = letters[i]
            prev = letters[i - 1] if i > 0 else None
            following = letters[i + 1] if i < len(letters) - 1 else None
            skip_diacritics = False
            skip_consonants = False
            # revised rules

            # Lamed with shva directly before another lamed is dropped
            # (example word: יַאלְלָה)
            if (
                cur[0] == "ל"
                and cur[1] == "\u05b0"
                and following
                and following[0] == "ל"
            ):
                skip_diacritics = True
                skip_consonants = True

            # NOTE(review): cur[1] holds only marks, so cur[0] + cur[1] looks
            # like it can never equal a string that ends with the letter Alef;
            # this branch appears unreachable - confirm the intended condition.
            if (
                cur[0] == "ו"
                and not prev
                and following
                and not following[1]
                and cur[0] + cur[1] == "וַא"
            ):
                i += 1
                phonemes.append("wa")

            # Alef without niqqud that is not word-initial is silent
            if cur[0] == "א" and not cur[1] and prev:
                skip_consonants = True

            # TODO ?
            if cur[0] == "י" and following and not cur[1]:
                skip_consonants = True

            if cur[0] == "ש" and "\u05c2" in cur[1]:
                phonemes.append("s")
                skip_consonants = True

            # shin without niqqud after sin = sin
            if cur[0] == "ש" and not cur[1] and prev and "\u05c2" in prev[1]:
                phonemes.append("s")
                skip_consonants = True

            if not following and cur[0] == "ח":
                # Final Het gnuva
                phonemes.append("ax")
                skip_diacritics = True
                skip_consonants = True

            if cur and "'" in cur[1] and cur[0] in lexicon.GERESH_LETTERS:
                if cur[0] == "ת":
                    phonemes.append(lexicon.GERESH_LETTERS.get(cur[0], ""))
                    skip_diacritics = True
                    skip_consonants = True
                else:
                    # Geresh
                    phonemes.append(lexicon.GERESH_LETTERS.get(cur[0], ""))
                    skip_consonants = True

            elif (
                "\u05bc" in cur[1] and cur[0] + "\u05bc" in lexicon.LETTERS_PHONEMES
            ):  # dagesh
                phonemes.append(lexicon.LETTERS_PHONEMES.get(cur[0] + "\u05bc", ""))
                skip_consonants = True
            elif cur[0] == "ו":
                skip_consonants = True
                if following and following[0] == "ו":
                    # patah and following vav has no marks
                    if cur[1] == "\u05b7" and not following[1]:
                        phonemes.append("w")
                        # NOTE(review): together with the i += 1 at the end
                        # of the loop this advances three positions, which
                        # skips the letter after the second vav as well -
                        # confirm this is intended.
                        i += 2
                    else:
                        # double vav
                        phonemes.append("wo")
                        skip_diacritics = True
                else:
                    # Single vav

                    # Vav with Patah
                    if "\u05b7" in cur[1]:
                        phonemes.append("va")

                    # Holam haser
                    elif "\u05b9" in cur[1]:
                        phonemes.append("o")
                    # Shuruk / Kubutz
                    elif "\u05bb" in cur[1] or "\u05bc" in cur[1]:
                        phonemes.append("u")
                    # Vav with Shva in start
                    elif "\u05b0" in cur[1] and not prev:
                        phonemes.append("ve")
                    # Hirik
                    elif "\u05b4" in cur[1]:
                        phonemes.append("vi")
                    else:
                        phonemes.append("v")
                    skip_diacritics = True

            if not skip_consonants:
                phonemes.append(lexicon.LETTERS_PHONEMES.get(cur[0], ""))
            niqqud_phonemes = (
                [lexicon.NIQQUD_PHONEMES.get(niqqud, "") for niqqud in cur[1]]
                if not skip_diacritics
                else []
            )

            if "\u05ab" in cur[1] and phonemes:
                # Ensure ATMAHA is before the letter (before the last phoneme added)
                stress = lexicon.NIQQUD_PHONEMES["\u05ab"]
                # The stress entry is absent when the diacritics were skipped
                # above; an unconditional remove() would raise ValueError.
                if stress in niqqud_phonemes:
                    niqqud_phonemes.remove(stress)
                phonemes = phonemes[:-1] + [stress] + [phonemes[-1]]

            phonemes.extend(niqqud_phonemes)
            i += 1
        return phonemes
mishkal/utils.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mishkal import lexicon
2
+ import unicodedata
3
+ import regex as re
4
+
5
+
6
def sort_diacritics(match):
    """Return the letter (group 1) followed by its marks (group 2) sorted by code point."""
    base, marks = match.group(1), match.group(2)
    return base + "".join(sorted(marks))
10
+
11
+
12
+ NORMALIZE_PATTERNS = {
13
+ # Alphabet followed by 1/2 symbols then dagesh. make dagesh first
14
+ r"(\p{L})(\p{M}+)": sort_diacritics,
15
+ "״": '"',
16
+ "׳": "'",
17
+ }
18
+
19
+
20
def remove_niqqud(text: str):
    """Return *text* with every character matching ``lexicon.HE_NIQQUD_PATTERN`` removed."""
    stripped = re.sub(lexicon.HE_NIQQUD_PATTERN, "", text)
    return stripped
22
+
23
+
24
def has_niqqud(text: str):
    """Return True if *text* contains a character matching ``lexicon.HE_NIQQUD_PATTERN``."""
    found = re.search(lexicon.HE_NIQQUD_PATTERN, text)
    return found is not None
26
+
27
+
28
def normalize(text: str) -> str:
    """
    Bring *text* to the canonical form used by the phonemizer.

    Steps, in order:
    1. Decompose unicode (NFD)
    2. Apply every rule in NORMALIZE_PATTERNS
    3. Collapse niqqud variants via lexicon.NIQQUD_DEDUPLICATE
       (eg. only Patah instead of Kamatz)
    """
    result = unicodedata.normalize("NFD", text)

    for pattern, replacement in NORMALIZE_PATTERNS.items():
        result = re.sub(pattern, replacement, result)

    # Keep a single niqqud per phonetic 'sound'
    for variant, canonical in lexicon.NIQQUD_DEDUPLICATE.items():
        result = result.replace(variant, canonical)

    return result
44
+
45
+
46
def post_normalize(phonemes: str):
    """
    Simplify phonemes word by word (words are separated by single spaces).

    For every word: drop a glottal stop at the end and at the start (also
    right after a leading stress mark), drop a final h, and reduce "ij" to
    "i". The rules run in the order listed below.
    """
    rules = (
        # glottal stop at the end / start of the word
        (r"ʔ$", ""),
        (r"^ʔ", ""),
        (r"^ˈʔ", "ˈ"),
        # h at the end of the word
        (r"h$", ""),
        (r"ˈh$", "ˈ"),
        # j that follows an i
        (r"ij", "i"),
    )

    def _simplify(word):
        for pattern, replacement in rules:
            word = re.sub(pattern, replacement, word)
        return word

    return " ".join(_simplify(word) for word in phonemes.split(" "))
60
+
61
+
62
def get_unicode_names(text: str):
    """Return the Unicode name of each character in *text*, or "?" when it has none."""
    names = []
    for char in text:
        names.append(unicodedata.name(char, "?"))
    return names
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=5.15.0
2
+ num2words
3
+ colorlog
4
+ regex