thewh1teagle committed on
Commit
bcfb376
·
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.onnx filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Phonemize in Hebrew
+ emoji: 🐢
+ colorFrom: red
+ colorTo: green
+ sdk: gradio
+ sdk_version: "4.44.0"
+ app_file: app.py
+ pinned: false
+ ---
app.py ADDED
@@ -0,0 +1,119 @@
+ """
+ uv sync
+ wget https://huggingface.co/thewh1teagle/phonikud-onnx/resolve/main/phonikud-1.0.int8.onnx
+ uv run gradio app.py
+ """
+ 
+ from phonikud import phonemize, lexicon
+ from phonikud.utils import remove_nikud
+ import gradio as gr
+ from phonikud_onnx import Phonikud
+ from pathlib import Path
+ 
+ 
+ default_text = """
+ הַדַּיָּיג נִצְמָד לְדֹופֶן הַסִּירָה בִּזְמַן הַסְּעָרָה.
+ הִסְבַּרְתִּי לָהּ אֶת הַכֹּל, וְאָמַרְתִּי בְּדִיּוּק מָה קָרָה.
+ הַיְּלָדִים אָהֲבוּ בִּמְיֻוחָד אֶת הַסִּיפּוּרִים הַלָּלוּ שֶׁהַמּוֹרָה הִקְרִיאָה.
+ """.strip()
+ 
+ 
+ def on_phonikud_toggle(use_phonikud):
+     if not use_phonikud:
+         return default_text
+     return remove_nikud(default_text)
+ 
+ 
+ css = """
+ .input textarea {
+     font-size: 22px;
+     padding: 15px;
+     height: 200px;
+ }
+ 
+ .phonemes {
+     background: var(--input-background-fill);
+     padding: 5px;
+     min-height: 50px;
+ }
+ """
+ 
+ theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Noto Sans Hebrew")])
+ 
+ phonikud = None
+ commit = "unknown"
+ model_path = Path("./phonikud-1.0.int8.onnx")
+ if model_path.exists():
+     phonikud = Phonikud(str(model_path))
+     metadata = phonikud.get_metadata()
+     commit = metadata.get("commit", "unknown")
+ 
+ 
+ def on_submit(text: str, schema: str, use_phonikud: bool) -> str:
+     diacritized = (
+         phonikud.add_diacritics(
+             text, mark_matres_lectionis=lexicon.NIKUD_HASER_DIACRITIC
+         )
+         if phonikud and use_phonikud
+         else text
+     )
+     phonemes = phonemize(
+         diacritized, predict_stress=True, schema=schema, predict_vocal_shva=False
+     )
+     if use_phonikud:
+         return f"<div dir='rtl' style='font-size: 22px;'>{diacritized.strip()}</div><br><div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"
+     else:
+         return f"<div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"
+ 
+ 
+ with gr.Blocks(theme=theme, css=css) as demo:
+     text_input = gr.Textbox(
+         value=remove_nikud(default_text),
+         label="Text",
+         rtl=True,
+         elem_classes=["input"],
+         lines=7,
+     )
+ 
+     with gr.Row():
+         schema_dropdown = gr.Dropdown(
+             choices=["modern", "plain"], value="plain", label="Phoneme Schema"
+         )
+         use_phonikud_checkbox = gr.Checkbox(
+             value=True, label="Use Phonikud (add diacritics)"
+         )
+ 
+     submit_button = gr.Button("Create")
+     output_box = gr.Markdown(label="Phonemes + Diacritics", elem_classes=["phonemes"])
+     use_phonikud_checkbox.change(
+         # The dropdown is passed as an input so its current value is used;
+         # reading schema_dropdown.value inside the lambda would only ever
+         # see the initial value
+         fn=lambda use_phonikud, schema: (
+             on_phonikud_toggle(use_phonikud),  # Update text_input
+             on_submit(
+                 on_phonikud_toggle(use_phonikud), schema, use_phonikud
+             ),  # Update output_box
+         ),
+         inputs=[use_phonikud_checkbox, schema_dropdown],
+         outputs=[text_input, output_box],  # Update both text input and output box
+     )
+ 
+     submit_button.click(
+         fn=on_submit,
+         inputs=[text_input, schema_dropdown, use_phonikud_checkbox],
+         outputs=output_box,
+     )
+ 
+     gr.Markdown("""
+     <p style='text-align: center;'><a href='https://github.com/thewh1teagle/phonikud' target='_blank'>Phonikud on Github</a></p>
+     """)
+ 
+     gr.Markdown(f"""
+     <p style='text-align: center; opacity: 0.2; font-size: 10px;'>
+     <a href='https://huggingface.co/thewh1teagle/phonikud/commit/{commit}' target='_blank' style='color: white;'>phonikud version {commit}</a>
+     </p>
+     """)
+ 
+ if __name__ == "__main__":
+     demo.launch()
phonikud-1.0.int8.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1fa2624b1e8202a0c0a23259b560b0c41ad92a3a6750bd0e322ce5a2b1acdb6
+ size 307844158
phonikud/__init__.py ADDED
@@ -0,0 +1,39 @@
+ """
+ High level phonemize functions
+ """
+ 
+ from .phonemize import Phonemizer
+ from .utils import normalize  # noqa: F401
+ from typing import Callable, Literal
+ 
+ phonemizer = Phonemizer()
+ 
+ 
+ def phonemize(
+     text: str,
+     preserve_punctuation=True,
+     preserve_stress=True,
+     use_expander=True,
+     use_post_normalize=True,  # For TTS
+     predict_stress=True,
+     predict_vocal_shva=True,
+     stress_placement: Literal["syllable", "vowel"] = "vowel",
+     schema: Literal["plain", "modern"] = "modern",
+     fallback: Callable[[str], str] | None = None,
+ ) -> str:
+     """
+     Set stress_placement="syllable" to place the stress at the syllable start.
+     """
+     phonemes = phonemizer.phonemize(
+         text,
+         preserve_punctuation=preserve_punctuation,
+         preserve_stress=preserve_stress,
+         fallback=fallback,
+         use_expander=use_expander,
+         use_post_normalize=use_post_normalize,
+         predict_stress=predict_stress,
+         schema=schema,
+         predict_vocal_shva=predict_vocal_shva,
+         stress_placement=stress_placement,
+     )
+     return phonemes
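
For reference, a minimal usage sketch of the high-level API above (the sample word and the printed phoneme string are illustrative, not output captured from the repo):

    from phonikud import phonemize

    # Defaults mirror the signature above: modern schema, milra stress prediction on
    ipa = phonemize("שָׁלוֹם")
    print(ipa)  # e.g. something like "ʃalˈom"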
phonikud/data/rashej_tevot.json ADDED
@@ -0,0 +1,3 @@
+ {
+     "צה״ל": "tsˈahal"
+ }
phonikud/data/special.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "וַאלְלָה": "wˈala",
+     "וַסַבִּי": "wasˈabi",
+     "פינגוין": "pinguwˈin",
+     "וואצאפ": "wˈatsʔap",
+     "וואטסאפ": "wˈatsʔap",
+     "יאללה": "jˈala",
+     "וולטר": "wˈolter"
+ }
phonikud/data/symbols.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "₪": "ʃˈekel",
+     "$": "dˈolar",
+     "%": "axˈuz"
+ }
phonikud/expander/__init__.py ADDED
@@ -0,0 +1,33 @@
+ """
+ Expand dates and numbers into words with nikud
+ This happens before phonemization
+ """
+ 
+ from .numbers import num_to_word
+ from .dates import date_to_word
+ from .time_to_word import time_to_word
+ from .dictionary import Dictionary
+ from phonikud.log import log
+ 
+ 
+ class Expander:
+     def __init__(self):
+         self.dictionary = Dictionary()
+ 
+     def expand_text(self, text: str):
+         words = []
+         for source_word in text.split():
+             try:
+                 word = date_to_word(source_word)
+                 if word == source_word:
+                     word = time_to_word(word)
+                 if word == source_word:
+                     word = num_to_word(word)
+                 words.append(word)
+             except Exception as e:
+                 # Use source_word here: `word` may be unbound if date_to_word raised
+                 log.error(f"Failed to expand {source_word} with error: {e}")
+                 words.append(source_word)
+         text = " ".join(words)
+         text = self.dictionary.expand_text(text)
+ 
+         return text
phonikud/expander/dates.py ADDED
@@ -0,0 +1,60 @@
+ from datetime import datetime
+ from .numbers import num_to_word
+ 
+ # Mapping of month names in Hebrew with diacritics (Gregorian months)
+ MONTHS = {
+     1: "יָ֫נוּאָר",
+     2: "פֶ֫בְרוּאָר",
+     3: "מֵ֫רְץ",
+     4: "אֵפְרִיל",
+     5: "מַאי",
+     6: "י֫וּנִי",
+     7: "י֫וּלִי",
+     8: "א֫וֹגֻסְט",
+     9: "סֶפְּטֶ֫מְבֶּר",
+     10: "אוֹקְט֫וֹבֶּר",
+     11: "נוֹבֶ֫מְבֶּר",
+     12: "דֶּצֶ֫מְבֶּר",
+ }
+ 
+ # Mapping of day names in Hebrew with diacritics
+ DAYS = {
+     0: "יוֹם רִאשׁוֹן",
+     1: "יוֹם שֵׁנִי",
+     2: "יוֹם שְׁלִישִׁי",
+     3: "יוֹם רֵבִיעִי",
+     4: "יוֹם חֲמִישִׁי",
+     5: "יוֹם שִׁישִׁי",
+     6: "יוֹם שַׁבָּת",
+ }
+ 
+ 
+ def date_to_word(word: str, include_day_name=False) -> str:
+     """
+     Converts a date string (YYYY-MM-DD or DD-MM-YYYY, with '-', '.' or '/'
+     as separator) to a Hebrew date phrase with diacritics.
+     Returns the original word if it's not a valid date.
+     """
+     separators = ["-", ".", "/"]
+     orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
+     date_formats = [sep.join(order) for order in orders for sep in separators]
+ 
+     for date_format in date_formats:
+         try:
+             # Try parsing the word with each date format
+             date_obj = datetime.strptime(word, date_format)
+ 
+             # Get the Hebrew day name with diacritics
+             day_name = DAYS[date_obj.weekday()]
+ 
+             # Convert month to Hebrew name with diacritics
+             month_name = MONTHS[date_obj.month]
+             day = num_to_word(str(date_obj.day))
+             year = num_to_word(str(date_obj.year))
+ 
+             text = f"{day} בֵּ{month_name} {year}"
+             if include_day_name:
+                 text = f"{day_name}, {text}"
+             return text
+         except ValueError:
+             continue
+     return word
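
A short usage sketch of `date_to_word` (the dates here are illustrative; the output is whatever the tables above produce):

    from phonikud.expander.dates import date_to_word

    print(date_to_word("2025-03-05"))  # day word + "בֵּ" + month name + year, diacritized
    print(date_to_word("hello"))       # not a date -> returned unchanged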
phonikud/expander/dictionary.py ADDED
@@ -0,0 +1,79 @@
+ """
+ Dictionaries are JSON files mapping source words to replacements
+ """
+ 
+ from pathlib import Path
+ import json
+ import re
+ from phonikud.utils import remove_nikud
+ from phonikud.utils import normalize
+ from phonikud import lexicon
+ import unicodedata
+ 
+ files = Path(__file__).parent.joinpath("../data").glob("*.json")
+ # Sort so higher-quality dictionaries load last and override earlier entries
+ order = {"bronze": 1, "silver": 2, "gold": 3}
+ files = sorted(
+     files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
+ )
+ 
+ 
+ class Dictionary:
+     def __init__(self):
+         self.dict = {}
+         self.load_dictionaries()
+ 
+     def load_dictionaries(self):
+         for file in files:
+             with open(file, "r", encoding="utf-8") as f:
+                 dictionary: dict = json.load(f)
+                 normalized_dictionary = {}
+ 
+                 # Normalize nikud keys
+                 for k, v in dictionary.items():
+                     k = normalize(k)
+                     # Ensure not empty
+                     if k and v:
+                         normalized_dictionary[k] = v
+                 self.dict.update(normalized_dictionary)
+ 
+     def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
+         source: str = match.group(0)
+         # Decompose
+         source = unicodedata.normalize("NFD", source)
+         raw_lookup = self.dict.get(source)
+ 
+         without_nikud_lookup = self.dict.get(remove_nikud(source))
+         with_nikud_lookup = self.dict.get(normalize(source))
+         # Compare without nikud ONLY if source has no nikud
+         if raw_lookup:
+             return raw_lookup
+         if without_nikud_lookup:
+             return without_nikud_lookup
+         elif with_nikud_lookup:
+             return with_nikud_lookup
+         return source
+ 
+     def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
+         raw_source: str = match.group(0)
+         if raw_source.isnumeric():
+             return raw_source
+ 
+         raw_lookup = self.dict.get(raw_source)
+ 
+         # Compare without nikud ONLY if source has no nikud
+         if raw_lookup:
+             return raw_lookup
+         # Search by only ', space, regular nikud, alphabet
+         raw_source = re.sub(
+             lexicon.HE_PATTERN, self.replace_hebrew_only_callback, raw_source
+         )
+         return raw_source
+ 
+     def expand_text(self, text: str) -> str:
+         """
+         TODO: if key doesn't have diacritics expand even diacritized words
+         """
+         text = re.sub(r"\S+", self.replace_non_whitespace_callback, text)
+ 
+         return text
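
For reference, a minimal sketch of how the dictionary is consulted (the key "יאללה" is taken from special.json above; the output shown is what that mapping implies):

    from phonikud.expander.dictionary import Dictionary

    d = Dictionary()               # loads and merges phonikud/data/*.json
    print(d.expand_text("יאללה"))  # -> "jˈala" per special.json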
phonikud/expander/number_names.py ADDED
@@ -0,0 +1,193 @@
+ """
+ See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
+ """
+ 
+ # TODO: add nikud hints
+ 
+ ZERO = {"אפס": "אֶ֫פֶס"}
+ 
+ 
+ ONES = {
+     "אחת": "אַחַת",
+     "אחד": "אֶחָד",
+     "ראשונה": "רִאשׁוֹנָה",
+     "ראשון": "רִאשׁוֹן",
+     "ראשונות": "רִאשׁוֹנוֹת",
+     "ראשונים": "רִאשׁוֹנִים",
+     "שתיים": "שְׁתַּ֫יִם",
+     "שניים": "שְׁנַ֫יִם",
+     "שתי": "שְׁתֵּי",
+     "שני": "שְׁנֵי",
+     "שנייה": "שְׁנִיָּה",
+     "שניות": "שְׁנִיּוֹת",
+     "שלוש": "שָׁלוֹשׁ",
+     "שלושה": "שְׁלוֹשָׁה",
+     "שלושת": "שְׁל֫וֹשֶׁת",
+     "שלישית": "שְׁלִישִׁית",
+     "שלישי": "שְׁלִישִׁי",
+     "שלישיות": "שְׁלִישִׁיּוֹת",
+     "שלישיים": "שְׁלִישִׁיִּים",
+     "ארבע": "אַ֫רְבַּע",
+     "ארבעה": "אַרְבַּעָה",
+     "ארבעת": "אַרְבַּ֫עַת",
+     "רביעית": "רֵבִיעִית",
+     "רביעי": "רֵבִיעִי",
+     "רביעיות": "רֵבִיעִיוֹת",
+     "רביעיים": "רֵבִיעִיִּים",
+     "חמש": "חָמֵשׁ",
+     "חמישה": "חֲמִשָּׁה",
+     "חמשת": "חֲמֵ֫שֶׁת",
+     "חמישית": "חֲמִישִּׁית",
+     "חמישי": "חֲמִישִּׁי",
+     "חמישיות": "חֲמִישִּׁיוֹת",
+     "חמישיים": "חֲמִישִּׁיִּים",
+     "שש": "שֵׁשׁ",
+     "שישה": "שִׁשָּׁה",
+     "ששת": "שֵׁ֫שֶׁת",
+     "שישית": "שִׁשִּׁית",
+     "שישי": "שִׁשִּׁי",
+     "שישיות": "שִׁשִּׁיוֹת",
+     "שישיים": "שִׁשִּׁיִּים",
+     "שבע": "שֶׁ֫בַע",
+     "שבעה": "שִׁבְעָה",
+     "שבעת": "שִׁבְעַת",
+     "שביעית": "שְׁבִיעִית",
+     "שביעי": "שְׁבִיעִי",
+     "שביעיות": "שְׁבִיעִיוֹת",
+     "שביעיים": "שְׁבִיעִיִּים",
+     "שמונה": "שְׁמ֫וֹנֶה",
+     "שמונת": "שְׁמוֹנַת",
+     "שמינית": "שְׁמִינִית",
+     "שמיני": "שְׁמִינִי",
+     "שמיניות": "שְׁמִינִיוֹת",
+     "שמיניים": "שְׁמִינִיִּים",
+     "תשע": "תֵּשַׁע",
+     "תשעה": "תִּשְׁעָה",
+     "תשעת": "תִּשְׁעַת",
+     "תשיעית": "תְּשִׁיעִית",
+     "תשיעי": "תְּשִׁיעִי",
+     "תשיעיות": "תְּשִׁיעִיּוֹת",
+     "תשיעיים": "תְּשִׁיעִיִּים",
+ }
+ 
+ 
+ TENS = {
+     "עשר": "עֶ֫שֶׂר",
+     "עשרה": "עֶשְׂרֵה",
+     "עשרת": "עֲשֶׂ֫רֶת",
+     "עשירית": "עֲשִׂירִית",
+     "עשירי": "עֲשִׂירִי",
+     "עשיריות": "עֲשִׂירִיּוֹת",
+     "עשיריים": "עֲשִׂירִיִּים",
+     "שתים עשרה": "שְׁתֵּ֫ים עֶשְׂרֵה",
+     "שנים עשר": "שְׁנֵים עָשָׂר",
+ }
+ 
+ 
+ TWENTIES = {
+     "עשרים": "עֶשְׂרִ֫ים",
+     "שלושים": "שְׁלוֹשִׁים",
+     "ארבעים": "אַרְבָּעִים",
+     "חמישים": "חֲמִשִּׁים",
+     "שישים": "שִׁשִּׁים",
+     "שבעים": "שִׁבְעִים",
+     "שמונים": "שְׁמוֹנִים",
+     "תשעים": "תִּשְׁעִים",
+ }
+ 
+ 
+ HUNDREDS = {
+     "מאה": "מֵ֫אָה",
+     "מאת": "מֵאַת",
+     "מאתיים": "מָאתַ֫יִם",
+     "מאות": "מֵאוֹת",
+ }
+ 
+ THOUSANDS = {
+     "אלף": "אֶ֫לֶף",
+     "אלפיים": "אַלְפַּ֫יִם",
+     "אלפים": "אֲלָפִים",
+     "אלפי": "אַלְפִּי",
+ }
+ 
+ 
+ LARGE = {
+     "מיליון": "מִילְיוֹן",
+     "מיליוני": "מִילְיוֹנִי",
+     "מיליארד": "מִילְיַארְד",
+     "מיליארדי": "מִילְיַ֫ארְדִּי",
+     "טריליון": "טְרִילְיוֹן",
+     "טריליוני": "טְרִילְיוֹנִי",
+     "קוודריליון": "קוֹוַדְרִילְיוֹן",
+     "קוודריליוני": "קוֹוַדְרִילְיוֹנִי",
+     "קווינטיליון": "קוִוִּנְטִילְיוֹן",
+     "קווינטיליוני": "קוִוִּנְטִילְיוֹנִי",
+     "סקסטיליון": "סְקֶסְטִילְיוֹן",
+     "סקסטיליוני": "סְקֶסְטִילְיוֹנִי",
+     "ספטיליון": "סְפֶּטִילְיוֹן",
+     "ספטיליוני": "סְפֶּטִילְיוֹנִי",
+     "אוקטיליון": "אוֹקְטִילְיוֹן",
+     "אוקטיליוני": "אוֹקְטִילְיוֹנִי",
+     "נוניליון": "נוּנִילְיוֹן",
+     "נוניליוני": "נוּנִילְיוֹנִי",
+     "דסיליון": "דֶּסִילְיוֹן",
+     "דסיליוני": "דֶּסִילְיוֹנִי",
+     "אונדסיליון": "אוּנְדְסִילְיוֹן",
+     "אונדסיליוני": "אוּנְדְסִילְיוֹנִי",
+     "דואודסיליון": "דוּאודְסִילְיוֹן",
+     "דואודסיליוני": "דוּאודְסִילְיוֹנִי",
+     "טרדסיליון": "טֶרְדְסִילְיוֹן",
+     "טרדסיליוני": "טֶרְדְסִילְיוֹנִי",
+     "קווטואורדסיליון": "קוּוטְוָאורְדְסִילְיוֹן",
+     "קווטואורדסיליוני": "קוּוטְוָאורְדְסִילְיוֹנִי",
+     "קווינדסיליון": "קוִוִּנְדְסִילְיוֹן",
+     "קווינדסיליוני": "קוִוִּנְדְסִילְיוֹנִי",
+     "סקסדסיליון": "סֶקְסְדְסִילְיוֹן",
+     "סקסדסיליוני": "סֶקְסְדְסִילְיוֹנִי",
+     "ספטנדסיליון": "סְפֶּטַנְדְסִילְיוֹן",
+     "ספטנדסיליוני": "סְפֶּטַנְדְסִילְיוֹנִי",
+     "אוקטודסיליון": "אוֹקְטוֹדְסִילְיוֹן",
+     "אוקטודסיליוני": "אוֹקְטוֹדְסִילְיוֹנִי",
+     "נובמדסיליון": "נוֹבְמַדְסִילְיוֹן",
+     "נובמדסיליוני": "נוֹבְמַדְסִילְיוֹנִי",
+     "ויגינטיליון": "וִיגִּינְטִילְיוֹן",
+     "ויגינטיליוני": "וִיגִּינְטִילְיוֹנִי",
+ }
+ 
+ 
+ LETTERS = {
+     "ו": "וֵ",
+     "ה": "הַ",
+ }
+ 
+ 
+ CURRENCY = {
+     "שקל": "שֵׁ֫קֶל",
+     "שקלים": "שְׁקָלִים",
+     "אגורה": "אֲגוֹרָה",
+     "אגורות": "אֲגוֹרוֹת",
+     "אירו": "אֵ֫ירוֹ",
+     "סנט": "סֵנְט",
+     "סנטים": "סֵ֫נְטִים",
+     "דולר": "ד֫וֹלָר",
+     "דולרים": "דוֹלָ֫רִים",
+ }
+ 
+ 
+ POINTS = {
+     "מינוס": "מִ֫ינּוּס",
+     "נקודה": "נְֽקֻדָּה",
+ }
+ 
+ NUMBER_NAMES = {
+     **CURRENCY,
+     **HUNDREDS,
+     **LARGE,
+     **LETTERS,
+     **ONES,
+     **POINTS,
+     **TENS,
+     **THOUSANDS,
+     **TWENTIES,
+     **ZERO,
+ }
phonikud/expander/numbers.py ADDED
@@ -0,0 +1,39 @@
+ import num2words
+ from .number_names import NUMBER_NAMES
+ import re
+ 
+ 
+ def add_diacritics(words: str):
+     new_words = []
+     for word in words.split():
+         if NUMBER_NAMES.get(word):
+             new_words.append(NUMBER_NAMES[word])
+         elif NUMBER_NAMES.get(word[1:]):
+             # With a Vav or He prefix (see LETTERS in number_names)
+             new_words.append(NUMBER_NAMES[word[0]] + NUMBER_NAMES[word[1:]])
+         else:
+             new_words.append(word)
+     return " ".join(new_words)
+ 
+ 
+ def num_to_word(maybe_number: str) -> str:
+     def replace_number(match):
+         num: str = match.group()
+         suffix, prefix = "", ""
+         # Prefix
+         if not num.startswith("-") and not num[0].isdigit():
+             prefix = num[0]
+             num = num[1:]
+         if not num[-1].isdigit():
+             suffix = num[-1]
+             num = num[:-1]
+         words = num2words.num2words(num, lang="he", ordinal=False)
+         words_with_diacritics = add_diacritics(words)
+         return (
+             f"{prefix.strip()} {words_with_diacritics.strip()} {suffix.strip()}".strip()
+         )
+ 
+     # Replace all whole numbers in the string
+     result = re.sub(r"[^\d\-]?-?\d+(?:[\.,]\d+)?[^\d]?", replace_number, maybe_number)
+ 
+     return result
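
A usage sketch of `num_to_word` (assumes num2words is installed; the diacritized output follows the NUMBER_NAMES table above):

    from phonikud.expander.numbers import num_to_word

    print(num_to_word("3"))    # num2words(lang="he") -> "שלוש" -> diacritized via NUMBER_NAMES
    print(num_to_word("אבג"))  # no digits -> returned unchanged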
phonikud/expander/time_to_word.py ADDED
@@ -0,0 +1,104 @@
+ """
+ Convert time to words
+ TODO: fix zeros eg. 22:00
+ """
+ 
+ import re
+ 
+ PATTERNS = [
+     r"(\d{1,2})([apm]{2})",  # AM/PM format
+     r"(\d{1,2}):(\d{2})",  # HH:MM format
+ ]
+ 
+ 
+ def extract_time(match):
+     """
+     Extract hour and minute from a string in HH:MM or AM/PM format
+     and return the spoken form.
+     """
+     time_str = match.group(0).lower().strip()
+ 
+     # Check for HH:MM format
+     match = re.match(r"(\d{1,2}):(\d{2})", time_str)
+     if match:
+         h = int(match.group(1))
+         m = int(match.group(2))
+         return f"{convert_to_word(h, m)}"
+ 
+     # Check for AM/PM format
+     match = re.match(r"(\d{1,2})([apm]{2})", time_str)
+     if match:
+         h = int(match.group(1))
+         period = match.group(2)
+ 
+         # Normalize to 24-hour format
+         if period == "am" and h == 12:
+             h = 0
+         elif period == "pm" and h != 12:
+             h += 12
+         return f"{convert_to_word(h, 0)}"  # Defaulting to 0 minutes when only hour is provided
+ 
+     # Return the original text if the format is not recognized
+     # (`match` was reassigned above, so use the saved string)
+     return time_str
+ 
+ 
+ def convert_to_word(h, m):
+     hours = [
+         "אֶפֶס",
+         "אַחַת",
+         "שְׁנַיִם",  # Replaced with vocab["shtey"] when used for minutes
+         "שָׁלוֹשׁ",
+         "אַ֫רְבַּע",
+         "חָמֵשׁ",
+         "שֵׁשׁ",
+         "שֶׁ֫בַע",
+         "שְׁמ֫וֹנֵה",
+         "תֵּ֫שַׁע",
+         "עֵ֫שֵׂר",
+         "אַחַת עֶשְׂרֵה",
+         "שְׁתֵּים עֶשְׂרֵה",
+     ]
+ 
+     tens = ["", "עֵשֵׂר", "עֶשְׂרִים", "שְׁלוֹשִׁים", "אַרְבָּעִים", "חֲמִשִּׁים"]
+ 
+     ten_to_twenty = [
+         "עֵ֫שֵׂר",
+         "אַחַת עֶשְׂרֵה",
+         "שְׁתֵּים עֶשְׂרֵה",
+         "שְׁלוֹשׁ עֶשְׂרֵה",
+         "אַרְבַּע עֶשְׂרֵה",
+         "חֲמֵשׁ עֶשְׂרֵה",
+         "שֵׁשׁ עֶשְׂרֵה",
+         "שְׁבַע עֶשְׂרֵה",
+         "שְׁמוֹנֶה עֶשְׂרֵה",
+         "תְּשַׁע עֶשְׂרֵה",
+     ]
+ 
+     vocab = {"minutes": "דַּקּוֹת", "and": "וֵ", "shtey": "שְׁתֵּי"}
+ 
+     # Convert 0 hours to 12 (midnight)
+     if h == 0:
+         h = 12
+ 
+     elif h > 12:
+         h -= 12
+ 
+     if m == 0:
+         return f"{hours[h]}"
+ 
+     elif 1 <= m <= 9:
+         minute_word = (
+             vocab["shtey"] if m == 2 else hours[m]
+         )  # Replace "שניים" with "שתי"
+         return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"
+ 
+     elif 10 <= m <= 19:
+         return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"
+ 
+     else:
+         tens_part = f"{vocab['and']}{tens[m // 10]}"
+         units_part = f"{vocab['and']}{hours[m % 10]}" if m % 10 != 0 else ""
+         return f"{hours[h]} {tens_part} {units_part} {vocab['minutes']}".strip()
+ 
+ 
+ def time_to_word(text: str):
+     return re.sub("|".join(PATTERNS), extract_time, text)
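
A quick sketch of the conversion (illustrative inputs; am/pm hours are normalized into the 12-hour word list above):

    from phonikud.expander.time_to_word import time_to_word

    print(time_to_word("8:00"))  # minutes == 0 -> the hour word alone, "שְׁמ֫וֹנֵה"
    print(time_to_word("11pm"))  # 23:00 -> hour 11 -> "אַחַת עֶשְׂרֵה"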
phonikud/hebrew.py ADDED
@@ -0,0 +1,222 @@
+ """
+ Hebrew Phonemizer
+ 
+ Fast rule-based FST that converts Hebrew text to phonemes.
+ See https://en.wikipedia.org/wiki/Finite-state_transducer
+ 
+ Rules implemented:
+ 1. Consonant handling (including special cases)
+ 2. Nikud (vowel) processing
+ 3. Dagesh handling
+ 4. Geresh handling
+ 5. Vocal Shva prediction
+ 6. Special letter combinations
+ 
+ Reference:
+ - https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
+ - https://en.wikipedia.org/wiki/Help:IPA/Hebrew
+ - https://he.wikipedia.org/wiki/הברה
+ - https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
+ - https://hebrew-academy.org.il/2010/03/24/צהרים-נעמי-הגיית-קמץ-לפני-חט
+ - https://hebrew-academy.org.il/2022/03/03/מלעיל-ומלרע-על-ההטעמה-בעברית
+ """
+ 
+ from typing import Literal
+ from phonikud.variants import Letter
+ from phonikud import lexicon
+ import re
+ from phonikud.utils import sort_stress
+ 
+ SHVA = "\u05b0"
+ SIN = "\u05c2"
+ PATAH = "\u05b7"
+ KAMATZ = "\u05b8"
+ HATAF_KAMATZ = "\u05b3"
+ DAGESH = "\u05bc"
+ HOLAM = "\u05b9"
+ HIRIK = "\u05b4"
+ PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
+ KUBUTS = "\u05bb"
+ TSERE = "\u05b5"
+ HATAMA = "\u05ab"
+ VAV_HOLAM = "\u05ba"
+ SEGOL = "\u05b6"
+ 
+ 
+ def phonemize_hebrew(
+     letters: list[Letter], stress_placement: Literal["syllable", "vowel"]
+ ) -> list[str]:
+     phonemes, i = [], 0
+     while i < len(letters):
+         cur = letters[i]
+         prev = letters[i - 1] if i > 0 else None
+         next = letters[i + 1] if i + 1 < len(letters) else None
+         next_phonemes, skip_offset = letter_to_phonemes(
+             cur, prev, next, stress_placement
+         )
+         phonemes.extend(next_phonemes)
+         i += skip_offset + 1
+     return phonemes
+ 
+ 
+ def handle_yud(cur: Letter, prev: Letter | None, next: Letter | None) -> bool:
+     """Returns True if Yud should skip consonants"""
+     return (
+         next
+         # Yud without diacritics
+         and not cur.diac
+         # In middle
+         and prev
+         # Prev is not Alef with Tsere
+         and prev.char + prev.diac != "אֵ"
+         # Next Vav has meaning
+         and not (next.char == "ו" and next.diac and "\u05b0" not in next.diac)
+     )
+ 
+ 
+ def handle_vav(cur: Letter, prev: Letter | None, next: Letter | None):
+     if prev and SHVA in prev.diac and HOLAM in cur.diac:
+         return ["vo"], True, True, 0
+ 
+     if next and next.char == "ו":
+         diac = cur.diac + next.diac
+         if HOLAM in diac:
+             return ["vo"], True, True, 1
+         if cur.diac == next.diac:
+             return ["vu"], True, True, 1
+         if HIRIK in cur.diac:
+             return ["vi"], True, True, 0
+         if SHVA in cur.diac and not next.diac:
+             return ["v"], True, True, 0
+         if KAMATZ in cur.diac or PATAH in cur.diac:
+             return ["va"], True, True, 0
+         if TSERE in cur.diac or SEGOL in cur.diac:
+             return ["ve"], True, True, 0
+         return [], False, False, 0
+ 
+     # Single ו
+     if re.search(PATAH_LIKE_PATTERN, cur.diac):
+         return ["va"], True, True, 0
+     if TSERE in cur.diac or SEGOL in cur.diac:
+         return ["ve"], True, True, 0
+     if HOLAM in cur.diac:
+         return ["o"], True, True, 0
+     if KUBUTS in cur.diac or DAGESH in cur.diac:
+         return ["u"], True, True, 0
+     if SHVA in cur.diac and not prev:
+         return ["ve"], True, True, 0
+     if HIRIK in cur.diac:
+         return ["vi"], True, True, 0
+     if next and not cur.diac:
+         return [], True, True, 0
+ 
+     return ["v"], True, True, 0
+ 
+ 
+ def letter_to_phonemes(
+     cur: Letter,
+     prev: Letter | None,
+     next: Letter | None,
+     stress_placement: Literal["syllable", "vowel"],
+ ) -> tuple[list[str], int]:
+     cur_phonemes = []
+     skip_diacritics = False
+     skip_consonants = False
+     skip_offset = 0
+ 
+     if lexicon.NIKUD_HASER_DIACRITIC in cur.all_diac:
+         skip_consonants = True
+         skip_diacritics = True
+ 
+     elif cur.char == "א" and not cur.diac and prev:
+         if next and next.char != "ו":
+             skip_consonants = True
+ 
+     elif cur.char == "י" and handle_yud(cur, prev, next):
+         skip_consonants = True
+ 
+     elif cur.char == "ש" and SIN in cur.diac:
+         if (
+             next
+             and next.char == "ש"
+             and not next.diac
+             and re.search("[\u05b7\u05b8]", cur.diac)
+         ):
+             # ^ יששכר
+             cur_phonemes.append("sa")
+             skip_consonants = True
+             skip_diacritics = True
+             skip_offset += 1
+         else:
+             cur_phonemes.append("s")
+             skip_consonants = True
+ 
+     # Shin without nikud after Sin = Sin
+     elif cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
+         cur_phonemes.append("s")
+         skip_consonants = True
+ 
+     elif not next and cur.char == "ח" and PATAH in cur.diac:
+         # Final Het gnuva
+         cur_phonemes.append("ax")
+         skip_diacritics = True
+         skip_consonants = True
+ 
+     elif not next and cur.char == "ה" and PATAH in cur.diac:
+         # Final He gnuva
+         cur_phonemes.append("ah")
+         skip_diacritics = True
+         skip_consonants = True
+ 
+     elif not next and cur.char == "ע" and PATAH in cur.diac:
+         # Final Ayin gnuva
+         cur_phonemes.append("a")
+         skip_diacritics = True
+         skip_consonants = True
+ 
+     if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
+         if cur.char == "ת":
+             cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
+             skip_diacritics = True
+             skip_consonants = True
+         else:
+             # Geresh
+             cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
+             skip_consonants = True
+ 
+     elif DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES:  # Dagesh
+         cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
+         skip_consonants = True
+     elif cur.char == "ו" and lexicon.NIKUD_HASER_DIACRITIC not in cur.all_diac:
+         vav_phonemes, vav_skip_consonants, vav_skip_diacritics, vav_skip_offset = (
+             handle_vav(cur, prev, next)
+         )
+         cur_phonemes.extend(vav_phonemes)
+         skip_consonants = vav_skip_consonants
+         skip_diacritics = vav_skip_diacritics
+         skip_offset += vav_skip_offset
+ 
+     if not skip_consonants:
+         cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))
+ 
+     if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
+         cur_phonemes.append("o")
+         skip_diacritics = True
+ 
+     nikud_phonemes = []
+     if not skip_diacritics:
+         nikud_phonemes = [
+             lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.all_diac
+         ]
+     elif skip_diacritics and lexicon.HATAMA_DIACRITIC in cur.all_diac:
+         nikud_phonemes = [lexicon.STRESS_PHONEME]
+     cur_phonemes.extend(nikud_phonemes)
+     # Ensure the stress is at the beginning of the syllable
+     cur_phonemes = sort_stress(cur_phonemes, stress_placement)
+     cur_phonemes = [
+         p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)
+     ]
+     # Remove empty phonemes
+     cur_phonemes = [p for p in cur_phonemes if p]
+     return cur_phonemes, skip_offset
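
For reference, a minimal sketch of driving the FST directly (output shown is what the tables above imply for this word; no stress mark appears because the input carries no Hat'ama diacritic):

    from phonikud.utils import get_letters
    from phonikud.hebrew import phonemize_hebrew

    letters = get_letters("שָׁלוֹם")
    print("".join(phonemize_hebrew(letters, stress_placement="vowel")))  # "ʃalom"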
phonikud/lexicon.py ADDED
@@ -0,0 +1,117 @@
+ """
+ ASCII IPA transcription of Hebrew consonants and vowels.
+ """
+ 
+ # https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
+ 
+ # Non standard diacritics
+ VOCAL_SHVA_DIACRITIC = "\u05bd"  # Meteg
+ HATAMA_DIACRITIC = "\u05ab"  # Ole
+ PREFIX_DIACRITIC = "|"  # Vertical bar
+ NIKUD_HASER_DIACRITIC = "\u05af"  # Masora, not in use
+ EN_GERESH = "'"
+ NON_STANDARD_DIAC = "".join(
+     [
+         VOCAL_SHVA_DIACRITIC,
+         HATAMA_DIACRITIC,
+         PREFIX_DIACRITIC,
+         NIKUD_HASER_DIACRITIC,
+         EN_GERESH,
+     ]
+ )
+ 
+ HE_PATTERN = rf'[\u05b0-\u05ea{NON_STANDARD_DIAC}"]+'
+ # ^ Standard nikud and letters, ole, meteg, masora, vertical bar, en geresh
+ HE_NIKUD_PATTERN = rf"[\u05b0-\u05c7{NON_STANDARD_DIAC}]"
+ # ^ Diacritics only (not letters), plus the non-standard marks above
+ PUNCTUATION = set(r".,!? ")
+ 
+ STRESS_PHONEME = "ˈ"  # \u02c8, visually looks like a single quote
+ SPECIAL_PHONEMES = ["w"]
+ MODERN_SCHEMA = {
+     "x": "χ",  # Het
+     "r": "ʁ",  # Resh
+     "g": "ɡ",  # Gimel
+ }
+ 
+ # Geresh
+ GERESH_PHONEMES = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}
+ 
+ # Consonants
+ LETTERS_PHONEMES = {
+     "א": "ʔ",  # Alef
+     "ב": "v",  # Bet
+     "ג": "g",  # Gimel
+     "ד": "d",  # Dalet
+     "ה": "h",  # He
+     "ו": "v",  # Vav
+     "ז": "z",  # Zayin
+     "ח": "x",  # Het
+     "ט": "t",  # Tet
+     "י": "j",  # Yod
+     "ך": "x",  # Haf sofit
+     "כ": "x",  # Haf
+     "ל": "l",  # Lamed
+     "ם": "m",  # Mem sofit
+     "מ": "m",  # Mem
+     "ן": "n",  # Nun sofit
+     "נ": "n",  # Nun
+     "ס": "s",  # Samekh
+     "ע": "ʔ",  # Ayin, only voweled
+     "פ": "f",  # Fey
+     "ף": "f",  # Fey sofit
+     "ץ": "ts",  # Tsadik sofit
+     "צ": "ts",  # Tsadik
+     "ק": "k",  # Kuf
+     "ר": "r",  # Resh
+     "ש": "ʃ",  # Shin
+     "ת": "t",  # Taf
+     # Beged Kefet
+     "בּ": "b",
+     "כּ": "k",
+     "פּ": "p",
+     # Shin Sin
+     "שׁ": "ʃ",
+     "שׂ": "s",
+     "'": "",
+ }
+ 
+ NIKUD_PHONEMES = {
+     "\u05b4": "i",  # Hiriq
+     "\u05b1": "e",  # Hataf segol
+     "\u05b5": "e",  # Tsere
+     "\u05b6": "e",  # Segol
+     "\u05b2": "a",  # Hataf patah
+     "\u05b7": "a",  # Patah
+     "\u05c7": "o",  # Kamatz katan
+     "\u05b9": "o",  # Holam
+     "\u05ba": "o",  # Holam haser for vav
+     "\u05bb": "u",  # Qubuts
+     "\u05b3": "o",  # Hataf kamatz
+     "\u05b8": "a",  # Kamatz
+     HATAMA_DIACRITIC: STRESS_PHONEME,  # Stress (Hat'ama)
+     VOCAL_SHVA_DIACRITIC: "e",  # Vocal Shva
+ }
+ 
+ DEDUPLICATE = {
+     "\u05f3": "'",  # Hebrew geresh to regular geresh
+     "־": "-",  # Hebrew makaf to hyphen
+ }
+ 
+ # Sets
+ SET_ENHANCED_DIACRITICS = set(
+     [HATAMA_DIACRITIC, PREFIX_DIACRITIC, VOCAL_SHVA_DIACRITIC]
+ )
+ 
+ ADDITIONAL_PHONEMES = set()  # When using fallback
+ SET_PHONEMES = set(
+     sorted(
+         {
+             *NIKUD_PHONEMES.values(),
+             *LETTERS_PHONEMES.values(),
+             *GERESH_PHONEMES.values(),
+             *MODERN_SCHEMA.values(),
+             *SPECIAL_PHONEMES,
+         }
+     )
+ )
phonikud/log.py ADDED
@@ -0,0 +1,35 @@
+ import logging
+ import os
+ import colorlog
+ 
+ 
+ def _create_logger():
+     """
+     Create a logger with colorized output
+     Usage: LOG_LEVEL=DEBUG python <script.py>
+     """
+ 
+     handler = colorlog.StreamHandler()
+     fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
+     handler.setFormatter(
+         colorlog.ColoredFormatter(
+             fmt=fmt,
+             log_colors={
+                 "DEBUG": "blue",
+                 "INFO": "green",
+                 "WARNING": "yellow",
+                 "ERROR": "red",
+                 "CRITICAL": "red",
+             },
+         )
+     )
+     # Get log level from LOG_LEVEL environment variable
+     log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
+     logger = colorlog.getLogger(__package__)
+     logger.setLevel(level=getattr(logging, log_level, logging.WARNING))
+     # Setup logging to stdout
+     logger.addHandler(handler)
+     return logger
+ 
+ 
+ log = _create_logger()
phonikud/phonemize.py ADDED
@@ -0,0 +1,130 @@
+ from phonikud import lexicon
+ from phonikud.variants import Letter
+ from .expander import Expander
+ from phonikud.utils import (
+     get_letters,
+     normalize,
+     post_normalize,
+     post_clean,
+     add_milra_hatama,
+     mark_vocal_shva,
+     sort_hatama,
+ )
+ from typing import Callable, Literal
+ import regex as re
+ from phonikud.hebrew import phonemize_hebrew
+ 
+ 
+ class Phonemizer:
+     # TODO: is that enough? what if there's punctuation around? other chars?
+     fallback_pattern = r"[a-zA-Z]+"
+ 
+     def __init__(self):
+         self.expander = Expander()
+ 
+     def phonemize(
+         self,
+         text: str,
+         preserve_punctuation: bool,
+         preserve_stress: bool,
+         use_expander: bool,
+         use_post_normalize: bool,  # For TTS
+         predict_stress: bool,
+         predict_vocal_shva: bool,
+         stress_placement: Literal["syllable", "vowel"],
+         schema: Literal["plain", "modern"],
+         fallback: Callable[[str], str] | None = None,
+     ) -> str | list[str]:
+         # Normalize
+         text = normalize(text)
+ 
+         def fallback_replace_callback(match: re.Match):
+             word = match.group(0)
+ 
+             if self.expander.dictionary.dict.get(word):
+                 # Skip; the dictionary will handle it
+                 # TODO: better API
+                 return word
+             phonemes = fallback(word).strip()
+             # TODO: check that it has only IPA?!
+             for c in phonemes:
+                 lexicon.ADDITIONAL_PHONEMES.add(c)
+             return phonemes
+ 
+         if fallback is not None:
+             text = re.sub(self.fallback_pattern, fallback_replace_callback, text)
+ 
+         if use_expander:
+             text = self.expander.expand_text(text)
+ 
+         def heb_replace_callback(match: re.Match, original_text: str):
+             word = match.group(0)
+             start_offset = match.start()
+             if start_offset > 0 and original_text[start_offset - 1] == "[":
+                 # Skip if it starts with [ as it's used for hyper phonemes
+                 return word
+ 
+             if predict_vocal_shva:
+                 word = mark_vocal_shva(word)
+             if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
+                 word = add_milra_hatama(word)
+             letters: list[Letter] = get_letters(word)
+             letters = sort_hatama(letters)
+ 
+             phonemes: list[str] = phonemize_hebrew(
+                 letters,
+                 stress_placement=stress_placement,
+             )
+             phonemes = "".join(phonemes)
+             # syllables = get_syllables(phonemes)
+ 
+             # phonemes_text = "".join(phonemes)
+             # # if predict_stress and lexicon.STRESS not in phonemes_text and syllables:
+             # #     if len(syllables) == 1:
+             # #         syllables[-1] = lexicon.STRESS + syllables[-1]
+             # #         syllables[-1] = "".join(sort_stress(syllables[-1]))
+             # #     elif any(
+             # #         remove_nikud(word).endswith(i) for i in lexicon.MILHEL_PATTERNS
+             # #     ) or phonemes_text.endswith("ax"):
+             # #         # insert lexicon.STRESS in the first character of syllables[-2]
+             # #         syllables[-2] = lexicon.STRESS + syllables[-2]
+             # #         syllables[-2] = "".join(sort_stress(syllables[-2]))
+             # #     else:
+             # #         # insert in syllables[-1]
+             # #         syllables[-1] = lexicon.STRESS + syllables[-1]
+             # #         syllables[-1] = "".join(sort_stress(syllables[-1]))
+ 
+             # phonemes = "".join(syllables)
+             if use_post_normalize:
+                 phonemes = post_normalize(phonemes)
+ 
+             if schema == "modern":
+                 # We'll keep this feature simple for now
+                 for k, v in lexicon.MODERN_SCHEMA.items():
+                     phonemes = re.sub(k, v, phonemes)
+             return phonemes
+ 
+         text = re.sub(
+             lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
+         )
+ 
+         def hyper_phonemes_callback(match: re.Match):
+             """
+             Expand hyper phonemes into normal phonemes
+             eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
+             """
+             matched_phonemes = match.group(2)
+             for c in matched_phonemes:
+                 lexicon.ADDITIONAL_PHONEMES.add(c)
+             return matched_phonemes  # The phoneme is in the second group
+ 
+         text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)
+ 
+         if not preserve_punctuation:
+             text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
+         if not preserve_stress:
+             text = "".join(i for i in text if i not in [lexicon.STRESS_PHONEME])
+         if use_post_normalize:
+             # We don't keep hyphens in the output; replace them with spaces
+             text = post_clean(text)
+         return text
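
A short sketch of the "hyper phonemes" escape hatch handled above (the bracketed span is passed through as raw IPA instead of being phonemized):

    from phonikud import phonemize

    print(phonemize("[hello](/hɛˈloʊ/)"))  # -> "hɛˈloʊ"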
phonikud/syllables.py ADDED
@@ -0,0 +1,103 @@
+ """
+ https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
+ 
+ TODO: add to phonikud?
+ """
+ 
+ import regex as re
+ import phonikud
+ 
+ VOWEL_DIACS = [chr(i) for i in range(0x05B1, 0x05BC)] + [chr(0x05C7)] + [chr(0x5BD)]
+ 
+ STRESS = "\u05ab"
+ SHVA = "\u05b0"
+ DAGESH = "\u05bc"
+ 
+ 
+ def sort_diacritics(word: str):
+     def sort_diacritics_callback(match):
+         letter = match.group(1)
+         diacritics = "".join(sorted(match.group(2)))  # Sort diacritics
+         return letter + diacritics
+ 
+     return re.sub(r"(\p{L})(\p{M}+)", sort_diacritics_callback, word)
+ 
+ 
+ def has_vowel_diacs(s: str):
+     if s == "וּ":
+         return True
+     return any(i in s for i in VOWEL_DIACS)
+ 
+ 
+ def get_syllables(word: str) -> list[str]:
+     letters = phonikud.utils.get_letters(word)
+     syllables, cur = [], ""
+     vowel_state = False
+ 
+     i = 0
+     while i < len(letters):
+         letter = letters[i]
+         has_vowel = has_vowel_diacs(str(letter)) or (i == 0 and SHVA in letter.all_diac)
+         # Look ahead
+         vav1 = i + 2 < len(letters) and letters[i + 2].char == "ו"
+         vav2 = i + 3 < len(letters) and letters[i + 3].char == "ו"
+ 
+         if has_vowel:
+             if vowel_state:
+                 syllables.append(cur)
+                 cur = str(letter)
+             else:
+                 cur += str(letter)
+             vowel_state = True
+         else:
+             cur += str(letter)
+ 
+         i += 1
+ 
+         # If two וs are coming: force the current syllable to end, and join both וs as the next syllable
+         if vav1 and vav2:
+             if cur:
+                 # Finish current syllable
+                 syllables.append(cur + str(letters[i]))
+                 cur = ""
+             cur = str(letters[i + 1]) + str(letters[i + 2])
+             i += 3  # Skip past the double vav
+             vowel_state = True
+ 
+         # If one ו is coming, end the syllable now
+         elif vav1 and letters[i + 1].diac:
+             if cur:
+                 syllables.append(cur)
+                 cur = ""
+             vowel_state = False
+ 
+     if cur:
+         syllables.append(cur)
+     return syllables
+ 
+ 
+ def add_stress_to_syllable(s: str):
+     letters = phonikud.utils.get_letters(s)
+     letters[0].all_diac = STRESS + letters[0].all_diac
+     return "".join(letter.char + letter.all_diac for letter in letters)
+ 
+ 
+ def add_stress(word: str, syllable_position: int):
+     syllables: list[str] = get_syllables(word)
+ 
+     if not syllables:
+         return word  # No syllables, return original word
+ 
+     # Normalize negative indices
+     if syllable_position < 0:
+         syllable_position += len(syllables)
+ 
+     # Clamp to valid range
+     syllable_position = max(0, min(syllable_position, len(syllables) - 1))
+ 
+     stressed_syllable = syllables[syllable_position]
+     stressed_syllable = add_stress_to_syllable(stressed_syllable)
+     syllables[syllable_position] = stressed_syllable
+ 
+     return "".join(syllables)
phonikud/utils.py ADDED
@@ -0,0 +1,247 @@
+ from functools import lru_cache
+ from typing import Literal
+ from phonikud import lexicon
+ import unicodedata
+ import regex as re
+ import phonikud.syllables
+ from phonikud.variants import Letter
+ import phonikud
+ 
+ 
+ def sort_diacritics(match):
+     letter = match.group(1)
+     diacritics = "".join(sorted(match.group(2)))  # Sort diacritics
+     return letter + diacritics
+ 
+ 
+ NORMALIZE_PATTERNS = {
+     # Sort diacritics
+     r"(\p{L})(\p{M}+)": sort_diacritics,
+     "״": '"',  # Hebrew gershayim to regular quote
+     "׳": "'",  # Hebrew geresh to regular geresh
+ }
+ 
+ 
+ def remove_nikud(text: str, to_keep=""):
+     pattern = lexicon.HE_NIKUD_PATTERN
+     pattern = "".join(i for i in pattern if i not in to_keep)
+     return re.sub(
+         pattern,
+         "",
+         text,
+     )
+ 
+ 
+ @lru_cache(maxsize=10000)
+ def normalize(text: str) -> str:
+     """
+     Normalize unicode (decompose)
+     Keep only Hebrew characters / punctuation / IPA
+     Sort diacritics
+     """
+ 
+     # Decompose text
+     text = unicodedata.normalize("NFD", text)
+     for k, v in NORMALIZE_PATTERNS.items():
+         text = re.sub(k, v, text)
+     for k, v in lexicon.DEDUPLICATE.items():
+         text = re.sub(k, v, text)
+     return text
+ 
+ 
+ def post_normalize(phonemes: str):
+     new_phonemes = []
+     for word in phonemes.split(" "):
+         # Remove glottal stop from the end
+         word = re.sub(r"ʔ$", "", word)
+         # Remove h (stressed or not) from the end
+         word = re.sub(r"h$", "", word)
+         word = re.sub(r"ˈh$", "", word)
+         # Collapse word-final ij to i
+         word = re.sub(r"ij$", "i", word)
+         new_phonemes.append(word)
+     phonemes = " ".join(new_phonemes)
+     return phonemes
+ 
+ 
+ def post_clean(phonemes: str):
+     clean = []
+     for i in phonemes:
+         if i == "-":
+             clean.append(" ")
+         elif (
+             i in lexicon.SET_PHONEMES
+             or i in lexicon.ADDITIONAL_PHONEMES
+             or i == " "
+             or i in lexicon.PUNCTUATION
+         ):
+             clean.append(i)
+     return "".join(clean)
+ 
+ 
+ letters_pattern = re.compile(r"(\p{L})([\p{M}'|]*)")
+ 
+ 
+ # @lru_cache(maxsize=10000) TODO?
+ def get_letters(word: str):
+     letters: list[tuple[str, str]] = letters_pattern.findall(word)  # With en geresh
+     letters: list[Letter] = [Letter(i[0], i[1]) for i in letters]
+     return letters
+ 
+ 
+ def get_unicode_names(text: str):
+     return [unicodedata.name(c, "?") for c in text]
+ 
+ 
+ def has_vowel(s: str):
+     return any(i in s for i in "aeiou")
+ 
+ 
+ def has_constant(s: str):
+     # True if any character is a consonant (non-vowel)
+     return any(i not in "aeiou" for i in s)
+ 
+ 
+ def get_phoneme_syllables(phonemes: list[str]) -> list[str]:
+     syllables = []
+     cur_syllable = ""
+ 
+     i = 0
+     while i < len(phonemes):
+         # Add current phoneme to the syllable
+         cur_syllable += phonemes[i]
+ 
+         # If we have a vowel in the current syllable
+         if has_vowel(cur_syllable):
+             # If there's a next phoneme that's a consonant followed by a vowel-containing phoneme
+             if (
+                 i + 2 < len(phonemes)
+                 and not has_vowel(phonemes[i + 1])
+                 and has_vowel(phonemes[i + 2])
+             ):
+                 # End the current syllable and start a new one
+                 syllables.append(cur_syllable)
+                 cur_syllable = ""
+             # If we're at the end or the next phoneme has a vowel
+             elif i + 1 >= len(phonemes) or has_vowel(phonemes[i + 1]):
+                 # End the current syllable
+                 syllables.append(cur_syllable)
+                 cur_syllable = ""
+ 
+         i += 1
+ 
+     # Add any remaining syllable
+     if cur_syllable:
+         syllables.append(cur_syllable)
+ 
+     # Move any syllable-final stress mark to the start of the next syllable
+     for i in range(len(syllables) - 1):  # Ensure we're not at the last syllable
+         if syllables[i].endswith(lexicon.STRESS_PHONEME):
+             syllables[i + 1] = (
+                 lexicon.STRESS_PHONEME + syllables[i + 1]
+             )  # Move stress to next syllable
+             syllables[i] = syllables[i][
+                 : -len(lexicon.STRESS_PHONEME)
+             ]  # Remove stress from current syllable
+ 
+     return syllables
+ 
+ 
+ def sort_stress(
+     phonemes: list[str], placement: Literal["syllable", "vowel"] = "vowel"
+ ) -> list[str]:
+     """
+     TTS systems expect the stress BEFORE the vowel;
+     linguists expect it at the START of the syllable.
+     Use placement="syllable" to place it at the beginning.
+     """
+     if "ˈ" not in "".join(phonemes):
+         # ^ Does not contain stress
+         return phonemes
+     if not any(i in "".join(phonemes) for i in "aeiou"):
+         # ^ Does not contain a vowel
+         return phonemes
+ 
+     # Remove stress marker
+     phonemes = [p for p in phonemes if p != "ˈ"]
+ 
+     if placement == "syllable":
+         return ["ˈ"] + phonemes
+ 
+     # Define vowels
+     vowels = "aeiou"
+ 
+     # Find the first phoneme that contains a vowel, and inject the stress before the vowel
+     for i, phoneme in enumerate(phonemes):
+         for j, char in enumerate(phoneme):
+             if char in vowels:
+                 # Insert stress before the vowel
+                 phonemes[i] = phoneme[:j] + "ˈ" + phoneme[j:]
+                 return phonemes
+ 
+     # If no vowels found, return unchanged
+     return phonemes
+ 
+ 
+ def mark_vocal_shva(word: str):
+     """
+     Vocal Shva is context-independent and can be predicted with just the word or a dictionary.
+     See https://hebrew-academy.org.il/2020/08/11/איך-הוגים-את-השווא-הנע
+     Note: we predict only if the Shva is on the first letter of the word
+     Note: we assume that the word comes with | to mark 'Txiliyot'
+     Note: Vocal Shva rules mid-word are unreliable, so we don't code them.
+ 
+     Meteg (\u05bd) will be added to the letter with the Vocal Shva
+ 
+     What we don't predict:
+     (1) some Shva at the beginning of future-tense forms (we can't know)
+     (2) Shva in the middle of the word
+     """
+     letters = get_letters(word)
+     if not letters:
+         return word
+     if "\u05b0" in letters[0].all_diac:
+         # ^ Only a word-initial Shva is predicted (see the notes above)
+         if letters[0].char in "למנרי":
+             letters[0].all_diac += lexicon.VOCAL_SHVA_DIACRITIC
+         elif len(letters) > 1 and letters[1].char in "אעה":
+             letters[0].all_diac += lexicon.VOCAL_SHVA_DIACRITIC
+         elif (
+             letters[0].char in "וכלב"
+             and lexicon.PREFIX_DIACRITIC in letters[0].all_diac
+         ):
+             # ^ The nakdan should add |
+             letters[0].all_diac += lexicon.VOCAL_SHVA_DIACRITIC
+     # Ensure that the prefix character comes last
+     for letter in letters:
+         if "|" in letter.all_diac:
+             letter.all_diac = letter.all_diac.replace("|", "") + "|"
+     return "".join(str(i) for i in letters)
+ 
+ 
+ def sort_hatama(letters: list[Letter]) -> list[Letter]:
+     for i in range(len(letters) - 1):
+         diacs = list(letters[i].all_diac)
+         if lexicon.HATAMA_DIACRITIC in diacs and lexicon.NIKUD_HASER_DIACRITIC in diacs:
+             diacs.remove(lexicon.HATAMA_DIACRITIC)
+             letters[i].all_diac = "".join(diacs)  # Reassign the updated diacritics
+             letters[i + 1].all_diac += lexicon.HATAMA_DIACRITIC
+     return letters
+ 
+ 
+ def add_milra_hatama(word: str):
+     syllables = phonikud.syllables.get_syllables(word)
+     stress_index = -1
+ 
+     if not syllables:
+         return word
+ 
+     if len(syllables) == 1:
+         stress_index = 0
+ 
+     # Get the last syllable
+     milra = syllables[stress_index]
+     # Get letters
+     letters = get_letters(milra)
+     # Add Hat'ama
+     letters[0].all_diac += lexicon.HATAMA_DIACRITIC
+ 
+     # Replace the last syllable
+     syllables[stress_index] = "".join(str(i) for i in letters)
+     return "".join(syllables)
phonikud/variants.py ADDED
@@ -0,0 +1,20 @@
+ import phonikud
+ from phonikud import lexicon
+ 
+ 
+ class Letter:
+     def __init__(self, char: str, diac: str):
+         self.char = phonikud.normalize(char)
+         self.all_diac = phonikud.normalize(diac)
+         self.diac = "".join(
+             i for i in self.all_diac if i not in lexicon.SET_ENHANCED_DIACRITICS
+         )
+ 
+     def __repr__(self):
+         return f"[Letter] {self.char}{''.join(self.all_diac)}"
+ 
+     def __eq__(self, value: "Letter"):
+         return value.all_diac == self.all_diac and value.char == self.char
+ 
+     def __str__(self):
+         return self.char + self.all_diac
requirements.txt ADDED
@@ -0,0 +1,47 @@
+ # This file was autogenerated by uv via the following command:
+ #     uv export --no-hashes --no-emit-project
+ colorama==0.4.6 ; sys_platform == 'win32'
+     # via
+     #   colorlog
+     #   pytest
+     #   tqdm
+ colorlog==6.9.0
+     # via phonikud
+ docopt==0.6.2
+     # via num2words
+ exceptiongroup==1.3.0 ; python_full_version < '3.11'
+     # via pytest
+ iniconfig==2.1.0
+     # via pytest
+ num2words==0.5.14
+     # via phonikud
+ numpy==2.2.6 ; python_full_version < '3.11'
+     # via pandas
+ numpy==2.3.1 ; python_full_version >= '3.11'
+     # via pandas
+ packaging==25.0
+     # via pytest
+ pandas==2.3.0
+ pluggy==1.6.0
+     # via pytest
+ pygments==2.19.2
+     # via pytest
+ pytest==8.4.1
+ python-dateutil==2.9.0.post0
+     # via pandas
+ pytz==2025.2
+     # via pandas
+ regex==2024.11.6
+     # via phonikud
+ ruff==0.12.0
+ six==1.17.0
+     # via python-dateutil
+ tomli==2.2.1 ; python_full_version < '3.11'
+     # via pytest
+ tqdm==4.67.1
+ typing-extensions==4.14.0 ; python_full_version < '3.11'
+     # via exceptiongroup
+ tzdata==2025.2
+     # via pandas
+ phonikud-onnx
+ gradio>=5.34.2