thewh1teagle committed
Commit · bcfb376
Parent(s): latest
Browse files
- .gitattributes +1 -0
- README.md +10 -0
- app.py +119 -0
- phonikud-1.0.int8.onnx +3 -0
- phonikud/__init__.py +39 -0
- phonikud/data/rashej_tevot.json +3 -0
- phonikud/data/special.json +9 -0
- phonikud/data/symbols.json +5 -0
- phonikud/expander/__init__.py +33 -0
- phonikud/expander/dates.py +60 -0
- phonikud/expander/dictionary.py +79 -0
- phonikud/expander/number_names.py +193 -0
- phonikud/expander/numbers.py +39 -0
- phonikud/expander/time_to_word.py +104 -0
- phonikud/hebrew.py +222 -0
- phonikud/lexicon.py +117 -0
- phonikud/log.py +35 -0
- phonikud/phonemize.py +130 -0
- phonikud/syllables.py +103 -0
- phonikud/utils.py +247 -0
- phonikud/variants.py +20 -0
- requirements.txt +47 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
*.onnx filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,10 @@
---
title: Phonemize in Hebrew
emoji: ๐ข
colorFrom: red
colorTo: green
sdk: gradio
sdk_version: "4.44.0"
app_file: app.py
pinned: false
---
app.py
ADDED
@@ -0,0 +1,119 @@
"""
uv sync
wget https://huggingface.co/thewh1teagle/phonikud-onnx/resolve/main/phonikud-1.0.int8.onnx
uv run gradio app.py
"""

from phonikud import phonemize, lexicon
from phonikud.utils import remove_nikud
import gradio as gr
from phonikud_onnx import Phonikud
from pathlib import Path


default_text = """
ืึทืึผึทืึผึธืื ื ึดืฆึฐืึธื ืึฐืึนืืคึถื ืึทืกึผึดืืจึธื ืึผึดืึฐืึทื ืึทืกึผึฐืขึธืจึธื.
ืึดืกึฐืึผึทืจึฐืชึผึดื ืึธืึผ ืึถืช ืึทืึผึนื, ืึฐืึธืึทืจึฐืชึผึดื ืึผึฐืึดืึผืึผืง ืึธื ืงึธืจึธื.
ืึทืึผึฐืึธืึดืื ืึธืึฒืืึผ ืึผึดืึฐืึปืืึธื ืึถืช ืึทืกึผึดืืคึผืึผืจึดืื ืึทืึผึธืืึผ ืฉืึถืึทืึผืึนืจึธื ืึดืงึฐืจึดืืึธื.
""".strip()


def on_phonikud_toggle(use_phonikud):
    if not use_phonikud:
        return default_text
    return remove_nikud(default_text)


css = """
.input textarea {
    font-size: 22px;
    padding: 15px;
    height: 200px;
}

.phonemes {
    background: var(--input-background-fill);
    padding: 5px;
    min-height: 50px;
}
"""

theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Noto Sans Hebrew")])

phonikud = None
commit = "unknown"
model_path = Path("./phonikud-1.0.int8.onnx")
if model_path.exists():
    phonikud = Phonikud(str(model_path))
    metadata = phonikud.get_metadata()
    commit = metadata.get("commit", "unknown")


def on_submit(text: str, schema: str, use_phonikud: bool) -> str:
    diacritized = (
        phonikud.add_diacritics(
            text, mark_matres_lectionis=lexicon.NIKUD_HASER_DIACRITIC
        )
        if phonikud and use_phonikud
        else text
    )
    phonemes = phonemize(
        diacritized, predict_stress=True, schema=schema, predict_vocal_shva=False
    )
    if use_phonikud:
        return f"<div dir='rtl' style='font-size: 22px;'>{diacritized.strip()}</div><br><div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"
    else:
        return f"<div dir='ltr' style='font-size: 22px;'>{phonemes.strip()}</div>"


with gr.Blocks(theme=theme, css=css) as demo:
    text_input = gr.Textbox(
        value=remove_nikud(default_text),
        label="Text",
        rtl=True,
        elem_classes=["input"],
        lines=7,
    )

    with gr.Row():
        schema_dropdown = gr.Dropdown(
            choices=["modern", "plain"], value="plain", label="Phoneme Schema"
        )
        use_phonikud_checkbox = gr.Checkbox(
            value=True, label="Use Phonikud (add diacritics)"
        )

    submit_button = gr.Button("Create")
    output_box = gr.Markdown(label="Phonemes + Diacritics", elem_classes=["phonemes"])
    use_phonikud_checkbox.change(
        fn=lambda use_phonikud: (
            on_phonikud_toggle(use_phonikud),  # Update text_input
            on_submit(
                on_phonikud_toggle(use_phonikud), schema_dropdown.value, use_phonikud
            ),  # Update output_box
        ),
        inputs=use_phonikud_checkbox,
        outputs=[text_input, output_box],  # Update both text input and output box
    )

    submit_button.click(
        fn=on_submit,
        inputs=[text_input, schema_dropdown, use_phonikud_checkbox],
        outputs=output_box,
    )

    gr.Markdown("""
    <p style='text-align: center;'><a href='https://github.com/thewh1teagle/phonikud' target='_blank'>Phonikud on Github</a></p>
    """)

    gr.Markdown(f"""
    <p style='text-align: center; opacity: 0.2; font-size: 10px;'>
    <a href='https://huggingface.co/thewh1teagle/phonikud/commit/{commit}' target='_blank' style='color: white;'>phonikud version {commit}</a>
    </p>
    """)

if __name__ == "__main__":
    demo.launch()
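Note: a minimal sketch of the same pipeline without the Gradio UI, using only the calls that appear above (`Phonikud`, `add_diacritics`, `phonemize`); the input string is a stand-in, not from this commit:

    from phonikud import phonemize, lexicon
    from phonikud_onnx import Phonikud

    model = Phonikud("./phonikud-1.0.int8.onnx")  # downloaded as in the docstring
    undotted = "..."  # any Hebrew text without nikud
    dotted = model.add_diacritics(
        undotted, mark_matres_lectionis=lexicon.NIKUD_HASER_DIACRITIC
    )
    # Same flags on_submit uses for the "plain" schema
    ipa = phonemize(dotted, predict_stress=True, schema="plain",
                    predict_vocal_shva=False)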
phonikud-1.0.int8.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c1fa2624b1e8202a0c0a23259b560b0c41ad92a3a6750bd0e322ce5a2b1acdb6
size 307844158
phonikud/__init__.py
ADDED
@@ -0,0 +1,39 @@
"""
High level phonemize functions
"""

from .phonemize import Phonemizer
from .utils import normalize  # noqa: F401
from typing import Callable, Literal

phonemizer = Phonemizer()


def phonemize(
    text: str,
    preserve_punctuation=True,
    preserve_stress=True,
    use_expander=True,
    use_post_normalize=True,  # For TTS
    predict_stress=True,
    predict_vocal_shva=True,
    stress_placement: Literal["syllable", "vowel"] = "vowel",
    schema: Literal["plain", "modern"] = "modern",
    fallback: Callable[[str], str] = None,
) -> str:
    """
    Set stress_placement="syllable" to place stress at syllable start.
    """
    phonemes = phonemizer.phonemize(
        text,
        preserve_punctuation=preserve_punctuation,
        preserve_stress=preserve_stress,
        fallback=fallback,
        use_expander=use_expander,
        use_post_normalize=use_post_normalize,
        predict_stress=predict_stress,
        schema=schema,
        predict_vocal_shva=predict_vocal_shva,
        stress_placement=stress_placement,
    )
    return phonemes
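Note: typical usage of this high-level entry point; the input is ordinary dotted Hebrew and the keyword arguments are the ones defined above:

    from phonikud import phonemize

    phonemize("שָׁלוֹם")                 # defaults: modern schema, stress predicted
    phonemize("שָׁלוֹם", schema="plain",
              preserve_stress=False)     # bare phonemes without the stress mark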
phonikud/data/rashej_tevot.json
ADDED
@@ -0,0 +1,3 @@
{
  "ืฆืืดื": "tsˈahal"
}
phonikud/data/special.json
ADDED
@@ -0,0 +1,9 @@
{
  "ืึทืืึฐืึธื": "wˈala",
  "ืึทืกึทืึดึผื": "wasˈabi",
  "ืคืื ืืืื": "pinguwˈin",
  "ืืืืฆืืค": "wˈatsʔap",
  "ืืืืืกืืค": "wˈatsʔap",
  "ืืืืื": "jˈala",
  "ืืืืืจ": "wˈolter"
}
phonikud/data/symbols.json
ADDED
@@ -0,0 +1,5 @@
{
  "โช": "ʃˈekel",
  "$": "dˈolar",
  "%": "axˈuz"
}
phonikud/expander/__init__.py
ADDED
@@ -0,0 +1,33 @@
"""
Expand dates and numbers into words with nikud.
This happens before phonemization.
"""

from .numbers import num_to_word
from .dates import date_to_word
from .time_to_word import time_to_word
from .dictionary import Dictionary
from phonikud.log import log


class Expander:
    def __init__(self):
        self.dictionary = Dictionary()

    def expand_text(self, text: str):
        words = []
        for source_word in text.split():
            try:
                word = date_to_word(source_word)
                if word == source_word:
                    word = time_to_word(word)
                if word == source_word:
                    word = num_to_word(word)
                words.append(word)
            except Exception as e:
                # Log source_word: `word` may be unbound if date_to_word raised
                log.error(f"Failed to expand {source_word} with error: {e}")
                words.append(source_word)
        text = " ".join(words)
        text = self.dictionary.expand_text(text)

        return text
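Note: usage sketch; `Expander` is the pre-phonemization pass that spells out numbers, dates and times and then applies the JSON dictionaries:

    from phonikud.expander import Expander

    expander = Expander()
    expander.expand_text("5 שקל")   # "5" becomes a diacritized number word
    expander.expand_text("8:30")    # times are spelled out as well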
phonikud/expander/dates.py
ADDED
@@ -0,0 +1,60 @@
from datetime import datetime
from .numbers import num_to_word

# Mapping of month names in Hebrew with diacritics (Gregorian months)
MONTHS = {
    1: "ืึธึซื ืึผืึธืจ",
    2: "ืคึถึซืึฐืจืึผืึธืจ",
    3: "ืึตึซืจึฐืฅ",
    4: "ืึตืคึฐืจึดืื",
    5: "ืึทืื",
    6: "ืึซืึผื ึดื",
    7: "ืึซืึผืึดื",
    8: "ืึซืึนืึปืกึฐื",
    9: "ืกึถืคึผึฐืึถึซืึฐืึถึผืจ",
    10: "ืืึนืงึฐืึซืึนืึถึผืจ",
    11: "ื ืึนืึถึซืึฐืึถึผืจ",
    12: "ืึถึผืฆึถึซืึฐืึถึผืจ",
}

# Mapping of day names in Hebrew with diacritics
DAYS = {
    0: "ืืึนื ืจึดืืฉืืึนื",
    1: "ืืึนื ืฉึตืื ึดื",
    2: "ืืึนื ืฉึฐืืึดืืฉึดืื",
    3: "ืืึนื ืจึตืึดืืขึดื",
    4: "ืืึนื ืึฒืึดืืฉึดืื",
    5: "ืืึนื ืฉึดืืืฉึดืื",
    6: "ืืึนื ืฉึทืืึธึผืช",
}


def date_to_word(word: str, include_day_name=False) -> str:
    """
    Converts a date string (YYYY-MM-DD or DD-MM-YYYY, with "-", "." or "/"
    as the separator) to a Hebrew date phrase with diacritics.
    Returns the original word if it's not a valid date.
    """
    separators = ["-", ".", "/"]
    orders = [("%Y", "%m", "%d"), ("%d", "%m", "%Y")]
    date_formats = [sep.join(order) for order in orders for sep in separators]

    for date_format in date_formats:
        try:
            # Try parsing the word with each date format
            date_obj = datetime.strptime(word, date_format)

            # Get the Hebrew day name with diacritics
            day_name = DAYS[date_obj.weekday()]

            # Convert month to Hebrew name with diacritics
            month_name = MONTHS[date_obj.month]
            day = num_to_word(str(date_obj.day))
            year = num_to_word(str(date_obj.year))

            text = f"{day} ืึผึต{month_name} {year}"
            if include_day_name:
                text = f"{day_name}, {text}"
            return text
        except ValueError:
            continue
    return word
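Note: example calls; the outputs are Hebrew phrases with nikud, so only the behavior is described here:

    from phonikud.expander.dates import date_to_word

    date_to_word("2025-05-14")                         # day, month name, year spelled out
    date_to_word("14/05/2025", include_day_name=True)  # prefixed with the weekday name
    date_to_word("hello")                              # not a date: returned unchanged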
phonikud/expander/dictionary.py
ADDED
@@ -0,0 +1,79 @@
"""
Dictionaries are JSON key/value word mappings loaded from phonikud/data
"""

from pathlib import Path
import json
import re
from phonikud.utils import remove_nikud
from phonikud.utils import normalize
from phonikud import lexicon
import unicodedata

files = Path(__file__).parent.joinpath("../data").glob("*.json")
# Sort in reverse order to prioritize the most recent and best
order = {"bronze": 1, "silver": 2, "gold": 3}
files = sorted(
    files, key=lambda f: order.get(next((x for x in order if x in f.stem), ""), 0)
)


class Dictionary:
    def __init__(self):
        self.dict = {}
        self.load_dictionaries()

    def load_dictionaries(self):
        for file in files:
            with open(file, "r", encoding="utf-8") as f:
                dictionary: dict = json.load(f)
                normalized_dictionary = {}

                # normalize nikud keys
                for k, v in dictionary.items():
                    k = normalize(k)
                    # Ensure not empty
                    if k and v:
                        normalized_dictionary[k] = v
                self.dict.update(normalized_dictionary)

    def replace_hebrew_only_callback(self, match: re.Match[str]) -> str:
        source: str = match.group(0)
        # decompose
        source = unicodedata.normalize("NFD", source)
        raw_lookup = self.dict.get(source)

        without_nikud_lookup = self.dict.get(remove_nikud(source))
        with_nikud_lookup = self.dict.get(normalize(source))
        # Compare without nikud ONLY if source has no nikud
        if raw_lookup:
            return raw_lookup
        if without_nikud_lookup:
            return without_nikud_lookup
        elif with_nikud_lookup:
            return with_nikud_lookup
        return source

    def replace_non_whitespace_callback(self, match: re.Match[str]) -> str:
        raw_source: str = match.group(0)
        if raw_source.isnumeric():
            return raw_source

        raw_lookup = self.dict.get(raw_source)

        # Compare without nikud ONLY if source has no nikud
        if raw_lookup:
            return raw_lookup
        # search by only ', space, regular nikud, alphabet
        raw_source = re.sub(
            lexicon.HE_PATTERN, self.replace_hebrew_only_callback, raw_source
        )
        return raw_source

    def expand_text(self, text: str) -> str:
        """
        TODO: if key doesn't have diacritics expand even diacritized words
        """
        text = re.sub(r"\S+", self.replace_non_whitespace_callback, text)

        return text
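Note: usage sketch; the class loads every JSON file under phonikud/data at construction time:

    from phonikud.expander.dictionary import Dictionary

    d = Dictionary()
    # Words matching a dictionary key (e.g. entries from special.json) are
    # replaced by their stored transcription; everything else is unchanged.
    d.expand_text("some text")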
phonikud/expander/number_names.py
ADDED
@@ -0,0 +1,193 @@
"""
See https://github.com/savoirfairelinux/num2words/blob/master/num2words/lang_HE.py
"""

# TODO: add nikud hints

ZERO = {"ืืคืก": "ืึถึซืคึถืก"}


ONES = {
    "ืืืช": "ืึทืึทืช",
    "ืืื": "ืึถืึธื",
    "ืจืืฉืื ื": "ืจึดืืฉืืึนื ึธื",
    "ืจืืฉืื": "ืจึดืืฉืืึนื",
    "ืจืืฉืื ืืช": "ืจึดืืฉืืึนื ืึนืช",
    "ืจืืฉืื ืื": "ืจึดืืฉืืึนื ึดืื",
    "ืฉืชืืื": "ืฉืึฐืชึทึผึซืึดื",
    "ืฉื ืืื": "ืฉึฐืื ึทึซืึดื",
    "ืฉืชื": "ืฉึฐืืชึตึผื",
    "ืฉื ื": "ืฉืึฐื ึตื",
    "ืฉื ืืื": "ืฉึฐืื ึดืึธึผื",
    "ืฉื ืืืช": "ืฉึฐืื ึดืึผืึนืช",
    "ืฉืืืฉ": "ืฉืึธืืึนืฉื",
    "ืฉืืืฉื": "ืฉืึฐืืึนืฉืึธื",
    "ืฉืืืฉืช": "ืฉืึฐืึซืึนืฉืึถืช",
    "ืฉืืืฉืืช": "ืฉืึฐืึดืืฉืึดืืช",
    "ืฉืืืฉื": "ืฉืึฐืึดืืฉืึดื",
    "ืฉืืืฉืืืช": "ืฉืึฐืึดืืฉืึดืึผืึนืช",
    "ืฉืืืฉืืื": "ืฉืึฐืึดืืฉืึดืึดึผืื",
    "ืืจืืข": "ืึทึซืจึฐืึทึผืข",
    "ืืจืืขื": "ืึทืจึฐืึทึผืขึธื",
    "ืืจืืขืช": "ืึทืจึฐืึผึทึซืขึทืช",
    "ืจืืืขืืช": "ืจึตืึดืืขึดืืช",
    "ืจืืืขื": "ืจึตืึดืืขึดื",
    "ืจืืืขืืืช": "ืจึตืึดืืขึดืืึนืช",
    "ืจืืืขืืื": "ืจึตืึดืืขึดืึดึผืื",
    "ืืืฉ": "ืึธืึตืฉื",
    "ืืืืฉื": "ืึฒืึดืฉึธึผืื",
    "ืืืฉืช": "ืึฒืึตึซืฉืึถืช",
    "ืืืืฉืืช": "ืึฒืึดืืฉึดึผืืืช",
    "ืืืืฉื": "ืึฒืึดืืฉึดึผืื",
    "ืืืืฉืืืช": "ืึฒืึดืืฉึดึผืืืึนืช",
    "ืืืืฉืืื": "ืึฒืึดืืฉึดึผืืึดึผืื",
    "ืฉืฉ": "ืฉึตืืฉื",
    "ืฉืืฉื": "ืฉึดืืฉึธึผืื",
    "ืฉืฉืช": "ืฉืึตึซืฉืึถืช",
    "ืฉืืฉืืช": "ืฉึดืืฉึดึผืืืช",
    "ืฉืืฉื": "ืฉึดืืฉึดึผืื",
    "ืฉืืฉืืืช": "ืฉึดืืฉึดึผืืืึนืช",
    "ืฉืืฉืืื": "ืฉึดืืฉึดึผืืึดึผืื",
    "ืฉืืข": "ืฉึถืึซืึทืข",
    "ืฉืืขื": "ืฉึดืืึฐืขึธื",
    "ืฉืืขืช": "ืฉืึดืึฐืขึทืช",
    "ืฉืืืขืืช": "ืฉึฐืืึดืืขึดืืช",
    "ืฉืืืขื": "ืฉึฐืืึดืืขึดื",
    "ืฉืืืขืืืช": "ืฉึฐืืึดืืขึดืืึนืช",
    "ืฉืืืขืืื": "ืฉึฐืืึดืืขึดืึดึผืื",
    "ืฉืืื ื": "ืฉืึฐืึซืึนื ึถื",
    "ืฉืืื ืช": "ืฉืึฐืืึนื ึทืช",
    "ืฉืืื ืืช": "ืฉึฐืืึดืื ึดืืช",
    "ืฉืืื ื": "ืฉึฐืืึดืื ึดื",
    "ืฉืืื ืืืช": "ืฉึฐืืึดืื ึดืืึนืช",
    "ืฉืืื ืืื": "ืฉึฐืืึดืื ึดืึดึผืื",
    "ืชืฉืข": "ืชึผึตืฉืึทืข",
    "ืชืฉืขื": "ืชึดึผืฉึฐืืขึธื",
    "ืชืฉืขืช": "ืชึผึดืฉืึฐืขึทืช",
    "ืชืฉืืขืืช": "ืชึผึฐืฉืึดืืขึดืืช",
    "ืชืฉืืขื": "ืชึผึฐืฉืึดืืขึดื",
    "ืชืฉืืขืืืช": "ืชึผึฐืฉืึดืืขึดืึผืึนืช",
    "ืชืฉืืขืืื": "ืชึผึฐืฉืึดืืขึดืึผึดืื",
}


TENS = {
    "ืขืฉืจ": "ืขึถึซืฉืึถืจ",
    "ืขืฉืจื": "ืขึถืฉืึฐืจึตื",
    "ืขืฉืจืช": "ืขึฒืฉึถืึซืจึถืช",
    "ืขืฉืืจืืช": "ืขึฒืฉืึดืืจึดืืช",
    "ืขืฉืืจื": "ืขึฒืฉืึดืืจึดื",
    "ืขืฉืืจืืืช": "ืขึฒืฉืึดืืจึดืึผืึนืช",
    "ืขืฉืืจืืื": "ืขึฒืฉืึดืืจึดืึผึดืื",
    "ืฉืชืื ืขืฉืจื": "ืฉืึฐืชึผึตึซืื ืขึถืฉืึฐืจึตื",
    "ืฉื ืื ืขืฉืจ": "ืฉืึฐื ึตืื ืขึธืฉืึธืจ",
}


TWENTIES = {
    "ืขืฉืจืื": "ืขึถืฉึฐืืจึดึซืื",
    "ืฉืืืฉืื": "ืฉึฐืืืึนืฉึดืืื",
    "ืืจืืขืื": "ืึทืจึฐืึธึผืขึดืื",
    "ืืืืฉืื": "ืึฒืึดืฉึดึผืืื",
    "ืฉืืฉืื": "ืฉึดืืฉึดึผืืื",
    "ืฉืืขืื": "ืฉึดืืึฐืขึดืื",
    "ืฉืืื ืื": "ืฉึฐืืืึนื ึดืื",
    "ืชืฉืขืื": "ืชึดึผืฉึฐืืขึดืื",
}


HUNDREDS = {
    "ืืื": "ืึตึซืึธื",
    "ืืืช": "ืึตืึทืช",
    "ืืืชืืื": "ืึธืืชึทึซืึดื",
    "ืืืืช": "ืึตืืึนืช",
}

THOUSANDS = {
    "ืืืฃ": "ืึถึซืึถืฃ",
    "ืืืคืืื": "ืึทืึฐืคึทึผึซืึดื",
    "ืืืคืื": "ืึฒืึธืคึดืื",
    "ืืืคื": "ืึทืึฐืคึดึผื",
}


LARGE = {
    "ืืืืืื": "ืึดืืึฐืืึนื",
    "ืืืืืื ื": "ืึดืืึฐืืึนื ึดื",
    "ืืืืืืจื": "ืึดืืึฐืึทืืจึฐื",
    "ืืืืืืจืื": "ืึดืืึฐืึทึซืืจึฐืึดึผื",
    "ืืจืืืืื": "ืึฐืจึดืืึฐืืึนื",
    "ืืจืืืืื ื": "ืึฐืจึดืืึฐืืึนื ึดื",
    "ืงืืืืจืืืืื": "ืงืึนืึทืึฐืจึดืืึฐืืึนื",
    "ืงืืืืจืืืืื ื": "ืงืึนืึทืึฐืจึดืืึฐืืึนื ึดื",
    "ืงืืืื ืืืืืื": "ืงืึดืึดึผื ึฐืึดืืึฐืืึนื",
    "ืงืืืื ืืืืืื ื": "ืงืึดืึดึผื ึฐืึดืืึฐืืึนื ึดื",
    "ืกืงืกืืืืืื": "ืกึฐืงึถืกึฐืึดืืึฐืืึนื",
    "ืกืงืกืืืืืื ื": "ืกึฐืงึถืกึฐืึดืืึฐืืึนื ึดื",
    "ืกืคืืืืืื": "ืกึฐืคึถึผืึดืืึฐืืึนื",
    "ืกืคืืืืืื ื": "ืกึฐืคึถึผืึดืืึฐืืึนื ึดื",
    "ืืืงืืืืืื": "ืืึนืงึฐืึดืืึฐืืึนื",
    "ืืืงืืืืืื ื": "ืืึนืงึฐืึดืืึฐืืึนื ึดื",
    "ื ืื ืืืืื": "ื ืึผื ึดืืึฐืืึนื",
    "ื ืื ืืืืื ื": "ื ืึผื ึดืืึฐืืึนื ึดื",
    "ืืกืืืืื": "ืึถึผืกึดืืึฐืืึนื",
    "ืืกืืืืื ื": "ืึถึผืกึดืืึฐืืึนื ึดื",
    "ืืื ืืกืืืืื": "ืืึผื ึฐืึฐืกึดืืึฐืืึนื",
    "ืืื ืืกืืืืื ื": "ืืึผื ึฐืึฐืกึดืืึฐืืึนื ึดื",
    "ืืืืืืกืืืืื": "ืืึผืืืึฐืกึดืืึฐืืึนื",
    "ืืืืืืกืืืืื ื": "ืืึผืืืึฐืกึดืืึฐืืึนื ึดื",
    "ืืจืืกืืืืื": "ืึถืจึฐืึฐืกึดืืึฐืืึนื",
    "ืืจืืกืืืืื ื": "ืึถืจึฐืึฐืกึดืืึฐืืึนื ึดื",
    "ืงืืืืืืืจืืกืืืืื": "ืงืึผืืึฐืึธืืืจึฐืึฐืกึดืืึฐืืึนื",
    "ืงืืืืืืืจืืกืืืืื ื": "ืงืึผืืึฐืึธืืืจึฐืึฐืกึดืืึฐืืึนื ึดื",
    "ืงืืืื ืืกืืืืื": "ืงืึดืึดึผื ึฐืึฐืกึดืืึฐืืึนื",
    "ืงืืืื ืืกืืืืื ื": "ืงืึดืึดึผื ึฐืึฐืกึดืืึฐืืึนื ึดื",
    "ืกืงืกืืกืืืืื": "ืกึถืงึฐืกึฐืึฐืกึดืืึฐืืึนื",
    "ืกืงืกืืกืืืืื ื": "ืกึถืงึฐืกึฐืึฐืกึดืืึฐืืึนื ึดื",
    "ืกืคืื ืืกืืืืื": "ืกึฐืคึถึผืึทื ึฐืึฐืกึดืืึฐืืึนื",
    "ืกืคืื ืืกืืืืื ื": "ืกึฐืคึถึผืึทื ึฐืึฐืกึดืืึฐืืึนื ึดื",
    "ืืืงืืืืกืืืืื": "ืืึนืงึฐืืึนืึฐืกึดืืึฐืืึนื",
    "ืืืงืืืืกืืืืื ื": "ืืึนืงึฐืืึนืึฐืกึดืืึฐืืึนื ึดื",
    "ื ืืืืืกืืืืื": "ื ืึนืึฐืึทืึฐืกึดืืึฐืืึนื",
    "ื ืืืืืกืืืืื ื": "ื ืึนืึฐืึทืึฐืกึดืืึฐืืึนื ึดื",
    "ืืืืื ืืืืืื": "ืึดืืึดึผืื ึฐืึดืืึฐืืึนื",
    "ืืืืื ืืืืืื ื": "ืึดืืึดึผืื ึฐืึดืืึฐืืึนื ึดื",
}


LETTERS = {
    "ื": "ืึต",
    "ื": "ืึท",
}


CURRENCY = {
    "ืฉืงื": "ืฉึตืึซืงึถื",
    "ืฉืงืืื": "ืฉึฐืืงึธืึดืื",
    "ืืืืจื": "ืึฒืืึนืจึธื",
    "ืืืืจืืช": "ืึฒืืึนืจืึนืช",
    "ืืืจื": "ืึตึซืืจืึน",
    "ืกื ื": "ืกึตื ึฐื",
    "ืกื ืืื": "ืกึตึซื ึฐืึดืื",
    "ืืืืจ": "ืึซืึนืึธืจ",
    "ืืืืจืื": "ืืึนืึธึซืจึดืื",
}


POINTS = {
    "ืืื ืืก": "ืึดึซืื ึผืึผืก",
    "ื ืงืืื": "ื ึฐึฝืงึปืึผึธื",
}

NUMBER_NAMES = {
    **CURRENCY,
    **HUNDREDS,
    **LARGE,
    **LETTERS,
    **ONES,
    **POINTS,
    **TENS,
    **THOUSANDS,
    **TWENTIES,
    **ZERO,
}
phonikud/expander/numbers.py
ADDED
@@ -0,0 +1,39 @@
import num2words
from .number_names import NUMBER_NAMES
import re


def add_diacritics(words: str):
    new_words = []
    for word in words.split():
        if NUMBER_NAMES.get(word):
            new_words.append(NUMBER_NAMES[word])
        elif NUMBER_NAMES.get(word[1:]):
            # With Vav or Bet
            new_words.append(NUMBER_NAMES[word[0]] + NUMBER_NAMES[word[1:]])
        else:
            new_words.append(word)
    return " ".join(new_words)


def num_to_word(maybe_number: str) -> str:
    def replace_number(match):
        num: str = match.group()
        suffix, prefix = "", ""
        # prefix
        if not num.startswith("-") and not num[0].isdigit():
            prefix = num[0]
            num = num[1:]
        if not num[-1].isdigit():
            suffix = num[-1]
            num = num[:-1]
        words = num2words.num2words(num, lang="he", ordinal=False)
        words_with_diacritics = add_diacritics(words)
        return (
            f"{prefix.strip()} {words_with_diacritics.strip()} {suffix.strip()}".strip()
        )

    # Replace all whole numbers in the string
    result = re.sub(r"[^\d\-]?-?\d+(?:[\.,]\d+)?[^\d]?", replace_number, maybe_number)

    return result
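Note: example calls; outputs are diacritized Hebrew number words, so only the behavior is described:

    from phonikud.expander.numbers import num_to_word

    num_to_word("3")     # a bare number becomes its Hebrew word
    num_to_word("(12)")  # one leading/trailing non-digit is kept as prefix/suffix
    num_to_word("abc")   # strings without digits are returned unchanged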
phonikud/expander/time_to_word.py
ADDED
@@ -0,0 +1,104 @@
"""
Convert time to words
TODO: fix zeros eg. 22:00
"""

import re

PATTERNS = [
    r"(\d{1,2})([apm]{2})",  # AM/PM format
    r"(\d{1,2}):(\d{2})",  # HH:MM format
]


def extract_time(match):
    """
    Extract hour and minute from a string in HH:MM or AM/PM format
    and return the spoken Hebrew form.
    """
    time_str = match.group(0).lower().strip()

    # Check for HH:MM format
    match = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if match:
        h = int(match.group(1))
        m = int(match.group(2))
        return f"{convert_to_word(h, m)}"

    # Check for AM/PM format
    match = re.match(r"(\d{1,2})([apm]{2})", time_str)
    if match:
        h = int(match.group(1))
        period = match.group(2)

        # Normalize to 24-hour format
        if period == "am" and h == 12:
            h = 0
        elif period == "pm" and h != 12:
            h += 12
        return f"{convert_to_word(h, 0)}"  # Defaulting to 0 minutes when only hour is provided

    # Return original text if the format is not recognized
    # (`match` has been re-bound above, so use the saved string)
    return time_str


def convert_to_word(h, m):
    hours = [
        "ืึถืคึถืก",
        "ืึทืึทืช",
        "ืฉืึฐื ึทืึดื",  # Will be replaced with "ืฉืึตื ึดื" when needed
        "ืฉืึธืืึนืฉื",
        "ืึทึซืจึฐืึผึทืข",
        "ืึธืึตืฉื",
        "ืฉืึตืฉื",
        "ืฉืึถึซืึทืข",
        "ืฉืึฐืึซืึนื ึตื",
        "ืชึผึตึซืฉืึทืข",
        "ืขึตึซืฉืึตืจ",
        "ืึทืึทืช ืขึถืฉืึฐืจึตื",
        "ืฉืึฐืชึผึตืื ืขึถืฉืึฐืจึตื",
    ]

    tens = ["", "ืขึตืฉืึตืจ", "ืขึถืฉืึฐืจึดืื", "ืฉืึฐืืึนืฉืึดืื", "ืึทืจึฐืึผึธืขึดืื", "ืึฒืึดืฉึผืึดืื"]

    ten_to_twenty = [
        "ืขึตึซืฉืึตืจ",
        "ืึทืึทืช ืขึถืฉืึฐืจึตื",
        "ืฉืึฐืชึผึตืื ืขึถืฉืึฐืจึตื",
        "ืฉืึฐืืึนืฉื ืขึถืฉืึฐืจึตื",
        "ืึทืจึฐืึผึทืข ืขึถืฉืึฐืจึตื",
        "ืึฒืึตืฉื ืขึถืฉืึฐืจึตื",
        "ืฉืึตืฉื ืขึถืฉืึฐืจึตื",
        "ืฉืึฐืึทืข ืขึถืฉืึฐืจึตื",
        "ืฉืึฐืืึนื ึถื ืขึถืฉืึฐืจึตื",
        "ืชึผึฐืฉืึทืข ืขึถืฉืึฐืจึตื",
    ]

    vocab = {"minutes": "ืึผึทืงึผืึนืช", "and": "ืึต", "shtey": "ืฉืึฐืชึผึตื"}

    # Convert 0 hours to 12 (midnight)
    if h == 0:
        h = 12

    elif h > 12:
        h -= 12

    if m == 0:
        return f"{hours[h]}"

    elif 1 <= m <= 9:
        minute_word = (
            vocab["shtey"] if m == 2 else hours[m]
        )  # Replace "ืฉื ืืื" with "ืฉื ื"
        return f"{hours[h]} {vocab['and']}{minute_word} {vocab['minutes']}"

    elif 10 <= m <= 19:
        return f"{hours[h]} {vocab['and']}{ten_to_twenty[m - 10]} {vocab['minutes']}"

    else:
        tens_part = f"{vocab['and']}{tens[m // 10]}"
        units_part = f"{vocab['and']}{hours[m % 10]}" if m % 10 != 0 else ""
        return f"{hours[h]} {tens_part} {units_part} {vocab['minutes']}".strip()


def time_to_word(text: str):
    return re.sub("|".join(PATTERNS), extract_time, text)
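Note: example calls; results are Hebrew hour phrases, so only the behavior is described:

    from phonikud.expander.time_to_word import time_to_word

    time_to_word("8:30")   # hour eight plus thirty minutes, spelled out
    time_to_word("12am")   # midnight: mapped to hour twelve on the 12-hour clock
    time_to_word("hello")  # strings matching no pattern are left unchanged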
phonikud/hebrew.py
ADDED
@@ -0,0 +1,222 @@
"""
Hebrew Phonemizer

Fast rule-based FST that converts Hebrew text to phonemes.
See https://en.wikipedia.org/wiki/Finite-state_transducer

Rules implemented:
1. Consonant handling (including special cases)
2. Nikud (vowel) processing
3. Dagesh handling
4. Geresh handling
5. Vocal Shva prediction
6. Special letter combinations

Reference:
- https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
- https://en.wikipedia.org/wiki/Help:IPA/Hebrew
- https://he.wikipedia.org/wiki/ืืืจื
- https://hebrew-academy.org.il/2020/08/11/ืืื-ืืืืื-ืืช-ืืฉืืื-ืื ืข
- https://hebrew-academy.org.il/2010/03/24/ืฆืืจืื-ื ืขืื-ืืืืืช-ืงืืฅ-ืืคื ื-ืื
- https://hebrew-academy.org.il/2022/03/03/ืืืขืื-ืืืืจืข-ืขื-ืืืืขืื-ืืขืืจืืช
"""

from typing import Literal
from phonikud.variants import Letter
from phonikud import lexicon
import re
from phonikud.utils import sort_stress

SHVA = "\u05b0"
SIN = "\u05c2"
PATAH = "\u05b7"
KAMATZ = "\u05b8"
HATAF_KAMATZ = "\u05b3"
DAGESH = "\u05bc"
HOLAM = "\u05b9"
HIRIK = "\u05b4"
PATAH_LIKE_PATTERN = "[\u05b7-\u05b8]"
KUBUTS = "\u05bb"
TSERE = "\u05b5"
HATAMA = "\u05ab"
VAV_HOLAM = "\u05ba"
SEGOL = "\u05b6"


def phonemize_hebrew(
    letters: list[Letter], stress_placement: Literal["syllable", "vowel"]
) -> list[str]:
    phonemes, i = [], 0
    while i < len(letters):
        cur = letters[i]
        prev = letters[i - 1] if i > 0 else None
        next = letters[i + 1] if i + 1 < len(letters) else None
        next_phonemes, skip_offset = letter_to_phonemes(
            cur, prev, next, stress_placement
        )
        phonemes.extend(next_phonemes)
        i += skip_offset + 1
    return phonemes


def handle_yud(cur: Letter, prev: Letter | None, next: Letter | None) -> bool:
    """Returns True if Yud should skip consonants"""
    return (
        next
        # Yud without diacritics
        and not cur.diac
        # In middle
        and prev
        # Prev Hirik
        and prev.char + prev.diac != "ืึต"
        # Next Vav has meaning
        and not (next.char == "ו" and next.diac and "\u05b0" not in next.diac)
    )


def handle_vav(cur: Letter, prev: Letter | None, next: Letter | None):
    if prev and SHVA in prev.diac and HOLAM in cur.diac:
        return ["vo"], True, True, 0

    if next and next.char == "ו":
        diac = cur.diac + next.diac
        if HOLAM in diac:
            return ["vo"], True, True, 1
        if cur.diac == next.diac:
            return ["vu"], True, True, 1
        if HIRIK in cur.diac:
            return ["vi"], True, True, 0
        if SHVA in cur.diac and not next.diac:
            return ["v"], True, True, 0
        if KAMATZ in cur.diac or PATAH in cur.diac:
            return ["va"], True, True, 0
        if TSERE in cur.diac or SEGOL in cur.diac:
            return ["ve"], True, True, 0
        return [], False, False, 0

    # Single Vav
    if re.search(PATAH_LIKE_PATTERN, cur.diac):
        return ["va"], True, True, 0
    if TSERE in cur.diac or SEGOL in cur.diac:
        return ["ve"], True, True, 0
    if HOLAM in cur.diac:
        return ["o"], True, True, 0
    if KUBUTS in cur.diac or DAGESH in cur.diac:
        return ["u"], True, True, 0
    if SHVA in cur.diac and not prev:
        return ["ve"], True, True, 0
    if HIRIK in cur.diac:
        return ["vi"], True, True, 0
    if next and not cur.diac:
        return [], True, True, 0

    return ["v"], True, True, 0


def letter_to_phonemes(
    cur: Letter,
    prev: Letter | None,
    next: Letter | None,
    stress_placement: Literal["syllable", "vowel"],
) -> tuple[list[str], int]:
    cur_phonemes = []
    skip_diacritics = False
    skip_consonants = False
    skip_offset = 0

    if lexicon.NIKUD_HASER_DIACRITIC in cur.all_diac:
        skip_consonants = True
        skip_diacritics = True

    elif cur.char == "א" and not cur.diac and prev:
        if next and next.char != "ו":
            skip_consonants = True

    elif cur.char == "י" and handle_yud(cur, prev, next):
        skip_consonants = True

    elif cur.char == "ש" and SIN in cur.diac:
        if (
            next
            and next.char == "ש"
            and not next.diac
            and re.search("[\u05b7\u05b8]", cur.diac)
        ):
            # ^ ืืฉืฉืืจ
            cur_phonemes.append("sa")
            skip_consonants = True
            skip_diacritics = True
            skip_offset += 1
        else:
            cur_phonemes.append("s")
            skip_consonants = True

    # shin without nikud after sin = sin
    elif cur.char == "ש" and not cur.diac and prev and SIN in prev.diac:
        cur_phonemes.append("s")
        skip_consonants = True

    elif not next and cur.char == "ח" and PATAH in cur.diac:
        # Final Het gnuva
        cur_phonemes.append("ax")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ה" and PATAH in cur.diac:
        # Final He gnuva
        cur_phonemes.append("ah")
        skip_diacritics = True
        skip_consonants = True

    elif not next and cur.char == "ע" and PATAH in cur.diac:
        # Final Ayin gnuva
        cur_phonemes.append("a")
        skip_diacritics = True
        skip_consonants = True

    if cur and "'" in cur.diac and cur.char in lexicon.GERESH_PHONEMES:
        if cur.char == "ת":
            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
            skip_diacritics = True
            skip_consonants = True
        else:
            # Geresh
            cur_phonemes.append(lexicon.GERESH_PHONEMES.get(cur.char, ""))
            skip_consonants = True

    elif DAGESH in cur.diac and cur.char + DAGESH in lexicon.LETTERS_PHONEMES:  # dagesh
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char + DAGESH, ""))
        skip_consonants = True
    elif cur.char == "ו" and lexicon.NIKUD_HASER_DIACRITIC not in cur.all_diac:
        vav_phonemes, vav_skip_consonants, vav_skip_diacritics, vav_skip_offset = (
            handle_vav(cur, prev, next)
        )
        cur_phonemes.extend(vav_phonemes)
        skip_consonants = vav_skip_consonants
        skip_diacritics = vav_skip_diacritics
        skip_offset += vav_skip_offset

    if not skip_consonants:
        cur_phonemes.append(lexicon.LETTERS_PHONEMES.get(cur.char, ""))

    if KAMATZ in cur.diac and next and HATAF_KAMATZ in next.diac:
        cur_phonemes.append("o")
        skip_diacritics = True

    nikud_phonemes = []
    if not skip_diacritics:
        nikud_phonemes = [
            lexicon.NIKUD_PHONEMES.get(nikud, "") for nikud in cur.all_diac
        ]
    elif skip_diacritics and lexicon.HATAMA_DIACRITIC in cur.all_diac:
        nikud_phonemes = [lexicon.STRESS_PHONEME]
    cur_phonemes.extend(nikud_phonemes)
    # Ensure the stress is at the beginning of the syllable
    cur_phonemes = sort_stress(cur_phonemes, stress_placement)
    cur_phonemes = [
        p for p in cur_phonemes if all(i in lexicon.SET_PHONEMES for i in p)
    ]
    # Remove empty phonemes
    cur_phonemes = [p for p in cur_phonemes if p]
    return cur_phonemes, skip_offset
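Note: a sketch of driving the FST directly, assuming `phonikud.utils.get_letters` splits a dotted word into the `Letter` objects this module expects (this is how phonemize.py calls it):

    from phonikud.utils import get_letters
    from phonikud.hebrew import phonemize_hebrew

    letters = get_letters("שָׁלוֹם")
    phonemes = phonemize_hebrew(letters, stress_placement="vowel")
    print("".join(phonemes))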
phonikud/lexicon.py
ADDED
@@ -0,0 +1,117 @@
"""
ASCII IPA transcription of Hebrew consonants and vowels.
"""

# https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table

# Non standard diacritics
VOCAL_SHVA_DIACRITIC = "\u05bd"  # Meteg
HATAMA_DIACRITIC = "\u05ab"  # Ole
PREFIX_DIACRITIC = "|"  # Vertical bar
NIKUD_HASER_DIACRITIC = "\u05af"  # Masora, not in use
EN_GERESH = "'"
NON_STANDARD_DIAC = "".join(
    [
        VOCAL_SHVA_DIACRITIC,
        HATAMA_DIACRITIC,
        PREFIX_DIACRITIC,
        NIKUD_HASER_DIACRITIC,
        EN_GERESH,
    ]
)

HE_PATTERN = rf'[\u05b0-\u05ea{NON_STANDARD_DIAC}"]+'
# ^ Standard nikud and letters, ole, meteg, masora, vertical bar, en geresh
HE_NIKUD_PATTERN = rf"[\u05b0-\u05c7{NON_STANDARD_DIAC}]"
# ^ Letters, diacritics, en geresh
PUNCTUATION = set(r".,!? ")

STRESS_PHONEME = "ˈ"  # \u02c8 visually looks like single quote
SPECIAL_PHONEMES = ["w"]
MODERN_SCHEMA = {
    "x": "χ",  # Het
    "r": "ʁ",  # Resh
    "g": "ɡ",  # Gimel
}

# Geresh
GERESH_PHONEMES = {"ג": "dʒ", "ז": "ʒ", "ת": "ta", "צ": "tʃ", "ץ": "tʃ"}

# Consonants
LETTERS_PHONEMES = {
    "א": "ʔ",  # Alef
    "ב": "v",  # Bet
    "ג": "g",  # Gimel
    "ד": "d",  # Dalet
    "ה": "h",  # He
    "ו": "v",  # Vav
    "ז": "z",  # Zayin
    "ח": "x",  # Het
    "ט": "t",  # Tet
    "י": "j",  # Yod
    "ך": "x",  # Haf sofit
    "כ": "x",  # Haf
    "ל": "l",  # Lamed
    "ם": "m",  # Mem Sofit
    "מ": "m",  # Mem
    "ן": "n",  # Nun Sofit
    "נ": "n",  # Nun
    "ס": "s",  # Samekh
    "ע": "ʔ",  # Ayin, only voweled
    "פ": "f",  # Fey
    "ף": "f",  # Fey Sofit
    "ץ": "ts",  # Tsadik sofit
    "צ": "ts",  # Tsadik
    "ק": "k",  # Kuf
    "ר": "r",  # Resh
    "ש": "ʃ",  # Shin
    "ת": "t",  # Taf
    # Beged Kefet
    "בּ": "b",
    "כּ": "k",
    "פּ": "p",
    # Shin Sin
    "שׁ": "ʃ",
    "שׂ": "s",
    "'": "",
}

NIKUD_PHONEMES = {
    "\u05b4": "i",  # Hiriq
    "\u05b1": "e",  # Hataf segol
    "\u05b5": "e",  # Tsere
    "\u05b6": "e",  # Segol
    "\u05b2": "a",  # Hataf Patah
    "\u05b7": "a",  # Patah
    "\u05c7": "o",  # Kamatz katan
    "\u05b9": "o",  # Holam
    "\u05ba": "o",  # Holam haser for vav
    "\u05bb": "u",  # Qubuts
    "\u05b3": "o",  # Hataf qamats
    "\u05b8": "a",  # Kamatz
    HATAMA_DIACRITIC: STRESS_PHONEME,  # Stress (Hat'ama)
    VOCAL_SHVA_DIACRITIC: "e",  # Vocal Shva
}

DEDUPLICATE = {
    "\u05f3": "'",  # Hebrew geresh to regular geresh
    "־": "-",  # Hebrew Makaf to hyphen
}

# Sets
SET_ENHANCED_DIACRITICS = set(
    [HATAMA_DIACRITIC, PREFIX_DIACRITIC, VOCAL_SHVA_DIACRITIC]
)

ADDITIONAL_PHONEMES = set()  # When using fallback
SET_PHONEMES = set(
    sorted(
        {
            *NIKUD_PHONEMES.values(),
            *LETTERS_PHONEMES.values(),
            *GERESH_PHONEMES.values(),
            *MODERN_SCHEMA.values(),
            *SPECIAL_PHONEMES,
        }
    )
)
phonikud/log.py
ADDED
@@ -0,0 +1,35 @@
import logging
import os
import colorlog


def _create_logger():
    """
    Create a logger with colorized output
    Usage: LOG_LEVEL=DEBUG python <script.py>
    """

    handler = colorlog.StreamHandler()
    fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
    handler.setFormatter(
        colorlog.ColoredFormatter(
            fmt=fmt,
            log_colors={
                "DEBUG": "blue",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red",
            },
        )
    )
    # Get log level from LOG_LEVEL environment variable
    log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
    logger = colorlog.getLogger(__package__)
    logger.setLevel(level=getattr(logging, log_level, logging.WARNING))
    # Setup logging to stdout
    logger.addHandler(handler)
    return logger


log = _create_logger()
phonikud/phonemize.py
ADDED
@@ -0,0 +1,130 @@
from phonikud import lexicon
from phonikud.variants import Letter
from .expander import Expander
from phonikud.utils import (
    get_letters,
    normalize,
    post_normalize,
    post_clean,
    add_milra_hatama,
    mark_vocal_shva,
    sort_hatama,
)
from typing import Callable, Literal
import regex as re
from phonikud.hebrew import phonemize_hebrew


class Phonemizer:
    # TODO: is that enough? what if there's punctuation around? other chars?
    fallback_pattern = r"[a-zA-Z]+"

    def __init__(self):
        self.expander = Expander()

    def phonemize(
        self,
        text: str,
        preserve_punctuation: bool,
        preserve_stress: bool,
        use_expander: bool,
        use_post_normalize: bool,  # For TTS
        predict_stress: bool,
        predict_vocal_shva: bool,
        stress_placement: Literal["syllable", "vowel"],
        schema: Literal["plain", "modern"],
        fallback: Callable[[str], str] = None,
    ) -> str | list[str]:
        # normalize
        text = normalize(text)

        def fallback_replace_callback(match: re.Match):
            word = match.group(0)

            if self.expander.dictionary.dict.get(word):
                # skip
                # TODO: better API
                return word
            phonemes = fallback(word).strip()
            # TODO: check that it has only IPA?!
            for c in phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return phonemes

        if fallback is not None:
            text = re.sub(self.fallback_pattern, fallback_replace_callback, text)

        if use_expander:
            text = self.expander.expand_text(text)

        def heb_replace_callback(match: re.Match, original_text: str):
            word = match.group(0)
            start_offset = match.start()
            if start_offset > 0 and original_text[start_offset - 1] == "[":
                # Skip if it starts with [ as it's used for hyper phonemes
                return word

            if predict_vocal_shva:
                # keep the marked result (strings are immutable)
                word = mark_vocal_shva(word)
            if lexicon.HATAMA_DIACRITIC not in word and predict_stress:
                word = add_milra_hatama(word)
            letters: list[Letter] = get_letters(word)
            letters = sort_hatama(letters)

            phonemes: list[str] = phonemize_hebrew(
                letters,
                stress_placement=stress_placement,
            )
            phonemes = "".join(phonemes)
            # syllables = get_syllables(phonemes)

            # phonemes_text = "".join(phonemes)
            # # if predict_stress and lexicon.STRESS not in phonemes_text and syllables:
            # #     if len(syllables) == 1:
            # #         syllables[-1] = lexicon.STRESS + syllables[-1]
            # #         syllables[-1] = "".join(sort_stress(syllables[-1]))
            # #     elif any(
            # #         remove_nikud(word).endswith(i) for i in lexicon.MILHEL_PATTERNS
            # #     ) or phonemes_text.endswith("ax"):
            # #         # insert lexicon.STRESS in the first character of syllables[-2]
            # #         syllables[-2] = lexicon.STRESS + syllables[-2]
            # #         syllables[-2] = "".join(sort_stress(syllables[-2]))
            # #     else:
            # #         # insert in syllables[-1]
            # #         syllables[-1] = lexicon.STRESS + syllables[-1]
            # #         syllables[-1] = "".join(sort_stress(syllables[-1]))

            # phonemes = "".join(syllables)
            if use_post_normalize:
                phonemes = post_normalize(phonemes)

            if schema == "modern":
                # We'll keep this feature simple for now
                for k, v in lexicon.MODERN_SCHEMA.items():
                    phonemes = re.sub(k, v, phonemes)
            return phonemes

        text = re.sub(
            lexicon.HE_PATTERN, lambda match: heb_replace_callback(match, text), text
        )

        def hyper_phonemes_callback(match: re.Match):
            """
            Expand hyper phonemes into normal phonemes
            eg. [hello](/hɛˈloʊ/) -> hɛˈloʊ
            """
            matched_phonemes = match.group(2)
            for c in matched_phonemes:
                lexicon.ADDITIONAL_PHONEMES.add(c)
            return matched_phonemes  # The phoneme is in the second group

        text = re.sub(r"\[(.+?)\]\(\/(.+?)\/\)", hyper_phonemes_callback, text)

        if not preserve_punctuation:
            text = "".join(i for i in text if i not in lexicon.PUNCTUATION or i == " ")
        if not preserve_stress:
            text = "".join(i for i in text if i not in [lexicon.STRESS_PHONEME])
        if use_post_normalize:
            # We don't keep hyphens in the output, but we should replace them with spaces
            text = post_clean(text)
        return text
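Note: sketch of the Latin-script fallback hook; the callback below is a stand-in for any grapheme-to-phoneme function and is not part of phonikud:

    from phonikud import phonemize

    def my_fallback(word: str) -> str:
        # hypothetical lookup; return IPA for known English words
        return {"test": "tˈest"}.get(word.lower(), word)

    # [a-zA-Z]+ runs are routed to my_fallback; Hebrew goes through the FST
    phonemize("test שָׁלוֹם", fallback=my_fallback)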
phonikud/syllables.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
https://en.wikipedia.org/wiki/Unicode_and_HTML_for_the_Hebrew_alphabet#Compact_table
|
| 3 |
+
|
| 4 |
+
TODO: add to phonikud?
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import regex as re
|
| 8 |
+
import phonikud
|
| 9 |
+
|
| 10 |
+
VOWEL_DIACS = [chr(i) for i in range(0x05B1, 0x05BC)] + [chr(0x05C7)] + [chr(0x5BD)]
|
| 11 |
+
|
| 12 |
+
STRESS = "\u05ab"
|
| 13 |
+
SHVA = "\u05b0"
|
| 14 |
+
DAGESH = "\u05bc"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def sort_diacritics(word: str):
|
| 18 |
+
def sort_diacritics_callback(match):
|
| 19 |
+
letter = match.group(1)
|
| 20 |
+
diacritics = "".join(sorted(match.group(2))) # Sort diacritics
|
| 21 |
+
return letter + diacritics
|
| 22 |
+
|
| 23 |
+
return re.sub(r"(\p{L})(\p{M}+)", sort_diacritics_callback, word)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def has_vowel_diacs(s: str):
|
| 27 |
+
if s == "ืึผ":
|
| 28 |
+
return True
|
| 29 |
+
return any(i in s for i in VOWEL_DIACS)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_syllables(word: str) -> list[str]:
|
| 33 |
+
letters = phonikud.utils.get_letters(word)
|
| 34 |
+
syllables, cur = [], ""
|
| 35 |
+
vowel_state = False
|
| 36 |
+
|
| 37 |
+
i = 0
|
| 38 |
+
while i < len(letters):
|
| 39 |
+
letter = letters[i]
|
| 40 |
+
has_vowel = has_vowel_diacs(str(letter)) or (i == 0 and SHVA in letter.all_diac)
|
| 41 |
+
# Look ahead
|
| 42 |
+
vav1 = i + 2 < len(letters) and letters[i + 2].char == "ื"
|
| 43 |
+
vav2 = i + 3 < len(letters) and letters[i + 3].char == "ื"
|
| 44 |
+
|
| 45 |
+
if has_vowel:
|
| 46 |
+
if vowel_state:
|
| 47 |
+
syllables.append(cur)
|
| 48 |
+
cur = str(letter)
|
| 49 |
+
else:
|
| 50 |
+
cur += str(letter)
|
| 51 |
+
vowel_state = True
|
| 52 |
+
else:
|
| 53 |
+
cur += str(letter)
|
| 54 |
+
|
| 55 |
+
i += 1
|
| 56 |
+
|
| 57 |
+
# If two ืs are coming: force current syllable to end, and join both ืs as next syllable
|
| 58 |
+
if vav1 and vav2:
|
| 59 |
+
if cur:
|
| 60 |
+
# Finish current syllable
|
| 61 |
+
syllables.append(cur + str(letters[i]))
|
| 62 |
+
cur = ""
|
| 63 |
+
cur = str(letters[i + 1]) + str(letters[i + 2])
|
| 64 |
+
i += 3 # skip past the double-vav
|
| 65 |
+
vowel_state = True
|
| 66 |
+
|
| 67 |
+
# If one ื is coming, end the syllable now
|
| 68 |
+
elif vav1 and letters[i + 1].diac:
|
| 69 |
+
if cur:
|
| 70 |
+
syllables.append(cur)
|
| 71 |
+
cur = ""
|
| 72 |
+
vowel_state = False
|
| 73 |
+
|
| 74 |
+
if cur:
|
| 75 |
+
syllables.append(cur)
|
| 76 |
+
# print(syllables)
|
| 77 |
+
return syllables
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def add_stress_to_syllable(s: str):
|
| 81 |
+
letters = phonikud.utils.get_letters(s)
|
| 82 |
+
letters[0].all_diac = STRESS + letters[0].all_diac
|
| 83 |
+
return "".join(letter.char + letter.all_diac for letter in letters)
|
| 84 |
+
|
| 85 |
+
|
def add_stress(word: str, syllable_position: int):
    syllables: list[str] = get_syllables(word)

    if not syllables:
        return word  # no syllables, return original word

    # Normalize negative indices
    if syllable_position < 0:
        syllable_position += len(syllables)

    # Clamp to valid range
    syllable_position = max(0, min(syllable_position, len(syllables) - 1))

    stressed_syllable = syllables[syllable_position]
    stressed_syllable = add_stress_to_syllable(stressed_syllable)
    syllables[syllable_position] = stressed_syllable

    return "".join(syllables)
phonikud/utils.py
ADDED
@@ -0,0 +1,247 @@
from functools import lru_cache
from typing import Literal
from phonikud import lexicon
import unicodedata
import regex as re
import phonikud.syllables
from phonikud.variants import Letter
import phonikud

def sort_diacritics(match):
    letter = match.group(1)
    diacritics = "".join(sorted(match.group(2)))  # Sort diacritics
    return letter + diacritics


NORMALIZE_PATTERNS = {
    # Sort diacritics
    r"(\p{L})(\p{M}+)": sort_diacritics,
    "ืด": '"',  # Hebrew gershayim to an ASCII double quote
    "ืณ": "'",  # Hebrew geresh to an ASCII apostrophe
}

def remove_nikud(text: str, to_keep=""):
    pattern = lexicon.HE_NIKUD_PATTERN
    pattern = "".join(i for i in pattern if i not in to_keep)
    return re.sub(
        pattern,
        "",
        text,
    )

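# Illustrative call (hypothetical; assumes HE_NIKUD_PATTERN is a character-class
# pattern whose characters can be filtered one by one): strip all nikud except Shva.
# remove_nikud("ืึฐืึธืึดืื", to_keep="\u05b0")
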
@lru_cache(maxsize=10000)
def normalize(text: str) -> str:
    """
    Normalize unicode (decompose)
    Keep only Hebrew characters / punctuation / IPA
    Sort diacritics
    """

    # Decompose text
    text = unicodedata.normalize("NFD", text)
    for k, v in NORMALIZE_PATTERNS.items():
        text = re.sub(k, v, text)
    for k, v in lexicon.DEDUPLICATE.items():
        text = re.sub(k, v, text)
    return text

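# Sketch of the effect (assumed example): NFD splits a precomposed letter into
# base letter + combining marks, which the patterns above then order canonically.
# normalize("\ufb35")  # ื with dagesh as one codepoint -> "\u05d5\u05bc"
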
def post_normalize(phonemes: str):
    new_phonemes = []
    for word in phonemes.split(" "):
        # remove glottal stop from the end
        word = re.sub(r"ส$", "", word)
        # remove h from the end (with or without a preceding stress mark)
        word = re.sub(r"h$", "", word)
        word = re.sub(r"หh$", "", word)
        # collapse word-final "ij" to "i"
        word = re.sub(r"ij$", "i", word)
        new_phonemes.append(word)
    phonemes = " ".join(new_phonemes)
    return phonemes

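# Illustrative before/after (hypothetical phoneme strings): a word-final glottal
# stop is dropped and a word-final "ij" collapses, e.g.
# post_normalize("kiseส tovij")  # -> "kise tovi"
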
def post_clean(phonemes: str):
    clean = []
    for i in phonemes:
        if i == "-":
            clean.append(" ")
        elif (
            i in lexicon.SET_PHONEMES
            or i in lexicon.ADDITIONAL_PHONEMES
            or i == " "
            or i in lexicon.PUNCTUATION
        ):
            clean.append(i)
    return "".join(clean)

letters_pattern = re.compile(r"(\p{L})([\p{M}'|]*)")


# @lru_cache(maxsize=10000) TODO?
def get_letters(word: str):
    letters: list[tuple[str, str]] = letters_pattern.findall(word)  # with en_geresh
    letters: list[Letter] = [Letter(i[0], i[1]) for i in letters]
    return letters


def get_unicode_names(text: str):
    return [unicodedata.name(c, "?") for c in text]

def has_vowel(s: str):
    return any(i in s for i in "aeiou")


def has_constant(s: str):
    # "constant" here means consonant: any character that is not a vowel
    return any(i not in "aeiou" for i in s)

def get_phoneme_syllables(phonemes: list[str]) -> list[str]:
    syllables = []
    cur_syllable = ""

    i = 0
    while i < len(phonemes):
        # Add current phoneme to the syllable
        cur_syllable += phonemes[i]

        # If we have a vowel in the current syllable
        if has_vowel(cur_syllable):
            # If the next phoneme is a consonant followed by a vowel-containing phoneme
            if (
                i + 2 < len(phonemes)
                and not has_vowel(phonemes[i + 1])
                and has_vowel(phonemes[i + 2])
            ):
                # End the current syllable and start a new one
                syllables.append(cur_syllable)
                cur_syllable = ""
            # If we're at the end or the next phoneme has a vowel
            elif i + 1 >= len(phonemes) or has_vowel(phonemes[i + 1]):
                # End the current syllable
                syllables.append(cur_syllable)
                cur_syllable = ""

        i += 1

    # Add any remaining syllable
    if cur_syllable:
        syllables.append(cur_syllable)

    # Move a stress marker stranded at the end of a syllable to the start of the next one
    for i in range(len(syllables) - 1):  # Ensure we're not at the last syllable
        if syllables[i].endswith(lexicon.STRESS_PHONEME):
            syllables[i + 1] = (
                lexicon.STRESS_PHONEME + syllables[i + 1]
            )  # Move stress to next syllable
            syllables[i] = syllables[i][
                : -len(lexicon.STRESS_PHONEME)
            ]  # Remove stress from current syllable

    return syllables

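# Hand-traced example (not a repo test): a syllable closes once it has a vowel
# and the lookahead sees a new consonant + vowel group starting.
# get_phoneme_syllables(list("สalom"))  # -> ["สa", "lom"]
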
def sort_stress(
    phonemes: list[str], placement: Literal["syllable", "vowel"] = "vowel"
) -> list[str]:
    """
    TTS systems expect the stress mark BEFORE the vowel,
    while linguists place it at the START of the syllable.
    placement="syllable" puts it at the start; the default "vowel" puts it just before the first vowel.
    """
    if "ห" not in "".join(phonemes):
        # ^ Does not contain stress
        return phonemes
    if not any(i in "".join(phonemes) for i in "aeiou"):
        # ^ Does not contain a vowel
        return phonemes

    # Remove stress marker
    phonemes = [p for p in phonemes if p != "ห"]

    if placement == "syllable":
        return ["ห"] + phonemes

    # Define vowels
    vowels = "aeiou"

    # Find the first phoneme that contains a vowel, and inject the stress before the vowel
    for i, phoneme in enumerate(phonemes):
        for j, char in enumerate(phoneme):
            if char in vowels:
                # Insert stress before the vowel
                phonemes[i] = phoneme[:j] + "ห" + phoneme[j:]
                return phonemes

    # If no vowels found, return unchanged
    return phonemes

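# Hand-traced example (not a repo test): with the default placement the marker
# moves from syllable-initial position to just before the vowel.
# sort_stress(["ห", "l", "o", "m"])  # -> ["l", "หo", "m"]
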
def mark_vocal_shva(word: str):
    """
    Vocal Shva is context-independent and can be predicted from the word alone or a dictionary.
    See https://hebrew-academy.org.il/2020/08/11/ืืื-ืืืืื-ืืช-ืืฉืืื-ืื ืข
    Note: we predict only when the Shva is on the first letter of the word
    Note: we assume the word comes with | marking prefix letters ('Txiliyot')
    Note: Vocal Shva rules mid-word are unreliable, so we don't code them.

    Meteg (\u05bd) is added to the letter carrying the Vocal Shva

    What we don't predict:
    (1) some Shva at the beginning of future-tense forms (we can't know)
    (2) Shva in the middle of the word
    """
    letters = get_letters(word)
    if not letters:
        return word
    if letters[0].char in "ืืื ืจื":
        letters[0].all_diac += lexicon.VOCAL_SHVA_DIACRITIC
    elif len(letters) > 1 and letters[1].char in "ืืขื":
        letters[0].all_diac += lexicon.VOCAL_SHVA_DIACRITIC
    elif letters[0].char in "ืืืื" and lexicon.PREFIX_DIACRITIC in letters[0].all_diac:
        # ^ The nakdan should add |
        letters[0].all_diac += lexicon.VOCAL_SHVA_DIACRITIC
    # Ensure that the prefix character comes last
    for letter in letters:
        if "|" in letter.all_diac:
            letter.all_diac = letter.all_diac.replace("|", "") + "|"
    return "".join(str(i) for i in letters)

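# Illustrative call (hypothetical word): ื is in the first letter set above, so
# the Shva on the first letter would be marked as vocal.
# mark_vocal_shva("ืึฐืึธืึดืื")
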
def sort_hatama(letters: list[Letter]) -> list[Letter]:
    for i in range(len(letters) - 1):
        diacs = list(letters[i].all_diac)
        if lexicon.HATAMA_DIACRITIC in diacs and lexicon.NIKUD_HASER_DIACRITIC in diacs:
            diacs.remove(lexicon.HATAMA_DIACRITIC)
            letters[i].all_diac = "".join(diacs)  # Reassign the updated diacritics
            letters[i + 1].all_diac += lexicon.HATAMA_DIACRITIC
    return letters

def add_milra_hatama(word: str):
    syllables = phonikud.syllables.get_syllables(word)
    stress_index = -1

    if not syllables:
        return word

    if len(syllables) == 1:
        stress_index = 0

    # Get the last syllable
    milra = syllables[stress_index]
    # Get letters
    letters = get_letters(milra)
    # Add Hatama
    letters[0].all_diac += lexicon.HATAMA_DIACRITIC

    # Replace the last syllable
    syllables[stress_index] = "".join(str(i) for i in letters)
    return "".join(syllables)
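
# Minimal usage sketch (the sample word and printed forms are illustrative,
# not test fixtures):
if __name__ == "__main__":
    word = normalize("ืฉืึธืืึนื")
    print(get_unicode_names(word))  # inspect the decomposed codepoints
    print(remove_nikud(word))       # bare letters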
phonikud/variants.py
ADDED
@@ -0,0 +1,20 @@
import phonikud
from phonikud import lexicon


class Letter:
    def __init__(self, char: str, diac: str):
        self.char = phonikud.normalize(char)
        self.all_diac = phonikud.normalize(diac)
        self.diac = "".join(
            i for i in self.all_diac if i not in lexicon.SET_ENHANCED_DIACRITICS
        )

    def __repr__(self):
        return f"[Letter] {self.char}{''.join(self.all_diac)}"

    def __eq__(self, value: "Letter"):
        return value.all_diac == self.all_diac and value.char == self.char

    def __str__(self):
        return self.char + self.all_diac
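
# Minimal illustration (assumed inputs): a Letter keeps the base character in
# .char and all combining marks in .all_diac; str() re-joins them.
if __name__ == "__main__":
    bet = Letter("ื", "\u05bc\u05b7")  # bet with dagesh and patah
    print(repr(bet))
    print(str(bet))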
requirements.txt
ADDED
@@ -0,0 +1,47 @@
# This file was autogenerated by uv via the following command:
#    uv export --no-hashes --no-emit-project
colorama==0.4.6 ; sys_platform == 'win32'
    # via
    #   colorlog
    #   pytest
    #   tqdm
colorlog==6.9.0
    # via phonikud
docopt==0.6.2
    # via num2words
exceptiongroup==1.3.0 ; python_full_version < '3.11'
    # via pytest
iniconfig==2.1.0
    # via pytest
num2words==0.5.14
    # via phonikud
numpy==2.2.6 ; python_full_version < '3.11'
    # via pandas
numpy==2.3.1 ; python_full_version >= '3.11'
    # via pandas
packaging==25.0
    # via pytest
pandas==2.3.0
pluggy==1.6.0
    # via pytest
pygments==2.19.2
    # via pytest
pytest==8.4.1
python-dateutil==2.9.0.post0
    # via pandas
pytz==2025.2
    # via pandas
regex==2024.11.6
    # via phonikud
ruff==0.12.0
six==1.17.0
    # via python-dateutil
tomli==2.2.1 ; python_full_version < '3.11'
    # via pytest
tqdm==4.67.1
typing-extensions==4.14.0 ; python_full_version < '3.11'
    # via exceptiongroup
tzdata==2025.2
    # via pandas
phonikud-onnx
gradio>=5.34.2