Commit 20943e6 by pwenker (0 parents)

Initialize
.gitignore ADDED
@@ -0,0 +1,162 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
.python-version ADDED
@@ -0,0 +1 @@
 
README.md ADDED
@@ -0,0 +1,69 @@
1
+ # Pronunciation Trainer 🗣️
2
+
3
+ This repository/app showcases how a [phoneme-based pronunciation trainer](docs/phoneme_based_solution.md) (including personalized LLM-based feedback) overcomes the limitations of a [grapheme-based approach](docs/grapheme_based_solution.md).
4
+
5
+ | Feature | Grapheme-Based Solution | Phoneme-Based Solution |
6
+ |-----------------------------------|----------------------------------------------------------|---------------------------------------------------------|
7
+ | **Input Type** | Text transcriptions of speech | Audio files and phoneme transcriptions |
8
+ | **Feedback Mechanism** | Comparison of grapheme sequences | Comparison of phoneme sequences and advanced LLM-based feedback |
9
+ | **Technological Approach** | Simple text comparison using `SequenceMatcher` | Advanced ASR models like Wav2Vec2 for phoneme recognition |
10
+ | **Feedback Detail** | Basic similarity score and diff | Detailed phoneme comparison, LLM-based feedback including motivational and corrective elements |
11
+ | **Error Sensitivity** | Sensitive to homophones and transcription errors | More accurate in capturing pronunciation nuances |
12
+ | **Suprasegmental Features** | Does not capture stress, intonation, etc. | Potentially captures them through phoneme dynamics and advanced evaluation |
13
+ | **Personalization** | Limited to error feedback based on text similarity | Advanced personalization considering learner's native language and target language proficiency |
14
+ | **Scalability** | Easy to scale with basic text processing tools | Requires more computational resources for ASR and LLM processing |
15
+ | **Cost** | Lower, primarily involves basic computational resources | Higher, due to usage of advanced APIs and model processing |
16
+ | **Accuracy** | Lower, prone to misinterpretations of homophones | Higher, better at handling diverse pronunciation patterns (though subject to LLM hallucinations) |
17
+ | **Feedback Quality** | Basic, often not linguistically rich | Rich, detailed, personalized, and linguistically informed |
18
+ | **Potential for Learning** | Limited to recognizing text differences | High, includes phonetic and prosodic feedback, as well as resource and practice recommendations |
19
+
20
+ ## Quickstart 🚀
21
+
22
+ ### 👉 Click here to try out the app directly:
23
+ [**Pronunciation Trainer App**](https://pwenker-pronunciation-trainer.hf.space/)
24
+
25
+ ### 🔍 Inspect the code at:
26
+ - **GitHub:** [pwenker/pronunciation_trainer](https://github.com/pwenker/pronounciation_trainer)
27
+ - **Hugging Face Spaces:** [pwenker/pronunciation_trainer](https://huggingface.co/spaces/pwenker/pronounciation_trainer)
28
+
29
+ ## Local Deployment 🏠
30
+
31
+ ### Prerequisites 📋
32
+
33
+ #### Rye 🌾
34
+ [Install `Rye`](https://rye-up.com/guide/installation/#installing-rye)
35
+ > Rye is a comprehensive tool designed for Python developers. It simplifies your workflow by managing Python installations and dependencies. Simply install Rye, and it takes care of the rest.
36
+
37
+ #### OPENAI API Token 🔑
+
+ Create a `.env` file in the `pronunciation_trainer` folder and add the following variable:
40
+ ```
41
+ OPENAI_API_KEY=... # API key for the OpenAI API
42
+ ```
43
+
44
+ ### Set-Up 🛠️
45
+
46
+ Clone the repository:
47
+ ```
48
+ git clone [repository-url] # Replace [repository-url] with the actual URL of the repository
49
+ ```
50
+ Navigate to the directory:
51
+ ```
52
+ cd pronunciation_trainer
53
+ ```
54
+
55
+ Create a virtual environment in `.venv` and synchronize the repo:
56
+ ```
57
+ rye sync
58
+ ```
59
+ For more details, visit: [Basics - Rye](https://rye-up.com/guide/basics/)
60
+
61
+ ### Start the App 🌟
62
+
63
+ Launch the app using:
64
+ ```
65
+ rye run python src/pronunciation_trainer/app.py
66
+ ```
67
+
68
+ Then, open your browser and visit [http://localhost:7860](http://localhost:7860/) to start practicing!
69
+
audios/learner/book.aac ADDED
Binary file (8.93 kB)
 
audios/learner/euros.wav ADDED
Binary file (709 kB)
 
audios/learner/interesting.wav ADDED
Binary file (246 kB)
 
audios/learner/today.wav ADDED
Binary file (721 kB)
 
audios/learner/won.wav ADDED
Binary file (195 kB)
 
audios/learner/youtube.wav ADDED
Binary file (330 kB)
 
audios/teacher/book.wav ADDED
Binary file (25.9 kB)
 
audios/teacher/euros.wav ADDED
Binary file (34.1 kB)
 
audios/teacher/interesting.wav ADDED
Binary file (14.9 kB)
 
audios/teacher/today.wav ADDED
Binary file (62.4 kB)
 
audios/teacher/won.wav ADDED
Binary file (10.1 kB)
 
audios/teacher/youtube.wav ADDED
Binary file (30.2 kB)
 
docs/assets/date_example_grapheme_based.png ADDED
docs/assets/date_example_phoneme_based.png ADDED
docs/assets/grapheme_based_pronunciation.png ADDED
docs/assets/phoneme_based_pronunciation_feedback_example.png ADDED
docs/assets/phoneme_based_pronunciation_input_example.png ADDED
docs/assets/phoneme_based_pronunciation_interface.png ADDED
docs/grapheme_based_solution.md ADDED
@@ -0,0 +1,125 @@
1
+ # Basic Grapheme-Based Solution
2
+
3
+ ![Grapheme-based pronunciation](assets/grapheme_based_pronunciation.png)
4
+
5
+ ## Approach
6
+
7
+ In this simple solution, we load the grapheme transcription of the learner recording as well as the ground-truth transcription and compute
+ a similarity ratio and a diff between the graphemes. Based on the similarity score, simple feedback is then generated to inform the learner about their performance.
9
+
10
+ Before comparing the graphemes, we use a very simple normalization procedure.
11
+
12
+ ```python
13
+ def normalize_texts(actual: str, expected: str) -> list[str]:
14
+ """Normalize two input texts by converting them to lower case and stripping whitespace.
15
+
16
+ Note: This normalization function is very simple and only here to demonstrate the general necessity of normalization
17
+ """
18
+
19
+ return [text.lower().strip() for text in [actual, expected]]
20
+ ```
21
+
22
+ The similarity score and diff are produced with the `difflib.SequenceMatcher`/`difflib.Differ`:
23
+
24
+ ```python
25
+ def compare_phrases(expected: str, actual: str) -> float:
26
+ """Calculate the similarity ratio between two phrases."""
27
+ return SequenceMatcher(None, expected, actual).ratio()
28
+
29
+
30
+ def diff_phrases(expected: str, actual: str) -> list[Tuple[str, Optional[str]]]:
31
+ """Generate a diff between two phrases."""
32
+ differ = Differ()
33
+ return [
34
+ (token[2:], None if token[0] == " " else token[0])
35
+ for token in differ.compare(expected, actual)
36
+ ]
37
+ ```
38
+
39
+ and then a simple rule-based approach yields the feedback:
40
+
41
+ ```python
42
+ def generate_feedback(similarity_ratio: float) -> str:
43
+ """Generate feedback based on the similarity ratio."""
44
+ if similarity_ratio > 0.9:
45
+ return "Excellent!"
46
+ elif similarity_ratio > 0.7:
47
+ return "Good job!"
48
+ elif similarity_ratio > 0.5:
49
+ return "Not bad, but there's room for improvement."
50
+ else:
51
+ return "Please try again, focus on pronunciation and clarity."
52
+ ```
53
+
54
+ The whole grapheme-based evaluation is as simple as:
55
+
56
+
57
+ ```python
58
+ def basic_evaluation(
59
+ expected: str, actual: str, autojunk: bool = True
60
+ ) -> Tuple[float, str, list[Tuple[str, Optional[str]]]]:
61
+ """Evaluate speaking attempts by comparing expected and actual phrases."""
62
+ expected, actual = normalize_texts(expected, actual)
63
+ similarity_ratio = compare_phrases(expected, actual)
64
+ diff = diff_phrases(expected, actual)
65
+ feedback = generate_feedback(similarity_ratio)
66
+ return similarity_ratio, feedback, diff
67
+ ```
68
+
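+ For instance, calling it on the homophone pair from `learner_input.json` (an illustrative invocation; the exact ratio depends on `SequenceMatcher`'s matching blocks):
+
+ ```python
+ similarity_ratio, feedback, diff = basic_evaluation(expected="won", actual="One.")
+
+ # After normalization ("won" vs. "one.") only part of the characters match, so the
+ # ratio lands around 0.57 and the feedback reads "Not bad, but there's room for
+ # improvement." even though the pronunciation itself was perfect.
+ print(similarity_ratio, feedback)
+ ```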
69
+ #### About the `SequenceMatcher`
70
+ Note: Visit [https://docs.python.org/3/library/difflib.html](https://docs.python.org/3/library/difflib.html) for more details.
71
+
72
+ **Information**: The `SequenceMatcher` is a flexible class for comparing pairs of sequences of any type, so long as the sequence elements are hashable. The basic algorithm predates, and is a little fancier than, an algorithm published in the late 1980’s by Ratcliff and Obershelp under the hyperbolic name “gestalt pattern matching.” The idea is to find the longest contiguous matching subsequence that contains no “junk” elements; these “junk” elements are ones that are uninteresting in some sense, such as blank lines or whitespace. (Handling junk is an extension to the Ratcliff and Obershelp algorithm.) The same idea is then applied recursively to the pieces of the sequences to the left and to the right of the matching subsequence. This does not yield minimal edit sequences, but does tend to yield matches that “look right” to people.
73
+
74
+ **Timing**: The basic Ratcliff-Obershelp algorithm is cubic time in the worst case and quadratic time in the expected case. SequenceMatcher is quadratic time for the worst case and has expected-case behavior dependent in a complicated way on how many elements the sequences have in common; best case time is linear.
75
+
76
+ **Automatic junk heuristic**: SequenceMatcher supports a heuristic that automatically treats certain sequence items as junk. The heuristic counts how many times each individual item appears in the sequence. If an item’s duplicates (after the first one) account for more than 1% of the sequence and the sequence is at least 200 items long, this item is marked as “popular” and is treated as junk for the purpose of sequence matching. This heuristic can be turned off by setting the autojunk argument to False when creating the SequenceMatcher.
77
+
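+ A minimal sketch (not used in the app) of turning this heuristic off; note that it only takes effect for sequences of at least 200 elements, so the repeated strings below are purely illustrative:
+
+ ```python
+ from difflib import SequenceMatcher
+
+ expected = "it is nice to recognize speech. " * 10   # > 200 characters, so the heuristic applies
+ actual = "it is nice to wreck a nice beach. " * 10
+
+ # Default behaviour: very frequent characters (such as spaces) may be marked as "popular" junk
+ print(SequenceMatcher(None, expected, actual).ratio())
+ # autojunk=False considers every character when searching for matching blocks
+ print(SequenceMatcher(None, expected, actual, autojunk=False).ratio())
+ ```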
78
+ ## Limitations of Grapheme-Based Approach
79
+
80
+ ### Analysis of Transcription Pairs
81
+
82
+ #### Pair 1: "interesting" vs. "Interesting."
83
+ - **Teacher Transcription:** "interesting"
84
+ - **Learner Transcription:** "Interesting."
85
+
86
+ **Analysis:**
87
+ - The learner's transcription starts with a capital "I" and ends with a period. This could be remedied by using a simple normalization function.
88
+ - The capitalization and punctuation are transcription errors but do not necessarily indicate a pronunciation error. This highlights a limitation in assessing pronunciation purely from written transcriptions without audio context.
89
+ - Further, without audio, it's impossible to assess aspects like stress, intonation, or subtle phonetic variations (e.g., the reduction of unstressed vowels or the precise articulation of consonants).
90
+
91
+ #### Pair 2: "won" vs. "One."
92
+ - **Teacher Transcription:** "won"
93
+ - **Learner Transcription:** "One."
94
+
95
+ **Analysis:**
96
+ - This is an example where grapheme-based transcription fails to capture the intended meaning, as it cannot differentiate homophones based solely on sounds
97
+ (The words "won" and "one" are homophones in English: they are pronounced the same way but differ in meaning and spelling.)
98
+
99
+ #### Pair 3: "Today is the thirteenth of May, twenty twenty-three." vs. "Today is the 13th of May, 2023."
100
+ - **Teacher Transcription:** "Today is the thirteenth of May, twenty twenty-three."
101
+ - **Learner Transcription:** "Today is the 13th of May, 2023."
102
+
103
+ **Analysis:**
104
+ - While there is no indication of pronunciation errors (both transcriptions likely sound identical), the discrepancy lies in how dates and numbers are represented in the transcription.
105
+ - This again demonstrates that grapheme-based approaches are affected by transcription errors/differences.
106
+
107
+ #### Pair 5: "I have two and a half euros." vs. "I have, I have €2.5."
108
+ - **Teacher Transcription:** "I have two and a half euros."
109
+ - **Learner Transcription:** "I have, I have €2.5."
110
+
111
+ **Analysis:**
112
+ - The disfluency in the learner's transcription ("I have, I have") could be stripped by preprocessing before the evaluation to ensure a better comparison.
113
+ - The discrepancy between "€2.5" and "two and a half euros" again demonstrates the problems of grapheme-based transcription.
114
+
115
+ ### Limitations of a Grapheme-Based Approach
116
+
117
+ A grapheme-based approach primarily focuses on the written symbols of a language (letters and numbers) and their standard pronunciations. However, this approach has several limitations. For example:
118
+
119
+ 1. **Homophones and Homographs:** Words like "won" and "one" illustrate how identical pronunciations can lead to a bad evaluation score when solely relying on graphemes. Similarly, words spelled the same but pronounced differently ("read" (present) vs. "read" (past)) could lead to perfect grapheme scores, although the pronunciation was off.
120
+
121
+ 2. **Suprasegmental Features:** Elements such as stress, rhythm, intonation, and pitch are crucial in spoken language but are not captured by graphemes. For example, the word "interesting" in the learner audio was pronounced with an incorrect stress pattern.
122
+
123
+ ### Remedy through Phoneme-based Approach
124
+
125
+ To address these limitations, as well as to provide more personalized and constructive feedback, we create a more advanced [phoneme-based solution](phoneme_based_solution.md).
docs/phoneme_based_solution.md ADDED
@@ -0,0 +1,213 @@
1
+ # Phoneme-based Pronunciation Trainer
2
+
3
+ ## Interface
4
+ ![Phoneme-based pronunciation interface](assets/phoneme_based_pronunciation_interface.png)
5
+
6
+ ## Feedback Example
7
+ ![Phoneme-based pronunciation feedback example](assets/phoneme_based_pronunciation_feedback_example.png)
8
+
9
+ ## Approach
10
+
11
+ ### Create teacher/ground truth audio files
12
+
13
+ We start by creating teacher audio files using OpenAI's Text-to-Speech API:
14
+
15
+ ```python
16
+ def produce_teacher_audio(audio_name, text):
17
+ """
18
+ Produce a teacher audio file for the given text using OpenAI's Text-to-Speech API.
19
+ See: https://platform.openai.com/docs/guides/text-to-speech
20
+ """
21
+
22
+ speech_file_path = Path(f"audios/teacher/{audio_name}")
23
+ response = client.audio.speech.create(
24
+ model="tts-1-hd",
25
+ voice="alloy",
26
+ input=text,
27
+ )
28
+
29
+ response.stream_to_file(speech_file_path)
30
+ log.info(
31
+ f"Successfully produced teacher audio for {text=} at {speech_file_path.name=} 🎉"
32
+ )
33
+
34
+
35
+ if __name__ == "__main__":
36
+ # Produce teacher/ground-truth audio files for the given examples
37
+ data = load_data()
38
+ for datum in data:
39
+ produce_teacher_audio(datum["learner_recording"], datum["text_to_record"])
40
+
41
+ # Produce an additional example
42
+ produce_teacher_audio("book.wav", "The book is on the table")
43
+ ```
44
+
45
+
46
+ ### Audio-based and Personalized Input
47
+
48
+ Given the newly created teacher audios, we can now use both learner and teacher audios as input to our system
49
+ and thereby avoid the transcription-based limitations that we described in the [grapheme-based solution](grapheme_based_solution.md).
50
+
51
+ As you can see in the following image, we also supply the native language of the learner and the language they want to acquire.
52
+
53
+ ![Phoneme-Based Pronunciation Input Example](assets/phoneme_based_pronunciation_input_example.png)
54
+
55
+ ### Phoneme-Based ASR
56
+
57
+ We use a [Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme) model as proposed in [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680)
58
+
59
+ > Recent progress in self-training, self-supervised pretraining and unsupervised learning enabled well performing speech recognition systems without any labeled data. However, in many cases there is labeled data available for related languages which is not utilized by these methods. This paper extends previous work on zero-shot cross-lingual transfer learning by fine-tuning a multilingually pretrained wav2vec 2.0 model to transcribe unseen languages. This is done by mapping phonemes of the training languages to the target language using articulatory features. Experiments show that this simple method significantly outperforms prior work which introduced task-specific architectures and used only part of a monolingually pretrained model.
60
+
61
+
62
+ In particular, we use the following checkpoint: [facebook/wav2vec2-lv-60-espeak-cv-ft · Hugging Face](https://huggingface.co/facebook/wav2vec2-lv-60-espeak-cv-ft).
63
+ > This checkpoint leverages the pretrained checkpoint wav2vec2-large-lv60 and is fine-tuned on CommonVoice to recognize phonetic labels in multiple languages.
64
+
65
+ For convenience, we create a partial `transcribe_to_phonemes` function as an interface to this checkpoint:
66
+
67
+ ```python
68
+ class TranscriberChoice(StrEnum):
69
+ grapheme = "openai/whisper-base.en"
70
+ phoneme = "facebook/wav2vec2-lv-60-espeak-cv-ft"
71
+
72
+
73
+ def transcribe(
74
+ audio, transcriber_choice: TranscriberChoice = TranscriberChoice.grapheme
75
+ ):
76
+ """
77
+ The transcribe function takes a single parameter, audio, which is a numpy array of the audio the user recorded.
78
+ The pipeline object expects this in float32 format, so we first convert it to float32 and then extract the transcribed text.
79
+ """
80
+ transcriber = pipeline("automatic-speech-recognition", model=transcriber_choice)
81
+ try:
82
+ sr, y = audio
83
+ print(f"Sampling rate is {sr}")
84
+ except TypeError:
85
+ return None
86
+ y = y.astype(np.float32)
87
+ y /= np.max(np.abs(y))
88
+ transcription = transcriber({"sampling_rate": sr, "raw": y})["text"]
89
+ return transcription
90
+
91
+
92
+ transcribe_to_phonemes = partial(
93
+ transcribe, transcriber_choice=TranscriberChoice.phoneme
94
+ )
95
+ transcribe_to_graphemes = partial(
96
+ transcribe, transcriber_choice=TranscriberChoice.grapheme
97
+ )
98
+ ```
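+ A quick local sanity check could look as follows (a sketch only; the random signal merely stands in for the `(sampling_rate, numpy_array)` tuple that gradio's `Audio` component passes to these functions):
+
+ ```python
+ import numpy as np
+
+ # Fabricate one second of 16 kHz noise purely to illustrate the expected input shape;
+ # in the app, gradio supplies this tuple from the microphone or an uploaded file.
+ dummy_audio = (16000, np.random.default_rng(0).normal(size=16000).astype(np.float32))
+
+ print(transcribe_to_phonemes(dummy_audio))   # phoneme string (meaningful only for real speech)
+ print(transcribe_to_graphemes(dummy_audio))  # grapheme string from the Whisper checkpoint
+ ```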
99
+ ### Simple Evaluation based on `SequenceMatcher`
100
+
101
+ We can again apply the `SequenceMatcher`, but this time compare the phoneme transcriptions of teacher and learner audio.
102
+
103
+ As illustrated here for the date example, we get a much better similarity score:
104
+
105
+ #### Grapheme-based
106
+ ![date_example_grapheme_based.png](assets/date_example_grapheme_based.png)
107
+
108
+ #### Phoneme-based
109
+ ![date_example_phoneme_based.png](assets/date_example_phoneme_based.png)
110
+
111
+ But the feedback message is still not very helpful.
112
+
113
+
114
+ ### Advanced Evaluation based on LLM
115
+
116
+ Much more powerful, however, is an evaluation that leverages an LLM (`gpt-4-turbo`).
117
+
118
+
119
+ We create a simple LLM chain
120
+
121
+ ```python
122
+ prompt = ChatPromptTemplate.from_template(Path("prompt.md").read_text())
123
+ output_parser = StrOutputParser()
124
+
125
+
126
+ def create_llm(openai_api_key=openai_api_key):
127
+ if openai_api_key in [None, ""]:
128
+ raise gr.Error(
129
+ "No API key provided! You can find your API key at https://platform.openai.com/account/api-keys."
130
+ )
131
+ llm = ChatOpenAI(model="gpt-4-turbo", openai_api_key=openai_api_key)
132
+ return llm
133
+
134
+
135
+ def create_llm_chain(prompt=prompt, output_parser=output_parser, openai_api_key=openai_api_key):
136
+ if openai_api_key in [None, ""]:
137
+ raise gr.Error(
138
+ """No API key provided! You can find your API key at https://platform.openai.com/account/api-keys."""
139
+ )
140
+ llm = ChatOpenAI(model="gpt-4-turbo", openai_api_key=openai_api_key)
141
+ llm_chain = prompt | llm | output_parser
142
+ return llm_chain
143
+ ```
144
+
145
+ and ingest the following inputs:
146
+
147
+ ```python
148
+ def advanced_evaluation(
149
+ learner_l1,
150
+ learner_l2,
151
+ learner_phoneme_transcription,
152
+ teacher_phoneme_transcription,
153
+ ) -> str:
154
+ """Provide LLM-based feedback"""
155
+ return create_llm_chain().invoke(
156
+ {
157
+ "learner_l1": learner_l1,
158
+ "learner_l2": learner_l2,
159
+ "learner_phoneme_transcription": learner_phoneme_transcription,
160
+ "teacher_phoneme_transcription": teacher_phoneme_transcription,
161
+ }
162
+ )
163
+ ```
164
+
165
+ into an LLM prompt template that can be found here:
166
+ [Enhanced Prompt Template for Language Model Expert with Motivational Elements and Language-Specific Feedback](prompt.md).
167
+
168
+
169
+ ## Limitations & Outlook
170
+
171
+
172
+ ### Improve ASR Model
173
+
174
+ Due to time constraints, I selected the first phoneme recognition model that I found on Hugging Face.
175
+ With more time, one could
176
+ - Experiment with different checkpoints at [Phoneme Recognition Models - Hugging Face](https://huggingface.co/models?other=phoneme-recognition)
177
+ - Adapt OpenAI's Whisper model to phoneme recognition/transcription by simply changing the tokenizer to handle the new vocabulary (the set of phonemes),
178
+ and fine-tuning the model on an (audio, phoneme) dataset with an appropriate metric. See [openai/whisper · Phoneme recognition](https://huggingface.co/spaces/openai/whisper/discussions/86) for a short discussion about it.
179
+ - Employ a model like [m-bain/whisperX: WhisperX](https://github.com/m-bain/whisperX) and possibly fine-tune it, to achieve word-level timestamps & diarization.
180
+
181
+ Further, the output of the ASR model could be enhanced by grouping phonemes (to allow for better word-level feedback and alignment) and by adding better prosodic/suprasegmental support.
182
+
183
+
184
+ ### Improve LLM prompt
185
+
186
+ Again due to time constraints, I created a single prompt template.
187
+ Further prompt engineering and metaprompting could
188
+ - Reduce hallucinations
189
+ - Create more didactically sound feedback, e.g. divided into different feedback sections such as
190
+ - Place. The place of articulation is where a sound is made.
191
+ - Manner. The manner of articulation is how a sound is made.
192
+ - Voicing. Voice or voicing refers to the vibration of the vocal folds.
193
+ - Recommend fitting exercises and content from babbel.com
194
+
195
+ ### Improve UI/feedback time
196
+
197
+ The LLM response currently takes some time. Among many ways to tackle this problem, one could:
198
+ - Stream the response for immediate feedback and better UX (see the sketch after this list)
199
+ - Use clever caching for immediate responses
200
+ - Collect several attempts and only provide the LLM feedback on an aggregate of attempts (for example in a dedicated pronunciation trainer section)
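+ As a rough illustration of the first point, a generator-based variant of `advanced_evaluation` could stream partial feedback into the existing `llm_evaluation` Markdown component. This is only a sketch and not part of the current code; it assumes the LangChain chain's `.stream()` method and gradio's support for generator event handlers:
+
+ ```python
+ def advanced_evaluation_streaming(
+     learner_l1,
+     learner_l2,
+     learner_phoneme_transcription,
+     teacher_phoneme_transcription,
+ ):
+     """Yield the LLM feedback incrementally so gradio can render it as it arrives."""
+     partial_feedback = ""
+     for chunk in create_llm_chain().stream(
+         {
+             "learner_l1": learner_l1,
+             "learner_l2": learner_l2,
+             "learner_phoneme_transcription": learner_phoneme_transcription,
+             "teacher_phoneme_transcription": teacher_phoneme_transcription,
+         }
+     ):
+         partial_feedback += chunk
+         yield partial_feedback
+ ```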
201
+
202
+
203
+ ### Personalization
204
+
205
+ The personalization is very limited, as it only looks at the learner's L1 and L2.
206
+ We could further:
207
+
208
+ - Compare the current attempt with previous attempts in order to show progress or regression (see the sketch after this list). This could be especially motivating if the learner is still far from a perfect pronunciation but steadily improving.
209
+ - Include additional learner information, such as preferences and proficiency.
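+ A sketch of the first point (hypothetical helper, not part of the repository), keeping a per-phrase history of similarity scores and reporting the trend alongside the feedback:
+
+ ```python
+ from collections import defaultdict
+
+ # phrase -> similarity ratios of previous attempts (hypothetical in-memory store)
+ attempt_history: dict[str, list[float]] = defaultdict(list)
+
+
+ def record_attempt(phrase: str, similarity_ratio: float) -> str:
+     """Store the new score and describe progress relative to the previous attempt."""
+     history = attempt_history[phrase]
+     history.append(similarity_ratio)
+     if len(history) < 2:
+         return "First attempt recorded - keep going!"
+     delta = history[-1] - history[-2]
+     if delta > 0:
+         return f"Nice, you improved by {delta:.0%} compared to your last attempt!"
+     return "No improvement this time - try focusing on the highlighted phonemes."
+ ```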
210
+
211
+ ### Alternative phoneme-based feedback
212
+ - Instead of, or complementary to, employing an LLM for advanced and personalized feedback, we could provide scores and feedback based on a distance measure between phonemes.
213
+ - Among a variety of possible distances, a simple starting point could be a 3-D distance of the place of articulation (where the sound is made).
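+
+ As a rough illustration of this last idea (purely a sketch with made-up articulatory coordinates, not a linguistically validated feature set), each phoneme could be mapped to a coarse 3-D vector of place, manner, and voicing, and pairs scored by Euclidean distance:
+
+ ```python
+ import math
+
+ # Hypothetical articulatory coordinates: (place, manner, voicing), each scaled to [0, 1];
+ # voicing is deliberately given a small weight so that pure voicing errors count as near misses.
+ ARTICULATORY_FEATURES = {
+     "p": (0.0, 0.0, 0.0),  # bilabial plosive, voiceless
+     "b": (0.0, 0.0, 0.2),  # bilabial plosive, voiced
+     "t": (0.4, 0.0, 0.0),  # alveolar plosive, voiceless
+     "d": (0.4, 0.0, 0.2),  # alveolar plosive, voiced
+     "s": (0.4, 0.5, 0.0),  # alveolar fricative, voiceless
+     "z": (0.4, 0.5, 0.2),  # alveolar fricative, voiced
+ }
+
+
+ def phoneme_distance(a: str, b: str) -> float:
+     """Euclidean distance between two phonemes in the toy articulatory space."""
+     return math.dist(ARTICULATORY_FEATURES[a], ARTICULATORY_FEATURES[b])
+
+
+ # Substituting /b/ for /p/ (a voicing error) is a smaller mistake than substituting /s/ for /p/.
+ assert phoneme_distance("p", "b") < phoneme_distance("p", "s")
+ ```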
docs/prompt.md ADDED
@@ -0,0 +1,47 @@
1
+ ### Enhanced Prompt Template for Language Model Expert with Motivational Elements and Language-Specific Feedback
2
+
3
+ **Context:** You are tasked with assisting a {learner_l1} language learner who is struggling with {learner_l2} pronunciation. The learner attempted to say "{teacher_phoneme_transcription}" but instead pronounced it as "{learner_phoneme_transcription}". Your goal is to provide detailed, constructive feedback on how to improve their pronunciation, focusing on the specific sounds they are mispronouncing.
4
+
5
+ **Instructions:** Write a comprehensive and motivational feedback report that identifies the pronunciation errors, explains why these errors might be occurring from a linguistic perspective related to the learner's native language, and offers practical exercises to correct these errors. Include phonetic transcriptions where necessary and suggest any useful resources or techniques for pronunciation practice.
6
+
7
+ **Details:**
8
+ - **Length:** The feedback should be concise yet thorough, ideally not exceeding 500 words.
9
+ - **Format:** Use a structured format with clear headings for each section (e.g., Introduction, Error Analysis, Corrective Actions, Additional Resources, Words of Encouragement).
10
+ - **Style:** Maintain a supportive, educational, and motivational tone throughout the feedback. Use emojis to make the feedback more engaging.
11
+ - **Outcome:** The learner should have a clear understanding of their pronunciation mistakes and feel equipped and motivated with specific strategies to improve.
12
+
13
+ **Example of Expected Output:**
14
+
15
+ ```
16
+ Feedback on Pronunciation Errors and Improvement Strategies 🌟
17
+
18
+ Introduction:
19
+ Hello! Great effort on your journey to mastering {learner_l2} pronunciation. This feedback is designed to help you refine your skills and embrace the nuances of {learner_l2}, especially focusing on the challenges posed by your native language, {learner_l1}.
20
+
21
+ Error Analysis:
22
+ 1. Confusion between /ɹɛkəɡnaɪz/ and /ɹɛk/:
23
+ - Error: You substituted the complex consonant cluster /ɡn/ in "recognize" with a simpler /k/ sound in "wreck."
24
+ - Possible Cause: The /ɡn/ cluster might be challenging due to its rarity in {learner_l1}.
25
+
26
+ 2. Mispronunciation of /spiːtʃ/ as /biːtʃ/:
27
+ - Error: You articulated the initial consonant sound incorrectly, substituting /sp/ with /b/.
28
+ - Possible Cause: The /sp/ cluster may be particularly challenging as it does not commonly occur in {learner_l1} phonetic patterns.
29
+
30
+ Corrective Actions:
31
+ 1. For /ɹɛkəɡnaɪz/ vs. /ɹɛk/:
32
+ - Practice slowly pronouncing /ɡn/ in isolation, then gradually in the word "recognize."
33
+ - Record yourself and compare with native speakers using online pronunciation tools like Forvo.
34
+
35
+ 2. For /spiːtʃ/ vs. /biːtʃ/:
36
+ - Focus on differentiating /s/ and /b/ sounds. Begin by pronouncing /s/ and /b/ separately, then in simple words, and finally in the target phrase.
37
+ - Engage in minimal pairs exercises, practicing pairs of words that differ only in the initial sound (e.g., "spit" vs. "bit").
38
+
39
+ Additional Resources:
40
+ - Explore [resource link] for interactive pronunciation exercises tailored to {learner_l1} speakers.
41
+ - Consider consulting a speech therapist specializing in accent reduction for personalized guidance.
42
+
43
+ Words of Encouragement:
44
+ Keep up the fantastic work! 🚀 Remember, every mistake is a stepping stone towards your success. Stay persistent, practice regularly, and keep a positive mindset. You're doing wonderfully, and with continued effort, you'll achieve your pronunciation goals! 💪
45
+
46
+ Conclusion:
47
+ By focusing on these specific areas and consistently practicing, you will see significant improvement in your pronunciation. We believe in you! 🌟
learner_input.json ADDED
@@ -0,0 +1,27 @@
1
+ [
2
+ {
3
+ "text_to_record": "interesting",
4
+ "learner_recording": "interesting.wav",
5
+ "sr_transcript_of_learner_recording": "Interesting."
6
+ },
7
+ {
8
+ "text_to_record": "won",
9
+ "learner_recording": "won.wav",
10
+ "sr_transcript_of_learner_recording": "One."
11
+ },
12
+ {
13
+ "text_to_record": "Today is the thirteenth of May, twenty twenty three.",
14
+ "learner_recording": "today.wav",
15
+ "sr_transcript_of_learner_recording": "Today is the 13th of May, 2023."
16
+ },
17
+ {
18
+ "text_to_record": "I like to watch Youtube.",
19
+ "learner_recording": "youtube.wav",
20
+ "sr_transcript_of_learner_recording": "I like to watch you too."
21
+ },
22
+ {
23
+ "text_to_record": "I have two and a half euros.",
24
+ "learner_recording": "euros.wav",
25
+ "sr_transcript_of_learner_recording": "I have a, I have €2.5."
26
+ }
27
+ ]
prompt.md ADDED
@@ -0,0 +1,47 @@
1
+ ### Enhanced Prompt Template for Language Model Expert with Motivational Elements and Language-Specific Feedback
2
+
3
+ **Context:** You are tasked with assisting a {learner_l1} language learner who is struggling with {learner_l2} pronunciation. The learner attempted to say "{teacher_phoneme_transcription}" but instead pronounced it as "{learner_phoneme_transcription}". Your goal is to provide detailed, constructive feedback on how to improve their pronunciation, focusing on the specific sounds they are mispronouncing.
4
+
5
+ **Instructions:** Write a comprehensive and motivational feedback report that identifies the pronunciation errors, explains why these errors might be occurring from a linguistic perspective related to the learner's native language, and offers practical exercises to correct these errors. Include phonetic transcriptions where necessary and suggest any useful resources or techniques for pronunciation practice.
6
+
7
+ **Details:**
8
+ - **Length:** The feedback should be concise yet thorough, ideally not exceeding 500 words.
9
+ - **Format:** Use a structured format with clear headings for each section (e.g., Introduction, Error Analysis, Corrective Actions, Additional Resources, Words of Encouragement).
10
+ - **Style:** Maintain a supportive, educational, and motivational tone throughout the feedback. Use emojis to make the feedback more engaging.
11
+ - **Outcome:** The learner should have a clear understanding of their pronunciation mistakes and feel equipped and motivated with specific strategies to improve.
12
+
13
+ **Example of Expected Output:**
14
+
15
+ ```
16
+ Feedback on Pronunciation Errors and Improvement Strategies 🌟
17
+
18
+ Introduction:
19
+ Hello! Great effort on your journey to mastering {learner_l2} pronunciation. This feedback is designed to help you refine your skills and embrace the nuances of {learner_l2}, especially focusing on the challenges posed by your native language, {learner_l1}.
20
+
21
+ Error Analysis:
22
+ 1. Confusion between /ɹɛkəɡnaɪz/ and /ɹɛk/:
23
+ - Error: You substituted the complex consonant cluster /ɡn/ in "recognize" with a simpler /k/ sound in "wreck."
24
+ - Possible Cause: The /ɡn/ cluster might be challenging due to its rarity in {learner_l1}.
25
+
26
+ 2. Mispronunciation of /spiːtʃ/ as /biːtʃ/:
27
+ - Error: You articulated the initial consonant sound incorrectly, substituting /sp/ with /b/.
28
+ - Possible Cause: The /sp/ cluster may be particularly challenging as it does not commonly occur in {learner_l1} phonetic patterns.
29
+
30
+ Corrective Actions:
31
+ 1. For /ɹɛkəɡnaɪz/ vs. /ɹɛk/:
32
+ - Practice slowly pronouncing /ɡn/ in isolation, then gradually in the word "recognize."
33
+ - Record yourself and compare with native speakers using online pronunciation tools like Forvo.
34
+
35
+ 2. For /spiːtʃ/ vs. /biːtʃ/:
36
+ - Focus on differentiating /s/ and /b/ sounds. Begin by pronouncing /s/ and /b/ separately, then in simple words, and finally in the target phrase.
37
+ - Engage in minimal pairs exercises, practicing pairs of words that differ only in the initial sound (e.g., "spit" vs. "bit").
38
+
39
+ Additional Resources:
40
+ - Explore [resource link] for interactive pronunciation exercises tailored to {learner_l1} speakers.
41
+ - Consider consulting a speech therapist specializing in accent reduction for personalized guidance.
42
+
43
+ Words of Encouragement:
44
+ Keep up the fantastic work! 🚀 Remember, every mistake is a stepping stone towards your success. Stay persistent, practice regularly, and keep a positive mindset. You're doing wonderfully, and with continued effort, you'll achieve your pronunciation goals! 💪
45
+
46
+ Conclusion:
47
+ By focusing on these specific areas and consistently practicing, you will see significant improvement in your pronunciation. We believe in you! 🌟
pyproject.toml ADDED
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "pronunciation-trainer"
3
+ version = "0.1.0"
4
+ description = "Grapheme and Phoneme-Based Pronunciation Trainer"
5
+ authors = [{ name = "Pascal Wenker", email = "[email protected]" }]
6
+ dependencies = [
7
+ "gradio>=4.29.0",
8
+ "transformers>=4.40.2",
9
+ "torch>=2.3.0",
10
+ "torchaudio>=2.3.0",
11
+ "setuptools>=69.5.1",
12
+ "phonemizer>=3.2.1",
13
+ "langchain>=0.1.19",
14
+ "langchain_openai>=0.1.6",
15
+ ]
16
+ readme = "README.md"
17
+ requires-python = ">= 3.11"
18
+
19
+ [build-system]
20
+ requires = ["hatchling"]
21
+ build-backend = "hatchling.build"
22
+
23
+ [tool.rye]
24
+ managed = true
25
+ dev-dependencies = []
26
+
27
+ [tool.hatch.metadata]
28
+ allow-direct-references = true
requirements-dev.lock ADDED
@@ -0,0 +1,147 @@
1
+ # generated by rye
2
+ # use `rye lock` or `rye sync` to update this lockfile
3
+ #
4
+ # last locked with the following flags:
5
+ # pre: false
6
+ # features: []
7
+ # all-features: false
8
+
9
+ -e file:.
10
+ aiofiles==23.2.1
11
+ aiohttp==3.9.5
12
+ aiosignal==1.3.1
13
+ altair==5.3.0
14
+ annotated-types==0.6.0
15
+ anyio==4.3.0
16
+ attrs==23.2.0
17
+ babel==2.15.0
18
+ bibtexparser==2.0.0b7
19
+ certifi==2024.2.2
20
+ charset-normalizer==3.3.2
21
+ click==8.1.7
22
+ clldutils==3.22.2
23
+ colorama==0.4.6
24
+ colorlog==6.8.2
25
+ contourpy==1.2.1
26
+ csvw==3.3.0
27
+ cycler==0.12.1
28
+ dataclasses-json==0.6.5
29
+ distro==1.9.0
30
+ dlinfo==1.2.1
31
+ dnspython==2.6.1
32
+ email-validator==2.1.1
33
+ fastapi==0.111.0
34
+ fastapi-cli==0.0.3
35
+ ffmpy==0.3.2
36
+ filelock==3.14.0
37
+ fonttools==4.51.0
38
+ frozenlist==1.4.1
39
+ fsspec==2024.3.1
40
+ gradio==4.29.0
41
+ gradio-client==0.16.1
42
+ greenlet==3.0.3
43
+ h11==0.14.0
44
+ httpcore==1.0.5
45
+ httptools==0.6.1
46
+ httpx==0.27.0
47
+ huggingface-hub==0.23.0
48
+ idna==3.7
49
+ importlib-resources==6.4.0
50
+ isodate==0.6.1
51
+ jinja2==3.1.4
52
+ joblib==1.4.2
53
+ jsonpatch==1.33
54
+ jsonpointer==2.4
55
+ jsonschema==4.22.0
56
+ jsonschema-specifications==2023.12.1
57
+ kiwisolver==1.4.5
58
+ langchain==0.1.19
59
+ langchain-community==0.0.38
60
+ langchain-core==0.1.52
61
+ langchain-openai==0.1.6
62
+ langchain-text-splitters==0.0.1
63
+ langsmith==0.1.56
64
+ language-tags==1.2.0
65
+ lxml==5.2.1
66
+ markdown==3.6
67
+ markdown-it-py==3.0.0
68
+ markupsafe==2.1.5
69
+ marshmallow==3.21.2
70
+ matplotlib==3.8.4
71
+ mdurl==0.1.2
72
+ mpmath==1.3.0
73
+ multidict==6.0.5
74
+ mypy-extensions==1.0.0
75
+ networkx==3.3
76
+ numpy==1.26.4
77
+ nvidia-cublas-cu12==12.1.3.1
78
+ nvidia-cuda-cupti-cu12==12.1.105
79
+ nvidia-cuda-nvrtc-cu12==12.1.105
80
+ nvidia-cuda-runtime-cu12==12.1.105
81
+ nvidia-cudnn-cu12==8.9.2.26
82
+ nvidia-cufft-cu12==11.0.2.54
83
+ nvidia-curand-cu12==10.3.2.106
84
+ nvidia-cusolver-cu12==11.4.5.107
85
+ nvidia-cusparse-cu12==12.1.0.106
86
+ nvidia-nccl-cu12==2.20.5
87
+ nvidia-nvjitlink-cu12==12.4.127
88
+ nvidia-nvtx-cu12==12.1.105
89
+ openai==1.28.0
90
+ orjson==3.10.3
91
+ packaging==23.2
92
+ pandas==2.2.2
93
+ phonemizer==3.2.1
94
+ pillow==10.3.0
95
+ pydantic==2.7.1
96
+ pydantic-core==2.18.2
97
+ pydub==0.25.1
98
+ pygments==2.18.0
99
+ pylatexenc==2.10
100
+ pyparsing==3.1.2
101
+ python-dateutil==2.9.0.post0
102
+ python-dotenv==1.0.1
103
+ python-multipart==0.0.9
104
+ pytz==2024.1
105
+ pyyaml==6.0.1
106
+ rdflib==7.0.0
107
+ referencing==0.35.1
108
+ regex==2024.5.10
109
+ requests==2.31.0
110
+ rfc3986==1.5.0
111
+ rich==13.7.1
112
+ rpds-py==0.18.1
113
+ ruff==0.4.4
114
+ safetensors==0.4.3
115
+ segments==2.2.1
116
+ semantic-version==2.10.0
117
+ shellingham==1.5.4
118
+ six==1.16.0
119
+ sniffio==1.3.1
120
+ sqlalchemy==2.0.30
121
+ starlette==0.37.2
122
+ sympy==1.12
123
+ tabulate==0.9.0
124
+ tenacity==8.3.0
125
+ tiktoken==0.6.0
126
+ tokenizers==0.19.1
127
+ tomlkit==0.12.0
128
+ toolz==0.12.1
129
+ torch==2.3.0
130
+ torchaudio==2.3.0
131
+ tqdm==4.66.4
132
+ transformers==4.40.2
133
+ triton==2.3.0
134
+ typer==0.12.3
135
+ typing-extensions==4.11.0
136
+ typing-inspect==0.9.0
137
+ tzdata==2024.1
138
+ ujson==5.9.0
139
+ uritemplate==4.1.1
140
+ urllib3==2.2.1
141
+ uvicorn==0.29.0
142
+ uvloop==0.19.0
143
+ watchfiles==0.21.0
144
+ websockets==11.0.3
145
+ yarl==1.9.4
146
+ # The following packages are considered to be unsafe in a requirements file:
147
+ setuptools==69.5.1
requirements.lock ADDED
@@ -0,0 +1,147 @@
1
+ # generated by rye
2
+ # use `rye lock` or `rye sync` to update this lockfile
3
+ #
4
+ # last locked with the following flags:
5
+ # pre: false
6
+ # features: []
7
+ # all-features: false
8
+
9
+ -e file:.
10
+ aiofiles==23.2.1
11
+ aiohttp==3.9.5
12
+ aiosignal==1.3.1
13
+ altair==5.3.0
14
+ annotated-types==0.6.0
15
+ anyio==4.3.0
16
+ attrs==23.2.0
17
+ babel==2.15.0
18
+ bibtexparser==2.0.0b7
19
+ certifi==2024.2.2
20
+ charset-normalizer==3.3.2
21
+ click==8.1.7
22
+ clldutils==3.22.2
23
+ colorama==0.4.6
24
+ colorlog==6.8.2
25
+ contourpy==1.2.1
26
+ csvw==3.3.0
27
+ cycler==0.12.1
28
+ dataclasses-json==0.6.5
29
+ distro==1.9.0
30
+ dlinfo==1.2.1
31
+ dnspython==2.6.1
32
+ email-validator==2.1.1
33
+ fastapi==0.111.0
34
+ fastapi-cli==0.0.3
35
+ ffmpy==0.3.2
36
+ filelock==3.14.0
37
+ fonttools==4.51.0
38
+ frozenlist==1.4.1
39
+ fsspec==2024.3.1
40
+ gradio==4.29.0
41
+ gradio-client==0.16.1
42
+ greenlet==3.0.3
43
+ h11==0.14.0
44
+ httpcore==1.0.5
45
+ httptools==0.6.1
46
+ httpx==0.27.0
47
+ huggingface-hub==0.23.0
48
+ idna==3.7
49
+ importlib-resources==6.4.0
50
+ isodate==0.6.1
51
+ jinja2==3.1.4
52
+ joblib==1.4.2
53
+ jsonpatch==1.33
54
+ jsonpointer==2.4
55
+ jsonschema==4.22.0
56
+ jsonschema-specifications==2023.12.1
57
+ kiwisolver==1.4.5
58
+ langchain==0.1.19
59
+ langchain-community==0.0.38
60
+ langchain-core==0.1.52
61
+ langchain-openai==0.1.6
62
+ langchain-text-splitters==0.0.1
63
+ langsmith==0.1.56
64
+ language-tags==1.2.0
65
+ lxml==5.2.1
66
+ markdown==3.6
67
+ markdown-it-py==3.0.0
68
+ markupsafe==2.1.5
69
+ marshmallow==3.21.2
70
+ matplotlib==3.8.4
71
+ mdurl==0.1.2
72
+ mpmath==1.3.0
73
+ multidict==6.0.5
74
+ mypy-extensions==1.0.0
75
+ networkx==3.3
76
+ numpy==1.26.4
77
+ nvidia-cublas-cu12==12.1.3.1
78
+ nvidia-cuda-cupti-cu12==12.1.105
79
+ nvidia-cuda-nvrtc-cu12==12.1.105
80
+ nvidia-cuda-runtime-cu12==12.1.105
81
+ nvidia-cudnn-cu12==8.9.2.26
82
+ nvidia-cufft-cu12==11.0.2.54
83
+ nvidia-curand-cu12==10.3.2.106
84
+ nvidia-cusolver-cu12==11.4.5.107
85
+ nvidia-cusparse-cu12==12.1.0.106
86
+ nvidia-nccl-cu12==2.20.5
87
+ nvidia-nvjitlink-cu12==12.4.127
88
+ nvidia-nvtx-cu12==12.1.105
89
+ openai==1.28.0
90
+ orjson==3.10.3
91
+ packaging==23.2
92
+ pandas==2.2.2
93
+ phonemizer==3.2.1
94
+ pillow==10.3.0
95
+ pydantic==2.7.1
96
+ pydantic-core==2.18.2
97
+ pydub==0.25.1
98
+ pygments==2.18.0
99
+ pylatexenc==2.10
100
+ pyparsing==3.1.2
101
+ python-dateutil==2.9.0.post0
102
+ python-dotenv==1.0.1
103
+ python-multipart==0.0.9
104
+ pytz==2024.1
105
+ pyyaml==6.0.1
106
+ rdflib==7.0.0
107
+ referencing==0.35.1
108
+ regex==2024.5.10
109
+ requests==2.31.0
110
+ rfc3986==1.5.0
111
+ rich==13.7.1
112
+ rpds-py==0.18.1
113
+ ruff==0.4.4
114
+ safetensors==0.4.3
115
+ segments==2.2.1
116
+ semantic-version==2.10.0
117
+ shellingham==1.5.4
118
+ six==1.16.0
119
+ sniffio==1.3.1
120
+ sqlalchemy==2.0.30
121
+ starlette==0.37.2
122
+ sympy==1.12
123
+ tabulate==0.9.0
124
+ tenacity==8.3.0
125
+ tiktoken==0.6.0
126
+ tokenizers==0.19.1
127
+ tomlkit==0.12.0
128
+ toolz==0.12.1
129
+ torch==2.3.0
130
+ torchaudio==2.3.0
131
+ tqdm==4.66.4
132
+ transformers==4.40.2
133
+ triton==2.3.0
134
+ typer==0.12.3
135
+ typing-extensions==4.11.0
136
+ typing-inspect==0.9.0
137
+ tzdata==2024.1
138
+ ujson==5.9.0
139
+ uritemplate==4.1.1
140
+ urllib3==2.2.1
141
+ uvicorn==0.29.0
142
+ uvloop==0.19.0
143
+ watchfiles==0.21.0
144
+ websockets==11.0.3
145
+ yarl==1.9.4
146
+ # The following packages are considered to be unsafe in a requirements file:
147
+ setuptools==69.5.1
src/pronunciation_trainer/__init__.py ADDED
File without changes
src/pronunciation_trainer/app.py ADDED
@@ -0,0 +1,144 @@
1
+ """
2
+ This script provides a simple web interface for the pronunciation trainer using the gradio library
3
+ """
4
+
5
+
6
+ from pathlib import Path
7
+
8
+ import gradio as gr
9
+ from phonemizer import phonemize
10
+
11
+ from pronunciation_trainer.evaluation import (advanced_evaluation,
12
+ basic_evaluation,
13
+ normalize_texts)
14
+ from pronunciation_trainer.loading import (load_advanced_examples,
15
+ load_simple_examples)
16
+ from pronunciation_trainer.transcription import (transcribe_to_graphemes,
17
+ transcribe_to_phonemes)
18
+
19
+ with gr.Blocks() as demo:
20
+ with gr.Tab("Welcome"):
21
+ readme = Path("README.md").read_text()
22
+ gr.Markdown(readme)
23
+
24
+ with gr.Tab("Grapheme-Based Speech Evaluation"):
25
+ with gr.Row():
26
+ learner_transcription = gr.Textbox(
27
+ label="Learner Transcription",
28
+ placeholder="It is nice to wreck a nice beach",
29
+ )
30
+ teacher_transcription = gr.Textbox(
31
+ label="Teacher Transcription",
32
+ placeholder="It is nice to recognize speech",
33
+ )
34
+ basic_evaluate_btn = gr.Button("Evaluate", variant="primary")
35
+ gr.Markdown("## Evaluation")
36
+ gr.Markdown("### Basic Evaluation")
37
+ grapheme_evaluation = gr.Markdown()
38
+ with gr.Row():
39
+ basic_similarity_score = gr.Label(label="Similarity Score of Transcripts")
40
+ basic_diff_box = gr.HighlightedText(
41
+ label="Difference between Learner and Teacher transcripts",
42
+ combine_adjacent=True,
43
+ show_legend=True,
44
+ color_map={"+": "red", "-": "green"},
45
+ )
46
+ basic_evaluate_btn.click(
47
+ fn=normalize_texts,
48
+ inputs=[learner_transcription, teacher_transcription],
49
+ outputs=[learner_transcription, teacher_transcription],
50
+ ).success(
51
+ fn=basic_evaluation,
52
+ inputs=[learner_transcription, teacher_transcription],
53
+ outputs=[basic_similarity_score, grapheme_evaluation, basic_diff_box],
54
+ )
55
+ with gr.Accordion("Learner Examples"):
56
+ gr.Markdown("### Examples for grapheme-based evaluation")
57
+ simple_examples = gr.Examples(
58
+ examples=load_simple_examples(),
59
+ inputs=[
60
+ teacher_transcription,
61
+ learner_transcription,
62
+ ],
63
+ )
64
+ with gr.Tab("Phoneme-Based Speech Evaluation"):
65
+ with gr.Row():
66
+ learner_recording = gr.Audio(
67
+ label="Learner Recording",
68
+ sources=["microphone", "upload"],
69
+ )
70
+ teacher_recording = gr.Audio(
71
+ label="Teacher Recording",
72
+ sources=["microphone", "upload"],
73
+ )
74
+ with gr.Row():
75
+ learner_phoneme_transcription = gr.Textbox(
76
+ label="Learner Phoneme Transcription",
77
+ placeholder=phonemize(learner_transcription.placeholder),
78
+ interactive=True,
79
+ )
80
+ teacher_phoneme_transcription = gr.Textbox(
81
+ label="Teacher Phoneme Transcription",
82
+ placeholder=phonemize(teacher_transcription.placeholder),
83
+ interactive=True,
84
+ )
85
+ learner_l1 = gr.Textbox(
86
+ label="Native language of Learner (L1)", placeholder="German"
87
+ )
88
+ learner_l2 = gr.Textbox(
89
+ label="Language the learner aims to acquire (L2)", placeholder="English"
90
+ )
91
+ advanced_evaluate_btn = gr.Button("Evaluate", variant="primary")
92
+ gr.Markdown("## Advanced Evaluation")
93
+
94
+ with gr.Row():
95
+ similarity_score = gr.Label(
96
+ label="Similarity Score of Phoneme Transcripts"
97
+ )
98
+ diff_box = gr.HighlightedText(
99
+ label="Difference between Learner and Teacher Phoneme transcripts",
100
+ combine_adjacent=True,
101
+ show_legend=True,
102
+ color_map={"+": "red", "-": "green"},
103
+ )
104
+ llm_evaluation = gr.Markdown()
105
+
106
+ learner_recording.change(
107
+ fn=transcribe_to_phonemes,
108
+ inputs=learner_recording,
109
+ outputs=learner_phoneme_transcription,
110
+ )
111
+ teacher_recording.change(
112
+ fn=transcribe_to_phonemes,
113
+ inputs=teacher_recording,
114
+ outputs=teacher_phoneme_transcription,
115
+ )
116
+
117
+ advanced_evaluate_btn.click(
118
+ fn=basic_evaluation,
119
+ inputs=[learner_phoneme_transcription, teacher_phoneme_transcription],
120
+ outputs=[similarity_score, llm_evaluation, diff_box],
121
+ ).success(
122
+ advanced_evaluation,
123
+ inputs=[
124
+ learner_l1,
125
+ learner_l2,
126
+ learner_phoneme_transcription,
127
+ teacher_phoneme_transcription,
128
+ ],
129
+ outputs=llm_evaluation,
130
+ )
131
+ with gr.Accordion("Learner Examples"):
132
+ gr.Markdown("### Examples for advanced evaluation")
133
+ advanced_examples = gr.Examples(
134
+ examples=load_advanced_examples(),
135
+ inputs=[
136
+ learner_l1,
137
+ learner_l2,
138
+ learner_recording,
139
+ teacher_recording,
140
+ ],
141
+ )
142
+
143
+ if __name__ == "__main__":
144
+ demo.launch()
src/pronunciation_trainer/config.py ADDED
@@ -0,0 +1,11 @@
1
+ """
2
+ This file is used to load the OpenAI API key from the .env file.
3
+ """
4
+
5
+ import os
6
+
7
+ from dotenv import find_dotenv, load_dotenv
8
+
9
+ load_dotenv(find_dotenv())
10
+
11
+ openai_api_key = os.getenv("OPENAI_API_KEY")
src/pronunciation_trainer/evaluation.py ADDED
@@ -0,0 +1,85 @@
1
+ """
2
+ This module provides functions for evaluating speaking attempts.
3
+
4
+ It includes:
5
+ - A basic evaluation function that compares two phrases (grapheme-based) and provides feedback based on their similarity ratio.
6
+ - An advanced evaluation function that uses a large language model (LLM) to provide feedback based on phoneme transcriptions.
7
+
8
+ The basic evaluation function includes:
9
+ - A normalization function that converts input texts to lower case and strips whitespace.
10
+ - A function that calculates the similarity ratio between two phrases.
11
+ - A function that generates a diff between two phrases.
12
+ - A function that generates feedback based on the similarity ratio.
13
+
14
+ The advanced evaluation function includes:
15
+ - A function that invokes an LLM chain to provide phoneme-based feedback
16
+ """
17
+
18
+
19
+ from difflib import Differ, SequenceMatcher
20
+ from typing import Optional, Tuple
21
+
22
+ from pronunciation_trainer.llm import create_llm_chain
23
+
24
+
25
+ def normalize_texts(actual: str, expected: str) -> list[str]:
26
+ """Normalize two input texts by converting them to lower case and stripping whitespace.
27
+
28
+ Note: This normalization function is very simple and only here to demonstrate the general necessity of normalization
29
+ """
30
+
31
+ return [text.lower().strip() for text in [actual, expected]]
32
+
33
+
34
+ def compare_phrases(expected: str, actual: str) -> float:
35
+ """Calculate the similarity ratio between two phrases."""
36
+ return SequenceMatcher(None, expected, actual).ratio()
37
+
38
+
39
+ def diff_phrases(expected: str, actual: str) -> list[Tuple[str, Optional[str]]]:
40
+ """Generate a diff between two phrases."""
41
+ differ = Differ()
42
+ return [
43
+ (token[2:], None if token[0] == " " else token[0])
44
+ for token in differ.compare(expected, actual)
45
+ ]
46
+
47
+
48
+ def generate_feedback(similarity_ratio: float) -> str:
49
+ """Generate feedback based on the similarity ratio."""
50
+ if similarity_ratio > 0.9:
51
+ return "Excellent!"
52
+ elif similarity_ratio > 0.7:
53
+ return "Good job!"
54
+ elif similarity_ratio > 0.5:
55
+ return "Not bad, but there's room for improvement."
56
+ else:
57
+ return "Please try again, focus on pronunciation and clarity."
58
+
59
+
60
+ def basic_evaluation(
61
+ expected: str, actual: str, autojunk: bool = True
62
+ ) -> Tuple[float, str, list[Tuple[str, Optional[str]]]]:
63
+ """Evaluate speaking attempts by comparing expected and actual phrases."""
64
+ expected, actual = normalize_texts(expected, actual)
65
+ similarity_ratio = compare_phrases(expected, actual)
66
+ diff = diff_phrases(expected, actual)
67
+ feedback = generate_feedback(similarity_ratio)
68
+ return similarity_ratio, feedback, diff
69
+
70
+
71
+ def advanced_evaluation(
72
+ learner_l1,
73
+ learner_l2,
74
+ learner_phoneme_transcription,
75
+ teacher_phoneme_transcription,
76
+ ) -> str:
77
+ """Provide LLM-based feedback"""
78
+ return create_llm_chain().invoke(
79
+ {
80
+ "learner_l1": learner_l1,
81
+ "learner_l2": learner_l2,
82
+ "learner_phoneme_transcription": learner_phoneme_transcription,
83
+ "teacher_phoneme_transcription": teacher_phoneme_transcription,
84
+ }
85
+ )
src/pronunciation_trainer/llm.py ADDED
@@ -0,0 +1,35 @@
1
+ """
2
+ This module contains the code to create a language model chain using the OpenAI API.
3
+ """
4
+
5
+
6
+ from pathlib import Path
7
+
8
+ import gradio as gr
9
+ from langchain_core.output_parsers import StrOutputParser
10
+ from langchain_core.prompts import ChatPromptTemplate
11
+ from langchain_openai import ChatOpenAI
12
+
13
+ from pronunciation_trainer.config import openai_api_key
14
+
15
+ prompt = ChatPromptTemplate.from_template(Path("prompt.md").read_text())
16
+ output_parser = StrOutputParser()
17
+
18
+
19
+ def create_llm(openai_api_key=openai_api_key):
20
+ if openai_api_key in [None, ""]:
21
+ raise gr.Error(
22
+ "No API key provided! You can find your API key at https://platform.openai.com/account/api-keys."
23
+ )
24
+ llm = ChatOpenAI(model="gpt-4-turbo", openai_api_key=openai_api_key)
25
+ return llm
26
+
27
+
28
+ def create_llm_chain(prompt=prompt, output_parser=output_parser, openai_api_key=openai_api_key):
29
+ if openai_api_key in [None, ""]:
30
+ raise gr.Error(
31
+ """No API key provided! You can find your API key at https://platform.openai.com/account/api-keys."""
32
+ )
33
+ llm = ChatOpenAI(model="gpt-4-turbo", openai_api_key=openai_api_key)
34
+ llm_chain = prompt | llm | output_parser
35
+ return llm_chain
src/pronunciation_trainer/loading.py ADDED
@@ -0,0 +1,44 @@
1
+ """
2
+ This module is responsible for loading data from the provided JSON file.
3
+ It also provides functions to load simple and advanced examples for the grapheme-based
4
+ and phoneme-based pronunciation trainer, respectively.
5
+ """
6
+ import json
7
+
8
+
9
+ def load_data(filepath: str = "learner_input.json") -> list[dict]:
10
+ with open(filepath, "r") as file:
11
+ data = json.load(file)
12
+ return data
13
+
14
+
15
+ def load_simple_examples():
16
+ simple_examples = [
17
+ [
18
+ ex["text_to_record"],
19
+ ex["sr_transcript_of_learner_recording"],
20
+ ]
21
+ for ex in load_data()
22
+ ]
23
+ return simple_examples
24
+
25
+
26
+ def load_advanced_examples():
27
+ advanced_examples = [
28
+ [
29
+ "Lithuanian",
30
+ "English",
31
+ f'audios/learner/{ex["learner_recording"]}',
32
+ f'audios/teacher/{ex["learner_recording"]}',
33
+ ]
34
+ for ex in load_data()
35
+ ]
36
+ advanced_examples.append(
37
+ [
38
+ "German",
39
+ "English",
40
+ f"audios/learner/book.aac",
41
+ f"audios/teacher/book.wav",
42
+ ]
43
+ )
44
+ return advanced_examples
src/pronunciation_trainer/rich_logging.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ This file contains the logging configuration for the project.
3
+ """
4
+
5
+ import logging
6
+
7
+ from rich.console import Console
8
+ from rich.logging import RichHandler
9
+
10
+ FORMAT = "%(message)s"
11
+ logging.basicConfig(
12
+ level="INFO",
13
+ format=FORMAT,
14
+ datefmt="[%X]",
15
+ handlers=[RichHandler(rich_tracebacks=True)],
16
+ )
17
+
18
+ console = Console()
19
+ log = logging.getLogger("rich")
20
+
21
+
22
+ def set_log_level(verbosity: int):
23
+ if verbosity == 0:
24
+ log.setLevel(logging.WARNING)
25
+ elif verbosity == 1:
26
+ log.setLevel(logging.INFO)
27
+ elif verbosity == 2:
28
+ log.setLevel(logging.DEBUG)
29
+
30
+ return log
src/pronunciation_trainer/scripts.py ADDED
@@ -0,0 +1,46 @@
1
+ """
2
+ This script is used to produce teacher audio files for the given examples using OpenAI's Text-to-Speech API.
3
+ The produced audio files are saved in the `audios/teacher` directory.
4
+ """
5
+
6
+
7
+ from pathlib import Path
8
+
9
+ from openai import OpenAI
10
+
11
+ from pronunciation_trainer.config import openai_api_key
12
+ from pronunciation_trainer.loading import load_data
13
+ from pronunciation_trainer.rich_logging import log
14
+
15
+ data = load_data()
16
+
17
+ client = OpenAI(api_key=openai_api_key)
18
+
19
+
20
+ def produce_teacher_audio(audio_name, text):
21
+ """
22
+ Produce a teacher audio file for the given text using OpenAI's Text-to-Speech API.
23
+ See: https://platform.openai.com/docs/guides/text-to-speech
24
+ """
25
+
26
+ speech_file_path = Path(f"audios/teacher/{audio_name}")
27
+ response = client.audio.speech.create(
28
+ model="tts-1-hd",
29
+ voice="alloy",
30
+ input=text,
31
+ )
32
+
33
+ response.stream_to_file(speech_file_path)
34
+ log.info(
35
+ f"Successfully produced teacher audio for {text=} at {speech_file_path.name=} 🎉"
36
+ )
37
+
38
+
39
+ if __name__ == "__main__":
40
+ # Produce teacher/ground-truth audio files for the given examples
41
+ data = load_data()
42
+ for datum in data:
43
+ produce_teacher_audio(datum["learner_recording"], datum["text_to_record"])
44
+
45
+ # Produce an additional example
46
+ produce_teacher_audio("book.wav", "The book is on the table")
src/pronunciation_trainer/transcription.py ADDED
@@ -0,0 +1,44 @@
1
+ """
2
+ This module contains the transcribe function, which uses the Hugging Face pipeline to transcribe audio to text.
3
+
4
+ The transcribe function takes a single parameter, audio, which is a numpy array of the audio the user recorded.
5
+
6
+ There are two transcriber choices available: grapheme and phoneme. The grapheme transcriber uses the openai/whisper-base.en model, while the phoneme transcriber uses the facebook/wav2vec2-lv-60-espeak-cv-ft model.
7
+ """
8
+ from enum import StrEnum
9
+ from functools import partial
10
+
11
+ import numpy as np
12
+ from transformers import pipeline
13
+
14
+
15
+ class TranscriberChoice(StrEnum):
16
+ grapheme = "openai/whisper-base.en"
17
+ phoneme = "facebook/wav2vec2-lv-60-espeak-cv-ft"
18
+
19
+
20
+ def transcribe(
21
+ audio, transcriber_choice: TranscriberChoice = TranscriberChoice.grapheme
22
+ ):
23
+ """
24
+ The transcribe function takes a single parameter, audio, which is a numpy array of the audio the user recorded.
25
+ The pipeline object expects this in float32 format, so we first convert it to float32 and then extract the transcribed text.
26
+ """
27
+ transcriber = pipeline("automatic-speech-recognition", model=transcriber_choice)
28
+ try:
29
+ sr, y = audio
30
+ print(f"Sampling rate is {sr}")
31
+ except TypeError:
32
+ return None
33
+ y = y.astype(np.float32)
34
+ y /= np.max(np.abs(y))
35
+ transcription = transcriber({"sampling_rate": sr, "raw": y})["text"]
36
+ return transcription
37
+
38
+
39
+ transcribe_to_phonemes = partial(
40
+ transcribe, transcriber_choice=TranscriberChoice.phoneme
41
+ )
42
+ transcribe_to_graphemes = partial(
43
+ transcribe, transcriber_choice=TranscriberChoice.grapheme
44
+ )