Commit 20943e6 by pwenker (0 parents)

Initialize
.gitignore ADDED
@@ -0,0 +1,162 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
.python-version ADDED
@@ -0,0 +1 @@
 
README.md ADDED
@@ -0,0 +1,69 @@
1
+ # Pronunciation Trainer 🗣️
2
+
3
+ This repository/app showcases how a [phoneme-based pronunciation trainer](docs/phoneme_based_solution.md) (including personalized LLM-based feedback) overcomes the limitations of a [grapheme-based approach](docs/grapheme_based_solution.md).
4
+
5
+ | Feature | Grapheme-Based Solution | Phoneme-Based Solution |
6
+ |-----------------------------------|----------------------------------------------------------|---------------------------------------------------------|
7
+ | **Input Type** | Text transcriptions of speech | Audio files and phoneme transcriptions |
8
+ | **Feedback Mechanism** | Comparison of grapheme sequences | Comparison of phoneme sequences and advanced LLM-based feedback |
9
+ | **Technological Approach** | Simple text comparison using `SequenceMatcher` | Advanced ASR models like Wav2Vec2 for phoneme recognition |
10
+ | **Feedback Detail** | Basic similarity score and diff | Detailed phoneme comparison, LLM-based feedback including motivational and corrective elements |
11
+ | **Error Sensitivity** | Sensitive to homophones and transcription errors | More accurate in capturing pronunciation nuances |
12
+ | **Suprasegmental Features** | Does not capture stress, intonation, etc. | Potentially captures them through phoneme dynamics and advanced evaluation |
13
+ | **Personalization** | Limited to error feedback based on text similarity | Advanced personalization considering learner's native language and target language proficiency |
14
+ | **Scalability** | Easy to scale with basic text processing tools | Requires more computational resources for ASR and LLM processing |
15
+ | **Cost** | Lower, primarily involves basic computational resources | Higher, due to usage of advanced APIs and model processing |
16
+ | **Accuracy** | Lower, prone to misinterpretations of homophones | Higher, better at handling diverse pronunciation patterns (though subject to LLM hallucinations) |
17
+ | **Feedback Quality** | Basic, often not linguistically rich | Rich, detailed, personalized, and linguistically informed |
18
+ | **Potential for Learning** | Limited to recognizing text differences | High, includes phonetic and prosodic feedback, as well as resource and practice recommendations |
19
+
20
+ ## Quickstart 🚀
21
+
22
+ ### 👉 Click here to try out the app directly:
23
+ [**Pronunciation Trainer App**](https://pwenker-pronunciation-trainer.hf.space/)
24
+
25
+ ### 🔍 Inspect the code at:
26
+ - **GitHub:** [pwenker/pronunciation_trainer](https://github.com/pwenker/pronounciation_trainer)
27
+ - **Hugging Face Spaces:** [pwenker/pronunciation_trainer](https://huggingface.co/spaces/pwenker/pronounciation_trainer)
28
+
29
+ ## Local Deployment 🏠
30
+
31
+ ### Prerequisites 📋
32
+
33
+ #### Rye 🌾
34
+ [Install `Rye`](https://rye-up.com/guide/installation/#installing-rye)
35
+ > Rye is a comprehensive tool designed for Python developers. It simplifies your workflow by managing Python installations and dependencies. Simply install Rye, and it takes care of the rest.
36
+
37
+ #### OPENAI API Token 🔑
+
+ Create a `.env` file in the `pronunciation_trainer` folder and add the following variable:
40
+ ```
41
+ OPENAI_API_KEY=... # API key for the OpenAI API
42
+ ```
43
+
44
+ ### Set-Up 🛠️
45
+
46
+ Clone the repository:
47
+ ```
48
+ git clone [repository-url] # Replace [repository-url] with the actual URL of the repository
49
+ ```
50
+ Navigate to the directory:
51
+ ```
52
+ cd pronunciation_trainer
53
+ ```
54
+
55
+ Create a virtual environment in `.venv` and synchronize the repo:
56
+ ```
57
+ rye sync
58
+ ```
59
+ For more details, visit: [Basics - Rye](https://rye-up.com/guide/basics/)
60
+
61
+ ### Start the App 🌟
62
+
63
+ Launch the app using:
64
+ ```
65
+ rye run python src/pronunciation_trainer/app.py
66
+ ```
67
+
68
+ Then, open your browser and visit [http://localhost:7860](http://localhost:7860/) to start practicing!
69
+
audios/learner/book.aac ADDED
Binary file (8.93 kB)
 
audios/learner/euros.wav ADDED
Binary file (709 kB)
 
audios/learner/interesting.wav ADDED
Binary file (246 kB)
 
audios/learner/today.wav ADDED
Binary file (721 kB)
 
audios/learner/won.wav ADDED
Binary file (195 kB)
 
audios/learner/youtube.wav ADDED
Binary file (330 kB)
 
audios/teacher/book.wav ADDED
Binary file (25.9 kB)
 
audios/teacher/euros.wav ADDED
Binary file (34.1 kB)
 
audios/teacher/interesting.wav ADDED
Binary file (14.9 kB)
 
audios/teacher/today.wav ADDED
Binary file (62.4 kB)
 
audios/teacher/won.wav ADDED
Binary file (10.1 kB)
 
audios/teacher/youtube.wav ADDED
Binary file (30.2 kB)
 
docs/assets/date_example_grapheme_based.png ADDED
docs/assets/date_example_phoneme_based.png ADDED
docs/assets/grapheme_based_pronunciation.png ADDED
docs/assets/phoneme_based_pronunciation_feedback_example.png ADDED
docs/assets/phoneme_based_pronunciation_input_example.png ADDED
docs/assets/phoneme_based_pronunciation_interface.png ADDED
docs/grapheme_based_solution.md ADDED
@@ -0,0 +1,125 @@
1
+ # Basic Grapheme-Based Solution
2
+
3
+ ![Grapheme-based pronunciation](assets/grapheme_based_pronunciation.png)
4
+
5
+ ## Approach
6
+
7
+ In this simple solution, we load the grapheme transcription of the learner recording as well as the ground-truth transcription and compute
+ a similarity ratio and a diff between the graphemes. Based on the similarity score, simple feedback is then generated to inform the learner about their performance.
9
+
10
+ Before comparing the graphemes, we use a very simple normalization procedure.
11
+
12
+ ```python
13
+ def normalize_texts(actual: str, expected: str) -> list[str]:
14
+ """Normalize two input texts by converting them to lower case and stripping whitespace.
15
+
16
+ Note: This normalization function is very simple and only here to demonstrate the general necessity of normalization
17
+ """
18
+
19
+ return [text.lower().strip() for text in [actual, expected]]
20
+ ```
21
+
22
+ The similarity score and diff are produced with the `difflib.SequenceMatcher`/`difflib.Differ`:
23
+
24
+ ```python
25
+ def compare_phrases(expected: str, actual: str) -> float:
26
+ """Calculate the similarity ratio between two phrases."""
27
+ return SequenceMatcher(None, expected, actual).ratio()
28
+
29
+
30
+ def diff_phrases(expected: str, actual: str) -> list[Tuple[str, Optional[str]]]:
31
+ """Generate a diff between two phrases."""
32
+ differ = Differ()
33
+ return [
34
+ (token[2:], None if token[0] == " " else token[0])
35
+ for token in differ.compare(expected, actual)
36
+ ]
37
+ ```
38
+
39
+ and then a simple rule-based approach yields the feedback:
40
+
41
+ ```python
42
+ def generate_feedback(similarity_ratio: float) -> str:
43
+ """Generate feedback based on the similarity ratio."""
44
+ if similarity_ratio > 0.9:
45
+ return "Excellent!"
46
+ elif similarity_ratio > 0.7:
47
+ return "Good job!"
48
+ elif similarity_ratio > 0.5:
49
+ return "Not bad, but there's room for improvement."
50
+ else:
51
+ return "Please try again, focus on pronunciation and clarity."
52
+ ```
53
+
54
+ The whole grapheme-based evaluation is as simple as:
55
+
56
+
57
+ ```python
58
+ def basic_evaluation(
59
+ expected: str, actual: str, autojunk: bool = True
60
+ ) -> Tuple[float, str, list[Tuple[str, Optional[str]]]]:
61
+ """Evaluate speaking attempts by comparing expected and actual phrases."""
62
+ expected, actual = normalize_texts(expected, actual)
63
+ similarity_ratio = compare_phrases(expected, actual)
64
+ diff = diff_phrases(expected, actual)
65
+ feedback = generate_feedback(similarity_ratio)
66
+ return similarity_ratio, feedback, diff
67
+ ```
68
+
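+ For instance, calling it on the homophone pair from `learner_input.json` (an illustrative invocation; the exact ratio depends on `SequenceMatcher`'s matching blocks):
+
+ ```python
+ similarity_ratio, feedback, diff = basic_evaluation(expected="won", actual="One.")
+
+ # After normalization ("won" vs. "one.") only part of the characters match, so the
+ # ratio lands around 0.57 and the feedback reads "Not bad, but there's room for
+ # improvement." even though the pronunciation itself was perfect.
+ print(similarity_ratio, feedback)
+ ```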
69
+ #### About the `SequenceMatcher`
70
+ Note: Visit [https://docs.python.org/3/library/difflib.html](https://docs.python.org/3/library/difflib.html) for more details.
71
+
72
+ **Information**: The `SequenceMatcher` is a flexible class for comparing pairs of sequences of any type, so long as the sequence elements are hashable. The basic algorithm predates, and is a little fancier than, an algorithm published in the late 1980’s by Ratcliff and Obershelp under the hyperbolic name “gestalt pattern matching.” The idea is to find the longest contiguous matching subsequence that contains no “junk” elements; these “junk” elements are ones that are uninteresting in some sense, such as blank lines or whitespace. (Handling junk is an extension to the Ratcliff and Obershelp algorithm.) The same idea is then applied recursively to the pieces of the sequences to the left and to the right of the matching subsequence. This does not yield minimal edit sequences, but does tend to yield matches that “look right” to people.
73
+
74
+ **Timing**: The basic Ratcliff-Obershelp algorithm is cubic time in the worst case and quadratic time in the expected case. SequenceMatcher is quadratic time for the worst case and has expected-case behavior dependent in a complicated way on how many elements the sequences have in common; best case time is linear.
75
+
76
+ **Automatic junk heuristic**: SequenceMatcher supports a heuristic that automatically treats certain sequence items as junk. The heuristic counts how many times each individual item appears in the sequence. If an item’s duplicates (after the first one) account for more than 1% of the sequence and the sequence is at least 200 items long, this item is marked as “popular” and is treated as junk for the purpose of sequence matching. This heuristic can be turned off by setting the autojunk argument to False when creating the SequenceMatcher.
77
+
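+ A minimal sketch (not used in the app) of turning this heuristic off; note that it only takes effect for sequences of at least 200 elements, so the repeated strings below are purely illustrative:
+
+ ```python
+ from difflib import SequenceMatcher
+
+ expected = "it is nice to recognize speech. " * 10   # > 200 characters, so the heuristic applies
+ actual = "it is nice to wreck a nice beach. " * 10
+
+ # Default behaviour: very frequent characters (such as spaces) may be marked as "popular" junk
+ print(SequenceMatcher(None, expected, actual).ratio())
+ # autojunk=False considers every character when searching for matching blocks
+ print(SequenceMatcher(None, expected, actual, autojunk=False).ratio())
+ ```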
78
+ ## Limitations of Grapheme-Based Approach
79
+
80
+ ### Analysis of Transcription Pairs
81
+
82
+ #### Pair 1: "interesting" vs. "Interesting."
83
+ - **Teacher Transcription:** "interesting"
84
+ - **Learner Transcription:** "Interesting."
85
+
86
+ **Analysis:**
87
+ - The learner's transcription starts with a capital "I" and ends with a period. This could be remedied by using a simple normalization function.
88
+ - The capitalization and punctuation are transcription errors but do not necessarily indicate a pronunciation error. This highlights a limitation in assessing pronunciation purely from written transcriptions without audio context.
89
+ - Further, without audio, it's impossible to assess aspects like stress, intonation, or subtle phonetic variations (e.g., the reduction of unstressed vowels or the precise articulation of consonants).
90
+
91
+ #### Pair 2: "won" vs. "One."
92
+ - **Teacher Transcription:** "won"
93
+ - **Learner Transcription:** "One."
94
+
95
+ **Analysis:**
96
+ - This is an example where grapheme-based transcription fails to capture the intended meaning, as it cannot differentiate homophones based solely on sounds
97
+ (The words "won" and "one" are homophones in English: they are pronounced the same way but differ in meaning and spelling.)
98
+
99
+ #### Pair 3: "Today is the thirteenth of May, twenty twenty-three." vs. "Today is the 13th of May, 2023."
100
+ - **Teacher Transcription:** "Today is the thirteenth of May, twenty twenty-three."
101
+ - **Learner Transcription:** "Today is the 13th of May, 2023."
102
+
103
+ **Analysis:**
104
+ - While there is no indication of pronunciation errors (both transcriptions likely sound identical), the discrepancy lies in how dates and numbers are represented in the transcription.
105
+ - This again demonstrates that grapheme-based approaches are affected by transcription errors/differences.
106
+
107
+ #### Pair 5: "I have two and a half euros." vs. "I have, I have €2.5."
108
+ - **Teacher Transcription:** "I have two and a half euros."
109
+ - **Learner Transcription:** "I have, I have €2.5."
110
+
111
+ **Analysis:**
112
+ - The disfluency in the learner's transcription ("I have, I have") could be stripped by preprocessing before the evaluation to ensure a better comparison.
113
+ - The discrepancy between "€2.5" and "two and a half euros" again demonstrates the problems of grapheme-based transcription.
114
+
115
+ ### Limitations of a Grapheme-Based Approach
116
+
117
+ A grapheme-based approach primarily focuses on the written symbols of a language (letters and numbers) and their standard pronunciations. However, this approach has several limitations. For example:
118
+
119
+ 1. **Homophones and Homographs:** Words like "won" and "one" illustrate how identical pronunciations can lead to a bad evaluation score when solely relying on graphemes. Similarly, words spelled the same but pronounced differently ("read" (present) vs. "read" (past)) could lead to perfect grapheme scores, although the pronunciation was off.
120
+
121
+ 2. **Suprasegmental Features:** Elements such as stress, rhythm, intonation, and pitch are crucial in spoken language but are not captured by graphemes. For example, the word "interesting" in the learner audio was pronounced with an incorrect stress pattern.
122
+
123
+ ### Remedy through Phoneme-based Approach
124
+
125
+ To address these limitations, as well as to provide more personalized and constructive feedback, we create a more advanced [phoneme-based solution](phoneme_based_solution.md).
docs/phoneme_based_solution.md ADDED
@@ -0,0 +1,213 @@
1
+ # Phoneme-based Pronunciation Trainer
2
+
3
+ ## Interface
4
+ ![Phoneme-based pronunciation interface](assets/phoneme_based_pronunciation_interface.png)
5
+
6
+ ## Feedback Example
7
+ ![Phoneme-based pronunciation feedback example](assets/phoneme_based_pronunciation_feedback_example.png)
8
+
9
+ ## Approach
10
+
11
+ ### Create teacher/ground truth audio files
12
+
13
+ We start by creating teacher audio files using OpenAI's Text-to-Speech API:
14
+
15
+ ```python
16
+ def produce_teacher_audio(audio_name, text):
17
+ """
18
+ Produce a teacher audio file for the given text using OpenAI's Text-to-Speech API.
19
+ See: https://platform.openai.com/docs/guides/text-to-speech
20
+ """
21
+
22
+ speech_file_path = Path(f"audios/teacher/{audio_name}")
23
+ response = client.audio.speech.create(
24
+ model="tts-1-hd",
25
+ voice="alloy",
26
+ input=text,
27
+ )
28
+
29
+ response.stream_to_file(speech_file_path)
30
+ log.info(
31
+ f"Successfully produced teacher audio for {text=} at {speech_file_path.name=} 🎉"
32
+ )
33
+
34
+
35
+ if __name__ == "__main__":
36
+ # Produce teacher/ground-truth audio files for the given examples
37
+ data = load_data()
38
+ for datum in data:
39
+ produce_teacher_audio(datum["learner_recording"], datum["text_to_record"])
40
+
41
+ # Produce an additional example
42
+ produce_teacher_audio("book.wav", "The book is on the table")
43
+ ```
44
+
45
+
46
+ ### Audio-based and Personalized Input
47
+
48
+ Given the newly created teacher audios, we can now use both learner and teacher audios as input to our system
49
+ and thereby avoid the transcription-based limitations that we described in the [grapheme-based solution](grapheme_based_solution.md).
50
+
51
+ As you can see in the following image, we also supply the native language of the learner and the language they want to acquire.
52
+
53
+ ![Phoneme-Based Pronunciation Input Example](assets/phoneme_based_pronunciation_input_example.png)
54
+
55
+ ### Phoneme-Based ASR
56
+
57
+ We use a [Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme) model as proposed in [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680)
58
+
59
+ > Recent progress in self-training, self-supervised pretraining and unsupervised learning enabled well performing speech recognition systems without any labeled data. However, in many cases there is labeled data available for related languages which is not utilized by these methods. This paper extends previous work on zero-shot cross-lingual transfer learning by fine-tuning a multilingually pretrained wav2vec 2.0 model to transcribe unseen languages. This is done by mapping phonemes of the training languages to the target language using articulatory features. Experiments show that this simple method significantly outperforms prior work which introduced task-specific architectures and used only part of a monolingually pretrained model.
60
+
61
+
62
+ In particular, we use the following checkpoint: [facebook/wav2vec2-lv-60-espeak-cv-ft · Hugging Face](https://huggingface.co/facebook/wav2vec2-lv-60-espeak-cv-ft).
63
+ > This checkpoint leverages the pretrained checkpoint wav2vec2-large-lv60 and is fine-tuned on CommonVoice to recognize phonetic labels in multiple languages.
64
+
65
+ For convenience, we create a partial `transcribe_to_phonemes` function as an interface to this checkpoint:
66
+
67
+ ```python
68
+ class TranscriberChoice(StrEnum):
69
+ grapheme = "openai/whisper-base.en"
70
+ phoneme = "facebook/wav2vec2-lv-60-espeak-cv-ft"
71
+
72
+
73
+ def transcribe(
74
+ audio, transcriber_choice: TranscriberChoice = TranscriberChoice.grapheme
75
+ ):
76
+ """
77
+ The transcribe function takes a single parameter, audio, which is a numpy array of the audio the user recorded.
78
+ The pipeline object expects this in float32 format, so we first convert it to float32 and then extract the transcribed text.
79
+ """
80
+ transcriber = pipeline("automatic-speech-recognition", model=transcriber_choice)
81
+ try:
82
+ sr, y = audio
83
+ print(f"Sampling rate is {sr}")
84
+ except TypeError:
85
+ return None
86
+ y = y.astype(np.float32)
87
+ y /= np.max(np.abs(y))
88
+ transcription = transcriber({"sampling_rate": sr, "raw": y})["text"]
89
+ return transcription
90
+
91
+
92
+ transcribe_to_phonemes = partial(
93
+ transcribe, transcriber_choice=TranscriberChoice.phoneme
94
+ )
95
+ transcribe_to_graphemes = partial(
96
+ transcribe, transcriber_choice=TranscriberChoice.grapheme
97
+ )
98
+ ```
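+ A quick local sanity check could look as follows (a sketch only; the random signal merely stands in for the `(sampling_rate, numpy_array)` tuple that gradio's `Audio` component passes to these functions):
+
+ ```python
+ import numpy as np
+
+ # Fabricate one second of 16 kHz noise purely to illustrate the expected input shape;
+ # in the app, gradio supplies this tuple from the microphone or an uploaded file.
+ dummy_audio = (16000, np.random.default_rng(0).normal(size=16000).astype(np.float32))
+
+ print(transcribe_to_phonemes(dummy_audio))   # phoneme string (meaningful only for real speech)
+ print(transcribe_to_graphemes(dummy_audio))  # grapheme string from the Whisper checkpoint
+ ```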
99
+ ### Simple Evaluation based on `SequenceMatcher`
100
+
101
+ We can again apply the `SequenceMatcher`, but this time compare the phoneme transcriptions of teacher and learner audio.
102
+
103
+ As illustrated here for the date example, we get a much better similarity score:
104
+
105
+ #### Grapheme-based
106
+ ![date_example_grapheme_based.png](assets/date_example_grapheme_based.png)
107
+
108
+ #### Phoneme-based
109
+ ![date_example_phoneme_based.png](assets/date_example_phoneme_based.png)
110
+
111
+ But the feedback message is still not very helpful.
112
+
113
+
114
+ ### Advanced Evaluation based on LLM
115
+
116
+ Much more powerful, however, is an evaluation that leverages an LLM (`gpt-4-turbo`).
117
+
118
+
119
+ We create a simple LLM chain
120
+
121
+ ```python
122
+ prompt = ChatPromptTemplate.from_template(Path("prompt.md").read_text())
123
+ output_parser = StrOutputParser()
124
+
125
+
126
+ def create_llm(openai_api_key=openai_api_key):
127
+ if openai_api_key in [None, ""]:
128
+ raise gr.Error(
129
+ "No API key provided! You can find your API key at https://platform.openai.com/account/api-keys."
130
+ )
131
+ llm = ChatOpenAI(model="gpt-4-turbo", openai_api_key=openai_api_key)
132
+ return llm
133
+
134
+
135
+ def create_llm_chain(prompt=prompt, output_parser=output_parser, openai_api_key=openai_api_key):
136
+ if openai_api_key in [None, ""]:
137
+ raise gr.Error(
138
+ """No API key provided! You can find your API key at https://platform.openai.com/account/api-keys."""
139
+ )
140
+ llm = ChatOpenAI(model="gpt-4-turbo", openai_api_key=openai_api_key)
141
+ llm_chain = prompt | llm | output_parser
142
+ return llm_chain
143
+ ```
144
+
145
+ and ingest the following inputs:
146
+
147
+ ```python
148
+ def advanced_evaluation(
149
+ learner_l1,
150
+ learner_l2,
151
+ learner_phoneme_transcription,
152
+ teacher_phoneme_transcription,
153
+ ) -> str:
154
+ """Provide LLM-based feedback"""
155
+ return create_llm_chain().invoke(
156
+ {
157
+ "learner_l1": learner_l1,
158
+ "learner_l2": learner_l2,
159
+ "learner_phoneme_transcription": learner_phoneme_transcription,
160
+ "teacher_phoneme_transcription": teacher_phoneme_transcription,
161
+ }
162
+ )
163
+ ```
164
+
165
+ into an LLM prompt template that can be found here:
166
+ [Enhanced Prompt Template for Language Model Expert with Motivational Elements and Language-Specific Feedback](prompt.md).
167
+
168
+
169
+ ## Limitations & Outlook
170
+
171
+
172
+ ### Improve ASR Model
173
+
174
+ Due to time constraints, I selected the first phoneme recognition model that I found on Hugging Face.
175
+ With more time, one could
176
+ - Experiment with different checkpoints at [Phoneme Recognition Models - Hugging Face](https://huggingface.co/models?other=phoneme-recognition)
177
+ - Adapt OpenAI's Whisper model to phoneme recognition/transcription by simply changing the tokenizer to handle the new vocabulary (the set of phonemes),
178
+ and fine-tuning the model on an (audio, phoneme) dataset with an appropriate metric. See [openai/whisper · Phoneme recognition](https://huggingface.co/spaces/openai/whisper/discussions/86) for a short discussion about it.
179
+ - Employ a model like [m-bain/whisperX: WhisperX](https://github.com/m-bain/whisperX) and possibly fine-tune it, to achieve word-level timestamps & diarization.
180
+
181
+ Further, the output of the ASR model could be enhanced by grouping phonemes (to allow for better word-level feedback and alignment) and by adding better prosodic/suprasegmental support.
182
+
183
+
184
+ ### Improve LLM prompt
185
+
186
+ Again due to time constraints, I created a single prompt template.
187
+ Further prompt engineering and metaprompting could
188
+ - Reduce hallucinations
189
+ - Create more didactically sound feedback, e.g. divided into different feedback sections such as
190
+ - Place. The place of articulation is where a sound is made.
191
+ - Manner. The manner of articulation is how a sound is made.
192
+ - Voicing. Voice or voicing refers to the vibration of the vocal folds.
193
+ - Recommend fitting exercises and content from babbel.com
194
+
195
+ ### Improve UI/feedback time
196
+
197
+ The LLM response currently takes some time. Among many ways to tackle this problem, one could:
198
+ - Stream the response for immediate feedback and better UX (see the sketch after this list)
199
+ - Use clever caching for immediate responses
200
+ - Collect several attempts and only provide the LLM feedback on an aggregate of attempts (for example in a dedicated pronunciation trainer section)
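+ As a rough illustration of the first point, a generator-based variant of `advanced_evaluation` could stream partial feedback into the existing `llm_evaluation` Markdown component. This is only a sketch and not part of the current code; it assumes the LangChain chain's `.stream()` method and gradio's support for generator event handlers:
+
+ ```python
+ def advanced_evaluation_streaming(
+     learner_l1,
+     learner_l2,
+     learner_phoneme_transcription,
+     teacher_phoneme_transcription,
+ ):
+     """Yield the LLM feedback incrementally so gradio can render it as it arrives."""
+     partial_feedback = ""
+     for chunk in create_llm_chain().stream(
+         {
+             "learner_l1": learner_l1,
+             "learner_l2": learner_l2,
+             "learner_phoneme_transcription": learner_phoneme_transcription,
+             "teacher_phoneme_transcription": teacher_phoneme_transcription,
+         }
+     ):
+         partial_feedback += chunk
+         yield partial_feedback
+ ```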
201
+
202
+
203
+ ### Personalization
204
+
205
+ The personalization is very limited, as it only looks at the learner's L1 and L2.
206
+ We could further:
207
+
208
+ - Compare the current attempt with previous attempts in order to show progress or regression (see the sketch after this list). This could be especially motivating if the learner is still far from a perfect pronunciation but steadily improving.
209
+ - Include additional learner information, such as preferences and proficiency.
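+ A sketch of the first point (hypothetical helper, not part of the repository), keeping a per-phrase history of similarity scores and reporting the trend alongside the feedback:
+
+ ```python
+ from collections import defaultdict
+
+ # phrase -> similarity ratios of previous attempts (hypothetical in-memory store)
+ attempt_history: dict[str, list[float]] = defaultdict(list)
+
+
+ def record_attempt(phrase: str, similarity_ratio: float) -> str:
+     """Store the new score and describe progress relative to the previous attempt."""
+     history = attempt_history[phrase]
+     history.append(similarity_ratio)
+     if len(history) < 2:
+         return "First attempt recorded - keep going!"
+     delta = history[-1] - history[-2]
+     if delta > 0:
+         return f"Nice, you improved by {delta:.0%} compared to your last attempt!"
+     return "No improvement this time - try focusing on the highlighted phonemes."
+ ```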
210
+
211
+ ### Alternative phoneme-based feedback
212
+ - Instead of, or complementary to, employing an LLM for advanced and personalized feedback, we could provide scores and feedback based on a distance measure between phonemes.
213
+ - Among a variety of possible distances, a simple starting point could be a 3-D distance of the place of articulation (where the sound is made).
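+
+ As a rough illustration of this last idea (purely a sketch with made-up articulatory coordinates, not a linguistically validated feature set), each phoneme could be mapped to a coarse 3-D vector of place, manner, and voicing, and pairs scored by Euclidean distance:
+
+ ```python
+ import math
+
+ # Hypothetical articulatory coordinates: (place, manner, voicing), each scaled to [0, 1];
+ # voicing is deliberately given a small weight so that pure voicing errors count as near misses.
+ ARTICULATORY_FEATURES = {
+     "p": (0.0, 0.0, 0.0),  # bilabial plosive, voiceless
+     "b": (0.0, 0.0, 0.2),  # bilabial plosive, voiced
+     "t": (0.4, 0.0, 0.0),  # alveolar plosive, voiceless
+     "d": (0.4, 0.0, 0.2),  # alveolar plosive, voiced
+     "s": (0.4, 0.5, 0.0),  # alveolar fricative, voiceless
+     "z": (0.4, 0.5, 0.2),  # alveolar fricative, voiced
+ }
+
+
+ def phoneme_distance(a: str, b: str) -> float:
+     """Euclidean distance between two phonemes in the toy articulatory space."""
+     return math.dist(ARTICULATORY_FEATURES[a], ARTICULATORY_FEATURES[b])
+
+
+ # Substituting /b/ for /p/ (a voicing error) is a smaller mistake than substituting /s/ for /p/.
+ assert phoneme_distance("p", "b") < phoneme_distance("p", "s")
+ ```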
docs/prompt.md ADDED
@@ -0,0 +1,47 @@
1
+ ### Enhanced Prompt Template for Language Model Expert with Motivational Elements and Language-Specific Feedback
2
+
3
+ **Context:** You are tasked with assisting a {learner_l1} language learner who is struggling with {learner_l2} pronunciation. The learner attempted to say "{teacher_phoneme_transcription}" but instead pronounced it as "{learner_phoneme_transcription}". Your goal is to provide detailed, constructive feedback on how to improve their pronunciation, focusing on the specific sounds they are mispronouncing.
4
+
5
+ **Instructions:** Write a comprehensive and motivational feedback report that identifies the pronunciation errors, explains why these errors might be occurring from a linguistic perspective related to the learner's native language, and offers practical exercises to correct these errors. Include phonetic transcriptions where necessary and suggest any useful resources or techniques for pronunciation practice.
6
+
7
+ **Details:**
8
+ - **Length:** The feedback should be concise yet thorough, ideally not exceeding 500 words.
9
+ - **Format:** Use a structured format with clear headings for each section (e.g., Introduction, Error Analysis, Corrective Actions, Additional Resources, Words of Encouragement).
10
+ - **Style:** Maintain a supportive, educational, and motivational tone throughout the feedback. Use emojis to make the feedback more engaging.
11
+ - **Outcome:** The learner should have a clear understanding of their pronunciation mistakes and feel equipped and motivated with specific strategies to improve.
12
+
13
+ **Example of Expected Output:**
14
+
15
+ ```
16
+ Feedback on Pronunciation Errors and Improvement Strategies 🌟
17
+
18
+ Introduction:
19
+ Hello! Great effort on your journey to mastering {learner_l2} pronunciation. This feedback is designed to help you refine your skills and embrace the nuances of {learner_l2}, especially focusing on the challenges posed by your native language, {learner_l1}.
20
+
21
+ Error Analysis:
22
+ 1. Confusion between /ɹɛkəɡnaɪz/ and /ɹɛk/:
23
+ - Error: You substituted the complex consonant cluster /ɡn/ in "recognize" with a simpler /k/ sound in "wreck."
24
+ - Possible Cause: The /ɡn/ cluster might be challenging due to its rarity in {learner_l1}.
25
+
26
+ 2. Mispronunciation of /spiːtʃ/ as /biːtʃ/:
27
+ - Error: You articulated the initial consonant sound incorrectly, substituting /sp/ with /b/.
28
+ - Possible Cause: The /sp/ cluster may be particularly challenging as it does not commonly occur in {learner_l1} phonetic patterns.
29
+
30
+ Corrective Actions:
31
+ 1. For /ɹɛkəɡnaɪz/ vs. /ɹɛk/:
32
+ - Practice slowly pronouncing /ɡn/ in isolation, then gradually in the word "recognize."
33
+ - Record yourself and compare with native speakers using online pronunciation tools like Forvo.
34
+
35
+ 2. For /spiːtʃ/ vs. /biːtʃ/:
36
+ - Focus on differentiating /s/ and /b/ sounds. Begin by pronouncing /s/ and /b/ separately, then in simple words, and finally in the target phrase.
37
+ - Engage in minimal pairs exercises, practicing pairs of words that differ only in the initial sound (e.g., "spit" vs. "bit").
38
+
39
+ Additional Resources:
40
+ - Explore [resource link] for interactive pronunciation exercises tailored to {learner_l1} speakers.
41
+ - Consider consulting a speech therapist specializing in accent reduction for personalized guidance.
42
+
43
+ Words of Encouragement:
44
+ Keep up the fantastic work! 🚀 Remember, every mistake is a stepping stone towards your success. Stay persistent, practice regularly, and keep a positive mindset. You're doing wonderfully, and with continued effort, you'll achieve your pronunciation goals! 💪
45
+
46
+ Conclusion:
47
+ By focusing on these specific areas and consistently practicing, you will see significant improvement in your pronunciation. We believe in you! 🌟
learner_input.json ADDED
@@ -0,0 +1,27 @@
1
+ [
2
+ {
3
+ "text_to_record": "interesting",
4
+ "learner_recording": "interesting.wav",
5
+ "sr_transcript_of_learner_recording": "Interesting."
6
+ },
7
+ {
8
+ "text_to_record": "won",
9
+ "learner_recording": "won.wav",
10
+ "sr_transcript_of_learner_recording": "One."
11
+ },
12
+ {
13
+ "text_to_record": "Today is the thirteenth of May, twenty twenty three.",
14
+ "learner_recording": "today.wav",
15
+ "sr_transcript_of_learner_recording": "Today is the 13th of May, 2023."
16
+ },
17
+ {
18
+ "text_to_record": "I like to watch Youtube.",
19
+ "learner_recording": "youtube.wav",
20
+ "sr_transcript_of_learner_recording": "I like to watch you too."
21
+ },
22
+ {
23
+ "text_to_record": "I have two and a half euros.",
24
+ "learner_recording": "euros.wav",
25
+ "sr_transcript_of_learner_recording": "I have a, I have €2.5."
26
+ }
27
+ ]
prompt.md ADDED
@@ -0,0 +1,47 @@
1
+ ### Enhanced Prompt Template for Language Model Expert with Motivational Elements and Language-Specific Feedback
2
+
3
+ **Context:** You are tasked with assisting a {learner_l1} language learner who is struggling with {learner_l2} pronunciation. The learner attempted to say "{teacher_phoneme_transcription}" but instead pronounced it as "{learner_phoneme_transcription}". Your goal is to provide detailed, constructive feedback on how to improve their pronunciation, focusing on the specific sounds they are mispronouncing.
4
+
5
+ **Instructions:** Write a comprehensive and motivational feedback report that identifies the pronunciation errors, explains why these errors might be occurring from a linguistic perspective related to the learner's native language, and offers practical exercises to correct these errors. Include phonetic transcriptions where necessary and suggest any useful resources or techniques for pronunciation practice.
6
+
7
+ **Details:**
8
+ - **Length:** The feedback should be concise yet thorough, ideally not exceeding 500 words.
9
+ - **Format:** Use a structured format with clear headings for each section (e.g., Introduction, Error Analysis, Corrective Actions, Additional Resources, Words of Encouragement).
10
+ - **Style:** Maintain a supportive, educational, and motivational tone throughout the feedback. Use emojis to make the feedback more engaging.
11
+ - **Outcome:** The learner should have a clear understanding of their pronunciation mistakes and feel equipped and motivated with specific strategies to improve.
12
+
13
+ **Example of Expected Output:**
14
+
15
+ ```
16
+ Feedback on Pronunciation Errors and Improvement Strategies 🌟
17
+
18
+ Introduction:
19
+ Hello! Great effort on your journey to mastering {learner_l2} pronunciation. This feedback is designed to help you refine your skills and embrace the nuances of {learner_l2}, especially focusing on the challenges posed by your native language, {learner_l1}.
20
+
21
+ Error Analysis:
22
+ 1. Confusion between /ɹɛkəɡnaɪz/ and /ɹɛk/:
23
+ - Error: You substituted the complex consonant cluster /ɡn/ in "recognize" with a simpler /k/ sound in "wreck."
24
+ - Possible Cause: The /ɡn/ cluster might be challenging due to its rarity in {learner_l1}.
25
+
26
+ 2. Mispronunciation of /spiːtʃ/ as /biːtʃ/:
27
+ - Error: You articulated the initial consonant sound incorrectly, substituting /sp/ with /b/.
28
+ - Possible Cause: The /sp/ cluster may be particularly challenging as it does not commonly occur in {learner_l1} phonetic patterns.
29
+
30
+ Corrective Actions:
31
+ 1. For /ɹɛkəɡnaɪz/ vs. /ɹɛk/:
32
+ - Practice slowly pronouncing /ɡn/ in isolation, then gradually in the word "recognize."
33
+ - Record yourself and compare with native speakers using online pronunciation tools like Forvo.
34
+
35
+ 2. For /spiːtʃ/ vs. /biːtʃ/:
36
+ - Focus on differentiating /s/ and /b/ sounds. Begin by pronouncing /s/ and /b/ separately, then in simple words, and finally in the target phrase.
37
+ - Engage in minimal pairs exercises, practicing pairs of words that differ only in the initial sound (e.g., "spit" vs. "bit").
38
+
39
+ Additional Resources:
40
+ - Explore [resource link] for interactive pronunciation exercises tailored to {learner_l1} speakers.
41
+ - Consider consulting a speech therapist specializing in accent reduction for personalized guidance.
42
+
43
+ Words of Encouragement:
44
+ Keep up the fantastic work! 🚀 Remember, every mistake is a stepping stone towards your success. Stay persistent, practice regularly, and keep a positive mindset. You're doing wonderfully, and with continued effort, you'll achieve your pronunciation goals! 💪
45
+
46
+ Conclusion:
47
+ By focusing on these specific areas and consistently practicing, you will see significant improvement in your pronunciation. We believe in you! 🌟
pyproject.toml ADDED
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "pronunciation-trainer"
3
+ version = "0.1.0"
4
+ description = "Grapheme and Phoneme-Based Pronunciation Trainer"
5
+ authors = [{ name = "Pascal Wenker", email = "[email protected]" }]
6
+ dependencies = [
7
+ "gradio>=4.29.0",
8
+ "transformers>=4.40.2",
9
+ "torch>=2.3.0",
10
+ "torchaudio>=2.3.0",
11
+ "setuptools>=69.5.1",
12
+ "phonemizer>=3.2.1",
13
+ "langchain>=0.1.19",
14
+ "langchain_openai>=0.1.6",
15
+ ]
16
+ readme = "README.md"
17
+ requires-python = ">= 3.11"
18
+
19
+ [build-system]
20
+ requires = ["hatchling"]
21
+ build-backend = "hatchling.build"
22
+
23
+ [tool.rye]
24
+ managed = true
25
+ dev-dependencies = []
26
+
27
+ [tool.hatch.metadata]
28
+ allow-direct-references = true
requirements-dev.lock ADDED
@@ -0,0 +1,147 @@
1
+ # generated by rye
2
+ # use `rye lock` or `rye sync` to update this lockfile
3
+ #
4
+ # last locked with the following flags:
5
+ # pre: false
6
+ # features: []
7
+ # all-features: false
8
+
9
+ -e file:.
10
+ aiofiles==23.2.1
11
+ aiohttp==3.9.5
12
+ aiosignal==1.3.1
13
+ altair==5.3.0
14
+ annotated-types==0.6.0
15
+ anyio==4.3.0
16
+ attrs==23.2.0
17
+ babel==2.15.0
18
+ bibtexparser==2.0.0b7
19
+ certifi==2024.2.2
20
+ charset-normalizer==3.3.2
21
+ click==8.1.7
22
+ clldutils==3.22.2
23
+ colorama==0.4.6
24
+ colorlog==6.8.2
25
+ contourpy==1.2.1
26
+ csvw==3.3.0
27
+ cycler==0.12.1
28
+ dataclasses-json==0.6.5
29
+ distro==1.9.0
30
+ dlinfo==1.2.1
31
+ dnspython==2.6.1
32
+ email-validator==2.1.1
33
+ fastapi==0.111.0
34
+ fastapi-cli==0.0.3
35
+ ffmpy==0.3.2
36
+ filelock==3.14.0
37
+ fonttools==4.51.0
38
+ frozenlist==1.4.1
39
+ fsspec==2024.3.1
40
+ gradio==4.29.0
41
+ gradio-client==0.16.1
42
+ greenlet==3.0.3
43
+ h11==0.14.0
44
+ httpcore==1.0.5
45
+ httptools==0.6.1
46
+ httpx==0.27.0
47
+ huggingface-hub==0.23.0
48
+ idna==3.7
49
+ importlib-resources==6.4.0
50
+ isodate==0.6.1
51
+ jinja2==3.1.4
52
+ joblib==1.4.2
53
+ jsonpatch==1.33
54
+ jsonpointer==2.4
55
+ jsonschema==4.22.0
56
+ jsonschema-specifications==2023.12.1
57
+ kiwisolver==1.4.5
58
+ langchain==0.1.19
59
+ langchain-community==0.0.38
60
+ langchain-core==0.1.52
61
+ langchain-openai==0.1.6
62
+ langchain-text-splitters==0.0.1
63
+ langsmith==0.1.56
64
+ language-tags==1.2.0
65
+ lxml==5.2.1
66
+ markdown==3.6
67
+ markdown-it-py==3.0.0
68
+ markupsafe==2.1.5
69
+ marshmallow==3.21.2
70
+ matplotlib==3.8.4
71
+ mdurl==0.1.2
72
+ mpmath==1.3.0
73
+ multidict==6.0.5
74
+ mypy-extensions==1.0.0
75
+ networkx==3.3
76
+ numpy==1.26.4
77
+ nvidia-cublas-cu12==12.1.3.1
78
+ nvidia-cuda-cupti-cu12==12.1.105
79
+ nvidia-cuda-nvrtc-cu12==12.1.105
80
+ nvidia-cuda-runtime-cu12==12.1.105
81
+ nvidia-cudnn-cu12==8.9.2.26
82
+ nvidia-cufft-cu12==11.0.2.54
83
+ nvidia-curand-cu12==10.3.2.106
84
+ nvidia-cusolver-cu12==11.4.5.107
85
+ nvidia-cusparse-cu12==12.1.0.106
86
+ nvidia-nccl-cu12==2.20.5
87
+ nvidia-nvjitlink-cu12==12.4.127
88
+ nvidia-nvtx-cu12==12.1.105
89
+ openai==1.28.0
90
+ orjson==3.10.3
91
+ packaging==23.2
92
+ pandas==2.2.2
93
+ phonemizer==3.2.1
94
+ pillow==10.3.0
95
+ pydantic==2.7.1
96
+ pydantic-core==2.18.2
97
+ pydub==0.25.1
98
+ pygments==2.18.0
99
+ pylatexenc==2.10
100
+ pyparsing==3.1.2
101
+ python-dateutil==2.9.0.post0
102
+ python-dotenv==1.0.1
103
+ python-multipart==0.0.9
104
+ pytz==2024.1
105
+ pyyaml==6.0.1
106
+ rdflib==7.0.0
107
+ referencing==0.35.1
108
+ regex==2024.5.10
109
+ requests==2.31.0
110
+ rfc3986==1.5.0
111
+ rich==13.7.1
112
+ rpds-py==0.18.1
113
+ ruff==0.4.4
114
+ safetensors==0.4.3
115
+ segments==2.2.1
116
+ semantic-version==2.10.0
117
+ shellingham==1.5.4
118
+ six==1.16.0
119
+ sniffio==1.3.1
120
+ sqlalchemy==2.0.30
121
+ starlette==0.37.2
122
+ sympy==1.12
123
+ tabulate==0.9.0
124
+ tenacity==8.3.0
125
+ tiktoken==0.6.0
126
+ tokenizers==0.19.1
127
+ tomlkit==0.12.0
128
+ toolz==0.12.1
129
+ torch==2.3.0
130
+ torchaudio==2.3.0
131
+ tqdm==4.66.4
132
+ transformers==4.40.2
133
+ triton==2.3.0
134
+ typer==0.12.3
135
+ typing-extensions==4.11.0
136
+ typing-inspect==0.9.0
137
+ tzdata==2024.1
138
+ ujson==5.9.0
139
+ uritemplate==4.1.1
140
+ urllib3==2.2.1
141
+ uvicorn==0.29.0
142
+ uvloop==0.19.0
143
+ watchfiles==0.21.0
144
+ websockets==11.0.3
145
+ yarl==1.9.4
146
+ # The following packages are considered to be unsafe in a requirements file:
147
+ setuptools==69.5.1
requirements.lock ADDED
@@ -0,0 +1,147 @@
1
+ # generated by rye
2
+ # use `rye lock` or `rye sync` to update this lockfile
3
+ #
4
+ # last locked with the following flags:
5
+ # pre: false
6
+ # features: []
7
+ # all-features: false
8
+
9
+ -e file:.
10
+ aiofiles==23.2.1
11
+ aiohttp==3.9.5
12
+ aiosignal==1.3.1
13
+ altair==5.3.0
14
+ annotated-types==0.6.0
15
+ anyio==4.3.0
16
+ attrs==23.2.0
17
+ babel==2.15.0
18
+ bibtexparser==2.0.0b7
19
+ certifi==2024.2.2
20
+ charset-normalizer==3.3.2
21
+ click==8.1.7
22
+ clldutils==3.22.2
23
+ colorama==0.4.6
24
+ colorlog==6.8.2
25
+ contourpy==1.2.1
26
+ csvw==3.3.0
27
+ cycler==0.12.1
28
+ dataclasses-json==0.6.5
29
+ distro==1.9.0
30
+ dlinfo==1.2.1
31
+ dnspython==2.6.1
32
+ email-validator==2.1.1
33
+ fastapi==0.111.0
34
+ fastapi-cli==0.0.3
35
+ ffmpy==0.3.2
36
+ filelock==3.14.0
37
+ fonttools==4.51.0
38
+ frozenlist==1.4.1
39
+ fsspec==2024.3.1
40
+ gradio==4.29.0
41
+ gradio-client==0.16.1
42
+ greenlet==3.0.3
43
+ h11==0.14.0
44
+ httpcore==1.0.5
45
+ httptools==0.6.1
46
+ httpx==0.27.0
47
+ huggingface-hub==0.23.0
48
+ idna==3.7
49
+ importlib-resources==6.4.0
50
+ isodate==0.6.1
51
+ jinja2==3.1.4
52
+ joblib==1.4.2
53
+ jsonpatch==1.33
54
+ jsonpointer==2.4
55
+ jsonschema==4.22.0
56
+ jsonschema-specifications==2023.12.1
57
+ kiwisolver==1.4.5
58
+ langchain==0.1.19
59
+ langchain-community==0.0.38
60
+ langchain-core==0.1.52
61
+ langchain-openai==0.1.6
62
+ langchain-text-splitters==0.0.1
63
+ langsmith==0.1.56
64
+ language-tags==1.2.0
65
+ lxml==5.2.1
66
+ markdown==3.6
67
+ markdown-it-py==3.0.0
68
+ markupsafe==2.1.5
69
+ marshmallow==3.21.2
70
+ matplotlib==3.8.4
71
+ mdurl==0.1.2
72
+ mpmath==1.3.0
73
+ multidict==6.0.5
74
+ mypy-extensions==1.0.0
75
+ networkx==3.3
76
+ numpy==1.26.4
77
+ nvidia-cublas-cu12==12.1.3.1
78
+ nvidia-cuda-cupti-cu12==12.1.105
79
+ nvidia-cuda-nvrtc-cu12==12.1.105
80
+ nvidia-cuda-runtime-cu12==12.1.105
81
+ nvidia-cudnn-cu12==8.9.2.26
82
+ nvidia-cufft-cu12==11.0.2.54
83
+ nvidia-curand-cu12==10.3.2.106
84
+ nvidia-cusolver-cu12==11.4.5.107
85
+ nvidia-cusparse-cu12==12.1.0.106
86
+ nvidia-nccl-cu12==2.20.5
87
+ nvidia-nvjitlink-cu12==12.4.127
88
+ nvidia-nvtx-cu12==12.1.105
89
+ openai==1.28.0
90
+ orjson==3.10.3
91
+ packaging==23.2
92
+ pandas==2.2.2
93
+ phonemizer==3.2.1
94
+ pillow==10.3.0
95
+ pydantic==2.7.1
96
+ pydantic-core==2.18.2
97
+ pydub==0.25.1
98
+ pygments==2.18.0
99
+ pylatexenc==2.10
100
+ pyparsing==3.1.2
101
+ python-dateutil==2.9.0.post0
102
+ python-dotenv==1.0.1
103
+ python-multipart==0.0.9
104
+ pytz==2024.1
105
+ pyyaml==6.0.1
106
+ rdflib==7.0.0
107
+ referencing==0.35.1
108
+ regex==2024.5.10
109
+ requests==2.31.0
110
+ rfc3986==1.5.0
111
+ rich==13.7.1
112
+ rpds-py==0.18.1
113
+ ruff==0.4.4
114
+ safetensors==0.4.3
115
+ segments==2.2.1
116
+ semantic-version==2.10.0
117
+ shellingham==1.5.4
118
+ six==1.16.0
119
+ sniffio==1.3.1
120
+ sqlalchemy==2.0.30
121
+ starlette==0.37.2
122
+ sympy==1.12
123
+ tabulate==0.9.0
124
+ tenacity==8.3.0
125
+ tiktoken==0.6.0
126
+ tokenizers==0.19.1
127
+ tomlkit==0.12.0
128
+ toolz==0.12.1
129
+ torch==2.3.0
130
+ torchaudio==2.3.0
131
+ tqdm==4.66.4
132
+ transformers==4.40.2
133
+ triton==2.3.0
134
+ typer==0.12.3
135
+ typing-extensions==4.11.0
136
+ typing-inspect==0.9.0
137
+ tzdata==2024.1
138
+ ujson==5.9.0
139
+ uritemplate==4.1.1
140
+ urllib3==2.2.1
141
+ uvicorn==0.29.0
142
+ uvloop==0.19.0
143
+ watchfiles==0.21.0
144
+ websockets==11.0.3
145
+ yarl==1.9.4
146
+ # The following packages are considered to be unsafe in a requirements file:
147
+ setuptools==69.5.1
src/pronunciation_trainer/__init__.py ADDED
File without changes
src/pronunciation_trainer/app.py ADDED
@@ -0,0 +1,144 @@
1
+ """
2
+ This script provides a simple web interface for the pronunciation trainer using the gradio library
3
+ """
4
+
5
+
6
+ from pathlib import Path
7
+
8
+ import gradio as gr
9
+ from phonemizer import phonemize
10
+
11
+ from pronunciation_trainer.evaluation import (advanced_evaluation,
12
+ basic_evaluation,
13
+ normalize_texts)
14
+ from pronunciation_trainer.loading import (load_advanced_examples,
15
+ load_simple_examples)
16
+ from pronunciation_trainer.transcription import (transcribe_to_graphemes,
17
+ transcribe_to_phonemes)
18
+
19
+ with gr.Blocks() as demo:
20
+ with gr.Tab("Welcome"):
21
+ readme = Path("README.md").read_text()
22
+ gr.Markdown(readme)
23
+
24
+ with gr.Tab("Grapheme-Based Speech Evaluation"):
25
+ with gr.Row():
26
+ learner_transcription = gr.Textbox(
27
+ label="Learner Transcription",
28
+ placeholder="It is nice to wreck a nice beach",
29
+ )
30
+ teacher_transcription = gr.Textbox(
31
+ label="Teacher Transcription",
32
+ placeholder="It is nice to recognize speech",
33
+ )
34
+ basic_evaluate_btn = gr.Button("Evaluate", variant="primary")
35
+ gr.Markdown("## Evaluation")
36
+ gr.Markdown("### Basic Evaluation")
37
+ grapheme_evaluation = gr.Markdown()
38
+ with gr.Row():
39
+ basic_similarity_score = gr.Label(label="Similarity Score of Transcripts")
40
+ basic_diff_box = gr.HighlightedText(
41
+ label="Difference between Learner and Teacher transcripts",
42
+ combine_adjacent=True,
43
+ show_legend=True,
44
+ color_map={"+": "red", "-": "green"},
45
+ )
46
+ basic_evaluate_btn.click(
47
+ fn=normalize_texts,
48
+ inputs=[learner_transcription, teacher_transcription],
49
+ outputs=[learner_transcription, teacher_transcription],
50
+ ).success(
51
+ fn=basic_evaluation,
52
+ inputs=[learner_transcription, teacher_transcription],
53
+ outputs=[basic_similarity_score, grapheme_evaluation, basic_diff_box],
54
+ )
55
+ with gr.Accordion("Learner Examples"):
56
+ gr.Markdown("### Examples for grapheme-based evaluation")
57
+ simple_examples = gr.Examples(
58
+ examples=load_simple_examples(),
59
+ inputs=[
60
+ teacher_transcription,
61
+ learner_transcription,
62
+ ],
63
+ )
64
+ with gr.Tab("Phoneme-Based Speech Evaluation"):
65
+ with gr.Row():
66
+ learner_recording = gr.Audio(
67
+ label="Learner Recording",
68
+ sources=["microphone", "upload"],
69
+ )
70
+ teacher_recording = gr.Audio(
71
+ label="Teacher Recording",
72
+ sources=["microphone", "upload"],
73
+ )
74
+ with gr.Row():
75
+ learner_phoneme_transcription = gr.Textbox(
76
+ label="Learner Phoneme Transcription",
77
+ placeholder=phonemize(learner_transcription.placeholder),
78
+ interactive=True,
79
+ )
80
+ teacher_phoneme_transcription = gr.Textbox(
81
+ label="Teacher Phoneme Transcription",
82
+ placeholder=phonemize(teacher_transcription.placeholder),
83
+ interactive=True,
84
+ )
85
+ learner_l1 = gr.Textbox(
86
+ label="Native language of Learner (L1)", placeholder="German"
87
+ )
88
+ learner_l2 = gr.Textbox(
89
+ label="Language the learner aims to acquire (L2)", placeholder="English"
90
+ )
91
+ advanced_evaluate_btn = gr.Button("Evaluate", variant="primary")
92
+ gr.Markdown("## Advanced Evaluation")
93
+
94
+ with gr.Row():
95
+ similarity_score = gr.Label(
96
+ label="Similarity Score of Phoneme Transcripts"
97
+ )
98
+ diff_box = gr.HighlightedText(
99
+ label="Difference between Learner and Teacher Phoneme transcripts",
100
+ combine_adjacent=True,
101
+ show_legend=True,
102
+ color_map={"+": "red", "-": "green"},
103
+ )
104
+ llm_evaluation = gr.Markdown()
105
+
106
+ learner_recording.change(
107
+ fn=transcribe_to_phonemes,
108
+ inputs=learner_recording,
109
+ outputs=learner_phoneme_transcription,
110
+ )
111
+ teacher_recording.change(
112
+ fn=transcribe_to_phonemes,
113
+ inputs=teacher_recording,
114
+ outputs=teacher_phoneme_transcription,
115
+ )
116
+
117
+ advanced_evaluate_btn.click(
118
+ fn=basic_evaluation,
119
+ inputs=[learner_phoneme_transcription, teacher_phoneme_transcription],
120
+ outputs=[similarity_score, llm_evaluation, diff_box],
121
+ ).success(
122
+ advanced_evaluation,
123
+ inputs=[
124
+ learner_l1,
125
+ learner_l2,
126
+ learner_phoneme_transcription,
127
+ teacher_phoneme_transcription,
128
+ ],
129
+ outputs=llm_evaluation,
130
+ )
131
+ with gr.Accordion("Learner Examples"):
132
+ gr.Markdown("### Examples for advanced evaluation")
133
+ advanced_examples = gr.Examples(
134
+ examples=load_advanced_examples(),
135
+ inputs=[
136
+ learner_l1,
137
+ learner_l2,
138
+ learner_recording,
139
+ teacher_recording,
140
+ ],
141
+ )
142
+
143
+ if __name__ == "__main__":
144
+ demo.launch()
src/pronunciation_trainer/config.py ADDED
@@ -0,0 +1,11 @@
1
+ """
2
+ This file is used to load the OpenAI API key from the .env file.
3
+ """
4
+
5
+ import os
6
+
7
+ from dotenv import find_dotenv, load_dotenv
8
+
9
+ load_dotenv(find_dotenv())
10
+
11
+ openai_api_key = os.getenv("OPENAI_API_KEY")
src/pronunciation_trainer/evaluation.py ADDED
@@ -0,0 +1,85 @@
1
+ """
2
+ This module provides functions for evaluating speaking attempts.
3
+
4
+ It includes:
5
+ - A basic evaluation function that compares two phrases (grapheme-based) and provides feedback based on their similarity ratio.
6
+ - An advanced evaluation function that uses a large language model (LLM) to provide feedback based on phoneme transcriptions.
7
+
8
+ The basic evaluation function includes:
9
+ - A normalization function that converts input texts to lower case and strips whitespace.
10
+ - A function that calculates the similarity ratio between two phrases.
11
+ - A function that generates a diff between two phrases.
12
+ - A function that generates feedback based on the similarity ratio.
13
+
14
+ The advanced evaluation function includes:
15
+ - A function that invokes an LLM chain to provide phoneme-based feedback
16
+ """
17
+
18
+
19
+ from difflib import Differ, SequenceMatcher
20
+ from typing import Optional, Tuple
21
+
22
+ from pronunciation_trainer.llm import create_llm_chain
23
+
24
+
25
+ def normalize_texts(actual: str, expected: str) -> list[str]:
26
+ """Normalize two input texts by converting them to lower case and stripping whitespace.
27
+
28
+ Note: This normalization function is very simple and only here to demonstrate the general necessity of normalization
29
+ """
30
+
31
+ return [text.lower().strip() for text in [actual, expected]]
32
+
33
+
34
+ def compare_phrases(expected: str, actual: str) -> float:
35
+ """Calculate the similarity ratio between two phrases."""
36
+ return SequenceMatcher(None, expected, actual).ratio()
37
+
38
+
39
+ def diff_phrases(expected: str, actual: str) -> list[Tuple[str, Optional[str]]]:
40
+ """Generate a diff between two phrases."""
41
+ differ = Differ()
42
+ return [
43
+ (token[2:], None if token[0] == " " else token[0])
44
+ for token in differ.compare(expected, actual)
45
+ ]
46
+
47
+
48
+ def generate_feedback(similarity_ratio: float) -> str:
49
+ """Generate feedback based on the similarity ratio."""
50
+ if similarity_ratio > 0.9:
51
+ return "Excellent!"
52
+ elif similarity_ratio > 0.7:
53
+ return "Good job!"
54
+ elif similarity_ratio > 0.5:
55
+ return "Not bad, but there's room for improvement."
56
+ else:
57
+ return "Please try again, focus on pronunciation and clarity."
58
+
59
+
60
+ def basic_evaluation(
61
+ expected: str, actual: str, autojunk: bool = True
62
+ ) -> Tuple[float, str, list[Tuple[str, Optional[str]]]]:
63
+ """Evaluate speaking attempts by comparing expected and actual phrases."""
64
+ expected, actual = normalize_texts(expected, actual)
65
+ similarity_ratio = compare_phrases(expected, actual)
66
+ diff = diff_phrases(expected, actual)
67
+ feedback = generate_feedback(similarity_ratio)
68
+ return similarity_ratio, feedback, diff
69
+
70
+
71
+ def advanced_evaluation(
72
+ learner_l1,
73
+ learner_l2,
74
+ learner_phoneme_transcription,
75
+ teacher_phoneme_transcription,
76
+ ) -> str:
77
+ """Provide LLM-based feedback"""
78
+ return create_llm_chain().invoke(
79
+ {
80
+ "learner_l1": learner_l1,
81
+ "learner_l2": learner_l2,
82
+ "learner_phoneme_transcription": learner_phoneme_transcription,
83
+ "teacher_phoneme_transcription": teacher_phoneme_transcription,
84
+ }
85
+ )
src/pronunciation_trainer/llm.py ADDED
@@ -0,0 +1,35 @@
1
+ """
2
+ This module contains the code to create a language model chain using the OpenAI API.
3
+ """
4
+
5
+
6
+ from pathlib import Path
7
+
8
+ import gradio as gr
9
+ from langchain_core.output_parsers import StrOutputParser
10
+ from langchain_core.prompts import ChatPromptTemplate
11
+ from langchain_openai import ChatOpenAI
12
+
13
+ from pronunciation_trainer.config import openai_api_key
14
+
15
+ prompt = ChatPromptTemplate.from_template(Path("prompt.md").read_text())
16
+ output_parser = StrOutputParser()
17
+
18
+
19
+ def create_llm(openai_api_key=openai_api_key):
20
+ if openai_api_key in [None, ""]:
21
+ raise gr.Error(
22
+ "No API key provided! You can find your API key at https://platform.openai.com/account/api-keys."
23
+ )
24
+ llm = ChatOpenAI(model="gpt-4-turbo", openai_api_key=openai_api_key)
25
+ return llm
26
+
27
+
28
+ def create_llm_chain(prompt=prompt, output_parser=output_parser, openai_api_key=openai_api_key):
29
+ if openai_api_key in [None, ""]:
30
+ raise gr.Error(
31
+ """No API key provided! You can find your API key at https://platform.openai.com/account/api-keys."""
32
+ )
33
+ llm = ChatOpenAI(model="gpt-4-turbo", openai_api_key=openai_api_key)
34
+ llm_chain = prompt | llm | output_parser
35
+ return llm_chain
src/pronunciation_trainer/loading.py ADDED
@@ -0,0 +1,44 @@
1
+ """
2
+ This module is responsible for loading data from the provided JSON file.
3
+ It also provides functions to load simple and advanced examples for the grapheme-based
4
+ and phoneme-based pronunciation trainer, respectively.
5
+ """
6
+ import json
7
+
8
+
9
+ def load_data(filepath: str = "learner_input.json") -> list[dict]:
10
+ with open(filepath, "r") as file:
11
+ data = json.load(file)
12
+ return data
13
+
14
+
15
+ def load_simple_examples():
16
+ simple_examples = [
17
+ [
18
+ ex["text_to_record"],
19
+ ex["sr_transcript_of_learner_recording"],
20
+ ]
21
+ for ex in load_data()
22
+ ]
23
+ return simple_examples
24
+
25
+
26
+ def load_advanced_examples():
27
+ advanced_examples = [
28
+ [
29
+ "Lithuanian",
30
+ "English",
31
+ f'audios/learner/{ex["learner_recording"]}',
32
+ f'audios/teacher/{ex["learner_recording"]}',
33
+ ]
34
+ for ex in load_data()
35
+ ]
36
+ advanced_examples.append(
37
+ [
38
+ "German",
39
+ "English",
40
+ f"audios/learner/book.aac",
41
+ f"audios/teacher/book.wav",
42
+ ]
43
+ )
44
+ return advanced_examples
src/pronunciation_trainer/rich_logging.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ This file contains the logging configuration for the project.
3
+ """
4
+
5
+ import logging
6
+
7
+ from rich.console import Console
8
+ from rich.logging import RichHandler
9
+
10
+ FORMAT = "%(message)s"
11
+ logging.basicConfig(
12
+ level="INFO",
13
+ format=FORMAT,
14
+ datefmt="[%X]",
15
+ handlers=[RichHandler(rich_tracebacks=True)],
16
+ )
17
+
18
+ console = Console()
19
+ log = logging.getLogger("rich")
20
+
21
+
22
+ def set_log_level(verbosity: int):
23
+ if verbosity == 0:
24
+ log.setLevel(logging.WARNING)
25
+ elif verbosity == 1:
26
+ log.setLevel(logging.INFO)
27
+ elif verbosity == 2:
28
+ log.setLevel(logging.DEBUG)
29
+
30
+ return log
src/pronunciation_trainer/scripts.py ADDED
@@ -0,0 +1,46 @@
1
+ """
2
+ This script is used to produce teacher audio files for the given examples using OpenAI's Text-to-Speech API.
3
+ The produced audio files are saved in the `audios/teacher` directory.
4
+ """
5
+
6
+
7
+ from pathlib import Path
8
+
9
+ from openai import OpenAI
10
+
11
+ from pronunciation_trainer.config import openai_api_key
12
+ from pronunciation_trainer.loading import load_data
13
+ from pronunciation_trainer.rich_logging import log
14
+
15
+ data = load_data()
16
+
17
+ client = OpenAI(api_key=openai_api_key)
18
+
19
+
20
+ def produce_teacher_audio(audio_name, text):
21
+ """
22
+ Produce a teacher audio file for the given text using OpenAI's Text-to-Speech API.
23
+ See: https://platform.openai.com/docs/guides/text-to-speech
24
+ """
25
+
26
+ speech_file_path = Path(f"audios/teacher/{audio_name}")
27
+ response = client.audio.speech.create(
28
+ model="tts-1-hd",
29
+ voice="alloy",
30
+ input=text,
31
+ )
32
+
33
+ response.stream_to_file(speech_file_path)
34
+ log.info(
35
+ f"Successfully produced teacher audio for {text=} at {speech_file_path.name=} 🎉"
36
+ )
37
+
38
+
39
+ if __name__ == "__main__":
40
+ # Produce teacher/ground-truth audio files for the given examples
41
+ data = load_data()
42
+ for datum in data:
43
+ produce_teacher_audio(datum["learner_recording"], datum["text_to_record"])
44
+
45
+ # Produce an additional example
46
+ produce_teacher_audio("book.wav", "The book is on the table")
src/pronunciation_trainer/transcription.py ADDED
@@ -0,0 +1,44 @@
1
+ """
2
+ This module contains the transcribe function, which uses the Hugging Face pipeline to transcribe audio to text.
3
+
4
+ The transcribe function takes a single parameter, audio, which is a numpy array of the audio the user recorded.
5
+
6
+ There are two transcriber choices available: grapheme and phoneme. The grapheme transcriber uses the openai/whisper-base.en model, while the phoneme transcriber uses the facebook/wav2vec2-lv-60-espeak-cv-ft model.
7
+ """
8
+ from enum import StrEnum
9
+ from functools import partial
10
+
11
+ import numpy as np
12
+ from transformers import pipeline
13
+
14
+
15
+ class TranscriberChoice(StrEnum):
16
+ grapheme = "openai/whisper-base.en"
17
+ phoneme = "facebook/wav2vec2-lv-60-espeak-cv-ft"
18
+
19
+
20
+ def transcribe(
21
+ audio, transcriber_choice: TranscriberChoice = TranscriberChoice.grapheme
22
+ ):
23
+ """
24
+ The transcribe function takes a single parameter, audio, which is a numpy array of the audio the user recorded.
25
+ The pipeline object expects this in float32 format, so we first convert it to float32 and then extract the transcribed text.
26
+ """
27
+ transcriber = pipeline("automatic-speech-recognition", model=transcriber_choice)
28
+ try:
29
+ sr, y = audio
30
+ print(f"Sampling rate is {sr}")
31
+ except TypeError:
32
+ return None
33
+ y = y.astype(np.float32)
34
+ y /= np.max(np.abs(y))
35
+ transcription = transcriber({"sampling_rate": sr, "raw": y})["text"]
36
+ return transcription
37
+
38
+
39
+ transcribe_to_phonemes = partial(
40
+ transcribe, transcriber_choice=TranscriberChoice.phoneme
41
+ )
42
+ transcribe_to_graphemes = partial(
43
+ transcribe, transcriber_choice=TranscriberChoice.grapheme
44
+ )