khof312 committed
Commit 7a39622
1 Parent(s): c530006

Add initial clone of previous space.'

Files changed (11)
  1. .gitignore +167 -0
  2. README.md +18 -13
  3. app.py +293 -0
  4. packages.txt +1 -0
  5. requirements.txt +14 -3
  6. src/__init__.py +4 -0
  7. src/convert.py +21 -0
  8. src/helpers.py +29 -0
  9. src/lookups.py +162 -0
  10. src/synthesize.py +174 -0
  11. target_speaker.wav +0 -0
.gitignore ADDED
@@ -0,0 +1,167 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+
+
+ # MY FILES
+ dev_roadmap.txt
README.md CHANGED
@@ -1,19 +1,24 @@
  ---
- title: Tts Mockingbird V2
- emoji: 🚀
- colorFrom: red
- colorTo: red
- sdk: docker
- app_port: 8501
- tags:
- - streamlit
  pinned: false
- short_description: Streamlit template space
  ---

- # Welcome to Streamlit!

- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:

- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).

  ---
+ title: "Mockingbird"
+ app_file: app.py
+ sdk: "streamlit"
+ python_version: 3.11.8
+ streamlit_version: 1.44.1
  pinned: false
  ---

+ # Mockingbird TTS Demo
+ This repo hosts Mockingbird, a demo of open Text-to-Speech tools.

+ Currently, 3 synthesizers are supported:
+ - [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model
+ - [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package and the models supplied through it
+ - [**ESpeak-NG's**](https://github.com/espeak-ng/espeak-ng) synthetic voices

+ Voice conversion is achieved through Coqui.
+
+ Notes:
+ 1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
+ 2. Coqui is no longer being officially developed.
+ 3. Where a synthesizer supports multiple models/voices, I manually pick the appropriate model.
+ 4. Not all synthesizers support a given language.
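To make the README's description concrete, here is a minimal, illustrative sketch of how the wrappers added in this commit fit together. `synth_mms` and the `models` lookup come from `src/synthesize.py` and `src/lookups.py` below; the output filename is hypothetical, and `synth_mms` assumes `HUGGINGFACE_KEY` is set in the environment.

```python
# Illustrative only: synthesize one sentence with the default MMS model for Swahili.
import scipy.io.wavfile

from src.lookups import models
from src.synthesize import synth_mms

text = "Mfuko wa Kimataifa wa Watoto"                 # Swahili placeholder from src/lookups.py
audio, rate = synth_mms(text, models["swh"]["mms"])   # resolves to facebook/mms-tts-swh
scipy.io.wavfile.write("swahili_demo.wav", rate=rate, data=audio.T)  # hypothetical output path
```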
app.py ADDED
@@ -0,0 +1,293 @@
+ import torch
+ import scipy
+ import os
+ import streamlit as st
+ import pandas as pd
+ from transformers import pipeline #set_seed,
+ from transformers import VitsTokenizer, VitsModel
+ from datasets import load_dataset, Audio
+ from huggingface_hub.inference_api import InferenceApi
+
+ from src import *
+
+
+ ########################
+ col1, col2 = st.columns([20,3])
+ with col2:
+     st.image('logo.png', use_container_width=True)
+ with col1:
+     st.title("Mockingbird")
+     st.header("A demo of open Text to Speech tools")
+
+ tts, about = st.tabs(["Text to speech", "**About**"])
+
+ ########################
+ with tts:
+
+     # Configurations -- language choice and text
+     tts_lang = st.selectbox('Language of text', (language_list), format_func = decode_iso)
+     tts_text = st.text_area(label = "Please enter your sentence here:",
+                             value="", placeholder=placeholders[tts_lang] )
+
+     target_speaker_file = st.file_uploader("If you would like to test voice conversion, you may upload your audio below. You should upload one file in .wav format. If you don't, a default file will be used.",
+                                            type=['wav'])
+
+     # Inference
+     if st.button("Generate"):
+
+         # Warning about alphabet support
+         if tts_lang in ['rus', 'fas']:
+             st.warning("WARNING! On Windows, ESpeak-NG has trouble synthesizing output when input is provided from non-Latin alphabets.")
+
+         st.divider()
+
+         # Synthesis
+         with st.spinner(":rainbow[Synthesizing, please wait... (this will be slowest the first time you generate audio in a new language)]"):
+             if tts_text == "":
+                 tts_text=placeholders[tts_lang]
+
+             # First, make the audio
+             base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
+             base_coqui= synth_coqui(tts_text, models[tts_lang]['coqui'])
+             base_espeakng= synth_espeakng(tts_text, models[tts_lang]['espeakng'])
+             try:
+                 base_toucan= synth_toucan(tts_text, models[tts_lang]['toucan'])
+             except:
+                 base_toucan=None
+             base_piper= synth_piper(tts_text, models[tts_lang]['piper'])
+
+             if tts_lang=="swh":
+                 finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1")
+                 finetuned_mms2 = synth_mms(tts_text, "khof312/mms-tts-swh-female-2")
+             if tts_lang=="spa":
+                 finetuned_mms1 = synth_mms(tts_text, "ylacombe/mms-spa-finetuned-argentinian-monospeaker")
+                 finetuned_mms2 = synth_mms(tts_text, "ylacombe/mms-spa-finetuned-chilean-monospeaker")
+                 finetuned_mms3 = synth_mms(tts_text, "ylacombe/mms-spa-finetuned-colombian-monospeaker")
+                 #finetuned_mms4 = synth_mms(tts_text, "khof312/mms-tts-spa-female")
+             if tts_lang=="lin":
+                 #finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-lin-female")
+                 try:
+                     finetuned_africanvoices = synth_africanvoices(tts_text, models[tts_lang]['africanvoices'])
+                 except:
+                     pass
+
+             #vc_mms
+             #vc_coqui
+             #vc_espeakng
+         "## Synthesis"
+         "### Default models"
+         row1 = st.columns([1,1,2])
+         row2 = st.columns([1,1,2])
+         row3 = st.columns([1,1,2])
+         row4 = st.columns([1,1,2])
+         row5 = st.columns([1,1,2])
+         row6 = st.columns([1,1,2])
+
+         row1[0].write("**Model**")
+         row1[1].write("**Configuration**")
+         row1[2].write("**Audio**")
+
+         if base_mms is not None:
+             row2[0].write(f"[Meta MMS](https://huggingface.co/docs/transformers/main/en/model_doc/mms)")
+             row2[1].write("default")
+             row2[2].audio(base_mms[0], sample_rate = base_mms[1])
+
+         if base_coqui is not None:
+             row3[0].write(f"[Coqui](https://docs.coqui.ai/en/latest/index.html)")
+             row3[1].write("default")
+             row3[2].audio(base_coqui[0], sample_rate = base_coqui[1])
+
+         if base_espeakng is not None:
+             row4[0].write(f"[Espeak-ng](https://github.com/espeak-ng/espeak-ng)")
+             row4[1].write("default")
+             row4[2].audio(base_espeakng[0], sample_rate = base_espeakng[1])
+
+         if base_toucan is not None:
+             row5[0].write(f"[IMS-Toucan](https://github.com/DigitalPhonetics/IMS-Toucan)")
+             row5[1].write("default")
+             row5[2].audio(base_toucan[0], sample_rate = base_toucan[1])
+
+         if base_piper is not None:
+             row6[0].write(f"[Piper](https://github.com/rhasspy/piper)")
+             row6[1].write("default")
+             row6[2].audio(base_piper[0], sample_rate = base_piper[1])
+
+         #################################################################
+         if tts_lang == "swh":
+             "### Fine Tuned"
+             row1 = st.columns([1,1,2])
+             row2 = st.columns([1,1,2])
+             row3 = st.columns([1,1,2])
+
+             row1[0].write("**Model**")
+             row1[1].write("**Configuration**")
+             row1[2].write("**Audio**")
+
+             row2[0].write(f"Meta MMS")
+             row2[1].write("[female 1](https://huggingface.co/khof312/mms-tts-swh-female-1)")
+             row2[2].audio(finetuned_mms1[0], sample_rate = finetuned_mms1[1])
+             row3[0].write(f"Meta MMS")
+             row3[1].write("[female 2](https://huggingface.co/khof312/mms-tts-swh-female-2)")
+             row3[2].audio(finetuned_mms2[0], sample_rate = finetuned_mms2[1])
+
+
+         if tts_lang == "spa":
+             "### Fine Tuned"
+             row1 = st.columns([1,1,2])
+             row2 = st.columns([1,1,2])
+             row3 = st.columns([1,1,2])
+             row4 = st.columns([1,1,2])
+             #row5 = st.columns([1,1,2])
+
+             row1[0].write("**Model**")
+             row1[1].write("**Configuration**")
+             row1[2].write("**Audio**")
+
+             row2[0].write(f"Meta MMS")
+             row2[1].write("[ylacombe - Argentinian](https://huggingface.co/ylacombe/mms-spa-finetuned-argentinian-monospeaker)")
+             row2[2].audio(finetuned_mms1[0], sample_rate = finetuned_mms1[1])
+             row3[0].write(f"Meta MMS")
+             row3[1].write("[ylacombe - Chilean](https://huggingface.co/ylacombe/mms-spa-finetuned-chilean-monospeaker)")
+             row3[2].audio(finetuned_mms2[0], sample_rate = finetuned_mms2[1])
+             row4[0].write(f"Meta MMS")
+             row4[1].write("[ylacombe - Colombian](https://huggingface.co/ylacombe/mms-spa-finetuned-colombian-monospeaker)")
+             row4[2].audio(finetuned_mms3[0], sample_rate = finetuned_mms3[1])
+             #row5[0].write(f"Meta MMS")
+             #row5[1].write("[khof312 - female](https://huggingface.co/khof312/mms-tts-spa-female)")
+             #row5[2].audio(finetuned_mms4[0], sample_rate = finetuned_mms4[1])
+
+         if tts_lang == "lin":
+             "### Fine Tuned"
+             row1 = st.columns([1,1,2])
+             #row2 = st.columns([1,1,2])
+             row3 = st.columns([1,1,2])
+
+             row1[0].write("**Model**")
+             row1[1].write("**Configuration**")
+             row1[2].write("**Audio**")
+
+             #row2[0].write(f"Meta MMS")
+             #row2[1].write("[khof312 - female](https://huggingface.co/khof312/mms-tts-lin-female)")
+             #row2[2].audio(finetuned_mms1[0], sample_rate = finetuned_mms1[1])
+
+             try:
+                 row3[0].write(f"African voices")
+                 row3[1].write("[African Voices]()")
+                 row3[2].audio(finetuned_africanvoices[0], sample_rate = finetuned_africanvoices[1])
+             except:
+                 pass
+
+         st.divider()
+
+         "## Voice conversion" #################################################################
+
+
+
+         st.warning('''Note: The naturalness of the audio will only be as good as that of the audio in "default models" above.''')
+
+         if target_speaker_file is not None:
+             rate, wav = scipy.io.wavfile.read(target_speaker_file)
+             scipy.io.wavfile.write("target_speaker_custom.wav", data=wav, rate=rate)
+             target_speaker = "target_speaker_custom.wav"
+         else:
+             target_speaker = "target_speaker.wav"
+
+         if base_mms is not None:
+             scipy.io.wavfile.write("source_speaker_mms.wav", rate=base_mms[1], data=base_mms[0].T)
+             converted_mms = convert_coqui('source_speaker_mms.wav', target_speaker)
+
+         if base_coqui is not None:
+             scipy.io.wavfile.write("source_speaker_coqui.wav", rate=base_coqui[1], data=base_coqui[0].T)
+             converted_coqui = convert_coqui('source_speaker_coqui.wav', target_speaker)
+
+         if base_espeakng is not None:
+             scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
+             converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)
+
+         if base_toucan is not None:  # guard so a failed Toucan synthesis doesn't crash the conversion step
+             scipy.io.wavfile.write("source_speaker_toucan.wav", rate=base_toucan[1], data=base_toucan[0].T)
+             converted_toucan = convert_coqui('source_speaker_toucan.wav', target_speaker)
+
+         row1 = st.columns([1,1,2])
+         row2 = st.columns([1,1,2])
+         row3 = st.columns([1,1,2])
+         row4 = st.columns([1,1,2])
+
+         row1[0].write("**Model**")
+         row1[1].write("**Configuration**")
+         row1[2].write("**Audio**")
+
+         if base_mms is not None:
+             row1[0].write(f"Meta MMS")
+             row1[1].write(f"converted")
+             row1[2].audio(converted_mms[0], sample_rate = converted_mms[1])
+
+         if base_coqui is not None:
+             row2[0].write(f"Coqui")
+             row2[1].write(f"converted")
+             row2[2].audio(converted_coqui[0], sample_rate = converted_coqui[1])
+
+         if base_espeakng is not None:
+             row3[0].write(f"Espeak-ng")
+             row3[1].write(f"converted")
+             row3[2].audio(converted_espeakng[0], sample_rate = converted_espeakng[1])
+
+         if base_toucan is not None:  # guard so converted_toucan is only used when it exists
+             row4[0].write(f"IMS Toucan")
+             row4[1].write(f"converted")
+             row4[2].audio(converted_toucan[0], sample_rate = converted_toucan[1])
+
+
+         #row3[0].write("MMS-TTS-SWH")
+         #row3[1].audio(synth, sample_rate=16_000)
+         #row3[2].audio(synth, sample_rate=16_000)
+
+         #st.audio(synth, sample_rate=16_000)
+         #data.write(np.random.randn(10, 1)
+
+
+         #col1.subheader("A wide column with a chart")
+         #col1.line_chart(data)
+
+         #col2.subheader("A narrow column with the data")
+         #col2.write(data)
+
+ with about:
+     #st.header("How it works")
+     st.markdown('''# Mockingbird TTS Demo
+ This page is a demo of the openly available Text to Speech models for various languages of interest. Currently, 3 synthesizers with multilingual offerings are supported out of the box:
+ - [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]
+ - [**IMS Toucan**](https://github.com/DigitalPhonetics/IMS-Toucan), which supports 7000 languages.[^4]
+ - [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices.[^3]
+
+ On a case-by-case basis, for different languages of interest, I have added:
+ - [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting and these models are still available. Languages must be added on a model-by-model basis.
+ - Specific fine-tuned variants of Meta's MMS (either fine-tuned by [Yoach Lacombe](https://huggingface.co/ylacombe), or fine-tuned by me using his scripts).
+
+ I am in the process of adding support for:
+ - [**Piper**](https://github.com/rhasspy/piper), a TTS system that supports multiple voices per language and approximately 30 languages. To test different voices, please see the [Huggingface demo](https://huggingface.co/spaces/k2-fsa/text-to-speech).[^5]
+ - [**African Voices**](https://github.com/neulab/AfricanVoices), a CMU research project that fine-tuned synthesizers for different African languages. The site hosting the synthesizers is deprecated, but they can be downloaded from the Internet Archive's Wayback Machine.[^6]
+
+
+ Voice conversion is currently achieved through Coqui.
+
+ Notes:
+ 1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
+ 2. Where a synthesizer supports multiple models/voices, I manually pick the appropriate model.
+ 3. Not all synthesizers support a given language.
+
+
+
+ [^1]: Endpoints used are of the form https://huggingface.co/facebook/mms-tts-[LANG].
+ Learn more:
+ [Docs](https://huggingface.co/docs/transformers/model_doc/mms) |
+ [Paper](https://arxiv.org/abs/2305.13516) |
+ [Supported languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html)
+
+ [^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
+ [^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
+ [^4]: Language list is available in the Gradio API documentation [here](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS).
+ [^5]: The list of available voices is [here](https://github.com/rhasspy/piper/blob/master/VOICES.md), model checkpoints are [here](https://huggingface.co/datasets/rhasspy/piper-checkpoints/tree/main), and they can be tested [here](https://rhasspy.github.io/piper-samples/).
+ [^6]:
+ ''')
+
+
packages.txt ADDED
@@ -0,0 +1 @@
+ espeak-ng
requirements.txt CHANGED
@@ -1,3 +1,14 @@
- altair
- pandas
- streamlit
+ datasets==2.14.7
+ librosa==0.10.1
+ pycountry==24.6.1
+ scipy==1.12.0
+ sentencepiece==0.2.0
+ transformers>=2.5
+ torch
+ IPython==8.26.0
+ TTS
+ pandas==1.5.3
+ wave
+ #py-espeak-ng
+ espeakng
+ dotenv
src/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .helpers import *
+ from .lookups import *
+ from .synthesize import *
+ from .convert import *
src/convert.py ADDED
@@ -0,0 +1,21 @@
+ import torch
+ import IPython
+ from TTS.api import TTS
+
+ def convert_coqui(source_wav:str, target_wav:str):
+     '''
+     Use Coqui TTS for zero-shot voice conversion.
+
+     Inputs:
+         source_wav: Wav file containing the speech content to convert.
+         target_wav: Wav file of the speaker whose voice you want to hear.
+     Returns:
+         Streaming wav and sampling rate.
+     '''
+     # Get device
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to(device)
+     wav = tts.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+
+     return wav, 24000 # Identified sampling rate of freevc24
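For orientation, a minimal sketch of how `convert_coqui` can be chained after one of the synthesizers in `src/synthesize.py`. The text, model choice, and intermediate file names are illustrative; `target_speaker.wav` is the default reference file shipped in this commit, and espeak-ng must be installed (see `packages.txt`).

```python
# Illustrative only: synthesize with ESpeak-NG, then convert the result
# to the default target speaker's voice with convert_coqui (defined above).
import numpy as np
import scipy.io.wavfile

from src.convert import convert_coqui
from src.synthesize import synth_espeakng

audio, rate = synth_espeakng("The United Nations Children's Fund", "en")
scipy.io.wavfile.write("source_speaker.wav", rate=rate, data=audio.T)

converted, sr = convert_coqui("source_speaker.wav", "target_speaker.wav")  # sr is 24000 for freevc24
scipy.io.wavfile.write("converted.wav", rate=sr, data=np.asarray(converted, dtype=np.float32))
```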
src/helpers.py ADDED
@@ -0,0 +1,29 @@
+ import pycountry
+
+ iso_encoder = {"English":"eng",
+                "French":"fra",
+                "Moore": "mos"}
+
+ iso_decoder = dict((v,k) for k,v in iso_encoder.items())
+
+
+
+ def encode_iso(lang:str)-> str:
+     ''' Takes the name of a language and returns its ISO-3 code. '''
+     return iso_encoder[lang]
+
+ def decode_iso(iso:str)-> str:
+     ''' Takes an ISO-3 code and returns the name of the language. '''
+
+     if "-" in iso:
+         iso, suffix = iso.split("-", 1)
+     else:
+         suffix = None
+
+     name = pycountry.languages.get(alpha_3 = iso).name
+     name = name.replace("Mossi", "Mooré").replace("Swahili (individual language)", "Swahili")
+
+     if suffix is not None:
+         name += f" - {suffix}"
+
+     return name
src/lookups.py ADDED
@@ -0,0 +1,162 @@
+ language_list = ['swh', 'ara','mya', 'eng', 'fra', 'hat', 'kmr', 'lin','mos','fas', 'por','ron', 'run','rus','spa', 'tur', 'ukr', 'urd'
+                  #'ara','fas','ukr','tur', 'mya', 'rus',
+                  #'kmr-script_latin', 'urd-script_arabic', 'urd-script_devanagari', 'urd-script_latin',
+                  ]
+
+ #####################################
+ placeholders = {
+     'swh': "Mfuko wa Kimataifa wa Watoto",
+     'ara': "منظمة الأمم المتحدة للطفولة",
+     'eng': "the United Nations International Children's Emergency Fund",
+     'fra': "Le Fonds des Nations unies pour l'enfance",
+     'hat': "Fon pou Timoun Nasyon Ini yo",
+     'fas': "صندوق کودکان ملل متحد",
+     'kmr': "سندووقی مناڵانی نەتەوە یەکگرتووەکان",
+     'lin': 'Your phrase here',
+     'mos': 'Your phrase here',
+     'mya': "ကုလသမဂ္ဂ အပြည်ပြည်ဆိုင်ရာ ကလေးများရန်ပုံငွေအဖွဲ့",
+     'por': "O Fundo das Nações Unidas para a Infância",
+     'ron': "Fondul Internațional pentru Urgențe ale Copiilor al Națiunilor Unite",
+     'run': "Your phrase here",
+     'rus': 'Международного фонда помощи детям',
+     'spa': "El Fondo de las Naciones Unidas para la Infancia",
+     'tur': "Birleşmiş Milletler Çocuklara Yardım Fonu",
+     'ukr': "Дитя́чий фонд Організа́ція Об'є́днаних На́цій",
+     'urd': "اقوام متحدہ کا فنڈ برائے اطفال"
+     }
+
+ #####################################
+ models = {
+     'swh': {
+         'mms': 'facebook/mms-tts-swh',
+         'coqui': None,
+         'espeakng': 'sw',
+         'toucan': 'Congo Swahili (swc)',
+         'piper': ['Swahili', 'csukuangfj/vits-piper-sw_CD-lanfrica-medium'],
+     },
+     'ara': {
+         'mms': 'facebook/mms-tts-ara',
+         'coqui': None, ################
+         'espeakng': 'ar',
+         'toucan': 'Standard Arabic (arb)',
+         'piper': ['Arabic','csukuangfj/vits-piper-ar_JO-kareem-low'] # Other variant(s) exist but have been left out
+     },
+     'mya': {
+         'mms': 'facebook/mms-tts-mya',
+         'coqui': None, ################
+         'espeakng': 'my',
+         'toucan': 'Burmese (mya)',
+         'piper': None
+     },
+     'eng': {
+         'mms': 'facebook/mms-tts-eng',
+         'coqui': None,
+         'espeakng': 'en',
+         'toucan': 'English (eng)',
+         'piper': ['English', 'csukuangfj/vits-piper-en_US-glados|1 speaker'] # Other variant(s) exist but have been left out
+     },
+     'fas':{
+         'mms': 'facebook/mms-tts-fas',
+         'coqui': None, #'tts_models/fa/custom/glow-tts',
+         'espeakng': 'fa',
+         'toucan': 'Persian (pes)', #'Dari (prs)'
+         'piper': ['Persian', 'csukuangfj/vits-piper-fa_IR-amir-medium'],
+         #['Persian', 'csukuangfj/vits-piper-fa_IR-gyro-medium']],
+         #'mimic3': ['csukuangfj/vits-mimic3-fa-haaniye_low']
+     } ,
+     'fra':{
+         'mms': 'facebook/mms-tts-fra',
+         'coqui': 'tts_models/fr/css10/vits', # Sampling rate: 22050 ## k2-fsa: 'csukuangfj/vits-coqui-fr-css10'
+         'espeakng': 'fr',
+         'toucan': 'French (fra)',
+         'piper': ['French', 'csukuangfj/vits-piper-fr_FR-upmc-medium'] # Other variant(s) exist but have been left out
+     },
+     'hat':{
+         'mms': 'facebook/mms-tts-hat',
+         'coqui': None,
+         'espeakng': 'ht',
+         'toucan': 'Haitian Creole (hat)',
+         'piper': None
+     },
+     'kmr': {
+         'mms': 'facebook/mms-tts-kmr-script_arabic',
+         'coqui': None, ################
+         'espeakng': 'ku',
+         'toucan': 'Kurdish (ckb)',
+         'piper': None
+     },
+     'lin':{
+         'mms': None,
+         'coqui': 'tts_models/lin/openbible/vits', # Sampling rate: 22050
+         'espeakng': None,
+         'toucan': 'Lingala (lin)',
+         'piper': None,
+         'africanvoices': 'cmu_lin_ope',
+     },
+     'mos':{
+         'mms': 'facebook/mms-tts-mos',
+         'coqui': None,
+         'espeakng': None,
+         'toucan': 'Mossi (mos)',
+         'piper': None
+     } ,
+     # 'Southern Pashto (pbt)', 'Northern Pashto (pbu)'
+     'por':{
+         'mms': 'facebook/mms-tts-por',
+         'coqui': 'tts_models/pt/cv/vits', # Sampling rate: 22050
+         'espeakng': 'pt-br',
+         'toucan': 'Brazilian Portuguese (pt-br)',
+         'piper': ['Portuguese', 'csukuangfj/vits-piper-pt_BR-edresson-low'],
+         #['Portuguese', 'csukuangfj/csukuangfj/vits-piper-pt_BR-faber-medium']
+     },
+     'ron':{
+         'mms': 'facebook/mms-tts-ron',
+         'coqui': 'tts_models/ro/cv/vits', # Sampling rate: 22050 # csukuangfj/vits-coqui-ro-cv
+         'espeakng': 'ro',
+         'toucan': 'Romanian (ron)',
+         'piper': ['Romanian', 'csukuangfj/vits-piper-ro_RO-mihai-medium']
+     },
+     'run':{
+         'mms': 'facebook/mms-tts-run',
+         'coqui': None,
+         'espeakng': None,
+         'toucan': 'Rundi (run)',
+         'piper': None
+     },
+     'rus':{
+         'mms': 'facebook/mms-tts-rus',
+         'coqui': None,
+         'espeakng': 'ru',
+         'toucan': 'Russian (rus)',
+         'piper': ['Russian', 'csukuangfj/vits-piper-ru_RU-irina-medium'] # Other variant(s) exist but have been left out
+     } ,
+     'spa':{
+         'mms': 'facebook/mms-tts-spa',
+         'coqui': 'tts_models/es/css10/vits', # Sampling rate: 22050
+         'espeakng': 'es-419',
+         'toucan': 'Spanish (spa)',
+         'piper': ['Spanish', 'csukuangfj/vits-piper-es-glados-medium'] # Other variant(s) exist but have been left out
+     },
+     'tur': {
+         'mms': 'facebook/mms-tts-tur',
+         'coqui': None, ################
+         'espeakng': 'tr',
+         'toucan': 'Turkish (tur)',
+         'piper': ['Turkish', 'csukuangfj/vits-piper-tr_TR-dfki-medium'] # Other variant(s) exist but have been left out
+     },
+     'ukr': {
+         'mms': 'facebook/mms-tts-ukr',
+         'coqui': None, ################
+         'espeakng': 'uk',
+         'toucan': 'Ukrainian (ukr)',
+         'piper': ['Ukrainian', 'csukuangfj/vits-piper-uk_UA-lada-x_low'] # Other variant(s) exist but have been left out
+     },
+     'urd': {
+         'mms': 'facebook/mms-tts-urd-script_arabic',
+         'coqui': None, ################
+         'espeakng': 'ur',
+         'toucan': 'Urdu (urd)',
+         'piper': None
+     },
+     }
+
src/synthesize.py ADDED
@@ -0,0 +1,174 @@
+ import IPython
+ from huggingface_hub.inference_api import InferenceApi
+ import torch
+ from TTS.api import TTS
+ import wave
+ import espeakng
+ import subprocess
+ from scipy.io import wavfile
+ from transformers import pipeline
+ import os
+ import numpy as np
+ from gradio_client import Client, handle_file
+
+ from dotenv import load_dotenv
+ # Load environment variables
+ load_dotenv()
+
+
+ def synth_mms(text:str, model:str):
+     '''
+     Use Huggingface inference pipeline to synthesize text.
+     (Can be replaced by inference API, but that requires stored API token.)
+
+     Inputs:
+         text: Text to synthesize
+         model: Model code of the form mms-tts-LAN
+     Returns:
+         Streaming numpy and sampling rate.
+     '''
+     #inference = InferenceApi(repo_id=f"facebook/{model}",
+     #                         token=API_TOKEN)
+     #mms_tts = inference(inputs=text,
+     #                    raw_response=True)._content
+
+     if model is not None:
+         pipe = pipeline("text-to-speech", model=model, device=-1, token=os.environ['HUGGINGFACE_KEY']
+                         ) # Change device if it should use GPU
+         mms_tts = pipe(text)
+         return mms_tts['audio'], mms_tts['sampling_rate']
+     else:
+         return None
+
+
+
+ def synth_coqui(text:str, model:str):
+     '''
+     Use Coqui inference API to synthesize text.
+
+     Inputs:
+         text: Text to synthesize
+         model: Model code
+     Returns:
+         Streaming Wav and sampling rate.
+
+     IMPORTANT: Current implementation assumes 22050 sampling rate; this should be verified when adding a new model.
+     '''
+     if model is not None:
+         # Get device
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         # Init TTS
+         tts = TTS(model, progress_bar=False).to(device)
+
+         # Infer
+         wav = tts.tts(text=text) # is_multi_speaker=False
+
+         return np.array(wav), 22050
+     else:
+         return None
+
+
+ def synth_espeakng(text:str, model:str):
+     '''
+     Use ESpeak-NG to synthesize text.
+
+     Inputs:
+         text: Text to synthesize
+         model: Model code
+     Returns:
+         Streaming Wav and sampling rate.
+     '''
+     if model is not None:
+
+         # Pass the output flag and file name as separate arguments
+         subprocess.run(['espeak-ng', f'-v{model}', '-w', 'test.wav', text])
+         #esng = espeakng.Speaker()
+         #esng.voice = model
+         #esng.say(text, export_path="test.wav")
+
+         sampling_rate, wav = wavfile.read('test.wav')
+         os.remove("test.wav")
+
+         #wav = tts.tts(text=text)
+         return wav, sampling_rate
+     else:
+         return None
+
+ def synth_africanvoices(text:str, model:str):
+     '''
+     Use an African Voices flite voice to synthesize text.
+
+     Inputs:
+         text: Text to synthesize
+         model: Model code
+     Returns:
+         Streaming Wav and sampling rate.
+     '''
+     if model is not None:
+
+         # Pass each flite argument separately: voice file, input text, output wav
+         subprocess.run(['flite', '-voice', f'{model}.flitevox', text, 'test.wav'])
+         #esng = espeakng.Speaker()
+         #esng.voice = model
+         #esng.say(text, export_path="test.wav")
+
+         sampling_rate, wav = wavfile.read('test.wav')
+         os.remove("test.wav")
+
+         #wav = tts.tts(text=text)
+         return wav, sampling_rate
+     else:
+         return None
+
+ def synth_toucan(text:str, model:str):
+     '''
+     Use Toucan to synthesize text.
+
+     Inputs:
+         text: Text to synthesize
+         model: Model code
+     Returns:
+         Streaming Wav and sampling rate.
+
+     NOTES: (1) This wrapper does not let you explore the full range of options possible with the API. (2) The API should allow you to generate female voices; however, it does not seem to be working at the moment. (3) This uses a Huggingface Gradio Space to compute via the API.
+     '''
+     client = Client("Flux9665/MassivelyMultilingualTTS")
+     result = client.predict(
+         prompt=text,
+         language=model,
+         reference_audio=handle_file('https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav'),
+         voice_seed=123,
+         prosody_creativity=0.1,
+         duration_scaling_factor=1,
+         emb1=0,
+         #emb2=0,
+         api_name="/predict"
+     )
+     sampling_rate, wav = wavfile.read(result[0])
+     return wav, sampling_rate
+
+ def synth_piper(text:str, model:str):
+     '''
+     Use Piper to synthesize text.
+
+     Inputs:
+         text: Text to synthesize
+         model: Model code
+     Returns:
+         Streaming Wav and sampling rate.
+
+     NOTES: (1) This uses a Huggingface Gradio Space to compute via the API.
+     '''
+     if model is not None:
+         client = Client("k2-fsa/text-to-speech")
+         result = client.predict(
+             language=model[0],
+             repo_id=model[1],
+             text=text,
+             sid="0",
+             speed=1,
+             api_name="/process"
+         )
+         sampling_rate, wav = wavfile.read(result[0])
+         return wav, sampling_rate
+     else:
+         return None
target_speaker.wav ADDED
Binary file (51.5 kB).