Spaces:
Running
Running
Yurii Paniv
committed on
Commit
·
a163565
1
Parent(s):
7af6095
Add version 3.0.0
Browse files
- README.md +6 -3
- app.py +21 -15
- config.json +20 -2
README.md
CHANGED
|
@@ -18,8 +18,10 @@ Link to source code and models -> [https://github.com/robinhad/ukrainian-tts](ht
|
|
| 18 |
|
| 19 |
Code is licensed under `MIT License`, models are under `GNU GPL v3 License`.
|
| 20 |
# Support
|
| 21 |
-
If you like my work, please support -> [https://send.monobank.ua/jar/48iHq4xAXm](https://send.monobank.ua/jar/48iHq4xAXm)
|
| 22 |
-
|
|
|
|
|
|
|
| 23 |
|
| 24 |
`Mykyta (male)`:
|
| 25 |
|
|
@@ -53,7 +55,8 @@ tts-server --model_path path/to/model.pth \
|
|
| 53 |
# Attribution 🤝
|
| 54 |
|
| 55 |
- Model training - [Yurii Paniv @robinhad](https://github.com/robinhad)
|
| 56 |
-
- Mykyta, Olena
|
|
|
|
| 57 |
- Silence cutting using [HMM-GMM](https://github.com/proger/uk) - [Volodymyr Kyrylov @proger](https://github.com/proger)
|
| 58 |
- Autostress (with dictionary) using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress) - [Oleksiy Syvokon @asivokon](https://github.com/asivokon)
|
| 59 |
- Autostress (with model) using [ukrainian-accentor](https://github.com/egorsmkv/ukrainian-accentor) - [Bohdan Mykhailenko @NeonBohdan](https://github.com/NeonBohdan) + [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
|
|
|
|
| 18 |
|
| 19 |
Code is licensed under `MIT License`, models are under `GNU GPL v3 License`.
|
| 20 |
# Support
|
| 21 |
+
If you like my work, please support -> [https://send.monobank.ua/jar/48iHq4xAXm](https://send.monobank.ua/jar/48iHq4xAXm)
|
| 22 |
+
For collaboration and questions, please contact me here: [Telegram https://t.me/robinhad](https://t.me/robinhad) [Twitter https://twitter.com/robinhad](https://twitter.com/robinhad)
|
| 23 |
+
You're welcome to join the UA Speech Recognition and Synthesis community: [Telegram https://t.me/speech_recognition_uk](https://t.me/speech_recognition_uk)
|
| 24 |
+
# Examples
|
| 25 |
|
| 26 |
`Mykyta (male)`:
|
| 27 |
|
|
|
|
| 55 |
# Attribution 🤝
|
| 56 |
|
| 57 |
- Model training - [Yurii Paniv @robinhad](https://github.com/robinhad)
|
| 58 |
+
- Mykyta, Olena, Lada, Dmytro, Olha dataset - [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
|
| 59 |
+
- Dmytro voice - [Dmytro Chaplynskyi @dchaplinsky](https://github.com/dchaplinsky)
|
| 60 |
- Silence cutting using [HMM-GMM](https://github.com/proger/uk) - [Volodymyr Kyrylov @proger](https://github.com/proger)
|
| 61 |
- Autostress (with dictionary) using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress) - [Oleksiy Syvokon @asivokon](https://github.com/asivokon)
|
| 62 |
- Autostress (with model) using [ukrainian-accentor](https://github.com/egorsmkv/ukrainian-accentor) - [Bohdan Mykhailenko @NeonBohdan](https://github.com/NeonBohdan) + [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
|
app.py
CHANGED
|
@@ -20,6 +20,8 @@ class VoiceOption(Enum):
|
|
| 20 |
Olena = "Олена (жіночий) 👩"
|
| 21 |
Mykyta = "Микита (чоловічий) 👨"
|
| 22 |
Lada = "Лада (жіночий) 👩"
|
|
|
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
def download(url, file_name):
|
|
@@ -33,7 +35,7 @@ def download(url, file_name):
|
|
| 33 |
|
| 34 |
|
| 35 |
print("downloading uk/mykyta/vits-tts")
|
| 36 |
-
release_number = "v3.0.0
|
| 37 |
model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
|
| 38 |
config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
|
| 39 |
speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
|
|
@@ -71,11 +73,14 @@ def tts(text: str, voice: str, stress: str):
|
|
| 71 |
autostress_with_model = (
|
| 72 |
True if stress == StressOption.AutomaticStressWithModel.value else False
|
| 73 |
)
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
| 79 |
text = preprocess_text(text, autostress_with_model)
|
| 80 |
text_limit = 7200
|
| 81 |
text = (
|
|
@@ -98,23 +103,24 @@ with open("README.md") as file:
|
|
| 98 |
iface = gr.Interface(
|
| 99 |
fn=tts,
|
| 100 |
inputs=[
|
| 101 |
-
gr.
|
| 102 |
label="Input",
|
| 103 |
-
|
| 104 |
),
|
| 105 |
-
gr.
|
| 106 |
label="Голос",
|
| 107 |
choices=[option.value for option in VoiceOption],
|
| 108 |
-
|
| 109 |
),
|
| 110 |
-
gr.
|
| 111 |
label="Наголоси",
|
| 112 |
choices=[option.value for option in StressOption],
|
|
|
|
| 113 |
),
|
| 114 |
],
|
| 115 |
outputs=[
|
| 116 |
-
gr.
|
| 117 |
-
gr.
|
| 118 |
],
|
| 119 |
title="🐸💬🇺🇦 - Coqui TTS",
|
| 120 |
description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
|
|
@@ -132,12 +138,12 @@ iface = gr.Interface(
|
|
| 132 |
],
|
| 133 |
[
|
| 134 |
"Вв+едіть, будь ласка, св+оє реч+ення.",
|
| 135 |
-
VoiceOption.
|
| 136 |
StressOption.AutomaticStress.value,
|
| 137 |
],
|
| 138 |
[
|
| 139 |
"Привіт, як тебе звати?",
|
| 140 |
-
VoiceOption.
|
| 141 |
StressOption.AutomaticStress.value,
|
| 142 |
],
|
| 143 |
[
|
|
|
|
| 20 |
Olena = "Олена (жіночий) 👩"
|
| 21 |
Mykyta = "Микита (чоловічий) 👨"
|
| 22 |
Lada = "Лада (жіночий) 👩"
|
| 23 |
+
Dmytro = "Дмитро (чоловічий) 👩"
|
| 24 |
+
Olga = "Ольга (жіночий) 👩"
|
| 25 |
|
| 26 |
|
| 27 |
def download(url, file_name):
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
print("downloading uk/mykyta/vits-tts")
|
| 38 |
+
release_number = "v3.0.0"
|
| 39 |
model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
|
| 40 |
config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
|
| 41 |
speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
|
|
|
|
| 73 |
autostress_with_model = (
|
| 74 |
True if stress == StressOption.AutomaticStressWithModel.value else False
|
| 75 |
)
|
| 76 |
+
voice_mapping = {
|
| 77 |
+
VoiceOption.Olena.value: "olena",
|
| 78 |
+
VoiceOption.Mykyta.value: "mykyta",
|
| 79 |
+
VoiceOption.Lada.value: "lada",
|
| 80 |
+
VoiceOption.Dmytro.value: "dmytro",
|
| 81 |
+
VoiceOption.Olga.value: "olga",
|
| 82 |
+
}
|
| 83 |
+
speaker_name = voice_mapping[voice]
|
| 84 |
text = preprocess_text(text, autostress_with_model)
|
| 85 |
text_limit = 7200
|
| 86 |
text = (
|
|
|
|
| 103 |
iface = gr.Interface(
|
| 104 |
fn=tts,
|
| 105 |
inputs=[
|
| 106 |
+
gr.components.Textbox(
|
| 107 |
label="Input",
|
| 108 |
+
value="Введіть, будь ласка, своє р+ечення.",
|
| 109 |
),
|
| 110 |
+
gr.components.Radio(
|
| 111 |
label="Голос",
|
| 112 |
choices=[option.value for option in VoiceOption],
|
| 113 |
+
value=VoiceOption.Olena.value,
|
| 114 |
),
|
| 115 |
+
gr.components.Radio(
|
| 116 |
label="Наголоси",
|
| 117 |
choices=[option.value for option in StressOption],
|
| 118 |
+
value=StressOption.AutomaticStress.value
|
| 119 |
),
|
| 120 |
],
|
| 121 |
outputs=[
|
| 122 |
+
gr.components.Audio(label="Output"),
|
| 123 |
+
gr.components.Textbox(label="Наголошений текст"),
|
| 124 |
],
|
| 125 |
title="🐸💬🇺🇦 - Coqui TTS",
|
| 126 |
description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
|
|
|
|
| 138 |
],
|
| 139 |
[
|
| 140 |
"Вв+едіть, будь ласка, св+оє реч+ення.",
|
| 141 |
+
VoiceOption.Dmytro.value,
|
| 142 |
StressOption.AutomaticStress.value,
|
| 143 |
],
|
| 144 |
[
|
| 145 |
"Привіт, як тебе звати?",
|
| 146 |
+
VoiceOption.Olga.value,
|
| 147 |
StressOption.AutomaticStress.value,
|
| 148 |
],
|
| 149 |
[
|
config.json
CHANGED
|
@@ -73,7 +73,7 @@
|
|
| 73 |
"griffin_lim_iters": 60,
|
| 74 |
"num_mels": 80,
|
| 75 |
"mel_fmin": 0,
|
| 76 |
-
"mel_fmax":
|
| 77 |
"spec_gain": 6.0,
|
| 78 |
"do_amp_to_db_linear": true,
|
| 79 |
"do_amp_to_db_mel": true,
|
|
@@ -158,11 +158,29 @@
|
|
| 158 |
null,
|
| 159 |
null
|
| 160 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
[
|
| 162 |
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
| 163 |
"lada",
|
| 164 |
null,
|
| 165 |
null
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
]
|
| 167 |
],
|
| 168 |
"eval_split_max_size": null,
|
|
@@ -243,7 +261,7 @@
|
|
| 243 |
"init_discriminator": true,
|
| 244 |
"use_spectral_norm_disriminator": false,
|
| 245 |
"use_speaker_embedding": true,
|
| 246 |
-
"num_speakers":
|
| 247 |
"speakers_file": "speakers.pth",
|
| 248 |
"d_vector_file": null,
|
| 249 |
"speaker_embedding_channels": 256,
|
|
|
|
| 73 |
"griffin_lim_iters": 60,
|
| 74 |
"num_mels": 80,
|
| 75 |
"mel_fmin": 0,
|
| 76 |
+
"mel_fmax": null,
|
| 77 |
"spec_gain": 6.0,
|
| 78 |
"do_amp_to_db_linear": true,
|
| 79 |
"do_amp_to_db_mel": true,
|
|
|
|
| 158 |
null,
|
| 159 |
null
|
| 160 |
],
|
| 161 |
+
[
|
| 162 |
+
"\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
|
| 163 |
+
"dmytro",
|
| 164 |
+
null,
|
| 165 |
+
null
|
| 166 |
+
],
|
| 167 |
[
|
| 168 |
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
| 169 |
"lada",
|
| 170 |
null,
|
| 171 |
null
|
| 172 |
+
],
|
| 173 |
+
[
|
| 174 |
+
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
| 175 |
+
"dmytro",
|
| 176 |
+
null,
|
| 177 |
+
null
|
| 178 |
+
],
|
| 179 |
+
[
|
| 180 |
+
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
| 181 |
+
"olga",
|
| 182 |
+
null,
|
| 183 |
+
null
|
| 184 |
]
|
| 185 |
],
|
| 186 |
"eval_split_max_size": null,
|
|
|
|
| 261 |
"init_discriminator": true,
|
| 262 |
"use_spectral_norm_disriminator": false,
|
| 263 |
"use_speaker_embedding": true,
|
| 264 |
+
"num_speakers": 5,
|
| 265 |
"speakers_file": "speakers.pth",
|
| 266 |
"d_vector_file": null,
|
| 267 |
"speaker_embedding_channels": 256,
|