Spaces:
Running
Running
New TTS: Parler Multi & incomplete Dia
Browse files- app/models.py +44 -2
app/models.py
CHANGED
@@ -47,9 +47,11 @@ AVAILABLE_MODELS = {
|
|
47 |
# Parler Large model
|
48 |
# 'parler-tts/parler_tts/large': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
|
49 |
# Parler Mini model
|
50 |
-
'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
|
51 |
# 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
|
52 |
# 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0
|
|
|
|
|
53 |
|
54 |
# # Microsoft Edge TTS
|
55 |
# 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # API disabled
|
@@ -110,6 +112,9 @@ AVAILABLE_MODELS = {
|
|
110 |
# Index TTS
|
111 |
'IndexTeam/IndexTTS': 'IndexTeam/IndexTTS',
|
112 |
|
|
|
|
|
|
|
113 |
# HF TTS w issues
|
114 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
115 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
@@ -192,6 +197,7 @@ HF_SPACES = {
|
|
192 |
'text_param_index': 0,
|
193 |
'return_audio_index': 0,
|
194 |
'series': 'xVASynth',
|
|
|
195 |
},
|
196 |
|
197 |
# CoquiTTS (CPU)
|
@@ -230,6 +236,7 @@ HF_SPACES = {
|
|
230 |
'return_audio_index': 0,
|
231 |
'is_zero_gpu_space': True,
|
232 |
'series': 'Parler',
|
|
|
233 |
},
|
234 |
# Parler Large
|
235 |
'parler-tts/parler_tts/large': {
|
@@ -251,6 +258,16 @@ HF_SPACES = {
|
|
251 |
# 'emoji': '😃', # overly jolly voice
|
252 |
},
|
253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
# Microsoft Edge TTS
|
255 |
'innoai/Edge-TTS-Text-to-Speech': {
|
256 |
'name': 'Microsoft® Edge TTS',
|
@@ -346,7 +363,7 @@ HF_SPACES = {
|
|
346 |
'series': 'Kokoro',
|
347 |
},
|
348 |
|
349 |
-
# StyleTTS Kokoro v1.0
|
350 |
'hexgrad/Kokoro-API': {
|
351 |
'name': 'Kokoro v1.0',
|
352 |
'function': '/predict',
|
@@ -517,6 +534,15 @@ HF_SPACES = {
|
|
517 |
'is_zero_gpu_space': True,
|
518 |
'series': 'Index',
|
519 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
520 |
}
|
521 |
|
522 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
@@ -607,6 +633,11 @@ OVERRIDE_INPUTS = {
|
|
607 |
1: 'Laura; Laura\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
|
608 |
2: True, #use_large
|
609 |
},
|
|
|
|
|
|
|
|
|
|
|
610 |
'parler-tts/parler-tts-expresso': {
|
611 |
1: 'Elisabeth; Elisabeth\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
|
612 |
},
|
@@ -824,6 +855,17 @@ OVERRIDE_INPUTS = {
|
|
824 |
'IndexTeam/IndexTTS' : {
|
825 |
'prompt': DEFAULT_VOICE_SAMPLE, # voice
|
826 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
827 |
}
|
828 |
|
829 |
# minor mods to model from the same space
|
|
|
47 |
# Parler Large model
|
48 |
# 'parler-tts/parler_tts/large': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
|
49 |
# Parler Mini model
|
50 |
+
# 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
|
51 |
# 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
|
52 |
# 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0
|
53 |
+
# Parler Mini Multi v1.1
|
54 |
+
'PHBJT/multi_parler_tts': 'PHBJT/multi_parler_tts',
|
55 |
|
56 |
# # Microsoft Edge TTS
|
57 |
# 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # API disabled
|
|
|
112 |
# Index TTS
|
113 |
'IndexTeam/IndexTTS': 'IndexTeam/IndexTTS',
|
114 |
|
115 |
+
# Dia
|
116 |
+
# 'nari-labs/Dia-1.6B': 'nari-labs/Dia-1.6B', # single speaker hallucinates
|
117 |
+
|
118 |
# HF TTS w issues
|
119 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
120 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
|
|
197 |
'text_param_index': 0,
|
198 |
'return_audio_index': 0,
|
199 |
'series': 'xVASynth',
|
200 |
+
'space_link': 'Pendrokar/xVASynth-TTS',
|
201 |
},
|
202 |
|
203 |
# CoquiTTS (CPU)
|
|
|
236 |
'return_audio_index': 0,
|
237 |
'is_zero_gpu_space': True,
|
238 |
'series': 'Parler',
|
239 |
+
'emoji': '😷', # broken space
|
240 |
},
|
241 |
# Parler Large
|
242 |
'parler-tts/parler_tts/large': {
|
|
|
258 |
# 'emoji': '😃', # overly jolly voice
|
259 |
},
|
260 |
|
261 |
+
# Parler Mini Multi v1.1 (multi-language)
|
262 |
+
'PHBJT/multi_parler_tts': {
|
263 |
+
'name': 'Parler Mini Multi v1.1',
|
264 |
+
'function': '/gen_tts',
|
265 |
+
'text_param_index': 'text',
|
266 |
+
'return_audio_index': 1,
|
267 |
+
'is_zero_gpu_space': True,
|
268 |
+
'series': 'Parler',
|
269 |
+
},
|
270 |
+
|
271 |
# Microsoft Edge TTS
|
272 |
'innoai/Edge-TTS-Text-to-Speech': {
|
273 |
'name': 'Microsoft® Edge TTS',
|
|
|
363 |
'series': 'Kokoro',
|
364 |
},
|
365 |
|
366 |
+
# StyleTTS Kokoro v1.0 (CPU)
|
367 |
'hexgrad/Kokoro-API': {
|
368 |
'name': 'Kokoro v1.0',
|
369 |
'function': '/predict',
|
|
|
534 |
'is_zero_gpu_space': True,
|
535 |
'series': 'Index',
|
536 |
},
|
537 |
+
|
538 |
+
'nari-labs/Dia-1.6B' : {
|
539 |
+
'name': 'Dia',
|
540 |
+
'function': '/generate_audio',
|
541 |
+
'text_param_index': 'text_input',
|
542 |
+
'return_audio_index': 0,
|
543 |
+
'is_zero_gpu_space': True,
|
544 |
+
'series': 'Dia',
|
545 |
+
},
|
546 |
}
|
547 |
|
548 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
|
633 |
1: 'Laura; Laura\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
|
634 |
2: True, #use_large
|
635 |
},
|
636 |
+
# multi-lang parler mini 1.1
|
637 |
+
'PHBJT/multi_parler_tts': {
|
638 |
+
1: 'a ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
|
639 |
+
2: False, #do_format
|
640 |
+
},
|
641 |
'parler-tts/parler-tts-expresso': {
|
642 |
1: 'Elisabeth; Elisabeth\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
|
643 |
},
|
|
|
855 |
'IndexTeam/IndexTTS' : {
|
856 |
'prompt': DEFAULT_VOICE_SAMPLE, # voice
|
857 |
},
|
858 |
+
|
859 |
+
# Dia
|
860 |
+
'nari-labs/Dia-1.6B': {
|
861 |
+
'audio_prompt_input': None,
|
862 |
+
'max_new_tokens': 860, # min tokens as we use only a single speaker
|
863 |
+
'cfg_scale': 3, # 1-5 # Higher values increase adherence to the text prompt.
|
864 |
+
'temperature': 1.3, # Lower values make the output more deterministic, higher values increase randomness.
|
865 |
+
'top_p': 0.95, # Filters vocabulary to the most likely tokens cumulatively reaching probability P.
|
866 |
+
'cfg_filter_top_k': 35, # Top k filter for CFG guidance.
|
867 |
+
'speed_factor': 0.94, # Adjusts the speed of the generated audio (1.0 = original speed).
|
868 |
+
},
|
869 |
}
|
870 |
|
871 |
# minor mods to model from the same space
|