Pendrokar committed on
Commit
76a480c
·
1 Parent(s): 065021e

New TTS: Parler Multi & incomplete Dia

Browse files
Files changed (1) hide show
  1. app/models.py +44 -2
app/models.py CHANGED
@@ -47,9 +47,11 @@ AVAILABLE_MODELS = {
47
  # Parler Large model
48
  # 'parler-tts/parler_tts/large': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
49
  # Parler Mini model
50
- 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
51
  # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
52
  # 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0
 
 
53
 
54
  # # Microsoft Edge TTS
55
  # 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # API disabled
@@ -110,6 +112,9 @@ AVAILABLE_MODELS = {
110
  # Index TTS
111
  'IndexTeam/IndexTTS': 'IndexTeam/IndexTTS',
112
 
 
 
 
113
  # HF TTS w issues
114
  # 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
115
  # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
@@ -192,6 +197,7 @@ HF_SPACES = {
192
  'text_param_index': 0,
193
  'return_audio_index': 0,
194
  'series': 'xVASynth',
 
195
  },
196
 
197
  # CoquiTTS (CPU)
@@ -230,6 +236,7 @@ HF_SPACES = {
230
  'return_audio_index': 0,
231
  'is_zero_gpu_space': True,
232
  'series': 'Parler',
 
233
  },
234
  # Parler Large
235
  'parler-tts/parler_tts/large': {
@@ -251,6 +258,16 @@ HF_SPACES = {
251
  # 'emoji': '😃', # overly jolly voice
252
  },
253
 
 
 
 
 
 
 
 
 
 
 
254
  # Microsoft Edge TTS
255
  'innoai/Edge-TTS-Text-to-Speech': {
256
  'name': 'Microsoft® Edge TTS',
@@ -346,7 +363,7 @@ HF_SPACES = {
346
  'series': 'Kokoro',
347
  },
348
 
349
- # StyleTTS Kokoro v1.0
350
  'hexgrad/Kokoro-API': {
351
  'name': 'Kokoro v1.0',
352
  'function': '/predict',
@@ -517,6 +534,15 @@ HF_SPACES = {
517
  'is_zero_gpu_space': True,
518
  'series': 'Index',
519
  },
 
 
 
 
 
 
 
 
 
520
  }
521
 
522
  # for zero-shot TTS - voice sample used by XTTS (11 seconds)
@@ -607,6 +633,11 @@ OVERRIDE_INPUTS = {
607
  1: 'Laura; Laura\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
608
  2: True, #use_large
609
  },
 
 
 
 
 
610
  'parler-tts/parler-tts-expresso': {
611
  1: 'Elisabeth; Elisabeth\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
612
  },
@@ -824,6 +855,17 @@ OVERRIDE_INPUTS = {
824
  'IndexTeam/IndexTTS' : {
825
  'prompt': DEFAULT_VOICE_SAMPLE, # voice
826
  },
 
 
 
 
 
 
 
 
 
 
 
827
  }
828
 
829
  # minor mods to model from the same space
 
47
  # Parler Large model
48
  # 'parler-tts/parler_tts/large': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
49
  # Parler Mini model
50
+ # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
51
  # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
52
  # 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0
53
+ # Parler Mini Multi v1.1
54
+ 'PHBJT/multi_parler_tts': 'PHBJT/multi_parler_tts',
55
 
56
  # # Microsoft Edge TTS
57
  # 'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # API disabled
 
112
  # Index TTS
113
  'IndexTeam/IndexTTS': 'IndexTeam/IndexTTS',
114
 
115
+ # Dia
116
+ # 'nari-labs/Dia-1.6B': 'nari-labs/Dia-1.6B', # single speaker hallucinates
117
+
118
  # HF TTS w issues
119
  # 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
120
  # 'PolyAI/pheme': '/predict#0', # sleepy HF Space
 
197
  'text_param_index': 0,
198
  'return_audio_index': 0,
199
  'series': 'xVASynth',
200
+ 'space_link': 'Pendrokar/xVASynth-TTS',
201
  },
202
 
203
  # CoquiTTS (CPU)
 
236
  'return_audio_index': 0,
237
  'is_zero_gpu_space': True,
238
  'series': 'Parler',
239
+ 'emoji': '😷', # broken space
240
  },
241
  # Parler Large
242
  'parler-tts/parler_tts/large': {
 
258
  # 'emoji': '😃', # overly jolly voice
259
  },
260
 
261
+ # Parler Mini Multi v1.1 (multi-language)
262
+ 'PHBJT/multi_parler_tts': {
263
+ 'name': 'Parler Mini Multi v1.1',
264
+ 'function': '/gen_tts',
265
+ 'text_param_index': 'text',
266
+ 'return_audio_index': 1,
267
+ 'is_zero_gpu_space': True,
268
+ 'series': 'Parler',
269
+ },
270
+
271
  # Microsoft Edge TTS
272
  'innoai/Edge-TTS-Text-to-Speech': {
273
  'name': 'Microsoft® Edge TTS',
 
363
  'series': 'Kokoro',
364
  },
365
 
366
+ # StyleTTS Kokoro v1.0 (CPU)
367
  'hexgrad/Kokoro-API': {
368
  'name': 'Kokoro v1.0',
369
  'function': '/predict',
 
534
  'is_zero_gpu_space': True,
535
  'series': 'Index',
536
  },
537
+
538
+ 'nari-labs/Dia-1.6B' : {
539
+ 'name': 'Dia',
540
+ 'function': '/generate_audio',
541
+ 'text_param_index': 'text_input',
542
+ 'return_audio_index': 0,
543
+ 'is_zero_gpu_space': True,
544
+ 'series': 'Dia',
545
+ },
546
  }
547
 
548
  # for zero-shot TTS - voice sample used by XTTS (11 seconds)
 
633
  1: 'Laura; Laura\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
634
  2: True, #use_large
635
  },
636
+ # multi-lang parler mini 1.1
637
+ 'PHBJT/multi_parler_tts': {
638
+ 1: 'a ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
639
+ 2: False, #do_format
640
+ },
641
  'parler-tts/parler-tts-expresso': {
642
  1: 'Elisabeth; Elisabeth\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
643
  },
 
855
  'IndexTeam/IndexTTS' : {
856
  'prompt': DEFAULT_VOICE_SAMPLE, # voice
857
  },
858
+
859
+ # Dia
860
+ 'nari-labs/Dia-1.6B': {
861
+ 'audio_prompt_input': None,
862
+ 'max_new_tokens': 860, # min tokens as we use only a single speaker
863
+ 'cfg_scale': 3, # 1-5 # Higher values increase adherence to the text prompt.
864
+ 'temperature': 1.3, # Lower values make the output more deterministic, higher values increase randomness.
865
+ 'top_p': 0.95, # Filters vocabulary to the most likely tokens cumulatively reaching probability P.
866
+ 'cfg_filter_top_k': 35, # Top k filter for CFG guidance.
867
+ 'speed_factor': 0.94, # Adjusts the speed of the generated audio (1.0 = original speed).
868
+ },
869
  }
870
 
871
  # minor mods to model from the same space