Fabrice-TIERCELIN committed
Commit dcb8a32 · verified · 1 parent: dd59b92

Up to 5 generations

Files changed (1)
  1. app.py +167 -35
app.py CHANGED
@@ -35,11 +35,29 @@ else:
     tts = TTS(model_name, gpu=torch.cuda.is_available())
     tts.to(device_type)
 
-def predict(prompt, language, gender, audio_file_pth, mic_file_path, use_mic, randomize_seed, seed):
-    start = time.time()
-
-    if randomize_seed:
-        seed = random.randint(0, max_64_bit_int)
+def update_output(output_number):
+    return [
+        gr.update(visible = (2 <= output_number)),
+        gr.update(visible = (3 <= output_number)),
+        gr.update(visible = (4 <= output_number)),
+        gr.update(visible = (5 <= output_number))
+    ]
+
+def predict(
+    prompt,
+    language,
+    gender,
+    audio_file_pth,
+    mic_file_path,
+    use_mic,
+    generation_number,
+    temperature,
+    is_randomize_seed,
+    seed,
+    progress = gr.Progress()
+):
+    start = time.time()
+    progress(0, desc = "Preparing data...")
 
     if len(prompt) < 2:
         gr.Warning("Please give a longer prompt text")
@@ -75,7 +93,7 @@ def predict(prompt, language, gender, audio_file_pth, mic_file_path, use_mic, ra
     else:
         speaker_wav = "./examples/female.wav"
 
-    output_filename = f"{re.sub('[^a-zA-Z0-9]', '_', prompt)}_{re.sub('[^a-zA-Z0-9]', '_', language)}"[:250] + ".wav"
+    output_filename = []
 
     try:
         if language == "fr":
@@ -83,7 +101,13 @@ def predict(prompt, language, gender, audio_file_pth, mic_file_path, use_mic, ra
                 language = "fr-fr"
             if m.find("/fr/") != -1:
                 language = None
-        predict_on_gpu(prompt, speaker_wav, language, output_filename, seed)
+
+        for i in range(5):
+            if i < generation_number:
+                output_filename.append(f"{i}_{re.sub('[^a-zA-Z0-9]', '_', language)}_{re.sub('[^a-zA-Z0-9]', '_', prompt)}"[:250] + ".wav")
+                predict_on_gpu(i, prompt, speaker_wav, language, output_filename[i], temperature, is_randomize_seed, seed, progress)
+            else:
+                output_filename.append(None)
     except RuntimeError as e :
         if "device-assert" in str(e):
             # cannot do anything on cuda device side error, need to restart
@@ -99,17 +123,33 @@ def predict(prompt, language, gender, audio_file_pth, mic_file_path, use_mic, ra
     secondes = secondes - (minutes * 60)
     hours = math.floor(minutes / 60)
     minutes = minutes - (hours * 60)
-    is_randomize_seed = False
     information = ("Start again to get a different result. " if is_randomize_seed else "") + "The sound has been generated in " + ((str(hours) + " h, ") if hours != 0 else "") + ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + str(secondes) + " sec."
 
     return (
-        output_filename,
-        output_filename,
+        output_filename[0],
+        output_filename[1],
+        output_filename[2],
+        output_filename[3],
+        output_filename[4],
         information,
     )
 
 @spaces.GPU(duration=60)
-def predict_on_gpu(prompt, speaker_wav, language, output_filename, seed):
+def predict_on_gpu(
+    i,
+    prompt,
+    speaker_wav,
+    language,
+    output_filename,
+    temperature,
+    is_randomize_seed,
+    seed,
+    progress
+):
+    progress((i + 1) / 5, desc = "Generating the audio #" + str(i + 1) + "...")
+    if is_randomize_seed:
+        seed = random.randint(0, max_64_bit_int)
+
    random.seed(seed)
    torch.manual_seed(seed)
 
@@ -117,13 +157,16 @@ def predict_on_gpu(prompt, speaker_wav, language, output_filename, seed):
         text = prompt,
         file_path = output_filename,
         speaker_wav = speaker_wav,
-        language = language
+        language = language,
+        temperature = temperature
     )
 
 with gr.Blocks() as interface:
-    gr.HTML("Multi-language Text-to-Speech")
     gr.HTML(
         """
+<h1><center>XTTS</center></h1>
+<big><center>Generate long vocal from text in several languages following voice freely, without account, without watermark and download it</center></big>
+<br/>
 <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
 <br/>
 XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
@@ -134,20 +177,21 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, wh
 <br/>
 <p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
 <br/>
-<a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
+<a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/Multi-language_Text-to-Speech?duplicate=true">
 <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
 </p>
 """
     )
     with gr.Column():
         prompt = gr.Textbox(
-            label="Text Prompt",
-            info="One or two sentences at a time is better",
-            value="Hello, World! Here is an example of light voice cloning. Try to upload your best audio samples quality",
+            label = "Text Prompt",
+            info = "One or two sentences at a time is better",
+            value = "Hello, World! Here is an example of light voice cloning. Try to upload your best audio samples quality",
+            elem_id = "prompt-id",
         )
         with gr.Group():
             language = gr.Dropdown(
-                label="Language",
+                label="Language",
                 info="Select an output language for the synthesised speech",
                 choices=[
                     ["Arabic", "ar"],
@@ -166,46 +210,134 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, wh
                 ],
                 max_choices=1,
                 value="en",
+                elem_id = "language-id",
             )
             gr.HTML("More languages <a href='https://huggingface.co/spaces/Brasd99/TTS-Voice-Cloner'>here</a>")
-            gender = gr.Radio(["female", "male"], label="Gender", info="Gender of the voice")
+            gender = gr.Radio(
+                ["female", "male"],
+                label="Gender",
+                info="Gender of the voice",
+                elem_id = "gender-id",
+            )
             audio_file_pth = gr.Audio(
                 label="Reference Audio",
                 #info="Click on the ✎ button to upload your own target speaker audio",
                 type="filepath",
                 value=None,
+                elem_id = "audio-file-pth-id",
             )
-            mic_file_path = gr.Audio(sources=["microphone"],
-                type="filepath",
-                #info="Use your microphone to record audio",
-                label="Use Microphone for Reference")
-            use_mic = gr.Checkbox(label="Check to use Microphone as Reference",
-                value=False,
-                info="Notice: Microphone input may not work properly under traffic",)
+            mic_file_path = gr.Audio(
+                sources=["microphone"],
+                type="filepath",
+                #info="Use your microphone to record audio",
+                label="Use Microphone for Reference",
+                elem_id = "mic-file-path-id",
+            )
+            use_mic = gr.Checkbox(
+                label = "Check to use Microphone as Reference",
+                value = False,
+                info = "Notice: Microphone input may not work properly under traffic",
+                elem_id = "use-mic-id",
+            )
+            generation_number = gr.Slider(
+                minimum = 1,
+                maximum = 5,
+                step = 1,
+                value = 1,
+                label = "Generation number",
+                info = "How many audios to generate",
+                elem_id = "generation-number-id"
+            )
         with gr.Accordion("Advanced options", open = False):
-            debug_mode = gr.Checkbox(label = "Debug mode", value = False, info = "Show intermediate results")
-            randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, result is always different")
-            seed = gr.Slider(minimum = 0, maximum = max_64_bit_int, step = 1, randomize = True, label = "Seed")
+            temperature = gr.Slider(
+                minimum = 0,
+                maximum = 10,
+                step = .1,
+                value = .75,
+                label = "Temperature",
+                elem_id = "temperature-id"
+            )
+            randomize_seed = gr.Checkbox(
+                label = "\U0001F3B2 Randomize seed",
+                value = True,
+                info = "If checked, result is always different",
+                elem_id = "randomize-seed-id"
+            )
+            seed = gr.Slider(
+                minimum = 0,
+                maximum = max_64_bit_int,
+                step = 1,
+                randomize = True,
+                label = "Seed",
+                elem_id = "seed-id"
+            )
+
+        submit = gr.Button(
+            "🚀 Speak",
+            variant = "primary",
+            elem_id = "submit-id"
+        )
+
+        synthesised_audio_1 = gr.Audio(
+            label="Synthesised Audio #1",
+            autoplay = False,
+            elem_id = "synthesised-audio-1-id"
+        )
+
+        synthesised_audio_2 = gr.Audio(
+            label="Synthesised Audio #2",
+            autoplay = False,
+            elem_id = "synthesised-audio-2-id",
+            visible = False
+        )
 
-        submit = gr.Button("🚀 Speak", variant = "primary")
+        synthesised_audio_3 = gr.Audio(
+            label="Synthesised Audio #3",
+            autoplay = False,
+            elem_id = "synthesised-audio-3-id",
+            visible = False
+        )
+
+        synthesised_audio_4 = gr.Audio(
+            label="Synthesised Audio #4",
+            autoplay = False,
+            elem_id = "synthesised-audio-4-id",
+            visible = False
+        )
 
-        waveform_visual = gr.Video(label="Waveform Visual", autoplay=True)
-        synthesised_audio = gr.Audio(label="Synthesised Audio", autoplay=False)
+        synthesised_audio_5 = gr.Audio(
+            label="Synthesised Audio #5",
+            autoplay = False,
+            elem_id = "synthesised-audio-5-id",
+            visible = False
+        )
         information = gr.HTML()
 
-        submit.click(predict, inputs = [
+        submit.click(fn = update_output, inputs = [
+            generation_number
+        ], outputs = [
+            synthesised_audio_2,
+            synthesised_audio_3,
+            synthesised_audio_4,
+            synthesised_audio_5
+        ], queue = False, show_progress = False).success(predict, inputs = [
             prompt,
             language,
             gender,
             audio_file_pth,
             mic_file_path,
             use_mic,
+            generation_number,
+            temperature,
             randomize_seed,
             seed
         ], outputs = [
-            waveform_visual,
-            synthesised_audio,
+            synthesised_audio_1,
+            synthesised_audio_2,
+            synthesised_audio_3,
+            synthesised_audio_4,
+            synthesised_audio_5,
             information
         ], scroll_to_output = True)
 
-interface.queue().launch(debug=True)
+interface.queue(max_size = 5).launch(debug=True)
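
The core of the change is the two-step submit wiring: submit.click(fn = update_output, ...) first flips the visibility of the four extra gr.Audio slots (instantly, since queue = False), and the chained .success(predict, ...) then runs the actual synthesis and fills all five slots. Below is a minimal sketch of that reveal-then-generate pattern, reduced to text outputs; the component names, MAX_OUTPUTS, and the stand-in generate() function are illustrative, not taken from app.py.

# Sketch of the commit's reveal-then-generate pattern (hypothetical names).
import gradio as gr

MAX_OUTPUTS = 5  # the commit hard-codes five output slots

def show_outputs(n):
    # Mirrors update_output(): reveal slots 2..n, hide the rest (slot 1 stays visible).
    return [gr.update(visible = (i + 2 <= n)) for i in range(MAX_OUTPUTS - 1)]

def generate(n):
    # Stand-in for predict(): one value per slot, None for the unused ones.
    return [f"result #{i + 1}" if i < n else None for i in range(MAX_OUTPUTS)]

with gr.Blocks() as demo:
    count = gr.Slider(minimum = 1, maximum = MAX_OUTPUTS, step = 1, value = 1, label = "Generation number")
    btn = gr.Button("Run")
    boxes = [gr.Textbox(label = f"Output #{i + 1}", visible = (i == 0)) for i in range(MAX_OUTPUTS)]
    # Step 1 updates visibility without queueing; step 2 runs only if step 1 succeeded.
    btn.click(show_outputs, inputs = [count], outputs = boxes[1:], queue = False,
              show_progress = False).success(generate, inputs = [count], outputs = boxes)

demo.launch()

Splitting the event this way lets the UI reflect the requested number of outputs before the slow GPU call starts, rather than after it returns.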
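
Seed handling also moves from predict() into predict_on_gpu(), so with "Randomize seed" checked each of the up-to-five passes draws its own fresh seed (five different audios), while unchecked every pass reseeds identically (five reproducible passes). A small sketch of that behaviour in isolation; the MAX_64_BIT_INT value is an assumption standing in for the max_64_bit_int defined earlier in app.py.

# Sketch of the per-pass seeding introduced by the commit.
import random

import torch

MAX_64_BIT_INT = 2**63 - 1  # assumption: stands in for app.py's max_64_bit_int

def seed_one_pass(is_randomize_seed, seed):
    # Same steps as the top of predict_on_gpu().
    if is_randomize_seed:
        seed = random.randint(0, MAX_64_BIT_INT)
    random.seed(seed)
    torch.manual_seed(seed)
    return seed

print([seed_one_pass(False, 42) for _ in range(5)])  # [42, 42, 42, 42, 42] -> reproducible passes
print([seed_one_pass(True, 42) for _ in range(5)])   # five different seeds -> varied passes

Note that even the randomized path is deterministic once the process RNG state is fixed, since each pass reseeds random with the value it just drew.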