mrfakename committed on
Commit
c04ba55
·
verified ·
1 Parent(s): 9766167

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there

Files changed (1) hide show
  1. app.py +342 -90
app.py CHANGED
@@ -112,13 +112,24 @@ def generate_response(messages, model, tokenizer):
112
  return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
113
 
114
 
 
 
 
 
 
 
 
 
115
  @gpu_decorator
116
  def infer(
117
  ref_audio_orig,
118
  ref_text,
 
119
  gen_text,
 
120
  model,
121
  remove_silence,
 
122
  cross_fade_duration=0.15,
123
  nfe_step=32,
124
  speed=1,
@@ -128,10 +139,20 @@ def infer(
128
  gr.Warning("Please provide reference audio.")
129
  return gr.update(), gr.update(), ref_text
130
 
 
 
 
 
131
  if not gen_text.strip():
132
- gr.Warning("Please enter text to generate.")
133
  return gr.update(), gr.update(), ref_text
134
 
 
 
 
 
 
 
135
  ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
136
 
137
  if model == DEFAULT_TTS_MODEL:
@@ -192,18 +213,35 @@ with gr.Blocks() as app_tts:
192
  gr.Markdown("# Batched TTS")
193
  ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
194
  gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
 
 
195
  generate_btn = gr.Button("Synthesize", variant="primary")
196
  with gr.Accordion("Advanced Settings", open=False):
197
- ref_text_input = gr.Textbox(
198
- label="Reference Text",
199
- info="Leave blank to automatically transcribe the reference audio. If you enter text it will override automatic transcription.",
200
- lines=2,
201
- )
 
 
 
202
  remove_silence = gr.Checkbox(
203
  label="Remove Silences",
204
  info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
205
  value=False,
206
  )
 
 
 
 
 
 
 
 
 
 
 
 
207
  speed_slider = gr.Slider(
208
  label="Speed",
209
  minimum=0.3,
@@ -215,9 +253,9 @@ with gr.Blocks() as app_tts:
215
  nfe_slider = gr.Slider(
216
  label="NFE Steps",
217
  minimum=4,
218
- maximum=64,
219
  value=32,
220
- step=2,
221
  info="Set the number of denoising steps.",
222
  )
223
  cross_fade_duration_slider = gr.Slider(
@@ -232,40 +270,88 @@ with gr.Blocks() as app_tts:
232
  audio_output = gr.Audio(label="Synthesized Audio")
233
  spectrogram_output = gr.Image(label="Spectrogram")
234
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  @gpu_decorator
236
  def basic_tts(
237
  ref_audio_input,
238
  ref_text_input,
 
239
  gen_text_input,
 
240
  remove_silence,
 
 
241
  cross_fade_duration_slider,
242
  nfe_slider,
243
  speed_slider,
244
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  audio_out, spectrogram_path, ref_text_out = infer(
246
  ref_audio_input,
247
  ref_text_input,
 
248
  gen_text_input,
 
249
  tts_model_choice,
250
  remove_silence,
 
251
  cross_fade_duration=cross_fade_duration_slider,
252
  nfe_step=nfe_slider,
253
  speed=speed_slider,
254
  )
255
- return audio_out, spectrogram_path, ref_text_out
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  generate_btn.click(
258
  basic_tts,
259
  inputs=[
260
  ref_audio_input,
261
  ref_text_input,
 
262
  gen_text_input,
 
263
  remove_silence,
 
 
264
  cross_fade_duration_slider,
265
  nfe_slider,
266
  speed_slider,
267
  ],
268
- outputs=[audio_output, spectrogram_output, ref_text_input],
269
  )
270
 
271
 
@@ -300,30 +386,30 @@ with gr.Blocks() as app_multistyle:
300
  """
301
  # Multiple Speech-Type Generation
302
 
303
- This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, and the system will generate speech using the appropriate type. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
304
  """
305
  )
306
 
307
  with gr.Row():
308
  gr.Markdown(
309
  """
310
- **Example Input:**
311
- {Regular} Hello, I'd like to order a sandwich please.
312
- {Surprised} What do you mean you're out of bread?
313
- {Sad} I really wanted a sandwich though...
314
- {Angry} You know what, darn you and your little shop!
315
- {Whisper} I'll just go back home and cry now.
316
- {Shouting} Why me?!
317
  """
318
  )
319
 
320
  gr.Markdown(
321
  """
322
- **Example Input 2:**
323
- {Speaker1_Happy} Hello, I'd like to order a sandwich please.
324
- {Speaker2_Regular} Sorry, we're out of bread.
325
- {Speaker1_Sad} I really wanted a sandwich though...
326
- {Speaker2_Whisper} I'll give you the last one I was hiding.
327
  """
328
  )
329
 
@@ -337,7 +423,10 @@ with gr.Blocks() as app_multistyle:
337
  regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
338
  regular_insert = gr.Button("Insert Label", variant="secondary")
339
  regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
340
- regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
 
 
 
341
 
342
  # Regular speech type (max 100)
343
  max_speech_types = 100
@@ -345,6 +434,7 @@ with gr.Blocks() as app_multistyle:
345
  speech_type_names = [regular_name]
346
  speech_type_audios = [regular_audio]
347
  speech_type_ref_texts = [regular_ref_text]
 
348
  speech_type_delete_btns = [None]
349
  speech_type_insert_btns = [regular_insert]
350
 
@@ -356,11 +446,15 @@ with gr.Blocks() as app_multistyle:
356
  delete_btn = gr.Button("Delete Type", variant="secondary")
357
  insert_btn = gr.Button("Insert Label", variant="secondary")
358
  audio_input = gr.Audio(label="Reference Audio", type="filepath")
359
- ref_text_input = gr.Textbox(label="Reference Text", lines=2)
 
 
 
360
  speech_type_rows.append(row)
361
  speech_type_names.append(name_input)
362
  speech_type_audios.append(audio_input)
363
  speech_type_ref_texts.append(ref_text_input)
 
364
  speech_type_delete_btns.append(delete_btn)
365
  speech_type_insert_btns.append(insert_btn)
366
 
@@ -385,21 +479,48 @@ with gr.Blocks() as app_multistyle:
385
 
386
  # Function to delete a speech type
387
  def delete_speech_type_fn():
388
- return gr.update(visible=False), None, None, None
389
 
390
- # Update delete button clicks
 
 
 
 
 
 
 
391
  for i in range(1, len(speech_type_delete_btns)):
392
  speech_type_delete_btns[i].click(
393
  delete_speech_type_fn,
394
- outputs=[speech_type_rows[i], speech_type_names[i], speech_type_audios[i], speech_type_ref_texts[i]],
 
 
 
 
 
 
 
 
 
 
 
395
  )
396
 
 
 
 
 
 
 
 
397
  # Text input for the prompt
398
  gen_text_input_multistyle = gr.Textbox(
399
  label="Text to Generate",
400
  lines=10,
401
  placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
402
  )
 
 
403
 
404
  def make_insert_speech_type_fn(index):
405
  def insert_speech_type_fn(current_text, speech_type_name):
@@ -423,6 +544,18 @@ with gr.Blocks() as app_multistyle:
423
  label="Remove Silences",
424
  value=True,
425
  )
 
 
 
 
 
 
 
 
 
 
 
 
426
 
427
  # Generate button
428
  generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
@@ -430,24 +563,60 @@ with gr.Blocks() as app_multistyle:
430
  # Output audio
431
  audio_output_multistyle = gr.Audio(label="Synthesized Audio")
432
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  @gpu_decorator
434
  def generate_multistyle_speech(
435
  gen_text,
 
 
 
436
  *args,
437
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  speech_type_names_list = args[:max_speech_types]
439
  speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
440
  speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
441
- remove_silence = args[3 * max_speech_types]
 
442
  # Collect the speech types and their audios into a dict
443
  speech_types = OrderedDict()
444
 
 
 
 
445
  ref_text_idx = 0
446
- for name_input, audio_input, ref_text_input in zip(
447
- speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
448
  ):
 
449
  if name_input and audio_input:
450
- speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
451
  else:
452
  speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
453
  ref_text_idx += 1
@@ -473,12 +642,12 @@ with gr.Blocks() as app_multistyle:
473
  ref_audio = speech_types[current_style]["audio"]
474
  except KeyError:
475
  gr.Warning(f"Please provide reference audio for type {current_style}.")
476
- return [None] + [speech_types[style]["ref_text"] for style in speech_types]
477
  ref_text = speech_types[current_style].get("ref_text", "")
478
 
479
  # Generate speech for this segment
480
  audio_out, _, ref_text_out = infer(
481
- ref_audio, ref_text, text, tts_model_choice, remove_silence, 0, show_info=print
482
  ) # show_info=print no pull to top when generating
483
  sr, audio_data = audio_out
484
 
@@ -488,29 +657,29 @@ with gr.Blocks() as app_multistyle:
488
  # Concatenate all audio segments
489
  if generated_audio_segments:
490
  final_audio_data = np.concatenate(generated_audio_segments)
491
- return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types]
492
  else:
493
  gr.Warning("No audio generated.")
494
- return [None] + [speech_types[style]["ref_text"] for style in speech_types]
495
 
496
  generate_multistyle_btn.click(
497
  generate_multistyle_speech,
498
- inputs=[
499
- gen_text_input_multistyle,
500
- ]
501
  + speech_type_names
502
  + speech_type_audios
503
  + speech_type_ref_texts
504
- + [
505
- remove_silence_multistyle,
506
- ],
507
- outputs=[audio_output_multistyle] + speech_type_ref_texts,
508
  )
509
 
510
  # Validation function to disable Generate button if speech types are missing
511
- def validate_speech_types(gen_text, regular_name, *args):
512
  speech_type_names_list = args
513
 
 
 
 
514
  # Collect the speech types names
515
  speech_types_available = set()
516
  if regular_name:
@@ -535,19 +704,28 @@ with gr.Blocks() as app_multistyle:
535
 
536
  gen_text_input_multistyle.change(
537
  validate_speech_types,
538
- inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
539
  outputs=generate_multistyle_btn,
540
  )
541
 
 
 
 
 
 
 
 
 
 
542
 
543
  with gr.Blocks() as app_chat:
544
  gr.Markdown(
545
  """
546
  # Voice Chat
547
- Have a conversation with an AI using your reference voice!
548
- 1. Upload a reference audio clip and optionally its transcript.
549
  2. Load the chat model.
550
- 3. Record your message through your microphone.
551
  4. The AI will respond using the reference voice.
552
  """
553
  )
@@ -607,18 +785,33 @@ Have a conversation with an AI using your reference voice!
607
  label="Remove Silences",
608
  value=True,
609
  )
610
- ref_text_chat = gr.Textbox(
611
- label="Reference Text",
612
- info="Optional: Leave blank to auto-transcribe",
613
- lines=2,
614
- )
 
 
 
615
  system_prompt_chat = gr.Textbox(
616
  label="System Prompt",
617
  value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
618
  lines=2,
619
  )
620
-
621
- chatbot_interface = gr.Chatbot(label="Conversation")
 
 
 
 
 
 
 
 
 
 
 
 
622
 
623
  with gr.Row():
624
  with gr.Column():
@@ -632,6 +825,8 @@ Have a conversation with an AI using your reference voice!
632
  label="Type your message",
633
  lines=1,
634
  )
 
 
635
  send_btn_chat = gr.Button("Send Message")
636
  clear_btn_chat = gr.Button("Clear Conversation")
637
 
@@ -646,17 +841,19 @@ Have a conversation with an AI using your reference voice!
646
 
647
  # Modify process_audio_input to use model and tokenizer from state
648
  @gpu_decorator
649
- def process_audio_input(audio_path, text, history, conv_state):
650
- """Handle audio or text input from user"""
651
-
652
- if not audio_path and not text.strip():
653
- return history, conv_state, ""
654
-
655
- if audio_path:
 
 
656
  text = preprocess_ref_audio_text(audio_path, text)[1]
657
 
658
  if not text.strip():
659
- return history, conv_state, ""
660
 
661
  conv_state.append({"role": "user", "content": text})
662
  history.append((text, None))
@@ -666,29 +863,50 @@ Have a conversation with an AI using your reference voice!
666
  conv_state.append({"role": "assistant", "content": response})
667
  history[-1] = (text, response)
668
 
669
- return history, conv_state, ""
670
 
671
  @gpu_decorator
672
- def generate_audio_response(history, ref_audio, ref_text, remove_silence):
 
 
673
  """Generate TTS audio for AI response"""
674
  if not history or not ref_audio:
675
- return None
676
 
677
  last_user_message, last_ai_response = history[-1]
678
  if not last_ai_response:
679
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
680
 
681
  audio_result, _, ref_text_out = infer(
682
  ref_audio,
683
  ref_text,
 
684
  last_ai_response,
 
685
  tts_model_choice,
686
  remove_silence,
 
687
  cross_fade_duration=0.15,
688
  speed=1.0,
689
  show_info=print, # show_info=print no pull to top when generating
690
  )
691
- return audio_result, ref_text_out
692
 
693
  def clear_conversation():
694
  """Reset the conversation"""
@@ -704,15 +922,41 @@ Have a conversation with an AI using your reference voice!
704
  new_conv_state = [{"role": "system", "content": new_prompt}]
705
  return [], new_conv_state
706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707
  # Handle audio input
708
  audio_input_chat.stop_recording(
709
  process_audio_input,
710
- inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
711
- outputs=[chatbot_interface, conversation_state],
712
  ).then(
713
  generate_audio_response,
714
- inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
715
- outputs=[audio_output_chat, ref_text_chat],
 
 
 
 
 
 
 
 
716
  ).then(
717
  lambda: None,
718
  None,
@@ -722,31 +966,39 @@ Have a conversation with an AI using your reference voice!
722
  # Handle text input
723
  text_input_chat.submit(
724
  process_audio_input,
725
- inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
726
- outputs=[chatbot_interface, conversation_state],
727
  ).then(
728
  generate_audio_response,
729
- inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
730
- outputs=[audio_output_chat, ref_text_chat],
731
- ).then(
732
- lambda: None,
733
- None,
734
- text_input_chat,
 
 
 
 
735
  )
736
 
737
  # Handle send button
738
  send_btn_chat.click(
739
  process_audio_input,
740
- inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
741
- outputs=[chatbot_interface, conversation_state],
742
  ).then(
743
  generate_audio_response,
744
- inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
745
- outputs=[audio_output_chat, ref_text_chat],
746
- ).then(
747
- lambda: None,
748
- None,
749
- text_input_chat,
 
 
 
 
750
  )
751
 
752
  # Handle clear button
@@ -775,9 +1027,9 @@ This is {"a local web UI for [F5 TTS](https://github.com/SWivid/F5-TTS)" if not
775
 
776
  The checkpoints currently support English and Chinese.
777
 
778
- If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
779
 
780
- **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
781
  """
782
  )
783
 
 
112
  return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
113
 
114
 
115
def read_text_file(file_path):
    """Return the stripped text content of *file_path*, or "" when no path is given.

    Used for the optional .txt uploads (reference text / text to generate);
    a falsy path (None or "") means "no file uploaded".
    """
    if not file_path:
        return ""
    with open(file_path, "r", encoding="utf-8") as handle:
        return handle.read().strip()
121
+
122
+
123
  @gpu_decorator
124
  def infer(
125
  ref_audio_orig,
126
  ref_text,
127
+ ref_text_file,
128
  gen_text,
129
+ gen_text_file,
130
  model,
131
  remove_silence,
132
+ seed,
133
  cross_fade_duration=0.15,
134
  nfe_step=32,
135
  speed=1,
 
139
  gr.Warning("Please provide reference audio.")
140
  return gr.update(), gr.update(), ref_text
141
 
142
+ # Use text from file if provided, otherwise use direct text input
143
+ ref_text = read_text_file(ref_text_file) or ref_text
144
+ gen_text = read_text_file(gen_text_file) or gen_text
145
+
146
  if not gen_text.strip():
147
+ gr.Warning("Please enter text to generate or upload a text file.")
148
  return gr.update(), gr.update(), ref_text
149
 
150
+ # Set random seed for reproducibility
151
+ torch.manual_seed(seed)
152
+ np.random.seed(seed)
153
+ if torch.cuda.is_available():
154
+ torch.cuda.manual_seed_all(seed)
155
+
156
  ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
157
 
158
  if model == DEFAULT_TTS_MODEL:
 
213
  gr.Markdown("# Batched TTS")
214
  ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
215
  gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
216
+ with gr.Column(scale=1):
217
+ gen_text_file = gr.File(label="Upload Text File to Generate (.txt)", file_types=[".txt"])
218
  generate_btn = gr.Button("Synthesize", variant="primary")
219
  with gr.Accordion("Advanced Settings", open=False):
220
+ with gr.Row():
221
+ ref_text_input = gr.Textbox(
222
+ label="Reference Text",
223
+ info="Leave blank to automatically transcribe the reference audio. If you enter text or upload a file, it will override automatic transcription.",
224
+ lines=2,
225
+ )
226
+ with gr.Column(scale=1):
227
+ ref_text_file = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
228
  remove_silence = gr.Checkbox(
229
  label="Remove Silences",
230
  info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
231
  value=False,
232
  )
233
+ with gr.Row():
234
+ randomize_seed = gr.Checkbox(
235
+ label="Randomize Seed",
236
+ value=True,
237
+ info="Check to use a random seed for each generation. Uncheck to use the seed specified below.",
238
+ )
239
+ seed_input = gr.Textbox(
240
+ label="Seed",
241
+ value="0",
242
+ placeholder="Enter a seed value",
243
+ scale=1,
244
+ )
245
  speed_slider = gr.Slider(
246
  label="Speed",
247
  minimum=0.3,
 
253
  nfe_slider = gr.Slider(
254
  label="NFE Steps",
255
  minimum=4,
256
+ maximum=71,
257
  value=32,
258
+ step=1,
259
  info="Set the number of denoising steps.",
260
  )
261
  cross_fade_duration_slider = gr.Slider(
 
270
  audio_output = gr.Audio(label="Synthesized Audio")
271
  spectrogram_output = gr.Image(label="Spectrogram")
272
 
273
@gpu_decorator
def update_gen_text_from_file(file):
    """Mirror an uploaded .txt file into the generate-text textbox."""
    return gr.update(value=read_text_file(file))
278
+
279
@gpu_decorator
def update_ref_text_from_file(file):
    """Mirror an uploaded .txt file into the reference-text textbox."""
    return gr.update(value=read_text_file(file))
284
+
285
@gpu_decorator
def basic_tts(
    ref_audio_input,
    ref_text_input,
    ref_text_file,
    gen_text_input,
    gen_text_file,
    remove_silence,
    randomize_seed,
    seed_input,
    cross_fade_duration_slider,
    nfe_slider,
    speed_slider,
):
    """Run one batched-TTS synthesis pass for the Basic TTS tab.

    Resolves the generation seed first — a fresh random seed when
    "Randomize Seed" is checked, otherwise the user-supplied value
    (falling back to random, with a warning, if it is not a
    non-negative integer) — then delegates synthesis to ``infer``.

    Returns:
        (audio output, spectrogram path, reference text actually used,
        seed as a string so the UI seed box shows what was used).
    """
    # Resolve the seed; None means "still need a random one".
    chosen_seed = None
    if not randomize_seed:
        try:
            parsed = int(seed_input)
        except ValueError:
            gr.Warning("Invalid seed value. Using random seed instead.")
        else:
            if parsed < 0:
                gr.Warning("Seed must be a non-negative integer. Using random seed instead.")
            else:
                chosen_seed = parsed
    if chosen_seed is None:
        chosen_seed = np.random.randint(0, 2**31)

    audio_out, spectrogram_path, ref_text_out = infer(
        ref_audio_input,
        ref_text_input,
        ref_text_file,
        gen_text_input,
        gen_text_file,
        tts_model_choice,
        remove_silence,
        seed=chosen_seed,
        cross_fade_duration=cross_fade_duration_slider,
        nfe_step=nfe_slider,
        speed=speed_slider,
    )
    return audio_out, spectrogram_path, ref_text_out, str(chosen_seed)
326
+
327
+ gen_text_file.change(
328
+ update_gen_text_from_file,
329
+ inputs=[gen_text_file],
330
+ outputs=[gen_text_input],
331
+ )
332
+
333
+ ref_text_file.change(
334
+ update_ref_text_from_file,
335
+ inputs=[ref_text_file],
336
+ outputs=[ref_text_input],
337
+ )
338
 
339
  generate_btn.click(
340
  basic_tts,
341
  inputs=[
342
  ref_audio_input,
343
  ref_text_input,
344
+ ref_text_file,
345
  gen_text_input,
346
+ gen_text_file,
347
  remove_silence,
348
+ randomize_seed,
349
+ seed_input,
350
  cross_fade_duration_slider,
351
  nfe_slider,
352
  speed_slider,
353
  ],
354
+ outputs=[audio_output, spectrogram_output, ref_text_input, seed_input],
355
  )
356
 
357
 
 
386
  """
387
  # Multiple Speech-Type Generation
388
 
389
+ This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, or upload a .txt file with the same format. The system will generate speech using the appropriate type. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
390
  """
391
  )
392
 
393
  with gr.Row():
394
  gr.Markdown(
395
  """
396
+ **Example Input:**
397
+ {Regular} Hello, I'd like to order a sandwich please.
398
+ {Surprised} What do you mean you're out of bread?
399
+ {Sad} I really wanted a sandwich though...
400
+ {Angry} You know what, darn you and your little shop!
401
+ {Whisper} I'll just go back home and cry now.
402
+ {Shouting} Why me?!
403
  """
404
  )
405
 
406
  gr.Markdown(
407
  """
408
+ **Example Input 2:**
409
+ {Speaker1_Happy} Hello, I'd like to order a sandwich please.
410
+ {Speaker2_Regular} Sorry, we're out of bread.
411
+ {Speaker1_Sad} I really wanted a sandwich though...
412
+ {Speaker2_Whisper} I'll give you the last one I was hiding.
413
  """
414
  )
415
 
 
423
  regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
424
  regular_insert = gr.Button("Insert Label", variant="secondary")
425
  regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
426
+ with gr.Row():
427
+ regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
428
+ with gr.Column(scale=1):
429
+ regular_ref_text_file = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
430
 
431
  # Regular speech type (max 100)
432
  max_speech_types = 100
 
434
  speech_type_names = [regular_name]
435
  speech_type_audios = [regular_audio]
436
  speech_type_ref_texts = [regular_ref_text]
437
+ speech_type_ref_text_files = [regular_ref_text_file]
438
  speech_type_delete_btns = [None]
439
  speech_type_insert_btns = [regular_insert]
440
 
 
446
  delete_btn = gr.Button("Delete Type", variant="secondary")
447
  insert_btn = gr.Button("Insert Label", variant="secondary")
448
  audio_input = gr.Audio(label="Reference Audio", type="filepath")
449
+ with gr.Row():
450
+ ref_text_input = gr.Textbox(label="Reference Text", lines=2)
451
+ with gr.Column(scale=1):
452
+ ref_text_file_input = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
453
  speech_type_rows.append(row)
454
  speech_type_names.append(name_input)
455
  speech_type_audios.append(audio_input)
456
  speech_type_ref_texts.append(ref_text_input)
457
+ speech_type_ref_text_files.append(ref_text_file_input)
458
  speech_type_delete_btns.append(delete_btn)
459
  speech_type_insert_btns.append(insert_btn)
460
 
 
479
 
480
  # Function to delete a speech type
481
  def delete_speech_type_fn():
482
+ return gr.update(visible=False), None, None, None, None
483
 
484
+ # Function to update reference text from file
485
+ @gpu_decorator
486
+ def update_ref_text_from_file(file):
487
+ """Update the reference text input when a .txt file is uploaded"""
488
+ text = read_text_file(file)
489
+ return gr.update(value=text)
490
+
491
+ # Update delete button clicks and ref text file changes
492
  for i in range(1, len(speech_type_delete_btns)):
493
  speech_type_delete_btns[i].click(
494
  delete_speech_type_fn,
495
+ outputs=[
496
+ speech_type_rows[i],
497
+ speech_type_names[i],
498
+ speech_type_audios[i],
499
+ speech_type_ref_texts[i],
500
+ speech_type_ref_text_files[i],
501
+ ],
502
+ )
503
+ speech_type_ref_text_files[i].change(
504
+ update_ref_text_from_file,
505
+ inputs=[speech_type_ref_text_files[i]],
506
+ outputs=[speech_type_ref_texts[i]],
507
  )
508
 
509
+ # Update regular speech type ref text file
510
+ regular_ref_text_file.change(
511
+ update_ref_text_from_file,
512
+ inputs=[regular_ref_text_file],
513
+ outputs=[regular_ref_text],
514
+ )
515
+
516
  # Text input for the prompt
517
  gen_text_input_multistyle = gr.Textbox(
518
  label="Text to Generate",
519
  lines=10,
520
  placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
521
  )
522
+ with gr.Column(scale=1):
523
+ gen_text_file_multistyle = gr.File(label="Upload Text File to Generate (.txt)", file_types=[".txt"])
524
 
525
  def make_insert_speech_type_fn(index):
526
  def insert_speech_type_fn(current_text, speech_type_name):
 
544
  label="Remove Silences",
545
  value=True,
546
  )
547
+ with gr.Row():
548
+ randomize_seed_multistyle = gr.Checkbox(
549
+ label="Randomize Seed",
550
+ value=True,
551
+ info="Check to use a random seed for each generation. Uncheck to use the seed specified below.",
552
+ )
553
+ seed_input_multistyle = gr.Textbox(
554
+ label="Seed",
555
+ value="0",
556
+ placeholder="Enter a seed value",
557
+ scale=1,
558
+ )
559
 
560
  # Generate button
561
  generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
 
563
  # Output audio
564
  audio_output_multistyle = gr.Audio(label="Synthesized Audio")
565
 
566
@gpu_decorator
def update_gen_text_from_file(file):
    """Mirror an uploaded .txt script file into the multistyle generate-text textbox."""
    return gr.update(value=read_text_file(file))
571
+
572
+ gen_text_file_multistyle.change(
573
+ fn=lambda file, text, regular, *names: (
574
+ update_gen_text_from_file(file),
575
+ validate_speech_types(text, file, regular, *names),
576
+ ),
577
+ inputs=[gen_text_file_multistyle, gen_text_input_multistyle, regular_name] + speech_type_names,
578
+ outputs=[gen_text_input_multistyle, generate_multistyle_btn],
579
+ )
580
+
581
  @gpu_decorator
582
  def generate_multistyle_speech(
583
  gen_text,
584
+ gen_text_file,
585
+ randomize_seed,
586
+ seed_input,
587
  *args,
588
  ):
589
+ # Determine the seed to use
590
+ if randomize_seed:
591
+ seed = np.random.randint(0, 2**31)
592
+ else:
593
+ try:
594
+ seed = int(seed_input)
595
+ if seed < 0:
596
+ gr.Warning("Seed must be a non-negative integer. Using random seed instead.")
597
+ seed = np.random.randint(0, 2**31)
598
+ except ValueError:
599
+ gr.Warning("Invalid seed value. Using random seed instead.")
600
+ seed = np.random.randint(0, 2**31)
601
+
602
  speech_type_names_list = args[:max_speech_types]
603
  speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
604
  speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
605
+ speech_type_ref_text_files_list = args[3 * max_speech_types : 4 * max_speech_types]
606
+ remove_silence = args[4 * max_speech_types]
607
  # Collect the speech types and their audios into a dict
608
  speech_types = OrderedDict()
609
 
610
+ # Use text from file if provided, otherwise use direct text input
611
+ gen_text = read_text_file(gen_text_file) or gen_text
612
+
613
  ref_text_idx = 0
614
+ for name_input, audio_input, ref_text_input, ref_text_file_input in zip(
615
+ speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list, speech_type_ref_text_files_list
616
  ):
617
+ ref_text = read_text_file(ref_text_file_input) or ref_text_input
618
  if name_input and audio_input:
619
+ speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text}
620
  else:
621
  speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
622
  ref_text_idx += 1
 
642
  ref_audio = speech_types[current_style]["audio"]
643
  except KeyError:
644
  gr.Warning(f"Please provide reference audio for type {current_style}.")
645
+ return [None] + [speech_types[style]["ref_text"] for style in speech_types] + [str(seed)]
646
  ref_text = speech_types[current_style].get("ref_text", "")
647
 
648
  # Generate speech for this segment
649
  audio_out, _, ref_text_out = infer(
650
+ ref_audio, ref_text, None, text, None, tts_model_choice, remove_silence, seed, 0, show_info=print
651
  ) # show_info=print no pull to top when generating
652
  sr, audio_data = audio_out
653
 
 
657
  # Concatenate all audio segments
658
  if generated_audio_segments:
659
  final_audio_data = np.concatenate(generated_audio_segments)
660
+ return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types] + [str(seed)]
661
  else:
662
  gr.Warning("No audio generated.")
663
+ return [None] + [speech_types[style]["ref_text"] for style in speech_types] + [str(seed)]
664
 
665
  generate_multistyle_btn.click(
666
  generate_multistyle_speech,
667
+ inputs=[gen_text_input_multistyle, gen_text_file_multistyle, randomize_seed_multistyle, seed_input_multistyle]
 
 
668
  + speech_type_names
669
  + speech_type_audios
670
  + speech_type_ref_texts
671
+ + speech_type_ref_text_files
672
+ + [remove_silence_multistyle],
673
+ outputs=[audio_output_multistyle] + speech_type_ref_texts + [seed_input_multistyle],
 
674
  )
675
 
676
  # Validation function to disable Generate button if speech types are missing
677
+ def validate_speech_types(gen_text, gen_text_file, regular_name, *args):
678
  speech_type_names_list = args
679
 
680
+ # Use text from file if provided, otherwise use direct text input
681
+ gen_text = read_text_file(gen_text_file) or gen_text
682
+
683
  # Collect the speech types names
684
  speech_types_available = set()
685
  if regular_name:
 
704
 
705
  gen_text_input_multistyle.change(
706
  validate_speech_types,
707
+ inputs=[gen_text_input_multistyle, gen_text_file_multistyle, regular_name] + speech_type_names,
708
  outputs=generate_multistyle_btn,
709
  )
710
 
711
+ gen_text_file_multistyle.change(
712
+ fn=lambda file, text, regular, *names: (
713
+ update_gen_text_from_file(file),
714
+ validate_speech_types(text, file, regular, *names),
715
+ ),
716
+ inputs=[gen_text_file_multistyle, gen_text_input_multistyle, regular_name] + speech_type_names,
717
+ outputs=[gen_text_input_multistyle, generate_multistyle_btn],
718
+ )
719
+
720
 
721
  with gr.Blocks() as app_chat:
722
  gr.Markdown(
723
  """
724
  # Voice Chat
725
+ Have a conversation with an AI using your reference voice!
726
+ 1. Upload a reference audio clip and optionally its transcript (via text or .txt file).
727
  2. Load the chat model.
728
+ 3. Record your message through your microphone or type it.
729
  4. The AI will respond using the reference voice.
730
  """
731
  )
 
785
  label="Remove Silences",
786
  value=True,
787
  )
788
+ with gr.Row():
789
+ ref_text_chat = gr.Textbox(
790
+ label="Reference Text",
791
+ info="Optional: Leave blank to auto-transcribe",
792
+ lines=2,
793
+ )
794
+ with gr.Column(scale=1):
795
+ ref_text_file_chat = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
796
  system_prompt_chat = gr.Textbox(
797
  label="System Prompt",
798
  value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
799
  lines=2,
800
  )
801
+ with gr.Row():
802
+ randomize_seed_chat = gr.Checkbox(
803
+ label="Randomize Seed",
804
+ value=True,
805
+ info="Check to use a random seed for each generation. Uncheck to use the seed specified below.",
806
+ )
807
+ seed_input_chat = gr.Textbox(
808
+ label="Seed",
809
+ value="0",
810
+ placeholder="Enter a seed value",
811
+ scale=1,
812
+ )
813
+
814
+ chatbot_interface = gr.Chatbot(label="Conversation", type="messages")
815
 
816
  with gr.Row():
817
  with gr.Column():
 
825
  label="Type your message",
826
  lines=1,
827
  )
828
+ with gr.Column(scale=1):
829
+ text_file_chat = gr.File(label="Upload Text File (.txt)", file_types=[".txt"])
830
  send_btn_chat = gr.Button("Send Message")
831
  clear_btn_chat = gr.Button("Clear Conversation")
832
 
 
841
 
842
  # Modify process_audio_input to use model and tokenizer from state
843
  @gpu_decorator
844
+ def process_audio_input(audio_path, text, text_file, history, conv_state):
845
+ """Handle audio, text, or file input from user"""
846
+ if not audio_path and not text.strip() and not text_file:
847
+ return history, conv_state, "", None
848
+
849
+ # Use file input if provided, then direct text input, then audio transcription
850
+ if text_file:
851
+ text = read_text_file(text_file)
852
+ elif audio_path:
853
  text = preprocess_ref_audio_text(audio_path, text)[1]
854
 
855
  if not text.strip():
856
+ return history, conv_state, "", None
857
 
858
  conv_state.append({"role": "user", "content": text})
859
  history.append((text, None))
 
863
  conv_state.append({"role": "assistant", "content": response})
864
  history[-1] = (text, response)
865
 
866
+ return history, conv_state, "", None
867
 
868
@gpu_decorator
def generate_audio_response(
    history, ref_audio, ref_text, ref_text_file, remove_silence, randomize_seed, seed_input
):
    """Synthesize speech for the most recent AI reply in the conversation.

    Returns a tuple of (audio result, resolved reference text, seed used as a
    string). When there is nothing to speak (no history, no reference audio,
    or no assistant reply yet) the inputs are passed back unchanged as
    (None, ref_text, seed_input).
    """
    # Nothing to do without both a conversation and a reference voice.
    if not (history and ref_audio):
        return None, ref_text, seed_input

    # NOTE(review): history entries are unpacked as (user, assistant) tuples;
    # confirm this matches the Chatbot component's history format.
    last_user_message, last_ai_response = history[-1]
    if not last_ai_response:  # assistant has not answered yet
        return None, ref_text, seed_input

    # Resolve the generation seed: random when requested, otherwise the
    # user-supplied value, falling back to a random seed on invalid input.
    seed = None
    if not randomize_seed:
        try:
            candidate = int(seed_input)
        except ValueError:
            gr.Warning("Invalid seed value. Using random seed instead.")
        else:
            if candidate < 0:
                gr.Warning("Seed must be a non-negative integer. Using random seed instead.")
            else:
                seed = candidate
    if seed is None:
        seed = np.random.randint(0, 2**31)

    # An uploaded .txt transcript takes precedence over the text box value.
    ref_text = read_text_file(ref_text_file) or ref_text

    audio_result, _, ref_text_out = infer(
        ref_audio,
        ref_text,
        None,
        last_ai_response,
        None,
        tts_model_choice,
        remove_silence,
        seed=seed,
        cross_fade_duration=0.15,
        speed=1.0,
        show_info=print,  # show_info=print no pull to top when generating
    )
    return audio_result, ref_text_out, str(seed)
910
 
911
  def clear_conversation():
912
  """Reset the conversation"""
 
922
  new_conv_state = [{"role": "system", "content": new_prompt}]
923
  return [], new_conv_state
924
 
925
@gpu_decorator
def update_text_from_file(file):
    """Fill the chat message box from an uploaded .txt file.

    Returns the textbox update together with None, which resets the
    file-upload component after its contents are consumed.
    """
    return gr.update(value=read_text_file(file)), None
930
+
931
+ ref_text_file_chat.change(
932
+ update_ref_text_from_file,
933
+ inputs=[ref_text_file_chat],
934
+ outputs=[ref_text_chat],
935
+ )
936
+
937
+ text_file_chat.change(
938
+ update_text_from_file,
939
+ inputs=[text_file_chat],
940
+ outputs=[text_input_chat, text_file_chat],
941
+ )
942
+
943
  # Handle audio input
944
  audio_input_chat.stop_recording(
945
  process_audio_input,
946
+ inputs=[audio_input_chat, text_input_chat, text_file_chat, chatbot_interface, conversation_state],
947
+ outputs=[chatbot_interface, conversation_state, text_input_chat, text_file_chat],
948
  ).then(
949
  generate_audio_response,
950
+ inputs=[
951
+ chatbot_interface,
952
+ ref_audio_chat,
953
+ ref_text_chat,
954
+ ref_text_file_chat,
955
+ remove_silence_chat,
956
+ randomize_seed_chat,
957
+ seed_input_chat,
958
+ ],
959
+ outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
960
  ).then(
961
  lambda: None,
962
  None,
 
966
  # Handle text input
967
  text_input_chat.submit(
968
  process_audio_input,
969
+ inputs=[audio_input_chat, text_input_chat, text_file_chat, chatbot_interface, conversation_state],
970
+ outputs=[chatbot_interface, conversation_state, text_input_chat, text_file_chat],
971
  ).then(
972
  generate_audio_response,
973
+ inputs=[
974
+ chatbot_interface,
975
+ ref_audio_chat,
976
+ ref_text_chat,
977
+ ref_text_file_chat,
978
+ remove_silence_chat,
979
+ randomize_seed_chat,
980
+ seed_input_chat,
981
+ ],
982
+ outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
983
  )
984
 
985
  # Handle send button
986
  send_btn_chat.click(
987
  process_audio_input,
988
+ inputs=[audio_input_chat, text_input_chat, text_file_chat, chatbot_interface, conversation_state],
989
+ outputs=[chatbot_interface, conversation_state, text_input_chat, text_file_chat],
990
  ).then(
991
  generate_audio_response,
992
+ inputs=[
993
+ chatbot_interface,
994
+ ref_audio_chat,
995
+ ref_text_chat,
996
+ ref_text_file_chat,
997
+ remove_silence_chat,
998
+ randomize_seed_chat,
999
+ seed_input_chat,
1000
+ ],
1001
+ outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
1002
  )
1003
 
1004
  # Handle clear button
 
1027
 
1028
  The checkpoints currently support English and Chinese.
1029
 
1030
+ If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
1031
 
1032
+ **NOTE: Reference text will be automatically transcribed with Whisper if not provided via text or .txt file. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
1033
  """
1034
  )
1035