This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
app.py CHANGED
@@ -112,13 +112,24 @@ def generate_response(messages, model, tokenizer):
     return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
 
+def read_text_file(file_path):
+    """Read content from a .txt file"""
+    if file_path:
+        with open(file_path, "r", encoding="utf-8") as f:
+            return f.read().strip()
+    return ""
+
+
 @gpu_decorator
 def infer(
     ref_audio_orig,
     ref_text,
+    ref_text_file,
     gen_text,
+    gen_text_file,
     model,
     remove_silence,
+    seed,
     cross_fade_duration=0.15,
     nfe_step=32,
     speed=1,
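A note on the hunk above: the new read_text_file helper short-circuits on an empty file value (no upload) and strips surrounding whitespace, which is why later call sites can write `read_text_file(f) or fallback`. A minimal standalone sketch of that behavior (the temporary file here is only for illustration):

import tempfile

def read_text_file(file_path):
    """Read content from a .txt file"""
    if file_path:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    return ""

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as tmp:
    tmp.write("  Hello there.  \n")

print(read_text_file(tmp.name))  # -> "Hello there."
print(read_text_file(None))      # -> "" (no file uploaded)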
@@ -128,10 +139,20 @@ def infer(
         gr.Warning("Please provide reference audio.")
         return gr.update(), gr.update(), ref_text
 
+    # Use text from file if provided, otherwise use direct text input
+    ref_text = read_text_file(ref_text_file) or ref_text
+    gen_text = read_text_file(gen_text_file) or gen_text
+
     if not gen_text.strip():
-        gr.Warning("Please enter text to generate.")
+        gr.Warning("Please enter text to generate or upload a text file.")
         return gr.update(), gr.update(), ref_text
 
+    # Set random seed for reproducibility
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
     ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
 
     if model == DEFAULT_TTS_MODEL:
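The seeding block above is what makes the new seed parameter meaningful: re-seeding torch, NumPy, and (when present) all CUDA devices before sampling means the same seed reproduces the same generation. A minimal sketch of the pattern:

import numpy as np
import torch

def set_seed(seed: int):
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)
a = torch.randn(3)
set_seed(42)
assert torch.equal(a, torch.randn(3))  # identical noise after re-seeding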
@@ -192,18 +213,35 @@ with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
+    with gr.Column(scale=1):
+        gen_text_file = gr.File(label="Upload Text File to Generate (.txt)", file_types=[".txt"])
     generate_btn = gr.Button("Synthesize", variant="primary")
     with gr.Accordion("Advanced Settings", open=False):
-        ref_text_input = gr.Textbox(
-            label="Reference Text",
-            info="Leave blank to automatically transcribe the reference audio. If you enter text it will override automatic transcription.",
-            lines=2,
-        )
+        with gr.Row():
+            ref_text_input = gr.Textbox(
+                label="Reference Text",
+                info="Leave blank to automatically transcribe the reference audio. If you enter text or upload a file, it will override automatic transcription.",
+                lines=2,
+            )
+            with gr.Column(scale=1):
+                ref_text_file = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
         remove_silence = gr.Checkbox(
             label="Remove Silences",
             info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
             value=False,
         )
+        with gr.Row():
+            randomize_seed = gr.Checkbox(
+                label="Randomize Seed",
+                value=True,
+                info="Check to use a random seed for each generation. Uncheck to use the seed specified below.",
+            )
+            seed_input = gr.Textbox(
+                label="Seed",
+                value="0",
+                placeholder="Enter a seed value",
+                scale=1,
+            )
         speed_slider = gr.Slider(
             label="Speed",
             minimum=0.3,
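The gr.File components introduced here are mirrored into their textboxes through .change handlers, wired further down in this diff. A self-contained sketch of that pattern, assuming (as the app's own read_text_file usage implies) that the file value arrives as a path string:

import gradio as gr

def read_text_file(file_path):
    if file_path:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    return ""

with gr.Blocks() as demo:
    txt_file = gr.File(label="Upload Text File to Generate (.txt)", file_types=[".txt"])
    txt_box = gr.Textbox(label="Text to Generate", lines=4)
    # Uploading (or clearing) the file mirrors its contents into the textbox.
    txt_file.change(lambda f: gr.update(value=read_text_file(f)), inputs=txt_file, outputs=txt_box)

demo.launch()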
@@ -215,9 +253,9 @@ with gr.Blocks() as app_tts:
         nfe_slider = gr.Slider(
             label="NFE Steps",
             minimum=4,
-            maximum=64,
+            maximum=71,
             value=32,
-            step=2,
+            step=1,
             info="Set the number of denoising steps.",
         )
         cross_fade_duration_slider = gr.Slider(
@@ -232,40 +270,88 @@ with gr.Blocks() as app_tts:
     audio_output = gr.Audio(label="Synthesized Audio")
     spectrogram_output = gr.Image(label="Spectrogram")
 
+    @gpu_decorator
+    def update_gen_text_from_file(file):
+        """Update the generate text input when a .txt file is uploaded"""
+        text = read_text_file(file)
+        return gr.update(value=text)
+
+    @gpu_decorator
+    def update_ref_text_from_file(file):
+        """Update the reference text input when a .txt file is uploaded"""
+        text = read_text_file(file)
+        return gr.update(value=text)
+
     @gpu_decorator
     def basic_tts(
         ref_audio_input,
         ref_text_input,
+        ref_text_file,
         gen_text_input,
+        gen_text_file,
         remove_silence,
+        randomize_seed,
+        seed_input,
         cross_fade_duration_slider,
         nfe_slider,
         speed_slider,
     ):
+        # Determine the seed to use
+        if randomize_seed:
+            seed = np.random.randint(0, 2**31)
+        else:
+            try:
+                seed = int(seed_input)
+                if seed < 0:
+                    gr.Warning("Seed must be a non-negative integer. Using random seed instead.")
+                    seed = np.random.randint(0, 2**31)
+            except ValueError:
+                gr.Warning("Invalid seed value. Using random seed instead.")
+                seed = np.random.randint(0, 2**31)
+
         audio_out, spectrogram_path, ref_text_out = infer(
             ref_audio_input,
             ref_text_input,
+            ref_text_file,
             gen_text_input,
+            gen_text_file,
             tts_model_choice,
             remove_silence,
+            seed=seed,
             cross_fade_duration=cross_fade_duration_slider,
             nfe_step=nfe_slider,
             speed=speed_slider,
         )
-        return audio_out, spectrogram_path, ref_text_out
+        return audio_out, spectrogram_path, ref_text_out, str(seed)
+
+    gen_text_file.change(
+        update_gen_text_from_file,
+        inputs=[gen_text_file],
+        outputs=[gen_text_input],
+    )
+
+    ref_text_file.change(
+        update_ref_text_from_file,
+        inputs=[ref_text_file],
+        outputs=[ref_text_input],
+    )
 
     generate_btn.click(
         basic_tts,
         inputs=[
             ref_audio_input,
             ref_text_input,
+            ref_text_file,
             gen_text_input,
+            gen_text_file,
             remove_silence,
+            randomize_seed,
+            seed_input,
             cross_fade_duration_slider,
             nfe_slider,
             speed_slider,
         ],
-        outputs=[audio_output, spectrogram_output, ref_text_input],
+        outputs=[audio_output, spectrogram_output, ref_text_input, seed_input],
     )
 
 
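The randomize-seed branch in basic_tts reduces to a small rule: take a random seed when the checkbox is on, otherwise parse the textbox and fall back to a random seed on negative or non-numeric input (np.random.randint's upper bound 2**31 is exclusive). Extracted as a sketch, with the gr.Warning calls omitted since they need a running Gradio context:

import numpy as np

def resolve_seed(randomize_seed: bool, seed_input: str) -> int:
    if randomize_seed:
        return int(np.random.randint(0, 2**31))
    try:
        seed = int(seed_input)
    except ValueError:
        return int(np.random.randint(0, 2**31))  # non-numeric input
    if seed < 0:
        return int(np.random.randint(0, 2**31))  # negative input
    return seed

print(resolve_seed(False, "123"))   # -> 123
print(resolve_seed(False, "oops"))  # -> random fallback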
@@ -300,30 +386,30 @@ with gr.Blocks() as app_multistyle:
         """
     # Multiple Speech-Type Generation
 
-    This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, and the system will generate speech using the appropriate type. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
+    This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, or upload a .txt file with the same format. The system will generate speech using the appropriate type. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
     """
     )
 
     with gr.Row():
         gr.Markdown(
             """
-            **Example Input:**
-            {Regular} Hello, I'd like to order a sandwich please.
-            {Surprised} What do you mean you're out of bread?
-            {Sad} I really wanted a sandwich though...
-            {Angry} You know what, darn you and your little shop!
-            {Whisper} I'll just go back home and cry now.
-            {Shouting} Why me?!
+            **Example Input:**
+            {Regular} Hello, I'd like to order a sandwich please.
+            {Surprised} What do you mean you're out of bread?
+            {Sad} I really wanted a sandwich though...
+            {Angry} You know what, darn you and your little shop!
+            {Whisper} I'll just go back home and cry now.
+            {Shouting} Why me?!
             """
         )
 
         gr.Markdown(
             """
-            **Example Input 2:**
-            {Speaker1_Happy} Hello, I'd like to order a sandwich please.
-            {Speaker2_Regular} Sorry, we're out of bread.
-            {Speaker1_Sad} I really wanted a sandwich though...
-            {Speaker2_Whisper} I'll give you the last one I was hiding.
+            **Example Input 2:**
+            {Speaker1_Happy} Hello, I'd like to order a sandwich please.
+            {Speaker2_Regular} Sorry, we're out of bread.
+            {Speaker1_Sad} I really wanted a sandwich though...
+            {Speaker2_Whisper} I'll give you the last one I was hiding.
             """
         )
 
@@ -337,7 +423,10 @@ with gr.Blocks() as app_multistyle:
             regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
             regular_insert = gr.Button("Insert Label", variant="secondary")
         regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
-        regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
+        with gr.Row():
+            regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
+            with gr.Column(scale=1):
+                regular_ref_text_file = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
 
     # Regular speech type (max 100)
     max_speech_types = 100
@@ -345,6 +434,7 @@ with gr.Blocks() as app_multistyle:
     speech_type_names = [regular_name]
     speech_type_audios = [regular_audio]
     speech_type_ref_texts = [regular_ref_text]
+    speech_type_ref_text_files = [regular_ref_text_file]
     speech_type_delete_btns = [None]
     speech_type_insert_btns = [regular_insert]
 
@@ -356,11 +446,15 @@ with gr.Blocks() as app_multistyle:
                 delete_btn = gr.Button("Delete Type", variant="secondary")
                 insert_btn = gr.Button("Insert Label", variant="secondary")
             audio_input = gr.Audio(label="Reference Audio", type="filepath")
-            ref_text_input = gr.Textbox(label="Reference Text", lines=2)
+            with gr.Row():
+                ref_text_input = gr.Textbox(label="Reference Text", lines=2)
+                with gr.Column(scale=1):
+                    ref_text_file_input = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
         speech_type_rows.append(row)
         speech_type_names.append(name_input)
         speech_type_audios.append(audio_input)
         speech_type_ref_texts.append(ref_text_input)
+        speech_type_ref_text_files.append(ref_text_file_input)
         speech_type_delete_btns.append(delete_btn)
         speech_type_insert_btns.append(insert_btn)
 
@@ -385,21 +479,48 @@ with gr.Blocks() as app_multistyle:
 
     # Function to delete a speech type
     def delete_speech_type_fn():
-        return gr.update(visible=False), None, None, None
+        return gr.update(visible=False), None, None, None, None
 
-    # Update delete button clicks
+    # Function to update reference text from file
+    @gpu_decorator
+    def update_ref_text_from_file(file):
+        """Update the reference text input when a .txt file is uploaded"""
+        text = read_text_file(file)
+        return gr.update(value=text)
+
+    # Update delete button clicks and ref text file changes
     for i in range(1, len(speech_type_delete_btns)):
         speech_type_delete_btns[i].click(
             delete_speech_type_fn,
-            outputs=[speech_type_rows[i], speech_type_names[i], speech_type_audios[i], speech_type_ref_texts[i]],
+            outputs=[
+                speech_type_rows[i],
+                speech_type_names[i],
+                speech_type_audios[i],
+                speech_type_ref_texts[i],
+                speech_type_ref_text_files[i],
+            ],
+        )
+        speech_type_ref_text_files[i].change(
+            update_ref_text_from_file,
+            inputs=[speech_type_ref_text_files[i]],
+            outputs=[speech_type_ref_texts[i]],
         )
 
+    # Update regular speech type ref text file
+    regular_ref_text_file.change(
+        update_ref_text_from_file,
+        inputs=[regular_ref_text_file],
+        outputs=[regular_ref_text],
+    )
+
     # Text input for the prompt
     gen_text_input_multistyle = gr.Textbox(
         label="Text to Generate",
         lines=10,
         placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
     )
+    with gr.Column(scale=1):
+        gen_text_file_multistyle = gr.File(label="Upload Text File to Generate (.txt)", file_types=[".txt"])
 
     def make_insert_speech_type_fn(index):
         def insert_speech_type_fn(current_text, speech_type_name):
@@ -423,6 +544,18 @@ with gr.Blocks() as app_multistyle:
         label="Remove Silences",
         value=True,
     )
+    with gr.Row():
+        randomize_seed_multistyle = gr.Checkbox(
+            label="Randomize Seed",
+            value=True,
+            info="Check to use a random seed for each generation. Uncheck to use the seed specified below.",
+        )
+        seed_input_multistyle = gr.Textbox(
+            label="Seed",
+            value="0",
+            placeholder="Enter a seed value",
+            scale=1,
+        )
 
     # Generate button
     generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
@@ -430,24 +563,60 @@ with gr.Blocks() as app_multistyle:
     # Output audio
     audio_output_multistyle = gr.Audio(label="Synthesized Audio")
 
+    @gpu_decorator
+    def update_gen_text_from_file(file):
+        """Update the generate text input when a .txt file is uploaded"""
+        text = read_text_file(file)
+        return gr.update(value=text)
+
+    gen_text_file_multistyle.change(
+        fn=lambda file, text, regular, *names: (
+            update_gen_text_from_file(file),
+            validate_speech_types(text, file, regular, *names),
+        ),
+        inputs=[gen_text_file_multistyle, gen_text_input_multistyle, regular_name] + speech_type_names,
+        outputs=[gen_text_input_multistyle, generate_multistyle_btn],
+    )
+
     @gpu_decorator
     def generate_multistyle_speech(
         gen_text,
+        gen_text_file,
+        randomize_seed,
+        seed_input,
         *args,
     ):
+        # Determine the seed to use
+        if randomize_seed:
+            seed = np.random.randint(0, 2**31)
+        else:
+            try:
+                seed = int(seed_input)
+                if seed < 0:
+                    gr.Warning("Seed must be a non-negative integer. Using random seed instead.")
+                    seed = np.random.randint(0, 2**31)
+            except ValueError:
+                gr.Warning("Invalid seed value. Using random seed instead.")
+                seed = np.random.randint(0, 2**31)
+
         speech_type_names_list = args[:max_speech_types]
         speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
         speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
-        remove_silence = args[3 * max_speech_types]
+        speech_type_ref_text_files_list = args[3 * max_speech_types : 4 * max_speech_types]
+        remove_silence = args[4 * max_speech_types]
         # Collect the speech types and their audios into a dict
         speech_types = OrderedDict()
 
+        # Use text from file if provided, otherwise use direct text input
+        gen_text = read_text_file(gen_text_file) or gen_text
+
         ref_text_idx = 0
-        for name_input, audio_input, ref_text_input in zip(
-            speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
+        for name_input, audio_input, ref_text_input, ref_text_file_input in zip(
+            speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list, speech_type_ref_text_files_list
         ):
+            ref_text = read_text_file(ref_text_file_input) or ref_text_input
             if name_input and audio_input:
-                speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
+                speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text}
             else:
                 speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
             ref_text_idx += 1
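generate_multistyle_speech receives one positional value per registered input component, so the flat *args tuple is 100 names, then 100 audios, then 100 reference texts, then (new in this diff) 100 reference-text files, then the remove-silence flag. A sketch of the slicing with toy values:

max_speech_types = 100

def split_args(args):
    names     = args[:max_speech_types]
    audios    = args[max_speech_types : 2 * max_speech_types]
    ref_texts = args[2 * max_speech_types : 3 * max_speech_types]
    ref_files = args[3 * max_speech_types : 4 * max_speech_types]
    remove_silence = args[4 * max_speech_types]
    return names, audios, ref_texts, ref_files, remove_silence

# 401 values: four blocks of 100, plus the trailing checkbox value.
demo_args = tuple(range(400)) + (True,)
names, audios, ref_texts, ref_files, rs = split_args(demo_args)
assert names[0] == 0 and ref_files[0] == 300 and rs is True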
@@ -473,12 +642,12 @@ with gr.Blocks() as app_multistyle:
                 ref_audio = speech_types[current_style]["audio"]
             except KeyError:
                 gr.Warning(f"Please provide reference audio for type {current_style}.")
-                return [None] + [speech_types[style]["ref_text"] for style in speech_types]
+                return [None] + [speech_types[style]["ref_text"] for style in speech_types] + [str(seed)]
             ref_text = speech_types[current_style].get("ref_text", "")
 
             # Generate speech for this segment
             audio_out, _, ref_text_out = infer(
-                ref_audio, ref_text, text, tts_model_choice, remove_silence, 0, show_info=print
+                ref_audio, ref_text, None, text, None, tts_model_choice, remove_silence, seed, 0, show_info=print
             )  # show_info=print no pull to top when generating
             sr, audio_data = audio_out
 
@@ -488,29 +657,29 @@ with gr.Blocks() as app_multistyle:
         # Concatenate all audio segments
         if generated_audio_segments:
             final_audio_data = np.concatenate(generated_audio_segments)
-            return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types]
+            return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types] + [str(seed)]
         else:
             gr.Warning("No audio generated.")
-            return [None] + [speech_types[style]["ref_text"] for style in speech_types]
+            return [None] + [speech_types[style]["ref_text"] for style in speech_types] + [str(seed)]
 
     generate_multistyle_btn.click(
         generate_multistyle_speech,
-        inputs=[
-            gen_text_input_multistyle,
-        ]
+        inputs=[gen_text_input_multistyle, gen_text_file_multistyle, randomize_seed_multistyle, seed_input_multistyle]
         + speech_type_names
         + speech_type_audios
         + speech_type_ref_texts
-        + [
-            remove_silence_multistyle,
-        ],
-        outputs=[audio_output_multistyle] + speech_type_ref_texts,
+        + speech_type_ref_text_files
+        + [remove_silence_multistyle],
+        outputs=[audio_output_multistyle] + speech_type_ref_texts + [seed_input_multistyle],
     )
 
     # Validation function to disable Generate button if speech types are missing
-    def validate_speech_types(gen_text, regular_name, *args):
+    def validate_speech_types(gen_text, gen_text_file, regular_name, *args):
         speech_type_names_list = args
 
+        # Use text from file if provided, otherwise use direct text input
+        gen_text = read_text_file(gen_text_file) or gen_text
+
         # Collect the speech types names
         speech_types_available = set()
         if regular_name:
@@ -535,19 +704,28 @@ with gr.Blocks() as app_multistyle:
 
     gen_text_input_multistyle.change(
         validate_speech_types,
-        inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
+        inputs=[gen_text_input_multistyle, gen_text_file_multistyle, regular_name] + speech_type_names,
         outputs=generate_multistyle_btn,
     )
 
+    gen_text_file_multistyle.change(
+        fn=lambda file, text, regular, *names: (
+            update_gen_text_from_file(file),
+            validate_speech_types(text, file, regular, *names),
+        ),
+        inputs=[gen_text_file_multistyle, gen_text_input_multistyle, regular_name] + speech_type_names,
+        outputs=[gen_text_input_multistyle, generate_multistyle_btn],
+    )
+
 
 with gr.Blocks() as app_chat:
     gr.Markdown(
         """
 # Voice Chat
-Have a conversation with an AI using your reference voice!
-1. Upload a reference audio clip and optionally its transcript.
+Have a conversation with an AI using your reference voice!
+1. Upload a reference audio clip and optionally its transcript (via text or .txt file).
 2. Load the chat model.
-3. Record your message through your microphone.
+3. Record your message through your microphone or type it.
 4. The AI will respond using the reference voice.
 """
     )
@@ -607,18 +785,33 @@ Have a conversation with an AI using your reference voice!
                     label="Remove Silences",
                     value=True,
                 )
-                ref_text_chat = gr.Textbox(
-                    label="Reference Text",
-                    info="Optional: Leave blank to auto-transcribe",
-                    lines=2,
-                )
+                with gr.Row():
+                    ref_text_chat = gr.Textbox(
+                        label="Reference Text",
+                        info="Optional: Leave blank to auto-transcribe",
+                        lines=2,
+                    )
+                    with gr.Column(scale=1):
+                        ref_text_file_chat = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
                 system_prompt_chat = gr.Textbox(
                     label="System Prompt",
                     value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
                     lines=2,
                 )
-
-    chatbot_interface = gr.Chatbot(label="Conversation")
+                with gr.Row():
+                    randomize_seed_chat = gr.Checkbox(
+                        label="Randomize Seed",
+                        value=True,
+                        info="Check to use a random seed for each generation. Uncheck to use the seed specified below.",
+                    )
+                    seed_input_chat = gr.Textbox(
+                        label="Seed",
+                        value="0",
+                        placeholder="Enter a seed value",
+                        scale=1,
+                    )
+
+    chatbot_interface = gr.Chatbot(label="Conversation", type="messages")
 
     with gr.Row():
         with gr.Column():
@@ -632,6 +825,8 @@ Have a conversation with an AI using your reference voice!
                 label="Type your message",
                 lines=1,
             )
+            with gr.Column(scale=1):
+                text_file_chat = gr.File(label="Upload Text File (.txt)", file_types=[".txt"])
             send_btn_chat = gr.Button("Send Message")
             clear_btn_chat = gr.Button("Clear Conversation")
 
@@ -646,17 +841,19 @@ Have a conversation with an AI using your reference voice!
 
     # Modify process_audio_input to use model and tokenizer from state
    @gpu_decorator
-    def process_audio_input(audio_path, text, history, conv_state):
-        """Handle audio or text input from user"""
-
-        if not audio_path and not text.strip():
-            return history, conv_state, ""
-
-        if audio_path:
+    def process_audio_input(audio_path, text, text_file, history, conv_state):
+        """Handle audio, text, or file input from user"""
+        if not audio_path and not text.strip() and not text_file:
+            return history, conv_state, "", None
+
+        # Use file input if provided, then direct text input, then audio transcription
+        if text_file:
+            text = read_text_file(text_file)
+        elif audio_path:
             text = preprocess_ref_audio_text(audio_path, text)[1]
 
         if not text.strip():
-            return history, conv_state, ""
+            return history, conv_state, "", None
 
         conv_state.append({"role": "user", "content": text})
         history.append((text, None))
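The rewritten process_audio_input establishes a clear input priority: an uploaded .txt file wins, otherwise a recording is transcribed (preprocess_ref_audio_text keeps any text already supplied), otherwise the typed text is used as-is. A sketch of the rule, where read_text_file and transcribe are passed in as stand-ins (transcribe is hypothetical, standing in for preprocess_ref_audio_text):

def pick_user_text(audio_path, text, text_file, read_text_file, transcribe):
    if text_file:                       # 1. uploaded .txt file wins
        return read_text_file(text_file)
    if audio_path:                      # 2. else derive text from the recording
        return transcribe(audio_path, text)
    return text                         # 3. else keep the typed text

# Toy stand-ins to exercise the rule:
print(pick_user_text(None, "typed", None,
                     lambda p: "from file", lambda a, t: "transcript"))  # -> "typed"
print(pick_user_text("clip.wav", "", "notes.txt",
                     lambda p: "from file", lambda a, t: "transcript"))  # -> "from file"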
@@ -666,29 +863,50 @@ Have a conversation with an AI using your reference voice!
         conv_state.append({"role": "assistant", "content": response})
         history[-1] = (text, response)
 
-        return history, conv_state, ""
+        return history, conv_state, "", None
 
     @gpu_decorator
-    def generate_audio_response(history, ref_audio, ref_text, remove_silence):
+    def generate_audio_response(
+        history, ref_audio, ref_text, ref_text_file, remove_silence, randomize_seed, seed_input
+    ):
         """Generate TTS audio for AI response"""
         if not history or not ref_audio:
-            return None
+            return None, ref_text, seed_input
 
         last_user_message, last_ai_response = history[-1]
         if not last_ai_response:
-            return None
+            return None, ref_text, seed_input
+
+        # Determine the seed to use
+        if randomize_seed:
+            seed = np.random.randint(0, 2**31)
+        else:
+            try:
+                seed = int(seed_input)
+                if seed < 0:
+                    gr.Warning("Seed must be a non-negative integer. Using random seed instead.")
+                    seed = np.random.randint(0, 2**31)
+            except ValueError:
+                gr.Warning("Invalid seed value. Using random seed instead.")
+                seed = np.random.randint(0, 2**31)
+
+        # Use text from file if provided, otherwise use direct text input
+        ref_text = read_text_file(ref_text_file) or ref_text
 
         audio_result, _, ref_text_out = infer(
             ref_audio,
             ref_text,
+            None,
             last_ai_response,
+            None,
             tts_model_choice,
             remove_silence,
+            seed=seed,
             cross_fade_duration=0.15,
             speed=1.0,
             show_info=print,  # show_info=print no pull to top when generating
         )
-        return audio_result, ref_text_out
+        return audio_result, ref_text_out, str(seed)
 
     def clear_conversation():
         """Reset the conversation"""
@@ -704,15 +922,41 @@ Have a conversation with an AI using your reference voice!
         new_conv_state = [{"role": "system", "content": new_prompt}]
         return [], new_conv_state
 
+    @gpu_decorator
+    def update_text_from_file(file):
+        """Update the text input when a .txt file is uploaded"""
+        text = read_text_file(file)
+        return gr.update(value=text), None
+
+    ref_text_file_chat.change(
+        update_ref_text_from_file,
+        inputs=[ref_text_file_chat],
+        outputs=[ref_text_chat],
+    )
+
+    text_file_chat.change(
+        update_text_from_file,
+        inputs=[text_file_chat],
+        outputs=[text_input_chat, text_file_chat],
+    )
+
     # Handle audio input
     audio_input_chat.stop_recording(
         process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
+        inputs=[audio_input_chat, text_input_chat, text_file_chat, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, text_input_chat, text_file_chat],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-        outputs=[audio_output_chat, ref_text_chat],
+        inputs=[
+            chatbot_interface,
+            ref_audio_chat,
+            ref_text_chat,
+            ref_text_file_chat,
+            remove_silence_chat,
+            randomize_seed_chat,
+            seed_input_chat,
+        ],
+        outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
     ).then(
         lambda: None,
         None,
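All three chat triggers use the same Gradio .then() chain: process the user input, then synthesize the reply, then clear the recording. Each step starts only after the previous one finishes. A minimal, hypothetical demo of the pattern:

import gradio as gr

with gr.Blocks() as demo:
    box = gr.Textbox(label="Message")
    out = gr.Textbox(label="Reply")
    btn = gr.Button("Send")
    btn.click(lambda t: t.upper(), box, out).then(
        lambda: None, None, box  # clear the input once the reply is shown
    )

demo.launch()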
@@ -722,31 +966,39 @@ Have a conversation with an AI using your reference voice!
     # Handle text input
     text_input_chat.submit(
         process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
+        inputs=[audio_input_chat, text_input_chat, text_file_chat, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, text_input_chat, text_file_chat],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-        outputs=[audio_output_chat, ref_text_chat],
-    ).then(
-        lambda: None,
-        None,
-        text_input_chat,
+        inputs=[
+            chatbot_interface,
+            ref_audio_chat,
+            ref_text_chat,
+            ref_text_file_chat,
+            remove_silence_chat,
+            randomize_seed_chat,
+            seed_input_chat,
+        ],
+        outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
     )
 
     # Handle send button
     send_btn_chat.click(
         process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state],
+        inputs=[audio_input_chat, text_input_chat, text_file_chat, chatbot_interface, conversation_state],
+        outputs=[chatbot_interface, conversation_state, text_input_chat, text_file_chat],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
-        outputs=[audio_output_chat, ref_text_chat],
-    ).then(
-        lambda: None,
-        None,
-        text_input_chat,
+        inputs=[
+            chatbot_interface,
+            ref_audio_chat,
+            ref_text_chat,
+            ref_text_file_chat,
+            remove_silence_chat,
+            randomize_seed_chat,
+            seed_input_chat,
+        ],
+        outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
     )
 
     # Handle clear button
@@ -775,9 +1027,9 @@ This is {"a local web UI for [F5 TTS](https://github.com/SWivid/F5-TTS)" if not
 
 The checkpoints currently support English and Chinese.
 
-If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
+If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
 
-**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
+**NOTE: Reference text will be automatically transcribed with Whisper if not provided via text or .txt file. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
 """
     )
 