Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
Files changed:
- app.py (+46, -19)
- src/f5_tts/infer/utils_infer.py (+13, -12)
app.py CHANGED

```diff
@@ -51,6 +51,8 @@ E2TTS_ema_model = load_model(
     UNetT, E2TTS_model_cfg, str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
 )
 
+DEFAULT_TTS_MODEL = "F5-TTS"
+tts_model_choice = DEFAULT_TTS_MODEL
 chat_model_state = None
 chat_tokenizer_state = None
 
@@ -129,7 +131,6 @@ with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
-    model_choice = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
     generate_btn = gr.Button("Synthesize", variant="primary")
     with gr.Accordion("Advanced Settings", open=False):
         ref_text_input = gr.Textbox(
@@ -162,13 +163,31 @@ with gr.Blocks() as app_tts:
     audio_output = gr.Audio(label="Synthesized Audio")
     spectrogram_output = gr.Image(label="Spectrogram")
 
+    @gpu_decorator
+    def basic_tts(
+        ref_audio_input,
+        ref_text_input,
+        gen_text_input,
+        remove_silence,
+        cross_fade_duration_slider,
+        speed_slider,
+    ):
+        return infer(
+            ref_audio_input,
+            ref_text_input,
+            gen_text_input,
+            tts_model_choice,
+            remove_silence,
+            cross_fade_duration_slider,
+            speed_slider,
+        )
+
     generate_btn.click(
-        infer,
+        basic_tts,
         inputs=[
             ref_audio_input,
             ref_text_input,
             gen_text_input,
-            model_choice,
             remove_silence,
             cross_fade_duration_slider,
             speed_slider,
@@ -345,9 +364,6 @@ with gr.Blocks() as app_multistyle:
         outputs=gen_text_input_multistyle,
     )
 
-    # Model choice
-    model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
-
     with gr.Accordion("Advanced Settings", open=False):
         remove_silence_multistyle = gr.Checkbox(
             label="Remove Silences",
@@ -371,7 +387,6 @@ with gr.Blocks() as app_multistyle:
         speech_type_names_list = args[:num_additional_speech_types]
         speech_type_audios_list = args[num_additional_speech_types : 2 * num_additional_speech_types]
         speech_type_ref_texts_list = args[2 * num_additional_speech_types : 3 * num_additional_speech_types]
-        model_choice = args[3 * num_additional_speech_types + 1]
         remove_silence = args[3 * num_additional_speech_types + 1]
 
         # Collect the speech types and their audios into a dict
@@ -405,7 +420,7 @@ with gr.Blocks() as app_multistyle:
 
             # Generate speech for this segment
             audio, _ = infer(
-                ref_audio, ref_text, text, model_choice, remove_silence, 0, show_info=print
+                ref_audio, ref_text, text, tts_model_choice, remove_silence, 0, show_info=print
             )  # show_info=print no pull to top when generating
             sr, audio_data = audio
 
@@ -430,7 +445,6 @@ with gr.Blocks() as app_multistyle:
         + speech_type_audios
         + speech_type_ref_texts
         + [
-            model_choice_multistyle,
             remove_silence_multistyle,
         ],
         outputs=audio_output_multistyle,
@@ -518,11 +532,6 @@ Have a conversation with an AI using your reference voice!
             ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
         with gr.Column():
             with gr.Accordion("Advanced Settings", open=False):
-                model_choice_chat = gr.Radio(
-                    choices=["F5-TTS", "E2-TTS"],
-                    label="TTS Model",
-                    value="F5-TTS",
-                )
                 remove_silence_chat = gr.Checkbox(
                     label="Remove Silences",
                     value=True,
@@ -589,7 +598,7 @@ Have a conversation with an AI using your reference voice!
         return history, conv_state, ""
 
     @gpu_decorator
-    def generate_audio_response(history, ref_audio, ref_text, model_choice, remove_silence):
+    def generate_audio_response(history, ref_audio, ref_text, remove_silence):
         """Generate TTS audio for AI response"""
         if not history or not ref_audio:
             return None
@@ -602,7 +611,7 @@ Have a conversation with an AI using your reference voice!
             ref_audio,
             ref_text,
             last_ai_response,
-            model_choice,
+            tts_model_choice,
             remove_silence,
             cross_fade_duration=0.15,
             speed=1.0,
@@ -631,7 +640,7 @@ Have a conversation with an AI using your reference voice!
         outputs=[chatbot_interface, conversation_state],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
         outputs=[audio_output_chat],
     ).then(
         lambda: None,
@@ -646,7 +655,7 @@ Have a conversation with an AI using your reference voice!
         outputs=[chatbot_interface, conversation_state],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
         outputs=[audio_output_chat],
    ).then(
         lambda: None,
@@ -661,7 +670,7 @@ Have a conversation with an AI using your reference voice!
         outputs=[chatbot_interface, conversation_state],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
         outputs=[audio_output_chat],
     ).then(
         lambda: None,
@@ -700,6 +709,24 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
 **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
 """
     )
+
+    def switch_tts_model(new_choice):
+        global tts_model_choice
+        tts_model_choice = new_choice
+
+    if not USING_SPACES:
+        choose_tts_model = gr.Radio(
+            choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
+        )
+    else:
+        choose_tts_model = gr.Radio(
+            choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
+        )
+    choose_tts_model.change(
+        switch_tts_model,
+        inputs=choose_tts_model,
+    )
+
     gr.TabbedInterface(
         [app_tts, app_multistyle, app_chat, app_credits],
         ["TTS", "Multi-Speech", "Voice-Chat", "Credits"],
```
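The app.py half of this commit drops the per-tab `model_choice` radios and instead keeps one module-level `tts_model_choice`, updated by a single `gr.Radio` through a `.change` callback, with each tab's handler (such as the new `basic_tts`) reading the global when it runs. Below is a minimal, self-contained sketch of that pattern; the `synthesize` handler and the demo layout are hypothetical stand-ins for the app's `infer`/`basic_tts` wiring, not the Space's actual code.

```python
# Minimal sketch of the single global model choice, assuming a hypothetical
# synthesize() handler in place of the app's infer()/basic_tts wiring.
import gradio as gr

DEFAULT_TTS_MODEL = "F5-TTS"
tts_model_choice = DEFAULT_TTS_MODEL  # module-level state read by every tab


def switch_tts_model(new_choice):
    # Radio .change callback: mutate the global; no outputs are needed.
    global tts_model_choice
    tts_model_choice = new_choice


def synthesize(text):
    # Handlers read the global at call time instead of taking a per-tab radio input.
    return f"[{tts_model_choice}] {text}"


with gr.Blocks() as demo:
    choose_tts_model = gr.Radio(
        choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
    )
    choose_tts_model.change(switch_tts_model, inputs=choose_tts_model)

    text_in = gr.Textbox(label="Text to Generate")
    text_out = gr.Textbox(label="Result")
    gr.Button("Synthesize").click(synthesize, inputs=text_in, outputs=text_out)

if __name__ == "__main__":
    demo.launch()
```

One consequence of holding the choice in a module-level global rather than in `gr.State` is that every session served by the same process shares the selection; that is the trade-off for replacing a radio per tab with one switch.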
src/f5_tts/infer/utils_infer.py CHANGED

```diff
@@ -282,13 +282,13 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
         audio_data = audio_file.read()
         audio_hash = hashlib.md5(audio_data).hexdigest()
 
-
-
-
-
-
-
-
+    if not ref_text.strip():
+        global _ref_audio_cache
+        if audio_hash in _ref_audio_cache:
+            # Use cached asr transcription
+            show_info("Using cached reference text...")
+            ref_text = _ref_audio_cache[audio_hash]
+        else:
             global asr_pipe
             if asr_pipe is None:
                 initialize_asr_pipeline(device=device)
@@ -300,11 +300,10 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
                 generate_kwargs={"task": "transcribe"},
                 return_timestamps=False,
             )["text"].strip()
-
-
-
-
-    _ref_audio_cache[audio_hash] = ref_text
+            # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
+            _ref_audio_cache[audio_hash] = ref_text
+        else:
+            show_info("Using custom reference text...")
 
     # Ensure ref_text ends with a proper sentence-ending punctuation
     if not ref_text.endswith(". ") and not ref_text.endswith("。"):
@@ -313,6 +312,8 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
     else:
         ref_text += ". "
 
+    print("ref_text ", ref_text)
+
     return ref_audio, ref_text
 
 
```
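The utils_infer.py half keys an in-memory transcription cache on the MD5 hash of the raw reference-audio bytes, consults it only when no reference text was supplied, and stores only ASR output, so text a user has edited by hand is never replaced by a stale cached transcript. A minimal sketch of that caching logic follows; `transcribe()` and `get_ref_text()` are hypothetical simplifications of the real `asr_pipe` call and `preprocess_ref_audio_text`, with the audio clipping and pipeline setup left out.

```python
# Sketch of the hash-keyed transcription cache, assuming a hypothetical
# transcribe(path) in place of the real Whisper asr_pipe.
import hashlib

_ref_audio_cache = {}  # md5(audio bytes) -> ASR transcription


def transcribe(audio_path):
    # Placeholder for the ASR pipeline; pretend it returns the clip's transcript.
    return "placeholder transcript"


def get_ref_text(audio_path, ref_text=""):
    # Hash the raw bytes so the same clip hits the same cache entry
    # regardless of its filename or temporary upload path.
    with open(audio_path, "rb") as audio_file:
        audio_hash = hashlib.md5(audio_file.read()).hexdigest()

    if not ref_text.strip():
        if audio_hash in _ref_audio_cache:
            # Reuse the earlier transcription of these exact audio bytes.
            ref_text = _ref_audio_cache[audio_hash]
        else:
            ref_text = transcribe(audio_path)
            # Cache only ASR output; user-supplied text stays out of the cache
            # so manual tweaks between runs are never overwritten.
            _ref_audio_cache[audio_hash] = ref_text

    # Same punctuation fix-up that preprocess_ref_audio_text applies.
    if not ref_text.endswith(". ") and not ref_text.endswith("。"):
        if ref_text.endswith("."):
            ref_text += " "
        else:
            ref_text += ". "

    return ref_text
```

Hashing the bytes rather than the path means re-uploading the same clip under a different temporary filename still reuses the cached transcript.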