Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -249,7 +249,7 @@ def replace_numbers_with_words(sentence):
|
|
249 |
|
250 |
@spaces.GPU
|
251 |
def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
252 |
-
audio_path,
|
253 |
|
254 |
codec_audio_sr = 16000
|
255 |
codec_sr = 50
|
@@ -268,7 +268,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
|
268 |
|
269 |
# text normalization
|
270 |
target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
271 |
-
orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
272 |
|
273 |
[orig_transcript, segments, _, _] = transcribe_en(audio_path)
|
274 |
orig_transcript = orig_transcript.lower()
|
@@ -344,7 +344,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
|
344 |
|
345 |
@spaces.GPU
|
346 |
def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
347 |
-
audio_path,
|
348 |
|
349 |
codec_audio_sr = 16000
|
350 |
codec_sr = 50
|
@@ -363,7 +363,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
|
363 |
|
364 |
# text normalization
|
365 |
target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
366 |
-
orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
367 |
|
368 |
[orig_transcript, segments, _, _] = transcribe_en(audio_path)
|
369 |
orig_transcript = orig_transcript.lower()
|
@@ -444,7 +444,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
|
444 |
|
445 |
@spaces.GPU
|
446 |
def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
447 |
-
audio_path,
|
448 |
|
449 |
codec_audio_sr = 16000
|
450 |
codec_sr = 50
|
@@ -464,7 +464,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
|
464 |
|
465 |
# text normalization
|
466 |
target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
467 |
-
orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
468 |
|
469 |
[orig_transcript, segments, _, _] = transcribe_zh(audio_path)
|
470 |
|
@@ -542,7 +542,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
|
542 |
|
543 |
@spaces.GPU
|
544 |
def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
545 |
-
audio_path,
|
546 |
|
547 |
codec_audio_sr = 16000
|
548 |
codec_sr = 50
|
@@ -562,7 +562,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
|
562 |
|
563 |
# text normalization
|
564 |
target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
565 |
-
orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
566 |
|
567 |
[orig_transcript, segments, _, _] = transcribe_zh(audio_path)
|
568 |
|
@@ -692,233 +692,233 @@ if __name__ == "__main__":
|
|
692 |
|
693 |
with gr.Row():
|
694 |
with gr.Column(scale=2):
|
695 |
-
|
696 |
value=f"{DEMO_PATH}/84_121550_000074_000000.wav",
|
697 |
label="Input Audio",
|
698 |
type="filepath",
|
699 |
interactive=True
|
700 |
)
|
701 |
with gr.Group():
|
702 |
-
|
703 |
label="Original transcript",
|
704 |
lines=5,
|
705 |
value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
|
706 |
info="Use whisperx model to get the transcript."
|
707 |
)
|
708 |
-
|
709 |
|
710 |
with gr.Column(scale=3):
|
711 |
with gr.Group():
|
712 |
-
|
713 |
label="Text",
|
714 |
lines=7,
|
715 |
value="but when I saw the mirage of the lake in the distance, which the sense deceives, lost not by distance any of its marks.",
|
716 |
interactive=True
|
717 |
)
|
718 |
-
|
719 |
|
720 |
with gr.Column(scale=2):
|
721 |
-
|
722 |
|
723 |
with gr.Row():
|
724 |
with gr.Accordion("Advanced Settings", open=False):
|
725 |
-
|
726 |
-
|
727 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
728 |
-
|
729 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
730 |
-
|
731 |
info="cfg stride, 5 is a good value for English, change if you don't like the results")
|
732 |
-
|
733 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
734 |
-
|
735 |
|
736 |
-
|
737 |
|
738 |
-
|
739 |
fn=transcribe_en,
|
740 |
-
inputs=[
|
741 |
-
outputs=[
|
742 |
)
|
743 |
|
744 |
-
|
745 |
inputs=[
|
746 |
-
|
747 |
-
|
748 |
-
|
749 |
],
|
750 |
-
outputs=[
|
751 |
|
752 |
-
|
753 |
inputs=[
|
754 |
-
|
755 |
-
|
756 |
-
|
757 |
],
|
758 |
-
outputs=[
|
759 |
)
|
760 |
|
761 |
with gr.Tab("English TTS"):
|
762 |
|
763 |
with gr.Row():
|
764 |
with gr.Column(scale=2):
|
765 |
-
|
766 |
with gr.Group():
|
767 |
-
|
768 |
info="Use whisperx model to get the transcript.")
|
769 |
-
|
770 |
|
771 |
with gr.Column(scale=3):
|
772 |
with gr.Group():
|
773 |
-
|
774 |
-
|
775 |
|
776 |
with gr.Column(scale=2):
|
777 |
-
|
778 |
|
779 |
with gr.Row():
|
780 |
with gr.Accordion("Advanced Settings", open=False):
|
781 |
-
|
782 |
-
|
783 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
784 |
-
|
785 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
786 |
-
|
787 |
info="cfg stride, 5 is a good value for English, change if you don't like the results")
|
788 |
-
|
789 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
790 |
-
|
791 |
|
792 |
-
|
793 |
|
794 |
-
|
795 |
-
inputs=[
|
796 |
-
outputs=[
|
797 |
|
798 |
-
|
799 |
inputs=[
|
800 |
-
|
801 |
-
|
802 |
-
|
803 |
],
|
804 |
-
outputs=[
|
805 |
|
806 |
-
|
807 |
inputs=[
|
808 |
-
|
809 |
-
|
810 |
-
|
811 |
],
|
812 |
-
outputs=[
|
813 |
)
|
814 |
|
815 |
with gr.Tab("Mandarin Speech Editing"):
|
816 |
|
817 |
with gr.Row():
|
818 |
with gr.Column(scale=2):
|
819 |
-
|
820 |
with gr.Group():
|
821 |
-
|
822 |
info="Use whisperx model to get the transcript.")
|
823 |
-
|
824 |
|
825 |
with gr.Column(scale=3):
|
826 |
with gr.Group():
|
827 |
-
|
828 |
-
|
829 |
|
830 |
with gr.Column(scale=2):
|
831 |
-
|
832 |
|
833 |
with gr.Row():
|
834 |
with gr.Accordion("Advanced Settings", open=False):
|
835 |
-
|
836 |
-
|
837 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
838 |
-
|
839 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
840 |
-
|
841 |
info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
|
842 |
-
|
843 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
844 |
-
|
845 |
|
846 |
-
|
847 |
|
848 |
-
|
849 |
-
inputs=[
|
850 |
-
outputs=[
|
851 |
|
852 |
-
|
853 |
inputs=[
|
854 |
-
|
855 |
-
|
856 |
-
|
857 |
],
|
858 |
-
outputs=[
|
859 |
|
860 |
-
|
861 |
inputs=[
|
862 |
-
|
863 |
-
|
864 |
-
|
865 |
],
|
866 |
-
outputs=[
|
867 |
)
|
868 |
|
869 |
with gr.Tab("Mandarin TTS"):
|
870 |
|
871 |
with gr.Row():
|
872 |
with gr.Column(scale=2):
|
873 |
-
|
874 |
with gr.Group():
|
875 |
-
|
876 |
info="Use whisperx model to get the transcript.")
|
877 |
-
|
878 |
|
879 |
with gr.Column(scale=3):
|
880 |
with gr.Group():
|
881 |
-
|
882 |
-
|
883 |
|
884 |
with gr.Column(scale=2):
|
885 |
-
|
886 |
|
887 |
with gr.Row():
|
888 |
with gr.Accordion("Advanced Settings", open=False):
|
889 |
-
|
890 |
-
|
891 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
892 |
-
|
893 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
894 |
-
|
895 |
info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
|
896 |
-
|
897 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
898 |
-
|
899 |
|
900 |
-
|
901 |
|
902 |
-
|
903 |
-
inputs=[
|
904 |
-
outputs=[
|
905 |
|
906 |
-
|
907 |
inputs=[
|
908 |
-
|
909 |
-
|
910 |
-
|
911 |
],
|
912 |
-
outputs=[
|
913 |
|
914 |
-
|
915 |
inputs=[
|
916 |
-
|
917 |
-
|
918 |
-
|
919 |
],
|
920 |
-
outputs=[
|
921 |
)
|
922 |
|
923 |
# Launch the Gradio demo
|
924 |
-
demo.launch()
|
|
|
249 |
|
250 |
@spaces.GPU
|
251 |
def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
252 |
+
audio_path, transcript):
|
253 |
|
254 |
codec_audio_sr = 16000
|
255 |
codec_sr = 50
|
|
|
268 |
|
269 |
# text normalization
|
270 |
target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
271 |
+
# orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
272 |
|
273 |
[orig_transcript, segments, _, _] = transcribe_en(audio_path)
|
274 |
orig_transcript = orig_transcript.lower()
|
|
|
344 |
|
345 |
@spaces.GPU
|
346 |
def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
347 |
+
audio_path, transcript):
|
348 |
|
349 |
codec_audio_sr = 16000
|
350 |
codec_sr = 50
|
|
|
363 |
|
364 |
# text normalization
|
365 |
target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
366 |
+
# orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
367 |
|
368 |
[orig_transcript, segments, _, _] = transcribe_en(audio_path)
|
369 |
orig_transcript = orig_transcript.lower()
|
|
|
444 |
|
445 |
@spaces.GPU
|
446 |
def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
447 |
+
audio_path, transcript):
|
448 |
|
449 |
codec_audio_sr = 16000
|
450 |
codec_sr = 50
|
|
|
464 |
|
465 |
# text normalization
|
466 |
target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
467 |
+
# orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
468 |
|
469 |
[orig_transcript, segments, _, _] = transcribe_zh(audio_path)
|
470 |
|
|
|
542 |
|
543 |
@spaces.GPU
|
544 |
def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
545 |
+
audio_path, transcript):
|
546 |
|
547 |
codec_audio_sr = 16000
|
548 |
codec_sr = 50
|
|
|
562 |
|
563 |
# text normalization
|
564 |
target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
565 |
+
# orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
|
566 |
|
567 |
[orig_transcript, segments, _, _] = transcribe_zh(audio_path)
|
568 |
|
|
|
692 |
|
693 |
with gr.Row():
|
694 |
with gr.Column(scale=2):
|
695 |
+
input_audio1 = gr.Audio(
|
696 |
value=f"{DEMO_PATH}/84_121550_000074_000000.wav",
|
697 |
label="Input Audio",
|
698 |
type="filepath",
|
699 |
interactive=True
|
700 |
)
|
701 |
with gr.Group():
|
702 |
+
original_transcript1 = gr.Textbox(
|
703 |
label="Original transcript",
|
704 |
lines=5,
|
705 |
value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
|
706 |
info="Use whisperx model to get the transcript."
|
707 |
)
|
708 |
+
transcribe_btn1 = gr.Button(value="Transcribe")
|
709 |
|
710 |
with gr.Column(scale=3):
|
711 |
with gr.Group():
|
712 |
+
transcript1 = gr.Textbox(
|
713 |
label="Text",
|
714 |
lines=7,
|
715 |
value="but when I saw the mirage of the lake in the distance, which the sense deceives, lost not by distance any of its marks.",
|
716 |
interactive=True
|
717 |
)
|
718 |
+
run_btn1 = gr.Button(value="Run")
|
719 |
|
720 |
with gr.Column(scale=2):
|
721 |
+
output_audio1 = gr.Audio(label="Output Audio")
|
722 |
|
723 |
with gr.Row():
|
724 |
with gr.Accordion("Advanced Settings", open=False):
|
725 |
+
seed1 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
|
726 |
+
aug_text1 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
|
727 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
728 |
+
cfg_coef1 = gr.Number(label="cfg_coef", value=1.5,
|
729 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
730 |
+
cfg_stride1 = gr.Number(label="cfg_stride", value=5,
|
731 |
info="cfg stride, 5 is a good value for English, change if you don't like the results")
|
732 |
+
prompt_length1 = gr.Number(label="prompt_length", value=3,
|
733 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
734 |
+
sub_amount1 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
735 |
|
736 |
+
success_output1 = gr.HTML()
|
737 |
|
738 |
+
transcribe_btn1.click(
|
739 |
fn=transcribe_en,
|
740 |
+
inputs=[input_audio1],
|
741 |
+
outputs=[original_transcript1, gr.State(), gr.State(), success_output1]
|
742 |
)
|
743 |
|
744 |
+
run_btn1.click(fn=run_edit_en,
|
745 |
inputs=[
|
746 |
+
seed1, sub_amount1,
|
747 |
+
aug_text1, cfg_coef1, cfg_stride1, prompt_length1,
|
748 |
+
input_audio1, transcript1,
|
749 |
],
|
750 |
+
outputs=[output_audio1, success_output1])
|
751 |
|
752 |
+
transcript1.submit(fn=run_edit_en,
|
753 |
inputs=[
|
754 |
+
seed1, sub_amount1,
|
755 |
+
aug_text1, cfg_coef1, cfg_stride1, prompt_length1,
|
756 |
+
input_audio1, transcript1,
|
757 |
],
|
758 |
+
outputs=[output_audio1, success_output1]
|
759 |
)
|
760 |
|
761 |
with gr.Tab("English TTS"):
|
762 |
|
763 |
with gr.Row():
|
764 |
with gr.Column(scale=2):
|
765 |
+
input_audio2 = gr.Audio(value=f"{DEMO_PATH}/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
|
766 |
with gr.Group():
|
767 |
+
original_transcript2 = gr.Textbox(label="Original transcript", lines=5, value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
|
768 |
info="Use whisperx model to get the transcript.")
|
769 |
+
transcribe_btn2 = gr.Button(value="Transcribe")
|
770 |
|
771 |
with gr.Column(scale=3):
|
772 |
with gr.Group():
|
773 |
+
transcript2 = gr.Textbox(label="Text", lines=7, value="I cannot believe that the same model can also do text to speech synthesis too!", interactive=True)
|
774 |
+
run_btn2 = gr.Button(value="Run")
|
775 |
|
776 |
with gr.Column(scale=2):
|
777 |
+
output_audio2 = gr.Audio(label="Output Audio")
|
778 |
|
779 |
with gr.Row():
|
780 |
with gr.Accordion("Advanced Settings", open=False):
|
781 |
+
seed2 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
|
782 |
+
aug_text2 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
|
783 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
784 |
+
cfg_coef2 = gr.Number(label="cfg_coef", value=1.5,
|
785 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
786 |
+
cfg_stride2 = gr.Number(label="cfg_stride", value=5,
|
787 |
info="cfg stride, 5 is a good value for English, change if you don't like the results")
|
788 |
+
prompt_length2 = gr.Number(label="prompt_length", value=3,
|
789 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
790 |
+
sub_amount2 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
791 |
|
792 |
+
success_output2 = gr.HTML()
|
793 |
|
794 |
+
transcribe_btn2.click(fn=transcribe_en,
|
795 |
+
inputs=[input_audio2],
|
796 |
+
outputs=[original_transcript2, gr.State(), gr.State(), success_output2])
|
797 |
|
798 |
+
run_btn2.click(fn=run_tts_en,
|
799 |
inputs=[
|
800 |
+
seed2, sub_amount2,
|
801 |
+
aug_text2, cfg_coef2, cfg_stride2, prompt_length2,
|
802 |
+
input_audio2, transcript2,
|
803 |
],
|
804 |
+
outputs=[output_audio2, success_output2])
|
805 |
|
806 |
+
transcript2.submit(fn=run_tts_en,
|
807 |
inputs=[
|
808 |
+
seed2, sub_amount2,
|
809 |
+
aug_text2, cfg_coef2, cfg_stride2, prompt_length2,
|
810 |
+
input_audio2, transcript2,
|
811 |
],
|
812 |
+
outputs=[output_audio2, success_output2]
|
813 |
)
|
814 |
|
815 |
with gr.Tab("Mandarin Speech Editing"):
|
816 |
|
817 |
with gr.Row():
|
818 |
with gr.Column(scale=2):
|
819 |
+
input_audio3 = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
|
820 |
with gr.Group():
|
821 |
+
original_transcript3 = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
|
822 |
info="Use whisperx model to get the transcript.")
|
823 |
+
transcribe_btn3 = gr.Button(value="Transcribe")
|
824 |
|
825 |
with gr.Column(scale=3):
|
826 |
with gr.Group():
|
827 |
+
transcript3 = gr.Textbox(label="Text", lines=7, value="价格已基本都在一万到两万之间", interactive=True)
|
828 |
+
run_btn3 = gr.Button(value="Run")
|
829 |
|
830 |
with gr.Column(scale=2):
|
831 |
+
output_audio3 = gr.Audio(label="Output Audio")
|
832 |
|
833 |
with gr.Row():
|
834 |
with gr.Accordion("Advanced Settings", open=False):
|
835 |
+
seed3 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
|
836 |
+
aug_text3 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
|
837 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
838 |
+
cfg_coef3 = gr.Number(label="cfg_coef", value=1.5,
|
839 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
840 |
+
cfg_stride3 = gr.Number(label="cfg_stride", value=1,
|
841 |
info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
|
842 |
+
prompt_length3 = gr.Number(label="prompt_length", value=3,
|
843 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
844 |
+
sub_amount3 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
845 |
|
846 |
+
success_output3 = gr.HTML()
|
847 |
|
848 |
+
transcribe_btn3.click(fn=transcribe_zh,
|
849 |
+
inputs=[input_audio3],
|
850 |
+
outputs=[original_transcript3, gr.State(), gr.State(), success_output3])
|
851 |
|
852 |
+
run_btn3.click(fn=run_edit_zh,
|
853 |
inputs=[
|
854 |
+
seed3, sub_amount3,
|
855 |
+
aug_text3, cfg_coef3, cfg_stride3, prompt_length3,
|
856 |
+
input_audio3, transcript3,
|
857 |
],
|
858 |
+
outputs=[output_audio3, success_output3])
|
859 |
|
860 |
+
transcript3.submit(fn=run_edit_zh,
|
861 |
inputs=[
|
862 |
+
seed3, sub_amount3,
|
863 |
+
aug_text3, cfg_coef3, cfg_stride3, prompt_length3,
|
864 |
+
input_audio3, transcript3,
|
865 |
],
|
866 |
+
outputs=[output_audio3, success_output3]
|
867 |
)
|
868 |
|
869 |
with gr.Tab("Mandarin TTS"):
|
870 |
|
871 |
with gr.Row():
|
872 |
with gr.Column(scale=2):
|
873 |
+
input_audio4 = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
|
874 |
with gr.Group():
|
875 |
+
original_transcript4 = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
|
876 |
info="Use whisperx model to get the transcript.")
|
877 |
+
transcribe_btn4 = gr.Button(value="Transcribe")
|
878 |
|
879 |
with gr.Column(scale=3):
|
880 |
with gr.Group():
|
881 |
+
transcript4 = gr.Textbox(label="Text", lines=7, value="我简直不敢相信同一个模型也可以进行文本到语音的生成", interactive=True)
|
882 |
+
run_btn4 = gr.Button(value="Run")
|
883 |
|
884 |
with gr.Column(scale=2):
|
885 |
+
output_audio4 = gr.Audio(label="Output Audio")
|
886 |
|
887 |
with gr.Row():
|
888 |
with gr.Accordion("Advanced Settings", open=False):
|
889 |
+
seed4 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
|
890 |
+
aug_text4 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
|
891 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
892 |
+
cfg_coef4 = gr.Number(label="cfg_coef", value=1.5,
|
893 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
894 |
+
cfg_stride4 = gr.Number(label="cfg_stride", value=1,
|
895 |
info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
|
896 |
+
prompt_length4 = gr.Number(label="prompt_length", value=3,
|
897 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
898 |
+
sub_amount4 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
899 |
|
900 |
+
success_output4 = gr.HTML()
|
901 |
|
902 |
+
transcribe_btn4.click(fn=transcribe_zh,
|
903 |
+
inputs=[input_audio4],
|
904 |
+
outputs=[original_transcript4, gr.State(), gr.State(), success_output4])
|
905 |
|
906 |
+
run_btn4.click(fn=run_tts_zh,
|
907 |
inputs=[
|
908 |
+
seed4, sub_amount4,
|
909 |
+
aug_text4, cfg_coef4, cfg_stride4, prompt_length4,
|
910 |
+
input_audio4, transcript4,
|
911 |
],
|
912 |
+
outputs=[output_audio4, success_output4])
|
913 |
|
914 |
+
transcript4.submit(fn=run_tts_zh,
|
915 |
inputs=[
|
916 |
+
seed4, sub_amount4,
|
917 |
+
aug_text4, cfg_coef4, cfg_stride4, prompt_length4,
|
918 |
+
input_audio4, transcript4,
|
919 |
],
|
920 |
+
outputs=[output_audio4, success_output4]
|
921 |
)
|
922 |
|
923 |
# Launch the Gradio demo
|
924 |
+
demo.launch()
|