OpenSound committed on
Commit
951dfe7
·
verified ·
1 Parent(s): c6e03e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -113
app.py CHANGED
@@ -249,7 +249,7 @@ def replace_numbers_with_words(sentence):
249
 
250
  @spaces.GPU
251
  def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
252
- audio_path, original_transcript, transcript):
253
 
254
  codec_audio_sr = 16000
255
  codec_sr = 50
@@ -268,7 +268,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
268
 
269
  # text normalization
270
  target_transcript = replace_numbers_with_words(transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
271
- orig_transcript = replace_numbers_with_words(original_transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
272
 
273
  [orig_transcript, segments, _, _] = transcribe_en(audio_path)
274
  orig_transcript = orig_transcript.lower()
@@ -344,7 +344,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
344
 
345
  @spaces.GPU
346
  def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
347
- audio_path, original_transcript, transcript):
348
 
349
  codec_audio_sr = 16000
350
  codec_sr = 50
@@ -363,7 +363,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
363
 
364
  # text normalization
365
  target_transcript = replace_numbers_with_words(transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
366
- orig_transcript = replace_numbers_with_words(original_transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
367
 
368
  [orig_transcript, segments, _, _] = transcribe_en(audio_path)
369
  orig_transcript = orig_transcript.lower()
@@ -444,7 +444,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
444
 
445
  @spaces.GPU
446
  def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
447
- audio_path, original_transcript, transcript):
448
 
449
  codec_audio_sr = 16000
450
  codec_sr = 50
@@ -464,7 +464,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
464
 
465
  # text normalization
466
  target_transcript = transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
467
- orig_transcript = original_transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
468
 
469
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
470
 
@@ -542,7 +542,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
542
 
543
  @spaces.GPU
544
  def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
545
- audio_path, original_transcript, transcript):
546
 
547
  codec_audio_sr = 16000
548
  codec_sr = 50
@@ -562,7 +562,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
562
 
563
  # text normalization
564
  target_transcript = transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
565
- orig_transcript = original_transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
566
 
567
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
568
 
@@ -692,233 +692,233 @@ if __name__ == "__main__":
692
 
693
  with gr.Row():
694
  with gr.Column(scale=2):
695
- input_audio = gr.Audio(
696
  value=f"{DEMO_PATH}/84_121550_000074_000000.wav",
697
  label="Input Audio",
698
  type="filepath",
699
  interactive=True
700
  )
701
  with gr.Group():
702
- original_transcript = gr.Textbox(
703
  label="Original transcript",
704
  lines=5,
705
  value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
706
  info="Use whisperx model to get the transcript."
707
  )
708
- transcribe_btn = gr.Button(value="Transcribe")
709
 
710
  with gr.Column(scale=3):
711
  with gr.Group():
712
- transcript = gr.Textbox(
713
  label="Text",
714
  lines=7,
715
  value="but when I saw the mirage of the lake in the distance, which the sense deceives, lost not by distance any of its marks.",
716
  interactive=True
717
  )
718
- run_btn = gr.Button(value="Run")
719
 
720
  with gr.Column(scale=2):
721
- output_audio = gr.Audio(label="Output Audio")
722
 
723
  with gr.Row():
724
  with gr.Accordion("Advanced Settings", open=False):
725
- seed = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
726
- aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
727
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
728
- cfg_coef = gr.Number(label="cfg_coef", value=1.5,
729
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
730
- cfg_stride = gr.Number(label="cfg_stride", value=5,
731
  info="cfg stride, 5 is a good value for English, change if you don't like the results")
732
- prompt_length = gr.Number(label="prompt_length", value=3,
733
  info="used for tts prompt, will automatically cut the prompt audio to this length")
734
- sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
735
 
736
- success_output = gr.HTML()
737
 
738
- transcribe_btn.click(
739
  fn=transcribe_en,
740
- inputs=[input_audio],
741
- outputs=[original_transcript, gr.State(), gr.State(), success_output]
742
  )
743
 
744
- run_btn.click(fn=run_edit_en,
745
  inputs=[
746
- seed, sub_amount,
747
- aug_text, cfg_coef, cfg_stride, prompt_length,
748
- input_audio, original_transcript, transcript,
749
  ],
750
- outputs=[output_audio, success_output])
751
 
752
- transcript.submit(fn=run_edit_en,
753
  inputs=[
754
- seed, sub_amount,
755
- aug_text, cfg_coef, cfg_stride, prompt_length,
756
- input_audio, original_transcript, transcript,
757
  ],
758
- outputs=[output_audio, success_output]
759
  )
760
 
761
  with gr.Tab("English TTS"):
762
 
763
  with gr.Row():
764
  with gr.Column(scale=2):
765
- input_audio = gr.Audio(value=f"{DEMO_PATH}/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
766
  with gr.Group():
767
- original_transcript = gr.Textbox(label="Original transcript", lines=5, value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
768
  info="Use whisperx model to get the transcript.")
769
- transcribe_btn = gr.Button(value="Transcribe")
770
 
771
  with gr.Column(scale=3):
772
  with gr.Group():
773
- transcript = gr.Textbox(label="Text", lines=7, value="I cannot believe that the same model can also do text to speech synthesis too!", interactive=True)
774
- run_btn = gr.Button(value="Run")
775
 
776
  with gr.Column(scale=2):
777
- output_audio = gr.Audio(label="Output Audio")
778
 
779
  with gr.Row():
780
  with gr.Accordion("Advanced Settings", open=False):
781
- seed = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
782
- aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
783
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
784
- cfg_coef = gr.Number(label="cfg_coef", value=1.5,
785
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
786
- cfg_stride = gr.Number(label="cfg_stride", value=5,
787
  info="cfg stride, 5 is a good value for English, change if you don't like the results")
788
- prompt_length = gr.Number(label="prompt_length", value=3,
789
  info="used for tts prompt, will automatically cut the prompt audio to this length")
790
- sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
791
 
792
- success_output = gr.HTML()
793
 
794
- transcribe_btn.click(fn=transcribe_en,
795
- inputs=[input_audio],
796
- outputs=[original_transcript, gr.State(), gr.State(), success_output])
797
 
798
- run_btn.click(fn=run_tts_en,
799
  inputs=[
800
- seed, sub_amount,
801
- aug_text, cfg_coef, cfg_stride, prompt_length,
802
- input_audio, original_transcript, transcript,
803
  ],
804
- outputs=[output_audio, success_output])
805
 
806
- transcript.submit(fn=run_tts_en,
807
  inputs=[
808
- seed, sub_amount,
809
- aug_text, cfg_coef, cfg_stride, prompt_length,
810
- input_audio, original_transcript, transcript,
811
  ],
812
- outputs=[output_audio, success_output]
813
  )
814
 
815
  with gr.Tab("Mandarin Speech Editing"):
816
 
817
  with gr.Row():
818
  with gr.Column(scale=2):
819
- input_audio = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
820
  with gr.Group():
821
- original_transcript = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
822
  info="Use whisperx model to get the transcript.")
823
- transcribe_btn = gr.Button(value="Transcribe")
824
 
825
  with gr.Column(scale=3):
826
  with gr.Group():
827
- transcript = gr.Textbox(label="Text", lines=7, value="价格已基本都在一万到两万之间", interactive=True)
828
- run_btn = gr.Button(value="Run")
829
 
830
  with gr.Column(scale=2):
831
- output_audio = gr.Audio(label="Output Audio")
832
 
833
  with gr.Row():
834
  with gr.Accordion("Advanced Settings", open=False):
835
- seed = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
836
- aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
837
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
838
- cfg_coef = gr.Number(label="cfg_coef", value=1.5,
839
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
840
- cfg_stride = gr.Number(label="cfg_stride", value=1,
841
  info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
842
- prompt_length = gr.Number(label="prompt_length", value=3,
843
  info="used for tts prompt, will automatically cut the prompt audio to this length")
844
- sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
845
 
846
- success_output = gr.HTML()
847
 
848
- transcribe_btn.click(fn=transcribe_zh,
849
- inputs=[input_audio],
850
- outputs=[original_transcript, gr.State(), gr.State(), success_output])
851
 
852
- run_btn.click(fn=run_edit_zh,
853
  inputs=[
854
- seed, sub_amount,
855
- aug_text, cfg_coef, cfg_stride, prompt_length,
856
- input_audio, original_transcript, transcript,
857
  ],
858
- outputs=[output_audio, success_output])
859
 
860
- transcript.submit(fn=run_edit_zh,
861
  inputs=[
862
- seed, sub_amount,
863
- aug_text, cfg_coef, cfg_stride, prompt_length,
864
- input_audio, original_transcript, transcript,
865
  ],
866
- outputs=[output_audio, success_output]
867
  )
868
 
869
  with gr.Tab("Mandarin TTS"):
870
 
871
  with gr.Row():
872
  with gr.Column(scale=2):
873
- input_audio = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
874
  with gr.Group():
875
- original_transcript = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
876
  info="Use whisperx model to get the transcript.")
877
- transcribe_btn = gr.Button(value="Transcribe")
878
 
879
  with gr.Column(scale=3):
880
  with gr.Group():
881
- transcript = gr.Textbox(label="Text", lines=7, value="我简直不敢相信同一个模型也可以进行文本到语音的生成", interactive=True)
882
- run_btn = gr.Button(value="Run")
883
 
884
  with gr.Column(scale=2):
885
- output_audio = gr.Audio(label="Output Audio")
886
 
887
  with gr.Row():
888
  with gr.Accordion("Advanced Settings", open=False):
889
- seed = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
890
- aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
891
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
892
- cfg_coef = gr.Number(label="cfg_coef", value=1.5,
893
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
894
- cfg_stride = gr.Number(label="cfg_stride", value=1,
895
  info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
896
- prompt_length = gr.Number(label="prompt_length", value=3,
897
  info="used for tts prompt, will automatically cut the prompt audio to this length")
898
- sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
899
 
900
- success_output = gr.HTML()
901
 
902
- transcribe_btn.click(fn=transcribe_zh,
903
- inputs=[input_audio],
904
- outputs=[original_transcript, gr.State(), gr.State(), success_output])
905
 
906
- run_btn.click(fn=run_tts_zh,
907
  inputs=[
908
- seed, sub_amount,
909
- aug_text, cfg_coef, cfg_stride, prompt_length,
910
- input_audio, original_transcript, transcript,
911
  ],
912
- outputs=[output_audio, success_output])
913
 
914
- transcript.submit(fn=run_tts_zh,
915
  inputs=[
916
- seed, sub_amount,
917
- aug_text, cfg_coef, cfg_stride, prompt_length,
918
- input_audio, original_transcript, transcript,
919
  ],
920
- outputs=[output_audio, success_output]
921
  )
922
 
923
  # Launch the Gradio demo
924
- demo.launch()
 
249
 
250
  @spaces.GPU
251
  def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
252
+ audio_path, transcript):
253
 
254
  codec_audio_sr = 16000
255
  codec_sr = 50
 
268
 
269
  # text normalization
270
  target_transcript = replace_numbers_with_words(transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
271
+ # orig_transcript = replace_numbers_with_words(original_transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
272
 
273
  [orig_transcript, segments, _, _] = transcribe_en(audio_path)
274
  orig_transcript = orig_transcript.lower()
 
344
 
345
  @spaces.GPU
346
  def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
347
+ audio_path, transcript):
348
 
349
  codec_audio_sr = 16000
350
  codec_sr = 50
 
363
 
364
  # text normalization
365
  target_transcript = replace_numbers_with_words(transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
366
+ # orig_transcript = replace_numbers_with_words(original_transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
367
 
368
  [orig_transcript, segments, _, _] = transcribe_en(audio_path)
369
  orig_transcript = orig_transcript.lower()
 
444
 
445
  @spaces.GPU
446
  def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
447
+ audio_path, transcript):
448
 
449
  codec_audio_sr = 16000
450
  codec_sr = 50
 
464
 
465
  # text normalization
466
  target_transcript = transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
467
+ # orig_transcript = original_transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
468
 
469
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
470
 
 
542
 
543
  @spaces.GPU
544
  def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
545
+ audio_path, transcript):
546
 
547
  codec_audio_sr = 16000
548
  codec_sr = 50
 
562
 
563
  # text normalization
564
  target_transcript = transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
565
+ # orig_transcript = original_transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
566
 
567
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
568
 
 
692
 
693
  with gr.Row():
694
  with gr.Column(scale=2):
695
+ input_audio1 = gr.Audio(
696
  value=f"{DEMO_PATH}/84_121550_000074_000000.wav",
697
  label="Input Audio",
698
  type="filepath",
699
  interactive=True
700
  )
701
  with gr.Group():
702
+ original_transcript1 = gr.Textbox(
703
  label="Original transcript",
704
  lines=5,
705
  value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
706
  info="Use whisperx model to get the transcript."
707
  )
708
+ transcribe_btn1 = gr.Button(value="Transcribe")
709
 
710
  with gr.Column(scale=3):
711
  with gr.Group():
712
+ transcript1 = gr.Textbox(
713
  label="Text",
714
  lines=7,
715
  value="but when I saw the mirage of the lake in the distance, which the sense deceives, lost not by distance any of its marks.",
716
  interactive=True
717
  )
718
+ run_btn1 = gr.Button(value="Run")
719
 
720
  with gr.Column(scale=2):
721
+ output_audio1 = gr.Audio(label="Output Audio")
722
 
723
  with gr.Row():
724
  with gr.Accordion("Advanced Settings", open=False):
725
+ seed1 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
726
+ aug_text1 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
727
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
728
+ cfg_coef1 = gr.Number(label="cfg_coef", value=1.5,
729
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
730
+ cfg_stride1 = gr.Number(label="cfg_stride", value=5,
731
  info="cfg stride, 5 is a good value for English, change if you don't like the results")
732
+ prompt_length1 = gr.Number(label="prompt_length", value=3,
733
  info="used for tts prompt, will automatically cut the prompt audio to this length")
734
+ sub_amount1 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
735
 
736
+ success_output1 = gr.HTML()
737
 
738
+ transcribe_btn1.click(
739
  fn=transcribe_en,
740
+ inputs=[input_audio1],
741
+ outputs=[original_transcript1, gr.State(), gr.State(), success_output1]
742
  )
743
 
744
+ run_btn1.click(fn=run_edit_en,
745
  inputs=[
746
+ seed1, sub_amount1,
747
+ aug_text1, cfg_coef1, cfg_stride1, prompt_length1,
748
+ input_audio1, transcript1,
749
  ],
750
+ outputs=[output_audio1, success_output1])
751
 
752
+ transcript1.submit(fn=run_edit_en,
753
  inputs=[
754
+ seed1, sub_amount1,
755
+ aug_text1, cfg_coef1, cfg_stride1, prompt_length1,
756
+ input_audio1, transcript1,
757
  ],
758
+ outputs=[output_audio1, success_output1]
759
  )
760
 
761
  with gr.Tab("English TTS"):
762
 
763
  with gr.Row():
764
  with gr.Column(scale=2):
765
+ input_audio2 = gr.Audio(value=f"{DEMO_PATH}/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
766
  with gr.Group():
767
+ original_transcript2 = gr.Textbox(label="Original transcript", lines=5, value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
768
  info="Use whisperx model to get the transcript.")
769
+ transcribe_btn2 = gr.Button(value="Transcribe")
770
 
771
  with gr.Column(scale=3):
772
  with gr.Group():
773
+ transcript2 = gr.Textbox(label="Text", lines=7, value="I cannot believe that the same model can also do text to speech synthesis too!", interactive=True)
774
+ run_btn2 = gr.Button(value="Run")
775
 
776
  with gr.Column(scale=2):
777
+ output_audio2 = gr.Audio(label="Output Audio")
778
 
779
  with gr.Row():
780
  with gr.Accordion("Advanced Settings", open=False):
781
+ seed2 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
782
+ aug_text2 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
783
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
784
+ cfg_coef2 = gr.Number(label="cfg_coef", value=1.5,
785
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
786
+ cfg_stride2 = gr.Number(label="cfg_stride", value=5,
787
  info="cfg stride, 5 is a good value for English, change if you don't like the results")
788
+ prompt_length2 = gr.Number(label="prompt_length", value=3,
789
  info="used for tts prompt, will automatically cut the prompt audio to this length")
790
+ sub_amount2 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
791
 
792
+ success_output2 = gr.HTML()
793
 
794
+ transcribe_btn2.click(fn=transcribe_en,
795
+ inputs=[input_audio2],
796
+ outputs=[original_transcript2, gr.State(), gr.State(), success_output2])
797
 
798
+ run_btn2.click(fn=run_tts_en,
799
  inputs=[
800
+ seed2, sub_amount2,
801
+ aug_text2, cfg_coef2, cfg_stride2, prompt_length2,
802
+ input_audio2, transcript2,
803
  ],
804
+ outputs=[output_audio2, success_output2])
805
 
806
+ transcript2.submit(fn=run_tts_en,
807
  inputs=[
808
+ seed2, sub_amount2,
809
+ aug_text2, cfg_coef2, cfg_stride2, prompt_length2,
810
+ input_audio2, transcript2,
811
  ],
812
+ outputs=[output_audio2, success_output2]
813
  )
814
 
815
  with gr.Tab("Mandarin Speech Editing"):
816
 
817
  with gr.Row():
818
  with gr.Column(scale=2):
819
+ input_audio3 = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
820
  with gr.Group():
821
+ original_transcript3 = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
822
  info="Use whisperx model to get the transcript.")
823
+ transcribe_btn3 = gr.Button(value="Transcribe")
824
 
825
  with gr.Column(scale=3):
826
  with gr.Group():
827
+ transcript3 = gr.Textbox(label="Text", lines=7, value="价格已基本都在一万到两万之间", interactive=True)
828
+ run_btn3 = gr.Button(value="Run")
829
 
830
  with gr.Column(scale=2):
831
+ output_audio3 = gr.Audio(label="Output Audio")
832
 
833
  with gr.Row():
834
  with gr.Accordion("Advanced Settings", open=False):
835
+ seed3 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
836
+ aug_text3 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
837
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
838
+ cfg_coef3 = gr.Number(label="cfg_coef", value=1.5,
839
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
840
+ cfg_stride3 = gr.Number(label="cfg_stride", value=1,
841
  info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
842
+ prompt_length3 = gr.Number(label="prompt_length", value=3,
843
  info="used for tts prompt, will automatically cut the prompt audio to this length")
844
+ sub_amount3 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
845
 
846
+ success_output3 = gr.HTML()
847
 
848
+ transcribe_btn3.click(fn=transcribe_zh,
849
+ inputs=[input_audio3],
850
+ outputs=[original_transcript3, gr.State(), gr.State(), success_output3])
851
 
852
+ run_btn3.click(fn=run_edit_zh,
853
  inputs=[
854
+ seed3, sub_amount3,
855
+ aug_text3, cfg_coef3, cfg_stride3, prompt_length3,
856
+ input_audio3, transcript3,
857
  ],
858
+ outputs=[output_audio3, success_output3])
859
 
860
+ transcript3.submit(fn=run_edit_zh,
861
  inputs=[
862
+ seed3, sub_amount3,
863
+ aug_text3, cfg_coef3, cfg_stride3, prompt_length3,
864
+ input_audio3, transcript3,
865
  ],
866
+ outputs=[output_audio3, success_output3]
867
  )
868
 
869
  with gr.Tab("Mandarin TTS"):
870
 
871
  with gr.Row():
872
  with gr.Column(scale=2):
873
+ input_audio4 = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
874
  with gr.Group():
875
+ original_transcript4 = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
876
  info="Use whisperx model to get the transcript.")
877
+ transcribe_btn4 = gr.Button(value="Transcribe")
878
 
879
  with gr.Column(scale=3):
880
  with gr.Group():
881
+ transcript4 = gr.Textbox(label="Text", lines=7, value="我简直不敢相信同一个模型也可以进行文本到语音的生成", interactive=True)
882
+ run_btn4 = gr.Button(value="Run")
883
 
884
  with gr.Column(scale=2):
885
+ output_audio4 = gr.Audio(label="Output Audio")
886
 
887
  with gr.Row():
888
  with gr.Accordion("Advanced Settings", open=False):
889
+ seed4 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
890
+ aug_text4 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
891
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
892
+ cfg_coef4 = gr.Number(label="cfg_coef", value=1.5,
893
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
894
+ cfg_stride4 = gr.Number(label="cfg_stride", value=1,
895
  info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
896
+ prompt_length4 = gr.Number(label="prompt_length", value=3,
897
  info="used for tts prompt, will automatically cut the prompt audio to this length")
898
+ sub_amount4 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
899
 
900
+ success_output4 = gr.HTML()
901
 
902
+ transcribe_btn4.click(fn=transcribe_zh,
903
+ inputs=[input_audio4],
904
+ outputs=[original_transcript4, gr.State(), gr.State(), success_output4])
905
 
906
+ run_btn4.click(fn=run_tts_zh,
907
  inputs=[
908
+ seed4, sub_amount4,
909
+ aug_text4, cfg_coef4, cfg_stride4, prompt_length4,
910
+ input_audio4, transcript4,
911
  ],
912
+ outputs=[output_audio4, success_output4])
913
 
914
+ transcript4.submit(fn=run_tts_zh,
915
  inputs=[
916
+ seed4, sub_amount4,
917
+ aug_text4, cfg_coef4, cfg_stride4, prompt_length4,
918
+ input_audio4, transcript4,
919
  ],
920
+ outputs=[output_audio4, success_output4]
921
  )
922
 
923
  # Launch the Gradio demo
924
+ demo.launch()