Spaces:

OpenSound
/

SSR-Speech

Running on Zero

App Files Community

OpenSound committed on Jan 2

Commit

951dfe7

verified ·

1 Parent(s): c6e03e2

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -113

app.py CHANGED Viewed

@@ -249,7 +249,7 @@ def replace_numbers_with_words(sentence):
 @spaces.GPU
 def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
-        audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
     codec_sr = 50
@@ -268,7 +268,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     # text normalization
     target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
-    orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
     [orig_transcript, segments, _, _] = transcribe_en(audio_path)
     orig_transcript = orig_transcript.lower()
@@ -344,7 +344,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
 @spaces.GPU
 def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
-        audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
     codec_sr = 50
@@ -363,7 +363,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     # text normalization
     target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
-    orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
     [orig_transcript, segments, _, _] = transcribe_en(audio_path)
     orig_transcript = orig_transcript.lower()
@@ -444,7 +444,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
 @spaces.GPU
 def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
-        audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
     codec_sr = 50
@@ -464,7 +464,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     # text normalization
     target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
-    orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
     [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
@@ -542,7 +542,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
 @spaces.GPU
 def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
-        audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
     codec_sr = 50
@@ -562,7 +562,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     # text normalization
     target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
-    orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
     [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
@@ -692,233 +692,233 @@ if __name__ == "__main__":
                 with gr.Row():
                     with gr.Column(scale=2):
-                        input_audio = gr.Audio(
                             value=f"{DEMO_PATH}/84_121550_000074_000000.wav",
                             label="Input Audio",
                             type="filepath",
                             interactive=True
                         )
                         with gr.Group():
-                            original_transcript = gr.Textbox(
                                 label="Original transcript",
                                 lines=5,
                                 value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
                                 info="Use whisperx model to get the transcript."
                             )
-                            transcribe_btn = gr.Button(value="Transcribe")
                     with gr.Column(scale=3):
                         with gr.Group():
-                            transcript = gr.Textbox(
                                 label="Text",
                                 lines=7,
                                 value="but when I saw the mirage of the lake in the distance, which the sense deceives, lost not by distance any of its marks.",
                                 interactive=True
                             )
-                            run_btn = gr.Button(value="Run")
                     with gr.Column(scale=2):
-                        output_audio = gr.Audio(label="Output Audio")
                 with gr.Row():
                     with gr.Accordion("Advanced Settings", open=False):
-                        seed = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
-                        aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
-                        cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                        cfg_stride = gr.Number(label="cfg_stride", value=5,
                                             info="cfg stride, 5 is a good value for English, change if you don't like the results")
-                        prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
-                        sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
-                success_output = gr.HTML()
-                transcribe_btn.click(
                     fn=transcribe_en,
-                    inputs=[input_audio],
-                    outputs=[original_transcript, gr.State(), gr.State(), success_output]
                 )
-                run_btn.click(fn=run_edit_en,
                             inputs=[
-                                seed, sub_amount,
-                                aug_text, cfg_coef, cfg_stride, prompt_length,
-                                input_audio, original_transcript, transcript,
                             ],
-                            outputs=[output_audio, success_output])
-                transcript.submit(fn=run_edit_en,
                         inputs=[
-                                seed, sub_amount,
-                                aug_text, cfg_coef, cfg_stride, prompt_length,
-                                input_audio, original_transcript, transcript,
                         ],
-                    outputs=[output_audio, success_output]
                 )
             with gr.Tab("English TTS"):
                 with gr.Row():
                     with gr.Column(scale=2):
-                        input_audio = gr.Audio(value=f"{DEMO_PATH}/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
                         with gr.Group():
-                            original_transcript = gr.Textbox(label="Original transcript", lines=5, value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
                                                             info="Use whisperx model to get the transcript.")
-                            transcribe_btn = gr.Button(value="Transcribe")
                     with gr.Column(scale=3):
                         with gr.Group():
-                            transcript = gr.Textbox(label="Text", lines=7, value="I cannot believe that the same model can also do text to speech synthesis too!", interactive=True)
-                            run_btn = gr.Button(value="Run")
                     with gr.Column(scale=2):
-                        output_audio = gr.Audio(label="Output Audio")
                 with gr.Row():
                     with gr.Accordion("Advanced Settings", open=False):
-                        seed = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
-                        aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
-                        cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                        cfg_stride = gr.Number(label="cfg_stride", value=5,
                                             info="cfg stride, 5 is a good value for English, change if you don't like the results")
-                        prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
-                        sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
-                success_output = gr.HTML()
-                transcribe_btn.click(fn=transcribe_en,
-                                    inputs=[input_audio],
-                                    outputs=[original_transcript, gr.State(), gr.State(), success_output])
-                run_btn.click(fn=run_tts_en,
                             inputs=[
-                                seed, sub_amount,
-                                aug_text, cfg_coef, cfg_stride, prompt_length,
-                                input_audio, original_transcript, transcript,
                             ],
-                            outputs=[output_audio, success_output])
-                transcript.submit(fn=run_tts_en,
                         inputs=[
-                                seed, sub_amount,
-                                aug_text, cfg_coef, cfg_stride, prompt_length,
-                                input_audio, original_transcript, transcript,
                         ],
-                    outputs=[output_audio, success_output]
                 )
             with gr.Tab("Mandarin Speech Editing"):
                 with gr.Row():
                     with gr.Column(scale=2):
-                        input_audio = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
                         with gr.Group():
-                            original_transcript = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
                                                             info="Use whisperx model to get the transcript.")
-                            transcribe_btn = gr.Button(value="Transcribe")
                     with gr.Column(scale=3):
                         with gr.Group():
-                            transcript = gr.Textbox(label="Text", lines=7, value="价格已基本都在一万到两万之间", interactive=True)
-                            run_btn = gr.Button(value="Run")
                     with gr.Column(scale=2):
-                        output_audio = gr.Audio(label="Output Audio")
                 with gr.Row():
                     with gr.Accordion("Advanced Settings", open=False):
-                        seed = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
-                        aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
-                        cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                        cfg_stride = gr.Number(label="cfg_stride", value=1,
                                             info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
-                        prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
-                        sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
-                success_output = gr.HTML()
-                transcribe_btn.click(fn=transcribe_zh,
-                                    inputs=[input_audio],
-                                    outputs=[original_transcript, gr.State(), gr.State(), success_output])
-                run_btn.click(fn=run_edit_zh,
                             inputs=[
-                                seed, sub_amount,
-                                aug_text, cfg_coef, cfg_stride, prompt_length,
-                                input_audio, original_transcript, transcript,
                             ],
-                            outputs=[output_audio, success_output])
-                transcript.submit(fn=run_edit_zh,
                         inputs=[
-                                seed, sub_amount,
-                                aug_text, cfg_coef, cfg_stride, prompt_length,
-                                input_audio, original_transcript, transcript,
                         ],
-                    outputs=[output_audio, success_output]
                 )
             with gr.Tab("Mandarin TTS"):
                 with gr.Row():
                     with gr.Column(scale=2):
-                        input_audio = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
                         with gr.Group():
-                            original_transcript = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
                                                             info="Use whisperx model to get the transcript.")
-                            transcribe_btn = gr.Button(value="Transcribe")
                     with gr.Column(scale=3):
                         with gr.Group():
-                            transcript = gr.Textbox(label="Text", lines=7, value="我简直不敢相信同一个模型也可以进行文本到语音的生成", interactive=True)
-                            run_btn = gr.Button(value="Run")
                     with gr.Column(scale=2):
-                        output_audio = gr.Audio(label="Output Audio")
                 with gr.Row():
                     with gr.Accordion("Advanced Settings", open=False):
-                        seed = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
-                        aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
-                        cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                        cfg_stride = gr.Number(label="cfg_stride", value=1,
                                             info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
-                        prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
-                        sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
-                success_output = gr.HTML()
-                transcribe_btn.click(fn=transcribe_zh,
-                                    inputs=[input_audio],
-                                    outputs=[original_transcript, gr.State(), gr.State(), success_output])
-                run_btn.click(fn=run_tts_zh,
                             inputs=[
-                                seed, sub_amount,
-                                aug_text, cfg_coef, cfg_stride, prompt_length,
-                                input_audio, original_transcript, transcript,
                             ],
-                            outputs=[output_audio, success_output])
-                transcript.submit(fn=run_tts_zh,
                         inputs=[
-                                seed, sub_amount,
-                                aug_text, cfg_coef, cfg_stride, prompt_length,
-                                input_audio, original_transcript, transcript,
                         ],
-                    outputs=[output_audio, success_output]
                 )
         # Launch the Gradio demo
-        demo.launch()

 @spaces.GPU
 def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
+        audio_path, transcript):
     codec_audio_sr = 16000
     codec_sr = 50
     # text normalization
     target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
+    # orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
     [orig_transcript, segments, _, _] = transcribe_en(audio_path)
     orig_transcript = orig_transcript.lower()
 @spaces.GPU
 def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
+        audio_path, transcript):
     codec_audio_sr = 16000
     codec_sr = 50
     # text normalization
     target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
+    # orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
     [orig_transcript, segments, _, _] = transcribe_en(audio_path)
     orig_transcript = orig_transcript.lower()
 @spaces.GPU
 def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
+        audio_path, transcript):
     codec_audio_sr = 16000
     codec_sr = 50
     # text normalization
     target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
+    # orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
     [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
 @spaces.GPU
 def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
+        audio_path, transcript):
     codec_audio_sr = 16000
     codec_sr = 50
     # text normalization
     target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
+    # orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
     [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
                 with gr.Row():
                     with gr.Column(scale=2):
+                        input_audio1 = gr.Audio(
                             value=f"{DEMO_PATH}/84_121550_000074_000000.wav",
                             label="Input Audio",
                             type="filepath",
                             interactive=True
                         )
                         with gr.Group():
+                            original_transcript1 = gr.Textbox(
                                 label="Original transcript",
                                 lines=5,
                                 value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
                                 info="Use whisperx model to get the transcript."
                             )
+                            transcribe_btn1 = gr.Button(value="Transcribe")
                     with gr.Column(scale=3):
                         with gr.Group():
+                            transcript1 = gr.Textbox(
                                 label="Text",
                                 lines=7,
                                 value="but when I saw the mirage of the lake in the distance, which the sense deceives, lost not by distance any of its marks.",
                                 interactive=True
                             )
+                            run_btn1 = gr.Button(value="Run")
                     with gr.Column(scale=2):
+                        output_audio1 = gr.Audio(label="Output Audio")
                 with gr.Row():
                     with gr.Accordion("Advanced Settings", open=False):
+                        seed1 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
+                        aug_text1 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
+                        cfg_coef1 = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride1 = gr.Number(label="cfg_stride", value=5,
                                             info="cfg stride, 5 is a good value for English, change if you don't like the results")
+                        prompt_length1 = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
+                        sub_amount1 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
+                success_output1 = gr.HTML()
+                transcribe_btn1.click(
                     fn=transcribe_en,
+                    inputs=[input_audio1],
+                    outputs=[original_transcript1, gr.State(), gr.State(), success_output1]
                 )
+                run_btn1.click(fn=run_edit_en,
                             inputs=[
+                                seed1, sub_amount1,
+                                aug_text1, cfg_coef1, cfg_stride1, prompt_length1,
+                                input_audio1, transcript1,
                             ],
+                            outputs=[output_audio1, success_output1])
+                transcript1.submit(fn=run_edit_en,
                         inputs=[
+                                seed1, sub_amount1,
+                                aug_text1, cfg_coef1, cfg_stride1, prompt_length1,
+                                input_audio1, transcript1,
                         ],
+                    outputs=[output_audio1, success_output1]
                 )
             with gr.Tab("English TTS"):
                 with gr.Row():
                     with gr.Column(scale=2):
+                        input_audio2 = gr.Audio(value=f"{DEMO_PATH}/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
                         with gr.Group():
+                            original_transcript2 = gr.Textbox(label="Original transcript", lines=5, value="but when I had approached so near to them the common object, which the sense deceives, lost not by distance any of its marks.",
                                                             info="Use whisperx model to get the transcript.")
+                            transcribe_btn2 = gr.Button(value="Transcribe")
                     with gr.Column(scale=3):
                         with gr.Group():
+                            transcript2 = gr.Textbox(label="Text", lines=7, value="I cannot believe that the same model can also do text to speech synthesis too!", interactive=True)
+                            run_btn2 = gr.Button(value="Run")
                     with gr.Column(scale=2):
+                        output_audio2 = gr.Audio(label="Output Audio")
                 with gr.Row():
                     with gr.Accordion("Advanced Settings", open=False):
+                        seed2 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
+                        aug_text2 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
+                        cfg_coef2 = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride2 = gr.Number(label="cfg_stride", value=5,
                                             info="cfg stride, 5 is a good value for English, change if you don't like the results")
+                        prompt_length2 = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
+                        sub_amount2 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
+                success_output2 = gr.HTML()
+                transcribe_btn2.click(fn=transcribe_en,
+                                    inputs=[input_audio2],
+                                    outputs=[original_transcript2, gr.State(), gr.State(), success_output2])
+                run_btn2.click(fn=run_tts_en,
                             inputs=[
+                                seed2, sub_amount2,
+                                aug_text2, cfg_coef2, cfg_stride2, prompt_length2,
+                                input_audio2, transcript2,
                             ],
+                            outputs=[output_audio2, success_output2])
+                transcript2.submit(fn=run_tts_en,
                         inputs=[
+                                seed2, sub_amount2,
+                                aug_text2, cfg_coef2, cfg_stride2, prompt_length2,
+                                input_audio2, transcript2,
                         ],
+                    outputs=[output_audio2, success_output2]
                 )
             with gr.Tab("Mandarin Speech Editing"):
                 with gr.Row():
                     with gr.Column(scale=2):
+                        input_audio3 = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
                         with gr.Group():
+                            original_transcript3 = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
                                                             info="Use whisperx model to get the transcript.")
+                            transcribe_btn3 = gr.Button(value="Transcribe")
                     with gr.Column(scale=3):
                         with gr.Group():
+                            transcript3 = gr.Textbox(label="Text", lines=7, value="价格已基本都在一万到两万之间", interactive=True)
+                            run_btn3 = gr.Button(value="Run")
                     with gr.Column(scale=2):
+                        output_audio3 = gr.Audio(label="Output Audio")
                 with gr.Row():
                     with gr.Accordion("Advanced Settings", open=False):
+                        seed3 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
+                        aug_text3 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
+                        cfg_coef3 = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride3 = gr.Number(label="cfg_stride", value=1,
                                             info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
+                        prompt_length3 = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
+                        sub_amount3 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
+                success_output3 = gr.HTML()
+                transcribe_btn3.click(fn=transcribe_zh,
+                                    inputs=[input_audio3],
+                                    outputs=[original_transcript3, gr.State(), gr.State(), success_output3])
+                run_btn3.click(fn=run_edit_zh,
                             inputs=[
+                                seed3, sub_amount3,
+                                aug_text3, cfg_coef3, cfg_stride3, prompt_length3,
+                                input_audio3, transcript3,
                             ],
+                            outputs=[output_audio3, success_output3])
+                transcript3.submit(fn=run_edit_zh,
                         inputs=[
+                                seed3, sub_amount3,
+                                aug_text3, cfg_coef3, cfg_stride3, prompt_length3,
+                                input_audio3, transcript3,
                         ],
+                    outputs=[output_audio3, success_output3]
                 )
             with gr.Tab("Mandarin TTS"):
                 with gr.Row():
                     with gr.Column(scale=2):
+                        input_audio4 = gr.Audio(value=f"{DEMO_PATH}/aishell3_test.wav", label="Input Audio", type="filepath", interactive=True)
                         with gr.Group():
+                            original_transcript4 = gr.Textbox(label="Original transcript", lines=5, value="价格已基本都在三万到六万之间",
                                                             info="Use whisperx model to get the transcript.")
+                            transcribe_btn4 = gr.Button(value="Transcribe")
                     with gr.Column(scale=3):
                         with gr.Group():
+                            transcript4 = gr.Textbox(label="Text", lines=7, value="我简直不敢相信同一个模型也可以进行文本到语音的生成", interactive=True)
+                            run_btn4 = gr.Button(value="Run")
                     with gr.Column(scale=2):
+                        output_audio4 = gr.Audio(label="Output Audio")
                 with gr.Row():
                     with gr.Accordion("Advanced Settings", open=False):
+                        seed4 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
+                        aug_text4 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
+                        cfg_coef4 = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride4 = gr.Number(label="cfg_stride", value=1,
                                             info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
+                        prompt_length4 = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
+                        sub_amount4 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
+                success_output4 = gr.HTML()
+                transcribe_btn4.click(fn=transcribe_zh,
+                                    inputs=[input_audio4],
+                                    outputs=[original_transcript4, gr.State(), gr.State(), success_output4])
+                run_btn4.click(fn=run_tts_zh,
                             inputs=[
+                                seed4, sub_amount4,
+                                aug_text4, cfg_coef4, cfg_stride4, prompt_length4,
+                                input_audio4, transcript4,
                             ],
+                            outputs=[output_audio4, success_output4])
+                transcript4.submit(fn=run_tts_zh,
                         inputs=[
+                                seed4, sub_amount4,
+                                aug_text4, cfg_coef4, cfg_stride4, prompt_length4,
+                                input_audio4, transcript4,
                         ],
+                    outputs=[output_audio4, success_output4]
                 )
         # Launch the Gradio demo
+        demo.launch()