Multi-language_Text-to-Speech

Sleeping

App Files Community

Fabrice-TIERCELIN committed on Sep 24, 2024

Commit

3001020

verified ·

1 Parent(s): 29a24a3

Recode the interface into block

Browse files

Files changed (1) hide show

app.py +35 -32

app.py CHANGED Viewed

@@ -82,9 +82,10 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
             "output.wav",
         )
-title = "Multi-language Text-to-Speech"
-description = f"""
 <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
 <br/>
 XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
@@ -98,21 +99,15 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, wh
 <a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
 <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
 </p>
-"""
-article = ""
-examples = [
-]
-gr.Interface(
-    fn=predict,
-    inputs=[
-        gr.Textbox(
             label="Text Prompt",
             info="One or two sentences at a time is better",
             value="Hello, World! Here is an example of light voice cloning. Try to upload your best audio samples quality",
-        ),
-        gr.Dropdown(
             label="Language",
             info="Select an output language for the synthesised speech",
             choices=[
@@ -132,27 +127,35 @@ gr.Interface(
             ],
             max_choices=1,
             value="en",
-        ),
-        gr.Audio(
             label="Reference Audio",
             #info="Click on the ✎ button to upload your own target speaker audio",
             type="filepath",
             value="examples/female.wav",
-        ),
-        gr.Audio(sources=["microphone"],
                  type="filepath",
                  #info="Use your microphone to record audio",
-                 label="Use Microphone for Reference"),
-        gr.Checkbox(label="Check to use Microphone as Reference",
                     value=False,
-                    info="Notice: Microphone input may not work properly under traffic",),
-    ],
-    outputs=[
-        gr.Video(label="Waveform Visual", autoplay=True),
-        gr.Audio(label="Synthesised Audio", autoplay=False),
-    ],
-    title=title,
-    description=description,
-    article=article,
-    examples=examples,
-).queue().launch(debug=True)

             "output.wav",
         )
+with gr.Blocks() as interface:
+    gr.HTML("Multi-language Text-to-Speech")
+    gr.HTML(
+        """
 <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
 <br/>
 XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
 <a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
 <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
 </p>
+        """
+    )
+    with gr.Column():
+        prompt = gr.Textbox(
             label="Text Prompt",
             info="One or two sentences at a time is better",
             value="Hello, World! Here is an example of light voice cloning. Try to upload your best audio samples quality",
+        )
+        language = gr.Dropdown(
             label="Language",
             info="Select an output language for the synthesised speech",
             choices=[
             ],
             max_choices=1,
             value="en",
+        )
+        audio_file_pth = gr.Audio(
             label="Reference Audio",
             #info="Click on the ✎ button to upload your own target speaker audio",
             type="filepath",
             value="examples/female.wav",
+        )
+        mic_file_path = gr.Audio(sources=["microphone"],
                  type="filepath",
                  #info="Use your microphone to record audio",
+                 label="Use Microphone for Reference")
+        use_mic = gr.Checkbox(label="Check to use Microphone as Reference",
                     value=False,
+                    info="Notice: Microphone input may not work properly under traffic",)
+        with gr.Accordion("Advanced options", open = False):
+             debug_mode = gr.Checkbox(label = "Debug mode", value = False, info = "Show intermediate results")
+        submit = gr.Button("🚀 Speak", variant = "primary")
+        waveform_visual = gr.Video(label="Waveform Visual", autoplay=True)
+        synthesised_audio = gr.Audio(label="Synthesised Audio", autoplay=False)
+        information = gr.HTML()
+    submit.click(predict, inputs = [
+        prompt, language, audio_file_pth, mic_file_path, use_mic
+    ], outputs = [
+        waveform_visual,
+        synthesised_audio,
+        information
+    ], scroll_to_output = True)
+interface.queue().launch(debug=True)