Recode the interface into block
Browse files
app.py
CHANGED
@@ -82,9 +82,10 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic):
|
|
82 |
"output.wav",
|
83 |
)
|
84 |
|
85 |
-
|
86 |
-
|
87 |
-
|
|
|
88 |
<a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
|
89 |
<br/>
|
90 |
XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
|
@@ -98,21 +99,15 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, wh
|
|
98 |
<a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
|
99 |
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
|
100 |
</p>
|
101 |
-
"""
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
]
|
106 |
-
|
107 |
-
gr.Interface(
|
108 |
-
fn=predict,
|
109 |
-
inputs=[
|
110 |
-
gr.Textbox(
|
111 |
label="Text Prompt",
|
112 |
info="One or two sentences at a time is better",
|
113 |
value="Hello, World! Here is an example of light voice cloning. Try to upload your best audio samples quality",
|
114 |
-
)
|
115 |
-
gr.Dropdown(
|
116 |
label="Language",
|
117 |
info="Select an output language for the synthesised speech",
|
118 |
choices=[
|
@@ -132,27 +127,35 @@ gr.Interface(
|
|
132 |
],
|
133 |
max_choices=1,
|
134 |
value="en",
|
135 |
-
)
|
136 |
-
gr.Audio(
|
137 |
label="Reference Audio",
|
138 |
#info="Click on the ✎ button to upload your own target speaker audio",
|
139 |
type="filepath",
|
140 |
value="examples/female.wav",
|
141 |
-
)
|
142 |
-
gr.Audio(sources=["microphone"],
|
143 |
type="filepath",
|
144 |
#info="Use your microphone to record audio",
|
145 |
-
label="Use Microphone for Reference")
|
146 |
-
gr.Checkbox(label="Check to use Microphone as Reference",
|
147 |
value=False,
|
148 |
-
info="Notice: Microphone input may not work properly under traffic",)
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
gr.
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
"output.wav",
|
83 |
)
|
84 |
|
85 |
+
with gr.Blocks() as interface:
|
86 |
+
gr.HTML("Multi-language Text-to-Speech")
|
87 |
+
gr.HTML(
|
88 |
+
"""
|
89 |
<a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
|
90 |
<br/>
|
91 |
XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
|
|
|
99 |
<a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
|
100 |
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
|
101 |
</p>
|
102 |
+
"""
|
103 |
+
)
|
104 |
+
with gr.Column():
|
105 |
+
prompt = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
label="Text Prompt",
|
107 |
info="One or two sentences at a time is better",
|
108 |
value="Hello, World! Here is an example of light voice cloning. Try to upload your best audio samples quality",
|
109 |
+
)
|
110 |
+
language = gr.Dropdown(
|
111 |
label="Language",
|
112 |
info="Select an output language for the synthesised speech",
|
113 |
choices=[
|
|
|
127 |
],
|
128 |
max_choices=1,
|
129 |
value="en",
|
130 |
+
)
|
131 |
+
audio_file_pth = gr.Audio(
|
132 |
label="Reference Audio",
|
133 |
#info="Click on the ✎ button to upload your own target speaker audio",
|
134 |
type="filepath",
|
135 |
value="examples/female.wav",
|
136 |
+
)
|
137 |
+
mic_file_path = gr.Audio(sources=["microphone"],
|
138 |
type="filepath",
|
139 |
#info="Use your microphone to record audio",
|
140 |
+
label="Use Microphone for Reference")
|
141 |
+
use_mic = gr.Checkbox(label="Check to use Microphone as Reference",
|
142 |
value=False,
|
143 |
+
info="Notice: Microphone input may not work properly under traffic",)
|
144 |
+
with gr.Accordion("Advanced options", open = False):
|
145 |
+
debug_mode = gr.Checkbox(label = "Debug mode", value = False, info = "Show intermediate results")
|
146 |
+
|
147 |
+
submit = gr.Button("🚀 Speak", variant = "primary")
|
148 |
+
|
149 |
+
waveform_visual = gr.Video(label="Waveform Visual", autoplay=True)
|
150 |
+
synthesised_audio = gr.Audio(label="Synthesised Audio", autoplay=False)
|
151 |
+
information = gr.HTML()
|
152 |
+
|
153 |
+
submit.click(predict, inputs = [
|
154 |
+
prompt, language, audio_file_pth, mic_file_path, use_mic
|
155 |
+
], outputs = [
|
156 |
+
waveform_visual,
|
157 |
+
synthesised_audio,
|
158 |
+
information
|
159 |
+
], scroll_to_output = True)
|
160 |
+
|
161 |
+
interface.queue().launch(debug=True)
|