Update app.py
app.py CHANGED
@@ -29,15 +29,6 @@ plt.tight_layout(pad=0.5)  # You can adjust the pad value as needed
 torch.manual_seed(1234)
 MAX_WAV_VALUE = 32768.0
 
-DESCRIPTION = """
-This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens (GST).
-The whole architecture has been trained from scratch with the LJSpeech dataset. In order to control the relevance
-of each style token, we configured the attention module as a single-head.
-
-Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums less than 1 may work, but higher the
-generated speech may show more distortion and miss-pronunciations.
-"""
-
 
 def load_checkpoint(filepath, device):
     assert os.path.isfile(filepath)
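For context, the `load_checkpoint` helper that survives this hunk is cut off by the diff after its `assert`. A minimal sketch of how such a loader typically continues, assuming the checkpoint is an object saved with `torch.save` (the continuation is an assumption, not taken from app.py):

```python
import os
import torch

def load_checkpoint(filepath, device):
    # the assert is from app.py; the torch.load continuation is assumed
    assert os.path.isfile(filepath)
    checkpoint = torch.load(filepath, map_location=device)
    return checkpoint
```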
@@ -133,8 +124,6 @@ def synthesize(text, gst_1, gst_2, gst_3, voc):
     mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
     mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
     alignments = alignments.squeeze().T.detach().numpy()
-    # fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
-    # fig_mel, fig_align = plot_spec_align_sep(mel_outputs_postnet, alignments)
 
     # normalize numpy arrays between [-1, 1]
     min_val = np.min(mel_outputs_postnet)
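The `# normalize numpy arrays between [-1, 1]` step that follows the removed plotting calls is a standard min-max rescale. A minimal sketch of what it likely computes (the function name is illustrative, not from app.py):

```python
import numpy as np

def normalize_to_unit_range(x: np.ndarray) -> np.ndarray:
    # min-max rescale to [-1, 1]: map min(x) -> -1 and max(x) -> 1
    min_val = np.min(x)
    max_val = np.max(x)
    return 2.0 * (x - min_val) / (max_val - min_val) - 1.0
```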
@@ -153,9 +142,6 @@ def synthesize(text, gst_1, gst_2, gst_3, voc):
     return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align
 
 
-# Custom Demo Interface:
-# theme='ysharma/steampunk',
-# css=".gradio-container {background: url('file=background_images/wallpaper_test_mod_2.jpg')}"
 with gr.Blocks() as demo:
     gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
                 "<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
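The removed comments hint at an abandoned theming experiment. For reference, `gr.Blocks` does accept `theme` and `css` keyword arguments; a minimal sketch of how those commented-out settings would be applied (the background image path is illustrative):

```python
import gradio as gr

# hypothetical styled variant of the demo; theme and css are real Blocks kwargs
with gr.Blocks(theme='ysharma/steampunk',
               css=".gradio-container {background: url('file=background.jpg')}") as demo:
    gr.Markdown("Styled demo")

demo.launch()
```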
@@ -180,27 +166,20 @@ with gr.Blocks() as demo:
                               container=False, value=0, min_width=300)  # label="Vocoder")
             greet_btn = gr.Button("Synthesize!", scale=1)
         with gr.Column():
-            # wave_video = gr.make_waveform(audio)
             with gr.Tab("Spectrogram"):
-                # spec_plot = gr.Plot()
                 spec_plot = gr.Image(container=False)
             with gr.Tab("Alignment"):
-                # align_plot = gr.Plot()
                 align_plot = gr.Image(container=False)
             wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)
-            # play_video = gr.Button(label="Play", size='sm')
-            # audio_clip = gr.Audio(label="Generated Speech", type="numpy")
 
     def display_video():
         return wave_video
-    # play_video.click(fn=display_video)
     greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                     outputs=[wave_video, spec_plot, align_plot],
                     api_name="synthesize")
 
     with gr.Row():
         with gr.Column():
-            # gr.Markdown("### Audio Examples")
             gr.Examples(examples=infer_from_text_examples,
                         inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                         outputs=[wave_video, spec_plot, align_plot],
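The surviving `greet_btn.click(...)` line is the core of the app's wiring: a button event mapped to the `synthesize` callback with explicit input and output components. A stripped-down, self-contained sketch of the same pattern (the stub callback is illustrative, not the Space's real `synthesize`):

```python
import gradio as gr

def synthesize_stub(text):
    # stand-in for the real synthesize(); just echoes the input
    return f"Would synthesize: {text}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input Text")
    out = gr.Textbox(label="Result")
    btn = gr.Button("Synthesize!")
    # same wiring pattern as app.py: fn, inputs, outputs, api_name
    btn.click(fn=synthesize_stub, inputs=inp, outputs=out, api_name="synthesize")

demo.launch()
```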
@@ -242,11 +221,4 @@ with gr.Blocks() as demo:
                 head for simplicity, ease control purposes, but also to observer whether this attention still
                 works with just one head."""
 
-    # gr.Markdown("This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens "
-    #             "(GST). The whole architecture has been trained from scratch with the LJSpeech dataset. In order "
-    #             "to control the relevance of each style token, we configured the attention module as a single-head. "
-    #             "Keep in mind that, for a better synthetic output, the sum of the three style weights should be around "
-    #             "1. A combination that sums less than 1 may work, but higher the generated speech may show more "
-    #             "distortion and miss-pronunciations.")
-
 demo.launch()