AlexK-PL committed
Commit af18667 · 1 Parent(s): 1ee6f2c

Update app.py

Files changed (1):
  1. app.py +0 -28
app.py CHANGED
@@ -29,15 +29,6 @@ plt.tight_layout(pad=0.5)  # You can adjust the pad value as needed
 torch.manual_seed(1234)
 MAX_WAV_VALUE = 32768.0
 
-DESCRIPTION = """
-This is a Tacotron2 model based on NVIDIA's model plus three unsupervised Global Style Tokens (GST).
-The whole architecture has been trained from scratch on the LJSpeech dataset. To control the relevance
-of each style token, we configured the attention module as single-head.
-
-Keep in mind that, for better synthetic output, the sum of the three style weights should be around 1. A combination
-that sums to less than 1 may work, but with higher sums the generated speech may show more distortion and mispronunciations.
-"""
-
 
 def load_checkpoint(filepath, device):
     assert os.path.isfile(filepath)
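The removed description advises keeping the three style-token weights at a sum of roughly 1. As an illustration (the sentence, weight values, and vocoder index below are invented; only the signature comes from the `synthesize` function shown in this diff):

```python
# Hypothetical call: the three GST weights sum to 1.0, per the removed
# guidance; voc=0 matches the vocoder Radio component's default value.
aw, mel, align = synthesize("The birch canoe slid on the smooth planks.",
                            gst_1=0.4, gst_2=0.3, gst_3=0.3, voc=0)
```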
@@ -133,8 +124,6 @@ def synthesize(text, gst_1, gst_2, gst_3, voc):
     mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
     mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
     alignments = alignments.squeeze().T.detach().numpy()
-    # fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
-    # fig_mel, fig_align = plot_spec_align_sep(mel_outputs_postnet, alignments)
 
     # normalize numpy arrays between [-1, 1]
     min_val = np.min(mel_outputs_postnet)
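The hunk ends just after the first line of the [-1, 1] normalization. A minimal sketch of min-max scaling to that range, consistent with the comment above (this completion is an assumption, not necessarily the file's exact code):

```python
import numpy as np

def normalize_to_unit_range(x):
    # Min-max scale an array into [-1, 1]; assumes x is not constant.
    min_val, max_val = np.min(x), np.max(x)
    return 2.0 * (x - min_val) / (max_val - min_val) - 1.0
```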
@@ -153,9 +142,6 @@ def synthesize(text, gst_1, gst_2, gst_3, voc):
     return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align
 
 
-# Custom Demo Interface:
-# theme='ysharma/steampunk',
-# css=".gradio-container {background: url('file=background_images/wallpaper_test_mod_2.jpg')}"
 with gr.Blocks() as demo:
     gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
                 "<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
@@ -180,27 +166,20 @@ with gr.Blocks() as demo:
                                  container=False, value=0, min_width=300)  # label="Vocoder")
             greet_btn = gr.Button("Synthesize!", scale=1)
         with gr.Column():
-            # wave_video = gr.make_waveform(audio)
             with gr.Tab("Spectrogram"):
-                # spec_plot = gr.Plot()
                 spec_plot = gr.Image(container=False)
             with gr.Tab("Alignment"):
-                # align_plot = gr.Plot()
                 align_plot = gr.Image(container=False)
             wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)
-            # play_video = gr.Button(label="Play", size='sm')
-            # audio_clip = gr.Audio(label="Generated Speech", type="numpy")
 
     def display_video():
         return wave_video
-    # play_video.click(fn=display_video)
     greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                     outputs=[wave_video, spec_plot, align_plot],
                     api_name="synthesize")
 
     with gr.Row():
         with gr.Column():
-            # gr.Markdown("### Audio Examples")
             gr.Examples(examples=infer_from_text_examples,
                         inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                         outputs=[wave_video, spec_plot, align_plot],
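The removed `gr.make_waveform(audio)` comment hints at how the `wave_video` component is fed: Gradio's `make_waveform` renders an audio signal into a short waveform video and returns the file path, which can be used as the value of a `gr.Video`. A minimal sketch (the sample rate matches the commented `(22050, audio_numpy)` return hint; the signal itself is a placeholder):

```python
import numpy as np
import gradio as gr

sr = 22050                                  # sample rate used elsewhere in this file
audio = np.zeros(sr, dtype=np.float32)      # placeholder 1-second signal
video_path = gr.make_waveform((sr, audio))  # path to a rendered waveform video
# video_path can then be returned as the value of the gr.Video component
```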
@@ -242,11 +221,4 @@ with gr.Blocks() as demo:
                    head for simplicity and ease of control, but also to observe whether this attention still
                    works with just one head."""
 
-    # gr.Markdown("This is a Tacotron2 model based on NVIDIA's model plus three unsupervised Global Style Tokens "
-    #             "(GST). The whole architecture has been trained from scratch on the LJSpeech dataset. In order "
-    #             "to control the relevance of each style token, we configured the attention module as single-head. "
-    #             "Keep in mind that, for better synthetic output, the sum of the three style weights should be "
-    #             "around 1. A combination that sums to less than 1 may work, but with higher sums the generated "
-    #             "speech may show more distortion and mispronunciations.")
-
 demo.launch()