Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -46,16 +46,16 @@ vocoder_model.eval(inference=False)
|
|
46 |
def plot_spec_align(mel, align):
|
47 |
|
48 |
fig_mel = plt.figure()
|
49 |
-
ax_mel = fig_mel.add_subplot(
|
50 |
ax_mel.imshow(mel)
|
51 |
ax_mel.set_title('Mel-Scale Spectrogram', fontsize=12)
|
52 |
|
53 |
-
fig_align = plt.figure()
|
54 |
-
ax_align =
|
55 |
ax_align.imshow(align)
|
56 |
ax_align.set_title('Alignment', fontsize=12)
|
57 |
|
58 |
-
return fig_mel
|
59 |
|
60 |
|
61 |
def synthesize(text, gst_1, gst_2, gst_3):
|
@@ -77,14 +77,14 @@ def synthesize(text, gst_1, gst_2, gst_3):
|
|
77 |
mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
|
78 |
mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
|
79 |
alignments = alignments.squeeze().T.detach().numpy()
|
80 |
-
fig_mel
|
81 |
|
82 |
-
return (22050, audio_numpy), fig_mel
|
83 |
|
84 |
|
85 |
iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"),
|
86 |
gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")],
|
87 |
-
outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.Plot(label="
|
88 |
title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
|
89 |
iface.launch()
|
90 |
|
|
|
46 |
def plot_spec_align(mel, align):
    """Draw the mel spectrogram and the alignment as two stacked panels.

    Args:
        mel: Image data handed to ``imshow`` for the top panel (the caller
            passes a NumPy array; presumably 2-D, mel bins x frames --
            confirm against ``synthesize``).
        align: Image data handed to ``imshow`` for the bottom panel.

    Returns:
        The matplotlib figure holding both panels.
    """
    fig_mel = plt.figure()

    # Top panel of a 2-row, 1-column grid.
    ax_mel = fig_mel.add_subplot(211)
    ax_mel.imshow(mel)
    ax_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

    # Bottom panel shares the same figure so a single plot output shows both.
    ax_align = fig_mel.add_subplot(212)
    ax_align.imshow(align)
    ax_align.set_title('Alignment', fontsize=12)

    # pyplot keeps every figure made with plt.figure() alive in a global
    # registry until it is closed. This runs once per request in a
    # long-lived server, so drop the figure from pyplot here; the Figure
    # object itself stays valid and can still be rendered by the caller.
    plt.close(fig_mel)

    return fig_mel
|
59 |
|
60 |
|
61 |
def synthesize(text, gst_1, gst_2, gst_3):
|
|
|
77 |
mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
|
78 |
mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
|
79 |
alignments = alignments.squeeze().T.detach().numpy()
|
80 |
+
fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
|
81 |
|
82 |
+
return (22050, audio_numpy), fig_mel # fig_align
|
83 |
|
84 |
|
85 |
# Gradio front end: a text box and three style-token weight sliders go in,
# the generated audio and a plot come out.
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Slider(0.2, 0.45, label="First style token weight:"),
        gr.Slider(0.2, 0.45, label="Second style token weight:"),
        gr.Slider(0.2, 0.45, label="Third style token weight:"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Plot(label="Output"),
    ],
    title="Single-Head Attention Tacotron2 with Style Tokens",
    description=DESCRIPTION,
)
iface.launch()
|
90 |
|