AlexK-PL committed on
Commit
5b2ce7f
·
1 Parent(s): c628e3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -7
app.py CHANGED
@@ -46,16 +46,16 @@ vocoder_model.eval(inference=False)
46
  def plot_spec_align(mel, align):
47
 
48
  fig_mel = plt.figure()
49
- ax_mel = fig_mel.add_subplot(111)
50
  ax_mel.imshow(mel)
51
  ax_mel.set_title('Mel-Scale Spectrogram', fontsize=12)
52
 
53
- fig_align = plt.figure()
54
- ax_align = fig_align.add_subplot(111)
55
  ax_align.imshow(align)
56
  ax_align.set_title('Alignment', fontsize=12)
57
 
58
- return fig_mel, fig_align
59
 
60
 
61
  def synthesize(text, gst_1, gst_2, gst_3):
@@ -77,14 +77,14 @@ def synthesize(text, gst_1, gst_2, gst_3):
77
  mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
78
  mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
79
  alignments = alignments.squeeze().T.detach().numpy()
80
- fig_mel, fig_align = plot_spec_align(mel_outputs_postnet, alignments)
81
 
82
- return (22050, audio_numpy), fig_mel, fig_align
83
 
84
 
85
  iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"),
86
  gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")],
87
- outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.Plot(label="Spectrogram"), gr.Plot(label="Alignments")],
88
  title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
89
  iface.launch()
90
 
 
46
  def plot_spec_align(mel, align):
47
 
48
  fig_mel = plt.figure()
49
+ ax_mel = fig_mel.add_subplot(211)
50
  ax_mel.imshow(mel)
51
  ax_mel.set_title('Mel-Scale Spectrogram', fontsize=12)
52
 
53
+ # fig_align = plt.figure()
54
+ ax_align = fig_mel.add_subplot(212) # fig_align
55
  ax_align.imshow(align)
56
  ax_align.set_title('Alignment', fontsize=12)
57
 
58
+ return fig_mel # fig_align
59
 
60
 
61
  def synthesize(text, gst_1, gst_2, gst_3):
 
77
  mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
78
  mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
79
  alignments = alignments.squeeze().T.detach().numpy()
80
+ fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
81
 
82
+ return (22050, audio_numpy), fig_mel # fig_align
83
 
84
 
85
  iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"),
86
  gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")],
87
+ outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.Plot(label="Output"),],
88
  title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
89
  iface.launch()
90