Update app.py
app.py CHANGED
@@ -29,15 +29,6 @@ plt.tight_layout(pad=0.5)  # You can adjust the pad value as needed
 torch.manual_seed(1234)
 MAX_WAV_VALUE = 32768.0
 
-DESCRIPTION = """
-This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens (GST).
-The whole architecture has been trained from scratch with the LJSpeech dataset. In order to control the relevance
-of each style token, we configured the attention module as a single-head.
-
-Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums less than 1 may work, but higher the
-generated speech may show more distortion and miss-pronunciations.
-"""
-
 
 def load_checkpoint(filepath, device):
     assert os.path.isfile(filepath)
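For context, the `load_checkpoint` helper that survives this hunk is cut off by the diff after its `assert`. A minimal sketch of how such a loader typically continues, assuming the checkpoint is an object saved with `torch.save` (the continuation is an assumption, not taken from app.py):

```python
import os
import torch

def load_checkpoint(filepath, device):
    # the assert is from app.py; the torch.load continuation is assumed
    assert os.path.isfile(filepath)
    checkpoint = torch.load(filepath, map_location=device)
    return checkpoint
```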
@@ -133,8 +124,6 @@ def synthesize(text, gst_1, gst_2, gst_3, voc):
     mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
     mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
     alignments = alignments.squeeze().T.detach().numpy()
-    # fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
-    # fig_mel, fig_align = plot_spec_align_sep(mel_outputs_postnet, alignments)
 
     # normalize numpy arrays between [-1, 1]
     min_val = np.min(mel_outputs_postnet)
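The `# normalize numpy arrays between [-1, 1]` step that follows the removed plotting calls is a standard min-max rescale. A minimal sketch of what it likely computes (the function name is illustrative, not from app.py):

```python
import numpy as np

def normalize_to_unit_range(x: np.ndarray) -> np.ndarray:
    # min-max rescale to [-1, 1]: map min(x) -> -1 and max(x) -> 1
    min_val = np.min(x)
    max_val = np.max(x)
    return 2.0 * (x - min_val) / (max_val - min_val) - 1.0
```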
@@ -153,9 +142,6 @@ def synthesize(text, gst_1, gst_2, gst_3, voc):
     return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align
 
 
-# Custom Demo Interface:
-# theme='ysharma/steampunk',
-# css=".gradio-container {background: url('file=background_images/wallpaper_test_mod_2.jpg')}"
 with gr.Blocks() as demo:
     gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
                 "<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
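The removed comments hint at an abandoned theming experiment. For reference, `gr.Blocks` does accept `theme` and `css` keyword arguments; a minimal sketch of how those commented-out settings would be applied (the background image path is illustrative):

```python
import gradio as gr

# hypothetical styled variant of the demo; theme and css are real Blocks kwargs
with gr.Blocks(theme='ysharma/steampunk',
               css=".gradio-container {background: url('file=background.jpg')}") as demo:
    gr.Markdown("Styled demo")

demo.launch()
```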
@@ -180,27 +166,20 @@ with gr.Blocks() as demo:
                               container=False, value=0, min_width=300)  # label="Vocoder")
             greet_btn = gr.Button("Synthesize!", scale=1)
         with gr.Column():
-            # wave_video = gr.make_waveform(audio)
             with gr.Tab("Spectrogram"):
-                # spec_plot = gr.Plot()
                 spec_plot = gr.Image(container=False)
             with gr.Tab("Alignment"):
-                # align_plot = gr.Plot()
                 align_plot = gr.Image(container=False)
             wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)
-            # play_video = gr.Button(label="Play", size='sm')
-            # audio_clip = gr.Audio(label="Generated Speech", type="numpy")
 
     def display_video():
         return wave_video
-    # play_video.click(fn=display_video)
     greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                     outputs=[wave_video, spec_plot, align_plot],
                     api_name="synthesize")
 
     with gr.Row():
         with gr.Column():
-            # gr.Markdown("### Audio Examples")
             gr.Examples(examples=infer_from_text_examples,
                         inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                         outputs=[wave_video, spec_plot, align_plot],
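The surviving `greet_btn.click(...)` line is the core of the app's wiring: a button event mapped to the `synthesize` callback with explicit input and output components. A stripped-down, self-contained sketch of the same pattern (the stub callback is illustrative, not the Space's real `synthesize`):

```python
import gradio as gr

def synthesize_stub(text):
    # stand-in for the real synthesize(); just echoes the input
    return f"Would synthesize: {text}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input Text")
    out = gr.Textbox(label="Result")
    btn = gr.Button("Synthesize!")
    # same wiring pattern as app.py: fn, inputs, outputs, api_name
    btn.click(fn=synthesize_stub, inputs=inp, outputs=out, api_name="synthesize")

demo.launch()
```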
@@ -242,11 +221,4 @@ with gr.Blocks() as demo:
                 head for simplicity, ease control purposes, but also to observer whether this attention still
                 works with just one head."""
 
-    # gr.Markdown("This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens "
-    #             "(GST). The whole architecture has been trained from scratch with the LJSpeech dataset. In order "
-    #             "to control the relevance of each style token, we configured the attention module as a single-head. "
-    #             "Keep in mind that, for a better synthetic output, the sum of the three style weights should be around "
-    #             "1. A combination that sums less than 1 may work, but higher the generated speech may show more "
-    #             "distortion and miss-pronunciations.")
-
 demo.launch()