Spaces:

AlexK-PL
/

Tacotron2_GST_eng

Sleeping

App Files Files Community

AlexK-PL committed on Sep 5, 2023

Commit

2f6ba98

1 Parent(s): d42b2ef

Outputs generated spectrogram and alignment

Browse files

Files changed (1) hide show

app.py +37 -4

app.py CHANGED Viewed

@@ -11,13 +11,19 @@ from melgan.utils.hparams import load_hparam
 import torch
 import numpy as np
 torch.manual_seed(1234)
 MAX_WAV_VALUE = 32768.0
-DESCRIPTION = """# Single-Head Attention Tacotron2 with Global Style Tokens
 This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens (GST).
 The whole architecture has been trained from scratch with the LJSpeech dataset. In order to control the relevance
 of each style token, we configured the attention module as a single-head.
 """
 # load trained tacotron2 + GST model:
@@ -37,12 +43,31 @@ vocoder_model.load_state_dict(checkpoint['model_g'])
 vocoder_model.eval(inference=False)
 def synthesize(text, gst_1, gst_2, gst_3):
     sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
     sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
     # gst_head_scores = np.array([0.5, 0.15, 0.35])  # originally ([0.5, 0.15, 0.35])
-    gst_head_scores = np.array([gst_1, gst_2, gst_3])  # originally ([0.5, 0.15, 0.35])
     gst_scores = torch.from_numpy(gst_head_scores).float()
     mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
@@ -52,9 +77,17 @@ def synthesize(text, gst_1, gst_2, gst_3):
       audio = vocoder_model.inference(mel_outputs_postnet)
     audio_numpy = audio.data.cpu().detach().numpy()
-    return (22050, audio_numpy)
-iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"), gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")], outputs=[gr.Audio(label="Generated Speech", type="numpy"),], title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
 iface.launch()

 import torch
 import numpy as np
+from matplotlib import pyplot as plt
+from matplotlib import gridspec
 torch.manual_seed(1234)
 MAX_WAV_VALUE = 32768.0
+DESCRIPTION = """
 This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens (GST).
 The whole architecture has been trained from scratch with the LJSpeech dataset. In order to control the relevance
 of each style token, we configured the attention module as a single-head.
+Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums to less than 1 may work, but with a higher sum the
+generated speech may show more distortion and mispronunciations.
 """
 # load trained tacotron2 + GST model:
 vocoder_model.eval(inference=False)
def plot_spec_align(mel, align):
    """Draw the mel spectrogram and the alignment side by side.

    Args:
        mel: Image data for the left panel, passed to ``imshow``.
        align: Image data for the right panel, passed to ``imshow``.

    Returns:
        The ``matplotlib.pyplot`` module; its current figure holds both
        panels, so the caller can hand it to a plot-type output.
    """
    # Clear the current figure first so repeated calls do not draw new
    # panels on top of the ones left over from the previous call.
    plt.clf()

    # One row, two columns. The earlier 1x1 grid had a single cell, so
    # indexing the second panel raised IndexError.
    grid_spec = gridspec.GridSpec(1, 2)

    panels = (
        (mel, 'Mel-Scale Spectrogram'),
        (align, 'Alignment'),
    )
    for index, (image, title) in enumerate(panels):
        ax = plt.subplot(grid_spec[index])
        ax.imshow(image)
        # Hiding the axes also hides ticks and grid lines, so no separate
        # grid call is needed.
        ax.axis('off')
        ax.set_title(title, fontsize=20)

    # NOTE: the former `plt.imshow(legend, ...)` call was dropped because
    # `legend` is not defined anywhere in this function and raised NameError.
    return plt
 def synthesize(text, gst_1, gst_2, gst_3):
     sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
     sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
     # gst_head_scores = np.array([0.5, 0.15, 0.35])  # originally ([0.5, 0.15, 0.35])
+    gst_head_scores = np.array([gst_1, gst_2, gst_3])
     gst_scores = torch.from_numpy(gst_head_scores).float()
     mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
       audio = vocoder_model.inference(mel_outputs_postnet)
     audio_numpy = audio.data.cpu().detach().numpy()
+    # prepare plot for the output:
+    mel_outputs_postnet = mel_outputs_postnet.squeeze().detach().numpy()
+    alignments = alignments.squeeze().detach().numpy()
+    plt = plot_spec_align(mel_outputs_postnet, alignments)
+    return (22050, audio_numpy), plt
+iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"),
+                                            gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")],
+                     outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Image(type="plot", label="Output"),],
+                     title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
 iface.launch()