Spaces:
Sleeping
Sleeping
Outputs generated spectrogram and alignment
Browse files
app.py
CHANGED
@@ -11,13 +11,19 @@ from melgan.utils.hparams import load_hparam
|
|
11 |
import torch
|
12 |
import numpy as np
|
13 |
|
|
|
|
|
|
|
14 |
torch.manual_seed(1234)
|
15 |
MAX_WAV_VALUE = 32768.0
|
16 |
|
17 |
-
DESCRIPTION = """
|
18 |
This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens (GST).
|
19 |
The whole architecture has been trained from scratch with the LJSpeech dataset. In order to control the relevance
|
20 |
of each style token, we configured the attention module as a single-head.
|
|
|
|
|
|
|
21 |
"""
|
22 |
|
23 |
# load trained tacotron2 + GST model:
|
@@ -37,12 +43,31 @@ vocoder_model.load_state_dict(checkpoint['model_g'])
|
|
37 |
vocoder_model.eval(inference=False)
|
38 |
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
def synthesize(text, gst_1, gst_2, gst_3):
|
41 |
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
|
42 |
sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
|
43 |
|
44 |
# gst_head_scores = np.array([0.5, 0.15, 0.35]) # originally ([0.5, 0.15, 0.35])
|
45 |
-
gst_head_scores = np.array([gst_1, gst_2, gst_3])
|
46 |
gst_scores = torch.from_numpy(gst_head_scores).float()
|
47 |
|
48 |
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
|
@@ -52,9 +77,17 @@ def synthesize(text, gst_1, gst_2, gst_3):
|
|
52 |
audio = vocoder_model.inference(mel_outputs_postnet)
|
53 |
audio_numpy = audio.data.cpu().detach().numpy()
|
54 |
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
|
58 |
-
iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"),
|
|
|
|
|
|
|
59 |
iface.launch()
|
60 |
|
|
|
11 |
import torch
|
12 |
import numpy as np
|
13 |
|
14 |
+
from matplotlib import pyplot as plt
|
15 |
+
from matplotlib import gridspec
|
16 |
+
|
17 |
torch.manual_seed(1234)
|
18 |
MAX_WAV_VALUE = 32768.0
|
19 |
|
20 |
+
# Text shown under the demo title in the web UI (passed as `description=` to gr.Interface).
DESCRIPTION = """
This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens (GST).
The whole architecture has been trained from scratch with the LJSpeech dataset. In order to control the relevance
of each style token, we configured the attention module as a single-head.

Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums to less than 1 may work, but with a higher sum the
generated speech may show more distortion and mispronunciations.
"""
|
28 |
|
29 |
# load trained tacotron2 + GST model:
|
|
|
43 |
vocoder_model.eval(inference=False)
|
44 |
|
45 |
|
46 |
+
def plot_spec_align(mel, align):
    """Draw the mel spectrogram and the attention alignment side by side.

    Both arguments are handed directly to ``plt.imshow``, so each must be
    image-like data that ``imshow`` can render (presumably 2-D NumPy arrays
    coming from the synthesis step -- confirm against the caller).

    Args:
        mel: Image data for the "Mel-Scale Spectrogram" panel.
        align: Image data for the "Alignment" panel.

    Returns:
        The ``matplotlib.pyplot`` module; its current figure contains the
        two panels.
    """
    # Two panels are drawn below, so the grid needs two cells. The previous
    # 1x1 grid made `grid_spec[1]` raise IndexError.
    grid_spec = gridspec.GridSpec(1, 2)

    ax = plt.subplot(grid_spec[0])
    plt.imshow(mel)
    plt.axis('off')
    ax.set_title('Mel-Scale Spectrogram', fontsize=20)

    ax = plt.subplot(grid_spec[1])
    plt.imshow(align)
    plt.axis('off')
    ax.set_title('Alignment', fontsize=20)

    # The former `plt.imshow(legend, ...)` / `plt.grid('off')` tail was removed:
    # `legend` is not defined anywhere in this function (NameError), and the
    # truthy string 'off' does not disable the grid on current matplotlib.

    return plt
|
63 |
+
|
64 |
+
|
65 |
def synthesize(text, gst_1, gst_2, gst_3):
|
66 |
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
|
67 |
sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
|
68 |
|
69 |
# gst_head_scores = np.array([0.5, 0.15, 0.35]) # originally ([0.5, 0.15, 0.35])
|
70 |
+
gst_head_scores = np.array([gst_1, gst_2, gst_3])
|
71 |
gst_scores = torch.from_numpy(gst_head_scores).float()
|
72 |
|
73 |
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
|
|
|
77 |
audio = vocoder_model.inference(mel_outputs_postnet)
|
78 |
audio_numpy = audio.data.cpu().detach().numpy()
|
79 |
|
80 |
+
# prepare plot for the output:
|
81 |
+
mel_outputs_postnet = mel_outputs_postnet.squeeze().detach().numpy()
|
82 |
+
alignments = alignments.squeeze().detach().numpy()
|
83 |
+
plt = plot_spec_align(mel_outputs_postnet, alignments)
|
84 |
+
|
85 |
+
return (22050, audio_numpy), plt
|
86 |
|
87 |
|
88 |
+
# Build the Gradio UI: one text box plus three style-token weight sliders in,
# synthesized audio plus the spectrogram/alignment plot out.
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Input Text"),
        # Each slider is constructed with (0.2, 0.45) -- presumably its min/max range.
        gr.Slider(0.2, 0.45, label="First style token weight:"),
        gr.Slider(0.2, 0.45, label="Second style token weight:"),
        gr.Slider(0.2, 0.45, label="Third style token weight:"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.outputs.Image(type="plot", label="Output"),
    ],
    title="Single-Head Attention Tacotron2 with Style Tokens",
    description=DESCRIPTION,
)
iface.launch()
|
93 |
|