AlexK-PL committed on
Commit
2f6ba98
·
1 Parent(s): d42b2ef

Outputs generated spectrogram and alignment

Browse files
Files changed (1) hide show
  1. app.py +37 -4
app.py CHANGED
@@ -11,13 +11,19 @@ from melgan.utils.hparams import load_hparam
11
  import torch
12
  import numpy as np
13
 
 
 
 
14
  torch.manual_seed(1234)
15
  MAX_WAV_VALUE = 32768.0
16
 
17
- DESCRIPTION = """# Single-Head Attention Tacotron2 with Global Style Tokens
18
  This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens (GST).
19
  The whole architecture has been trained from scratch with the LJSpeech dataset. In order to control the relevance
20
  of each style token, we configured the attention module as a single-head.
 
 
 
21
  """
22
 
23
  # load trained tacotron2 + GST model:
@@ -37,12 +43,31 @@ vocoder_model.load_state_dict(checkpoint['model_g'])
37
  vocoder_model.eval(inference=False)
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def synthesize(text, gst_1, gst_2, gst_3):
41
  sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
42
  sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
43
 
44
  # gst_head_scores = np.array([0.5, 0.15, 0.35]) # originally ([0.5, 0.15, 0.35])
45
- gst_head_scores = np.array([gst_1, gst_2, gst_3]) # originally ([0.5, 0.15, 0.35])
46
  gst_scores = torch.from_numpy(gst_head_scores).float()
47
 
48
  mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
@@ -52,9 +77,17 @@ def synthesize(text, gst_1, gst_2, gst_3):
52
  audio = vocoder_model.inference(mel_outputs_postnet)
53
  audio_numpy = audio.data.cpu().detach().numpy()
54
 
55
- return (22050, audio_numpy)
 
 
 
 
 
56
 
57
 
58
- iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"), gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")], outputs=[gr.Audio(label="Generated Speech", type="numpy"),], title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
 
 
 
59
  iface.launch()
60
 
 
11
  import torch
12
  import numpy as np
13
 
14
+ from matplotlib import pyplot as plt
15
+ from matplotlib import gridspec
16
+
17
  torch.manual_seed(1234)
18
  MAX_WAV_VALUE = 32768.0
19
 
20
+ DESCRIPTION = """
21
  This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens (GST).
22
  The whole architecture has been trained from scratch with the LJSpeech dataset. In order to control the relevance
23
  of each style token, we configured the attention module as a single-head.
24
+
25
+ Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums less than 1 may work, but higher the
26
+ generated speech may show more distortion and misspronunciations.
27
  """
28
 
29
  # load trained tacotron2 + GST model:
 
43
  vocoder_model.eval(inference=False)
44
 
45
 
46
def plot_spec_align(mel, align):
    """Draw the mel spectrogram and the attention alignment side by side.

    Args:
        mel: image-like array passed to ``imshow`` for the left panel
            (the caller passes the squeezed post-net mel output --
            presumably 2-D; TODO confirm).
        align: image-like array passed to ``imshow`` for the right panel
            (the caller passes the squeezed alignment matrix).

    Returns:
        The ``matplotlib.pyplot`` module, whose current figure holds both
        panels (the caller hands this object to the plot output).
    """
    # Start from an empty figure: pyplot keeps global state, so without this
    # repeated calls would keep drawing onto the previous call's axes.
    plt.clf()

    # One row, two columns: one cell per panel. The original allocated a 1x1
    # grid and then indexed cell 1, which raises IndexError.
    grid_spec = gridspec.GridSpec(1, 2)

    mel_ax = plt.subplot(grid_spec[0])
    mel_ax.imshow(mel)
    mel_ax.axis('off')
    mel_ax.set_title('Mel-Scale Spectrogram', fontsize=20)

    align_ax = plt.subplot(grid_spec[1])
    align_ax.imshow(align)
    align_ax.axis('off')
    align_ax.set_title('Alignment', fontsize=20)

    # The original also called plt.imshow(legend, ...) with an undefined name
    # (NameError) and plt.grid('off'); both are dropped -- axis('off') already
    # hides ticks and grid lines.
    return plt
63
+
64
+
65
  def synthesize(text, gst_1, gst_2, gst_3):
66
  sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
67
  sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
68
 
69
  # gst_head_scores = np.array([0.5, 0.15, 0.35]) # originally ([0.5, 0.15, 0.35])
70
+ gst_head_scores = np.array([gst_1, gst_2, gst_3])
71
  gst_scores = torch.from_numpy(gst_head_scores).float()
72
 
73
  mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
 
77
  audio = vocoder_model.inference(mel_outputs_postnet)
78
  audio_numpy = audio.data.cpu().detach().numpy()
79
 
80
+ # prepare plot for the output:
81
+ mel_outputs_postnet = mel_outputs_postnet.squeeze().detach().numpy()
82
+ alignments = alignments.squeeze().detach().numpy()
83
+ plt = plot_spec_align(mel_outputs_postnet, alignments)
84
+
85
+ return (22050, audio_numpy), plt
86
 
87
 
88
+ iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:"),
89
+ gr.Slider(0.2, 0.45, label="Second style token weight:"), gr.Slider(0.2, 0.45, label="Third style token weight:")],
90
+ outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Image(type="plot", label="Output"),],
91
+ title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
92
  iface.launch()
93