Spaces:
Sleeping
Sleeping
Upload model files
Browse files- Decoder.py +379 -0
- Encoder.py +73 -0
- GST.py +368 -0
- Postnet.py +52 -0
- Tacotron2.py +112 -0
- audio_processing.py +93 -0
- hyper_parameters.py +70 -0
- logger.py +47 -0
- nn_layers.py +105 -0
- stft.py +140 -0
- utils.py +39 -0
Decoder.py
ADDED
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch.autograd import Variable
|
3 |
+
from torch import nn
|
4 |
+
from torch.nn import functional as F
|
5 |
+
from nn_layers import linear_module, location_layer
|
6 |
+
from utils import get_mask_from_lengths
|
7 |
+
|
8 |
+
|
9 |
+
class AttentionNet(nn.Module):
|
10 |
+
# 1024, 512, 128, 32, 31
|
11 |
+
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
|
12 |
+
attention_location_n_filters, attention_location_kernel_size):
|
13 |
+
super(AttentionNet, self).__init__()
|
14 |
+
self.query_layer = linear_module(attention_rnn_dim, attention_dim,
|
15 |
+
bias=False, w_init_gain='tanh')
|
16 |
+
# Projecting inputs into 128-D hidden representation
|
17 |
+
self.memory_layer = linear_module(embedding_dim, attention_dim, bias=False,
|
18 |
+
w_init_gain='tanh')
|
19 |
+
# Projecting into 1-D scalar value
|
20 |
+
self.v = linear_module(attention_dim, 1, bias=False)
|
21 |
+
# Convolutional layers to obtain location features and projecting them into 128-D hidden representation
|
22 |
+
self.location_layer = location_layer(attention_location_n_filters,
|
23 |
+
attention_location_kernel_size,
|
24 |
+
attention_dim)
|
25 |
+
self.score_mask_value = -float("inf")
|
26 |
+
|
27 |
+
def get_alignment_energies(self, query, processed_memory,
|
28 |
+
attention_weights_cat):
|
29 |
+
"""
|
30 |
+
PARAMS
|
31 |
+
------
|
32 |
+
query: decoder output (batch, n_mel_channels * n_frames_per_step)
|
33 |
+
processed_memory: processed encoder outputs (B, T_in, attention_dim)
|
34 |
+
attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
|
35 |
+
|
36 |
+
RETURNS
|
37 |
+
-------
|
38 |
+
alignment (batch, max_time)
|
39 |
+
"""
|
40 |
+
|
41 |
+
processed_query = self.query_layer(query.unsqueeze(1))
|
42 |
+
processed_attention_weights = self.location_layer(attention_weights_cat)
|
43 |
+
energies = self.v(torch.tanh(
|
44 |
+
processed_query + processed_attention_weights + processed_memory))
|
45 |
+
|
46 |
+
energies = energies.squeeze(-1) # eliminates the third dimension of the tensor, which is 1.
|
47 |
+
return energies
|
48 |
+
|
49 |
+
def forward(self, attention_hidden_state, memory, processed_memory,
|
50 |
+
attention_weights_cat, mask):
|
51 |
+
"""
|
52 |
+
PARAMS
|
53 |
+
------
|
54 |
+
attention_hidden_state: attention rnn last output
|
55 |
+
memory: encoder outputs
|
56 |
+
processed_memory: processed encoder outputs
|
57 |
+
attention_weights_cat: previous and cummulative attention weights
|
58 |
+
mask: binary mask for padded data
|
59 |
+
"""
|
60 |
+
alignment = self.get_alignment_energies(
|
61 |
+
attention_hidden_state, processed_memory, attention_weights_cat)
|
62 |
+
|
63 |
+
if mask is not None:
|
64 |
+
alignment.data.masked_fill_(mask, self.score_mask_value)
|
65 |
+
|
66 |
+
attention_weights = F.softmax(alignment, dim=1)
|
67 |
+
# I think attention_weights is a [BxNUMENCINPUTS] so with unsequeeze(1): [Bx1xNUMENCINPUTS] and memory is
|
68 |
+
# [BxNUMENCINPUTSx512]
|
69 |
+
attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
|
70 |
+
attention_context = attention_context.squeeze(1)
|
71 |
+
|
72 |
+
return attention_context, attention_weights
|
73 |
+
|
74 |
+
|
75 |
+
class Prenet(nn.Module):
|
76 |
+
def __init__(self, in_dim, sizes):
|
77 |
+
super(Prenet, self).__init__()
|
78 |
+
in_sizes = [in_dim] + sizes[:-1] # all list values but the last one. The result is a list of the in_dim element
|
79 |
+
# concatenated with sizes of layers (i.e. [80, 256])
|
80 |
+
self.layers = nn.ModuleList(
|
81 |
+
[linear_module(in_size, out_size, bias=False)
|
82 |
+
for (in_size, out_size) in zip(in_sizes, sizes)])
|
83 |
+
|
84 |
+
def forward(self, x):
|
85 |
+
for linear in self.layers:
|
86 |
+
x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
|
87 |
+
return x
|
88 |
+
|
89 |
+
|
90 |
+
class Decoder(nn.Module):
|
91 |
+
def __init__(self, tacotron_hyperparams):
|
92 |
+
super(Decoder, self).__init__()
|
93 |
+
self.n_mel_channels = tacotron_hyperparams['n_mel_channels']
|
94 |
+
self.n_frames_per_step = tacotron_hyperparams['number_frames_step']
|
95 |
+
self.encoder_embedding_dim = tacotron_hyperparams['encoder_embedding_dim']
|
96 |
+
self.attention_rnn_dim = tacotron_hyperparams['attention_rnn_dim'] # 1024
|
97 |
+
self.decoder_rnn_dim = tacotron_hyperparams['decoder_rnn_dim'] # 1024
|
98 |
+
self.prenet_dim = tacotron_hyperparams['prenet_dim']
|
99 |
+
self.max_decoder_steps = tacotron_hyperparams['max_decoder_steps']
|
100 |
+
# The threshold to decide whether stop or not stop decoding?
|
101 |
+
self.gate_threshold = tacotron_hyperparams['gate_threshold']
|
102 |
+
self.p_attention_dropout = tacotron_hyperparams['p_attention_dropout']
|
103 |
+
self.p_decoder_dropout = tacotron_hyperparams['p_decoder_dropout']
|
104 |
+
# Define the prenet: there is only one frame per step, so input dim is the number of mel channels.
|
105 |
+
# There are two fully connected layers:
|
106 |
+
self.prenet = Prenet(
|
107 |
+
tacotron_hyperparams['n_mel_channels'] * tacotron_hyperparams['number_frames_step'],
|
108 |
+
[tacotron_hyperparams['prenet_dim'], tacotron_hyperparams['prenet_dim']])
|
109 |
+
# input_size: 1024 + 512 (output of first LSTM cell + attention_context) / hidden_size: 1024
|
110 |
+
self.attention_rnn = nn.LSTMCell(
|
111 |
+
tacotron_hyperparams['prenet_dim'] + tacotron_hyperparams['encoder_embedding_dim'],
|
112 |
+
tacotron_hyperparams['attention_rnn_dim'])
|
113 |
+
# return attention_weights and attention_context. Does the alignments.
|
114 |
+
self.attention_layer = AttentionNet(
|
115 |
+
tacotron_hyperparams['attention_rnn_dim'], tacotron_hyperparams['encoder_embedding_dim'],
|
116 |
+
tacotron_hyperparams['attention_dim'], tacotron_hyperparams['attention_location_n_filters'],
|
117 |
+
tacotron_hyperparams['attention_location_kernel_size'])
|
118 |
+
# input_size: 256 + 512 (attention_context + prenet_info), hidden_size: 1024
|
119 |
+
self.decoder_rnn = nn.LSTMCell(
|
120 |
+
tacotron_hyperparams['attention_rnn_dim'] + tacotron_hyperparams['encoder_embedding_dim'],
|
121 |
+
tacotron_hyperparams['decoder_rnn_dim'], 1)
|
122 |
+
# (LSTM output)1024 + (attention_context)512, out_dim: number of mel channels. Last linear projection that
|
123 |
+
# generates an output decoder spectral frame.
|
124 |
+
self.linear_projection = linear_module(
|
125 |
+
tacotron_hyperparams['decoder_rnn_dim'] + tacotron_hyperparams['encoder_embedding_dim'],
|
126 |
+
tacotron_hyperparams['n_mel_channels']*tacotron_hyperparams['number_frames_step'])
|
127 |
+
# decision whether to continue decoding.
|
128 |
+
self.gate_layer = linear_module(
|
129 |
+
tacotron_hyperparams['decoder_rnn_dim'] + tacotron_hyperparams['encoder_embedding_dim'], 1,
|
130 |
+
bias=True, w_init_gain='sigmoid')
|
131 |
+
|
132 |
+
def get_go_frame(self, memory):
|
133 |
+
""" Gets all zeros frames to use as first decoder input
|
134 |
+
PARAMS
|
135 |
+
------
|
136 |
+
memory: decoder outputs
|
137 |
+
|
138 |
+
RETURNS
|
139 |
+
-------
|
140 |
+
decoder_input: all zeros frames
|
141 |
+
"""
|
142 |
+
B = memory.size(0)
|
143 |
+
decoder_input = Variable(memory.data.new(
|
144 |
+
B, self.n_mel_channels * self.n_frames_per_step).zero_())
|
145 |
+
return decoder_input
|
146 |
+
|
147 |
+
def initialize_decoder_states(self, memory, mask):
|
148 |
+
""" Initializes attention rnn states, decoder rnn states, attention
|
149 |
+
weights, attention cumulative weights, attention context, stores memory
|
150 |
+
and stores processed memory
|
151 |
+
PARAMS
|
152 |
+
------
|
153 |
+
memory: Encoder outputs
|
154 |
+
mask: Mask for padded data if training, expects None for inference
|
155 |
+
"""
|
156 |
+
B = memory.size(0)
|
157 |
+
MAX_TIME = memory.size(1)
|
158 |
+
|
159 |
+
self.attention_hidden = Variable(memory.data.new(
|
160 |
+
B, self.attention_rnn_dim).zero_())
|
161 |
+
self.attention_cell = Variable(memory.data.new(
|
162 |
+
B, self.attention_rnn_dim).zero_())
|
163 |
+
|
164 |
+
self.decoder_hidden = Variable(memory.data.new(
|
165 |
+
B, self.decoder_rnn_dim).zero_())
|
166 |
+
self.decoder_cell = Variable(memory.data.new(
|
167 |
+
B, self.decoder_rnn_dim).zero_())
|
168 |
+
|
169 |
+
self.attention_weights = Variable(memory.data.new(
|
170 |
+
B, MAX_TIME).zero_())
|
171 |
+
self.attention_weights_cum = Variable(memory.data.new(
|
172 |
+
B, MAX_TIME).zero_())
|
173 |
+
self.attention_context = Variable(memory.data.new(
|
174 |
+
B, self.encoder_embedding_dim).zero_())
|
175 |
+
|
176 |
+
self.memory = memory
|
177 |
+
self.processed_memory = self.attention_layer.memory_layer(memory)
|
178 |
+
self.mask = mask
|
179 |
+
|
180 |
+
def parse_decoder_inputs(self, decoder_inputs):
|
181 |
+
""" Prepares decoder inputs, i.e. mel outputs
|
182 |
+
PARAMS
|
183 |
+
------
|
184 |
+
decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs
|
185 |
+
|
186 |
+
RETURNS
|
187 |
+
-------
|
188 |
+
inputs: processed decoder inputs
|
189 |
+
|
190 |
+
"""
|
191 |
+
# (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
|
192 |
+
decoder_inputs = decoder_inputs.transpose(1, 2)
|
193 |
+
# reshape decoder inputs in case we want to work with more than 1 frame per step (chunks). Otherwise, this next
|
194 |
+
# line does not just do anything
|
195 |
+
decoder_inputs = decoder_inputs.view(
|
196 |
+
decoder_inputs.size(0),
|
197 |
+
int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
|
198 |
+
# (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
|
199 |
+
decoder_inputs = decoder_inputs.transpose(0, 1)
|
200 |
+
return decoder_inputs
|
201 |
+
|
202 |
+
def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
|
203 |
+
""" Prepares decoder outputs for output
|
204 |
+
PARAMS
|
205 |
+
------
|
206 |
+
mel_outputs:
|
207 |
+
gate_outputs: gate output energies
|
208 |
+
alignments:
|
209 |
+
|
210 |
+
RETURNS
|
211 |
+
-------
|
212 |
+
mel_outputs:
|
213 |
+
gate_outpust: gate output energies
|
214 |
+
alignments:
|
215 |
+
"""
|
216 |
+
# (T_out, B) -> (B, T_out)
|
217 |
+
alignments = torch.stack(alignments).transpose(0, 1)
|
218 |
+
# (T_out, B) -> (B, T_out)
|
219 |
+
gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
|
220 |
+
gate_outputs = gate_outputs.contiguous()
|
221 |
+
# (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
|
222 |
+
mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
|
223 |
+
# decouple frames per step
|
224 |
+
mel_outputs = mel_outputs.view(
|
225 |
+
mel_outputs.size(0), -1, self.n_mel_channels)
|
226 |
+
# (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
|
227 |
+
mel_outputs = mel_outputs.transpose(1, 2)
|
228 |
+
|
229 |
+
return mel_outputs, gate_outputs, alignments
|
230 |
+
|
231 |
+
def decode(self, decoder_input):
|
232 |
+
""" Decoder step using stored states, attention and memory
|
233 |
+
PARAMS
|
234 |
+
------
|
235 |
+
decoder_input: previous mel output
|
236 |
+
|
237 |
+
RETURNS
|
238 |
+
-------
|
239 |
+
mel_output:
|
240 |
+
gate_output: gate output energies
|
241 |
+
attention_weights:
|
242 |
+
"""
|
243 |
+
# concatenates [Bx1024] and [Bx512]. All dimensions match except 1 (torch.cat -1)
|
244 |
+
# concatenate the i-th decoder hidden state together with the i-th attention context
|
245 |
+
cell_input = torch.cat((decoder_input, self.attention_context), -1)
|
246 |
+
# the previous input is for the following LSTM cell, initialized with zeroes the hidden states and the cell
|
247 |
+
# state.
|
248 |
+
# compute the (i+1)th attention hidden state based on the i-th decoder hidden state and attention context.
|
249 |
+
self.attention_hidden, self.attention_cell = self.attention_rnn(
|
250 |
+
cell_input, (self.attention_hidden, self.attention_cell))
|
251 |
+
self.attention_hidden = F.dropout(self.attention_hidden, self.p_attention_dropout, self.training)
|
252 |
+
self.attention_cell = F.dropout(self.attention_cell, self.p_attention_dropout, self.training)
|
253 |
+
# concatenate the i-th state attention weights together with the cumulated from previous states to compute
|
254 |
+
# (i+1)th state
|
255 |
+
attention_weights_cat = torch.cat(
|
256 |
+
(self.attention_weights.unsqueeze(1),
|
257 |
+
self.attention_weights_cum.unsqueeze(1)), dim=1)
|
258 |
+
# compute (i+1)th attention context and provide (i+1)th attention weights based on the (i+1)th attention hidden
|
259 |
+
# state and (i)th and prev. weights
|
260 |
+
self.attention_context, self.attention_weights = self.attention_layer(
|
261 |
+
self.attention_hidden, self.memory, self.processed_memory,
|
262 |
+
attention_weights_cat, self.mask)
|
263 |
+
|
264 |
+
# cumulate attention_weights adding the (i+1)th to compute (i+2)th state
|
265 |
+
self.attention_weights_cum += self.attention_weights
|
266 |
+
|
267 |
+
decoder_input = torch.cat((self.attention_hidden, self.attention_context), -1)
|
268 |
+
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(decoder_input,
|
269 |
+
(self.decoder_hidden, self.decoder_cell))
|
270 |
+
self.decoder_hidden = F.dropout(self.decoder_hidden, self.p_decoder_dropout, self.training)
|
271 |
+
self.decoder_cell = F.dropout(self.decoder_cell, self.p_decoder_dropout, self.training)
|
272 |
+
|
273 |
+
decoder_hidden_attention_context = torch.cat((self.decoder_hidden, self.attention_context), dim=1)
|
274 |
+
decoder_output = self.linear_projection(decoder_hidden_attention_context)
|
275 |
+
|
276 |
+
gate_prediction = self.gate_layer(decoder_hidden_attention_context)
|
277 |
+
|
278 |
+
return decoder_output, gate_prediction, self.attention_weights
|
279 |
+
|
280 |
+
"""
|
281 |
+
# the decoder_output from ith step passes through the pre-net to compute new decoder hidden state and attention_
|
282 |
+
# context (i+1)th
|
283 |
+
prenet_output = self.prenet(decoder_input)
|
284 |
+
# the decoder_input now is the concatenation of the pre-net output and the new (i+1)th attention_context
|
285 |
+
decoder_input = torch.cat((prenet_output, self.attention_context), -1)
|
286 |
+
# another LSTM Cell to compute the decoder hidden (i+1)th state from the decoder_input
|
287 |
+
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
|
288 |
+
decoder_input, (self.decoder_hidden, self.decoder_cell))
|
289 |
+
|
290 |
+
# with new attention_context we concatenate again with the new (i+1)th decoder_hidden state.
|
291 |
+
decoder_hidden_attention_context = torch.cat(
|
292 |
+
(self.decoder_hidden, self.attention_context), dim=1)
|
293 |
+
# the (i+1)th output is a linear projection of the decoder hidden state with a weight matrix plus bias.
|
294 |
+
decoder_output = self.linear_projection(
|
295 |
+
decoder_hidden_attention_context)
|
296 |
+
# check whether (i+1)th state is the last of the sequence
|
297 |
+
gate_prediction = self.gate_layer(decoder_hidden_attention_context)
|
298 |
+
return decoder_output, gate_prediction, self.attention_weights"""
|
299 |
+
|
300 |
+
def forward(self, memory, decoder_inputs, memory_lengths):
|
301 |
+
""" Decoder forward pass for training
|
302 |
+
PARAMS
|
303 |
+
------
|
304 |
+
memory: Encoder outputs
|
305 |
+
decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
|
306 |
+
memory_lengths: Encoder output lengths for attention masking.
|
307 |
+
|
308 |
+
RETURNS
|
309 |
+
-------
|
310 |
+
mel_outputs: mel outputs from the decoder
|
311 |
+
gate_outputs: gate outputs from the decoder
|
312 |
+
alignments: sequence of attention weights from the decoder
|
313 |
+
"""
|
314 |
+
|
315 |
+
decoder_input = self.get_go_frame(memory).unsqueeze(0)
|
316 |
+
decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
|
317 |
+
decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
|
318 |
+
decoder_inputs = self.prenet(decoder_inputs)
|
319 |
+
|
320 |
+
self.initialize_decoder_states(
|
321 |
+
memory, mask=~get_mask_from_lengths(memory_lengths))
|
322 |
+
|
323 |
+
mel_outputs, gate_outputs, alignments = [], [], []
|
324 |
+
|
325 |
+
while len(mel_outputs) < decoder_inputs.size(0) - 1:
|
326 |
+
decoder_input = decoder_inputs[len(mel_outputs)]
|
327 |
+
mel_output, gate_output, attention_weights = self.decode(
|
328 |
+
decoder_input)
|
329 |
+
# a class list, when += means concatenation of vectors
|
330 |
+
mel_outputs += [mel_output.squeeze(1)]
|
331 |
+
gate_outputs += [gate_output.squeeze()]
|
332 |
+
alignments += [attention_weights]
|
333 |
+
# getting the frame indexing from reference mel frames to pass it as the new input of the next decoding
|
334 |
+
# step: Teacher Forcing!
|
335 |
+
# Takes each time_step of sequences of all mini-batch samples (i.e. [48, 80] as the decoder_inputs is
|
336 |
+
# parsed as [189, 48, 80]).
|
337 |
+
|
338 |
+
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
|
339 |
+
mel_outputs, gate_outputs, alignments)
|
340 |
+
|
341 |
+
return mel_outputs, gate_outputs, alignments
|
342 |
+
|
343 |
+
def inference(self, memory):
|
344 |
+
""" Decoder inference
|
345 |
+
PARAMS
|
346 |
+
------
|
347 |
+
memory: Encoder outputs
|
348 |
+
|
349 |
+
RETURNS
|
350 |
+
-------
|
351 |
+
mel_outputs: mel outputs from the decoder
|
352 |
+
gate_outputs: gate outputs from the decoder
|
353 |
+
alignments: sequence of attention weights from the decoder
|
354 |
+
"""
|
355 |
+
decoder_input = self.get_go_frame(memory)
|
356 |
+
|
357 |
+
self.initialize_decoder_states(memory, mask=None)
|
358 |
+
|
359 |
+
mel_outputs, gate_outputs, alignments = [], [], []
|
360 |
+
while True:
|
361 |
+
decoder_input = self.prenet(decoder_input)
|
362 |
+
mel_output, gate_output, alignment = self.decode(decoder_input)
|
363 |
+
|
364 |
+
mel_outputs += [mel_output.squeeze(1)]
|
365 |
+
gate_outputs += [gate_output]
|
366 |
+
alignments += [alignment]
|
367 |
+
|
368 |
+
if torch.sigmoid(gate_output.data) > self.gate_threshold:
|
369 |
+
break
|
370 |
+
elif len(mel_outputs) == self.max_decoder_steps:
|
371 |
+
print("Warning! Reached max decoder steps")
|
372 |
+
break
|
373 |
+
|
374 |
+
decoder_input = mel_output
|
375 |
+
|
376 |
+
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
|
377 |
+
mel_outputs, gate_outputs, alignments)
|
378 |
+
|
379 |
+
return mel_outputs, gate_outputs, alignments
|
Encoder.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import nn
|
2 |
+
from torch.nn import functional as F
|
3 |
+
from nn_layers import convolutional_module
|
4 |
+
|
5 |
+
|
6 |
+
class Encoder(nn.Module):
|
7 |
+
"""This is the encoder part of tacotron2. It includes a stack of three 1d convolutional layers
|
8 |
+
followed by batch normalization and ReLU activations, and a bidirectional LSTM layer.
|
9 |
+
These part encodes sequences of input characters."""
|
10 |
+
def __init__(self, encoder_params):
|
11 |
+
super(Encoder, self).__init__()
|
12 |
+
# we set the dropout applied at each convolutional layer, as specified in Tacotron2's paper
|
13 |
+
# self.dropout = nn.Dropout(0.5)
|
14 |
+
|
15 |
+
# A stack of convolution layers. For this model, there are 3 conv1d layers. We initialize a python
|
16 |
+
# list and run in a loop as many times as number of convolutional layers (three). In each
|
17 |
+
# iteration we initialize nn.Sequential container that permits us set a block of neural network
|
18 |
+
# modules. We need three equal nn sequences in a list. Then this list is properly registered using
|
19 |
+
# ModuleList class object (can act as an iterable, or be indexed).
|
20 |
+
# To see how the convolution is computed:
|
21 |
+
# https://pytorch.org/docs/stable/nn.html#conv1d
|
22 |
+
|
23 |
+
stack_of_convolutions = []
|
24 |
+
for _ in range(encoder_params['encoder_convs']):
|
25 |
+
conv_layer = nn.Sequential(convolutional_module(encoder_params['symbols_embedding_length'],
|
26 |
+
encoder_params['symbols_embedding_length'],
|
27 |
+
kernel_size=encoder_params['conv_kernel_size'],
|
28 |
+
stride=encoder_params['conv_stride'],
|
29 |
+
padding=int((encoder_params['conv_kernel_size'] - 1) / 2),
|
30 |
+
dilation=encoder_params['conv_dilation'],
|
31 |
+
w_init_gain=encoder_params['w_init_gain']),
|
32 |
+
nn.BatchNorm1d(encoder_params['symbols_embedding_length']))
|
33 |
+
stack_of_convolutions.append(conv_layer)
|
34 |
+
self.stack_conv = nn.ModuleList(stack_of_convolutions)
|
35 |
+
|
36 |
+
# Last part of the encoder is the bi-directional LSTM layer. As described in the original Tacotron2
|
37 |
+
# paper, there is only one BiLSTM layer with 256 units for each direction.
|
38 |
+
|
39 |
+
"""Can I add the bidirectional LSTM layer together with the convolutional stack??? CHECK IT OUT!"""
|
40 |
+
|
41 |
+
self.bi_lstm = nn.LSTM(encoder_params['symbols_embedding_length'],
|
42 |
+
int(encoder_params['symbols_embedding_length'] / 2), 1, batch_first=True,
|
43 |
+
bidirectional=True)
|
44 |
+
|
45 |
+
def forward(self, input_sequences, input_lengths):
|
46 |
+
for conv in self.stack_conv:
|
47 |
+
input_sequences = F.dropout(F.relu(conv(input_sequences)), 0.5, self.training)
|
48 |
+
|
49 |
+
input_sequences = input_sequences.transpose(1, 2)
|
50 |
+
# After convolution filters, is the original sequence length the same? CHECK IT OUT
|
51 |
+
input_lengths = input_lengths.cpu().numpy()
|
52 |
+
# Returns a packed sequence object with variable-length sequences before passing through BiLSTM layer
|
53 |
+
input_sequences = nn.utils.rnn.pack_padded_sequence(input_sequences, input_lengths, batch_first=True)
|
54 |
+
# nn.LSTM accepts packed variable length sequence tensors. The output will also return a packed variable
|
55 |
+
# length sequence tensor. The output dimension is (seq_length, batch, num_directions*hidden_size), but
|
56 |
+
# if batch_first is True, then (batch, seq_length, num_direction*hidden_size).
|
57 |
+
self.bi_lstm.flatten_parameters()
|
58 |
+
outputs, _ = self.bi_lstm(input_sequences)
|
59 |
+
# Pads again the tensor back to normal format before packing
|
60 |
+
outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
|
61 |
+
|
62 |
+
return outputs # [N, Max_seq_length, E_length]
|
63 |
+
|
64 |
+
def inference(self, x):
|
65 |
+
for conv in self.stack_conv:
|
66 |
+
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
|
67 |
+
|
68 |
+
x = x.transpose(1, 2)
|
69 |
+
|
70 |
+
self.bi_lstm.flatten_parameters()
|
71 |
+
outputs, _ = self.bi_lstm(x)
|
72 |
+
|
73 |
+
return outputs
|
GST.py
ADDED
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.init as init
|
4 |
+
import torch.nn.functional as F
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
|
8 |
+
class GST(nn.Module):
|
9 |
+
|
10 |
+
def __init__(self, hyper_parameters):
|
11 |
+
|
12 |
+
super().__init__()
|
13 |
+
self.prosody_extractor = LogMelSpecReferenceEncoder()
|
14 |
+
self.stl = MultiSTL(hyper_parameters=hyper_parameters)
|
15 |
+
|
16 |
+
def forward(self, logmel_spec, logmel_lengths):
|
17 |
+
prosody_features_embedded = self.prosody_extractor(logmel_spec, logmel_lengths) # [N, 512]
|
18 |
+
style_embed, gst_scores = self.stl(prosody_features_embedded)
|
19 |
+
|
20 |
+
return style_embed, gst_scores
|
21 |
+
|
22 |
+
def inference(self, scores): # NEED TO DEFINE SCORES TENSOR DIMENSION!!
|
23 |
+
style_embed_inference = self.stl.inference(scores=scores)
|
24 |
+
|
25 |
+
return style_embed_inference
|
26 |
+
|
27 |
+
|
28 |
+
class PitchContourEncoder(nn.Module):
|
29 |
+
"""
|
30 |
+
|
31 |
+
"""
|
32 |
+
def __init__(self, hyper_parameters):
|
33 |
+
|
34 |
+
super().__init__()
|
35 |
+
|
36 |
+
K = len(hyper_parameters['ref_enc_out_channels'])
|
37 |
+
filters = [1] + hyper_parameters['ref_enc_out_channels']
|
38 |
+
kernel_sizes = hyper_parameters['seq_ref_enc_filter_size']
|
39 |
+
|
40 |
+
convs_2d = []
|
41 |
+
|
42 |
+
for i in range(K):
|
43 |
+
conv2d_init = nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
|
44 |
+
kernel_size=(kernel_sizes[i], 3), stride=(1, 1),
|
45 |
+
padding=(int((kernel_sizes[i] - 1) / 2), int((3 - 1) / 2)), bias=True)
|
46 |
+
|
47 |
+
nn.init.xavier_uniform_(conv2d_init.weight, gain=torch.nn.init.calculate_gain('linear'))
|
48 |
+
|
49 |
+
convs_2d.append(conv2d_init)
|
50 |
+
|
51 |
+
self.convs2D = nn.ModuleList(convs_2d)
|
52 |
+
|
53 |
+
self.bns2D = nn.ModuleList([nn.BatchNorm2d(num_features=hyper_parameters['ref_enc_out_channels'][i])
|
54 |
+
for i in range(K)])
|
55 |
+
|
56 |
+
# WEIGHT INITIALIZATION DEFAULT:
|
57 |
+
self.prosody_bi_lstm = nn.LSTM(input_size=int(176), hidden_size=int(512/2), num_layers=1, batch_first=True,
|
58 |
+
bidirectional=True)
|
59 |
+
|
60 |
+
def forward(self, bin_locations): # [N, BIN_SUBAND, LEN_MELSPEC] (BIN_SUBAND = 13)
|
61 |
+
N = bin_locations.size(0) # Number of samples
|
62 |
+
# Changing tensor dimensions to have 1 input channel for the first conv2D layer:
|
63 |
+
bin_locations = bin_locations.unsqueeze(1)
|
64 |
+
bin_locations = bin_locations.transpose(2, 3) # [N, 1, LEN_MELSPEC, BIN_SUBAND]
|
65 |
+
"""We implement ReLU gates at the output of Conv. layers. We could check it without"""
|
66 |
+
# For pitch tracking:
|
67 |
+
for conv2, bn2 in zip(self.convs2D, self.bns2D):
|
68 |
+
bin_locations = conv2(bin_locations)
|
69 |
+
bin_locations = bn2(bin_locations)
|
70 |
+
bin_locations = F.dropout(F.relu(bin_locations), 0.5, self.training) # [N, Cout, LEN_MELSPEC, BIN_SUBAND]
|
71 |
+
|
72 |
+
# Resize:
|
73 |
+
bin_locations = bin_locations.transpose(1, 2) # [N, LEN_MELSPEC, Cout, BIN_SUBAND]
|
74 |
+
T = bin_locations.size(1)
|
75 |
+
bin_locations = bin_locations.contiguous().view(N, T, -1) # [N, LEN_MELSPEC, Cout*BIN_SUBAND]
|
76 |
+
|
77 |
+
# Encode sequences into a bidirectional LSTM layer:
|
78 |
+
"""In our case, we do not care about the specific length of each sequence, as with the zero padding the encoder
|
79 |
+
should be able to also encode the different lengths and see zero when its over. That is why we do not apply
|
80 |
+
a packing padded sequence before LSTM layer."""
|
81 |
+
_, (encoded_prosody, cell_state) = self.prosody_bi_lstm(bin_locations)
|
82 |
+
|
83 |
+
encoded_prosody = encoded_prosody.transpose(0, 1)
|
84 |
+
encoded_prosody = encoded_prosody.contiguous().view(N, -1)
|
85 |
+
|
86 |
+
return encoded_prosody # should be [N, 512]
|
87 |
+
|
88 |
+
|
89 |
+
# DENSE GST Reference Encoder:
|
90 |
+
class ProsodyEncoder(nn.Module):
|
91 |
+
"""
|
92 |
+
This convolution class nn.Module performs two parallel convolution stacks, 1-D conv. and another 2-D conv.
|
93 |
+
Afterwards, the output of both will be concatenated to be passed, later, through a bidirectional LSTM layer.
|
94 |
+
"""
|
95 |
+
def __init__(self, hyper_parameters):
|
96 |
+
|
97 |
+
super().__init__()
|
98 |
+
|
99 |
+
K = len(hyper_parameters['ref_enc_out_channels'])
|
100 |
+
filters = [1] + hyper_parameters['ref_enc_out_channels']
|
101 |
+
kernel_sizes = hyper_parameters['seq_ref_enc_filter_size']
|
102 |
+
|
103 |
+
# I NEED TO ADJUST PADDING TO NOT LOSE THE TOTAL LENGTH OF SEQUENCE!!
|
104 |
+
convs_1d = []
|
105 |
+
convs_2d = []
|
106 |
+
|
107 |
+
for i in range(K):
|
108 |
+
conv1d_init = nn.Conv1d(in_channels=filters[i], out_channels=filters[i + 1],
|
109 |
+
kernel_size=kernel_sizes[i], stride=1,
|
110 |
+
padding=int((kernel_sizes[i] - 1) / 2), bias=True)
|
111 |
+
|
112 |
+
nn.init.xavier_uniform_(conv1d_init.weight, gain=torch.nn.init.calculate_gain('linear'))
|
113 |
+
|
114 |
+
convs_1d.append(conv1d_init)
|
115 |
+
|
116 |
+
conv2d_init = nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
|
117 |
+
kernel_size=(kernel_sizes[i], 3), stride=(1, 1),
|
118 |
+
padding=(int((kernel_sizes[i] - 1) / 2), int((3 - 1) / 2)), bias=True)
|
119 |
+
|
120 |
+
nn.init.xavier_uniform_(conv2d_init.weight, gain=torch.nn.init.calculate_gain('linear'))
|
121 |
+
|
122 |
+
convs_2d.append(conv2d_init)
|
123 |
+
|
124 |
+
self.convs1D = nn.ModuleList(convs_1d)
|
125 |
+
self.convs2D = nn.ModuleList(convs_2d)
|
126 |
+
|
127 |
+
self.bns1D = nn.ModuleList([nn.BatchNorm1d(num_features=hyper_parameters['ref_enc_out_channels'][i])
|
128 |
+
for i in range(K)])
|
129 |
+
self.bns2D = nn.ModuleList([nn.BatchNorm2d(num_features=hyper_parameters['ref_enc_out_channels'][i])
|
130 |
+
for i in range(K)])
|
131 |
+
|
132 |
+
self.prosody_linear = nn.Linear(512, 256, bias=True)
|
133 |
+
torch.nn.init.xavier_uniform_(self.prosody_linear.weight, gain=torch.nn.init.calculate_gain('linear'))
|
134 |
+
|
135 |
+
# WEIGHT INITIALIZATION DEFAULT:
|
136 |
+
self.prosody_bi_lstm = nn.LSTM(input_size=int(256), hidden_size=int(512/2), num_layers=1, batch_first=True,
|
137 |
+
bidirectional=True)
|
138 |
+
|
139 |
+
def forward(self, bin_locations, pitch_intensities): # [N, LEN_MELSPEC, 1], [N, LEN_MELSPEC, 3]
|
140 |
+
N = bin_locations.size(0) # Number of samples
|
141 |
+
num_intensities = pitch_intensities.size(2)
|
142 |
+
# Changing tensor dimensions to have 1 input channel for the first conv2D layer:
|
143 |
+
pitch_intensities = pitch_intensities.view(N, 1, -1, num_intensities) # [N, 1, LEN_MELSPEC, num_intensities]
|
144 |
+
bin_locations = bin_locations.transpose(1, 2) # [N, 1, LEN_MELSPEC]
|
145 |
+
"""We implement ReLU gates at the output of Conv. layers. We could check it without"""
|
146 |
+
# For pitch tracking:
|
147 |
+
for conv, bn in zip(self.convs1D, self.bns1D):
|
148 |
+
bin_locations = conv(bin_locations)
|
149 |
+
bin_locations = bn(bin_locations)
|
150 |
+
bin_locations = F.dropout(F.relu(bin_locations), 0.5, self.training) # [N, Cout, T]
|
151 |
+
|
152 |
+
# For pitch intensities:
|
153 |
+
for conv2, bn2 in zip(self.convs2D, self.bns2D):
|
154 |
+
pitch_intensities = conv2(pitch_intensities)
|
155 |
+
pitch_intensities = bn2(pitch_intensities)
|
156 |
+
pitch_intensities = F.dropout(F.relu(pitch_intensities), 0.5, self.training) # [N, Cout, T, bins]
|
157 |
+
|
158 |
+
# Resize pitch intensities
|
159 |
+
bin_locations = bin_locations.transpose(1, 2) # [N, T, Cout]
|
160 |
+
pitch_intensities = pitch_intensities.transpose(1, 2) # [N, T, Cout, bins]
|
161 |
+
T = pitch_intensities.size(1)
|
162 |
+
pitch_intensities = pitch_intensities.contiguous().view(N, T, -1) # [N, T, Cout*bins]
|
163 |
+
|
164 |
+
# Concatenate features
|
165 |
+
pitch_convolved = torch.cat((bin_locations, pitch_intensities), 2)
|
166 |
+
|
167 |
+
# Linear projection (IS IT NECESSARY? DOES ACTIVATION FUNCTION IMPROVE THE RESULT?)
|
168 |
+
projection_pitch_convolved = F.dropout(F.tanh(self.prosody_linear(pitch_convolved)), 0.5, self.training)
|
169 |
+
|
170 |
+
# Encode sequences into a bidirectional LSTM layer:
|
171 |
+
"""In our case, we do not care about the specific length of each sequence, as with the zero padding the encoder
|
172 |
+
should be able to also encode the different lengths and see zero when its over. That is why we do not apply
|
173 |
+
a packing padded sequence before LSTM layer."""
|
174 |
+
_, (encoded_prosody, cell_state) = self.prosody_bi_lstm(projection_pitch_convolved)
|
175 |
+
|
176 |
+
encoded_prosody = encoded_prosody.transpose(0, 1)
|
177 |
+
encoded_prosody = encoded_prosody.contiguous().view(N, -1)
|
178 |
+
|
179 |
+
return encoded_prosody # should be [N, 512]
|
180 |
+
|
181 |
+
|
182 |
+
class LogMelSpecReferenceEncoder(nn.Module):
|
183 |
+
"""
|
184 |
+
"""
|
185 |
+
def __init__(self):
|
186 |
+
|
187 |
+
super().__init__()
|
188 |
+
|
189 |
+
reference_encoder_out_channels = [32, 32, 64, 64, 128, 128]
|
190 |
+
K = len(reference_encoder_out_channels)
|
191 |
+
filters = [1] + reference_encoder_out_channels
|
192 |
+
kernel_size = (3, 3)
|
193 |
+
stride = (2, 2)
|
194 |
+
padding = (1, 1)
|
195 |
+
|
196 |
+
convs_2d = []
|
197 |
+
|
198 |
+
for i in range(K):
|
199 |
+
conv2d_init = nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
|
200 |
+
kernel_size=kernel_size, stride=stride,
|
201 |
+
padding=padding, bias=True)
|
202 |
+
|
203 |
+
nn.init.xavier_uniform_(conv2d_init.weight, gain=torch.nn.init.calculate_gain('linear'))
|
204 |
+
|
205 |
+
convs_2d.append(conv2d_init)
|
206 |
+
|
207 |
+
self.convs2D = nn.ModuleList(convs_2d)
|
208 |
+
self.bns2D = nn.ModuleList([nn.BatchNorm2d(num_features=reference_encoder_out_channels[i])
|
209 |
+
for i in range(K)])
|
210 |
+
|
211 |
+
out_channels = self.calculate_channels(80, 3, 2, 1, K)
|
212 |
+
# self.gru = nn.GRU(input_size=reference_encoder_out_channels[-1] * out_channels, hidden_size=512,
|
213 |
+
# batch_first=True, bidirectional=False)
|
214 |
+
|
215 |
+
# WEIGHT INITIALIZATION DEFAULT:
|
216 |
+
self.bi_lstm = nn.LSTM(input_size=reference_encoder_out_channels[-1] * out_channels,
|
217 |
+
hidden_size=int(512/2), num_layers=1, batch_first=True, bidirectional=True)
|
218 |
+
|
219 |
+
def forward(self, logmel_spec, logmel_lengths): # [N, MEL_CHANNELS, LEN_MELSPEC]
|
220 |
+
N = logmel_spec.size(0) # Number of samples
|
221 |
+
# Changing tensor dimensions to have 1 input channel for the first conv2D layer:
|
222 |
+
logmel_spec = logmel_spec.unsqueeze(1)
|
223 |
+
logmel_spec = logmel_spec.transpose(2, 3) # [N, 1, LEN_MELSPEC, MEL_CHANNELS]
|
224 |
+
"""We implement ReLU gates at the output of Conv. layers. We could check it without"""
|
225 |
+
for conv2, bn2 in zip(self.convs2D, self.bns2D):
|
226 |
+
logmel_spec = conv2(logmel_spec)
|
227 |
+
logmel_spec = bn2(logmel_spec)
|
228 |
+
logmel_spec = F.dropout(F.relu(logmel_spec), 0.5, self.training) # [N, Cout, LEN_MELSPEC, BIN_SUBAND]
|
229 |
+
|
230 |
+
# Resize:
|
231 |
+
logmel_spec = logmel_spec.transpose(1, 2) # [N, LEN_MELSPEC, Cout, MEL_CHANNELS]
|
232 |
+
T = logmel_spec.size(1)
|
233 |
+
logmel_spec = logmel_spec.contiguous().view(N, T, -1) # [N, LEN_MELSPEC, Cout*BIN_SUBAND]
|
234 |
+
|
235 |
+
logmel_lengths = logmel_lengths.cpu().numpy()
|
236 |
+
last_hidden_states = torch.zeros(N, 512)
|
237 |
+
|
238 |
+
logmel_after_lengths = np.trunc(logmel_lengths / 2**6)
|
239 |
+
logmel_after_lengths = logmel_after_lengths + 1
|
240 |
+
logmel_after_lengths = logmel_after_lengths.astype(int)
|
241 |
+
logmel_after_lengths = torch.tensor(logmel_after_lengths)
|
242 |
+
# logmel_spec = nn.utils.rnn.pack_padded_sequence(logmel_spec, logmel_after_lengths, batch_first=True)
|
243 |
+
self.bi_lstm.flatten_parameters()
|
244 |
+
# memory, out = self.gru(logmel_spec)
|
245 |
+
outputs, (hidden_states, cell_state) = self.bi_lstm(logmel_spec)
|
246 |
+
hidden_states = hidden_states.transpose(0, 1)
|
247 |
+
hidden_states = hidden_states.contiguous().view(N, -1)
|
248 |
+
# outputs, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
|
249 |
+
|
250 |
+
# for j in range(N):
|
251 |
+
# last_hidden_states[j, :] = outputs[j, logmel_after_lengths[j] - 1, :]
|
252 |
+
|
253 |
+
# return last_hidden_states.cuda(non_blocking=True)
|
254 |
+
return hidden_states
|
255 |
+
|
256 |
+
def calculate_channels(self, L, kernel_size, stride, padding, n_convs):
|
257 |
+
for i in range(n_convs):
|
258 |
+
L = (L - kernel_size + 2 * padding) // stride + 1
|
259 |
+
return L
|
260 |
+
|
261 |
+
|
262 |
+
# BASIC FORM FOR NOW. NEEDS TO BE EXPANDED TO OUR NEW PROPOSAL
|
263 |
+
class MultiSTL(nn.Module):
|
264 |
+
|
265 |
+
"""
|
266 |
+
inputs --- [N, E]
|
267 |
+
"""
|
268 |
+
|
269 |
+
def __init__(self, hyper_parameters):
|
270 |
+
|
271 |
+
super().__init__()
|
272 |
+
# E = 256 / num_heads = 8 / token_num = 10!!
|
273 |
+
self.embed = nn.Parameter(torch.FloatTensor(hyper_parameters['token_num'],
|
274 |
+
hyper_parameters['E'] // hyper_parameters['num_heads']))
|
275 |
+
# d_q = hyper_parameters['E'] // 2
|
276 |
+
d_q = hyper_parameters['E']
|
277 |
+
d_k = hyper_parameters['E'] // hyper_parameters['num_heads']
|
278 |
+
|
279 |
+
self.attention = MultiHeadAttention(query_dim=d_q, key_dim=d_k,
|
280 |
+
num_units=hyper_parameters['E'], num_heads=hyper_parameters['num_heads'])
|
281 |
+
|
282 |
+
init.xavier_uniform_(self.embed, gain=init.calculate_gain('linear'))
|
283 |
+
|
284 |
+
def forward(self, inputs):
|
285 |
+
N = inputs.size(0) # Number of samples in the batch
|
286 |
+
query = inputs.unsqueeze(1) # [N, 1, E]
|
287 |
+
keys = F.tanh(self.embed).unsqueeze(0).expand(N, -1, -1) # [N, token_num, E // num_heads]
|
288 |
+
style_embed, gst_scores = self.attention(query, keys)
|
289 |
+
|
290 |
+
return style_embed, gst_scores
|
291 |
+
|
292 |
+
def inference(self, scores):
|
293 |
+
keys = F.tanh(self.embed).unsqueeze(0)
|
294 |
+
style_embed_inference = self.attention.inference(keys, scores=scores)
|
295 |
+
|
296 |
+
return style_embed_inference
|
297 |
+
|
298 |
+
|
299 |
+
class MultiHeadAttention(nn.Module):
|
300 |
+
"""
|
301 |
+
input:
|
302 |
+
query --- [N, T_q, query_dim] T_q = 1
|
303 |
+
key --- [N, T_k, key_dim] T_k = 5 (num of tokens)
|
304 |
+
output:
|
305 |
+
out --- [N, T_q, num_units]
|
306 |
+
"""
|
307 |
+
|
308 |
+
def __init__(self, query_dim, key_dim, num_units, num_heads):
|
309 |
+
|
310 |
+
super().__init__()
|
311 |
+
self.num_units = num_units
|
312 |
+
self.num_heads = num_heads
|
313 |
+
self.key_dim = key_dim
|
314 |
+
#self.sparse_max = Sparsemax(dim=3)
|
315 |
+
|
316 |
+
# Linear projection of data (encoder and decoder states) into a fixed number of hidden units
|
317 |
+
self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
|
318 |
+
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
319 |
+
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
320 |
+
|
321 |
+
def forward(self, query, key):
|
322 |
+
|
323 |
+
querys = self.W_query(query) # [N, T_q, num_units] the last dimension changes according to the output dim
|
324 |
+
keys = self.W_key(key) # [N, T_k, num_units]
|
325 |
+
values = self.W_value(key)
|
326 |
+
|
327 |
+
# the number of units set at the initialization is the total of hidden feature units we want. Then, we will
|
328 |
+
# assign a specific number of num_units according to the number of heads of the multi head Attention.
|
329 |
+
|
330 |
+
# Basically, style tokens are the number of heads we configure to learn different types of attention
|
331 |
+
#
|
332 |
+
split_size = self.num_units // self.num_heads # integer division, without remainder
|
333 |
+
querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h]
|
334 |
+
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
335 |
+
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
336 |
+
|
337 |
+
# score = softmax(QK^T / (d_k ** 0.5))
|
338 |
+
scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
|
339 |
+
scores = scores / (self.key_dim ** 0.33) # cube root instead of square to prevent very small values
|
340 |
+
scores = F.softmax(scores, dim=3) # From dimension 3, length of Key sequences.
|
341 |
+
# scores = self.sparse_max(scores)
|
342 |
+
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
|
343 |
+
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
|
344 |
+
scores = scores.squeeze()
|
345 |
+
|
346 |
+
return out, scores
|
347 |
+
|
348 |
+
def inference(self, key, scores): # key [1, 5, 512/8] # [1, num_tokens]
|
349 |
+
"""Only need the keys that are already trained, and the scores that I impose"""
|
350 |
+
scores = scores.unsqueeze(0).unsqueeze(0).unsqueeze(0).expand(self.num_heads, -1, -1, -1)
|
351 |
+
# print(scores.shape)
|
352 |
+
values = self.W_value(key)
|
353 |
+
|
354 |
+
# the number of units set at the initialization is the total of hidden feature units we want. Then, we will
|
355 |
+
# assign a specific number of num_units according to the number of heads of the multi head Attention.
|
356 |
+
|
357 |
+
# Basically, style tokens are the number of heads we configure to learn different types of attention
|
358 |
+
#
|
359 |
+
split_size = self.num_units // self.num_heads # integer division, without remainder
|
360 |
+
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
361 |
+
|
362 |
+
# score = softmax(QK^T / (d_k ** 0.5))
|
363 |
+
|
364 |
+
# out = score * V
|
365 |
+
out = torch.matmul(scores, values) # [h, 1, T_q = 1, num_units/h]
|
366 |
+
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
|
367 |
+
|
368 |
+
return out
|
Postnet.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
from torch.nn import functional as F
|
4 |
+
from nn_layers import convolutional_module
|
5 |
+
|
6 |
+
|
7 |
+
class Postnet(nn.Module):
|
8 |
+
"""Postnet
|
9 |
+
- Five 1-d convolution with 512 channels and kernel size 5
|
10 |
+
"""
|
11 |
+
|
12 |
+
def __init__(self, tacotron_hyperparams):
|
13 |
+
super(Postnet, self).__init__()
|
14 |
+
# self.dropout = nn.Dropout(0.5)
|
15 |
+
self.convolutions = nn.ModuleList()
|
16 |
+
|
17 |
+
self.convolutions.append(
|
18 |
+
nn.Sequential(
|
19 |
+
convolutional_module(tacotron_hyperparams['n_mel_channels'],
|
20 |
+
tacotron_hyperparams['postnet_embedding_dim'],
|
21 |
+
kernel_size=tacotron_hyperparams['postnet_kernel_size'], stride=1,
|
22 |
+
padding=int((tacotron_hyperparams['postnet_kernel_size'] - 1) / 2),
|
23 |
+
dilation=1, w_init_gain='tanh'),
|
24 |
+
nn.BatchNorm1d(tacotron_hyperparams['postnet_embedding_dim']))
|
25 |
+
)
|
26 |
+
|
27 |
+
for i in range(1, tacotron_hyperparams['postnet_n_convolutions'] - 1):
|
28 |
+
self.convolutions.append(
|
29 |
+
nn.Sequential(
|
30 |
+
convolutional_module(tacotron_hyperparams['postnet_embedding_dim'],
|
31 |
+
tacotron_hyperparams['postnet_embedding_dim'],
|
32 |
+
kernel_size=tacotron_hyperparams['postnet_kernel_size'], stride=1,
|
33 |
+
padding=int((tacotron_hyperparams['postnet_kernel_size'] - 1) / 2),
|
34 |
+
dilation=1, w_init_gain='tanh'),
|
35 |
+
nn.BatchNorm1d(tacotron_hyperparams['postnet_embedding_dim']))
|
36 |
+
)
|
37 |
+
|
38 |
+
self.convolutions.append(
|
39 |
+
nn.Sequential(
|
40 |
+
convolutional_module(tacotron_hyperparams['postnet_embedding_dim'],
|
41 |
+
tacotron_hyperparams['n_mel_channels'],
|
42 |
+
kernel_size=tacotron_hyperparams['postnet_kernel_size'], stride=1,
|
43 |
+
padding=int((tacotron_hyperparams['postnet_kernel_size'] - 1) / 2),
|
44 |
+
dilation=1, w_init_gain='linear'),
|
45 |
+
nn.BatchNorm1d(tacotron_hyperparams['n_mel_channels']))
|
46 |
+
)
|
47 |
+
|
48 |
+
def forward(self, x):
|
49 |
+
for i in range(len(self.convolutions) - 1):
|
50 |
+
x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
|
51 |
+
x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
|
52 |
+
return x
|
Tacotron2.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from math import sqrt
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
|
6 |
+
from Encoder import Encoder
|
7 |
+
from Decoder import Decoder
|
8 |
+
from Postnet import Postnet
|
9 |
+
from GST import GST
|
10 |
+
|
11 |
+
from utils import to_gpu, get_mask_from_lengths
|
12 |
+
from fp16_optimizer import fp32_to_fp16, fp16_to_fp32
|
13 |
+
|
14 |
+
|
15 |
+
class tacotron_2(nn.Module):
|
16 |
+
def __init__(self, tacotron_hyperparams):
|
17 |
+
super(tacotron_2, self).__init__()
|
18 |
+
self.mask_padding = tacotron_hyperparams['mask_padding']
|
19 |
+
self.fp16_run = tacotron_hyperparams['fp16_run']
|
20 |
+
self.n_mel_channels = tacotron_hyperparams['n_mel_channels']
|
21 |
+
self.n_frames_per_step = tacotron_hyperparams['number_frames_step']
|
22 |
+
self.embedding = nn.Embedding(
|
23 |
+
tacotron_hyperparams['n_symbols'], tacotron_hyperparams['symbols_embedding_length'])
|
24 |
+
# CHECK THIS OUT!!!
|
25 |
+
std = sqrt(2.0 / (tacotron_hyperparams['n_symbols'] + tacotron_hyperparams['symbols_embedding_length']))
|
26 |
+
val = sqrt(3.0) * std
|
27 |
+
self.embedding.weight.data.uniform_(-val, val)
|
28 |
+
self.encoder = Encoder(tacotron_hyperparams)
|
29 |
+
self.decoder = Decoder(tacotron_hyperparams)
|
30 |
+
self.postnet = Postnet(tacotron_hyperparams)
|
31 |
+
self.gst = GST(tacotron_hyperparams)
|
32 |
+
|
33 |
+
def parse_batch(self, batch):
|
34 |
+
# GST I add the new tensor from prosody features to train GST tokens:
|
35 |
+
text_padded, input_lengths, mel_padded, gate_padded, output_lengths, prosody_padded = batch
|
36 |
+
text_padded = to_gpu(text_padded).long()
|
37 |
+
max_len = int(torch.max(input_lengths.data).item()) # With item() you get the pure value (not in a tensor)
|
38 |
+
input_lengths = to_gpu(input_lengths).long()
|
39 |
+
mel_padded = to_gpu(mel_padded).float()
|
40 |
+
gate_padded = to_gpu(gate_padded).float()
|
41 |
+
output_lengths = to_gpu(output_lengths).long()
|
42 |
+
prosody_padded = to_gpu(prosody_padded).float()
|
43 |
+
|
44 |
+
return (
|
45 |
+
(text_padded, input_lengths, mel_padded, max_len, output_lengths, prosody_padded),
|
46 |
+
(mel_padded, gate_padded))
|
47 |
+
|
48 |
+
def parse_input(self, inputs):
|
49 |
+
inputs = fp32_to_fp16(inputs) if self.fp16_run else inputs
|
50 |
+
return inputs
|
51 |
+
|
52 |
+
def parse_output(self, outputs, output_lengths=None):
|
53 |
+
if self.mask_padding and output_lengths is not None:
|
54 |
+
mask = ~get_mask_from_lengths(output_lengths)
|
55 |
+
mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
|
56 |
+
mask = mask.permute(1, 0, 2)
|
57 |
+
|
58 |
+
outputs[0].data.masked_fill_(mask, 0.0)
|
59 |
+
outputs[1].data.masked_fill_(mask, 0.0)
|
60 |
+
outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies
|
61 |
+
|
62 |
+
outputs = fp16_to_fp32(outputs) if self.fp16_run else outputs
|
63 |
+
|
64 |
+
return outputs
|
65 |
+
|
66 |
+
def forward(self, inputs):
|
67 |
+
inputs, input_lengths, targets, max_len, output_lengths, gst_prosody_padded = self.parse_input(inputs)
|
68 |
+
input_lengths, output_lengths = input_lengths.data, output_lengths.data
|
69 |
+
|
70 |
+
embedded_inputs = self.embedding(inputs).transpose(1, 2)
|
71 |
+
|
72 |
+
encoder_outputs = self.encoder(embedded_inputs, input_lengths)
|
73 |
+
|
74 |
+
# GST style embedding plus embedded_inputs before entering the decoder
|
75 |
+
# bin_locations = gst_prosody_padded[:, 0, :]
|
76 |
+
# pitch_intensities = gst_prosody_padded[:, 1:, :]
|
77 |
+
# bin_locations = bin_locations.unsqueeze(2)
|
78 |
+
gst_style_embedding, gst_scores = self.gst(gst_prosody_padded, output_lengths) # [N, 512]
|
79 |
+
gst_style_embedding = gst_style_embedding.expand_as(encoder_outputs)
|
80 |
+
|
81 |
+
encoder_outputs = encoder_outputs + gst_style_embedding
|
82 |
+
|
83 |
+
mel_outputs, gate_outputs, alignments = self.decoder(
|
84 |
+
encoder_outputs, targets, memory_lengths=input_lengths)
|
85 |
+
mel_outputs_postnet = self.postnet(mel_outputs)
|
86 |
+
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
|
87 |
+
|
88 |
+
return self.parse_output(
|
89 |
+
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments, gst_scores],
|
90 |
+
output_lengths)
|
91 |
+
|
92 |
+
def inference(self, inputs, gst_scores): # gst_scores must be a torch tensor
|
93 |
+
inputs = self.parse_input(inputs)
|
94 |
+
embedded_inputs = self.embedding(inputs).transpose(1, 2)
|
95 |
+
encoder_outputs = self.encoder.inference(embedded_inputs)
|
96 |
+
|
97 |
+
# GST inference:
|
98 |
+
gst_style_embedding = self.gst.inference(gst_scores)
|
99 |
+
gst_style_embedding = gst_style_embedding.expand_as(encoder_outputs)
|
100 |
+
|
101 |
+
encoder_outputs = encoder_outputs + gst_style_embedding
|
102 |
+
|
103 |
+
mel_outputs, gate_outputs, alignments = self.decoder.inference(
|
104 |
+
encoder_outputs)
|
105 |
+
|
106 |
+
mel_outputs_postnet = self.postnet(mel_outputs)
|
107 |
+
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
|
108 |
+
|
109 |
+
outputs = self.parse_output(
|
110 |
+
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
|
111 |
+
|
112 |
+
return outputs
|
audio_processing.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
from scipy.signal import get_window
|
4 |
+
import librosa.util as librosa_util
|
5 |
+
|
6 |
+
|
7 |
+
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
|
8 |
+
n_fft=800, dtype=np.float32, norm=None):
|
9 |
+
"""
|
10 |
+
# from librosa 0.6
|
11 |
+
Compute the sum-square envelope of a window function at a given hop length.
|
12 |
+
|
13 |
+
This is used to estimate modulation effects induced by windowing
|
14 |
+
observations in short-time fourier transforms.
|
15 |
+
|
16 |
+
Parameters
|
17 |
+
----------
|
18 |
+
window : string, tuple, number, callable, or list-like
|
19 |
+
Window specification, as in `get_window`
|
20 |
+
|
21 |
+
n_frames : int > 0
|
22 |
+
The number of analysis frames
|
23 |
+
|
24 |
+
hop_length : int > 0
|
25 |
+
The number of samples to advance between frames
|
26 |
+
|
27 |
+
win_length : [optional]
|
28 |
+
The length of the window function. By default, this matches `n_fft`.
|
29 |
+
|
30 |
+
n_fft : int > 0
|
31 |
+
The length of each analysis frame.
|
32 |
+
|
33 |
+
dtype : np.dtype
|
34 |
+
The data type of the output
|
35 |
+
|
36 |
+
Returns
|
37 |
+
-------
|
38 |
+
wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
|
39 |
+
The sum-squared envelope of the window function
|
40 |
+
"""
|
41 |
+
if win_length is None:
|
42 |
+
win_length = n_fft
|
43 |
+
|
44 |
+
n = n_fft + hop_length * (n_frames - 1)
|
45 |
+
x = np.zeros(n, dtype=dtype)
|
46 |
+
|
47 |
+
# Compute the squared window at the desired length
|
48 |
+
win_sq = get_window(window, win_length, fftbins=True)
|
49 |
+
win_sq = librosa_util.normalize(win_sq, norm=norm)**2
|
50 |
+
win_sq = librosa_util.pad_center(win_sq, n_fft)
|
51 |
+
|
52 |
+
# Fill the envelope
|
53 |
+
for i in range(n_frames):
|
54 |
+
sample = i * hop_length
|
55 |
+
x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
|
56 |
+
return x
|
57 |
+
|
58 |
+
|
59 |
+
def griffin_lim(magnitudes, stft_fn, n_iters=30):
|
60 |
+
"""
|
61 |
+
PARAMS
|
62 |
+
------
|
63 |
+
magnitudes: spectrogram magnitudes
|
64 |
+
stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
|
65 |
+
"""
|
66 |
+
|
67 |
+
angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
|
68 |
+
angles = angles.astype(np.float32)
|
69 |
+
angles = torch.autograd.Variable(torch.from_numpy(angles))
|
70 |
+
signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
|
71 |
+
|
72 |
+
for i in range(n_iters):
|
73 |
+
_, angles = stft_fn.transform(signal)
|
74 |
+
signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
|
75 |
+
return signal
|
76 |
+
|
77 |
+
|
78 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-5):
|
79 |
+
"""
|
80 |
+
PARAMS
|
81 |
+
------
|
82 |
+
C: compression factor
|
83 |
+
"""
|
84 |
+
return torch.log(torch.clamp(x, min=clip_val) * C)
|
85 |
+
|
86 |
+
|
87 |
+
def dynamic_range_decompression(x, C=1):
|
88 |
+
"""
|
89 |
+
PARAMS
|
90 |
+
------
|
91 |
+
C: compression factor used to compress
|
92 |
+
"""
|
93 |
+
return torch.exp(x) / C
|
hyper_parameters.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from text import symbols

# Python dictionary holding all hyper-parameters

tacotron_params = {'filter_length': 1024,  # audio parameters:
                   'hop_length': 256,
                   'win_length': 1024,
                   'n_mel_channels': 80,
                   'mel_fmin': 0.0,
                   'mel_fmax': 8000.0,
                   'sampling_rate': 22050,
                   'max_wav_value': 32768.0,
                   'clipping_value': 1e-5,
                   'C': 1,
                   # dataset parameters:
                   'load_mel_from_disk': False,
                   'sort_by_length': False,
                   'text_cleaners': ['english_cleaners'],
                   # embedding parameters:
                   'symbols_embedding_length': 512,
                   'n_symbols': len(symbols),
                   # encoder parameters:
                   'encoder_embedding_dim': 512,
                   'encoder_convs': 3,
                   'conv_kernel_size': 5,
                   'conv_stride': 1,
                   'conv_dilation': 1,
                   'w_init_gain': 'relu',
                   # decoder parameters:
                   'number_frames_step': 1,
                   'decoder_rnn_dim': 1024,
                   'prenet_dim': 256,
                   'max_decoder_steps': 1000,
                   'gate_threshold': 0.5,  # needs to be reviewed
                   'p_attention_dropout': 0.1,
                   'p_decoder_dropout': 0.1,
                   # attention parameters:
                   'attention_rnn_dim': 1024,
                   'attention_dim': 128,
                   # location features parameters:
                   'attention_location_n_filters': 32,
                   'attention_location_kernel_size': 31,
                   # postnet parameters:
                   'postnet_embedding_dim': 512,
                   'postnet_kernel_size': 5,
                   'postnet_n_convolutions': 5,
                   # GST parameters:
                   'E': 512,
                   'token_num': 3,
                   'num_heads': 1,
                   'seq_ref_enc_filter_size': [3, 7, 11],  # phoneme, word/silence and utterance levels respectively
                   'ref_enc_out_channels': [8, 16, 16],
                   # optimization parameters:
                   'use_saved_learning_rate': True,
                   'batch_size': 32,  # previously 64; must be larger than, and an integer multiple of, the number of GPUs
                   'learning_rate': 1e-3,
                   'weight_decay': 1e-6,
                   'grad_clip_thresh': 1.0,
                   'mask_padding': False,
                   # experiment parameters:
                   'epochs': 300,  # 160, 500
                   'iters_per_checkpoint': 1500,  # 1000. How many iterations before validating
                   'seed': 1234,
                   'dynamic_loss_scaling': True,  # CHECK IT OUT!
                   'distributed_run': False,
                   'dist_backend': 'nccl',
                   'dist_url': "/home/alex/PyTorch_TACOTRON_2/pycharm-tacotron2",  # CHECK IT OUT!
                   'cudnn_enabled': True,
                   'cudnn_benchmark': False,
                   'fp16_run': False}
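Purely illustrative (not one of the uploaded files): the training and inference scripts would read values out of this dictionary roughly as follows, for example to build the mel-extraction front end from nn_layers.py.

from hyper_parameters import tacotron_params as hp
from nn_layers import TacotronSTFT

# build the STFT/mel front end from the audio hyper-parameters above
stft = TacotronSTFT(filter_length=hp['filter_length'],
                    hop_length=hp['hop_length'],
                    win_length=hp['win_length'],
                    n_mel_channels=hp['n_mel_channels'],
                    sampling_rate=hp['sampling_rate'],
                    mel_fmin=hp['mel_fmin'],
                    mel_fmax=hp['mel_fmax'])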
logger.py
ADDED
@@ -0,0 +1,47 @@
import random
import torch
from tensorboardX import SummaryWriter
from plotting_utils import plot_alignment_to_numpy, plot_gst_scores_to_numpy, plot_spectrogram_to_numpy
from plotting_utils import plot_gate_outputs_to_numpy


class Tacotron2Logger(SummaryWriter):
    def __init__(self, logdir):
        super(Tacotron2Logger, self).__init__(logdir)

    def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
                     iteration):
        self.add_scalar("training.loss", reduced_loss, iteration)
        self.add_scalar("grad.norm", grad_norm, iteration)
        self.add_scalar("learning.rate", learning_rate, iteration)
        self.add_scalar("duration", duration, iteration)

    def log_validation(self, reduced_loss, model, y, y_pred, gst_scores, iteration):
        self.add_scalar("validation.loss", reduced_loss, iteration)
        _, mel_outputs, gate_outputs, alignments, _ = y_pred
        mel_targets, gate_targets = y

        # plot distribution of parameters
        for tag, value in model.named_parameters():
            tag = tag.replace('.', '/')
            self.add_histogram(tag, value.data.cpu().numpy(), iteration)

        # plot alignment, mel target and predicted, gate target and predicted
        idx = random.randint(0, alignments.size(0) - 1)

        align_idx = alignments[idx].data.cpu().numpy().T
        gst_scores = gst_scores.data.cpu().numpy().T
        # print("Validation GST scores before plotting to tensorboard: {}".format(gst_scores.shape))
        meltarg_idx = mel_targets[idx].data.cpu().numpy()
        melout_idx = mel_outputs[idx].data.cpu().numpy()

        self.add_image("alignment", plot_alignment_to_numpy(align_idx), iteration)
        self.add_image("gst_scores", plot_gst_scores_to_numpy(gst_scores), iteration)
        self.add_image("mel_target", plot_spectrogram_to_numpy(meltarg_idx), iteration)
        self.add_image("mel_predicted", plot_spectrogram_to_numpy(melout_idx), iteration)
        self.add_image(
            "gate",
            plot_gate_outputs_to_numpy(
                gate_targets[idx].data.cpu().numpy(),
                # torch.sigmoid replaces the deprecated F.sigmoid
                torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
            iteration)
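A hypothetical driver for the logger (the real training loop lives elsewhere in the project); the log directory and scalar values are placeholders just to show where the training hook is called each iteration.

from logger import Tacotron2Logger

logger = Tacotron2Logger(logdir='runs/tacotron2_gst')   # hypothetical log directory
for iteration in range(3):
    reduced_loss, grad_norm, lr, duration = 0.5, 1.2, 1e-3, 0.8  # placeholder scalars
    logger.log_training(reduced_loss, grad_norm, lr, duration, iteration)
logger.close()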
nn_layers.py
ADDED
@@ -0,0 +1,105 @@
import torch
from torch import nn
from librosa.filters import mel as librosa_mel_fn
from stft import STFT

clip_val = 1e-5
C = 1


class convolutional_module(nn.Module):
    """This class defines a 1d convolutional layer and its initialization for the system we are
    replicating"""
    def __init__(self, in_ch, out_ch, kernel_size=1, stride=1, padding=None, dilation=1, bias=True,
                 w_init_gain='linear'):
        # in PyTorch you define your models as subclasses of torch.nn.Module
        super(convolutional_module, self).__init__()
        if padding is None:
            assert(kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        # initialize the convolutional layer, which is an instance of Conv1d.
        # torch.nn.Conv1d internally calls torch.nn.functional.conv1d, which accepts an
        # input of shape (minibatch x in_channels x input_w) and a weight of shape
        # (out_channels x (in_channels/groups) x kernel_w). We do not split into groups,
        # so the input shape will be (48 x 512 x 189) and the weights (512 x 512 x 5).
        self.conv_layer = torch.nn.Conv1d(in_ch, out_ch, kernel_size=kernel_size, stride=stride,
                                          padding=padding, dilation=dilation, bias=bias)

        """Useful information on Xavier initialization in:
        https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/"""
        torch.nn.init.xavier_uniform_(self.conv_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        conv_output = self.conv_layer(x)
        return conv_output


class linear_module(torch.nn.Module):
    """This class defines a linear layer and its initialization method for the system we are
    replicating. This implements a linear transformation: y = xA^T + b"""
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(linear_module, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class location_layer(nn.Module):
    def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
        super(location_layer, self).__init__()
        padding = int((attention_kernel_size - 1) / 2)
        """We are being very restrictive by not training a bias."""
        """in_channels = 2 because the input stacks the previous and cumulative attention
        weights (one vector per encoded position from the previous alignment)."""
        self.location_conv = convolutional_module(2, attention_n_filters, kernel_size=attention_kernel_size,
                                                  padding=padding, bias=False, stride=1, dilation=1)
        self.location_dense = linear_module(attention_n_filters, attention_dim, bias=False,
                                            w_init_gain='tanh')

    def forward(self, attention_weights_cat):
        processed_attention = self.location_conv(attention_weights_cat)
        processed_attention = processed_attention.transpose(1, 2)
        processed_attention = self.location_dense(processed_attention)
        return processed_attention


class TacotronSTFT(nn.Module):
    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(
            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_de_normalize(self, magnitudes):
        output = torch.exp(magnitudes) / C
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert(torch.min(y.data) >= -1)
        assert(torch.max(y.data) <= 1)

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = torch.log(torch.clamp(mel_output, min=clip_val) * C)
        return mel_output
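A sketch, assuming a 22.05 kHz mono wav at a hypothetical path, of how TacotronSTFT would turn a waveform into the (B, 80, T) log-mel target consumed by the decoder; the 32768.0 normalizer matches max_wav_value in hyper_parameters.py.

from nn_layers import TacotronSTFT
from utils import load_wav_to_torch

stft = TacotronSTFT()                                          # defaults mirror hyper_parameters.py
audio = load_wav_to_torch('example.wav', sr=22050)             # hypothetical file
audio_norm = (audio / 32768.0).clamp(-1.0, 1.0).unsqueeze(0)   # (1, T) in [-1, 1]
mel = stft.mel_spectrogram(audio_norm)                         # (1, 80, n_frames) log-mel spectrogram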
stft.py
ADDED
@@ -0,0 +1,140 @@
"""
BSD 3-Clause License

Copyright (c) 2017, Prem Seetharaman
All rights reserved.

* Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from scipy.signal import get_window
from librosa.util import pad_center, tiny
from audio_processing import window_sumsquare


class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
    def __init__(self, filter_length=800, hop_length=200, win_length=800,
                 window='hann'):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
                                   np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert(filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

    def transform(self, input_data):
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(
            input_data.unsqueeze(1),
            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
            mode='reflect')
        input_data = input_data.squeeze(1)

        forward_transform = F.conv1d(
            input_data,
            Variable(self.forward_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.autograd.Variable(
            torch.atan2(imag_part.data, real_part.data))

        return magnitude, phase

    def inverse(self, magnitude, phase):
        recombine_magnitude_phase = torch.cat(
            [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window, magnitude.size(-1), hop_length=self.hop_length,
                win_length=self.win_length, n_fft=self.filter_length,
                dtype=np.float32)
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0])
            window_sum = torch.autograd.Variable(
                torch.from_numpy(window_sum), requires_grad=False)
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
        inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]

        return inverse_transform

    def forward(self, input_data):
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
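Round-trip sanity check, purely illustrative (not part of the uploaded files): transform a short sine wave and resynthesize it; the reconstruction should closely match the input except near the edges, where the analysis windows are trimmed.

import math
import torch
from stft import STFT

stft_fn = STFT(filter_length=1024, hop_length=256, win_length=1024)
t = torch.arange(22050, dtype=torch.float32) / 22050.0
signal = torch.sin(2 * math.pi * 440.0 * t).unsqueeze(0)    # (1, num_samples), a 440 Hz tone
magnitude, phase = stft_fn.transform(signal)
reconstruction = stft_fn.inverse(magnitude, phase)           # approximately the original signal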
utils.py
ADDED
@@ -0,0 +1,39 @@
import numpy as np
from scipy.io.wavfile import read
import torch


def get_mask_from_lengths(lengths):
    max_len = torch.max(lengths).item()
    ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
    mask = (ids < lengths.unsqueeze(1)).byte()
    # mask = (ids < lengths.unsqueeze(1).cuda()).cpu()
    # mask = mask.byte()
    return mask


# probably I won't use it from here
def load_wav_to_torch(full_path, sr):
    sampling_rate, data = read(full_path)
    assert sr == sampling_rate, "{} SR doesn't match {} on path {}".format(
        sr, sampling_rate, full_path)
    return torch.FloatTensor(data.astype(np.float32))


# probably I won't use it from here
def load_filepaths_and_text(filename, sort_by_length, split="|"):
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]

    if sort_by_length:
        filepaths_and_text.sort(key=lambda x: len(x[1]))

    return filepaths_and_text


def to_gpu(x):
    x = x.contiguous()

    if torch.cuda.is_available():
        x = x.cuda(non_blocking=True)  # non_blocking allows asynchronous host-to-device copies
    return torch.autograd.Variable(x)
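A small illustration of get_mask_from_lengths; note that as written it allocates the index tensor with torch.cuda.LongTensor, so this sketch assumes a CUDA device is available.

import torch
from utils import get_mask_from_lengths

lengths = torch.cuda.LongTensor([3, 5, 2])   # valid lengths of three padded sequences
mask = get_mask_from_lengths(lengths)        # (3, 5) byte mask; 1 = real frame, 0 = padding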