AlexK-PL committed
Commit
9329fc1
1 Parent(s): dde58d5

Upload model files

Files changed (11)
  1. Decoder.py +379 -0
  2. Encoder.py +73 -0
  3. GST.py +368 -0
  4. Postnet.py +52 -0
  5. Tacotron2.py +112 -0
  6. audio_processing.py +93 -0
  7. hyper_parameters.py +70 -0
  8. logger.py +47 -0
  9. nn_layers.py +105 -0
  10. stft.py +140 -0
  11. utils.py +39 -0
Decoder.py ADDED
@@ -0,0 +1,379 @@
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from nn_layers import linear_module, location_layer
from utils import get_mask_from_lengths


class AttentionNet(nn.Module):
    # 1024, 512, 128, 32, 31
    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size):
        super(AttentionNet, self).__init__()
        self.query_layer = linear_module(attention_rnn_dim, attention_dim,
                                         bias=False, w_init_gain='tanh')
        # Projecting inputs into a 128-D hidden representation
        self.memory_layer = linear_module(embedding_dim, attention_dim, bias=False,
                                          w_init_gain='tanh')
        # Projecting into a 1-D scalar value
        self.v = linear_module(attention_dim, 1, bias=False)
        # Convolutional layers to obtain location features and project them into a 128-D hidden representation
        self.location_layer = location_layer(attention_location_n_filters,
                                             attention_location_kernel_size,
                                             attention_dim)
        self.score_mask_value = -float("inf")

    def get_alignment_energies(self, query, processed_memory,
                               attention_weights_cat):
        """
        PARAMS
        ------
        query: decoder output (batch, n_mel_channels * n_frames_per_step)
        processed_memory: processed encoder outputs (B, T_in, attention_dim)
        attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)

        RETURNS
        -------
        alignment (batch, max_time)
        """

        processed_query = self.query_layer(query.unsqueeze(1))
        processed_attention_weights = self.location_layer(attention_weights_cat)
        energies = self.v(torch.tanh(
            processed_query + processed_attention_weights + processed_memory))

        energies = energies.squeeze(-1)  # eliminates the third dimension of the tensor, which is 1
        return energies

    def forward(self, attention_hidden_state, memory, processed_memory,
                attention_weights_cat, mask):
        """
        PARAMS
        ------
        attention_hidden_state: attention rnn last output
        memory: encoder outputs
        processed_memory: processed encoder outputs
        attention_weights_cat: previous and cumulative attention weights
        mask: binary mask for padded data
        """
        alignment = self.get_alignment_energies(
            attention_hidden_state, processed_memory, attention_weights_cat)

        if mask is not None:
            alignment.data.masked_fill_(mask, self.score_mask_value)

        attention_weights = F.softmax(alignment, dim=1)
        # attention_weights is [B x NUM_ENC_STEPS], so with unsqueeze(1) it becomes [B x 1 x NUM_ENC_STEPS],
        # and memory is [B x NUM_ENC_STEPS x 512]
        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
        attention_context = attention_context.squeeze(1)

        return attention_context, attention_weights


class Prenet(nn.Module):
    def __init__(self, in_dim, sizes):
        super(Prenet, self).__init__()
        in_sizes = [in_dim] + sizes[:-1]  # all list values but the last one: the result is [in_dim]
        # concatenated with the layer sizes (i.e. [80, 256])
        self.layers = nn.ModuleList(
            [linear_module(in_size, out_size, bias=False)
             for (in_size, out_size) in zip(in_sizes, sizes)])

    def forward(self, x):
        for linear in self.layers:
            x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
        return x


class Decoder(nn.Module):
    def __init__(self, tacotron_hyperparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = tacotron_hyperparams['n_mel_channels']
        self.n_frames_per_step = tacotron_hyperparams['number_frames_step']
        self.encoder_embedding_dim = tacotron_hyperparams['encoder_embedding_dim']
        self.attention_rnn_dim = tacotron_hyperparams['attention_rnn_dim']  # 1024
        self.decoder_rnn_dim = tacotron_hyperparams['decoder_rnn_dim']  # 1024
        self.prenet_dim = tacotron_hyperparams['prenet_dim']
        self.max_decoder_steps = tacotron_hyperparams['max_decoder_steps']
        # The threshold that decides whether to stop decoding.
        self.gate_threshold = tacotron_hyperparams['gate_threshold']
        self.p_attention_dropout = tacotron_hyperparams['p_attention_dropout']
        self.p_decoder_dropout = tacotron_hyperparams['p_decoder_dropout']
        # Define the prenet: there is only one frame per step, so the input dim is the number of mel channels.
        # There are two fully connected layers:
        self.prenet = Prenet(
            tacotron_hyperparams['n_mel_channels'] * tacotron_hyperparams['number_frames_step'],
            [tacotron_hyperparams['prenet_dim'], tacotron_hyperparams['prenet_dim']])
        # input_size: 256 + 512 (prenet output + attention context) / hidden_size: 1024
        self.attention_rnn = nn.LSTMCell(
            tacotron_hyperparams['prenet_dim'] + tacotron_hyperparams['encoder_embedding_dim'],
            tacotron_hyperparams['attention_rnn_dim'])
        # returns attention_weights and attention_context. Computes the alignments.
        self.attention_layer = AttentionNet(
            tacotron_hyperparams['attention_rnn_dim'], tacotron_hyperparams['encoder_embedding_dim'],
            tacotron_hyperparams['attention_dim'], tacotron_hyperparams['attention_location_n_filters'],
            tacotron_hyperparams['attention_location_kernel_size'])
        # input_size: 1024 + 512 (attention RNN output + attention context), hidden_size: 1024
        self.decoder_rnn = nn.LSTMCell(
            tacotron_hyperparams['attention_rnn_dim'] + tacotron_hyperparams['encoder_embedding_dim'],
            tacotron_hyperparams['decoder_rnn_dim'], 1)
        # (LSTM output) 1024 + (attention_context) 512, out_dim: number of mel channels. Last linear projection that
        # generates an output decoder spectral frame.
        self.linear_projection = linear_module(
            tacotron_hyperparams['decoder_rnn_dim'] + tacotron_hyperparams['encoder_embedding_dim'],
            tacotron_hyperparams['n_mel_channels']*tacotron_hyperparams['number_frames_step'])
        # decision whether to continue decoding.
        self.gate_layer = linear_module(
            tacotron_hyperparams['decoder_rnn_dim'] + tacotron_hyperparams['encoder_embedding_dim'], 1,
            bias=True, w_init_gain='sigmoid')

    def get_go_frame(self, memory):
        """ Gets all-zeros frames to use as the first decoder input
        PARAMS
        ------
        memory: decoder outputs

        RETURNS
        -------
        decoder_input: all zeros frames
        """
        B = memory.size(0)
        decoder_input = Variable(memory.data.new(
            B, self.n_mel_channels * self.n_frames_per_step).zero_())
        return decoder_input

    def initialize_decoder_states(self, memory, mask):
        """ Initializes attention rnn states, decoder rnn states, attention
        weights, attention cumulative weights, attention context, stores memory
        and stores processed memory
        PARAMS
        ------
        memory: Encoder outputs
        mask: Mask for padded data if training, expects None for inference
        """
        B = memory.size(0)
        MAX_TIME = memory.size(1)

        self.attention_hidden = Variable(memory.data.new(
            B, self.attention_rnn_dim).zero_())
        self.attention_cell = Variable(memory.data.new(
            B, self.attention_rnn_dim).zero_())

        self.decoder_hidden = Variable(memory.data.new(
            B, self.decoder_rnn_dim).zero_())
        self.decoder_cell = Variable(memory.data.new(
            B, self.decoder_rnn_dim).zero_())

        self.attention_weights = Variable(memory.data.new(
            B, MAX_TIME).zero_())
        self.attention_weights_cum = Variable(memory.data.new(
            B, MAX_TIME).zero_())
        self.attention_context = Variable(memory.data.new(
            B, self.encoder_embedding_dim).zero_())

        self.memory = memory
        self.processed_memory = self.attention_layer.memory_layer(memory)
        self.mask = mask

    def parse_decoder_inputs(self, decoder_inputs):
        """ Prepares decoder inputs, i.e. mel outputs
        PARAMS
        ------
        decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs

        RETURNS
        -------
        inputs: processed decoder inputs

        """
        # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
        decoder_inputs = decoder_inputs.transpose(1, 2)
        # reshape decoder inputs in case we want to work with more than 1 frame per step (chunks). Otherwise, this
        # next line does nothing.
        decoder_inputs = decoder_inputs.view(
            decoder_inputs.size(0),
            int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
        # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
        decoder_inputs = decoder_inputs.transpose(0, 1)
        return decoder_inputs

    def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
        """ Prepares decoder outputs for output
        PARAMS
        ------
        mel_outputs:
        gate_outputs: gate output energies
        alignments:

        RETURNS
        -------
        mel_outputs:
        gate_outputs: gate output energies
        alignments:
        """
        # (T_out, B) -> (B, T_out)
        alignments = torch.stack(alignments).transpose(0, 1)
        # (T_out, B) -> (B, T_out)
        gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
        gate_outputs = gate_outputs.contiguous()
        # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
        mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
        # decouple frames per step
        mel_outputs = mel_outputs.view(
            mel_outputs.size(0), -1, self.n_mel_channels)
        # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
        mel_outputs = mel_outputs.transpose(1, 2)

        return mel_outputs, gate_outputs, alignments

    def decode(self, decoder_input):
        """ Decoder step using stored states, attention and memory
        PARAMS
        ------
        decoder_input: previous mel output

        RETURNS
        -------
        mel_output:
        gate_output: gate output energies
        attention_weights:
        """
        # concatenates [B x 256] and [B x 512]; all dimensions match except the last one (torch.cat -1):
        # concatenate the i-th pre-net output together with the i-th attention context
        cell_input = torch.cat((decoder_input, self.attention_context), -1)
        # the previous input feeds the following LSTM cell, whose hidden and cell states are initialized with zeros.
        # compute the (i+1)th attention hidden state based on the i-th pre-net output and attention context.
        self.attention_hidden, self.attention_cell = self.attention_rnn(
            cell_input, (self.attention_hidden, self.attention_cell))
        self.attention_hidden = F.dropout(self.attention_hidden, self.p_attention_dropout, self.training)
        self.attention_cell = F.dropout(self.attention_cell, self.p_attention_dropout, self.training)
        # concatenate the i-th state attention weights together with those accumulated from previous states to
        # compute the (i+1)th state
        attention_weights_cat = torch.cat(
            (self.attention_weights.unsqueeze(1),
             self.attention_weights_cum.unsqueeze(1)), dim=1)
        # compute the (i+1)th attention context and the (i+1)th attention weights based on the (i+1)th attention
        # hidden state and the i-th and cumulative weights
        self.attention_context, self.attention_weights = self.attention_layer(
            self.attention_hidden, self.memory, self.processed_memory,
            attention_weights_cat, self.mask)

        # accumulate attention weights, adding the (i+1)th to compute the (i+2)th state
        self.attention_weights_cum += self.attention_weights

        decoder_input = torch.cat((self.attention_hidden, self.attention_context), -1)
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(decoder_input,
                                                                  (self.decoder_hidden, self.decoder_cell))
        self.decoder_hidden = F.dropout(self.decoder_hidden, self.p_decoder_dropout, self.training)
        self.decoder_cell = F.dropout(self.decoder_cell, self.p_decoder_dropout, self.training)

        decoder_hidden_attention_context = torch.cat((self.decoder_hidden, self.attention_context), dim=1)
        decoder_output = self.linear_projection(decoder_hidden_attention_context)

        gate_prediction = self.gate_layer(decoder_hidden_attention_context)

        return decoder_output, gate_prediction, self.attention_weights

    """
    # the decoder_output from the i-th step passes through the pre-net to compute the new decoder hidden state and
    # attention context (i+1)th
    prenet_output = self.prenet(decoder_input)
    # the decoder_input now is the concatenation of the pre-net output and the new (i+1)th attention_context
    decoder_input = torch.cat((prenet_output, self.attention_context), -1)
    # another LSTM Cell to compute the decoder hidden (i+1)th state from the decoder_input
    self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
        decoder_input, (self.decoder_hidden, self.decoder_cell))

    # with the new attention_context we concatenate again with the new (i+1)th decoder_hidden state.
    decoder_hidden_attention_context = torch.cat(
        (self.decoder_hidden, self.attention_context), dim=1)
    # the (i+1)th output is a linear projection of the decoder hidden state with a weight matrix plus bias.
    decoder_output = self.linear_projection(
        decoder_hidden_attention_context)
    # check whether the (i+1)th state is the last of the sequence
    gate_prediction = self.gate_layer(decoder_hidden_attention_context)
    return decoder_output, gate_prediction, self.attention_weights"""

    def forward(self, memory, decoder_inputs, memory_lengths):
        """ Decoder forward pass for training
        PARAMS
        ------
        memory: Encoder outputs
        decoder_inputs: Decoder inputs for teacher forcing, i.e. mel-specs
        memory_lengths: Encoder output lengths for attention masking.

        RETURNS
        -------
        mel_outputs: mel outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """

        decoder_input = self.get_go_frame(memory).unsqueeze(0)
        decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
        decoder_inputs = self.prenet(decoder_inputs)

        self.initialize_decoder_states(
            memory, mask=~get_mask_from_lengths(memory_lengths))

        mel_outputs, gate_outputs, alignments = [], [], []

        while len(mel_outputs) < decoder_inputs.size(0) - 1:
            decoder_input = decoder_inputs[len(mel_outputs)]
            mel_output, gate_output, attention_weights = self.decode(
                decoder_input)
            # these are python lists, so += appends the new tensors
            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output.squeeze()]
            alignments += [attention_weights]
            # the frame index picks the reference mel frame that becomes the input of the next decoding
            # step: teacher forcing!
            # It takes each time step of the sequences of all mini-batch samples (i.e. [48, 80], since
            # decoder_inputs is parsed as [189, 48, 80]).

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments

    def inference(self, memory):
        """ Decoder inference
        PARAMS
        ------
        memory: Encoder outputs

        RETURNS
        -------
        mel_outputs: mel outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """
        decoder_input = self.get_go_frame(memory)

        self.initialize_decoder_states(memory, mask=None)

        mel_outputs, gate_outputs, alignments = [], [], []
        while True:
            decoder_input = self.prenet(decoder_input)
            mel_output, gate_output, alignment = self.decode(decoder_input)

            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output]
            alignments += [alignment]

            if torch.sigmoid(gate_output.data) > self.gate_threshold:
                break
            elif len(mel_outputs) == self.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments
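
Note: a minimal standalone sketch (torch only; the shapes are illustrative assumptions, not values from a real batch) of how AttentionNet turns alignment energies into a context vector — padded encoder steps are masked with -inf, softmax runs over time, and a batched matmul pools the memory:

import torch
import torch.nn.functional as F

B, T_in, enc_dim = 2, 6, 512                     # batch, encoder steps, encoder embedding dim
energies = torch.randn(B, T_in)                  # as returned by get_alignment_energies
memory = torch.randn(B, T_in, enc_dim)           # encoder outputs ("memory")
lengths = torch.tensor([6, 4])                   # second sample has 2 padded steps

# mask == True marks padded positions, mirroring mask=~get_mask_from_lengths(memory_lengths)
mask = torch.arange(T_in)[None, :] >= lengths[:, None]
energies = energies.masked_fill(mask, -float("inf"))

attention_weights = F.softmax(energies, dim=1)   # [B, T_in], rows sum to 1, zeros on padding
attention_context = torch.bmm(attention_weights.unsqueeze(1), memory).squeeze(1)  # [B, enc_dim]
print(attention_weights.sum(dim=1), attention_context.shape)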
Encoder.py ADDED
@@ -0,0 +1,73 @@
from torch import nn
from torch.nn import functional as F
from nn_layers import convolutional_module


class Encoder(nn.Module):
    """This is the encoder part of Tacotron 2. It includes a stack of three 1-d convolutional layers,
    each followed by batch normalization and a ReLU activation, and a bidirectional LSTM layer.
    This part encodes sequences of input characters."""
    def __init__(self, encoder_params):
        super(Encoder, self).__init__()
        # we set the dropout applied at each convolutional layer, as specified in the Tacotron 2 paper
        # self.dropout = nn.Dropout(0.5)

        # A stack of convolution layers. For this model, there are 3 conv1d layers. We initialize a python
        # list and run a loop as many times as the number of convolutional layers (three). In each
        # iteration we initialize an nn.Sequential container, which lets us define a block of neural network
        # modules. We need three equal nn sequences in a list. Then this list is properly registered using
        # the ModuleList class (it can act as an iterable, or be indexed).
        # To see how the convolution is computed:
        # https://pytorch.org/docs/stable/nn.html#conv1d

        stack_of_convolutions = []
        for _ in range(encoder_params['encoder_convs']):
            conv_layer = nn.Sequential(convolutional_module(encoder_params['symbols_embedding_length'],
                                                            encoder_params['symbols_embedding_length'],
                                                            kernel_size=encoder_params['conv_kernel_size'],
                                                            stride=encoder_params['conv_stride'],
                                                            padding=int((encoder_params['conv_kernel_size'] - 1) / 2),
                                                            dilation=encoder_params['conv_dilation'],
                                                            w_init_gain=encoder_params['w_init_gain']),
                                       nn.BatchNorm1d(encoder_params['symbols_embedding_length']))
            stack_of_convolutions.append(conv_layer)
        self.stack_conv = nn.ModuleList(stack_of_convolutions)

        # The last part of the encoder is the bi-directional LSTM layer. As described in the original Tacotron 2
        # paper, there is only one BiLSTM layer with 256 units for each direction.

        """Can I add the bidirectional LSTM layer together with the convolutional stack??? CHECK IT OUT!"""

        self.bi_lstm = nn.LSTM(encoder_params['symbols_embedding_length'],
                               int(encoder_params['symbols_embedding_length'] / 2), 1, batch_first=True,
                               bidirectional=True)

    def forward(self, input_sequences, input_lengths):
        for conv in self.stack_conv:
            input_sequences = F.dropout(F.relu(conv(input_sequences)), 0.5, self.training)

        input_sequences = input_sequences.transpose(1, 2)
        # After the convolution filters, is the original sequence length still the same? CHECK IT OUT
        input_lengths = input_lengths.cpu().numpy()
        # Returns a packed sequence object with variable-length sequences before passing through the BiLSTM layer
        input_sequences = nn.utils.rnn.pack_padded_sequence(input_sequences, input_lengths, batch_first=True)
        # nn.LSTM accepts packed variable-length sequence tensors. The output will also be a packed variable-length
        # sequence tensor. The output dimension is (seq_length, batch, num_directions*hidden_size), but
        # if batch_first is True, then (batch, seq_length, num_directions*hidden_size).
        self.bi_lstm.flatten_parameters()
        outputs, _ = self.bi_lstm(input_sequences)
        # Pads the tensor back to the regular format after packing
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)

        return outputs  # [N, Max_seq_length, E_length]

    def inference(self, x):
        for conv in self.stack_conv:
            x = F.dropout(F.relu(conv(x)), 0.5, self.training)

        x = x.transpose(1, 2)

        self.bi_lstm.flatten_parameters()
        outputs, _ = self.bi_lstm(x)

        return outputs
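
Note: a minimal standalone sketch (dimensions are illustrative assumptions) of the packing done in Encoder.forward — variable-length sequences are packed before the BiLSTM so the recurrence ignores padding, then padded back afterwards:

import torch
from torch import nn

emb = 512
bi_lstm = nn.LSTM(emb, emb // 2, 1, batch_first=True, bidirectional=True)

x = torch.randn(3, 10, emb)                  # [batch, max_len, emb], already batch_first
lengths = torch.tensor([10, 7, 5])           # sorted in decreasing order, as the data loader provides

packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
outputs, _ = bi_lstm(packed)
outputs, out_lengths = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
print(outputs.shape, out_lengths)            # [3, 10, 512] (256 per direction), lengths unchanged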
GST.py ADDED
@@ -0,0 +1,368 @@
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import numpy as np


class GST(nn.Module):

    def __init__(self, hyper_parameters):

        super().__init__()
        self.prosody_extractor = LogMelSpecReferenceEncoder()
        self.stl = MultiSTL(hyper_parameters=hyper_parameters)

    def forward(self, logmel_spec, logmel_lengths):
        prosody_features_embedded = self.prosody_extractor(logmel_spec, logmel_lengths)  # [N, 512]
        style_embed, gst_scores = self.stl(prosody_features_embedded)

        return style_embed, gst_scores

    def inference(self, scores):  # NEED TO DEFINE SCORES TENSOR DIMENSION!!
        style_embed_inference = self.stl.inference(scores=scores)

        return style_embed_inference


class PitchContourEncoder(nn.Module):
    """

    """
    def __init__(self, hyper_parameters):

        super().__init__()

        K = len(hyper_parameters['ref_enc_out_channels'])
        filters = [1] + hyper_parameters['ref_enc_out_channels']
        kernel_sizes = hyper_parameters['seq_ref_enc_filter_size']

        convs_2d = []

        for i in range(K):
            conv2d_init = nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
                                    kernel_size=(kernel_sizes[i], 3), stride=(1, 1),
                                    padding=(int((kernel_sizes[i] - 1) / 2), int((3 - 1) / 2)), bias=True)

            nn.init.xavier_uniform_(conv2d_init.weight, gain=torch.nn.init.calculate_gain('linear'))

            convs_2d.append(conv2d_init)

        self.convs2D = nn.ModuleList(convs_2d)

        self.bns2D = nn.ModuleList([nn.BatchNorm2d(num_features=hyper_parameters['ref_enc_out_channels'][i])
                                    for i in range(K)])

        # WEIGHT INITIALIZATION DEFAULT:
        self.prosody_bi_lstm = nn.LSTM(input_size=int(176), hidden_size=int(512/2), num_layers=1, batch_first=True,
                                       bidirectional=True)

    def forward(self, bin_locations):  # [N, BIN_SUBAND, LEN_MELSPEC] (BIN_SUBAND = 13)
        N = bin_locations.size(0)  # Number of samples
        # Changing tensor dimensions to have 1 input channel for the first conv2D layer:
        bin_locations = bin_locations.unsqueeze(1)
        bin_locations = bin_locations.transpose(2, 3)  # [N, 1, LEN_MELSPEC, BIN_SUBAND]
        """We implement ReLU gates at the output of the conv. layers. We could check it without."""
        # For pitch tracking:
        for conv2, bn2 in zip(self.convs2D, self.bns2D):
            bin_locations = conv2(bin_locations)
            bin_locations = bn2(bin_locations)
            bin_locations = F.dropout(F.relu(bin_locations), 0.5, self.training)  # [N, Cout, LEN_MELSPEC, BIN_SUBAND]

        # Resize:
        bin_locations = bin_locations.transpose(1, 2)  # [N, LEN_MELSPEC, Cout, BIN_SUBAND]
        T = bin_locations.size(1)
        bin_locations = bin_locations.contiguous().view(N, T, -1)  # [N, LEN_MELSPEC, Cout*BIN_SUBAND]

        # Encode sequences with a bidirectional LSTM layer:
        """In our case, we do not care about the specific length of each sequence: with the zero padding, the
        encoder should be able to encode the different lengths and see zeros once a sequence is over. That is why
        we do not pack the padded sequences before the LSTM layer."""
        _, (encoded_prosody, cell_state) = self.prosody_bi_lstm(bin_locations)

        encoded_prosody = encoded_prosody.transpose(0, 1)
        encoded_prosody = encoded_prosody.contiguous().view(N, -1)

        return encoded_prosody  # should be [N, 512]


# DENSE GST Reference Encoder:
class ProsodyEncoder(nn.Module):
    """
    This module runs two parallel convolution stacks, one 1-D and one 2-D. Afterwards, the outputs of both
    are concatenated and passed through a bidirectional LSTM layer.
    """
    def __init__(self, hyper_parameters):

        super().__init__()

        K = len(hyper_parameters['ref_enc_out_channels'])
        filters = [1] + hyper_parameters['ref_enc_out_channels']
        kernel_sizes = hyper_parameters['seq_ref_enc_filter_size']

        # I NEED TO ADJUST PADDING TO NOT LOSE THE TOTAL LENGTH OF THE SEQUENCE!!
        convs_1d = []
        convs_2d = []

        for i in range(K):
            conv1d_init = nn.Conv1d(in_channels=filters[i], out_channels=filters[i + 1],
                                    kernel_size=kernel_sizes[i], stride=1,
                                    padding=int((kernel_sizes[i] - 1) / 2), bias=True)

            nn.init.xavier_uniform_(conv1d_init.weight, gain=torch.nn.init.calculate_gain('linear'))

            convs_1d.append(conv1d_init)

            conv2d_init = nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
                                    kernel_size=(kernel_sizes[i], 3), stride=(1, 1),
                                    padding=(int((kernel_sizes[i] - 1) / 2), int((3 - 1) / 2)), bias=True)

            nn.init.xavier_uniform_(conv2d_init.weight, gain=torch.nn.init.calculate_gain('linear'))

            convs_2d.append(conv2d_init)

        self.convs1D = nn.ModuleList(convs_1d)
        self.convs2D = nn.ModuleList(convs_2d)

        self.bns1D = nn.ModuleList([nn.BatchNorm1d(num_features=hyper_parameters['ref_enc_out_channels'][i])
                                    for i in range(K)])
        self.bns2D = nn.ModuleList([nn.BatchNorm2d(num_features=hyper_parameters['ref_enc_out_channels'][i])
                                    for i in range(K)])

        self.prosody_linear = nn.Linear(512, 256, bias=True)
        torch.nn.init.xavier_uniform_(self.prosody_linear.weight, gain=torch.nn.init.calculate_gain('linear'))

        # WEIGHT INITIALIZATION DEFAULT:
        self.prosody_bi_lstm = nn.LSTM(input_size=int(256), hidden_size=int(512/2), num_layers=1, batch_first=True,
                                       bidirectional=True)

    def forward(self, bin_locations, pitch_intensities):  # [N, LEN_MELSPEC, 1], [N, LEN_MELSPEC, 3]
        N = bin_locations.size(0)  # Number of samples
        num_intensities = pitch_intensities.size(2)
        # Changing tensor dimensions to have 1 input channel for the first conv2D layer:
        pitch_intensities = pitch_intensities.view(N, 1, -1, num_intensities)  # [N, 1, LEN_MELSPEC, num_intensities]
        bin_locations = bin_locations.transpose(1, 2)  # [N, 1, LEN_MELSPEC]
        """We implement ReLU gates at the output of the conv. layers. We could check it without."""
        # For pitch tracking:
        for conv, bn in zip(self.convs1D, self.bns1D):
            bin_locations = conv(bin_locations)
            bin_locations = bn(bin_locations)
            bin_locations = F.dropout(F.relu(bin_locations), 0.5, self.training)  # [N, Cout, T]

        # For pitch intensities:
        for conv2, bn2 in zip(self.convs2D, self.bns2D):
            pitch_intensities = conv2(pitch_intensities)
            pitch_intensities = bn2(pitch_intensities)
            pitch_intensities = F.dropout(F.relu(pitch_intensities), 0.5, self.training)  # [N, Cout, T, bins]

        # Resize pitch intensities
        bin_locations = bin_locations.transpose(1, 2)  # [N, T, Cout]
        pitch_intensities = pitch_intensities.transpose(1, 2)  # [N, T, Cout, bins]
        T = pitch_intensities.size(1)
        pitch_intensities = pitch_intensities.contiguous().view(N, T, -1)  # [N, T, Cout*bins]

        # Concatenate features
        pitch_convolved = torch.cat((bin_locations, pitch_intensities), 2)

        # Linear projection (IS IT NECESSARY? DOES AN ACTIVATION FUNCTION IMPROVE THE RESULT?)
        projection_pitch_convolved = F.dropout(F.tanh(self.prosody_linear(pitch_convolved)), 0.5, self.training)

        # Encode sequences with a bidirectional LSTM layer:
        """In our case, we do not care about the specific length of each sequence: with the zero padding, the
        encoder should be able to encode the different lengths and see zeros once a sequence is over. That is why
        we do not pack the padded sequences before the LSTM layer."""
        _, (encoded_prosody, cell_state) = self.prosody_bi_lstm(projection_pitch_convolved)

        encoded_prosody = encoded_prosody.transpose(0, 1)
        encoded_prosody = encoded_prosody.contiguous().view(N, -1)

        return encoded_prosody  # should be [N, 512]


class LogMelSpecReferenceEncoder(nn.Module):
    """
    """
    def __init__(self):

        super().__init__()

        reference_encoder_out_channels = [32, 32, 64, 64, 128, 128]
        K = len(reference_encoder_out_channels)
        filters = [1] + reference_encoder_out_channels
        kernel_size = (3, 3)
        stride = (2, 2)
        padding = (1, 1)

        convs_2d = []

        for i in range(K):
            conv2d_init = nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, bias=True)

            nn.init.xavier_uniform_(conv2d_init.weight, gain=torch.nn.init.calculate_gain('linear'))

            convs_2d.append(conv2d_init)

        self.convs2D = nn.ModuleList(convs_2d)
        self.bns2D = nn.ModuleList([nn.BatchNorm2d(num_features=reference_encoder_out_channels[i])
                                    for i in range(K)])

        out_channels = self.calculate_channels(80, 3, 2, 1, K)
        # self.gru = nn.GRU(input_size=reference_encoder_out_channels[-1] * out_channels, hidden_size=512,
        #                   batch_first=True, bidirectional=False)

        # WEIGHT INITIALIZATION DEFAULT:
        self.bi_lstm = nn.LSTM(input_size=reference_encoder_out_channels[-1] * out_channels,
                               hidden_size=int(512/2), num_layers=1, batch_first=True, bidirectional=True)

    def forward(self, logmel_spec, logmel_lengths):  # [N, MEL_CHANNELS, LEN_MELSPEC]
        N = logmel_spec.size(0)  # Number of samples
        # Changing tensor dimensions to have 1 input channel for the first conv2D layer:
        logmel_spec = logmel_spec.unsqueeze(1)
        logmel_spec = logmel_spec.transpose(2, 3)  # [N, 1, LEN_MELSPEC, MEL_CHANNELS]
        """We implement ReLU gates at the output of the conv. layers. We could check it without."""
        for conv2, bn2 in zip(self.convs2D, self.bns2D):
            logmel_spec = conv2(logmel_spec)
            logmel_spec = bn2(logmel_spec)
            logmel_spec = F.dropout(F.relu(logmel_spec), 0.5, self.training)  # [N, Cout, LEN_MELSPEC, MEL_CHANNELS]

        # Resize:
        logmel_spec = logmel_spec.transpose(1, 2)  # [N, LEN_MELSPEC, Cout, MEL_CHANNELS]
        T = logmel_spec.size(1)
        logmel_spec = logmel_spec.contiguous().view(N, T, -1)  # [N, LEN_MELSPEC, Cout*MEL_CHANNELS]

        logmel_lengths = logmel_lengths.cpu().numpy()
        last_hidden_states = torch.zeros(N, 512)

        logmel_after_lengths = np.trunc(logmel_lengths / 2**6)
        logmel_after_lengths = logmel_after_lengths + 1
        logmel_after_lengths = logmel_after_lengths.astype(int)
        logmel_after_lengths = torch.tensor(logmel_after_lengths)
        # logmel_spec = nn.utils.rnn.pack_padded_sequence(logmel_spec, logmel_after_lengths, batch_first=True)
        self.bi_lstm.flatten_parameters()
        # memory, out = self.gru(logmel_spec)
        outputs, (hidden_states, cell_state) = self.bi_lstm(logmel_spec)
        hidden_states = hidden_states.transpose(0, 1)
        hidden_states = hidden_states.contiguous().view(N, -1)
        # outputs, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)

        # for j in range(N):
        #     last_hidden_states[j, :] = outputs[j, logmel_after_lengths[j] - 1, :]

        # return last_hidden_states.cuda(non_blocking=True)
        return hidden_states

    def calculate_channels(self, L, kernel_size, stride, padding, n_convs):
        for i in range(n_convs):
            L = (L - kernel_size + 2 * padding) // stride + 1
        return L


# BASIC FORM FOR NOW. NEEDS TO BE EXPANDED TO OUR NEW PROPOSAL
class MultiSTL(nn.Module):

    """
    inputs --- [N, E]
    """

    def __init__(self, hyper_parameters):

        super().__init__()
        # E = 256 / num_heads = 8 / token_num = 10!!
        self.embed = nn.Parameter(torch.FloatTensor(hyper_parameters['token_num'],
                                                    hyper_parameters['E'] // hyper_parameters['num_heads']))
        # d_q = hyper_parameters['E'] // 2
        d_q = hyper_parameters['E']
        d_k = hyper_parameters['E'] // hyper_parameters['num_heads']

        self.attention = MultiHeadAttention(query_dim=d_q, key_dim=d_k,
                                            num_units=hyper_parameters['E'], num_heads=hyper_parameters['num_heads'])

        init.xavier_uniform_(self.embed, gain=init.calculate_gain('linear'))

    def forward(self, inputs):
        N = inputs.size(0)  # Number of samples in the batch
        query = inputs.unsqueeze(1)  # [N, 1, E]
        keys = F.tanh(self.embed).unsqueeze(0).expand(N, -1, -1)  # [N, token_num, E // num_heads]
        style_embed, gst_scores = self.attention(query, keys)

        return style_embed, gst_scores

    def inference(self, scores):
        keys = F.tanh(self.embed).unsqueeze(0)
        style_embed_inference = self.attention.inference(keys, scores=scores)

        return style_embed_inference


class MultiHeadAttention(nn.Module):
    """
    input:
        query --- [N, T_q, query_dim]  T_q = 1
        key --- [N, T_k, key_dim]      T_k = 5 (num of tokens)
    output:
        out --- [N, T_q, num_units]
    """

    def __init__(self, query_dim, key_dim, num_units, num_heads):

        super().__init__()
        self.num_units = num_units
        self.num_heads = num_heads
        self.key_dim = key_dim
        # self.sparse_max = Sparsemax(dim=3)

        # Linear projection of data (encoder and decoder states) into a fixed number of hidden units
        self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
        self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
        self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)

    def forward(self, query, key):

        querys = self.W_query(query)  # [N, T_q, num_units]; the last dimension changes according to the output dim
        keys = self.W_key(key)  # [N, T_k, num_units]
        values = self.W_value(key)

        # the number of units set at initialization is the total number of hidden feature units we want. Then, we
        # assign a specific number of units to each head according to the number of heads of the multi-head
        # attention.

        # Basically, the style tokens are attended by the number of heads we configure to learn different types
        # of attention.
        split_size = self.num_units // self.num_heads  # integer division, without remainder
        querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)  # [h, N, T_q, num_units/h]
        keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]
        values = torch.stack(torch.split(values, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]

        # score = softmax(QK^T / (d_k ** 0.5))
        scores = torch.matmul(querys, keys.transpose(2, 3))  # [h, N, T_q, T_k]
        scores = scores / (self.key_dim ** 0.33)  # cube root instead of square root to prevent very small values
        scores = F.softmax(scores, dim=3)  # over dimension 3, the length of the key sequences.
        # scores = self.sparse_max(scores)
        out = torch.matmul(scores, values)  # [h, N, T_q, num_units/h]
        out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)  # [N, T_q, num_units]
        scores = scores.squeeze()

        return out, scores

    def inference(self, key, scores):  # key [1, 5, 512/8], scores [1, num_tokens]
        """Only need the keys that are already trained, and the scores that I impose"""
        scores = scores.unsqueeze(0).unsqueeze(0).unsqueeze(0).expand(self.num_heads, -1, -1, -1)
        # print(scores.shape)
        values = self.W_value(key)

        # the number of units set at initialization is the total number of hidden feature units we want. Then, we
        # assign a specific number of units to each head according to the number of heads of the multi-head
        # attention.

        split_size = self.num_units // self.num_heads  # integer division, without remainder
        values = torch.stack(torch.split(values, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]

        # score = softmax(QK^T / (d_k ** 0.5))

        # out = score * V
        out = torch.matmul(scores, values)  # [h, 1, T_q = 1, num_units/h]
        out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)  # [N, T_q, num_units]

        return out
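
Note: a standalone sketch (torch only; the head count and dimensions are illustrative assumptions, and the scaling below uses the standard 1/sqrt(d) rather than the cube root used in MultiHeadAttention.forward) of the head-splitting trick used above: project, split the unit dimension into h chunks, stack them as a leading "head" axis, score each head independently, then concatenate the per-head outputs:

import torch
import torch.nn.functional as F

N, T_q, T_k = 2, 1, 3                 # batch, query steps (one prosody embedding), style tokens
num_units, h = 512, 4                 # total attention units and number of heads
querys = torch.randn(N, T_q, num_units)
keys = torch.randn(N, T_k, num_units)
values = torch.randn(N, T_k, num_units)

split = num_units // h
querys = torch.stack(torch.split(querys, split, dim=2), dim=0)   # [h, N, T_q, num_units/h]
keys = torch.stack(torch.split(keys, split, dim=2), dim=0)       # [h, N, T_k, num_units/h]
values = torch.stack(torch.split(values, split, dim=2), dim=0)   # [h, N, T_k, num_units/h]

scores = torch.matmul(querys, keys.transpose(2, 3)) / (split ** 0.5)  # [h, N, T_q, T_k]
scores = F.softmax(scores, dim=3)                                     # one distribution over tokens per head
out = torch.matmul(scores, values)                                    # [h, N, T_q, num_units/h]
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)         # [N, T_q, num_units]
print(out.shape, scores.shape)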
Postnet.py ADDED
@@ -0,0 +1,52 @@
import torch
from torch import nn
from torch.nn import functional as F
from nn_layers import convolutional_module


class Postnet(nn.Module):
    """Postnet
        - Five 1-d convolutions with 512 channels and kernel size 5
    """

    def __init__(self, tacotron_hyperparams):
        super(Postnet, self).__init__()
        # self.dropout = nn.Dropout(0.5)
        self.convolutions = nn.ModuleList()

        self.convolutions.append(
            nn.Sequential(
                convolutional_module(tacotron_hyperparams['n_mel_channels'],
                                     tacotron_hyperparams['postnet_embedding_dim'],
                                     kernel_size=tacotron_hyperparams['postnet_kernel_size'], stride=1,
                                     padding=int((tacotron_hyperparams['postnet_kernel_size'] - 1) / 2),
                                     dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(tacotron_hyperparams['postnet_embedding_dim']))
        )

        for i in range(1, tacotron_hyperparams['postnet_n_convolutions'] - 1):
            self.convolutions.append(
                nn.Sequential(
                    convolutional_module(tacotron_hyperparams['postnet_embedding_dim'],
                                         tacotron_hyperparams['postnet_embedding_dim'],
                                         kernel_size=tacotron_hyperparams['postnet_kernel_size'], stride=1,
                                         padding=int((tacotron_hyperparams['postnet_kernel_size'] - 1) / 2),
                                         dilation=1, w_init_gain='tanh'),
                    nn.BatchNorm1d(tacotron_hyperparams['postnet_embedding_dim']))
            )

        self.convolutions.append(
            nn.Sequential(
                convolutional_module(tacotron_hyperparams['postnet_embedding_dim'],
                                     tacotron_hyperparams['n_mel_channels'],
                                     kernel_size=tacotron_hyperparams['postnet_kernel_size'], stride=1,
                                     padding=int((tacotron_hyperparams['postnet_kernel_size'] - 1) / 2),
                                     dilation=1, w_init_gain='linear'),
                nn.BatchNorm1d(tacotron_hyperparams['n_mel_channels']))
        )

    def forward(self, x):
        for i in range(len(self.convolutions) - 1):
            x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
        x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
        return x
Tacotron2.py ADDED
@@ -0,0 +1,112 @@
from math import sqrt

import torch
from torch import nn

from Encoder import Encoder
from Decoder import Decoder
from Postnet import Postnet
from GST import GST

from utils import to_gpu, get_mask_from_lengths
from fp16_optimizer import fp32_to_fp16, fp16_to_fp32


class tacotron_2(nn.Module):
    def __init__(self, tacotron_hyperparams):
        super(tacotron_2, self).__init__()
        self.mask_padding = tacotron_hyperparams['mask_padding']
        self.fp16_run = tacotron_hyperparams['fp16_run']
        self.n_mel_channels = tacotron_hyperparams['n_mel_channels']
        self.n_frames_per_step = tacotron_hyperparams['number_frames_step']
        self.embedding = nn.Embedding(
            tacotron_hyperparams['n_symbols'], tacotron_hyperparams['symbols_embedding_length'])
        # CHECK THIS OUT!!!
        std = sqrt(2.0 / (tacotron_hyperparams['n_symbols'] + tacotron_hyperparams['symbols_embedding_length']))
        val = sqrt(3.0) * std
        self.embedding.weight.data.uniform_(-val, val)
        self.encoder = Encoder(tacotron_hyperparams)
        self.decoder = Decoder(tacotron_hyperparams)
        self.postnet = Postnet(tacotron_hyperparams)
        self.gst = GST(tacotron_hyperparams)

    def parse_batch(self, batch):
        # GST: I add the new tensor of prosody features to train the GST tokens:
        text_padded, input_lengths, mel_padded, gate_padded, output_lengths, prosody_padded = batch
        text_padded = to_gpu(text_padded).long()
        max_len = int(torch.max(input_lengths.data).item())  # With item() you get the plain value (not a tensor)
        input_lengths = to_gpu(input_lengths).long()
        mel_padded = to_gpu(mel_padded).float()
        gate_padded = to_gpu(gate_padded).float()
        output_lengths = to_gpu(output_lengths).long()
        prosody_padded = to_gpu(prosody_padded).float()

        return (
            (text_padded, input_lengths, mel_padded, max_len, output_lengths, prosody_padded),
            (mel_padded, gate_padded))

    def parse_input(self, inputs):
        inputs = fp32_to_fp16(inputs) if self.fp16_run else inputs
        return inputs

    def parse_output(self, outputs, output_lengths=None):
        if self.mask_padding and output_lengths is not None:
            mask = ~get_mask_from_lengths(output_lengths)
            mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
            mask = mask.permute(1, 0, 2)

            outputs[0].data.masked_fill_(mask, 0.0)
            outputs[1].data.masked_fill_(mask, 0.0)
            outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies

        outputs = fp16_to_fp32(outputs) if self.fp16_run else outputs

        return outputs

    def forward(self, inputs):
        inputs, input_lengths, targets, max_len, output_lengths, gst_prosody_padded = self.parse_input(inputs)
        input_lengths, output_lengths = input_lengths.data, output_lengths.data

        embedded_inputs = self.embedding(inputs).transpose(1, 2)

        encoder_outputs = self.encoder(embedded_inputs, input_lengths)

        # GST style embedding is added to the encoder outputs before entering the decoder
        # bin_locations = gst_prosody_padded[:, 0, :]
        # pitch_intensities = gst_prosody_padded[:, 1:, :]
        # bin_locations = bin_locations.unsqueeze(2)
        gst_style_embedding, gst_scores = self.gst(gst_prosody_padded, output_lengths)  # [N, 1, 512]
        gst_style_embedding = gst_style_embedding.expand_as(encoder_outputs)

        encoder_outputs = encoder_outputs + gst_style_embedding

        mel_outputs, gate_outputs, alignments = self.decoder(
            encoder_outputs, targets, memory_lengths=input_lengths)
        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments, gst_scores],
            output_lengths)

    def inference(self, inputs, gst_scores):  # gst_scores must be a torch tensor
        inputs = self.parse_input(inputs)
        embedded_inputs = self.embedding(inputs).transpose(1, 2)
        encoder_outputs = self.encoder.inference(embedded_inputs)

        # GST inference:
        gst_style_embedding = self.gst.inference(gst_scores)
        gst_style_embedding = gst_style_embedding.expand_as(encoder_outputs)

        encoder_outputs = encoder_outputs + gst_style_embedding

        mel_outputs, gate_outputs, alignments = self.decoder.inference(
            encoder_outputs)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        outputs = self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])

        return outputs
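
Note: a standalone sketch (shapes assumed from the modules above; torch only) of how the style embedding conditions the encoder in tacotron_2.forward — one style vector per utterance is expanded along the time axis and added to every encoder time step:

import torch

N, T, enc_dim = 2, 7, 512
encoder_outputs = torch.randn(N, T, enc_dim)       # Encoder output, batch_first
gst_style_embedding = torch.randn(N, 1, enc_dim)   # one GST style vector per utterance

conditioned = encoder_outputs + gst_style_embedding.expand_as(encoder_outputs)
print(conditioned.shape)                            # [2, 7, 512]; the decoder attends over this sum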
audio_processing.py ADDED
@@ -0,0 +1,93 @@
import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util


def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """

    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C
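
Note: a small usage sketch (illustrative values, assuming the two functions above are in scope) of the compression pair: for magnitudes above clip_val the functions are exact inverses, while near-zero values are clamped first.

import torch

mel = torch.tensor([1e-7, 1e-3, 0.5, 2.0])
compressed = dynamic_range_compression(mel)          # log(clamp(x, min=1e-5) * 1)
restored = dynamic_range_decompression(compressed)   # exp(x) / 1
print(restored)                                       # [1e-5, 1e-3, 0.5, 2.0] -- the first entry hit the clip value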
hyper_parameters.py ADDED
@@ -0,0 +1,70 @@
from text import symbols

# creating a python dictionary with all hyper parameters

tacotron_params = {'filter_length': 1024,  # audio parameters:
                   'hop_length': 256,
                   'win_length': 1024,
                   'n_mel_channels': 80,
                   'mel_fmin': 0.0,
                   'mel_fmax': 8000.0,
                   'sampling_rate': 22050,
                   'max_wav_value': 32768.0,
                   'clipping_value': 1e-5,
                   'C': 1,
                   # dataset parameters:
                   'load_mel_from_disk': False,
                   'sort_by_length': False,
                   'text_cleaners': ['english_cleaners'],
                   # embedding parameters:
                   'symbols_embedding_length': 512,
                   'n_symbols': len(symbols),
                   # encoder parameters:
                   'encoder_embedding_dim': 512,
                   'encoder_convs': 3,
                   'conv_kernel_size': 5,
                   'conv_stride': 1,
                   'conv_dilation': 1,
                   'w_init_gain': 'relu',
                   # decoder parameters:
                   'number_frames_step': 1,
                   'decoder_rnn_dim': 1024,
                   'prenet_dim': 256,
                   'max_decoder_steps': 1000,
                   'gate_threshold': 0.5,  # Need to be reviewed
                   'p_attention_dropout': 0.1,
                   'p_decoder_dropout': 0.1,
                   # attention parameters:
                   'attention_rnn_dim': 1024,
                   'attention_dim': 128,
                   # location features parameters:
                   'attention_location_n_filters': 32,
                   'attention_location_kernel_size': 31,
                   # postnet parameters:
                   'postnet_embedding_dim': 512,
                   'postnet_kernel_size': 5,
                   'postnet_n_convolutions': 5,
                   # GST parameters:
                   'E': 512,
                   'token_num': 3,
                   'num_heads': 1,
                   'seq_ref_enc_filter_size': [3, 7, 11],  # phoneme, word/silence, utterance levels respectively
                   'ref_enc_out_channels': [8, 16, 16],
                   # optimization parameters:
                   'use_saved_learning_rate': True,
                   'batch_size': 32,  # 64 should be larger than the number of GPUs. Integer multiple of the num. of GPUs
                   'learning_rate': 1e-3,
                   'weight_decay': 1e-6,
                   'grad_clip_thresh': 1.0,
                   'mask_padding': False,
                   # experiment parameters:
                   'epochs': 300,  # 160, 500
                   'iters_per_checkpoint': 1500,  # 1000. How many iterations before validating
                   'seed': 1234,
                   'dynamic_loss_scaling': True,  # CHECK IT OUT!
                   'distributed_run': False,
                   'dist_backend': 'nccl',
                   'dist_url': "/home/alex/PyTorch_TACOTRON_2/pycharm-tacotron2",  # CHECK IT OUT!
                   'cudnn_enabled': True,
                   'cudnn_benchmark': False,
                   'fp16_run': False}
logger.py ADDED
@@ -0,0 +1,47 @@
import random
import torch.nn.functional as F
from tensorboardX import SummaryWriter
from plotting_utils import plot_alignment_to_numpy, plot_gst_scores_to_numpy, plot_spectrogram_to_numpy
from plotting_utils import plot_gate_outputs_to_numpy


class Tacotron2Logger(SummaryWriter):
    def __init__(self, logdir):
        super(Tacotron2Logger, self).__init__(logdir)

    def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
                     iteration):
        self.add_scalar("training.loss", reduced_loss, iteration)
        self.add_scalar("grad.norm", grad_norm, iteration)
        self.add_scalar("learning.rate", learning_rate, iteration)
        self.add_scalar("duration", duration, iteration)

    def log_validation(self, reduced_loss, model, y, y_pred, gst_scores, iteration):
        self.add_scalar("validation.loss", reduced_loss, iteration)
        _, mel_outputs, gate_outputs, alignments, _ = y_pred
        mel_targets, gate_targets = y

        # plot distribution of parameters
        for tag, value in model.named_parameters():
            tag = tag.replace('.', '/')
            self.add_histogram(tag, value.data.cpu().numpy(), iteration)

        # plot alignment, mel target and predicted, gate target and predicted
        idx = random.randint(0, alignments.size(0) - 1)

        align_idx = alignments[idx].data.cpu().numpy().T
        gst_scores = gst_scores.data.cpu().numpy().T
        # print("Validation GST scores before plotting to tensorboard: {}".format(gst_scores.shape))
        meltarg_idx = mel_targets[idx].data.cpu().numpy()
        melout_idx = mel_outputs[idx].data.cpu().numpy()

        self.add_image("alignment", plot_alignment_to_numpy(align_idx), iteration)
        self.add_image("gst_scores", plot_gst_scores_to_numpy(gst_scores), iteration)
        self.add_image("mel_target", plot_spectrogram_to_numpy(meltarg_idx), iteration)
        self.add_image("mel_predicted", plot_spectrogram_to_numpy(melout_idx), iteration)
        self.add_image(
            "gate",
            plot_gate_outputs_to_numpy(
                gate_targets[idx].data.cpu().numpy(),
                F.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
            iteration)
nn_layers.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from librosa.filters import mel as librosa_mel_fn
4
+ from stft import STFT
5
+
6
+ clip_val = 1e-5
7
+ C = 1
8
+
9
+
10
+ class convolutional_module(nn.Module):
11
+ """This class defines a 1d convolutional layer and its initialization for the system we are
12
+ replicating"""
13
+ def __init__(self, in_ch, out_ch, kernel_size=1, stride=1, padding=None, dilation=1, bias=True,
14
+ w_init_gain='linear'):
15
+ # in PyTorch you define your Models as subclasses of torch.nn.Module
16
+ super(convolutional_module, self).__init__()
17
+ if padding is None:
18
+ assert(kernel_size % 2 == 1)
19
+ padding = int(dilation * (kernel_size - 1) / 2)
20
+
21
+ # initialize the convolutional layer which is an instance of Conv1d
22
+ # torch.nn.Conv1d calls internally the method torch.nn.functional.conv1d, which accepts the
23
+ # input with the shape (minibatch x in_channels x input_w), and a weight of shape
24
+ # (out_channels x (in_channels/groups) x kernel_w). In our case, we do not split into groups.
25
+ # Then, our input shape will be (48 x 512 x 189) and the weights are set up as
26
+ # (512 x 512 x 5)
27
+ self.conv_layer = torch.nn.Conv1d(in_ch, out_ch, kernel_size=kernel_size, stride=stride,
28
+ padding=padding, dilation=dilation, bias=bias)
29
+
30
+ """Useful information of Xavier initialization in:
31
+ https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/"""
32
+ torch.nn.init.xavier_uniform_(self.conv_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
33
+
34
+ def forward(self, x):
35
+ conv_output = self.conv_layer(x)
36
+ return conv_output
37
+
38
+
39
+ class linear_module(torch.nn.Module):
40
+ """This class defines a linear layer and its initialization method for the system we are
41
+ replicating. This implements a linear transformation: y = xA^t + b"""
42
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
43
+ super(linear_module, self).__init__()
44
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
45
+
46
+ torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
47
+
48
+ def forward(self, x):
49
+ return self.linear_layer(x)
50
+
+
+ class location_layer(nn.Module):
+     def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
+         super(location_layer, self).__init__()
+         padding = int((attention_kernel_size - 1) / 2)
+         # Neither projection trains a bias term, which keeps the layer quite constrained.
+         # in_channels = 2 corresponds to the previous and the cumulative attention weights,
+         # stacked for every encoder step.
+         self.location_conv = convolutional_module(2, attention_n_filters, kernel_size=attention_kernel_size,
+                                                   padding=padding, bias=False, stride=1, dilation=1)
+         self.location_dense = linear_module(attention_n_filters, attention_dim, bias=False,
+                                             w_init_gain='tanh')
+
+     def forward(self, attention_weights_cat):
+         processed_attention = self.location_conv(attention_weights_cat)
+         processed_attention = processed_attention.transpose(1, 2)
+         processed_attention = self.location_dense(processed_attention)
+         return processed_attention
+
+
+ class TacotronSTFT(nn.Module):
+     def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
+                  n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
+                  mel_fmax=8000.0):
+         super(TacotronSTFT, self).__init__()
+         self.n_mel_channels = n_mel_channels
+         self.sampling_rate = sampling_rate
+         self.stft_fn = STFT(filter_length, hop_length, win_length)
+         # keyword arguments keep this call compatible with both older and newer librosa releases
+         mel_basis = librosa_mel_fn(
+             sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels,
+             fmin=mel_fmin, fmax=mel_fmax)
+         mel_basis = torch.from_numpy(mel_basis).float()
+         self.register_buffer('mel_basis', mel_basis)
+
+     def spectral_de_normalize(self, magnitudes):
+         output = torch.exp(magnitudes) / C
+         return output
+
+     def mel_spectrogram(self, y):
+         """Computes mel-spectrograms from a batch of waves
+         PARAMS
+         ------
+         y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
+
+         RETURNS
+         -------
+         mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
+         """
+         assert(torch.min(y.data) >= -1)
+         assert(torch.max(y.data) <= 1)
+
+         magnitudes, phases = self.stft_fn.transform(y)
+         magnitudes = magnitudes.data
+         mel_output = torch.matmul(self.mel_basis, magnitudes)
+         mel_output = torch.log(torch.clamp(mel_output, min=clip_val) * C)
+         return mel_output
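A minimal usage sketch for TacotronSTFT, assuming a 16-bit PCM wav recorded at the expected sampling rate; the file name is a placeholder:

import torch
from scipy.io.wavfile import read
from nn_layers import TacotronSTFT

# Hypothetical example: wav file -> log-mel spectrogram with the defaults used above.
taco_stft = TacotronSTFT()                                     # 1024/256/1024, 80 mels, 22050 Hz
sampling_rate, data = read("example.wav")                      # placeholder path, assumed 22050 Hz int16 audio
audio = torch.FloatTensor(data.astype("float32")) / 32768.0    # scale int16 samples into [-1, 1]
mel = taco_stft.mel_spectrogram(audio.unsqueeze(0))            # shape (1, 80, T)
print(mel.shape)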
stft.py ADDED
@@ -0,0 +1,140 @@
+ """
+ BSD 3-Clause License
+
+ Copyright (c) 2017, Prem Seetharaman
+ All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice, this
+ list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ """
+
+ import torch
+ import numpy as np
+ import torch.nn.functional as F
+ from torch.autograd import Variable
+ from scipy.signal import get_window
+ from librosa.util import pad_center, tiny
+ from audio_processing import window_sumsquare
+
+
+ class STFT(torch.nn.Module):
+     """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
+     def __init__(self, filter_length=800, hop_length=200, win_length=800,
+                  window='hann'):
+         super(STFT, self).__init__()
+         self.filter_length = filter_length
+         self.hop_length = hop_length
+         self.win_length = win_length
+         self.window = window
+         self.forward_transform = None
+         scale = self.filter_length / self.hop_length
+         fourier_basis = np.fft.fft(np.eye(self.filter_length))
+
+         cutoff = int((self.filter_length / 2 + 1))
+         fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
+                                    np.imag(fourier_basis[:cutoff, :])])
+
+         forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
+         inverse_basis = torch.FloatTensor(
+             np.linalg.pinv(scale * fourier_basis).T[:, None, :])
+
+         if window is not None:
+             assert(filter_length >= win_length)
+             # get window and zero center pad it to filter_length
+             fft_window = get_window(window, win_length, fftbins=True)
+             # the size keyword keeps pad_center compatible with newer librosa versions
+             fft_window = pad_center(fft_window, size=filter_length)
+             fft_window = torch.from_numpy(fft_window).float()
+
+             # window the bases
+             forward_basis *= fft_window
+             inverse_basis *= fft_window
+
+         self.register_buffer('forward_basis', forward_basis.float())
+         self.register_buffer('inverse_basis', inverse_basis.float())
+
+     def transform(self, input_data):
+         num_batches = input_data.size(0)
+         num_samples = input_data.size(1)
+
+         self.num_samples = num_samples
+
+         # similar to librosa, reflect-pad the input
+         input_data = input_data.view(num_batches, 1, num_samples)
+         input_data = F.pad(
+             input_data.unsqueeze(1),
+             (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
+             mode='reflect')
+         input_data = input_data.squeeze(1)
+
+         forward_transform = F.conv1d(
+             input_data,
+             Variable(self.forward_basis, requires_grad=False),
+             stride=self.hop_length,
+             padding=0)
+
+         cutoff = int((self.filter_length / 2) + 1)
+         real_part = forward_transform[:, :cutoff, :]
+         imag_part = forward_transform[:, cutoff:, :]
+
+         magnitude = torch.sqrt(real_part**2 + imag_part**2)
+         phase = torch.autograd.Variable(
+             torch.atan2(imag_part.data, real_part.data))
+
+         return magnitude, phase
+
+     def inverse(self, magnitude, phase):
+         recombine_magnitude_phase = torch.cat(
+             [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
+
+         inverse_transform = F.conv_transpose1d(
+             recombine_magnitude_phase,
+             Variable(self.inverse_basis, requires_grad=False),
+             stride=self.hop_length,
+             padding=0)
+
+         if self.window is not None:
+             window_sum = window_sumsquare(
+                 self.window, magnitude.size(-1), hop_length=self.hop_length,
+                 win_length=self.win_length, n_fft=self.filter_length,
+                 dtype=np.float32)
+             # remove modulation effects
+             approx_nonzero_indices = torch.from_numpy(
+                 np.where(window_sum > tiny(window_sum))[0])
+             window_sum = torch.autograd.Variable(
+                 torch.from_numpy(window_sum), requires_grad=False)
+             inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
+
+             # scale by hop ratio
+             inverse_transform *= float(self.filter_length) / self.hop_length
+
+         inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
+         inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
+
+         return inverse_transform
+
+     def forward(self, input_data):
+         self.magnitude, self.phase = self.transform(input_data)
+         reconstruction = self.inverse(self.magnitude, self.phase)
+         return reconstruction
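And a small round-trip sketch for the STFT module itself; the batch size and signal length are arbitrary values chosen for illustration:

import torch
from stft import STFT

# Hypothetical check: transform a dummy batch of waveforms and resynthesise it.
stft = STFT(filter_length=1024, hop_length=256, win_length=1024, window='hann')
signal = torch.rand(2, 22050) * 2.0 - 1.0        # two one-second dummy waveforms in [-1, 1]
magnitude, phase = stft.transform(signal)        # each of shape (batch, 513, frames)
reconstruction = stft(signal)                    # inverse(transform(x)); close to x away from the edges
print(magnitude.shape, reconstruction.shape)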
utils.py ADDED
@@ -0,0 +1,39 @@
+ import numpy as np
+ from scipy.io.wavfile import read
+ import torch
+
+
+ def get_mask_from_lengths(lengths):
+     max_len = torch.max(lengths).item()
+     # build the step indices on the same device as `lengths`, so the helper also works on CPU-only machines
+     ids = torch.arange(0, max_len, device=lengths.device, dtype=torch.long)
+     mask = (ids < lengths.unsqueeze(1)).bool()
+     return mask
+
+
+ # probably not used from this module
+ def load_wav_to_torch(full_path, sr):
+     sampling_rate, data = read(full_path)
+     assert sr == sampling_rate, "{} SR doesn't match {} on path {}".format(
+         sr, sampling_rate, full_path)
+     return torch.FloatTensor(data.astype(np.float32))
+
+
+ # probably not used from this module
+ def load_filepaths_and_text(filename, sort_by_length, split="|"):
+     with open(filename, encoding='utf-8') as f:
+         filepaths_and_text = [line.strip().split(split) for line in f]
+
+     if sort_by_length:
+         filepaths_and_text.sort(key=lambda x: len(x[1]))
+
+     return filepaths_and_text
+
+
+ def to_gpu(x):
+     x = x.contiguous()
+
+     if torch.cuda.is_available():
+         x = x.cuda(non_blocking=True)  # non_blocking=True lets the host-to-device copy run asynchronously
+     return torch.autograd.Variable(x)
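Finally, a short sketch of the two helpers above; the lengths and tensor sizes are made-up values:

import torch
from utils import get_mask_from_lengths, to_gpu

# Hypothetical example: mask padded steps for a batch whose sequence lengths are 5, 3 and 4.
lengths = torch.tensor([5, 3, 4])
mask = get_mask_from_lengths(lengths)      # (3, 5) bool tensor, True where a step is real
print(mask)

# to_gpu only moves the tensor when CUDA is available, so the same code also runs on CPU-only machines.
batch = to_gpu(torch.zeros(3, 80, 5))
print(batch.device)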