Spaces:
Sleeping
Sleeping
Upload model files
Browse files- Decoder.py +379 -0
- Encoder.py +73 -0
- GST.py +368 -0
- Postnet.py +52 -0
- Tacotron2.py +112 -0
- audio_processing.py +93 -0
- hyper_parameters.py +70 -0
- logger.py +47 -0
- nn_layers.py +105 -0
- stft.py +140 -0
- utils.py +39 -0
Decoder.py
ADDED
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch.autograd import Variable
|
3 |
+
from torch import nn
|
4 |
+
from torch.nn import functional as F
|
5 |
+
from nn_layers import linear_module, location_layer
|
6 |
+
from utils import get_mask_from_lengths
|
7 |
+
|
8 |
+
|
9 |
+
class AttentionNet(nn.Module):
|
10 |
+
# 1024, 512, 128, 32, 31
|
11 |
+
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
|
12 |
+
attention_location_n_filters, attention_location_kernel_size):
|
13 |
+
super(AttentionNet, self).__init__()
|
14 |
+
self.query_layer = linear_module(attention_rnn_dim, attention_dim,
|
15 |
+
bias=False, w_init_gain='tanh')
|
16 |
+
# Projecting inputs into 128-D hidden representation
|
17 |
+
self.memory_layer = linear_module(embedding_dim, attention_dim, bias=False,
|
18 |
+
w_init_gain='tanh')
|
19 |
+
# Projecting into 1-D scalar value
|
20 |
+
self.v = linear_module(attention_dim, 1, bias=False)
|
21 |
+
# Convolutional layers to obtain location features and projecting them into 128-D hidden representation
|
22 |
+
self.location_layer = location_layer(attention_location_n_filters,
|
23 |
+
attention_location_kernel_size,
|
24 |
+
attention_dim)
|
25 |
+
self.score_mask_value = -float("inf")
|
26 |
+
|
27 |
+
def get_alignment_energies(self, query, processed_memory,
|
28 |
+
attention_weights_cat):
|
29 |
+
"""
|
30 |
+
PARAMS
|
31 |
+
------
|
32 |
+
query: decoder output (batch, n_mel_channels * n_frames_per_step)
|
33 |
+
processed_memory: processed encoder outputs (B, T_in, attention_dim)
|
34 |
+
attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
|
35 |
+
|
36 |
+
RETURNS
|
37 |
+
-------
|
38 |
+
alignment (batch, max_time)
|
39 |
+
"""
|
40 |
+
|
41 |
+
processed_query = self.query_layer(query.unsqueeze(1))
|
42 |
+
processed_attention_weights = self.location_layer(attention_weights_cat)
|
43 |
+
energies = self.v(torch.tanh(
|
44 |
+
processed_query + processed_attention_weights + processed_memory))
|
45 |
+
|
46 |
+
energies = energies.squeeze(-1) # eliminates the third dimension of the tensor, which is 1.
|
47 |
+
return energies
|
48 |
+
|
49 |
+
def forward(self, attention_hidden_state, memory, processed_memory,
|
50 |
+
attention_weights_cat, mask):
|
51 |
+
"""
|
52 |
+
PARAMS
|
53 |
+
------
|
54 |
+
attention_hidden_state: attention rnn last output
|
55 |
+
memory: encoder outputs
|
56 |
+
processed_memory: processed encoder outputs
|
57 |
+
attention_weights_cat: previous and cummulative attention weights
|
58 |
+
mask: binary mask for padded data
|
59 |
+
"""
|
60 |
+
alignment = self.get_alignment_energies(
|
61 |
+
attention_hidden_state, processed_memory, attention_weights_cat)
|
62 |
+
|
63 |
+
if mask is not None:
|
64 |
+
alignment.data.masked_fill_(mask, self.score_mask_value)
|
65 |
+
|
66 |
+
attention_weights = F.softmax(alignment, dim=1)
|
67 |
+
# I think attention_weights is a [BxNUMENCINPUTS] so with unsequeeze(1): [Bx1xNUMENCINPUTS] and memory is
|
68 |
+
# [BxNUMENCINPUTSx512]
|
69 |
+
attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
|
70 |
+
attention_context = attention_context.squeeze(1)
|
71 |
+
|
72 |
+
return attention_context, attention_weights
|
73 |
+
|
74 |
+
|
75 |
+
class Prenet(nn.Module):
|
76 |
+
def __init__(self, in_dim, sizes):
|
77 |
+
super(Prenet, self).__init__()
|
78 |
+
in_sizes = [in_dim] + sizes[:-1] # all list values but the last one. The result is a list of the in_dim element
|
79 |
+
# concatenated with sizes of layers (i.e. [80, 256])
|
80 |
+
self.layers = nn.ModuleList(
|
81 |
+
[linear_module(in_size, out_size, bias=False)
|
82 |
+
for (in_size, out_size) in zip(in_sizes, sizes)])
|
83 |
+
|
84 |
+
def forward(self, x):
|
85 |
+
for linear in self.layers:
|
86 |
+
x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
|
87 |
+
return x
|
88 |
+
|
89 |
+
|
90 |
+
class Decoder(nn.Module):
|
91 |
+
def __init__(self, tacotron_hyperparams):
|
92 |
+
super(Decoder, self).__init__()
|
93 |
+
self.n_mel_channels = tacotron_hyperparams['n_mel_channels']
|
94 |
+
self.n_frames_per_step = tacotron_hyperparams['number_frames_step']
|
95 |
+
self.encoder_embedding_dim = tacotron_hyperparams['encoder_embedding_dim']
|
96 |
+
self.attention_rnn_dim = tacotron_hyperparams['attention_rnn_dim'] # 1024
|
97 |
+
self.decoder_rnn_dim = tacotron_hyperparams['decoder_rnn_dim'] # 1024
|
98 |
+
self.prenet_dim = tacotron_hyperparams['prenet_dim']
|
99 |
+
self.max_decoder_steps = tacotron_hyperparams['max_decoder_steps']
|
100 |
+
# The threshold to decide whether stop or not stop decoding?
|
101 |
+
self.gate_threshold = tacotron_hyperparams['gate_threshold']
|
102 |
+
self.p_attention_dropout = tacotron_hyperparams['p_attention_dropout']
|
103 |
+
self.p_decoder_dropout = tacotron_hyperparams['p_decoder_dropout']
|
104 |
+
# Define the prenet: there is only one frame per step, so input dim is the number of mel channels.
|
105 |
+
# There are two fully connected layers:
|
106 |
+
self.prenet = Prenet(
|
107 |
+
tacotron_hyperparams['n_mel_channels'] * tacotron_hyperparams['number_frames_step'],
|
108 |
+
[tacotron_hyperparams['prenet_dim'], tacotron_hyperparams['prenet_dim']])
|
109 |
+
# input_size: 1024 + 512 (output of first LSTM cell + attention_context) / hidden_size: 1024
|
110 |
+
self.attention_rnn = nn.LSTMCell(
|
111 |
+
tacotron_hyperparams['prenet_dim'] + tacotron_hyperparams['encoder_embedding_dim'],
|
112 |
+
tacotron_hyperparams['attention_rnn_dim'])
|
113 |
+
# return attention_weights and attention_context. Does the alignments.
|
114 |
+
self.attention_layer = AttentionNet(
|
115 |
+
tacotron_hyperparams['attention_rnn_dim'], tacotron_hyperparams['encoder_embedding_dim'],
|
116 |
+
tacotron_hyperparams['attention_dim'], tacotron_hyperparams['attention_location_n_filters'],
|
117 |
+
tacotron_hyperparams['attention_location_kernel_size'])
|
118 |
+
# input_size: 256 + 512 (attention_context + prenet_info), hidden_size: 1024
|
119 |
+
self.decoder_rnn = nn.LSTMCell(
|
120 |
+
tacotron_hyperparams['attention_rnn_dim'] + tacotron_hyperparams['encoder_embedding_dim'],
|
121 |
+
tacotron_hyperparams['decoder_rnn_dim'], 1)
|
122 |
+
# (LSTM output)1024 + (attention_context)512, out_dim: number of mel channels. Last linear projection that
|
123 |
+
# generates an output decoder spectral frame.
|
124 |
+
self.linear_projection = linear_module(
|
125 |
+
tacotron_hyperparams['decoder_rnn_dim'] + tacotron_hyperparams['encoder_embedding_dim'],
|
126 |
+
tacotron_hyperparams['n_mel_channels']*tacotron_hyperparams['number_frames_step'])
|
127 |
+
# decision whether to continue decoding.
|
128 |
+
self.gate_layer = linear_module(
|
129 |
+
tacotron_hyperparams['decoder_rnn_dim'] + tacotron_hyperparams['encoder_embedding_dim'], 1,
|
130 |
+
bias=True, w_init_gain='sigmoid')
|
131 |
+
|
132 |
+
def get_go_frame(self, memory):
|
133 |
+
""" Gets all zeros frames to use as first decoder input
|
134 |
+
PARAMS
|
135 |
+
------
|
136 |
+
memory: decoder outputs
|
137 |
+
|
138 |
+
RETURNS
|
139 |
+
-------
|
140 |
+
decoder_input: all zeros frames
|
141 |
+
"""
|
142 |
+
B = memory.size(0)
|
143 |
+
decoder_input = Variable(memory.data.new(
|
144 |
+
B, self.n_mel_channels * self.n_frames_per_step).zero_())
|
145 |
+
return decoder_input
|
146 |
+
|
147 |
+
def initialize_decoder_states(self, memory, mask):
|
148 |
+
""" Initializes attention rnn states, decoder rnn states, attention
|
149 |
+
weights, attention cumulative weights, attention context, stores memory
|
150 |
+
and stores processed memory
|
151 |
+
PARAMS
|
152 |
+
------
|
153 |
+
memory: Encoder outputs
|
154 |
+
mask: Mask for padded data if training, expects None for inference
|
155 |
+
"""
|
156 |
+
B = memory.size(0)
|
157 |
+
MAX_TIME = memory.size(1)
|
158 |
+
|
159 |
+
self.attention_hidden = Variable(memory.data.new(
|
160 |
+
B, self.attention_rnn_dim).zero_())
|
161 |
+
self.attention_cell = Variable(memory.data.new(
|
162 |
+
B, self.attention_rnn_dim).zero_())
|
163 |
+
|
164 |
+
self.decoder_hidden = Variable(memory.data.new(
|
165 |
+
B, self.decoder_rnn_dim).zero_())
|
166 |
+
self.decoder_cell = Variable(memory.data.new(
|
167 |
+
B, self.decoder_rnn_dim).zero_())
|
168 |
+
|
169 |
+
self.attention_weights = Variable(memory.data.new(
|
170 |
+
B, MAX_TIME).zero_())
|
171 |
+
self.attention_weights_cum = Variable(memory.data.new(
|
172 |
+
B, MAX_TIME).zero_())
|
173 |
+
self.attention_context = Variable(memory.data.new(
|
174 |
+
B, self.encoder_embedding_dim).zero_())
|
175 |
+
|
176 |
+
self.memory = memory
|
177 |
+
self.processed_memory = self.attention_layer.memory_layer(memory)
|
178 |
+
self.mask = mask
|
179 |
+
|
180 |
+
def parse_decoder_inputs(self, decoder_inputs):
|
181 |
+
""" Prepares decoder inputs, i.e. mel outputs
|
182 |
+
PARAMS
|
183 |
+
------
|
184 |
+
decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs
|
185 |
+
|
186 |
+
RETURNS
|
187 |
+
-------
|
188 |
+
inputs: processed decoder inputs
|
189 |
+
|
190 |
+
"""
|
191 |
+
# (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
|
192 |
+
decoder_inputs = decoder_inputs.transpose(1, 2)
|
193 |
+
# reshape decoder inputs in case we want to work with more than 1 frame per step (chunks). Otherwise, this next
|
194 |
+
# line does not just do anything
|
195 |
+
decoder_inputs = decoder_inputs.view(
|
196 |
+
decoder_inputs.size(0),
|
197 |
+
int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
|
198 |
+
# (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
|
199 |
+
decoder_inputs = decoder_inputs.transpose(0, 1)
|
200 |
+
return decoder_inputs
|
201 |
+
|
202 |
+
def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
|
203 |
+
""" Prepares decoder outputs for output
|
204 |
+
PARAMS
|
205 |
+
------
|
206 |
+
mel_outputs:
|
207 |
+
gate_outputs: gate output energies
|
208 |
+
alignments:
|
209 |
+
|
210 |
+
RETURNS
|
211 |
+
-------
|
212 |
+
mel_outputs:
|
213 |
+
gate_outpust: gate output energies
|
214 |
+
alignments:
|
215 |
+
"""
|
216 |
+
# (T_out, B) -> (B, T_out)
|
217 |
+
alignments = torch.stack(alignments).transpose(0, 1)
|
218 |
+
# (T_out, B) -> (B, T_out)
|
219 |
+
gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
|
220 |
+
gate_outputs = gate_outputs.contiguous()
|
221 |
+
# (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
|
222 |
+
mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
|
223 |
+
# decouple frames per step
|
224 |
+
mel_outputs = mel_outputs.view(
|
225 |
+
mel_outputs.size(0), -1, self.n_mel_channels)
|
226 |
+
# (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
|
227 |
+
mel_outputs = mel_outputs.transpose(1, 2)
|
228 |
+
|
229 |
+
return mel_outputs, gate_outputs, alignments
|
230 |
+
|
231 |
+
def decode(self, decoder_input):
|
232 |
+
""" Decoder step using stored states, attention and memory
|
233 |
+
PARAMS
|
234 |
+
------
|
235 |
+
decoder_input: previous mel output
|
236 |
+
|
237 |
+
RETURNS
|
238 |
+
-------
|
239 |
+
mel_output:
|
240 |
+
gate_output: gate output energies
|
241 |
+
attention_weights:
|
242 |
+
"""
|
243 |
+
# concatenates [Bx1024] and [Bx512]. All dimensions match except 1 (torch.cat -1)
|
244 |
+
# concatenate the i-th decoder hidden state together with the i-th attention context
|
245 |
+
cell_input = torch.cat((decoder_input, self.attention_context), -1)
|
246 |
+
# the previous input is for the following LSTM cell, initialized with zeroes the hidden states and the cell
|
247 |
+
# state.
|
248 |
+
# compute the (i+1)th attention hidden state based on the i-th decoder hidden state and attention context.
|
249 |
+
self.attention_hidden, self.attention_cell = self.attention_rnn(
|
250 |
+
cell_input, (self.attention_hidden, self.attention_cell))
|
251 |
+
self.attention_hidden = F.dropout(self.attention_hidden, self.p_attention_dropout, self.training)
|
252 |
+
self.attention_cell = F.dropout(self.attention_cell, self.p_attention_dropout, self.training)
|
253 |
+
# concatenate the i-th state attention weights together with the cumulated from previous states to compute
|
254 |
+
# (i+1)th state
|
255 |
+
attention_weights_cat = torch.cat(
|
256 |
+
(self.attention_weights.unsqueeze(1),
|
257 |
+
self.attention_weights_cum.unsqueeze(1)), dim=1)
|
258 |
+
# compute (i+1)th attention context and provide (i+1)th attention weights based on the (i+1)th attention hidden
|
259 |
+
# state and (i)th and prev. weights
|
260 |
+
self.attention_context, self.attention_weights = self.attention_layer(
|
261 |
+
self.attention_hidden, self.memory, self.processed_memory,
|
262 |
+
attention_weights_cat, self.mask)
|
263 |
+
|
264 |
+
# cumulate attention_weights adding the (i+1)th to compute (i+2)th state
|
265 |
+
self.attention_weights_cum += self.attention_weights
|
266 |
+
|
267 |
+
decoder_input = torch.cat((self.attention_hidden, self.attention_context), -1)
|
268 |
+
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(decoder_input,
|
269 |
+
(self.decoder_hidden, self.decoder_cell))
|
270 |
+
self.decoder_hidden = F.dropout(self.decoder_hidden, self.p_decoder_dropout, self.training)
|
271 |
+
self.decoder_cell = F.dropout(self.decoder_cell, self.p_decoder_dropout, self.training)
|
272 |
+
|
273 |
+
decoder_hidden_attention_context = torch.cat((self.decoder_hidden, self.attention_context), dim=1)
|
274 |
+
decoder_output = self.linear_projection(decoder_hidden_attention_context)
|
275 |
+
|
276 |
+
gate_prediction = self.gate_layer(decoder_hidden_attention_context)
|
277 |
+
|
278 |
+
return decoder_output, gate_prediction, self.attention_weights
|
279 |
+
|
280 |
+
"""
|
281 |
+
# the decoder_output from ith step passes through the pre-net to compute new decoder hidden state and attention_
|
282 |
+
# context (i+1)th
|
283 |
+
prenet_output = self.prenet(decoder_input)
|
284 |
+
# the decoder_input now is the concatenation of the pre-net output and the new (i+1)th attention_context
|
285 |
+
decoder_input = torch.cat((prenet_output, self.attention_context), -1)
|
286 |
+
# another LSTM Cell to compute the decoder hidden (i+1)th state from the decoder_input
|
287 |
+
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
|
288 |
+
decoder_input, (self.decoder_hidden, self.decoder_cell))
|
289 |
+
|
290 |
+
# with new attention_context we concatenate again with the new (i+1)th decoder_hidden state.
|
291 |
+
decoder_hidden_attention_context = torch.cat(
|
292 |
+
(self.decoder_hidden, self.attention_context), dim=1)
|
293 |
+
# the (i+1)th output is a linear projection of the decoder hidden state with a weight matrix plus bias.
|
294 |
+
decoder_output = self.linear_projection(
|
295 |
+
decoder_hidden_attention_context)
|
296 |
+
# check whether (i+1)th state is the last of the sequence
|
297 |
+
gate_prediction = self.gate_layer(decoder_hidden_attention_context)
|
298 |
+
return decoder_output, gate_prediction, self.attention_weights"""
|
299 |
+
|
300 |
+
def forward(self, memory, decoder_inputs, memory_lengths):
|
301 |
+
""" Decoder forward pass for training
|
302 |
+
PARAMS
|
303 |
+
------
|
304 |
+
memory: Encoder outputs
|
305 |
+
decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
|
306 |
+
memory_lengths: Encoder output lengths for attention masking.
|
307 |
+
|
308 |
+
RETURNS
|
309 |
+
-------
|
310 |
+
mel_outputs: mel outputs from the decoder
|
311 |
+
gate_outputs: gate outputs from the decoder
|
312 |
+
alignments: sequence of attention weights from the decoder
|
313 |
+
"""
|
314 |
+
|
315 |
+
decoder_input = self.get_go_frame(memory).unsqueeze(0)
|
316 |
+
decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
|
317 |
+
decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
|
318 |
+
decoder_inputs = self.prenet(decoder_inputs)
|
319 |
+
|
320 |
+
self.initialize_decoder_states(
|
321 |
+
memory, mask=~get_mask_from_lengths(memory_lengths))
|
322 |
+
|
323 |
+
mel_outputs, gate_outputs, alignments = [], [], []
|
324 |
+
|
325 |
+
while len(mel_outputs) < decoder_inputs.size(0) - 1:
|
326 |
+
decoder_input = decoder_inputs[len(mel_outputs)]
|
327 |
+
mel_output, gate_output, attention_weights = self.decode(
|
328 |
+
decoder_input)
|
329 |
+
# a class list, when += means concatenation of vectors
|
330 |
+
mel_outputs += [mel_output.squeeze(1)]
|
331 |
+
gate_outputs += [gate_output.squeeze()]
|
332 |
+
alignments += [attention_weights]
|
333 |
+
# getting the frame indexing from reference mel frames to pass it as the new input of the next decoding
|
334 |
+
# step: Teacher Forcing!
|
335 |
+
# Takes each time_step of sequences of all mini-batch samples (i.e. [48, 80] as the decoder_inputs is
|
336 |
+
# parsed as [189, 48, 80]).
|
337 |
+
|
338 |
+
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
|
339 |
+
mel_outputs, gate_outputs, alignments)
|
340 |
+
|
341 |
+
return mel_outputs, gate_outputs, alignments
|
342 |
+
|
343 |
+
def inference(self, memory):
|
344 |
+
""" Decoder inference
|
345 |
+
PARAMS
|
346 |
+
------
|
347 |
+
memory: Encoder outputs
|
348 |
+
|
349 |
+
RETURNS
|
350 |
+
-------
|
351 |
+
mel_outputs: mel outputs from the decoder
|
352 |
+
gate_outputs: gate outputs from the decoder
|
353 |
+
alignments: sequence of attention weights from the decoder
|
354 |
+
"""
|
355 |
+
decoder_input = self.get_go_frame(memory)
|
356 |
+
|
357 |
+
self.initialize_decoder_states(memory, mask=None)
|
358 |
+
|
359 |
+
mel_outputs, gate_outputs, alignments = [], [], []
|
360 |
+
while True:
|
361 |
+
decoder_input = self.prenet(decoder_input)
|
362 |
+
mel_output, gate_output, alignment = self.decode(decoder_input)
|
363 |
+
|
364 |
+
mel_outputs += [mel_output.squeeze(1)]
|
365 |
+
gate_outputs += [gate_output]
|
366 |
+
alignments += [alignment]
|
367 |
+
|
368 |
+
if torch.sigmoid(gate_output.data) > self.gate_threshold:
|
369 |
+
break
|
370 |
+
elif len(mel_outputs) == self.max_decoder_steps:
|
371 |
+
print("Warning! Reached max decoder steps")
|
372 |
+
break
|
373 |
+
|
374 |
+
decoder_input = mel_output
|
375 |
+
|
376 |
+
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
|
377 |
+
mel_outputs, gate_outputs, alignments)
|
378 |
+
|
379 |
+
return mel_outputs, gate_outputs, alignments
|
Encoder.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import nn
|
2 |
+
from torch.nn import functional as F
|
3 |
+
from nn_layers import convolutional_module
|
4 |
+
|
5 |
+
|
6 |
+
class Encoder(nn.Module):
|
7 |
+
"""This is the encoder part of tacotron2. It includes a stack of three 1d convolutional layers
|
8 |
+
followed by batch normalization and ReLU activations, and a bidirectional LSTM layer.
|
9 |
+
These part encodes sequences of input characters."""
|
10 |
+
def __init__(self, encoder_params):
|
11 |
+
super(Encoder, self).__init__()
|
12 |
+
# we set the dropout applied at each convolutional layer, as specified in Tacotron2's paper
|
13 |
+
# self.dropout = nn.Dropout(0.5)
|
14 |
+
|
15 |
+
# A stack of convolution layers. For this model, there are 3 conv1d layers. We initialize a python
|
16 |
+
# list and run in a loop as many times as number of convolutional layers (three). In each
|
17 |
+
# iteration we initialize nn.Sequential container that permits us set a block of neural network
|
18 |
+
# modules. We need three equal nn sequences in a list. Then this list is properly registered using
|
19 |
+
# ModuleList class object (can act as an iterable, or be indexed).
|
20 |
+
# To see how the convolution is computed:
|
21 |
+
# https://pytorch.org/docs/stable/nn.html#conv1d
|
22 |
+
|
23 |
+
stack_of_convolutions = []
|
24 |
+
for _ in range(encoder_params['encoder_convs']):
|
25 |
+
conv_layer = nn.Sequential(convolutional_module(encoder_params['symbols_embedding_length'],
|
26 |
+
encoder_params['symbols_embedding_length'],
|
27 |
+
kernel_size=encoder_params['conv_kernel_size'],
|
28 |
+
stride=encoder_params['conv_stride'],
|
29 |
+
padding=int((encoder_params['conv_kernel_size'] - 1) / 2),
|
30 |
+
dilation=encoder_params['conv_dilation'],
|
31 |
+
w_init_gain=encoder_params['w_init_gain']),
|
32 |
+
nn.BatchNorm1d(encoder_params['symbols_embedding_length']))
|
33 |
+
stack_of_convolutions.append(conv_layer)
|
34 |
+
self.stack_conv = nn.ModuleList(stack_of_convolutions)
|
35 |
+
|
36 |
+
# Last part of the encoder is the bi-directional LSTM layer. As described in the original Tacotron2
|
37 |
+
# paper, there is only one BiLSTM layer with 256 units for each direction.
|
38 |
+
|
39 |
+
"""Can I add the bidirectional LSTM layer together with the convolutional stack??? CHECK IT OUT!"""
|
40 |
+
|
41 |
+
self.bi_lstm = nn.LSTM(encoder_params['symbols_embedding_length'],
|
42 |
+
int(encoder_params['symbols_embedding_length'] / 2), 1, batch_first=True,
|
43 |
+
bidirectional=True)
|
44 |
+
|
45 |
+
def forward(self, input_sequences, input_lengths):
|
46 |
+
for conv in self.stack_conv:
|
47 |
+
input_sequences = F.dropout(F.relu(conv(input_sequences)), 0.5, self.training)
|
48 |
+
|
49 |
+
input_sequences = input_sequences.transpose(1, 2)
|
50 |
+
# After convolution filters, is the original sequence length the same? CHECK IT OUT
|
51 |
+
input_lengths = input_lengths.cpu().numpy()
|
52 |
+
# Returns a packed sequence object with variable-length sequences before passing through BiLSTM layer
|
53 |
+
input_sequences = nn.utils.rnn.pack_padded_sequence(input_sequences, input_lengths, batch_first=True)
|
54 |
+
# nn.LSTM accepts packed variable length sequence tensors. The output will also return a packed variable
|
55 |
+
# length sequence tensor. The output dimension is (seq_length, batch, num_directions*hidden_size), but
|
56 |
+
# if batch_first is True, then (batch, seq_length, num_direction*hidden_size).
|
57 |
+
self.bi_lstm.flatten_parameters()
|
58 |
+
outputs, _ = self.bi_lstm(input_sequences)
|
59 |
+
# Pads again the tensor back to normal format before packing
|
60 |
+
outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
|
61 |
+
|
62 |
+
return outputs # [N, Max_seq_length, E_length]
|
63 |
+
|
64 |
+
def inference(self, x):
|
65 |
+
for conv in self.stack_conv:
|
66 |
+
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
|
67 |
+
|
68 |
+
x = x.transpose(1, 2)
|
69 |
+
|
70 |
+
self.bi_lstm.flatten_parameters()
|
71 |
+
outputs, _ = self.bi_lstm(x)
|
72 |
+
|
73 |
+
return outputs
|
GST.py
ADDED
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.init as init
|
4 |
+
import torch.nn.functional as F
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
|
8 |
+
class GST(nn.Module):
|
9 |
+
|
10 |
+
def __init__(self, hyper_parameters):
|
11 |
+
|
12 |
+
super().__init__()
|
13 |
+
self.prosody_extractor = LogMelSpecReferenceEncoder()
|
14 |
+
self.stl = MultiSTL(hyper_parameters=hyper_parameters)
|
15 |
+
|
16 |
+
def forward(self, logmel_spec, logmel_lengths):
|
17 |
+
prosody_features_embedded = self.prosody_extractor(logmel_spec, logmel_lengths) # [N, 512]
|
18 |
+
style_embed, gst_scores = self.stl(prosody_features_embedded)
|
19 |
+
|
20 |
+
return style_embed, gst_scores
|
21 |
+
|
22 |
+
def inference(self, scores): # NEED TO DEFINE SCORES TENSOR DIMENSION!!
|
23 |
+
style_embed_inference = self.stl.inference(scores=scores)
|
24 |
+
|
25 |
+
return style_embed_inference
|
26 |
+
|
27 |
+
|
28 |
+
class PitchContourEncoder(nn.Module):
|
29 |
+
"""
|
30 |
+
|
31 |
+
"""
|
32 |
+
def __init__(self, hyper_parameters):
|
33 |
+
|
34 |
+
super().__init__()
|
35 |
+
|
36 |
+
K = len(hyper_parameters['ref_enc_out_channels'])
|
37 |
+
filters = [1] + hyper_parameters['ref_enc_out_channels']
|
38 |
+
kernel_sizes = hyper_parameters['seq_ref_enc_filter_size']
|
39 |
+
|
40 |
+
convs_2d = []
|
41 |
+
|
42 |
+
for i in range(K):
|
43 |
+
conv2d_init = nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
|
44 |
+
kernel_size=(kernel_sizes[i], 3), stride=(1, 1),
|
45 |
+
padding=(int((kernel_sizes[i] - 1) / 2), int((3 - 1) / 2)), bias=True)
|
46 |
+
|
47 |
+
nn.init.xavier_uniform_(conv2d_init.weight, gain=torch.nn.init.calculate_gain('linear'))
|
48 |
+
|
49 |
+
convs_2d.append(conv2d_init)
|
50 |
+
|
51 |
+
self.convs2D = nn.ModuleList(convs_2d)
|
52 |
+
|
53 |
+
self.bns2D = nn.ModuleList([nn.BatchNorm2d(num_features=hyper_parameters['ref_enc_out_channels'][i])
|
54 |
+
for i in range(K)])
|
55 |
+
|
56 |
+
# WEIGHT INITIALIZATION DEFAULT:
|
57 |
+
self.prosody_bi_lstm = nn.LSTM(input_size=int(176), hidden_size=int(512/2), num_layers=1, batch_first=True,
|
58 |
+
bidirectional=True)
|
59 |
+
|
60 |
+
def forward(self, bin_locations): # [N, BIN_SUBAND, LEN_MELSPEC] (BIN_SUBAND = 13)
|
61 |
+
N = bin_locations.size(0) # Number of samples
|
62 |
+
# Changing tensor dimensions to have 1 input channel for the first conv2D layer:
|
63 |
+
bin_locations = bin_locations.unsqueeze(1)
|
64 |
+
bin_locations = bin_locations.transpose(2, 3) # [N, 1, LEN_MELSPEC, BIN_SUBAND]
|
65 |
+
"""We implement ReLU gates at the output of Conv. layers. We could check it without"""
|
66 |
+
# For pitch tracking:
|
67 |
+
for conv2, bn2 in zip(self.convs2D, self.bns2D):
|
68 |
+
bin_locations = conv2(bin_locations)
|
69 |
+
bin_locations = bn2(bin_locations)
|
70 |
+
bin_locations = F.dropout(F.relu(bin_locations), 0.5, self.training) # [N, Cout, LEN_MELSPEC, BIN_SUBAND]
|
71 |
+
|
72 |
+
# Resize:
|
73 |
+
bin_locations = bin_locations.transpose(1, 2) # [N, LEN_MELSPEC, Cout, BIN_SUBAND]
|
74 |
+
T = bin_locations.size(1)
|
75 |
+
bin_locations = bin_locations.contiguous().view(N, T, -1) # [N, LEN_MELSPEC, Cout*BIN_SUBAND]
|
76 |
+
|
77 |
+
# Encode sequences into a bidirectional LSTM layer:
|
78 |
+
"""In our case, we do not care about the specific length of each sequence, as with the zero padding the encoder
|
79 |
+
should be able to also encode the different lengths and see zero when its over. That is why we do not apply
|
80 |
+
a packing padded sequence before LSTM layer."""
|
81 |
+
_, (encoded_prosody, cell_state) = self.prosody_bi_lstm(bin_locations)
|
82 |
+
|
83 |
+
encoded_prosody = encoded_prosody.transpose(0, 1)
|
84 |
+
encoded_prosody = encoded_prosody.contiguous().view(N, -1)
|
85 |
+
|
86 |
+
return encoded_prosody # should be [N, 512]
|
87 |
+
|
88 |
+
|
89 |
+
# DENSE GST Reference Encoder:
|
90 |
+
class ProsodyEncoder(nn.Module):
|
91 |
+
"""
|
92 |
+
This convolution class nn.Module performs two parallel convolution stacks, 1-D conv. and another 2-D conv.
|
93 |
+
Afterwards, the output of both will be concatenated to be passed, later, through a bidirectional LSTM layer.
|
94 |
+
"""
|
95 |
+
def __init__(self, hyper_parameters):
|
96 |
+
|
97 |
+
super().__init__()
|
98 |
+
|
99 |
+
K = len(hyper_parameters['ref_enc_out_channels'])
|
100 |
+
filters = [1] + hyper_parameters['ref_enc_out_channels']
|
101 |
+
kernel_sizes = hyper_parameters['seq_ref_enc_filter_size']
|
102 |
+
|
103 |
+
# I NEED TO ADJUST PADDING TO NOT LOSE THE TOTAL LENGTH OF SEQUENCE!!
|
104 |
+
convs_1d = []
|
105 |
+
convs_2d = []
|
106 |
+
|
107 |
+
for i in range(K):
|
108 |
+
conv1d_init = nn.Conv1d(in_channels=filters[i], out_channels=filters[i + 1],
|
109 |
+
kernel_size=kernel_sizes[i], stride=1,
|
110 |
+
padding=int((kernel_sizes[i] - 1) / 2), bias=True)
|
111 |
+
|
112 |
+
nn.init.xavier_uniform_(conv1d_init.weight, gain=torch.nn.init.calculate_gain('linear'))
|
113 |
+
|
114 |
+
convs_1d.append(conv1d_init)
|
115 |
+
|
116 |
+
conv2d_init = nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
|
117 |
+
kernel_size=(kernel_sizes[i], 3), stride=(1, 1),
|
118 |
+
padding=(int((kernel_sizes[i] - 1) / 2), int((3 - 1) / 2)), bias=True)
|
119 |
+
|
120 |
+
nn.init.xavier_uniform_(conv2d_init.weight, gain=torch.nn.init.calculate_gain('linear'))
|
121 |
+
|
122 |
+
convs_2d.append(conv2d_init)
|
123 |
+
|
124 |
+
self.convs1D = nn.ModuleList(convs_1d)
|
125 |
+
self.convs2D = nn.ModuleList(convs_2d)
|
126 |
+
|
127 |
+
self.bns1D = nn.ModuleList([nn.BatchNorm1d(num_features=hyper_parameters['ref_enc_out_channels'][i])
|
128 |
+
for i in range(K)])
|
129 |
+
self.bns2D = nn.ModuleList([nn.BatchNorm2d(num_features=hyper_parameters['ref_enc_out_channels'][i])
|
130 |
+
for i in range(K)])
|
131 |
+
|
132 |
+
self.prosody_linear = nn.Linear(512, 256, bias=True)
|
133 |
+
torch.nn.init.xavier_uniform_(self.prosody_linear.weight, gain=torch.nn.init.calculate_gain('linear'))
|
134 |
+
|
135 |
+
# WEIGHT INITIALIZATION DEFAULT:
|
136 |
+
self.prosody_bi_lstm = nn.LSTM(input_size=int(256), hidden_size=int(512/2), num_layers=1, batch_first=True,
|
137 |
+
bidirectional=True)
|
138 |
+
|
139 |
+
def forward(self, bin_locations, pitch_intensities): # [N, LEN_MELSPEC, 1], [N, LEN_MELSPEC, 3]
|
140 |
+
N = bin_locations.size(0) # Number of samples
|
141 |
+
num_intensities = pitch_intensities.size(2)
|
142 |
+
# Changing tensor dimensions to have 1 input channel for the first conv2D layer:
|
143 |
+
pitch_intensities = pitch_intensities.view(N, 1, -1, num_intensities) # [N, 1, LEN_MELSPEC, num_intensities]
|
144 |
+
bin_locations = bin_locations.transpose(1, 2) # [N, 1, LEN_MELSPEC]
|
145 |
+
"""We implement ReLU gates at the output of Conv. layers. We could check it without"""
|
146 |
+
# For pitch tracking:
|
147 |
+
for conv, bn in zip(self.convs1D, self.bns1D):
|
148 |
+
bin_locations = conv(bin_locations)
|
149 |
+
bin_locations = bn(bin_locations)
|
150 |
+
bin_locations = F.dropout(F.relu(bin_locations), 0.5, self.training) # [N, Cout, T]
|
151 |
+
|
152 |
+
# For pitch intensities:
|
153 |
+
for conv2, bn2 in zip(self.convs2D, self.bns2D):
|
154 |
+
pitch_intensities = conv2(pitch_intensities)
|
155 |
+
pitch_intensities = bn2(pitch_intensities)
|
156 |
+
pitch_intensities = F.dropout(F.relu(pitch_intensities), 0.5, self.training) # [N, Cout, T, bins]
|
157 |
+
|
158 |
+
# Resize pitch intensities
|
159 |
+
bin_locations = bin_locations.transpose(1, 2) # [N, T, Cout]
|
160 |
+
pitch_intensities = pitch_intensities.transpose(1, 2) # [N, T, Cout, bins]
|
161 |
+
T = pitch_intensities.size(1)
|
162 |
+
pitch_intensities = pitch_intensities.contiguous().view(N, T, -1) # [N, T, Cout*bins]
|
163 |
+
|
164 |
+
# Concatenate features
|
165 |
+
pitch_convolved = torch.cat((bin_locations, pitch_intensities), 2)
|
166 |
+
|
167 |
+
# Linear projection (IS IT NECESSARY? DOES ACTIVATION FUNCTION IMPROVE THE RESULT?)
|
168 |
+
projection_pitch_convolved = F.dropout(F.tanh(self.prosody_linear(pitch_convolved)), 0.5, self.training)
|
169 |
+
|
170 |
+
# Encode sequences into a bidirectional LSTM layer:
|
171 |
+
"""In our case, we do not care about the specific length of each sequence, as with the zero padding the encoder
|
172 |
+
should be able to also encode the different lengths and see zero when its over. That is why we do not apply
|
173 |
+
a packing padded sequence before LSTM layer."""
|
174 |
+
_, (encoded_prosody, cell_state) = self.prosody_bi_lstm(projection_pitch_convolved)
|
175 |
+
|
176 |
+
encoded_prosody = encoded_prosody.transpose(0, 1)
|
177 |
+
encoded_prosody = encoded_prosody.contiguous().view(N, -1)
|
178 |
+
|
179 |
+
return encoded_prosody # should be [N, 512]
|
180 |
+
|
181 |
+
|
182 |
+
class LogMelSpecReferenceEncoder(nn.Module):
|
183 |
+
"""
|
184 |
+
"""
|
185 |
+
def __init__(self):
|
186 |
+
|
187 |
+
super().__init__()
|
188 |
+
|
189 |
+
reference_encoder_out_channels = [32, 32, 64, 64, 128, 128]
|
190 |
+
K = len(reference_encoder_out_channels)
|
191 |
+
filters = [1] + reference_encoder_out_channels
|
192 |
+
kernel_size = (3, 3)
|
193 |
+
stride = (2, 2)
|
194 |
+
padding = (1, 1)
|
195 |
+
|
196 |
+
convs_2d = []
|
197 |
+
|
198 |
+
for i in range(K):
|
199 |
+
conv2d_init = nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
|
200 |
+
kernel_size=kernel_size, stride=stride,
|
201 |
+
padding=padding, bias=True)
|
202 |
+
|
203 |
+
nn.init.xavier_uniform_(conv2d_init.weight, gain=torch.nn.init.calculate_gain('linear'))
|
204 |
+
|
205 |
+
convs_2d.append(conv2d_init)
|
206 |
+
|
207 |
+
self.convs2D = nn.ModuleList(convs_2d)
|
208 |
+
self.bns2D = nn.ModuleList([nn.BatchNorm2d(num_features=reference_encoder_out_channels[i])
|
209 |
+
for i in range(K)])
|
210 |
+
|
211 |
+
out_channels = self.calculate_channels(80, 3, 2, 1, K)
|
212 |
+
# self.gru = nn.GRU(input_size=reference_encoder_out_channels[-1] * out_channels, hidden_size=512,
|
213 |
+
# batch_first=True, bidirectional=False)
|
214 |
+
|
215 |
+
# WEIGHT INITIALIZATION DEFAULT:
|
216 |
+
self.bi_lstm = nn.LSTM(input_size=reference_encoder_out_channels[-1] * out_channels,
|
217 |
+
hidden_size=int(512/2), num_layers=1, batch_first=True, bidirectional=True)
|
218 |
+
|
219 |
+
def forward(self, logmel_spec, logmel_lengths): # [N, MEL_CHANNELS, LEN_MELSPEC]
|
220 |
+
N = logmel_spec.size(0) # Number of samples
|
221 |
+
# Changing tensor dimensions to have 1 input channel for the first conv2D layer:
|
222 |
+
logmel_spec = logmel_spec.unsqueeze(1)
|
223 |
+
logmel_spec = logmel_spec.transpose(2, 3) # [N, 1, LEN_MELSPEC, MEL_CHANNELS]
|
224 |
+
"""We implement ReLU gates at the output of Conv. layers. We could check it without"""
|
225 |
+
for conv2, bn2 in zip(self.convs2D, self.bns2D):
|
226 |
+
logmel_spec = conv2(logmel_spec)
|
227 |
+
logmel_spec = bn2(logmel_spec)
|
228 |
+
logmel_spec = F.dropout(F.relu(logmel_spec), 0.5, self.training) # [N, Cout, LEN_MELSPEC, BIN_SUBAND]
|
229 |
+
|
230 |
+
# Resize:
|
231 |
+
logmel_spec = logmel_spec.transpose(1, 2) # [N, LEN_MELSPEC, Cout, MEL_CHANNELS]
|
232 |
+
T = logmel_spec.size(1)
|
233 |
+
logmel_spec = logmel_spec.contiguous().view(N, T, -1) # [N, LEN_MELSPEC, Cout*BIN_SUBAND]
|
234 |
+
|
235 |
+
logmel_lengths = logmel_lengths.cpu().numpy()
|
236 |
+
last_hidden_states = torch.zeros(N, 512)
|
237 |
+
|
238 |
+
logmel_after_lengths = np.trunc(logmel_lengths / 2**6)
|
239 |
+
logmel_after_lengths = logmel_after_lengths + 1
|
240 |
+
logmel_after_lengths = logmel_after_lengths.astype(int)
|
241 |
+
logmel_after_lengths = torch.tensor(logmel_after_lengths)
|
242 |
+
# logmel_spec = nn.utils.rnn.pack_padded_sequence(logmel_spec, logmel_after_lengths, batch_first=True)
|
243 |
+
self.bi_lstm.flatten_parameters()
|
244 |
+
# memory, out = self.gru(logmel_spec)
|
245 |
+
outputs, (hidden_states, cell_state) = self.bi_lstm(logmel_spec)
|
246 |
+
hidden_states = hidden_states.transpose(0, 1)
|
247 |
+
hidden_states = hidden_states.contiguous().view(N, -1)
|
248 |
+
# outputs, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
|
249 |
+
|
250 |
+
# for j in range(N):
|
251 |
+
# last_hidden_states[j, :] = outputs[j, logmel_after_lengths[j] - 1, :]
|
252 |
+
|
253 |
+
# return last_hidden_states.cuda(non_blocking=True)
|
254 |
+
return hidden_states
|
255 |
+
|
256 |
+
def calculate_channels(self, L, kernel_size, stride, padding, n_convs):
|
257 |
+
for i in range(n_convs):
|
258 |
+
L = (L - kernel_size + 2 * padding) // stride + 1
|
259 |
+
return L
|
260 |
+
|
261 |
+
|
262 |
+
# BASIC FORM FOR NOW. NEEDS TO BE EXPANDED TO OUR NEW PROPOSAL
|
263 |
+
class MultiSTL(nn.Module):
|
264 |
+
|
265 |
+
"""
|
266 |
+
inputs --- [N, E]
|
267 |
+
"""
|
268 |
+
|
269 |
+
def __init__(self, hyper_parameters):
|
270 |
+
|
271 |
+
super().__init__()
|
272 |
+
# E = 256 / num_heads = 8 / token_num = 10!!
|
273 |
+
self.embed = nn.Parameter(torch.FloatTensor(hyper_parameters['token_num'],
|
274 |
+
hyper_parameters['E'] // hyper_parameters['num_heads']))
|
275 |
+
# d_q = hyper_parameters['E'] // 2
|
276 |
+
d_q = hyper_parameters['E']
|
277 |
+
d_k = hyper_parameters['E'] // hyper_parameters['num_heads']
|
278 |
+
|
279 |
+
self.attention = MultiHeadAttention(query_dim=d_q, key_dim=d_k,
|
280 |
+
num_units=hyper_parameters['E'], num_heads=hyper_parameters['num_heads'])
|
281 |
+
|
282 |
+
init.xavier_uniform_(self.embed, gain=init.calculate_gain('linear'))
|
283 |
+
|
284 |
+
def forward(self, inputs):
|
285 |
+
N = inputs.size(0) # Number of samples in the batch
|
286 |
+
query = inputs.unsqueeze(1) # [N, 1, E]
|
287 |
+
keys = F.tanh(self.embed).unsqueeze(0).expand(N, -1, -1) # [N, token_num, E // num_heads]
|
288 |
+
style_embed, gst_scores = self.attention(query, keys)
|
289 |
+
|
290 |
+
return style_embed, gst_scores
|
291 |
+
|
292 |
+
def inference(self, scores):
|
293 |
+
keys = F.tanh(self.embed).unsqueeze(0)
|
294 |
+
style_embed_inference = self.attention.inference(keys, scores=scores)
|
295 |
+
|
296 |
+
return style_embed_inference
|
297 |
+
|
298 |
+
|
299 |
+
class MultiHeadAttention(nn.Module):
|
300 |
+
"""
|
301 |
+
input:
|
302 |
+
query --- [N, T_q, query_dim] T_q = 1
|
303 |
+
key --- [N, T_k, key_dim] T_k = 5 (num of tokens)
|
304 |
+
output:
|
305 |
+
out --- [N, T_q, num_units]
|
306 |
+
"""
|
307 |
+
|
308 |
+
def __init__(self, query_dim, key_dim, num_units, num_heads):
|
309 |
+
|
310 |
+
super().__init__()
|
311 |
+
self.num_units = num_units
|
312 |
+
self.num_heads = num_heads
|
313 |
+
self.key_dim = key_dim
|
314 |
+
#self.sparse_max = Sparsemax(dim=3)
|
315 |
+
|
316 |
+
# Linear projection of data (encoder and decoder states) into a fixed number of hidden units
|
317 |
+
self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
|
318 |
+
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
319 |
+
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
320 |
+
|
321 |
+
def forward(self, query, key):
|
322 |
+
|
323 |
+
querys = self.W_query(query) # [N, T_q, num_units] the last dimension changes according to the output dim
|
324 |
+
keys = self.W_key(key) # [N, T_k, num_units]
|
325 |
+
values = self.W_value(key)
|
326 |
+
|
327 |
+
# the number of units set at the initialization is the total of hidden feature units we want. Then, we will
|
328 |
+
# assign a specific number of num_units according to the number of heads of the multi head Attention.
|
329 |
+
|
330 |
+
# Basically, style tokens are the number of heads we configure to learn different types of attention
|
331 |
+
#
|
332 |
+
split_size = self.num_units // self.num_heads # integer division, without remainder
|
333 |
+
querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h]
|
334 |
+
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
335 |
+
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
336 |
+
|
337 |
+
# score = softmax(QK^T / (d_k ** 0.5))
|
338 |
+
scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
|
339 |
+
scores = scores / (self.key_dim ** 0.33) # cube root instead of square to prevent very small values
|
340 |
+
scores = F.softmax(scores, dim=3) # From dimension 3, length of Key sequences.
|
341 |
+
# scores = self.sparse_max(scores)
|
342 |
+
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
|
343 |
+
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
|
344 |
+
scores = scores.squeeze()
|
345 |
+
|
346 |
+
return out, scores
|
347 |
+
|
348 |
+
def inference(self, key, scores): # key [1, 5, 512/8] # [1, num_tokens]
|
349 |
+
"""Only need the keys that are already trained, and the scores that I impose"""
|
350 |
+
scores = scores.unsqueeze(0).unsqueeze(0).unsqueeze(0).expand(self.num_heads, -1, -1, -1)
|
351 |
+
# print(scores.shape)
|
352 |
+
values = self.W_value(key)
|
353 |
+
|
354 |
+
# the number of units set at the initialization is the total of hidden feature units we want. Then, we will
|
355 |
+
# assign a specific number of num_units according to the number of heads of the multi head Attention.
|
356 |
+
|
357 |
+
# Basically, style tokens are the number of heads we configure to learn different types of attention
|
358 |
+
#
|
359 |
+
split_size = self.num_units // self.num_heads # integer division, without remainder
|
360 |
+
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
361 |
+
|
362 |
+
# score = softmax(QK^T / (d_k ** 0.5))
|
363 |
+
|
364 |
+
# out = score * V
|
365 |
+
out = torch.matmul(scores, values) # [h, 1, T_q = 1, num_units/h]
|
366 |
+
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
|
367 |
+
|
368 |
+
return out
|
Postnet.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
from torch.nn import functional as F
|
4 |
+
from nn_layers import convolutional_module
|
5 |
+
|
6 |
+
|
7 |
+
class Postnet(nn.Module):
|
8 |
+
"""Postnet
|
9 |
+
- Five 1-d convolution with 512 channels and kernel size 5
|
10 |
+
"""
|
11 |
+
|
12 |
+
def __init__(self, tacotron_hyperparams):
|
13 |
+
super(Postnet, self).__init__()
|
14 |
+
# self.dropout = nn.Dropout(0.5)
|
15 |
+
self.convolutions = nn.ModuleList()
|
16 |
+
|
17 |
+
self.convolutions.append(
|
18 |
+
nn.Sequential(
|
19 |
+
convolutional_module(tacotron_hyperparams['n_mel_channels'],
|
20 |
+
tacotron_hyperparams['postnet_embedding_dim'],
|
21 |
+
kernel_size=tacotron_hyperparams['postnet_kernel_size'], stride=1,
|
22 |
+
padding=int((tacotron_hyperparams['postnet_kernel_size'] - 1) / 2),
|
23 |
+
dilation=1, w_init_gain='tanh'),
|
24 |
+
nn.BatchNorm1d(tacotron_hyperparams['postnet_embedding_dim']))
|
25 |
+
)
|
26 |
+
|
27 |
+
for i in range(1, tacotron_hyperparams['postnet_n_convolutions'] - 1):
|
28 |
+
self.convolutions.append(
|
29 |
+
nn.Sequential(
|
30 |
+
convolutional_module(tacotron_hyperparams['postnet_embedding_dim'],
|
31 |
+
tacotron_hyperparams['postnet_embedding_dim'],
|
32 |
+
kernel_size=tacotron_hyperparams['postnet_kernel_size'], stride=1,
|
33 |
+
padding=int((tacotron_hyperparams['postnet_kernel_size'] - 1) / 2),
|
34 |
+
dilation=1, w_init_gain='tanh'),
|
35 |
+
nn.BatchNorm1d(tacotron_hyperparams['postnet_embedding_dim']))
|
36 |
+
)
|
37 |
+
|
38 |
+
self.convolutions.append(
|
39 |
+
nn.Sequential(
|
40 |
+
convolutional_module(tacotron_hyperparams['postnet_embedding_dim'],
|
41 |
+
tacotron_hyperparams['n_mel_channels'],
|
42 |
+
kernel_size=tacotron_hyperparams['postnet_kernel_size'], stride=1,
|
43 |
+
padding=int((tacotron_hyperparams['postnet_kernel_size'] - 1) / 2),
|
44 |
+
dilation=1, w_init_gain='linear'),
|
45 |
+
nn.BatchNorm1d(tacotron_hyperparams['n_mel_channels']))
|
46 |
+
)
|
47 |
+
|
48 |
+
def forward(self, x):
|
49 |
+
for i in range(len(self.convolutions) - 1):
|
50 |
+
x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
|
51 |
+
x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
|
52 |
+
return x
|
Tacotron2.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from math import sqrt
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
|
6 |
+
from Encoder import Encoder
|
7 |
+
from Decoder import Decoder
|
8 |
+
from Postnet import Postnet
|
9 |
+
from GST import GST
|
10 |
+
|
11 |
+
from utils import to_gpu, get_mask_from_lengths
|
12 |
+
from fp16_optimizer import fp32_to_fp16, fp16_to_fp32
|
13 |
+
|
14 |
+
|
15 |
+
class tacotron_2(nn.Module):
|
16 |
+
def __init__(self, tacotron_hyperparams):
|
17 |
+
super(tacotron_2, self).__init__()
|
18 |
+
self.mask_padding = tacotron_hyperparams['mask_padding']
|
19 |
+
self.fp16_run = tacotron_hyperparams['fp16_run']
|
20 |
+
self.n_mel_channels = tacotron_hyperparams['n_mel_channels']
|
21 |
+
self.n_frames_per_step = tacotron_hyperparams['number_frames_step']
|
22 |
+
self.embedding = nn.Embedding(
|
23 |
+
tacotron_hyperparams['n_symbols'], tacotron_hyperparams['symbols_embedding_length'])
|
24 |
+
# CHECK THIS OUT!!!
|
25 |
+
std = sqrt(2.0 / (tacotron_hyperparams['n_symbols'] + tacotron_hyperparams['symbols_embedding_length']))
|
26 |
+
val = sqrt(3.0) * std
|
27 |
+
self.embedding.weight.data.uniform_(-val, val)
|
28 |
+
self.encoder = Encoder(tacotron_hyperparams)
|
29 |
+
self.decoder = Decoder(tacotron_hyperparams)
|
30 |
+
self.postnet = Postnet(tacotron_hyperparams)
|
31 |
+
self.gst = GST(tacotron_hyperparams)
|
32 |
+
|
33 |
+
def parse_batch(self, batch):
|
34 |
+
# GST I add the new tensor from prosody features to train GST tokens:
|
35 |
+
text_padded, input_lengths, mel_padded, gate_padded, output_lengths, prosody_padded = batch
|
36 |
+
text_padded = to_gpu(text_padded).long()
|
37 |
+
max_len = int(torch.max(input_lengths.data).item()) # With item() you get the pure value (not in a tensor)
|
38 |
+
input_lengths = to_gpu(input_lengths).long()
|
39 |
+
mel_padded = to_gpu(mel_padded).float()
|
40 |
+
gate_padded = to_gpu(gate_padded).float()
|
41 |
+
output_lengths = to_gpu(output_lengths).long()
|
42 |
+
prosody_padded = to_gpu(prosody_padded).float()
|
43 |
+
|
44 |
+
return (
|
45 |
+
(text_padded, input_lengths, mel_padded, max_len, output_lengths, prosody_padded),
|
46 |
+
(mel_padded, gate_padded))
|
47 |
+
|
48 |
+
def parse_input(self, inputs):
|
49 |
+
inputs = fp32_to_fp16(inputs) if self.fp16_run else inputs
|
50 |
+
return inputs
|
51 |
+
|
52 |
+
def parse_output(self, outputs, output_lengths=None):
|
53 |
+
if self.mask_padding and output_lengths is not None:
|
54 |
+
mask = ~get_mask_from_lengths(output_lengths)
|
55 |
+
mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
|
56 |
+
mask = mask.permute(1, 0, 2)
|
57 |
+
|
58 |
+
outputs[0].data.masked_fill_(mask, 0.0)
|
59 |
+
outputs[1].data.masked_fill_(mask, 0.0)
|
60 |
+
outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies
|
61 |
+
|
62 |
+
outputs = fp16_to_fp32(outputs) if self.fp16_run else outputs
|
63 |
+
|
64 |
+
return outputs
|
65 |
+
|
66 |
+
def forward(self, inputs):
|
67 |
+
inputs, input_lengths, targets, max_len, output_lengths, gst_prosody_padded = self.parse_input(inputs)
|
68 |
+
input_lengths, output_lengths = input_lengths.data, output_lengths.data
|
69 |
+
|
70 |
+
embedded_inputs = self.embedding(inputs).transpose(1, 2)
|
71 |
+
|
72 |
+
encoder_outputs = self.encoder(embedded_inputs, input_lengths)
|
73 |
+
|
74 |
+
# GST style embedding plus embedded_inputs before entering the decoder
|
75 |
+
# bin_locations = gst_prosody_padded[:, 0, :]
|
76 |
+
# pitch_intensities = gst_prosody_padded[:, 1:, :]
|
77 |
+
# bin_locations = bin_locations.unsqueeze(2)
|
78 |
+
gst_style_embedding, gst_scores = self.gst(gst_prosody_padded, output_lengths) # [N, 512]
|
79 |
+
gst_style_embedding = gst_style_embedding.expand_as(encoder_outputs)
|
80 |
+
|
81 |
+
encoder_outputs = encoder_outputs + gst_style_embedding
|
82 |
+
|
83 |
+
mel_outputs, gate_outputs, alignments = self.decoder(
|
84 |
+
encoder_outputs, targets, memory_lengths=input_lengths)
|
85 |
+
mel_outputs_postnet = self.postnet(mel_outputs)
|
86 |
+
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
|
87 |
+
|
88 |
+
return self.parse_output(
|
89 |
+
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments, gst_scores],
|
90 |
+
output_lengths)
|
91 |
+
|
92 |
+
def inference(self, inputs, gst_scores): # gst_scores must be a torch tensor
|
93 |
+
inputs = self.parse_input(inputs)
|
94 |
+
embedded_inputs = self.embedding(inputs).transpose(1, 2)
|
95 |
+
encoder_outputs = self.encoder.inference(embedded_inputs)
|
96 |
+
|
97 |
+
# GST inference:
|
98 |
+
gst_style_embedding = self.gst.inference(gst_scores)
|
99 |
+
gst_style_embedding = gst_style_embedding.expand_as(encoder_outputs)
|
100 |
+
|
101 |
+
encoder_outputs = encoder_outputs + gst_style_embedding
|
102 |
+
|
103 |
+
mel_outputs, gate_outputs, alignments = self.decoder.inference(
|
104 |
+
encoder_outputs)
|
105 |
+
|
106 |
+
mel_outputs_postnet = self.postnet(mel_outputs)
|
107 |
+
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
|
108 |
+
|
109 |
+
outputs = self.parse_output(
|
110 |
+
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
|
111 |
+
|
112 |
+
return outputs
|
audio_processing.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
from scipy.signal import get_window
|
4 |
+
import librosa.util as librosa_util
|
5 |
+
|
6 |
+
|
7 |
+
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
|
8 |
+
n_fft=800, dtype=np.float32, norm=None):
|
9 |
+
"""
|
10 |
+
# from librosa 0.6
|
11 |
+
Compute the sum-square envelope of a window function at a given hop length.
|
12 |
+
|
13 |
+
This is used to estimate modulation effects induced by windowing
|
14 |
+
observations in short-time fourier transforms.
|
15 |
+
|
16 |
+
Parameters
|
17 |
+
----------
|
18 |
+
window : string, tuple, number, callable, or list-like
|
19 |
+
Window specification, as in `get_window`
|
20 |
+
|
21 |
+
n_frames : int > 0
|
22 |
+
The number of analysis frames
|
23 |
+
|
24 |
+
hop_length : int > 0
|
25 |
+
The number of samples to advance between frames
|
26 |
+
|
27 |
+
win_length : [optional]
|
28 |
+
The length of the window function. By default, this matches `n_fft`.
|
29 |
+
|
30 |
+
n_fft : int > 0
|
31 |
+
The length of each analysis frame.
|
32 |
+
|
33 |
+
dtype : np.dtype
|
34 |
+
The data type of the output
|
35 |
+
|
36 |
+
Returns
|
37 |
+
-------
|
38 |
+
wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
|
39 |
+
The sum-squared envelope of the window function
|
40 |
+
"""
|
41 |
+
if win_length is None:
|
42 |
+
win_length = n_fft
|
43 |
+
|
44 |
+
n = n_fft + hop_length * (n_frames - 1)
|
45 |
+
x = np.zeros(n, dtype=dtype)
|
46 |
+
|
47 |
+
# Compute the squared window at the desired length
|
48 |
+
win_sq = get_window(window, win_length, fftbins=True)
|
49 |
+
win_sq = librosa_util.normalize(win_sq, norm=norm)**2
|
50 |
+
win_sq = librosa_util.pad_center(win_sq, n_fft)
|
51 |
+
|
52 |
+
# Fill the envelope
|
53 |
+
for i in range(n_frames):
|
54 |
+
sample = i * hop_length
|
55 |
+
x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
|
56 |
+
return x
|
57 |
+
|
58 |
+
|
59 |
+
def griffin_lim(magnitudes, stft_fn, n_iters=30):
|
60 |
+
"""
|
61 |
+
PARAMS
|
62 |
+
------
|
63 |
+
magnitudes: spectrogram magnitudes
|
64 |
+
stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
|
65 |
+
"""
|
66 |
+
|
67 |
+
angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
|
68 |
+
angles = angles.astype(np.float32)
|
69 |
+
angles = torch.autograd.Variable(torch.from_numpy(angles))
|
70 |
+
signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
|
71 |
+
|
72 |
+
for i in range(n_iters):
|
73 |
+
_, angles = stft_fn.transform(signal)
|
74 |
+
signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
|
75 |
+
return signal
|
76 |
+
|
77 |
+
|
78 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-5):
|
79 |
+
"""
|
80 |
+
PARAMS
|
81 |
+
------
|
82 |
+
C: compression factor
|
83 |
+
"""
|
84 |
+
return torch.log(torch.clamp(x, min=clip_val) * C)
|
85 |
+
|
86 |
+
|
87 |
+
def dynamic_range_decompression(x, C=1):
|
88 |
+
"""
|
89 |
+
PARAMS
|
90 |
+
------
|
91 |
+
C: compression factor used to compress
|
92 |
+
"""
|
93 |
+
return torch.exp(x) / C
|
hyper_parameters.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from text import symbols

# Python dictionary holding all hyper-parameters

tacotron_params = {'filter_length': 1024,  # audio parameters:
                   'hop_length': 256,
                   'win_length': 1024,
                   'n_mel_channels': 80,
                   'mel_fmin': 0.0,
                   'mel_fmax': 8000.0,
                   'sampling_rate': 22050,
                   'max_wav_value': 32768.0,
                   'clipping_value': 1e-5,
                   'C': 1,
                   # dataset parameters:
                   'load_mel_from_disk': False,
                   'sort_by_length': False,
                   'text_cleaners': ['english_cleaners'],
                   # embedding parameters:
                   'symbols_embedding_length': 512,
                   'n_symbols': len(symbols),
                   # encoder parameters:
                   'encoder_embedding_dim': 512,
                   'encoder_convs': 3,
                   'conv_kernel_size': 5,
                   'conv_stride': 1,
                   'conv_dilation': 1,
                   'w_init_gain': 'relu',
                   # decoder parameters:
                   'number_frames_step': 1,
                   'decoder_rnn_dim': 1024,
                   'prenet_dim': 256,
                   'max_decoder_steps': 1000,
                   'gate_threshold': 0.5,  # needs to be reviewed
                   'p_attention_dropout': 0.1,
                   'p_decoder_dropout': 0.1,
                   # attention parameters:
                   'attention_rnn_dim': 1024,
                   'attention_dim': 128,
                   # location features parameters:
                   'attention_location_n_filters': 32,
                   'attention_location_kernel_size': 31,
                   # postnet parameters:
                   'postnet_embedding_dim': 512,
                   'postnet_kernel_size': 5,
                   'postnet_n_convolutions': 5,
                   # GST parameters:
                   'E': 512,
                   'token_num': 3,
                   'num_heads': 1,
                   'seq_ref_enc_filter_size': [3, 7, 11],  # phoneme, word/silence and utterance levels respectively
                   'ref_enc_out_channels': [8, 16, 16],
                   # optimization parameters:
                   'use_saved_learning_rate': True,
                   'batch_size': 32,  # previously 64; must be larger than, and an integer multiple of, the number of GPUs
                   'learning_rate': 1e-3,
                   'weight_decay': 1e-6,
                   'grad_clip_thresh': 1.0,
                   'mask_padding': False,
                   # experiment parameters:
                   'epochs': 300,  # 160, 500
                   'iters_per_checkpoint': 1500,  # 1000. How many iterations before validating
                   'seed': 1234,
                   'dynamic_loss_scaling': True,  # CHECK IT OUT!
                   'distributed_run': False,
                   'dist_backend': 'nccl',
                   'dist_url': "/home/alex/PyTorch_TACOTRON_2/pycharm-tacotron2",  # CHECK IT OUT!
                   'cudnn_enabled': True,
                   'cudnn_benchmark': False,
                   'fp16_run': False}
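Purely illustrative (not one of the uploaded files): the training and inference scripts would read values out of this dictionary roughly as follows, for example to build the mel-extraction front end from nn_layers.py.

from hyper_parameters import tacotron_params as hp
from nn_layers import TacotronSTFT

# build the STFT/mel front end from the audio hyper-parameters above
stft = TacotronSTFT(filter_length=hp['filter_length'],
                    hop_length=hp['hop_length'],
                    win_length=hp['win_length'],
                    n_mel_channels=hp['n_mel_channels'],
                    sampling_rate=hp['sampling_rate'],
                    mel_fmin=hp['mel_fmin'],
                    mel_fmax=hp['mel_fmax'])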
logger.py
ADDED
@@ -0,0 +1,47 @@
import random
import torch
from tensorboardX import SummaryWriter
from plotting_utils import plot_alignment_to_numpy, plot_gst_scores_to_numpy, plot_spectrogram_to_numpy
from plotting_utils import plot_gate_outputs_to_numpy


class Tacotron2Logger(SummaryWriter):
    def __init__(self, logdir):
        super(Tacotron2Logger, self).__init__(logdir)

    def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
                     iteration):
        self.add_scalar("training.loss", reduced_loss, iteration)
        self.add_scalar("grad.norm", grad_norm, iteration)
        self.add_scalar("learning.rate", learning_rate, iteration)
        self.add_scalar("duration", duration, iteration)

    def log_validation(self, reduced_loss, model, y, y_pred, gst_scores, iteration):
        self.add_scalar("validation.loss", reduced_loss, iteration)
        _, mel_outputs, gate_outputs, alignments, _ = y_pred
        mel_targets, gate_targets = y

        # plot distribution of parameters
        for tag, value in model.named_parameters():
            tag = tag.replace('.', '/')
            self.add_histogram(tag, value.data.cpu().numpy(), iteration)

        # plot alignment, mel target and predicted, gate target and predicted
        idx = random.randint(0, alignments.size(0) - 1)

        align_idx = alignments[idx].data.cpu().numpy().T
        gst_scores = gst_scores.data.cpu().numpy().T
        # print("Validation GST scores before plotting to tensorboard: {}".format(gst_scores.shape))
        meltarg_idx = mel_targets[idx].data.cpu().numpy()
        melout_idx = mel_outputs[idx].data.cpu().numpy()

        self.add_image("alignment", plot_alignment_to_numpy(align_idx), iteration)
        self.add_image("gst_scores", plot_gst_scores_to_numpy(gst_scores), iteration)
        self.add_image("mel_target", plot_spectrogram_to_numpy(meltarg_idx), iteration)
        self.add_image("mel_predicted", plot_spectrogram_to_numpy(melout_idx), iteration)
        self.add_image(
            "gate",
            plot_gate_outputs_to_numpy(
                gate_targets[idx].data.cpu().numpy(),
                # torch.sigmoid replaces the deprecated F.sigmoid
                torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
            iteration)
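A hypothetical driver for the logger (the real training loop lives elsewhere in the project); the log directory and scalar values are placeholders just to show where the training hook is called each iteration.

from logger import Tacotron2Logger

logger = Tacotron2Logger(logdir='runs/tacotron2_gst')   # hypothetical log directory
for iteration in range(3):
    reduced_loss, grad_norm, lr, duration = 0.5, 1.2, 1e-3, 0.8  # placeholder scalars
    logger.log_training(reduced_loss, grad_norm, lr, duration, iteration)
logger.close()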
nn_layers.py
ADDED
@@ -0,0 +1,105 @@
import torch
from torch import nn
from librosa.filters import mel as librosa_mel_fn
from stft import STFT

clip_val = 1e-5
C = 1


class convolutional_module(nn.Module):
    """This class defines a 1d convolutional layer and its initialization for the system we are
    replicating"""
    def __init__(self, in_ch, out_ch, kernel_size=1, stride=1, padding=None, dilation=1, bias=True,
                 w_init_gain='linear'):
        # in PyTorch you define your models as subclasses of torch.nn.Module
        super(convolutional_module, self).__init__()
        if padding is None:
            assert(kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        # initialize the convolutional layer, which is an instance of Conv1d.
        # torch.nn.Conv1d internally calls torch.nn.functional.conv1d, which accepts an
        # input of shape (minibatch x in_channels x input_w) and a weight of shape
        # (out_channels x (in_channels/groups) x kernel_w). We do not split into groups,
        # so the input shape will be (48 x 512 x 189) and the weights (512 x 512 x 5).
        self.conv_layer = torch.nn.Conv1d(in_ch, out_ch, kernel_size=kernel_size, stride=stride,
                                          padding=padding, dilation=dilation, bias=bias)

        """Useful information on Xavier initialization in:
        https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/"""
        torch.nn.init.xavier_uniform_(self.conv_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        conv_output = self.conv_layer(x)
        return conv_output


class linear_module(torch.nn.Module):
    """This class defines a linear layer and its initialization method for the system we are
    replicating. This implements a linear transformation: y = xA^T + b"""
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(linear_module, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class location_layer(nn.Module):
    def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
        super(location_layer, self).__init__()
        padding = int((attention_kernel_size - 1) / 2)
        """We are being very restrictive by not training a bias."""
        """in_channels = 2 because the input stacks the previous and cumulative attention
        weights (one vector per encoded position from the previous alignment)."""
        self.location_conv = convolutional_module(2, attention_n_filters, kernel_size=attention_kernel_size,
                                                  padding=padding, bias=False, stride=1, dilation=1)
        self.location_dense = linear_module(attention_n_filters, attention_dim, bias=False,
                                            w_init_gain='tanh')

    def forward(self, attention_weights_cat):
        processed_attention = self.location_conv(attention_weights_cat)
        processed_attention = processed_attention.transpose(1, 2)
        processed_attention = self.location_dense(processed_attention)
        return processed_attention


class TacotronSTFT(nn.Module):
    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(
            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_de_normalize(self, magnitudes):
        output = torch.exp(magnitudes) / C
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert(torch.min(y.data) >= -1)
        assert(torch.max(y.data) <= 1)

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = torch.log(torch.clamp(mel_output, min=clip_val) * C)
        return mel_output
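A sketch, assuming a 22.05 kHz mono wav at a hypothetical path, of how TacotronSTFT would turn a waveform into the (B, 80, T) log-mel target consumed by the decoder; the 32768.0 normalizer matches max_wav_value in hyper_parameters.py.

from nn_layers import TacotronSTFT
from utils import load_wav_to_torch

stft = TacotronSTFT()                                          # defaults mirror hyper_parameters.py
audio = load_wav_to_torch('example.wav', sr=22050)             # hypothetical file
audio_norm = (audio / 32768.0).clamp(-1.0, 1.0).unsqueeze(0)   # (1, T) in [-1, 1]
mel = stft.mel_spectrogram(audio_norm)                         # (1, 80, n_frames) log-mel spectrogram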
stft.py
ADDED
@@ -0,0 +1,140 @@
"""
BSD 3-Clause License

Copyright (c) 2017, Prem Seetharaman
All rights reserved.

* Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from scipy.signal import get_window
from librosa.util import pad_center, tiny
from audio_processing import window_sumsquare


class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
    def __init__(self, filter_length=800, hop_length=200, win_length=800,
                 window='hann'):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
                                   np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert(filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

    def transform(self, input_data):
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(
            input_data.unsqueeze(1),
            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
            mode='reflect')
        input_data = input_data.squeeze(1)

        forward_transform = F.conv1d(
            input_data,
            Variable(self.forward_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.autograd.Variable(
            torch.atan2(imag_part.data, real_part.data))

        return magnitude, phase

    def inverse(self, magnitude, phase):
        recombine_magnitude_phase = torch.cat(
            [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window, magnitude.size(-1), hop_length=self.hop_length,
                win_length=self.win_length, n_fft=self.filter_length,
                dtype=np.float32)
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0])
            window_sum = torch.autograd.Variable(
                torch.from_numpy(window_sum), requires_grad=False)
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
        inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]

        return inverse_transform

    def forward(self, input_data):
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
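Round-trip sanity check, purely illustrative (not part of the uploaded files): transform a short sine wave and resynthesize it; the reconstruction should closely match the input except near the edges, where the analysis windows are trimmed.

import math
import torch
from stft import STFT

stft_fn = STFT(filter_length=1024, hop_length=256, win_length=1024)
t = torch.arange(22050, dtype=torch.float32) / 22050.0
signal = torch.sin(2 * math.pi * 440.0 * t).unsqueeze(0)    # (1, num_samples), a 440 Hz tone
magnitude, phase = stft_fn.transform(signal)
reconstruction = stft_fn.inverse(magnitude, phase)           # approximately the original signal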
utils.py
ADDED
@@ -0,0 +1,39 @@
import numpy as np
from scipy.io.wavfile import read
import torch


def get_mask_from_lengths(lengths):
    max_len = torch.max(lengths).item()
    ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
    mask = (ids < lengths.unsqueeze(1)).byte()
    # mask = (ids < lengths.unsqueeze(1).cuda()).cpu()
    # mask = mask.byte()
    return mask


# probably I won't use it from here
def load_wav_to_torch(full_path, sr):
    sampling_rate, data = read(full_path)
    assert sr == sampling_rate, "{} SR doesn't match {} on path {}".format(
        sr, sampling_rate, full_path)
    return torch.FloatTensor(data.astype(np.float32))


# probably I won't use it from here
def load_filepaths_and_text(filename, sort_by_length, split="|"):
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]

    if sort_by_length:
        filepaths_and_text.sort(key=lambda x: len(x[1]))

    return filepaths_and_text


def to_gpu(x):
    x = x.contiguous()

    if torch.cuda.is_available():
        x = x.cuda(non_blocking=True)  # non_blocking allows asynchronous host-to-device copies
    return torch.autograd.Variable(x)
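A small illustration of get_mask_from_lengths; note that as written it allocates the index tensor with torch.cuda.LongTensor, so this sketch assumes a CUDA device is available.

import torch
from utils import get_mask_from_lengths

lengths = torch.cuda.LongTensor([3, 5, 2])   # valid lengths of three padded sequences
mask = get_mask_from_lengths(lengths)        # (3, 5) byte mask; 1 = real frame, 0 = padding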