Spaces:
Sleeping
Sleeping
Delete nn_layers.py
Browse files- nn_layers.py +0 -105
nn_layers.py
DELETED
@@ -1,105 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
from torch import nn
|
3 |
-
from librosa.filters import mel as librosa_mel_fn
|
4 |
-
from stft import STFT
|
5 |
-
|
6 |
-
# Floor passed as `min` to torch.clamp on the mel magnitudes before the log
# is taken in TacotronSTFT.mel_spectrogram.
clip_val = 1e-5
|
7 |
-
# Scale factor: multiplied into the clamped mel magnitudes before the log in
# TacotronSTFT.mel_spectrogram and divided out after exp in
# TacotronSTFT.spectral_de_normalize.
C = 1
|
8 |
-
|
9 |
-
|
10 |
-
class convolutional_module(nn.Module):
|
11 |
-
"""This class defines a 1d convolutional layer and its initialization for the system we are
|
12 |
-
replicating"""
|
13 |
-
def __init__(self, in_ch, out_ch, kernel_size=1, stride=1, padding=None, dilation=1, bias=True,
|
14 |
-
w_init_gain='linear'):
|
15 |
-
# in PyTorch you define your Models as subclasses of torch.nn.Module
|
16 |
-
super(convolutional_module, self).__init__()
|
17 |
-
if padding is None:
|
18 |
-
assert(kernel_size % 2 == 1)
|
19 |
-
padding = int(dilation * (kernel_size - 1) / 2)
|
20 |
-
|
21 |
-
# initialize the convolutional layer which is an instance of Conv1d
|
22 |
-
# torch.nn.Conv1d calls internally the method torch.nn.functional.conv1d, which accepts the
|
23 |
-
# input with the shape (minibatch x in_channels x input_w), and a weight of shape
|
24 |
-
# (out_channels x (in_channels/groups) x kernel_w). In our case, we do not split into groups.
|
25 |
-
# Then, our input shape will be (48 x 512 x 189) and the weights are set up as
|
26 |
-
# (512 x 512 x 5)
|
27 |
-
self.conv_layer = torch.nn.Conv1d(in_ch, out_ch, kernel_size=kernel_size, stride=stride,
|
28 |
-
padding=padding, dilation=dilation, bias=bias)
|
29 |
-
|
30 |
-
"""Useful information of Xavier initialization in:
|
31 |
-
https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/"""
|
32 |
-
torch.nn.init.xavier_uniform_(self.conv_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
|
33 |
-
|
34 |
-
def forward(self, x):
|
35 |
-
conv_output = self.conv_layer(x)
|
36 |
-
return conv_output
|
37 |
-
|
38 |
-
|
39 |
-
class linear_module(torch.nn.Module):
    """Linear layer plus its weight initialisation.

    Wraps a single ``torch.nn.Linear`` (stored as ``self.linear_layer``),
    i.e. the transformation ``y = x A^T + b``. The weight is initialised
    with Xavier-uniform, scaled by the gain that
    ``torch.nn.init.calculate_gain`` returns for ``w_init_gain``.
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super().__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)

        init_gain = nn.init.calculate_gain(w_init_gain)
        nn.init.xavier_uniform_(self.linear_layer.weight, gain=init_gain)

    def forward(self, x):
        """Apply the wrapped linear layer to ``x`` and return the result."""
        projected = self.linear_layer(x)
        return projected
|
50 |
-
|
51 |
-
|
52 |
-
class location_layer(nn.Module):
    """Bias-free 1-D convolution followed by a bias-free linear projection.

    ``forward`` runs its input through ``self.location_conv``, swaps
    dimensions 1 and 2 of the result, and feeds that to
    ``self.location_dense``.
    """

    def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
        super().__init__()
        conv_padding = int((attention_kernel_size - 1) / 2)

        # Original author's notes:
        #  - Not training a bias is a deliberately strong restriction.
        #  - in_channels = 2 is thought to be k, the number of vectors for
        #    every encoded stage position from the previous alignment.
        self.location_conv = convolutional_module(
            2,
            attention_n_filters,
            kernel_size=attention_kernel_size,
            padding=conv_padding,
            bias=False,
            stride=1,
            dilation=1,
        )
        self.location_dense = linear_module(
            attention_n_filters,
            attention_dim,
            bias=False,
            w_init_gain='tanh',
        )

    def forward(self, attention_weights_cat):
        """Return the conv -> transpose(1, 2) -> linear result for the input."""
        conv_features = self.location_conv(attention_weights_cat)
        # Swap dims 1 and 2 so the conv's channel axis is last for the linear layer.
        channels_last = conv_features.transpose(1, 2)
        return self.location_dense(channels_last)
|
69 |
-
|
70 |
-
|
71 |
-
class TacotronSTFT(nn.Module):
    """Log-mel-spectrogram front end built on the project's ``STFT`` module.

    On construction it creates ``self.stft_fn`` (an ``STFT`` instance) and a
    mel filterbank from ``librosa.filters.mel``, registered as the
    non-trainable buffer ``mel_basis``.
    """

    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
                 mel_fmax=8000.0):
        """Build the STFT module and the mel filterbank buffer.

        ``filter_length``, ``hop_length`` and ``win_length`` are forwarded to
        ``STFT``; ``sampling_rate``, ``filter_length``, ``n_mel_channels``,
        ``mel_fmin`` and ``mel_fmax`` are forwarded to ``librosa.filters.mel``
        as ``sr``, ``n_fft``, ``n_mels``, ``fmin`` and ``fmax``.
        """
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Pass by keyword: librosa >= 0.10 made these parameters keyword-only,
        # so the former positional call raises TypeError there. The keyword
        # names are the same in older releases, so this works on both.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate,
            n_fft=filter_length,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        # A buffer (not a parameter): it follows the module across devices and
        # into the state dict but is not trained.
        self.register_buffer('mel_basis', mel_basis)

    def spectral_de_normalize(self, magnitudes):
        """Invert the log compression: return ``exp(magnitudes) / C``."""
        output = torch.exp(magnitudes) / C
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves

        PARAMS
        ------
        y: torch.FloatTensor with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
            (shapes as stated by the original author; the output's last
            dimension is whatever ``STFT.transform`` yields -- TODO confirm)

        RAISES
        ------
        AssertionError: if any sample of ``y`` lies outside [-1, 1]
        """
        assert(torch.min(y.data) >= -1)
        assert(torch.max(y.data) <= 1)

        # Only the magnitudes are used below; the phases are discarded.
        magnitudes, phases = self.stft_fn.transform(y)
        # `.data` drops the autograd history of the STFT output.
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        # Clamp to clip_val before the log so zero magnitudes do not give -inf.
        mel_output = torch.log(torch.clamp(mel_output, min=clip_val) * C)
        return mel_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|