Delete data_preparation.py
data_preparation.py  +0  -102
data_preparation.py
DELETED
@@ -1,102 +0,0 @@
import random

import numpy as np
import torch
import torch.utils.data

import nn_layers
from scipy.io.wavfile import read
from text import text_to_sequence
from hyper_parameters import tacotron_params


class DataPreparation(torch.utils.data.Dataset):

    def __init__(self, audiopaths_and_text, tacotron_hyperparams):
        self.audiopaths_and_text = audiopaths_and_text
        self.audio_text_parameters = tacotron_hyperparams
        self.stft = nn_layers.TacotronSTFT(tacotron_hyperparams['filter_length'], tacotron_hyperparams['hop_length'],
                                           tacotron_hyperparams['win_length'], tacotron_hyperparams['n_mel_channels'],
                                           self.audio_text_parameters['sampling_rate'],
                                           tacotron_hyperparams['mel_fmin'], tacotron_hyperparams['mel_fmax'])
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def load_audiowav_torch(self, audiopath, samp_rate):
        sr, data = read(audiopath)
        assert samp_rate == sr, "Sample rate does not match with the configuration"

        return torch.FloatTensor(data.astype(np.float32))

    def melspec_textSequence_pair(self, audiopath_and_text):
        wav_path, sentence = audiopath_and_text[0], audiopath_and_text[1]
        # wav to torch tensor
        wav_torch = self.load_audiowav_torch(wav_path, self.audio_text_parameters['sampling_rate'])
        wav_torch_norm = wav_torch / self.audio_text_parameters['max_wav_value']
        wav_torch_norm = wav_torch_norm.unsqueeze(0)
        wav_torch_norm = torch.autograd.Variable(wav_torch_norm, requires_grad=False)
        mel_spec = self.stft.mel_spectrogram(wav_torch_norm)
        mel_spec = torch.squeeze(mel_spec, 0)
        # text to torch integer tensor sequence
        sentence_sequence = torch.IntTensor(text_to_sequence(sentence, self.audio_text_parameters['text_cleaners']))

        return sentence_sequence, mel_spec

    def __getitem__(self, index):
        return self.melspec_textSequence_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class DataCollate:

    def __init__(self, number_frames_step):
        self.number_frames_step = number_frames_step

    def __call__(self, batch):
        inp_lengths, sorted_decreasing = torch.sort(torch.LongTensor([len(x[0]) for x in batch]),
                                                    dim=0, descending=True)
        max_length_in = inp_lengths[0]

        # pad the sentence sequences to a fixed-length tensor
        sentences_padded = torch.LongTensor(len(batch), max_length_in)
        sentences_padded.zero_()
        for i in range(len(sorted_decreasing)):
            int_seq_sentence = batch[sorted_decreasing[i]][0]
            # fill each row up to the end of the sentence; the remaining slots stay 0
            sentences_padded[i, :int_seq_sentence.size(0)] = int_seq_sentence

        # number of mel filterbank channels
        num_melfilters = batch[0][1].size(0)

        # longest spectrogram representation in the batch
        max_length_target = max([x[1].size(1) for x in batch])  # THERE IS A CHANGE FROM THE ORIGINAL CODE!!!
        # add extra space if the number of frames per step is higher than 1
        if max_length_target % self.number_frames_step != 0:
            max_length_target += self.number_frames_step - max_length_target % self.number_frames_step
            assert max_length_target % self.number_frames_step == 0

        # padding mel spectrogram representations. The output is a 3D tensor
        melspec_padded = torch.FloatTensor(len(batch), num_melfilters, max_length_target)
        melspec_padded.zero_()

        # GST new prosody matrices definition with zero padding:
        prosody_padded = torch.FloatTensor(len(batch), num_melfilters, max_length_target)
        prosody_padded.zero_()

        gate_padded = torch.FloatTensor(len(batch), max_length_target)
        gate_padded.zero_()
        output_lengths = torch.LongTensor(len(batch))

        for j in range(len(sorted_decreasing)):
            melspec = batch[sorted_decreasing[j]][1]
            melspec_padded[j, :, :melspec.size(1)] = melspec

            # GST filling padded prosody matrix:
            prosody_padded[j, :, :melspec.size(1)] = melspec

            gate_padded[j, melspec.size(1) - 1:] = 1
            output_lengths[j] = melspec.size(1)

        return sentences_padded, inp_lengths, melspec_padded, gate_padded, output_lengths, prosody_padded
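For reference, below is a minimal sketch of how the deleted DataPreparation and DataCollate classes would typically be wired into a torch DataLoader. The filelist path, the "wav_path|sentence" line format, the 'number_frames_step' key name, and the batch size are assumptions for illustration, not taken from this repository.

# Usage sketch (not part of the deleted file); paths and key names are assumed.
import torch
from hyper_parameters import tacotron_params
from data_preparation import DataPreparation, DataCollate

# hypothetical filelist: one "wav_path|sentence" pair per line
with open('filelists/train.txt', encoding='utf-8') as f:
    audiopaths_and_text = [line.strip().split('|') for line in f if line.strip()]

dataset = DataPreparation(audiopaths_and_text, tacotron_params)
# the frames-per-step key name is an assumption; default to 1 if absent
collate_fn = DataCollate(tacotron_params.get('number_frames_step', 1))

loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True,
                                     collate_fn=collate_fn, drop_last=True)

for sentences, inp_lengths, mels, gates, out_lengths, prosody in loader:
    # sentences: (B, max_text_len); mels and prosody: (B, n_mel_channels, max_frames)
    break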