AlexK-PL committed on
Commit
49e3642
·
1 Parent(s): 4996b99

Delete data_preparation.py

Browse files
Files changed (1) hide show
  1. data_preparation.py +0 -102
data_preparation.py DELETED
@@ -1,102 +0,0 @@
1
- import random
2
-
3
- import numpy as np
4
- import torch
5
- import torch.utils.data
6
-
7
- import nn_layers
8
- from scipy.io.wavfile import read
9
- from text import text_to_sequence
10
- from hyper_parameters import tacotron_params
11
-
12
-
13
class DataPreparation(torch.utils.data.Dataset):
    """Dataset that turns (wav path, sentence) entries into (text, mel) pairs.

    Each entry of ``audiopaths_and_text`` is indexed as ``entry[0]`` (path of
    a wav file) and ``entry[1]`` (the sentence for that file). Items are
    returned as ``(sentence_sequence, mel_spec)``.
    """

    def __init__(self, audiopaths_and_text, tacotron_hyperparams):
        """Store the entries, build the STFT front-end and shuffle the entries.

        Args:
            audiopaths_and_text: mutable sequence of (wav path, sentence)
                entries. NOTE: it is shuffled IN PLACE, so the caller's list
                is reordered as well.
            tacotron_hyperparams: mapping read for the keys 'filter_length',
                'hop_length', 'win_length', 'n_mel_channels', 'sampling_rate',
                'mel_fmin', 'mel_fmax' here, and 'max_wav_value' and
                'text_cleaners' when items are loaded.
        """
        self.audiopaths_and_text = audiopaths_and_text
        self.audio_text_parameters = tacotron_hyperparams
        self.stft = nn_layers.TacotronSTFT(
            tacotron_hyperparams['filter_length'],
            tacotron_hyperparams['hop_length'],
            tacotron_hyperparams['win_length'],
            tacotron_hyperparams['n_mel_channels'],
            tacotron_hyperparams['sampling_rate'],
            tacotron_hyperparams['mel_fmin'],
            tacotron_hyperparams['mel_fmax'])
        # Fixed seed so the shuffled order is reproducible between runs.
        # This reseeds the module-level `random` generator as a side effect.
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def load_audiowav_torch(self, audiopath, samp_rate):
        """Read a wav file and return its samples as a float32 tensor.

        Args:
            audiopath: path of the wav file to read.
            samp_rate: sampling rate the file is required to have.

        Returns:
            ``torch.FloatTensor`` holding the raw (un-normalised) samples.

        Raises:
            AssertionError: if the file's sampling rate differs from
                ``samp_rate``.
        """
        sr, data = read(audiopath)
        # Raised explicitly instead of via `assert` so that the check is not
        # stripped when Python runs with -O; the exception type and message
        # are the same ones the original assert produced.
        if samp_rate != sr:
            raise AssertionError("Sample rate does not match with the configuration")

        return torch.FloatTensor(data.astype(np.float32))

    def melspec_textSequence_pair(self, audiopath_and_text):
        """Build the (text sequence, mel spectrogram) pair for one entry.

        Args:
            audiopath_and_text: entry whose item 0 is the wav path and whose
                item 1 is the sentence.

        Returns:
            Tuple ``(sentence_sequence, mel_spec)``: an ``IntTensor`` built
            from ``text_to_sequence`` and the output of
            ``self.stft.mel_spectrogram`` with its leading dimension squeezed.
        """
        wav_path, sentence = audiopath_and_text[0], audiopath_and_text[1]
        # wav to torch tensor, scaled by the configured maximum wav value
        wav_torch = self.load_audiowav_torch(wav_path, self.audio_text_parameters['sampling_rate'])
        wav_torch_norm = wav_torch / self.audio_text_parameters['max_wav_value']
        # Add a leading dimension of size 1 before the STFT call. The old
        # `torch.autograd.Variable` wrapper was dropped: since PyTorch 0.4 it
        # just returns the tensor, and a freshly created tensor already has
        # requires_grad=False.
        wav_torch_norm = wav_torch_norm.unsqueeze(0)
        mel_spec = self.stft.mel_spectrogram(wav_torch_norm)
        mel_spec = torch.squeeze(mel_spec, 0)
        # text to torch integer tensor sequence
        sentence_sequence = torch.IntTensor(
            text_to_sequence(sentence, self.audio_text_parameters['text_cleaners']))

        return sentence_sequence, mel_spec

    def __getitem__(self, index):
        """Return the (text sequence, mel spectrogram) pair of entry ``index``."""
        return self.melspec_textSequence_pair(self.audiopaths_and_text[index])

    def __len__(self):
        """Return the number of (wav path, sentence) entries."""
        return len(self.audiopaths_and_text)
50
-
51
-
52
class DataCollate:
    """Collate (text sequence, mel spectrogram) pairs into zero-padded batches.

    Rows of every returned tensor are ordered by decreasing text length.
    """

    def __init__(self, number_frames_step):
        """Remember the multiple the padded mel length is rounded up to.

        Args:
            number_frames_step: the padded number of mel frames is made a
                multiple of this value.
        """
        self.number_frames_step = number_frames_step

    def __call__(self, batch):
        """Pad and stack a list of samples.

        Args:
            batch: list of ``(text_sequence, mel_spec)`` pairs, where
                ``text_sequence`` is a 1-D tensor and ``mel_spec`` is a 2-D
                tensor indexed as ``(mel filters, frames)``.

        Returns:
            Tuple ``(sentences_padded, inp_lengths, melspec_padded,
            gate_padded, output_lengths, prosody_padded)``:

            * ``sentences_padded``: long tensor (batch, longest text), zero padded.
            * ``inp_lengths``: long tensor of text lengths, decreasing.
            * ``melspec_padded``: float tensor (batch, mel filters, padded frames).
            * ``gate_padded``: float tensor (batch, padded frames); 0 before the
              last real frame of each sample and 1 from that frame onwards.
            * ``output_lengths``: long tensor with the real frame count per row.
            * ``prosody_padded``: independent copy of ``melspec_padded``.
        """
        batch_size = len(batch)
        inp_lengths, sorted_decreasing = torch.sort(
            torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True)
        # Plain Python ints, so list indexing and tensor sizes do not depend
        # on implicit 0-dim tensor conversion.
        order = sorted_decreasing.tolist()
        max_length_in = int(inp_lengths[0])

        # padding sentences sequences for a fixed-length tensor size:
        # all slots of a row until the end of the sentence, the rest are 0's
        sentences_padded = torch.zeros(batch_size, max_length_in, dtype=torch.long)
        for row, source in enumerate(order):
            int_seq_sentence = batch[source][0]
            sentences_padded[row, :int_seq_sentence.size(0)] = int_seq_sentence

        # length of the mel filterbank used
        num_melfilters = batch[0][1].size(0)

        # longest spectrogram in the batch (no extra end-marker frame is added),
        # rounded up to a multiple of the number of frames per step
        max_length_target = max(x[1].size(1) for x in batch)
        remainder = max_length_target % self.number_frames_step
        if remainder != 0:
            max_length_target += self.number_frames_step - remainder

        # padding mel spectrogram representations. The output is a 3D tensor
        melspec_padded = torch.zeros(batch_size, num_melfilters, max_length_target,
                                     dtype=torch.float32)
        gate_padded = torch.zeros(batch_size, max_length_target, dtype=torch.float32)
        output_lengths = torch.zeros(batch_size, dtype=torch.long)

        for row, source in enumerate(order):
            melspec = batch[source][1]
            num_frames = melspec.size(1)
            melspec_padded[row, :, :num_frames] = melspec
            # gate is 1 from the last real frame to the end of the padding
            gate_padded[row, num_frames - 1:] = 1
            output_lengths[row] = num_frames

        # GST prosody matrix: same zero-padded content as the mel targets,
        # returned as a separate tensor so it does not share their storage.
        prosody_padded = melspec_padded.clone()

        return sentences_padded, inp_lengths, melspec_padded, gate_padded, output_lengths, prosody_padded