sssssy committed on
Commit
e96d01f
·
1 Parent(s): d2902aa

Update dataset.py

Files changed (1)
  1. dataset.py +10 -82
dataset.py CHANGED
@@ -41,7 +41,6 @@ def read_content(filepath):
         for i in range(1, len(tmp)):
             if len(tmp[i]) == 0:
                 continue
-            # need blank space or not?
             if i % 2 == 0:
                 pinyin += tmp[i] + ' '
                 tones += tmp[i][-1] + ' '
@@ -73,7 +72,7 @@ def read_dataset_index(filepath='/kaggle/input/paddle-speech/AISHELL-3/train'):
         if len(tmp) != 0:
             durations[tmp[0]] = float(tmp[1])
 
-    audio_path = os.path.join(filepath, 'wav')  # this should be deleted
+    audio_path = os.path.join(filepath, 'wav')
     indexes = []
     for root, dirs, files in os.walk(audio_path):
         for f in files:
@@ -82,7 +81,6 @@ def read_dataset_index(filepath='/kaggle/input/paddle-speech/AISHELL-3/train'):
                 index = f[0:len(f)-4]
                 filepath = os.path.join(audio_path, index[0:len(index)-4], f)
                 word, py, tone = features[index]
-                # du = librosa.get_duration(filename=filepath)
                 du = durations[index]
                 indexes.append((index, filepath, word, py, tone, du))
 
@@ -90,62 +88,8 @@ def read_dataset_index(filepath='/kaggle/input/paddle-speech/AISHELL-3/train'):
     print('#wav file read:', count)
     print('read dataset index time: ', end_time - start_time)
 
-    '''indexes = sorted(indexes, key=lambda x: x[0])
-    with open('./durations.txt', 'w') as f:
-        for i in indexes:
-            f.write(i[0]+ ' ' + str(i[5]) + '\n')'''
-
-    return pd.DataFrame.from_records(indexes, columns=['index', 'filepath', 'word', 'pinyin', 'tone', 'duration'])
-
-
-def read_dataset_index(filepath='/kaggle/input/paddle-speech/AISHELL-3/train'):
-    '''
-    get all audio files' index and file paths
-    read content.txt to get corresponding words, pinyin, tones, duration
-
-    return dataframe:
-        ['index', 'filepath', 'word', 'pinyin', 'tone', 'duration']
-
-    5 tones in total, 5 represents neutral tone
-    '''
-    features = read_content(os.path.join(filepath, 'content.txt'))
-
-    start_time = time.time()
-    count = 0
-
-    durations = {}
-    with open('/kaggle/input/durations/durations.txt', 'r') as f:
-        lines = f.readlines()
-    for l in lines:
-        tmp = (l.replace('\n', '')).split(' ')
-        if len(tmp) != 0:
-            durations[tmp[0]] = float(tmp[1])
-
-    audio_path = os.path.join(filepath, 'wav')  # this should be deleted
-    indexes = []
-    for root, dirs, files in os.walk(audio_path):
-        for f in files:
-            if f.endswith('.wav'):
-                count += 1
-                index = f[0:len(f)-4]
-                filepath = os.path.join(audio_path, index[0:len(index)-4], f)
-                word, py, tone = features[index]
-                # du = librosa.get_duration(filename=filepath)
-                du = durations[index]
-                indexes.append((index, filepath, word, py, tone, du))
-
-    end_time = time.time()
-    print('#wav file read:', count)
-    print('read dataset index time: ', end_time - start_time)
-
-    '''indexes = sorted(indexes, key=lambda x: x[0])
-    with open('./durations.txt', 'w') as f:
-        for i in indexes:
-            f.write(i[0]+ ' ' + str(i[5]) + '\n')'''
-
     return pd.DataFrame.from_records(indexes, columns=['index', 'filepath', 'word', 'pinyin', 'tone', 'duration'])
 
-
 def collate_fn(batch):
     inp = []
     f0 = []
@@ -194,7 +138,7 @@ def get_data_loader(split, args):
 class MyDataset(Dataset):
     def __init__(self, dataset_root, split, sampling_rate, sample_length, frame_size):
         self.dataset_root = dataset_root
-        self.split = split # train or test
+        self.split = split
         self.sampling_rate = sampling_rate
         self.sample_length = sample_length
         self.frame_size = frame_size
@@ -202,7 +146,7 @@ class MyDataset(Dataset):
 
         # self.annotations = get_annotations(get_all_file_names(os.path.join(self.dataset_root, 'AISHELL-3', split)), level='word')
 
-        self.dataset_index = read_dataset_index(os.path.join(self.dataset_root, 'AISHELL-3', split)) # maybe can be removed
+        self.dataset_index = read_dataset_index(os.path.join(self.dataset_root, 'AISHELL-3', split))
 
         self.duration = {}
         self.index = self.index_data()
@@ -223,15 +167,11 @@ class MyDataset(Dataset):
 
         go through self.dataset_index to get duration and then calculate
         '''
-        # duration already in dataset_index
-        # TODO
-        # pass
        index = []
         for indexs, row in self.dataset_index.iterrows():
             duration = row['duration']
             num_seg = math.ceil(duration / self.sample_length)
             for i in range(num_seg):
-                # index.append([row['index'], i * self.sample_length])
                 index.append([indexs, i * self.sample_length])
             self.duration[row['index']] = row['duration']
 
@@ -249,13 +189,9 @@ class MyDataset(Dataset):
         '''
         audio_fn, start_sec = self.index[idx]
         end_sec = start_sec + self.sample_length
-        # print(start_sec, end_sec)
-        #???
+
         audio_fp = self.dataset_index.loc[audio_fn,'filepath']
-        # audio_fp = jpath('./dataset/AISHELL-3/train/wav/SSB0005/SSB0005',audio_fp,'.wav')
-        #/kaggle/input/paddle-speech/AISHELL-3/train/wav/SSB0005/SSB00050001.wav
 
-        # TODO: calculate mel spectrogram
         mel = None
         #load data from file
         waveform, sample_rate = torchaudio.load(audio_fp)
@@ -264,18 +200,16 @@ class MyDataset(Dataset):
         mel_spec = torch.mean(mel_spec,0)
         # print(mel_spec.shape)
 
-        # TODO: calculate fundamental frequency
+        # calculate fundamental frequency
         f0 = None
         waveform, sr = librosa.load(audio_fp, sr=self.sampling_rate)
         f0 = torch.from_numpy(librosa.yin(waveform, fmin=50, fmax=550, hop_length=100))
 
-        # get labels???
         # word_roll, tone_roll = self.get_labels(self.annotations[self.dataset_index.loc[audio_fn, 'index']], self.dataset_index.loc[audio_fn,'duration'])
         words = self.dataset_index.loc[audio_fn, 'pinyin']
         w = words.split(' ')
         word_roll = []
-        for i in range(0, len(w)):
-
+        for i in range(0, len(w)):
             if len(w[i]) != 0:
                 if self.pinyin.get(w[i][0:-1]) == None:
                     self.pinyin[w[i][0:-1]] = len(self.pinyin)
@@ -289,14 +223,12 @@ class MyDataset(Dataset):
 
         spectrogram_clip = None
         f0_clip = None
-        onset_clip = None
-        offset_clip = None
         word_clip = None
         tone_clip = None
 
-        # TODO: create clips
+        # create clips
         start_frame = int(start_sec * self.frame_per_sec)
-        end_frame = start_frame + 1600 #int(end_sec * self.frame_per_sec)
+        end_frame = start_frame + 1600
         # print(start_frame, end_frame)
         spectrogram_clip = mel_spec[:, start_frame:end_frame].T
         f0_clip = f0[start_sec:end_sec]
@@ -304,7 +236,6 @@ class MyDataset(Dataset):
         #tone_clip = tone_roll[start_frame:end_frame]
 
         # print(tone_roll)
-        #return spectrogram_clip, f0_clip, onset_clip, offset_clip, pinyin_clip, tone_clip
         return spectrogram_clip, f0_clip, torch.Tensor(word_roll), torch.Tensor(tone_roll) #word_clip, tone_clip
 
     def get_labels(self, annotation_data, duration):
@@ -312,14 +243,11 @@ class MyDataset(Dataset):
         This function read annotation from file, and then convert annotation from note-level to frame-level
         Because we will be using frame-level labels in training.
         '''
-        # TODO
-        # pass
         frame_num = math.ceil(duration * self.frame_per_sec)
 
         word_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
         tone_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
-        # f0_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
-        # mel_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
+
        for note in annotation_data:
             start_time, end_time, mark = note # Assuming annotation format: (start_time, end_time, pitch)
 
@@ -333,7 +261,7 @@ class MyDataset(Dataset):
             #print(start_frame, end_frame)
 
             # WORD LEVEL Mark the frames corresponding to the note
-            word_roll[start_frame:end_frame+1] = self.pinyin[mark[:-1]] #mark[:-1]
+            word_roll[start_frame:end_frame+1] = self.pinyin[mark[:-1]]
             tone_roll[start_frame:end_frame+1] = int(mark[-1])
             # print(tone_roll)
         return word_roll, tone_roll
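For context, the segmentation scheme that index_data implements can be shown in isolation: every utterance contributes math.ceil(duration / sample_length) entries to the index, each keyed by its start second, and __getitem__ later receives one of these (utterance id, start_sec) pairs. A minimal sketch with made-up durations and an assumed sample_length of 2 seconds:

import math

sample_length = 2.0  # seconds per training clip (assumed value)
durations = {'SSB00050001': 4.5, 'SSB00050002': 1.2}  # hypothetical utterance durations

index = []
for utt_id, duration in durations.items():
    num_seg = math.ceil(duration / sample_length)  # number of windows covering the utterance
    for i in range(num_seg):
        index.append((utt_id, i * sample_length))  # (utterance id, start second)

print(index)
# [('SSB00050001', 0.0), ('SSB00050001', 2.0), ('SSB00050001', 4.0), ('SSB00050002', 0.0)]

The last window of an utterance is generally shorter than sample_length; the clip slicing in __getitem__ simply takes whatever frames remain before the end of the spectrogram.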
 
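The two feature paths in __getitem__ can likewise be sketched standalone. Only the YIN parameters (fmin=50, fmax=550, hop_length=100) appear in this change; the MelSpectrogram settings and the file path below are placeholder assumptions, since the transform's construction is not part of the diff:

import torch
import torchaudio
import librosa

audio_fp = 'SSB00050001.wav'   # hypothetical path to one AISHELL-3 recording
sampling_rate = 16000          # assumed target sampling rate

# Mel spectrogram: load with torchaudio, apply the transform, then average channels
# the same way the dataset code does with torch.mean(mel_spec, 0).
waveform, sample_rate = torchaudio.load(audio_fp)
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=sample_rate, n_fft=1024, hop_length=160, n_mels=80)  # assumed settings
mel_spec = mel_transform(waveform)   # (channels, n_mels, frames)
mel_spec = torch.mean(mel_spec, 0)   # (n_mels, frames)

# Fundamental frequency: reload with librosa at the target rate and run YIN,
# using the parameters that appear in the diff.
y, sr = librosa.load(audio_fp, sr=sampling_rate)
f0 = torch.from_numpy(librosa.yin(y, fmin=50, fmax=550, hop_length=100))

print(mel_spec.shape, f0.shape)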