HoneyTian committed on
Commit
86a51a3
·
1 Parent(s): 2d3c983
examples/silero_vad_by_webrtcvad/run.sh CHANGED
@@ -8,7 +8,7 @@ bash run.sh --stage 1 --stop_stage 1 --system_version centos \
8
  --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/" \
9
  --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/"
10
 
11
- bash run.sh --stage 2 --stop_stage 2 --system_version centos \
12
  --file_folder_name silero-vad-by-webrtcvad-nx2-dns3 \
13
  --final_model_name silero-vad-by-webrtcvad-nx2-dns3 \
14
  --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/" \
 
8
  --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/" \
9
  --speech_dir "/data/tianxing/HuggingDatasets/nx_noise/data/speech/"
10
 
11
+ bash run.sh --stage 3 --stop_stage 3 --system_version centos \
12
  --file_folder_name silero-vad-by-webrtcvad-nx2-dns3 \
13
  --final_model_name silero-vad-by-webrtcvad-nx2-dns3 \
14
  --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/" \
examples/silero_vad_by_webrtcvad/yaml/config.yaml CHANGED
@@ -1,14 +1,21 @@
1
  model_name: "silero_vad"
2
 
 
3
  sample_rate: 8000
4
  nfft: 512
5
  win_size: 240
6
  hop_size: 80
7
  win_type: hann
8
 
 
9
  in_channels: 64
10
  hidden_size: 128
11
 
 
 
 
 
 
12
  lr: 0.001
13
  lr_scheduler: CosineAnnealingLR
14
  lr_scheduler_kwargs: {}
 
1
  model_name: "silero_vad"
2
 
3
+ # spec
4
  sample_rate: 8000
5
  nfft: 512
6
  win_size: 240
7
  hop_size: 80
8
  win_type: hann
9
 
10
+ # model
11
  in_channels: 64
12
  hidden_size: 128
13
 
14
+ # data
15
+ min_snr_db: -10
16
+ max_snr_db: 20
17
+
18
+ # train
19
  lr: 0.001
20
  lr_scheduler: CosineAnnealingLR
21
  lr_scheduler_kwargs: {}
toolbox/torchaudio/models/vad/silero_vad/configuration_silero_vad.py CHANGED
@@ -16,6 +16,9 @@ class SileroVadConfig(PretrainedConfig):
16
  in_channels: int = 64,
17
  hidden_size: int = 128,
18
 
 
 
 
19
  lr: float = 0.001,
20
  lr_scheduler: str = "CosineAnnealingLR",
21
  lr_scheduler_kwargs: dict = None,
@@ -42,6 +45,10 @@ class SileroVadConfig(PretrainedConfig):
42
  self.in_channels = in_channels
43
  self.hidden_size = hidden_size
44
 
 
 
 
 
45
  # train
46
  self.lr = lr
47
  self.lr_scheduler = lr_scheduler
 
16
  in_channels: int = 64,
17
  hidden_size: int = 128,
18
 
19
+ min_snr_db: float = -10,
20
+ max_snr_db: float = 20,
21
+
22
  lr: float = 0.001,
23
  lr_scheduler: str = "CosineAnnealingLR",
24
  lr_scheduler_kwargs: dict = None,
 
45
  self.in_channels = in_channels
46
  self.hidden_size = hidden_size
47
 
48
+ # data snr
49
+ self.min_snr_db = min_snr_db
50
+ self.max_snr_db = max_snr_db
51
+
52
  # train
53
  self.lr = lr
54
  self.lr_scheduler = lr_scheduler
toolbox/torchaudio/models/vad/silero_vad/yaml/config.yaml CHANGED
@@ -1,14 +1,21 @@
1
  model_name: "silero_vad"
2
 
 
3
  sample_rate: 8000
4
  nfft: 512
5
  win_size: 240
6
  hop_size: 80
7
  win_type: hann
8
 
 
9
  in_channels: 64
10
  hidden_size: 128
11
 
 
 
 
 
 
12
  lr: 0.001
13
  lr_scheduler: CosineAnnealingLR
14
  lr_scheduler_kwargs: {}
 
1
  model_name: "silero_vad"
2
 
3
+ # spec
4
  sample_rate: 8000
5
  nfft: 512
6
  win_size: 240
7
  hop_size: 80
8
  win_type: hann
9
 
10
+ # model
11
  in_channels: 64
12
  hidden_size: 128
13
 
14
+ # data
15
+ min_snr_db: -10
16
+ max_snr_db: 20
17
+
18
+ # train
19
  lr: 0.001
20
  lr_scheduler: CosineAnnealingLR
21
  lr_scheduler_kwargs: {}
toolbox/torchaudio/modules/freq_bands/mel_bands.py CHANGED
@@ -1,6 +1,54 @@
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  if __name__ == "__main__":
6
- pass
 
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
+ import librosa
4
+ import numpy as np
5
+
6
+
7
+ class MelBandsNumpy(object):
8
+
9
+ @staticmethod
10
+ def get_mel_points(sample_rate: int, nfft: int, n_mels: int, fmin: float = 0, fmax: int = None):
11
+ fmax = fmax or sample_rate // 2
12
+
13
+ mel_points = librosa.mel_frequencies(n_mels=n_mels, fmin=fmin, fmax=fmax)
14
+ return mel_points
15
+
16
+ @staticmethod
17
+ def get_mel_filter_bank(mel_points: np.ndarray,
18
+ sample_rate: int, nfft: int, n_mels: int, fmin: float = 0, fmax: int = None,
19
+ normalized: bool = True,
20
+ inverse: bool = False,
21
+ ):
22
+ fmax = fmax or sample_rate // 2
23
+
24
+ mel_points = librosa.mel_frequencies(n_mels=n_mels, fmin=fmin, fmax=fmax)
25
+
26
+ bin_freqs = np.linspace(0, sample_rate // 2, nfft // 2 + 1)
27
+ fft_bins = np.floor((nfft + 1) * mel_points / sample_rate).astype(int)
28
+
29
+ filterbank = np.zeros((n_mels, nfft // 2 + 1))
30
+ for i in range(1, n_mels + 1):
31
+ left = fft_bins[i - 1]
32
+ center = fft_bins[i]
33
+ right = fft_bins[i + 1] if i < n_mels - 1 else center
34
+
35
+ filterbank[i - 1, left:center] = np.linspace(0, 1, center - left)
36
+ filterbank[i - 1, center:right] = np.linspace(1, 0, right - center)
37
+
38
+ filterbank = librosa.util.normalize(filterbank, norm=1, axis=1)
39
+ return filterbank
40
+
41
+
42
+ def main():
43
+ mel_points = MelBandsNumpy.get_mel_points(
44
+ sample_rate=8000,
45
+ nfft=512,
46
+ n_mels=80,
47
+ fmin=10,
48
+ fmax=3800
49
+ )
50
+ return
51
 
52
 
53
  if __name__ == "__main__":
54
+ main()