HoneyTian committed on
Commit
8a64e31
·
1 Parent(s): 67d2438
Dockerfile CHANGED
@@ -12,6 +12,8 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
12
 
13
  RUN bash install.sh --stage 1 --stop_stage 2 --system_version centos
14
 
 
 
15
  USER user
16
 
17
  ENV HOME=/home/user \
 
12
 
13
  RUN bash install.sh --stage 1 --stop_stage 2 --system_version centos
14
 
15
+ RUN mkdir -p logs
16
+
17
  USER user
18
 
19
  ENV HOME=/home/user \
examples/fsmn_vad/step_1_prepare_data.py DELETED
@@ -1,156 +0,0 @@
1
- #!/usr/bin/python3
2
- # -*- coding: utf-8 -*-
3
- import argparse
4
- import json
5
- import os
6
- from pathlib import Path
7
- import random
8
- import sys
9
-
10
- pwd = os.path.abspath(os.path.dirname(__file__))
11
- sys.path.append(os.path.join(pwd, "../../"))
12
-
13
- import librosa
14
- import numpy as np
15
- from tqdm import tqdm
16
-
17
-
18
- def get_args():
19
- parser = argparse.ArgumentParser()
20
- parser.add_argument(
21
- "--noise_dir",
22
- default=r"E:\Users\tianx\HuggingDatasets\nx_noise\data\noise",
23
- type=str
24
- )
25
- parser.add_argument(
26
- "--speech_dir",
27
- default=r"E:\programmer\asr_datasets\aishell\data_aishell\wav\train",
28
- type=str
29
- )
30
-
31
- parser.add_argument("--train_dataset", default="train.jsonl", type=str)
32
- parser.add_argument("--valid_dataset", default="valid.jsonl", type=str)
33
-
34
- parser.add_argument("--duration", default=6.0, type=float)
35
- parser.add_argument("--min_snr_db", default=-10, type=float)
36
- parser.add_argument("--max_snr_db", default=20, type=float)
37
-
38
- parser.add_argument("--target_sample_rate", default=8000, type=int)
39
-
40
- parser.add_argument("--max_count", default=-1, type=int)
41
-
42
- args = parser.parse_args()
43
- return args
44
-
45
-
46
- def target_second_signal_generator(data_dir: str, duration: int = 6, sample_rate: int = 8000, max_epoch: int = 20000):
47
- data_dir = Path(data_dir)
48
- for epoch_idx in range(max_epoch):
49
- for filename in data_dir.glob("**/*.wav"):
50
- signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
51
- raw_duration = librosa.get_duration(y=signal, sr=sample_rate)
52
-
53
- if raw_duration < duration:
54
- # print(f"duration less than {duration} s. skip filename: {filename.as_posix()}")
55
- continue
56
- if signal.ndim != 1:
57
- raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")
58
-
59
- signal_length = len(signal)
60
- win_size = int(duration * sample_rate)
61
- for begin in range(0, signal_length - win_size, win_size):
62
- if np.sum(signal[begin: begin+win_size]) == 0:
63
- continue
64
- row = {
65
- "epoch_idx": epoch_idx,
66
- "filename": filename.as_posix(),
67
- "raw_duration": round(raw_duration, 4),
68
- "offset": round(begin / sample_rate, 4),
69
- "duration": round(duration, 4),
70
- }
71
- yield row
72
-
73
-
74
- def main():
75
- args = get_args()
76
-
77
- noise_dir = Path(args.noise_dir)
78
- speech_dir = Path(args.speech_dir)
79
-
80
- train_dataset = Path(args.train_dataset)
81
- valid_dataset = Path(args.valid_dataset)
82
- train_dataset.parent.mkdir(parents=True, exist_ok=True)
83
- valid_dataset.parent.mkdir(parents=True, exist_ok=True)
84
-
85
- noise_generator = target_second_signal_generator(
86
- noise_dir.as_posix(),
87
- duration=args.duration,
88
- sample_rate=args.target_sample_rate,
89
- max_epoch=100000,
90
- )
91
- speech_generator = target_second_signal_generator(
92
- speech_dir.as_posix(),
93
- duration=args.duration,
94
- sample_rate=args.target_sample_rate,
95
- max_epoch=1,
96
- )
97
-
98
- count = 0
99
- process_bar = tqdm(desc="build dataset jsonl")
100
- with open(args.train_dataset, "w", encoding="utf-8") as ftrain, open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
101
- for noise, speech in zip(noise_generator, speech_generator):
102
- if count >= args.max_count > 0:
103
- break
104
-
105
- # row
106
- noise_filename = noise["filename"]
107
- noise_raw_duration = noise["raw_duration"]
108
- noise_offset = noise["offset"]
109
- noise_duration = noise["duration"]
110
-
111
- speech_filename = speech["filename"]
112
- speech_raw_duration = speech["raw_duration"]
113
- speech_offset = speech["offset"]
114
- speech_duration = speech["duration"]
115
-
116
- # row
117
- random1 = random.random()
118
- random2 = random.random()
119
-
120
- row = {
121
- "count": count,
122
-
123
- "noise_filename": noise_filename,
124
- "noise_raw_duration": noise_raw_duration,
125
- "noise_offset": noise_offset,
126
- "noise_duration": noise_duration,
127
-
128
- "speech_filename": speech_filename,
129
- "speech_raw_duration": speech_raw_duration,
130
- "speech_offset": speech_offset,
131
- "speech_duration": speech_duration,
132
-
133
- "snr_db": random.uniform(args.min_snr_db, args.max_snr_db),
134
-
135
- "random1": random1,
136
- }
137
- row = json.dumps(row, ensure_ascii=False)
138
- if random2 < (1 / 300 / 1):
139
- fvalid.write(f"{row}\n")
140
- else:
141
- ftrain.write(f"{row}\n")
142
-
143
- count += 1
144
- duration_seconds = count * args.duration
145
- duration_hours = duration_seconds / 3600
146
-
147
- process_bar.update(n=1)
148
- process_bar.set_postfix({
149
- "duration_hours": round(duration_hours, 4),
150
- })
151
-
152
- return
153
-
154
-
155
- if __name__ == "__main__":
156
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
examples/fsmn_vad_by_webrtcvad/run.sh CHANGED
@@ -2,7 +2,7 @@
2
 
3
  : <<'END'
4
 
5
- bash run.sh --stage 1 --stop_stage 1 --system_version centos \
6
  --file_folder_name fsmn-vad-by-webrtcvad-nx2-dns3 \
7
  --final_model_name fsmn-vad-by-webrtcvad-nx2-dns3 \
8
  --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/" \
 
2
 
3
  : <<'END'
4
 
5
+ bash run.sh --stage 2 --stop_stage 3 --system_version centos \
6
  --file_folder_name fsmn-vad-by-webrtcvad-nx2-dns3 \
7
  --final_model_name fsmn-vad-by-webrtcvad-nx2-dns3 \
8
  --noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/" \
toolbox/torchaudio/models/vad/cnn_vad/configuration_cnn_vad.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ from typing import Tuple
4
+
5
+ from toolbox.torchaudio.configuration_utils import PretrainedConfig
6
+
7
+
8
+ class CNNVadConfig(PretrainedConfig):
9
+ def __init__(self,
10
+ sample_rate: int = 8000,
11
+ nfft: int = 512,
12
+ win_size: int = 240,
13
+ hop_size: int = 80,
14
+ win_type: str = "hann",
15
+
16
+ conv2d_block_param_list: list = None,
17
+ classifier_hidden_size: int = 128,
18
+
19
+ min_snr_db: float = -10,
20
+ max_snr_db: float = 20,
21
+
22
+ lr: float = 0.001,
23
+ lr_scheduler: str = "CosineAnnealingLR",
24
+ lr_scheduler_kwargs: dict = None,
25
+
26
+ max_epochs: int = 100,
27
+ clip_grad_norm: float = 10.,
28
+ seed: int = 1234,
29
+
30
+ num_workers: int = 4,
31
+ batch_size: int = 4,
32
+ eval_steps: int = 25000,
33
+
34
+ **kwargs
35
+ ):
36
+ super(CNNVadConfig, self).__init__(**kwargs)
37
+ # transform
38
+ self.sample_rate = sample_rate
39
+ self.nfft = nfft
40
+ self.win_size = win_size
41
+ self.hop_size = hop_size
42
+ self.win_type = win_type
43
+
44
+ # encoder
45
+ self.conv2d_block_param_list = conv2d_block_param_list
46
+ self.classifier_hidden_size = classifier_hidden_size
47
+
48
+ # data snr
49
+ self.min_snr_db = min_snr_db
50
+ self.max_snr_db = max_snr_db
51
+
52
+ # train
53
+ self.lr = lr
54
+ self.lr_scheduler = lr_scheduler
55
+ self.lr_scheduler_kwargs = lr_scheduler_kwargs or dict()
56
+
57
+ self.max_epochs = max_epochs
58
+ self.clip_grad_norm = clip_grad_norm
59
+ self.seed = seed
60
+
61
+ self.num_workers = num_workers
62
+ self.batch_size = batch_size
63
+ self.eval_steps = eval_steps
64
+
65
+
66
+ def main():
67
+ config = SileroVadConfig()
68
+ config.to_yaml_file("config.yaml")
69
+ return
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
toolbox/torchaudio/models/vad/cnn_vad/modeling_cnn_vad.py CHANGED
@@ -1,6 +1,169 @@
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  if __name__ == "__main__":
6
- pass
 
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
+ import os
4
+ from typing import Dict, List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from toolbox.torchaudio.configuration_utils import CONFIG_FILE
10
+ from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig
11
+ from toolbox.torchaudio.modules.conv_stft import ConvSTFT
12
+
13
+
14
+ MODEL_FILE = "model.pt"
15
+
16
+
17
+ name2activation = {
18
+ "relu": nn.ReLU,
19
+ }
20
+
21
+
22
+ class Conv2dBlock(nn.Module):
23
+ def __init__(self,
24
+ in_channels: int,
25
+ out_channels: int,
26
+ kernel_size: Union[int, Tuple[int, int]],
27
+ stride: Tuple[int, int],
28
+ padding: str = 0,
29
+ dilation: int = 1,
30
+ batch_norm: bool = False,
31
+ activation: str = None,
32
+ dropout: float = None,
33
+ ):
34
+ super().__init__()
35
+ self.in_channels = in_channels
36
+ self.out_channels = out_channels
37
+ self.kernel_size: Tuple[int, int] = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
38
+
39
+ if batch_norm:
40
+ self.batch_norm = nn.BatchNorm2d(in_channels)
41
+ else:
42
+ self.batch_norm = None
43
+
44
+ self.conv = nn.Conv2d(
45
+ in_channels,
46
+ out_channels,
47
+ kernel_size=kernel_size,
48
+ stride=stride,
49
+ padding=(padding,),
50
+ dilation=(dilation,),
51
+ )
52
+
53
+ if activation is None:
54
+ self.activation = None
55
+ else:
56
+ self.activation = name2activation[activation]()
57
+
58
+ if dropout is not None:
59
+ self.dropout = nn.Dropout(p=dropout)
60
+ else:
61
+ self.dropout = None
62
+
63
+ def forward(self, x: torch.Tensor):
64
+
65
+ if self.batch_norm is not None:
66
+ x = self.batch_norm(x)
67
+
68
+ x = self.conv(x)
69
+
70
+ if self.activation is not None:
71
+ x = self.activation(x)
72
+
73
+ if self.dropout is not None:
74
+ x = self.dropout(x)
75
+
76
+ return x
77
+
78
+
79
+ class CNNVadModel(nn.Module):
80
+ def __init__(self,
81
+ nfft: int,
82
+ win_size: int,
83
+ hop_size: int,
84
+ win_type: str,
85
+ conv2d_block_param_list: List[dict],
86
+ classifier_hidden_size: int,
87
+ ):
88
+ super(CNNVadModel, self).__init__()
89
+ self.nfft = nfft
90
+ self.win_size = win_size
91
+ self.hop_size = hop_size
92
+ self.win_type = win_type
93
+ self.conv2d_block_param_list = conv2d_block_param_list
94
+ self.classifier_hidden_size = classifier_hidden_size
95
+
96
+ self.eps = 1e-12
97
+
98
+ self.stft = ConvSTFT(
99
+ nfft=nfft,
100
+ win_size=win_size,
101
+ hop_size=hop_size,
102
+ win_type=win_type,
103
+ power=1,
104
+ requires_grad=False
105
+ )
106
+
107
+ self.cnn_encoder_list = nn.ModuleList(modules=[
108
+ Conv2dBlock(
109
+ batch_norm=param["batch_norm"],
110
+ in_channels=param["in_channels"],
111
+ out_channels=param["out_channels"],
112
+ kernel_size=param["kernel_size"],
113
+ stride=param["stride"],
114
+ dilation=param["dilation"],
115
+ activation=param["activation"],
116
+ dropout=param["dropout"],
117
+ )
118
+ for param in conv2d_block_param_list
119
+ ])
120
+
121
+ self.classifier = nn.Sequential(
122
+ nn.Linear(classifier_hidden_size, 32),
123
+ nn.ReLU(),
124
+ nn.Linear(32, 1),
125
+ )
126
+
127
+ self.sigmoid = nn.Sigmoid()
128
+
129
+ def forward(self, signal: torch.Tensor):
130
+ if signal.dim() == 2:
131
+ signal = torch.unsqueeze(signal, dim=1)
132
+ _, _, num_samples = signal.shape
133
+ # signal shape [b, 1, num_samples]
134
+
135
+ mags = self.stft.forward(signal)
136
+ # mags shape: [b, f, t]
137
+
138
+ x = torch.transpose(mags, dim0=1, dim1=2)
139
+ # x shape: [b, t, f]
140
+
141
+ x = self.linear.forward(x)
142
+ # x shape: [b, t, f']
143
+
144
+ x = self.encoder.forward(x)
145
+ # x shape: [b, t, f]
146
+
147
+ x, _ = self.lstm.forward(x)
148
+ logits = self.classifier.forward(x)
149
+ # logits shape: [b, t, 1]
150
+ probs = self.sigmoid.forward(logits)
151
+ # probs shape: [b, t, 1]
152
+ return logits, probs
153
+
154
+
155
+ def main():
156
+ config = SileroVadConfig()
157
+ model = SileroVadModel(config=config)
158
+
159
+ noisy = torch.randn(size=(1, 16000), dtype=torch.float32)
160
+
161
+ logits, probs = model.forward(noisy)
162
+ print(f"logits: {probs}")
163
+ print(f"logits.shape: {logits.shape}")
164
+
165
+ return
166
 
167
 
168
  if __name__ == "__main__":
169
+ main()
toolbox/torchaudio/models/vad/cnn_vad/yaml/config.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name: "cnn_vad"
2
+
3
+ # spec
4
+ sample_rate: 8000
5
+ nfft: 512
6
+ win_size: 240
7
+ hop_size: 80
8
+ win_type: hann
9
+
10
+ # model
11
+ conv2d_block_param_list:
12
+ - batch_norm: true
13
+ in_channels: 1
14
+ out_channels: 4
15
+ kernel_size: 3
16
+ stride: 1
17
+ dilation: 3
18
+ activation: relu
19
+ dropout: 0.1
20
+ - in_channels: 4
21
+ out_channels: 4
22
+ kernel_size: 5
23
+ stride: 2
24
+ dilation: 3
25
+ activation: relu
26
+ dropout: 0.1
27
+ - in_channels: 4
28
+ out_channels: 4
29
+ kernel_size: 3
30
+ stride: 1
31
+ dilation: 2
32
+ activation: relu
33
+ dropout: 0.1
34
+
35
+ # data
36
+ min_snr_db: -10
37
+ max_snr_db: 20
38
+
39
+ # train
40
+ lr: 0.001
41
+ lr_scheduler: "CosineAnnealingLR"
42
+ lr_scheduler_kwargs:
43
+ T_max: 250000
44
+ eta_min: 0.0001
45
+
46
+ max_epochs: 100
47
+ clip_grad_norm: 10.0
48
+ seed: 1234
49
+
50
+ num_workers: 4
51
+ batch_size: 128
52
+ eval_steps: 25000