update
Browse files- Dockerfile +2 -0
- examples/fsmn_vad/step_1_prepare_data.py +0 -156
- examples/fsmn_vad_by_webrtcvad/run.sh +1 -1
- toolbox/torchaudio/models/vad/cnn_vad/configuration_cnn_vad.py +73 -0
- toolbox/torchaudio/models/vad/cnn_vad/modeling_cnn_vad.py +164 -1
- toolbox/torchaudio/models/vad/cnn_vad/yaml/config.yaml +52 -0
Dockerfile
CHANGED
@@ -12,6 +12,8 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
|
12 |
|
13 |
RUN bash install.sh --stage 1 --stop_stage 2 --system_version centos
|
14 |
|
|
|
|
|
15 |
USER user
|
16 |
|
17 |
ENV HOME=/home/user \
|
|
|
12 |
|
13 |
RUN bash install.sh --stage 1 --stop_stage 2 --system_version centos
|
14 |
|
15 |
+
RUN mkdir -p logs
|
16 |
+
|
17 |
USER user
|
18 |
|
19 |
ENV HOME=/home/user \
|
examples/fsmn_vad/step_1_prepare_data.py
DELETED
@@ -1,156 +0,0 @@
|
|
1 |
-
#!/usr/bin/python3
|
2 |
-
# -*- coding: utf-8 -*-
|
3 |
-
import argparse
|
4 |
-
import json
|
5 |
-
import os
|
6 |
-
from pathlib import Path
|
7 |
-
import random
|
8 |
-
import sys
|
9 |
-
|
10 |
-
pwd = os.path.abspath(os.path.dirname(__file__))
|
11 |
-
sys.path.append(os.path.join(pwd, "../../"))
|
12 |
-
|
13 |
-
import librosa
|
14 |
-
import numpy as np
|
15 |
-
from tqdm import tqdm
|
16 |
-
|
17 |
-
|
18 |
-
def get_args():
    """Parse the command-line options of the dataset preparation script.

    Every option is optional and falls back to the default listed in the
    table below. The two directory defaults are machine-specific paths kept
    from the original script.

    Returns:
        argparse.Namespace with the attributes ``noise_dir``, ``speech_dir``,
        ``train_dataset``, ``valid_dataset``, ``duration``, ``min_snr_db``,
        ``max_snr_db``, ``target_sample_rate`` and ``max_count``.
    """
    # (flag, default, type) triples; registration order matches the help output order.
    option_table = (
        ("--noise_dir", r"E:\Users\tianx\HuggingDatasets\nx_noise\data\noise", str),
        ("--speech_dir", r"E:\programmer\asr_datasets\aishell\data_aishell\wav\train", str),
        ("--train_dataset", "train.jsonl", str),
        ("--valid_dataset", "valid.jsonl", str),
        ("--duration", 6.0, float),
        ("--min_snr_db", -10, float),
        ("--max_snr_db", 20, float),
        ("--target_sample_rate", 8000, int),
        ("--max_count", -1, int),
    )

    parser = argparse.ArgumentParser()
    for flag, fallback, caster in option_table:
        parser.add_argument(flag, default=fallback, type=caster)

    return parser.parse_args()
|
44 |
-
|
45 |
-
|
46 |
-
def target_second_signal_generator(data_dir: str, duration: float = 6, sample_rate: int = 8000, max_epoch: int = 20000):
    """Yield metadata rows for fixed-length windows of the wav files under ``data_dir``.

    The directory is scanned recursively for ``*.wav`` files once per epoch.
    Each file is loaded at ``sample_rate`` and cut into consecutive,
    non-overlapping windows of ``duration`` seconds. No audio is returned,
    only a description of where each window lives.

    Args:
        data_dir: root directory searched recursively for wav files.
        duration: window length in seconds; shorter files are skipped.
        sample_rate: rate the audio is loaded at.
        max_epoch: number of passes over the directory.

    Yields:
        dict with keys ``epoch_idx``, ``filename``, ``raw_duration``,
        ``offset`` (seconds) and ``duration`` (seconds).

    Raises:
        AssertionError: if a loaded signal is not one-dimensional.
    """
    data_dir = Path(data_dir)
    # Loop-invariant: number of samples in one window.
    win_size = int(duration * sample_rate)

    for epoch_idx in range(max_epoch):
        for filename in data_dir.glob("**/*.wav"):
            signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
            raw_duration = librosa.get_duration(y=signal, sr=sample_rate)

            if raw_duration < duration:
                # Too short to hold even one window.
                continue
            if signal.ndim != 1:
                raise AssertionError(f"expected ndim 1, instead of {signal.ndim}")

            signal_length = len(signal)
            # "+ 1" keeps the last full window when the length is an exact
            # multiple of win_size. Without it a file of exactly `duration`
            # seconds passed the check above but produced no rows.
            for begin in range(0, signal_length - win_size + 1, win_size):
                # Skip windows that contain only zeros (presumably digital
                # silence). np.any is used because a sum of zero does not
                # imply that every sample is zero.
                if not np.any(signal[begin: begin + win_size]):
                    continue
                yield {
                    "epoch_idx": epoch_idx,
                    "filename": filename.as_posix(),
                    "raw_duration": round(raw_duration, 4),
                    "offset": round(begin / sample_rate, 4),
                    "duration": round(duration, 4),
                }
|
72 |
-
|
73 |
-
|
74 |
-
def main():
    """Pair noise and speech windows and write them as train/valid JSONL manifests.

    One noise window and one speech window are drawn per row. The noise
    generator may cycle over its directory many times, while the speech
    generator makes a single pass, so the loop ends when the speech data is
    exhausted or when ``--max_count`` rows have been written (only enforced
    when ``max_count`` is positive). Roughly one row in 300 goes to the
    validation file; the rest go to the training file.
    """
    args = get_args()

    noise_dir = Path(args.noise_dir)
    speech_dir = Path(args.speech_dir)

    train_dataset = Path(args.train_dataset)
    valid_dataset = Path(args.valid_dataset)
    train_dataset.parent.mkdir(parents=True, exist_ok=True)
    valid_dataset.parent.mkdir(parents=True, exist_ok=True)

    noise_generator = target_second_signal_generator(
        noise_dir.as_posix(),
        duration=args.duration,
        sample_rate=args.target_sample_rate,
        max_epoch=100000,
    )
    speech_generator = target_second_signal_generator(
        speech_dir.as_posix(),
        duration=args.duration,
        sample_rate=args.target_sample_rate,
        max_epoch=1,
    )

    count = 0
    # The progress bar is managed by `with` so it is closed on normal exit,
    # on `break` and on errors (the original never closed it).
    with tqdm(desc="build dataset jsonl") as process_bar, \
            open(args.train_dataset, "w", encoding="utf-8") as ftrain, \
            open(args.valid_dataset, "w", encoding="utf-8") as fvalid:
        for noise, speech in zip(noise_generator, speech_generator):
            # Chained comparison: the limit only applies when max_count > 0.
            if count >= args.max_count > 0:
                break

            # Draw order (random1, random2, snr_db) is kept so seeded runs
            # reproduce the same manifests as before.
            random1 = random.random()
            random2 = random.random()

            row = {
                "count": count,

                "noise_filename": noise["filename"],
                "noise_raw_duration": noise["raw_duration"],
                "noise_offset": noise["offset"],
                "noise_duration": noise["duration"],

                "speech_filename": speech["filename"],
                "speech_raw_duration": speech["raw_duration"],
                "speech_offset": speech["offset"],
                "speech_duration": speech["duration"],

                "snr_db": random.uniform(args.min_snr_db, args.max_snr_db),

                "random1": random1,
            }
            row = json.dumps(row, ensure_ascii=False)
            # About 1 in 300 rows is held out for validation.
            if random2 < (1 / 300 / 1):
                fvalid.write(f"{row}\n")
            else:
                ftrain.write(f"{row}\n")

            count += 1
            duration_seconds = count * args.duration
            duration_hours = duration_seconds / 3600

            process_bar.update(n=1)
            process_bar.set_postfix({
                "duration_hours": round(duration_hours, 4),
            })

    return
|
153 |
-
|
154 |
-
|
155 |
-
if __name__ == "__main__":
|
156 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples/fsmn_vad_by_webrtcvad/run.sh
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
|
3 |
: <<'END'
|
4 |
|
5 |
-
bash run.sh --stage
|
6 |
--file_folder_name fsmn-vad-by-webrtcvad-nx2-dns3 \
|
7 |
--final_model_name fsmn-vad-by-webrtcvad-nx2-dns3 \
|
8 |
--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/" \
|
|
|
2 |
|
3 |
: <<'END'
|
4 |
|
5 |
+
bash run.sh --stage 2 --stop_stage 3 --system_version centos \
|
6 |
--file_folder_name fsmn-vad-by-webrtcvad-nx2-dns3 \
|
7 |
--final_model_name fsmn-vad-by-webrtcvad-nx2-dns3 \
|
8 |
--noise_dir "/data/tianxing/HuggingDatasets/nx_noise/data/noise/" \
|
toolbox/torchaudio/models/vad/cnn_vad/configuration_cnn_vad.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from typing import Tuple
|
4 |
+
|
5 |
+
from toolbox.torchaudio.configuration_utils import PretrainedConfig
|
6 |
+
|
7 |
+
|
8 |
+
class CNNVadConfig(PretrainedConfig):
    """Hyper-parameters for the CNN VAD model and its training run.

    The values are stored as plain attributes, grouped into: spectrogram
    transform, encoder/classifier sizes, training-data SNR range, optimizer
    schedule and data-loader settings. Extra keyword arguments are forwarded
    to ``PretrainedConfig``.
    """

    def __init__(self,
                 sample_rate: int = 8000,
                 nfft: int = 512,
                 win_size: int = 240,
                 hop_size: int = 80,
                 win_type: str = "hann",

                 conv2d_block_param_list: list = None,
                 classifier_hidden_size: int = 128,

                 min_snr_db: float = -10,
                 max_snr_db: float = 20,

                 lr: float = 0.001,
                 lr_scheduler: str = "CosineAnnealingLR",
                 lr_scheduler_kwargs: dict = None,

                 max_epochs: int = 100,
                 clip_grad_norm: float = 10.,
                 seed: int = 1234,

                 num_workers: int = 4,
                 batch_size: int = 4,
                 eval_steps: int = 25000,

                 **kwargs
                 ):
        super(CNNVadConfig, self).__init__(**kwargs)
        # transform
        self.sample_rate = sample_rate
        self.nfft = nfft
        self.win_size = win_size
        self.hop_size = hop_size
        self.win_type = win_type

        # encoder
        # Normalized to a list, as lr_scheduler_kwargs is normalized to a
        # dict below, so consumers can iterate it without a None check.
        self.conv2d_block_param_list = conv2d_block_param_list or list()
        self.classifier_hidden_size = classifier_hidden_size

        # data snr
        self.min_snr_db = min_snr_db
        self.max_snr_db = max_snr_db

        # train
        self.lr = lr
        self.lr_scheduler = lr_scheduler
        self.lr_scheduler_kwargs = lr_scheduler_kwargs or dict()

        self.max_epochs = max_epochs
        self.clip_grad_norm = clip_grad_norm
        self.seed = seed

        self.num_workers = num_workers
        self.batch_size = batch_size
        self.eval_steps = eval_steps
|
64 |
+
|
65 |
+
|
66 |
+
def main():
    """Build a default ``CNNVadConfig`` and dump it with ``to_yaml_file("config.yaml")``."""
    # The original instantiated SileroVadConfig, which is neither defined nor
    # imported in this module and raised NameError. This module's own config
    # class is the one that should be dumped.
    config = CNNVadConfig()
    config.to_yaml_file("config.yaml")
    return
|
70 |
+
|
71 |
+
|
72 |
+
if __name__ == "__main__":
|
73 |
+
main()
|
toolbox/torchaudio/models/vad/cnn_vad/modeling_cnn_vad.py
CHANGED
@@ -1,6 +1,169 @@
|
|
1 |
#!/usr/bin/python3
|
2 |
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
|
5 |
if __name__ == "__main__":
|
6 |
-
|
|
|
1 |
#!/usr/bin/python3
|
2 |
# -*- coding: utf-8 -*-
|
3 |
+
import os
|
4 |
+
from typing import Dict, List, Optional, Tuple, Union
|
5 |
+
|
6 |
+
import torch
|
7 |
+
import torch.nn as nn
|
8 |
+
|
9 |
+
from toolbox.torchaudio.configuration_utils import CONFIG_FILE
|
10 |
+
from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig
|
11 |
+
from toolbox.torchaudio.modules.conv_stft import ConvSTFT
|
12 |
+
|
13 |
+
|
14 |
+
MODEL_FILE = "model.pt"
|
15 |
+
|
16 |
+
|
17 |
+
name2activation = {
|
18 |
+
"relu": nn.ReLU,
|
19 |
+
}
|
20 |
+
|
21 |
+
|
22 |
+
class Conv2dBlock(nn.Module):
    """Optional BatchNorm2d, then Conv2d, then optional activation and dropout.

    Batch normalization, when enabled, is applied to the block *input*
    (``in_channels`` features), before the convolution.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels produced by the convolution.
        kernel_size: int, or a pair given as tuple or list (for example when
            parsed from YAML).
        stride: int or pair, passed to ``nn.Conv2d`` unchanged.
        padding: int, pair, or a padding-mode string accepted by ``nn.Conv2d``.
        dilation: int or pair, passed to ``nn.Conv2d`` unchanged.
        batch_norm: whether to normalize the input before the convolution.
        activation: key into ``name2activation``; ``None`` disables it.
        dropout: dropout probability; ``None`` disables dropout.

    Raises:
        KeyError: if ``activation`` is not a key of ``name2activation``.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]],
                 padding: Union[int, Tuple[int, int], str] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 batch_norm: bool = False,
                 activation: Optional[str] = None,
                 dropout: Optional[float] = None,
                 ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Lists are accepted as well as tuples, so a YAML `[3, 5]` becomes
        # (3, 5) rather than a pair of lists.
        if isinstance(kernel_size, (tuple, list)):
            self.kernel_size: Tuple[int, int] = tuple(kernel_size)
        else:
            self.kernel_size: Tuple[int, int] = (kernel_size, kernel_size)

        self.batch_norm = nn.BatchNorm2d(in_channels) if batch_norm else None

        # padding and dilation are passed through as given. The original
        # wrapped them in 1-tuples, which turned a pair into a nested tuple
        # and a string such as "same" into an invalid ("same",).
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=self.kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
        )

        if activation is None:
            self.activation = None
        else:
            self.activation = name2activation[activation]()

        if dropout is not None:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = None

    def forward(self, x: torch.Tensor):
        """Apply the enabled stages in order: batch norm, conv, activation, dropout."""
        if self.batch_norm is not None:
            x = self.batch_norm(x)

        x = self.conv(x)

        if self.activation is not None:
            x = self.activation(x)

        if self.dropout is not None:
            x = self.dropout(x)

        return x
|
77 |
+
|
78 |
+
|
79 |
+
class CNNVadModel(nn.Module):
    """Frame-level voice activity detector: STFT magnitudes, a Conv2d stack, an MLP head.

    Args:
        nfft, win_size, hop_size, win_type: forwarded to ``ConvSTFT``.
        conv2d_block_param_list: one dict per ``Conv2dBlock``. Required keys
            are ``in_channels``, ``out_channels``, ``kernel_size`` and
            ``stride``. ``batch_norm``, ``padding``, ``dilation``,
            ``activation`` and ``dropout`` are optional.
        classifier_hidden_size: input width of the classifier. It must equal
            ``channels * frequency_bins`` of the encoder output.
    """

    def __init__(self,
                 nfft: int,
                 win_size: int,
                 hop_size: int,
                 win_type: str,
                 conv2d_block_param_list: List[dict],
                 classifier_hidden_size: int,
                 ):
        super(CNNVadModel, self).__init__()
        self.nfft = nfft
        self.win_size = win_size
        self.hop_size = hop_size
        self.win_type = win_type
        self.conv2d_block_param_list = conv2d_block_param_list
        self.classifier_hidden_size = classifier_hidden_size

        self.eps = 1e-12

        self.stft = ConvSTFT(
            nfft=nfft,
            win_size=win_size,
            hop_size=hop_size,
            win_type=win_type,
            power=1,
            requires_grad=False
        )

        # Optional keys are read with .get(): the shipped YAML omits
        # `batch_norm` on some blocks, which made param["batch_norm"] raise
        # KeyError.
        self.cnn_encoder_list = nn.ModuleList(modules=[
            Conv2dBlock(
                batch_norm=param.get("batch_norm", False),
                in_channels=param["in_channels"],
                out_channels=param["out_channels"],
                kernel_size=param["kernel_size"],
                stride=param["stride"],
                padding=param.get("padding", 0),
                dilation=param.get("dilation", 1),
                activation=param.get("activation"),
                dropout=param.get("dropout"),
            )
            for param in conv2d_block_param_list
        ])

        self.classifier = nn.Sequential(
            nn.Linear(classifier_hidden_size, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

        self.sigmoid = nn.Sigmoid()

    def forward(self, signal: torch.Tensor):
        """Return per-frame ``(logits, probs)`` for a batch of waveforms.

        Args:
            signal: waveform batch, ``[b, num_samples]`` or ``[b, 1, num_samples]``.

        Returns:
            ``logits`` and ``probs``, both ``[b, t', 1]``, where ``t'`` is the
            number of frames left after the convolution stack.

        Raises:
            ValueError: if ``signal`` is not 2-D/3-D, or the flattened encoder
                width differs from ``classifier_hidden_size``.
        """
        if signal.dim() == 2:
            signal = torch.unsqueeze(signal, dim=1)
        if signal.dim() != 3:
            raise ValueError(f"expected signal of shape [b, num_samples] or [b, 1, num_samples], got {tuple(signal.shape)}")
        # signal shape: [b, 1, num_samples]

        mags = self.stft.forward(signal)
        # mags shape: [b, f, t] (taken from the original comment; TODO confirm against ConvSTFT)

        x = torch.transpose(mags, dim0=1, dim1=2)
        # x shape: [b, t, f]

        # NOTE(review): the original forward called self.linear, self.encoder
        # and self.lstm, none of which are created in __init__, and never used
        # cnn_encoder_list. The convolution stack is applied here instead;
        # confirm this is the intended architecture.
        x = torch.unsqueeze(x, dim=1)
        # x shape: [b, 1, t, f], so the first block must have in_channels == 1
        for block in self.cnn_encoder_list:
            x = block(x)
        # x shape: [b, c, t', f']

        x = x.permute(0, 2, 1, 3)
        x = torch.flatten(x, start_dim=2)
        # x shape: [b, t', c * f']

        if x.shape[-1] != self.classifier_hidden_size:
            raise ValueError(
                f"encoder output width {x.shape[-1]} does not match "
                f"classifier_hidden_size {self.classifier_hidden_size}"
            )

        logits = self.classifier(x)
        # logits shape: [b, t', 1]
        probs = self.sigmoid(logits)
        # probs shape: [b, t', 1]
        return logits, probs
|
153 |
+
|
154 |
+
|
155 |
+
def main():
    """Smoke test: build a ``CNNVadModel`` and run one forward pass on random audio."""
    # The original built SileroVadModel, which is neither defined nor
    # imported in this module, from a config class that belongs to a
    # different model. The model defined here is built directly instead.
    # Every block lists all keys, including batch_norm, explicitly.
    model = CNNVadModel(
        nfft=512,
        win_size=240,
        hop_size=80,
        win_type="hann",
        conv2d_block_param_list=[
            {
                "batch_norm": True, "in_channels": 1, "out_channels": 4,
                "kernel_size": 3, "stride": 1, "dilation": 3,
                "activation": "relu", "dropout": 0.1,
            },
            {
                "batch_norm": False, "in_channels": 4, "out_channels": 4,
                "kernel_size": 5, "stride": 2, "dilation": 3,
                "activation": "relu", "dropout": 0.1,
            },
            {
                "batch_norm": False, "in_channels": 4, "out_channels": 4,
                "kernel_size": 3, "stride": 1, "dilation": 2,
                "activation": "relu", "dropout": 0.1,
            },
        ],
        # NOTE(review): 464 = 4 channels x 116 frequency bins. This assumes
        # ConvSTFT emits nfft // 2 + 1 = 257 bins and that the classifier
        # consumes channels x bins after the three un-padded convolutions
        # (257 -> 251 -> 120 -> 116). Confirm.
        classifier_hidden_size=464,
    )

    noisy = torch.randn(size=(1, 16000), dtype=torch.float32)

    logits, probs = model.forward(noisy)
    # Label fixed: this line prints the probabilities, not the logits.
    print(f"probs: {probs}")
    print(f"logits.shape: {logits.shape}")

    return
|
166 |
|
167 |
|
168 |
if __name__ == "__main__":
|
169 |
+
main()
|
toolbox/torchaudio/models/vad/cnn_vad/yaml/config.yaml
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model_name: "cnn_vad"
|
2 |
+
|
3 |
+
# spec
|
4 |
+
sample_rate: 8000
|
5 |
+
nfft: 512
|
6 |
+
win_size: 240
|
7 |
+
hop_size: 80
|
8 |
+
win_type: hann
|
9 |
+
|
10 |
+
# model
|
11 |
+
conv2d_block_param_list:
|
12 |
+
- batch_norm: true
|
13 |
+
in_channels: 1
|
14 |
+
out_channels: 4
|
15 |
+
kernel_size: 3
|
16 |
+
stride: 1
|
17 |
+
dilation: 3
|
18 |
+
activation: relu
|
19 |
+
dropout: 0.1
|
20 |
+
- in_channels: 4
|
21 |
+
out_channels: 4
|
22 |
+
kernel_size: 5
|
23 |
+
stride: 2
|
24 |
+
dilation: 3
|
25 |
+
activation: relu
|
26 |
+
dropout: 0.1
|
27 |
+
- in_channels: 4
|
28 |
+
out_channels: 4
|
29 |
+
kernel_size: 3
|
30 |
+
stride: 1
|
31 |
+
dilation: 2
|
32 |
+
activation: relu
|
33 |
+
dropout: 0.1
|
34 |
+
|
35 |
+
# data
|
36 |
+
min_snr_db: -10
|
37 |
+
max_snr_db: 20
|
38 |
+
|
39 |
+
# train
|
40 |
+
lr: 0.001
|
41 |
+
lr_scheduler: "CosineAnnealingLR"
|
42 |
+
lr_scheduler_kwargs:
|
43 |
+
T_max: 250000
|
44 |
+
eta_min: 0.0001
|
45 |
+
|
46 |
+
max_epochs: 100
|
47 |
+
clip_grad_norm: 10.0
|
48 |
+
seed: 1234
|
49 |
+
|
50 |
+
num_workers: 4
|
51 |
+
batch_size: 128
|
52 |
+
eval_steps: 25000
|