update
examples/silero_vad_by_webrtcvad/yaml/config.yaml
CHANGED
@@ -8,6 +8,7 @@ hop_size: 80
 win_type: hann
 
 # model
+conv_channels: 32
 hidden_size: 80
 kernel_size:
 - 3
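
The new key can be consumed as a plain keyword argument. A minimal loading sketch, assuming the YAML keys map one-to-one onto SileroVadConfig keyword arguments and the repository root is on PYTHONPATH (the toolbox's own loading helper is not part of this diff):

import yaml

from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig

# Hypothetical loading path; mirrors the file changed above.
with open("examples/silero_vad_by_webrtcvad/yaml/config.yaml", "r", encoding="utf-8") as f:
    kwargs = yaml.safe_load(f)

config = SileroVadConfig(**kwargs)
print(config.conv_channels)   # 32, the default added in this commit
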
toolbox/torchaudio/models/vad/silero_vad/configuration_silero_vad.py
CHANGED
@@ -13,8 +13,8 @@ class SileroVadConfig(PretrainedConfig):
                  hop_size: int = 80,
                  win_type: str = "hann",
 
-
-                 hidden_size: int =
+                 conv_channels: int = 32,
+                 hidden_size: int = 80,
                  kernel_size: Tuple[int, int] = (3, 3),
 
                  n_frame: int = 3,
@@ -48,7 +48,7 @@ class SileroVadConfig(PretrainedConfig):
         self.win_type = win_type
 
         # encoder
-        self.
+        self.conv_channels = conv_channels
         self.hidden_size = hidden_size
         self.kernel_size = kernel_size
 
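
A quick construction sketch against the updated signature, assuming the constructor arguments not shown in this hunk all have defaults and that the import path follows the file location above:

from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig

config = SileroVadConfig(conv_channels=32, hidden_size=80)   # other arguments left at their defaults
assert config.conv_channels == 32
assert config.kernel_size == (3, 3)
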
toolbox/torchaudio/models/vad/silero_vad/modeling_silero_vad.py
CHANGED
@@ -132,6 +132,7 @@ class CausalConv2d(nn.Module):
 
 class CausalEncoder(nn.Module):
     def __init__(self,
+                 conv_channels: int,
                  kernel_size: Tuple[int, int] = (3, 3),
                  num_layers: int = 3,
                  ):
@@ -139,7 +140,7 @@ class CausalEncoder(nn.Module):
         self.layers: List[CausalConv2d] = nn.ModuleList(modules=[
             CausalConv2d(
                 in_channels=1,
-                out_channels=
+                out_channels=conv_channels,
                 kernel_size=kernel_size,
                 bias=False,
                 separable=True,
@@ -147,8 +148,8 @@ class CausalEncoder(nn.Module):
             )
             if i == 0 else
             CausalConv2d(
-                in_channels=
-                out_channels=
+                in_channels=conv_channels,
+                out_channels=conv_channels,
                 kernel_size=kernel_size,
                 bias=False,
                 separable=True,
@@ -160,7 +161,7 @@ class CausalEncoder(nn.Module):
     def forward(self, x: torch.Tensor, cache_list: List[torch.Tensor] = None):
         # x shape: [b, t, f]
         x = torch.unsqueeze(x, dim=1)
-        # x shape: [b,
+        # x shape: [b, 1, t, f]
 
         new_cache_list = list()
         for idx, layer in enumerate(self.layers):
@@ -169,8 +170,11 @@ class CausalEncoder(nn.Module):
             new_cache_list.append(new_cache)
 
         # x shape: [b, c, t, f]
-        x =
-        # x shape: [b, t, f]
+        x = x.permute(0, 2, 1, 3)
+        # x shape: [b, t, c, f]
+        b, t, c, f = x.shape
+        x = torch.reshape(x, shape=(b, t, c*f))
+        # x shape: [b, t, c*f]
         return x, new_cache_list
 
 
@@ -182,6 +186,7 @@ class SileroVadModel(nn.Module):
                  hop_size: int,
                  win_type: int,
 
+                 conv_channels: int,
                  hidden_size: int,
                  kernel_size: Tuple[int, int],
 
@@ -197,6 +202,7 @@ class SileroVadModel(nn.Module):
         self.hop_size = hop_size
         self.win_type = win_type
 
+        self.conv_channels = conv_channels
         self.hidden_size = hidden_size
         self.kernel_size = kernel_size
 
@@ -229,11 +235,12 @@ class SileroVadModel(nn.Module):
         )
 
         self.encoder = CausalEncoder(
+            conv_channels=conv_channels,
             kernel_size=(3, 3),
         )
 
         self.lstm = nn.LSTM(
-            input_size=self.hidden_size,
+            input_size=self.conv_channels * self.hidden_size,
            hidden_size=self.hidden_size,
             bidirectional=False,
             batch_first=True
@@ -338,6 +345,7 @@ class SileroVadPretrainedModel(SileroVadModel):
             win_size=config.win_size,
             hop_size=config.hop_size,
             win_type=config.win_type,
+            conv_channels=config.conv_channels,
             hidden_size=config.hidden_size,
             kernel_size=config.kernel_size,
             n_frame=config.n_frame,
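
As a sanity check on the new wiring, a standalone sketch (not code from this repository) of the permute/reshape added to CausalEncoder.forward together with the LSTM's updated input_size; it assumes the encoder's frequency dimension equals hidden_size, which is what input_size = conv_channels * hidden_size implies:

import torch
import torch.nn as nn

b, t = 2, 100                  # batch size, number of frames
conv_channels = 32             # new config default
hidden_size = 80               # existing config default, taken here as the frequency dim f

x = torch.randn(b, conv_channels, t, hidden_size)                  # encoder output: [b, c, t, f]
x = x.permute(0, 2, 1, 3)                                          # [b, t, c, f]
x = torch.reshape(x, shape=(b, t, conv_channels * hidden_size))    # [b, t, c*f]

lstm = nn.LSTM(
    input_size=conv_channels * hidden_size,   # 32 * 80 = 2560, matching the diff
    hidden_size=hidden_size,
    bidirectional=False,
    batch_first=True,
)
out, _ = lstm(x)
print(out.shape)   # torch.Size([2, 100, 80])
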
toolbox/torchaudio/models/vad/silero_vad/yaml/config.yaml
CHANGED
@@ -8,6 +8,7 @@ hop_size: 80
 win_type: hann
 
 # model
+conv_channels: 32
 hidden_size: 80
 kernel_size:
 - 3