HoneyTian committed on
Commit
ff7995b
·
1 Parent(s): 8051e41
examples/silero_vad_by_webrtcvad/yaml/config.yaml CHANGED
@@ -8,6 +8,7 @@ hop_size: 80
8
  win_type: hann
9
 
10
  # model
 
11
  hidden_size: 80
12
  kernel_size:
13
  - 3
 
8
  win_type: hann
9
 
10
  # model
11
+ conv_channels: 32
12
  hidden_size: 80
13
  kernel_size:
14
  - 3
toolbox/torchaudio/models/vad/silero_vad/configuration_silero_vad.py CHANGED
@@ -13,8 +13,8 @@ class SileroVadConfig(PretrainedConfig):
13
  hop_size: int = 80,
14
  win_type: str = "hann",
15
 
16
- in_channels: int = 64,
17
- hidden_size: int = 128,
18
  kernel_size: Tuple[int, int] = (3, 3),
19
 
20
  n_frame: int = 3,
@@ -48,7 +48,7 @@ class SileroVadConfig(PretrainedConfig):
48
  self.win_type = win_type
49
 
50
  # encoder
51
- self.in_channels = in_channels
52
  self.hidden_size = hidden_size
53
  self.kernel_size = kernel_size
54
 
 
13
  hop_size: int = 80,
14
  win_type: str = "hann",
15
 
16
+ conv_channels: int = 32,
17
+ hidden_size: int = 80,
18
  kernel_size: Tuple[int, int] = (3, 3),
19
 
20
  n_frame: int = 3,
 
48
  self.win_type = win_type
49
 
50
  # encoder
51
+ self.conv_channels = conv_channels
52
  self.hidden_size = hidden_size
53
  self.kernel_size = kernel_size
54
 
toolbox/torchaudio/models/vad/silero_vad/modeling_silero_vad.py CHANGED
@@ -132,6 +132,7 @@ class CausalConv2d(nn.Module):
132
 
133
  class CausalEncoder(nn.Module):
134
  def __init__(self,
 
135
  kernel_size: Tuple[int, int] = (3, 3),
136
  num_layers: int = 3,
137
  ):
@@ -139,7 +140,7 @@ class CausalEncoder(nn.Module):
139
  self.layers: List[CausalConv2d] = nn.ModuleList(modules=[
140
  CausalConv2d(
141
  in_channels=1,
142
- out_channels=1,
143
  kernel_size=kernel_size,
144
  bias=False,
145
  separable=True,
@@ -147,8 +148,8 @@ class CausalEncoder(nn.Module):
147
  )
148
  if i == 0 else
149
  CausalConv2d(
150
- in_channels=1,
151
- out_channels=1,
152
  kernel_size=kernel_size,
153
  bias=False,
154
  separable=True,
@@ -160,7 +161,7 @@ class CausalEncoder(nn.Module):
160
  def forward(self, x: torch.Tensor, cache_list: List[torch.Tensor] = None):
161
  # x shape: [b, t, f]
162
  x = torch.unsqueeze(x, dim=1)
163
- # x shape: [b, c, t, f]
164
 
165
  new_cache_list = list()
166
  for idx, layer in enumerate(self.layers):
@@ -169,8 +170,11 @@ class CausalEncoder(nn.Module):
169
  new_cache_list.append(new_cache)
170
 
171
  # x shape: [b, c, t, f]
172
- x = torch.squeeze(x, dim=1)
173
- # x shape: [b, t, f]
 
 
 
174
  return x, new_cache_list
175
 
176
 
@@ -182,6 +186,7 @@ class SileroVadModel(nn.Module):
182
  hop_size: int,
183
  win_type: int,
184
 
 
185
  hidden_size: int,
186
  kernel_size: Tuple[int, int],
187
 
@@ -197,6 +202,7 @@ class SileroVadModel(nn.Module):
197
  self.hop_size = hop_size
198
  self.win_type = win_type
199
 
 
200
  self.hidden_size = hidden_size
201
  self.kernel_size = kernel_size
202
 
@@ -229,11 +235,12 @@ class SileroVadModel(nn.Module):
229
  )
230
 
231
  self.encoder = CausalEncoder(
 
232
  kernel_size=(3, 3),
233
  )
234
 
235
  self.lstm = nn.LSTM(
236
- input_size=self.hidden_size,
237
  hidden_size=self.hidden_size,
238
  bidirectional=False,
239
  batch_first=True
@@ -338,6 +345,7 @@ class SileroVadPretrainedModel(SileroVadModel):
338
  win_size=config.win_size,
339
  hop_size=config.hop_size,
340
  win_type=config.win_type,
 
341
  hidden_size=config.hidden_size,
342
  kernel_size=config.kernel_size,
343
  n_frame=config.n_frame,
 
132
 
133
  class CausalEncoder(nn.Module):
134
  def __init__(self,
135
+ conv_channels: int,
136
  kernel_size: Tuple[int, int] = (3, 3),
137
  num_layers: int = 3,
138
  ):
 
140
  self.layers: List[CausalConv2d] = nn.ModuleList(modules=[
141
  CausalConv2d(
142
  in_channels=1,
143
+ out_channels=conv_channels,
144
  kernel_size=kernel_size,
145
  bias=False,
146
  separable=True,
 
148
  )
149
  if i == 0 else
150
  CausalConv2d(
151
+ in_channels=conv_channels,
152
+ out_channels=conv_channels,
153
  kernel_size=kernel_size,
154
  bias=False,
155
  separable=True,
 
161
  def forward(self, x: torch.Tensor, cache_list: List[torch.Tensor] = None):
162
  # x shape: [b, t, f]
163
  x = torch.unsqueeze(x, dim=1)
164
+ # x shape: [b, 1, t, f]
165
 
166
  new_cache_list = list()
167
  for idx, layer in enumerate(self.layers):
 
170
  new_cache_list.append(new_cache)
171
 
172
  # x shape: [b, c, t, f]
173
+ x = x.permute(0, 2, 1, 3)
174
+ # x shape: [b, t, c, f]
175
+ b, t, c, f = x.shape
176
+ x = torch.reshape(x, shape=(b, t, c*f))
177
+ # x shape: [b, t, c*f]
178
  return x, new_cache_list
179
 
180
 
 
186
  hop_size: int,
187
  win_type: int,
188
 
189
+ conv_channels: int,
190
  hidden_size: int,
191
  kernel_size: Tuple[int, int],
192
 
 
202
  self.hop_size = hop_size
203
  self.win_type = win_type
204
 
205
+ self.conv_channels = conv_channels
206
  self.hidden_size = hidden_size
207
  self.kernel_size = kernel_size
208
 
 
235
  )
236
 
237
  self.encoder = CausalEncoder(
238
+ conv_channels=conv_channels,
239
  kernel_size=(3, 3),
240
  )
241
 
242
  self.lstm = nn.LSTM(
243
+ input_size=self.conv_channels * self.hidden_size,
244
  hidden_size=self.hidden_size,
245
  bidirectional=False,
246
  batch_first=True
 
345
  win_size=config.win_size,
346
  hop_size=config.hop_size,
347
  win_type=config.win_type,
348
+ conv_channels=config.conv_channels,
349
  hidden_size=config.hidden_size,
350
  kernel_size=config.kernel_size,
351
  n_frame=config.n_frame,
toolbox/torchaudio/models/vad/silero_vad/yaml/config.yaml CHANGED
@@ -8,6 +8,7 @@ hop_size: 80
8
  win_type: hann
9
 
10
  # model
 
11
  hidden_size: 80
12
  kernel_size:
13
  - 3
 
8
  win_type: hann
9
 
10
  # model
11
+ conv_channels: 32
12
  hidden_size: 80
13
  kernel_size:
14
  - 3