HoneyTian committed
Commit 8051e41 · 1 Parent(s): 00e4381
examples/silero_vad_by_webrtcvad/yaml/config.yaml CHANGED
@@ -8,8 +8,10 @@ hop_size: 80
 win_type: hann
 
 # model
-in_channels: 64
-hidden_size: 128
+hidden_size: 80
+kernel_size:
+- 3
+- 3
 
 # lsnr
 n_frame: 3
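Note: the new `# model` block swaps the Conv1d width settings (`in_channels`, `hidden_size: 128`) for the causal-encoder settings (`hidden_size: 80` plus a 2-D `kernel_size`). A minimal sketch of consuming the file, assuming the repo's usual pattern of passing the parsed YAML as `SileroVadConfig` keyword arguments (the loader itself is not part of this diff):

import yaml

from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig

# Hypothetical loader: parse the YAML and forward the keys as config kwargs.
with open("examples/silero_vad_by_webrtcvad/yaml/config.yaml", "r", encoding="utf-8") as f:
    kwargs = yaml.safe_load(f)

config = SileroVadConfig(**kwargs)
# YAML yields kernel_size as a list [3, 3], not a tuple; CausalConv2d accepts
# any iterable and normalizes it with tuple(kernel_size), so both forms work.
print(config.hidden_size, config.kernel_size)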
main.py CHANGED
@@ -240,10 +240,10 @@ def main():
             with gr.Row():
                 vad_start_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label="start_ring_rate")
                 vad_end_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="end_ring_rate")
-                vad_min_silence_length = gr.Number(value=2, label="min_silence_length")
+                vad_min_silence_length = gr.Number(value=30, label="min_silence_length")
             with gr.Row():
                 vad_max_speech_length = gr.Number(value=100000, label="max_speech_length")
-                vad_min_speech_length = gr.Number(value=10, label="min_speech_length")
+                vad_min_speech_length = gr.Number(value=15, label="min_speech_length")
             vad_engine = gr.Dropdown(choices=vad_engine_choices, value=vad_engine_choices[0], label="engine")
             vad_button = gr.Button(variant="primary")
         with gr.Column(variant="panel", scale=5):
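Note: this hunk only retunes UI defaults (min_silence_length 2 → 30, min_speech_length 10 → 15; the units are not stated in the diff, though the new magnitudes suggest frame counts). As background, a hypothetical sketch of the kind of ring-buffer gating that `start_ring_rate` / `end_ring_rate` parameters conventionally control; `ring_vad` and its arguments are illustrative names, not repo code:

from collections import deque
from typing import Iterator, List, Tuple

def ring_vad(frame_probs: List[float],
             start_ring_rate: float = 0.5,
             end_ring_rate: float = 0.3,
             ring_size: int = 10,
             ) -> Iterator[Tuple[str, int]]:
    # Keep the last `ring_size` per-frame decisions; open a speech segment
    # when the active fraction reaches start_ring_rate, close it when the
    # fraction falls to end_ring_rate.
    ring = deque(maxlen=ring_size)
    triggered = False
    for t, p in enumerate(frame_probs):
        ring.append(1 if p > 0.5 else 0)
        rate = sum(ring) / len(ring)
        if not triggered and rate >= start_ring_rate:
            triggered = True
            yield "start", t
        elif triggered and rate <= end_ring_rate:
            triggered = False
            yield "end", t

print(list(ring_vad([0.1] * 5 + [0.9] * 20 + [0.1] * 20)))  # [('start', 9), ('end', 31)]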
toolbox/torchaudio/models/vad/silero_vad/configuration_silero_vad.py CHANGED
@@ -15,6 +15,7 @@ class SileroVadConfig(PretrainedConfig):
 
                  in_channels: int = 64,
                  hidden_size: int = 128,
+                 kernel_size: Tuple[int, int] = (3, 3),
 
                  n_frame: int = 3,
                  min_local_snr_db: float = -15,
@@ -49,6 +50,7 @@ class SileroVadConfig(PretrainedConfig):
         # encoder
         self.in_channels = in_channels
         self.hidden_size = hidden_size
+        self.kernel_size = kernel_size
 
         # lsnr
         self.n_frame = n_frame
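Note: the new field is annotated `Tuple[int, int]`, but the diff shows no matching `from typing import Tuple` hunk for this file, so that import is presumably already present there (or still needs adding). A minimal sketch of the resulting constructor surface, assuming the remaining fields keep their defaults:

from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig

config = SileroVadConfig(
    hidden_size=80,
    kernel_size=(3, 3),  # plumbed through SileroVadPretrainedModel into CausalEncoder
)
assert config.kernel_size == (3, 3)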
toolbox/torchaudio/models/vad/silero_vad/modeling_silero_vad.py CHANGED
@@ -8,8 +8,9 @@ https://github.com/snakers4/silero-vad
 
 https://github.com/snakers4/silero-vad/blob/master/src/silero_vad/data/silero_vad.jit
 """
+import math
 import os
-from typing import Optional, Union
+from typing import List, Optional, Union, Iterable, Tuple
 
 import torch
 import torch.nn as nn
@@ -24,61 +25,153 @@ from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
 MODEL_FILE = "model.pt"
 
 
-class EncoderBlock(nn.Module):
-    def __init__(self,
-                 in_channels: int = 64,
-                 out_channels: int = 128,
-                 ):
-        super(EncoderBlock, self).__init__()
-        self.conv1d = nn.Conv1d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            padding="same",
-        )
-        self.activation = nn.ReLU()
-        self.norm = nn.BatchNorm1d(out_channels)
-
-    def forward(self, x: torch.Tensor):
-        # x shape: [b, t, f]
-        x = torch.transpose(x, dim0=1, dim1=2)
-        # x shape: [b, f, t]
-
-        x = self.conv1d.forward(x)
-        x = self.activation(x)
-        x = self.norm(x)
-
-        x = torch.transpose(x, dim0=1, dim1=2)
-        # x shape: [b, t, f]
-
-        return x
-
-
-class Encoder(nn.Module):
-    def __init__(self,
-                 in_channels: int = 64,
-                 out_channels: int = 128,
-                 num_layers: int = 3,
-                 ):
-        super(Encoder, self).__init__()
-
-        self.layers = nn.ModuleList(modules=[
-            EncoderBlock(
-                in_channels=in_channels,
-                out_channels=out_channels,
-            )
-            if i == 0 else
-            EncoderBlock(
-                in_channels=out_channels,
-                out_channels=out_channels,
-            )
-            for i in range(num_layers)
-        ])
-
-    def forward(self, x: torch.Tensor):
-        for layer in self.layers:
-            x = layer.forward(x)
-        return x
+norm_layer_dict = {
+    "batch_norm_2d": torch.nn.BatchNorm2d
+}
+
+
+activation_layer_dict = {
+    "relu": torch.nn.ReLU,
+    "identity": torch.nn.Identity,
+    "sigmoid": torch.nn.Sigmoid,
+}
+
+
+class CausalConv2d(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Iterable[int]],
+                 fstride: int = 1,
+                 dilation: int = 1,
+                 pad_f_dim: bool = True,
+                 bias: bool = True,
+                 separable: bool = False,
+                 norm_layer: str = "batch_norm_2d",
+                 activation_layer: str = "relu",
+                 ):
+        super(CausalConv2d, self).__init__()
+        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
+
+        if pad_f_dim:
+            fpad = kernel_size[1] // 2 + dilation - 1
+        else:
+            fpad = 0
+
+        # ConstantPad2d pads the last two dims as (left, right, top, bottom);
+        # pad only the top (past) of the time axis to keep the conv causal.
+        self.lookback = kernel_size[0] - 1
+        if self.lookback > 0:
+            self.tpad = nn.ConstantPad2d(padding=(0, 0, self.lookback, 0), value=0.0)
+        else:
+            self.tpad = nn.Identity()
+
+        groups = math.gcd(in_channels, out_channels) if separable else 1
+        if groups == 1:
+            separable = False
+        if max(kernel_size) == 1:
+            separable = False
+
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            padding=(0, fpad),
+            stride=(1, fstride),  # stride over time is always 1
+            dilation=(1, dilation),  # dilation over time is always 1
+            groups=groups,
+            bias=bias,
+        )
+
+        if separable:
+            # pointwise conv to mix channels after the grouped conv
+            self.convp = nn.Conv2d(
+                out_channels,
+                out_channels,
+                kernel_size=1,
+                bias=False,
+            )
+        else:
+            self.convp = nn.Identity()
+
+        if norm_layer is not None:
+            norm_layer = norm_layer_dict[norm_layer]
+            self.norm = norm_layer(out_channels)
+        else:
+            self.norm = nn.Identity()
+
+        if activation_layer is not None:
+            activation_layer = activation_layer_dict[activation_layer]
+            self.activation = activation_layer()
+        else:
+            self.activation = nn.Identity()
+
+    def forward(self, inputs: torch.Tensor, cache: torch.Tensor = None):
+        """
+        :param inputs: shape: [b, c, t, f]
+        :param cache: shape: [b, c, lookback, f]; trailing frames of the previous chunk.
+        :return: output tensor and the new lookback cache for the next chunk.
+        """
+        x = inputs
+
+        if cache is None:
+            x = self.tpad(x)
+        else:
+            x = torch.concat(tensors=[cache, x], dim=2)
+
+        new_cache = None
+        if self.lookback > 0:
+            new_cache = x[:, :, -self.lookback:, :]
+
+        x = self.conv(x)
+
+        x = self.convp(x)
+        x = self.norm(x)
+        x = self.activation(x)
+
+        return x, new_cache
+
+
+class CausalEncoder(nn.Module):
+    def __init__(self,
+                 kernel_size: Tuple[int, int] = (3, 3),
+                 num_layers: int = 3,
+                 ):
+        super(CausalEncoder, self).__init__()
+        self.layers: List[CausalConv2d] = nn.ModuleList(modules=[
+            CausalConv2d(
+                in_channels=1,
+                out_channels=1,
+                kernel_size=kernel_size,
+                bias=False,
+                separable=True,
+                fstride=1,
+            )
+            if i == 0 else
+            CausalConv2d(
+                in_channels=1,
+                out_channels=1,
+                kernel_size=kernel_size,
+                bias=False,
+                separable=True,
+                fstride=1,
+            )
+            for i in range(num_layers)
+        ])
+
+    def forward(self, x: torch.Tensor, cache_list: List[torch.Tensor] = None):
+        # x shape: [b, t, f]
+        x = torch.unsqueeze(x, dim=1)
+        # x shape: [b, c, t, f]
+
+        new_cache_list = list()
+        for idx, layer in enumerate(self.layers):
+            cache = None if cache_list is None else cache_list[idx]
+            x, new_cache = layer.forward(x, cache=cache)
+            new_cache_list.append(new_cache)
+
+        # x shape: [b, c, t, f]
+        x = torch.squeeze(x, dim=1)
+        # x shape: [b, t, f]
+        return x, new_cache_list
 
 
 class SileroVadModel(nn.Module):
@@ -89,8 +182,8 @@ class SileroVadModel(nn.Module):
                  hop_size: int,
                  win_type: int,
 
-                 in_channels: int,
                  hidden_size: int,
+                 kernel_size: Tuple[int, int],
 
                  n_frame: int,
                  min_local_snr_db: float,
@@ -104,8 +197,8 @@ class SileroVadModel(nn.Module):
         self.hop_size = hop_size
         self.win_type = win_type
 
-        self.in_channels = in_channels
         self.hidden_size = hidden_size
+        self.kernel_size = kernel_size
 
         self.n_frame = n_frame
         self.min_local_snr_db = min_local_snr_db
@@ -132,12 +225,11 @@ class SileroVadModel(nn.Module):
 
         self.linear = nn.Linear(
            in_features=(self.nfft // 2 + 1),
-            out_features=self.in_channels,
+            out_features=self.hidden_size,
         )
 
-        self.encoder = Encoder(
-            in_channels=self.in_channels,
-            out_channels=self.hidden_size,
+        self.encoder = CausalEncoder(
+            kernel_size=self.kernel_size,  # was hard-coded (3, 3); use the configured value
         )
 
         self.lstm = nn.LSTM(
@@ -190,8 +282,8 @@ class SileroVadModel(nn.Module):
         x = self.linear.forward(x)
         # x shape: [b, t, f']
 
-        x = self.encoder.forward(x)
-        # x shape: [b, t, f]
+        x, _ = self.encoder.forward(x)
+        # x shape: [b, t, f']
 
         x, _ = self.lstm.forward(x)
 
@@ -246,8 +338,8 @@ class SileroVadPretrainedModel(SileroVadModel):
             win_size=config.win_size,
             hop_size=config.hop_size,
             win_type=config.win_type,
-            in_channels=config.in_channels,
             hidden_size=config.hidden_size,
+            kernel_size=config.kernel_size,
             n_frame=config.n_frame,
             min_local_snr_db=config.min_local_snr_db,
             max_local_snr_db=config.max_local_snr_db,
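Note: CausalConv2d pads only the past side of the time axis (lookback = kernel_size[0] - 1 frames), so each output frame depends on current and past input only, and the cache carries those lookback frames between chunks so chunked inference matches full-sequence inference. Also worth flagging: with in_channels = out_channels = 1 in CausalEncoder, groups = gcd(1, 1) = 1, so separable=True is a no-op and the layers run as ordinary convolutions. A minimal check of the streaming equivalence, assuming the import path shown in this diff:

import torch

from toolbox.torchaudio.models.vad.silero_vad.modeling_silero_vad import CausalConv2d

torch.manual_seed(0)

conv = CausalConv2d(in_channels=1, out_channels=1, kernel_size=(3, 3), bias=False)
conv.eval()  # freeze BatchNorm statistics so both passes are deterministic

x = torch.randn(1, 1, 10, 80)  # [b, c, t, f]

with torch.no_grad():
    # Full-sequence pass.
    y_full, _ = conv.forward(x)

    # Streaming pass: two chunks, carrying the lookback cache between them.
    y1, cache = conv.forward(x[:, :, :4, :])           # first chunk, implicit zero history
    y2, _ = conv.forward(x[:, :, 4:, :], cache=cache)  # second chunk reuses real history

print(torch.allclose(torch.cat([y1, y2], dim=2), y_full, atol=1e-6))  # expected: True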
toolbox/torchaudio/models/vad/silero_vad/yaml/config.yaml CHANGED
@@ -8,8 +8,10 @@ hop_size: 80
 win_type: hann
 
 # model
-in_channels: 64
-hidden_size: 128
+hidden_size: 80
+kernel_size:
+- 3
+- 3
 
 # lsnr
 n_frame: 3