HoneyTian committed
Commit 8051e41 · 1 Parent(s): 00e4381
examples/silero_vad_by_webrtcvad/yaml/config.yaml CHANGED
@@ -8,8 +8,10 @@ hop_size: 80
 win_type: hann
 
 # model
-in_channels: 64
-hidden_size: 128
+hidden_size: 80
+kernel_size:
+- 3
+- 3
 
 # lsnr
 n_frame: 3
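Note: the new `# model` block swaps the Conv1d width settings (`in_channels`, `hidden_size: 128`) for the causal-encoder settings (`hidden_size: 80` plus a 2-D `kernel_size`). A minimal sketch of consuming the file, assuming the repo's usual pattern of passing the parsed YAML as `SileroVadConfig` keyword arguments (the loader itself is not part of this diff):

import yaml

from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig

# Hypothetical loader: parse the YAML and forward the keys as config kwargs.
with open("examples/silero_vad_by_webrtcvad/yaml/config.yaml", "r", encoding="utf-8") as f:
    kwargs = yaml.safe_load(f)

config = SileroVadConfig(**kwargs)
# YAML yields kernel_size as a list [3, 3], not a tuple; CausalConv2d accepts
# any iterable and normalizes it with tuple(kernel_size), so both forms work.
print(config.hidden_size, config.kernel_size)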
main.py CHANGED
@@ -240,10 +240,10 @@ def main():
             with gr.Row():
                 vad_start_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label="start_ring_rate")
                 vad_end_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="end_ring_rate")
-                vad_min_silence_length = gr.Number(value=2, label="min_silence_length")
+                vad_min_silence_length = gr.Number(value=30, label="min_silence_length")
             with gr.Row():
                 vad_max_speech_length = gr.Number(value=100000, label="max_speech_length")
-                vad_min_speech_length = gr.Number(value=10, label="min_speech_length")
+                vad_min_speech_length = gr.Number(value=15, label="min_speech_length")
             vad_engine = gr.Dropdown(choices=vad_engine_choices, value=vad_engine_choices[0], label="engine")
             vad_button = gr.Button(variant="primary")
         with gr.Column(variant="panel", scale=5):
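Note: this hunk only retunes UI defaults (min_silence_length 2 → 30, min_speech_length 10 → 15; the units are not stated in the diff, though the new magnitudes suggest frame counts). As background, a hypothetical sketch of the kind of ring-buffer gating that `start_ring_rate` / `end_ring_rate` parameters conventionally control; `ring_vad` and its arguments are illustrative names, not repo code:

from collections import deque
from typing import Iterator, List, Tuple

def ring_vad(frame_probs: List[float],
             start_ring_rate: float = 0.5,
             end_ring_rate: float = 0.3,
             ring_size: int = 10,
             ) -> Iterator[Tuple[str, int]]:
    # Keep the last `ring_size` per-frame decisions; open a speech segment
    # when the active fraction reaches start_ring_rate, close it when the
    # fraction falls to end_ring_rate.
    ring = deque(maxlen=ring_size)
    triggered = False
    for t, p in enumerate(frame_probs):
        ring.append(1 if p > 0.5 else 0)
        rate = sum(ring) / len(ring)
        if not triggered and rate >= start_ring_rate:
            triggered = True
            yield "start", t
        elif triggered and rate <= end_ring_rate:
            triggered = False
            yield "end", t

print(list(ring_vad([0.1] * 5 + [0.9] * 20 + [0.1] * 20)))  # [('start', 9), ('end', 31)]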
toolbox/torchaudio/models/vad/silero_vad/configuration_silero_vad.py CHANGED
@@ -15,6 +15,7 @@ class SileroVadConfig(PretrainedConfig):
 
                  in_channels: int = 64,
                  hidden_size: int = 128,
+                 kernel_size: Tuple[int, int] = (3, 3),
 
                  n_frame: int = 3,
                  min_local_snr_db: float = -15,
@@ -49,6 +50,7 @@ class SileroVadConfig(PretrainedConfig):
         # encoder
         self.in_channels = in_channels
         self.hidden_size = hidden_size
+        self.kernel_size = kernel_size
 
         # lsnr
         self.n_frame = n_frame
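Note: the new field is annotated `Tuple[int, int]`, but the diff shows no matching `from typing import Tuple` hunk for this file, so that import is presumably already present there (or still needs adding). A minimal sketch of the resulting constructor surface, assuming the remaining fields keep their defaults:

from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig

config = SileroVadConfig(
    hidden_size=80,
    kernel_size=(3, 3),  # plumbed through SileroVadPretrainedModel into CausalEncoder
)
assert config.kernel_size == (3, 3)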
toolbox/torchaudio/models/vad/silero_vad/modeling_silero_vad.py CHANGED
@@ -8,8 +8,9 @@ https://github.com/snakers4/silero-vad
 
 https://github.com/snakers4/silero-vad/blob/master/src/silero_vad/data/silero_vad.jit
 """
+import math
 import os
-from typing import Optional, Union
+from typing import List, Optional, Union, Iterable, Tuple
 
 import torch
 import torch.nn as nn
@@ -24,61 +25,153 @@ from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
 MODEL_FILE = "model.pt"
 
 
-class EncoderBlock(nn.Module):
-    def __init__(self,
-                 in_channels: int = 64,
-                 out_channels: int = 128,
-                 ):
-        super(EncoderBlock, self).__init__()
-        self.conv1d = nn.Conv1d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=3,
-            padding="same",
-        )
-        self.activation = nn.ReLU()
-        self.norm = nn.BatchNorm1d(out_channels)
-
-    def forward(self, x: torch.Tensor):
-        # x shape: [b, t, f]
-        x = torch.transpose(x, dim0=1, dim1=2)
-        # x shape: [b, f, t]
-
-        x = self.conv1d.forward(x)
-        x = self.activation(x)
-        x = self.norm(x)
-
-        x = torch.transpose(x, dim0=1, dim1=2)
-        # x shape: [b, t, f]
-
-        return x
-
-
-class Encoder(nn.Module):
-    def __init__(self,
-                 in_channels: int = 64,
-                 out_channels: int = 128,
-                 num_layers: int = 3,
-                 ):
-        super(Encoder, self).__init__()
-
-        self.layers = nn.ModuleList(modules=[
-            EncoderBlock(
-                in_channels=in_channels,
-                out_channels=out_channels,
-            )
-            if i == 0 else
-            EncoderBlock(
-                in_channels=out_channels,
-                out_channels=out_channels,
-            )
-            for i in range(num_layers)
-        ])
-
-    def forward(self, x: torch.Tensor):
-        for layer in self.layers:
-            x = layer.forward(x)
-        return x
+norm_layer_dict = {
+    "batch_norm_2d": torch.nn.BatchNorm2d
+}
+
+
+activation_layer_dict = {
+    "relu": torch.nn.ReLU,
+    "identity": torch.nn.Identity,
+    "sigmoid": torch.nn.Sigmoid,
+}
+
+
+class CausalConv2d(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Iterable[int]],
+                 fstride: int = 1,
+                 dilation: int = 1,
+                 pad_f_dim: bool = True,
+                 bias: bool = True,
+                 separable: bool = False,
+                 norm_layer: str = "batch_norm_2d",
+                 activation_layer: str = "relu",
+                 ):
+        super(CausalConv2d, self).__init__()
+        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
+
+        if pad_f_dim:
+            fpad = kernel_size[1] // 2 + dilation - 1
+        else:
+            fpad = 0
+
+        # ConstantPad2d pads the last two dims as (left, right, top, bottom);
+        # pad only the top (past) of the time axis to keep the conv causal.
+        self.lookback = kernel_size[0] - 1
+        if self.lookback > 0:
+            self.tpad = nn.ConstantPad2d(padding=(0, 0, self.lookback, 0), value=0.0)
+        else:
+            self.tpad = nn.Identity()
+
+        groups = math.gcd(in_channels, out_channels) if separable else 1
+        if groups == 1:
+            separable = False
+        if max(kernel_size) == 1:
+            separable = False
+
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            padding=(0, fpad),
+            stride=(1, fstride),  # stride over time is always 1
+            dilation=(1, dilation),  # dilation over time is always 1
+            groups=groups,
+            bias=bias,
+        )
+
+        if separable:
+            # pointwise conv to mix channels after the grouped conv
+            self.convp = nn.Conv2d(
+                out_channels,
+                out_channels,
+                kernel_size=1,
+                bias=False,
+            )
+        else:
+            self.convp = nn.Identity()
+
+        if norm_layer is not None:
+            norm_layer = norm_layer_dict[norm_layer]
+            self.norm = norm_layer(out_channels)
+        else:
+            self.norm = nn.Identity()
+
+        if activation_layer is not None:
+            activation_layer = activation_layer_dict[activation_layer]
+            self.activation = activation_layer()
+        else:
+            self.activation = nn.Identity()
+
+    def forward(self, inputs: torch.Tensor, cache: torch.Tensor = None):
+        """
+        :param inputs: shape: [b, c, t, f]
+        :param cache: shape: [b, c, lookback, f]; trailing frames of the previous chunk.
+        :return: output tensor and the new lookback cache for the next chunk.
+        """
+        x = inputs
+
+        if cache is None:
+            x = self.tpad(x)
+        else:
+            x = torch.concat(tensors=[cache, x], dim=2)
+
+        new_cache = None
+        if self.lookback > 0:
+            new_cache = x[:, :, -self.lookback:, :]
+
+        x = self.conv(x)
+
+        x = self.convp(x)
+        x = self.norm(x)
+        x = self.activation(x)
+
+        return x, new_cache
+
+
+class CausalEncoder(nn.Module):
+    def __init__(self,
+                 kernel_size: Tuple[int, int] = (3, 3),
+                 num_layers: int = 3,
+                 ):
+        super(CausalEncoder, self).__init__()
+        self.layers: List[CausalConv2d] = nn.ModuleList(modules=[
+            CausalConv2d(
+                in_channels=1,
+                out_channels=1,
+                kernel_size=kernel_size,
+                bias=False,
+                separable=True,
+                fstride=1,
+            )
+            if i == 0 else
+            CausalConv2d(
+                in_channels=1,
+                out_channels=1,
+                kernel_size=kernel_size,
+                bias=False,
+                separable=True,
+                fstride=1,
+            )
+            for i in range(num_layers)
+        ])
+
+    def forward(self, x: torch.Tensor, cache_list: List[torch.Tensor] = None):
+        # x shape: [b, t, f]
+        x = torch.unsqueeze(x, dim=1)
+        # x shape: [b, c, t, f]
+
+        new_cache_list = list()
+        for idx, layer in enumerate(self.layers):
+            cache = None if cache_list is None else cache_list[idx]
+            x, new_cache = layer.forward(x, cache=cache)
+            new_cache_list.append(new_cache)
+
+        # x shape: [b, c, t, f]
+        x = torch.squeeze(x, dim=1)
+        # x shape: [b, t, f]
+        return x, new_cache_list
 
 
 class SileroVadModel(nn.Module):
@@ -89,8 +182,8 @@ class SileroVadModel(nn.Module):
                  hop_size: int,
                  win_type: int,
 
-                 in_channels: int,
                  hidden_size: int,
+                 kernel_size: Tuple[int, int],
 
                  n_frame: int,
                  min_local_snr_db: float,
@@ -104,8 +197,8 @@ class SileroVadModel(nn.Module):
         self.hop_size = hop_size
         self.win_type = win_type
 
-        self.in_channels = in_channels
         self.hidden_size = hidden_size
+        self.kernel_size = kernel_size
 
         self.n_frame = n_frame
         self.min_local_snr_db = min_local_snr_db
@@ -132,12 +225,11 @@ class SileroVadModel(nn.Module):
 
         self.linear = nn.Linear(
            in_features=(self.nfft // 2 + 1),
-            out_features=self.in_channels,
+            out_features=self.hidden_size,
         )
 
-        self.encoder = Encoder(
-            in_channels=self.in_channels,
-            out_channels=self.hidden_size,
+        self.encoder = CausalEncoder(
+            kernel_size=self.kernel_size,  # was hard-coded (3, 3); use the configured value
         )
 
         self.lstm = nn.LSTM(
@@ -190,8 +282,8 @@ class SileroVadModel(nn.Module):
         x = self.linear.forward(x)
         # x shape: [b, t, f']
 
-        x = self.encoder.forward(x)
-        # x shape: [b, t, f]
+        x, _ = self.encoder.forward(x)
+        # x shape: [b, t, f']
 
         x, _ = self.lstm.forward(x)
 
@@ -246,8 +338,8 @@ class SileroVadPretrainedModel(SileroVadModel):
             win_size=config.win_size,
             hop_size=config.hop_size,
             win_type=config.win_type,
-            in_channels=config.in_channels,
             hidden_size=config.hidden_size,
+            kernel_size=config.kernel_size,
             n_frame=config.n_frame,
             min_local_snr_db=config.min_local_snr_db,
             max_local_snr_db=config.max_local_snr_db,
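Note: CausalConv2d pads only the past side of the time axis (lookback = kernel_size[0] - 1 frames), so each output frame depends on current and past input only, and the cache carries those lookback frames between chunks so chunked inference matches full-sequence inference. Also worth flagging: with in_channels = out_channels = 1 in CausalEncoder, groups = gcd(1, 1) = 1, so separable=True is a no-op and the layers run as ordinary convolutions. A minimal check of the streaming equivalence, assuming the import path shown in this diff:

import torch

from toolbox.torchaudio.models.vad.silero_vad.modeling_silero_vad import CausalConv2d

torch.manual_seed(0)

conv = CausalConv2d(in_channels=1, out_channels=1, kernel_size=(3, 3), bias=False)
conv.eval()  # freeze BatchNorm statistics so both passes are deterministic

x = torch.randn(1, 1, 10, 80)  # [b, c, t, f]

with torch.no_grad():
    # Full-sequence pass.
    y_full, _ = conv.forward(x)

    # Streaming pass: two chunks, carrying the lookback cache between them.
    y1, cache = conv.forward(x[:, :, :4, :])           # first chunk, implicit zero history
    y2, _ = conv.forward(x[:, :, 4:, :], cache=cache)  # second chunk reuses real history

print(torch.allclose(torch.cat([y1, y2], dim=2), y_full, atol=1e-6))  # expected: True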
toolbox/torchaudio/models/vad/silero_vad/yaml/config.yaml CHANGED
@@ -8,8 +8,10 @@ hop_size: 80
 win_type: hann
 
 # model
-in_channels: 64
-hidden_size: 128
+hidden_size: 80
+kernel_size:
+- 3
+- 3
 
 # lsnr
 n_frame: 3