HoneyTian committed
Commit: 83fc52b · Parent(s): ff7995b
examples/silero_vad_by_webrtcvad/yaml/config.yaml CHANGED
@@ -8,11 +8,11 @@ hop_size: 80
 win_type: hann
 
 # model
-conv_channels: 32
-hidden_size: 80
-kernel_size:
-- 3
-- 3
+encoder_in_channels: 64
+encoder_kernel_size: 3
+encoder_num_layers: 3
+
+decoder_hidden_size: 64
 
 # lsnr
 n_frame: 3
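The `model` block now describes a Conv1d encoder (`encoder_*`) and an LSTM decoder (`decoder_*`) instead of the old Conv2d keys. Note that `decoder_num_layers` is not set here, so `SileroVadConfig` falls back to its default of 2. As a quick illustration of the frame-context arithmetic these keys imply (the helper below is an illustrative sketch, not part of the repo):

```python
# Illustrative helper (not in the repo): context bookkeeping for a stack of
# "valid"-padded Conv1d layers with the values configured above.
def context_frames(encoder_kernel_size: int = 3, encoder_num_layers: int = 3) -> dict:
    p = (encoder_kernel_size - 1) // 2  # per-layer half-context
    return {
        # frames each streaming cache carries between chunks (per layer)
        "per_layer_cache": 2 * p,
        # frames the offline model zero-pads on each side of the time axis
        "total_pad_each_side": encoder_num_layers * (encoder_kernel_size - 1) // 2,
    }

print(context_frames())  # {'per_layer_cache': 2, 'total_pad_each_side': 3}
```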
toolbox/torchaudio/models/vad/silero_vad/configuration_silero_vad.py CHANGED
@@ -13,9 +13,12 @@ class SileroVadConfig(PretrainedConfig):
                  hop_size: int = 80,
                  win_type: str = "hann",
 
-                 conv_channels: int = 32,
-                 hidden_size: int = 80,
-                 kernel_size: Tuple[int, int] = (3, 3),
+                 encoder_in_channels: int = 64,
+                 encoder_kernel_size: int = 3,
+                 encoder_num_layers: int = 3,
+
+                 decoder_hidden_size: int = 64,
+                 decoder_num_layers: int = 2,
 
                  n_frame: int = 3,
                  min_local_snr_db: float = -15,
@@ -48,9 +51,12 @@ class SileroVadConfig(PretrainedConfig):
         self.win_type = win_type
 
         # encoder
-        self.conv_channels = conv_channels
-        self.hidden_size = hidden_size
-        self.kernel_size = kernel_size
+        self.encoder_in_channels = encoder_in_channels
+        self.encoder_kernel_size = encoder_kernel_size
+        self.encoder_num_layers = encoder_num_layers
+
+        self.decoder_hidden_size = decoder_hidden_size
+        self.decoder_num_layers = decoder_num_layers
 
         # lsnr
         self.n_frame = n_frame
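A minimal sketch of the renamed fields in use, with the defaults from this diff (import path as in this repo):

```python
from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig

# The encoder is now described by (in_channels, kernel_size, num_layers)
# and the LSTM decoder by (hidden_size, num_layers).
config = SileroVadConfig()
assert config.encoder_in_channels == 64
assert config.encoder_kernel_size == 3
assert config.encoder_num_layers == 3
assert config.decoder_hidden_size == 64
assert config.decoder_num_layers == 2
```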
toolbox/torchaudio/models/vad/silero_vad/inference_silero_vad_onnx.py ADDED
@@ -0,0 +1,185 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import logging
+from pathlib import Path
+import shutil
+import tempfile
+from typing import List
+import zipfile
+
+from scipy.io import wavfile
+import numpy as np
+import torch
+import onnxruntime as ort
+
+torch.set_num_threads(1)
+
+from project_settings import project_path
+from toolbox.torchaudio.models.vad.silero_vad.configuration_silero_vad import SileroVadConfig
+from toolbox.torchaudio.utils.visualization import process_speech_probs, make_visualization
+
+
+logger = logging.getLogger("toolbox")
+
+
+class InferenceSileroVadOnnx(object):
+    def __init__(self, pretrained_model_path_or_zip_file: str, device: str = "cpu"):
+        self.pretrained_model_path_or_zip_file = pretrained_model_path_or_zip_file
+        self.device = torch.device(device)
+
+        logger.info(f"loading model; model_file: {self.pretrained_model_path_or_zip_file}")
+        config, ort_session = self.load_models(self.pretrained_model_path_or_zip_file)
+        logger.info(f"model loading completed; model_file: {self.pretrained_model_path_or_zip_file}")
+
+        self.config = config
+        self.ort_session = ort_session
+
+    def load_models(self, model_path: str):
+        model_path = Path(model_path)
+        remove_after_load = False
+        if model_path.name.endswith(".zip"):
+            with zipfile.ZipFile(model_path.as_posix(), "r") as f_zip:
+                out_root = Path(tempfile.gettempdir()) / "cc_vad"
+                out_root.mkdir(parents=True, exist_ok=True)
+                f_zip.extractall(path=out_root)
+            model_path = out_root / model_path.stem
+            remove_after_load = True
+
+        config = SileroVadConfig.from_pretrained(
+            pretrained_model_name_or_path=model_path.as_posix(),
+        )
+        ort_session = ort.InferenceSession(
+            path_or_bytes=(model_path / "model.onnx").as_posix()
+        )
+
+        # only remove the temporary extraction directory, never a user-supplied one.
+        if remove_after_load:
+            shutil.rmtree(model_path)
+        return config, ort_session
+
+    def infer(self, signal: np.ndarray) -> dict:
+        # signal shape: [num_samples,], value between -1 and 1.
+
+        inputs = torch.tensor(signal, dtype=torch.float32)
+        inputs = torch.unsqueeze(inputs, dim=0)
+        inputs = torch.unsqueeze(inputs, dim=0)
+        # inputs shape: [1, 1, num_samples]
+
+        b = 1
+
+        # param
+        encoder_num_layers = self.config.encoder_num_layers
+        p = (self.config.encoder_kernel_size - 1) // 2
+        encoder_in_channels = self.config.encoder_in_channels
+
+        decoder_num_layers = self.config.decoder_num_layers
+        decoder_hidden_size = self.config.decoder_hidden_size
+
+        # cache 1: encoder conv history; zeros mean a fresh stream.
+        encoder_cache_list = [
+            torch.zeros(size=(b, 2 * p, encoder_in_channels), dtype=torch.float32)
+        ] * encoder_num_layers
+        encoder_cache_list = torch.stack(encoder_cache_list, dim=0)
+
+        # cache 2: lstm (h, c) state.
+        lstm_hidden_state = [
+            torch.zeros(size=(decoder_num_layers, b, decoder_hidden_size), dtype=torch.float32)
+        ] * 2
+        lstm_hidden_state = torch.stack(lstm_hidden_state, dim=0)
+
+        input_feed = {
+            "inputs": inputs.numpy(),
+            "encoder_cache_list": encoder_cache_list.numpy(),
+            "lstm_hidden_state": lstm_hidden_state.numpy(),
+        }
+        output_names = [
+            "logits", "probs", "lsnr", "new_encoder_cache_list", "new_lstm_hidden_state"
+        ]
+        logits, probs, lsnr, new_encoder_cache_list, new_lstm_hidden_state = self.ort_session.run(output_names, input_feed)
+        # probs shape: [b, t, 1]
+        probs = np.squeeze(probs, axis=-1)
+        # probs shape: [b, t]
+        probs = probs[0]
+
+        # lsnr shape: [b, t, 1]
+        lsnr = np.squeeze(lsnr, axis=-1)
+        # lsnr shape: [b, t]
+        lsnr = lsnr[0]
+
+        result = {
+            "probs": probs,
+            "lsnr": lsnr,
+        }
+        return result
+
+    def post_process(self, probs: List[float]):
+        return
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--wav_file",
+        # default=(project_path / "data/examples/ai_agent/chinese-4.wav").as_posix(),
+        # default=(project_path / "data/examples/ai_agent/chinese-5.wav").as_posix(),
+        # default=(project_path / "data/examples/hado/b556437e-c68b-4f6d-9eed-2977c29db887.wav").as_posix(),
+        # default=(project_path / "data/examples/hado/eae93a33-8ee0-4d86-8f85-cac5116ae6ef.wav").as_posix(),
+        # default=(project_path / "data/examples/speech/active_media_r_0ba69730-66a4-4ecd-8929-ef58f18f4612_2.wav").as_posix(),
+        # default=(project_path / "data/examples/speech/active_media_r_2a2f472b-a0b8-4fd5-b1c4-1aedc5d2ce57_0.wav").as_posix(),
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\speech\en-SG\2025-05-19\active_media_w_8b6e28e2-a238-4c8c-b2e3-426b1fca149b_6.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\speech\en-SG\2025-05-19\active_media_r_0a56f035-40f6-4530-b852-613f057d718d_6.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\speech\en-SG\2025-05-19\active_media_r_0ae70b76-3651-4a71-bc0c-9e1429e4c854_5.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\speech\en-SG\2025-05-19\active_media_r_0d483249-57f8-4d45-b4c6-bda82d6816ae_2.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\speech\en-SG\2025-05-19\active_media_r_0d952885-5bc2-4633-81b6-e0e809e113f1_2.wav",
+        default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\speech\en-SG\2025-05-19\active_media_r_0ddac777-d986-4a5c-9c7c-ff64be0a463d_11.wav",
+
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_0b8a8e80-52af-423b-8877-03a78b1e6e43_0.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_0ebffb68-6490-4a8b-8eb6-eb82443d7d75_0.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_0f6ec933-90df-447b-aca4-6ddc149452ab_0.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_1aac396f-1661-4f26-ab49-1a4879684567_0.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_1aac396f-1661-4f26-ab49-1a4879684567_1.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_1aff518b-4749-42fc-adfe-64046f9baeb6_0.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_1b16f2a3-a8c9-4739-9a76-59faf1c64d79_0.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_1b16f2a3-a8c9-4739-9a76-59faf1c64d79_1.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_1bb1f22e-9c3a-4aea-b53f-71cc6547a6ee_0.wav",
+        # default=r"D:\Users\tianx\HuggingDatasets\nx_noise\data\noise\en-SG\2025-05-19\active_media_r_1dab161b-2a76-4491-abd1-60dba6172f8d_2.wav",
+        type=str,
+    )
+    args = parser.parse_args()
+    return args
+
+
+SAMPLE_RATE = 8000
+
+
+def main():
+    args = get_args()
+
+    sample_rate, signal = wavfile.read(args.wav_file)
+    if SAMPLE_RATE != sample_rate:
+        raise AssertionError
+    signal = signal / (1 << 15)
+
+    infer = InferenceSileroVadOnnx(
+        # pretrained_model_path_or_zip_file=(project_path / "trained_models/fsmn-vad-by-webrtcvad-nx-dns3.zip").as_posix(),
+        pretrained_model_path_or_zip_file=(project_path / "trained_models/fsmn-vad-by-webrtcvad-nx2-dns3.zip").as_posix(),
+    )
+    frame_step = infer.config.hop_size
+
+    result = infer.infer(signal)
+    speech_probs = result["probs"].tolist()
+
+    speech_probs = process_speech_probs(
+        signal=signal,
+        speech_probs=speech_probs,
+        frame_step=frame_step,
+    )
+
+    # plot
+    make_visualization(signal, speech_probs, SAMPLE_RATE)
+    return
+
+
+if __name__ == "__main__":
+    main()
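`infer` above runs the whole signal in one call and zero-initializes both caches each time. Because the ONNX graph also returns `new_encoder_cache_list` and `new_lstm_hidden_state`, chunked streaming only needs to carry those across calls. A minimal sketch (the chunk size and helper name are illustrative; note the in-graph STFT keeps no sample-level history, so frames at chunk boundaries are only approximate):

```python
import numpy as np

def stream_probs(infer: "InferenceSileroVadOnnx", signal: np.ndarray, chunk_size: int = 800) -> np.ndarray:
    # zero caches == fresh stream; shapes mirror InferenceSileroVadOnnx.infer.
    cfg = infer.config
    p = (cfg.encoder_kernel_size - 1) // 2
    encoder_cache = np.zeros(
        (cfg.encoder_num_layers, 1, 2 * p, cfg.encoder_in_channels), dtype=np.float32)
    lstm_state = np.zeros(
        (2, cfg.decoder_num_layers, 1, cfg.decoder_hidden_size), dtype=np.float32)

    chunk_probs_list = []
    for begin in range(0, len(signal), chunk_size):
        chunk = signal[begin:begin + chunk_size].astype(np.float32)
        # feed the previous call's caches back in; request only what we need.
        probs, encoder_cache, lstm_state = infer.ort_session.run(
            ["probs", "new_encoder_cache_list", "new_lstm_hidden_state"],
            {
                "inputs": chunk[None, None, :],
                "encoder_cache_list": encoder_cache,
                "lstm_hidden_state": lstm_state,
            },
        )
        chunk_probs_list.append(probs[0, :, 0])
    return np.concatenate(chunk_probs_list)
```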
toolbox/torchaudio/models/vad/silero_vad/modeling_silero_vad.py CHANGED
@@ -2,15 +2,12 @@
 # -*- coding: utf-8 -*-
 """
 https://github.com/snakers4/silero-vad/wiki/Quality-Metrics
-
 https://pytorch.org/hub/snakers4_silero-vad_vad/
 https://github.com/snakers4/silero-vad
-
 https://github.com/snakers4/silero-vad/blob/master/src/silero_vad/data/silero_vad.jit
 """
-import math
 import os
-from typing import List, Optional, Union, Iterable, Tuple
+from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -25,156 +22,96 @@ from toolbox.torchaudio.modules.local_snr_target import LocalSnrTarget
 MODEL_FILE = "model.pt"
 
 
-norm_layer_dict = {
-    "batch_norm_2d": torch.nn.BatchNorm2d
-}
-
-
-activation_layer_dict = {
-    "relu": torch.nn.ReLU,
-    "identity": torch.nn.Identity,
-    "sigmoid": torch.nn.Sigmoid,
-}
-
-
-class CausalConv2d(nn.Module):
+class EncoderBlock(nn.Module):
     def __init__(self,
-                 in_channels: int,
-                 out_channels: int,
-                 kernel_size: Union[int, Iterable[int]],
-                 fstride: int = 1,
-                 dilation: int = 1,
-                 pad_f_dim: bool = True,
-                 bias: bool = True,
-                 separable: bool = False,
-                 norm_layer: str = "batch_norm_2d",
-                 activation_layer: str = "relu",
+                 in_channels: int = 64,
+                 out_channels: int = 128,
+                 kernel_size: int = 3,
                  ):
-        super(CausalConv2d, self).__init__()
-        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size)
-
-        if pad_f_dim:
-            fpad = kernel_size[1] // 2 + dilation - 1
-        else:
-            fpad = 0
-
-        # for last 2 dim, pad (left, right, top, bottom).
-        self.lookback = kernel_size[0] - 1
-        if self.lookback > 0:
-            self.tpad = nn.ConstantPad2d(padding=(0, 0, self.lookback, 0), value=0.0)
-        else:
-            self.tpad = nn.Identity()
-
-        groups = math.gcd(in_channels, out_channels) if separable else 1
-        if groups == 1:
-            separable = False
-        if max(kernel_size) == 1:
-            separable = False
-
-        self.conv = nn.Conv2d(
-            in_channels,
-            out_channels,
+        super(EncoderBlock, self).__init__()
+        self.conv1d = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
             kernel_size=kernel_size,
-            padding=(0, fpad),
-            stride=(1, fstride),  # stride over time is always 1
-            dilation=(1, dilation),  # dilation over time is always 1
-            groups=groups,
-            bias=bias,
+            padding="valid",
         )
+        self.activation = nn.ReLU()
+        self.norm = nn.BatchNorm1d(out_channels)
 
-        if separable:
-            self.convp = nn.Conv2d(
-                out_channels,
-                out_channels,
-                kernel_size=1,
-                bias=False,
-            )
-        else:
-            self.convp = nn.Identity()
-
-        if norm_layer is not None:
-            norm_layer = norm_layer_dict[norm_layer]
-            self.norm = norm_layer(out_channels)
-        else:
-            self.norm = nn.Identity()
-
-        if activation_layer is not None:
-            activation_layer = activation_layer_dict[activation_layer]
-            self.activation = activation_layer()
-        else:
-            self.activation = nn.Identity()
-
-    def forward(self, inputs: torch.Tensor, cache: torch.Tensor = None):
-        """
-        :param inputs: shape: [b, c, t, f]
-        :param cache: shape: [b, c, lookback, f];
-        :return:
-        """
-        x = inputs
-
-        if cache is None:
-            x = self.tpad(x)
-        else:
-            x = torch.concat(tensors=[cache, x], dim=2)
-
-        new_cache = None
-        if self.lookback > 0:
-            new_cache = x[:, :, -self.lookback:, :]
-
-        x = self.conv(x)
+    def forward(self, x: torch.Tensor):
+        # x shape: [b, t, f]
+        x = torch.transpose(x, dim0=1, dim1=2)
+        # x shape: [b, f, t]
 
-        x = self.convp(x)
-        x = self.norm(x)
+        x = self.conv1d.forward(x)
         x = self.activation(x)
+        x = self.norm(x)
+
+        x = torch.transpose(x, dim0=1, dim1=2)
+        # x shape: [b, t, f]
 
-        return x, new_cache
+        return x
 
 
-class CausalEncoder(nn.Module):
+class Encoder(nn.Module):
     def __init__(self,
-                 conv_channels: int,
-                 kernel_size: Tuple[int, int] = (3, 3),
+                 in_channels: int = 64,
+                 hidden_channels: int = 128,
+                 out_channels: int = 64,
+                 kernel_size: int = 3,
                  num_layers: int = 3,
                  ):
-        super(CausalEncoder, self).__init__()
-        self.layers: List[CausalConv2d] = nn.ModuleList(modules=[
-            CausalConv2d(
-                in_channels=1,
-                out_channels=conv_channels,
-                kernel_size=kernel_size,
-                bias=False,
-                separable=True,
-                fstride=1,
-            )
-            if i == 0 else
-            CausalConv2d(
-                in_channels=conv_channels,
-                out_channels=conv_channels,
-                kernel_size=kernel_size,
-                bias=False,
-                separable=True,
-                fstride=1,
-            )
-            for i in range(num_layers)
-        ])
-
-    def forward(self, x: torch.Tensor, cache_list: List[torch.Tensor] = None):
+        super(Encoder, self).__init__()
+
+        self.layers = nn.ModuleList(modules=[])
+        for i in range(num_layers):
+            if i == 0:
+                encoder_block = EncoderBlock(
+                    in_channels=in_channels,
+                    out_channels=hidden_channels,
+                    kernel_size=kernel_size,
+                )
+            elif i == (num_layers - 1):
+                encoder_block = EncoderBlock(
+                    in_channels=hidden_channels,
+                    out_channels=out_channels,
+                    kernel_size=kernel_size,
+                )
+            else:
+                encoder_block = EncoderBlock(
+                    in_channels=hidden_channels,
+                    out_channels=hidden_channels,
+                    kernel_size=kernel_size,
+                )
+            self.layers.append(encoder_block)
+
+    def forward(self, x: torch.Tensor):
+        # x shape: [b, t, f]
+        for layer in self.layers:
+            x = layer.forward(x)
+        return x
+
+
+class EncoderExport(nn.Module):
+    def __init__(self, model: Encoder):
+        super(EncoderExport, self).__init__()
+        self.layers = model.layers
+
+    def forward(self, x: torch.Tensor, cache_list: torch.Tensor):
         # x shape: [b, t, f]
-        x = torch.unsqueeze(x, dim=1)
-        # x shape: [b, 1, t, f]
+        # cache_list shape: [num_layers, b, 2p, f]
 
         new_cache_list = list()
        for idx, layer in enumerate(self.layers):
-            cache = None if cache_list is None else cache_list[idx]
-            x, new_cache = layer.forward(x, cache=cache)
+            cache = cache_list[idx]
+            x_pad = torch.concat(tensors=[cache, x], dim=1)
+            x = layer.forward(x_pad)
+
+            _, twop, _ = cache.shape
+            new_cache = x_pad[:, -twop:, :]
             new_cache_list.append(new_cache)
 
-        # x shape: [b, c, t, f]
-        x = x.permute(0, 2, 1, 3)
-        # x shape: [b, t, c, f]
-        b, t, c, f = x.shape
-        x = torch.reshape(x, shape=(b, t, c*f))
-        # x shape: [b, t, c*f]
+        new_cache_list = torch.stack(tensors=new_cache_list, dim=0)
        return x, new_cache_list
 
 
@@ -185,15 +122,14 @@ class SileroVadModel(nn.Module):
                  win_size: int,
                  hop_size: int,
                  win_type: int,
-
-                 conv_channels: int,
-                 hidden_size: int,
-                 kernel_size: Tuple[int, int],
-
+                 encoder_in_channels: int,
+                 encoder_kernel_size: int,
+                 encoder_num_layers: int,
+                 decoder_hidden_size: int,
+                 decoder_num_layers: int,
                  n_frame: int,
                  min_local_snr_db: float,
                  max_local_snr_db: float,
-
                  ):
         super(SileroVadModel, self).__init__()
         self.sample_rate = sample_rate
@@ -202,9 +138,12 @@ class SileroVadModel(nn.Module):
         self.hop_size = hop_size
         self.win_type = win_type
 
-        self.conv_channels = conv_channels
-        self.hidden_size = hidden_size
-        self.kernel_size = kernel_size
+        self.encoder_in_channels = encoder_in_channels
+        self.encoder_kernel_size = encoder_kernel_size
+        self.encoder_num_layers = encoder_num_layers
+
+        self.decoder_hidden_size = decoder_hidden_size
+        self.decoder_num_layers = decoder_num_layers
 
         self.n_frame = n_frame
         self.min_local_snr_db = min_local_snr_db
@@ -231,24 +170,33 @@ class SileroVadModel(nn.Module):
 
         self.linear = nn.Linear(
             in_features=(self.nfft // 2 + 1),
-            out_features=self.hidden_size,
+            out_features=self.encoder_in_channels,
         )
 
-        self.encoder = CausalEncoder(
-            conv_channels=conv_channels,
-            kernel_size=(3, 3),
+        # for last 2 dim, pad (left, right, top, bottom).
+        # (b, t, f) -> (b, t+2p, f)
+        self.p = self.encoder_num_layers * (self.encoder_kernel_size - 1) // 2
+        self.tpad = nn.ConstantPad2d(padding=(0, 0, self.p, self.p), value=0.0)
+
+        self.encoder = Encoder(
+            in_channels=self.encoder_in_channels,
+            hidden_channels=self.decoder_hidden_size,
+            out_channels=self.decoder_hidden_size,
+            kernel_size=self.encoder_kernel_size,
+            num_layers=self.encoder_num_layers,
         )
 
         self.lstm = nn.LSTM(
-            input_size=self.conv_channels * self.hidden_size,
-            hidden_size=self.hidden_size,
+            input_size=self.decoder_hidden_size,
+            hidden_size=self.decoder_hidden_size,
+            num_layers=self.decoder_num_layers,
             bidirectional=False,
             batch_first=True
         )
 
         # vad
         self.vad_fc = nn.Sequential(
-            nn.Linear(self.hidden_size, 32),
+            nn.Linear(self.decoder_hidden_size, 32),
             nn.ReLU(),
            nn.Linear(32, 1),
         )
@@ -256,7 +204,7 @@ class SileroVadModel(nn.Module):
 
         # lsnr
         self.lsnr_fc = nn.Sequential(
-            nn.Linear(self.hidden_size, 1),
+            nn.Linear(self.decoder_hidden_size, 1),
             nn.Sigmoid()
         )
         self.lsnr_scale = self.max_local_snr_db - self.min_local_snr_db
@@ -289,10 +237,14 @@ class SileroVadModel(nn.Module):
         x = self.linear.forward(x)
         # x shape: [b, t, f']
 
-        x, _ = self.encoder.forward(x)
+        # pad
+        x = self.tpad.forward(x)
+        # x shape: [b, t+2p, f']
+
+        x = self.encoder.forward(x)
         # x shape: [b, t, f']
 
-        x, _ = self.lstm.forward(x)
+        x, (h, c) = self.lstm.forward(x)
 
         logits = self.vad_fc.forward(x)
         # logits shape: [b, t, 1]
@@ -345,9 +297,11 @@ class SileroVadPretrainedModel(SileroVadModel):
             win_size=config.win_size,
             hop_size=config.hop_size,
             win_type=config.win_type,
-            conv_channels=config.conv_channels,
-            hidden_size=config.hidden_size,
-            kernel_size=config.kernel_size,
+            encoder_in_channels=config.encoder_in_channels,
+            encoder_kernel_size=config.encoder_kernel_size,
+            encoder_num_layers=config.encoder_num_layers,
+            decoder_hidden_size=config.decoder_hidden_size,
+            decoder_num_layers=config.decoder_num_layers,
             n_frame=config.n_frame,
             min_local_snr_db=config.min_local_snr_db,
             max_local_snr_db=config.max_local_snr_db,
@@ -392,7 +346,61 @@ class SileroVadPretrainedModel(SileroVadModel):
         return save_directory
 
 
-def main():
+class SileroVadModelExport(nn.Module):
+    def __init__(self, model: SileroVadModel):
+        super(SileroVadModelExport, self).__init__()
+        self.stft = model.stft
+        self.linear = model.linear
+        self.encoder = EncoderExport(model.encoder)
+        self.lstm = model.lstm
+        self.vad_fc = model.vad_fc
+        self.sigmoid = model.sigmoid
+
+        self.lsnr_fc = model.lsnr_fc
+        self.lsnr_scale = model.lsnr_scale
+        self.lsnr_offset = model.lsnr_offset
+
+    def forward(self,
+                signal: torch.Tensor,
+                encoder_cache_list: torch.Tensor,
+                lstm_hidden_state: torch.Tensor,
+                ):
+        # encoder_cache_list shape: [num_layers, b, 2p, f]
+        # lstm_hidden_state shape: [2, num_layers, b, h]
+
+        # signal shape [b, 1, num_samples]
+        mags = self.stft.forward(signal)
+        # mags shape: [b, f, t]
+
+        x = torch.transpose(mags, dim0=1, dim1=2)
+        # x shape: [b, t, f]
+
+        x = self.linear.forward(x)
+        # x shape: [b, t, f']
+
+        # no tpad in the streaming path; the encoder caches supply the context.
+        # x = self.tpad.forward(x)
+        # x shape: [b, t, f']
+
+        x, new_encoder_cache_list = self.encoder.forward(x, cache_list=encoder_cache_list)
+        # x shape: [b, t, f']
+
+        x, new_lstm_hidden_state = self.lstm.forward(x, (lstm_hidden_state[0], lstm_hidden_state[1]))
+        new_lstm_hidden_state = torch.stack(tensors=new_lstm_hidden_state, dim=0)
+        # new_lstm_hidden_state shape: [2, num_layers, b, h]
+
+        logits = self.vad_fc.forward(x)
+        # logits shape: [b, t, 1]
+        probs = self.sigmoid.forward(logits)
+        # probs shape: [b, t, 1]
+
+        lsnr = self.lsnr_fc.forward(x) * self.lsnr_scale + self.lsnr_offset
+        # lsnr shape: [b, t, 1]
+
+        return logits, probs, lsnr, new_encoder_cache_list, new_lstm_hidden_state
+
+
+def main1():
     config = SileroVadConfig()
     model = SileroVadPretrainedModel(config=config)
 
@@ -406,5 +414,70 @@ def main():
     return
 
 
+def main2():
+    import onnx
+    import onnxruntime as ort
+
+    config = SileroVadConfig()
+    model = SileroVadPretrainedModel(config=config)
+    model_export = SileroVadModelExport(model)
+
+    encoder_num_layers = config.encoder_num_layers
+    p = (config.encoder_kernel_size - 1) // 2
+    encoder_in_channels = config.encoder_in_channels
+
+    decoder_num_layers = config.decoder_num_layers
+    decoder_hidden_size = config.decoder_hidden_size
+
+    b = 1
+    inputs = torch.randn(size=(b, 1, 16000), dtype=torch.float32)
+
+    encoder_cache_list = [
+        torch.zeros(size=(b, 2*p, encoder_in_channels), dtype=torch.float32)
+    ] * encoder_num_layers
+    encoder_cache_list = torch.stack(encoder_cache_list, dim=0)
+
+    lstm_hidden_state = [
+        torch.zeros(size=(decoder_num_layers, b, decoder_hidden_size), dtype=torch.float32)
+    ] * 2
+    lstm_hidden_state = torch.stack(lstm_hidden_state, dim=0)
+
+    logits, probs, lsnr, new_encoder_cache_list, new_lstm_hidden_state = model_export.forward(inputs, encoder_cache_list, lstm_hidden_state)
+    print(f"logits.shape: {logits.shape}")
+    print(f"new_encoder_cache_list.shape: {new_encoder_cache_list.shape}")
+    print(f"new_lstm_hidden_state.shape: {new_lstm_hidden_state.shape}")
+
+    torch.onnx.export(model_export,
+                      args=(inputs, encoder_cache_list, lstm_hidden_state),
+                      f="silero_vad.onnx",
+                      input_names=["inputs", "encoder_cache_list", "lstm_hidden_state"],
+                      output_names=["logits", "probs", "lsnr", "new_encoder_cache_list", "new_lstm_hidden_state"],
+                      dynamic_axes={
+                          "inputs": {0: "batch_size", 2: "num_samples"},
+                          "encoder_cache_list": {1: "batch_size"},
+                          "lstm_hidden_state": {2: "batch_size"},
+                          "logits": {0: "batch_size"},
+                          "probs": {0: "batch_size"},
+                          "lsnr": {0: "batch_size"},
+                          "new_encoder_cache_list": {1: "batch_size"},
+                          "new_lstm_hidden_state": {2: "batch_size"},
+                      })
+
+    ort_session = ort.InferenceSession("silero_vad.onnx")
+    input_feed = {
+        "inputs": inputs.numpy(),
+        "encoder_cache_list": encoder_cache_list.numpy(),
+        "lstm_hidden_state": lstm_hidden_state.numpy(),
+    }
+    output_names = [
+        "logits", "probs", "lsnr", "new_encoder_cache_list", "new_lstm_hidden_state"
+    ]
+    logits, probs, lsnr, new_encoder_cache_list, new_lstm_hidden_state = ort_session.run(output_names, input_feed)
+    print(f"probs.shape: {probs.shape}")
+    print(f"new_encoder_cache_list.shape: {new_encoder_cache_list.shape}")
+    return
+
+
 if __name__ == "__main__":
-    main()
+    main2()
+
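One way to sanity-check `EncoderExport` against the padded offline `Encoder` (a sketch, not in the commit): with BatchNorm in eval mode both paths apply the same framewise computation, but only the offline path sees `p_total` frames of future context, so the streaming output should match it shifted by `p_total` frames once the receptive field clears the zero padding and zero caches:

```python
import torch
import torch.nn.functional as F

from toolbox.torchaudio.models.vad.silero_vad.modeling_silero_vad import Encoder, EncoderExport

torch.manual_seed(0)

k, n_layers, f, t = 3, 3, 64, 50
p = (k - 1) // 2                    # per-layer half-context
p_total = n_layers * (k - 1) // 2   # SileroVadModel.p

# hidden == in channels here so one cache width fits every layer,
# mirroring how main2() builds the caches.
encoder = Encoder(in_channels=f, hidden_channels=f, out_channels=f,
                  kernel_size=k, num_layers=n_layers).eval()
encoder_export = EncoderExport(encoder).eval()

x = torch.randn(1, t, f)
with torch.no_grad():
    # offline path: symmetric zero padding over time, as SileroVadModel.forward does.
    y_offline = encoder.forward(F.pad(x, pad=(0, 0, p_total, p_total)))
    # streaming path: zero caches stand in for the left half of that padding.
    zero_cache = torch.zeros(n_layers, 1, 2 * p, f)
    y_stream, _ = encoder_export.forward(x, cache_list=zero_cache)

# frames whose receptive field contains no padding/caches must agree,
# with the streaming output lagging by the p_total frames of lookahead.
assert torch.allclose(y_stream[:, 2 * p_total:, :],
                      y_offline[:, p_total:t - p_total, :], atol=1e-5)
```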
toolbox/torchaudio/models/vad/silero_vad/yaml/config.yaml CHANGED
@@ -8,11 +8,11 @@ hop_size: 80
 win_type: hann
 
 # model
-conv_channels: 32
-hidden_size: 80
-kernel_size:
-- 3
-- 3
+encoder_in_channels: 64
+encoder_kernel_size: 3
+encoder_num_layers: 3
+
+decoder_hidden_size: 64
 
 # lsnr
 n_frame: 3