Update modules/vad/silero_vad.py
modules/vad/silero_vad.py CHANGED (+19 −19)
@@ -4,11 +4,11 @@ from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
 from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
+import bisect
 import faster_whisper
 from faster_whisper.transcribe import SpeechTimestampsMap, Segment
 import gradio as gr
 
-
 class SileroVAD:
     def __init__(self):
         self.sampling_rate = 16000
@@ -57,6 +57,7 @@
             vad_options=vad_parameters,
             progress=progress
         )
+
         audio = self.collect_chunks(audio, speech_chunks)
         duration_after_vad = audio.shape[0] / sampling_rate
 
@@ -93,35 +94,27 @@
         min_silence_duration_ms = vad_options.min_silence_duration_ms
         window_size_samples = self.window_size_samples
         speech_pad_ms = vad_options.speech_pad_ms
-
-        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
-        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000
+        speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000
         max_speech_samples = (
-            sampling_rate * max_speech_duration_s
+            self.sampling_rate * max_speech_duration_s
             - window_size_samples
             - 2 * speech_pad_samples
         )
-        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+        min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000
+        min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
 
         audio_length_samples = len(audio)
 
-        state, context = self.model.get_initial_states(batch_size=1)
-
-        speech_probs = []
-        for current_start_sample in range(0, audio_length_samples, window_size_samples):
-            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
-
-            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
-            if len(chunk) < window_size_samples:
-                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
-            speech_probs.append(speech_prob)
+        padded_audio = np.pad(
+            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
+        )
+        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
 
         triggered = False
         speeches = []
         current_speech = {}
-        neg_threshold = threshold - 0.15
+        neg_threshold = vad_options.neg_threshold
 
         # to save potential segment end (and tolerate some silence)
         temp_end = 0
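Note on this hunk: the removed lines streamed the audio through the model one 512-sample window at a time, carrying recurrent state and context and reporting Gradio progress per window; the replacement pads the tail of the audio to a whole number of windows and scores them all in a single batched call, so speech_probs arrives as one array. neg_threshold is likewise now read from vad_options rather than computed from threshold. A minimal standalone sketch of the padding arithmetic (hypothetical clip length, not code from this repo):

import numpy as np

window_size_samples = 512            # Silero VAD window size at 16 kHz
audio = np.zeros(20800, np.float32)  # hypothetical ~1.3 s mono clip

# Pad the tail so the length becomes a whole number of windows. One quirk:
# if the length is already a multiple, this appends a full extra window of
# zeros (pad == window_size_samples, never 0).
pad = window_size_samples - audio.shape[0] % window_size_samples
padded_audio = np.pad(audio, (0, pad))

assert padded_audio.shape[0] % window_size_samples == 0
print(padded_audio.shape[0] // window_size_samples)  # 41 windows (20800 -> 20992)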
@@ -222,6 +215,13 @@
 
         return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
 
+    def get_chunk_index(self, time: float) -> int:
+        sample = int(time * self.sampling_rate)
+        return min(
+            bisect.bisect(self.chunk_end_sample, sample),
+            len(self.chunk_end_sample) - 1,
+        )
+
     @staticmethod
     def format_timestamp(
         seconds: float,
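Note on this hunk: get_chunk_index maps a timestamp in the VAD-trimmed audio to the index of the speech chunk containing it, clamping out-of-range times to the last chunk; this is what the new import bisect supports. self.chunk_end_sample is not shown in this diff, so the sketch below assumes it holds the cumulative end position (in samples) of each collected chunk:

import bisect

sampling_rate = 16000
# Hypothetical cumulative end positions of three collected speech chunks.
chunk_end_sample = [8000, 24000, 40000]

def get_chunk_index(time: float) -> int:
    # bisect.bisect counts how many chunk ends lie at or before this sample;
    # min() clamps times past the last end to the final chunk index.
    sample = int(time * sampling_rate)
    return min(bisect.bisect(chunk_end_sample, sample),
               len(chunk_end_sample) - 1)

print(get_chunk_index(0.25))  # 0: sample 4000 falls in the first chunk
print(get_chunk_index(1.0))   # 1: sample 16000 falls in the second chunk
print(get_chunk_index(9.0))   # 2: past the end, clamped to the last chunk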
|