Update modules/vad/silero_vad.py

modules/vad/silero_vad.py · CHANGED · +27 -28
```diff
@@ -6,7 +6,7 @@ from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
 import bisect
 import faster_whisper
-from faster_whisper.transcribe import SpeechTimestampsMap
+from faster_whisper.transcribe import SpeechTimestampsMap
 import gradio as gr
 
 class SileroVAD:
```
```diff
@@ -57,7 +57,7 @@ class SileroVAD:
             vad_options=vad_parameters,
             progress=progress
         )
-
+
         audio = self.collect_chunks(audio, speech_chunks)
         duration_after_vad = audio.shape[0] / sampling_rate
 
```
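With the speech chunks in hand, `collect_chunks` concatenates the detected speech regions into one contiguous array, which is why `duration_after_vad` is recomputed from `audio.shape[0]` right after the call. A minimal sketch of that behavior (an assumption about the helper, not the repository's exact implementation):

```python
import numpy as np

def collect_chunks(audio: np.ndarray, chunks: list) -> np.ndarray:
    # Keep only the VAD-detected speech regions; `chunks` is assumed to hold
    # {"start": int, "end": int} sample offsets into `audio`.
    if not chunks:
        return np.array([], dtype=np.float32)
    return np.concatenate([audio[c["start"]:c["end"]] for c in chunks])
```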
```diff
@@ -89,7 +89,7 @@
         vad_options = VadOptions(**kwargs)
 
         threshold = vad_options.threshold
-        neg_threshold = threshold - 0.15
+        neg_threshold = vad_options.neg_threshold
         min_speech_duration_ms = vad_options.min_speech_duration_ms
         max_speech_duration_s = vad_options.max_speech_duration_s
         min_silence_duration_ms = vad_options.min_silence_duration_ms
```
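`neg_threshold` gives the detector hysteresis: a segment opens when the speech probability rises above `threshold` and only closes once it drops below the lower `neg_threshold`, so probabilities hovering around a single cutoff do not toggle speech on and off every window. A simplified sketch of the idea (the real loop also enforces duration limits and silence padding):

```python
def hysteresis_segments(probs, threshold=0.5, neg_threshold=0.35, window=512):
    # probs: one speech probability per `window`-sample frame.
    segments, start, triggered = [], 0, False
    for i, p in enumerate(probs):
        if not triggered and p >= threshold:    # open a segment
            triggered, start = True, i * window
        elif triggered and p < neg_threshold:   # close only well below threshold
            segments.append({"start": start, "end": i * window})
            triggered = False
    if triggered:                               # still speaking at end of audio
        segments.append({"start": start, "end": len(probs) * window})
    return segments
```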
```diff
@@ -106,22 +106,18 @@
         min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
 
         audio_length_samples = len(audio)
-        state, context = self.model.get_initial_states(batch_size=1)
-
-        speech_probs = []
-        for current_start_sample in range(0, audio_length_samples, window_size_samples):
-            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
-
-            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
-            if len(chunk) < window_size_samples:
-                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
-            speech_probs.append(speech_prob)
+
+        padded_audio = np.pad(
+            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
+        )
+        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
 
         triggered = False
         speeches = []
         current_speech = {}
-
+        if neg_threshold is None:
+            neg_threshold = max(threshold - 0.15, 0.01)
+
         # to save potential segment end (and tolerate some silence)
         temp_end = 0
         # to save potential segment limits in case of maximum segment size reached
```
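This hunk replaces the per-window Python loop (one model call per window, threading RNN state by hand) with a single batched call: the waveform is zero-padded up to a multiple of `window_size_samples` so it can be reshaped and scored in one pass. A small sketch of the padding arithmetic, assuming the 512-sample window Silero VAD uses at 16 kHz:

```python
import numpy as np

window_size_samples = 512                      # assumed Silero window at 16 kHz
audio = np.zeros(1300, dtype=np.float32)       # toy length, not a multiple of 512

pad = window_size_samples - audio.shape[0] % window_size_samples
padded = np.pad(audio, (0, pad))               # 1300 -> 1536, i.e. 3 full windows
assert padded.shape[0] % window_size_samples == 0

# The batched model call returns one probability per window.
print(padded.shape[0] // window_size_samples)  # 3
```

One edge case worth noting: when the length is already an exact multiple, the modulo is 0 and the expression appends a full window of zeros; since the segment logic clips end times to `audio_length_samples`, the extra silent window should be harmless.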
```diff
@@ -244,17 +240,6 @@
             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
 
-    def restore_speech_timestamps(
-        self,
-        segments: List[dict],
-        speech_chunks: List[dict],
-        sampling_rate: Optional[int] = None,
-    ) -> List[dict]:
-        if sampling_rate is None:
-            sampling_rate = self.sampling_rate
-
-        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
-
     def restore_speech_timestamps(
         self,
         segments: List[dict],
```
```diff
@@ -267,8 +252,22 @@
         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
 
         for segment in segments:
-            segment["start"] = ts_map.get_original_time(segment["start"])
-            segment["end"] = ts_map.get_original_time(segment["end"])
+            if segment["words"]:
+                words = []
+                for word in segment["words"]:
+                    # Ensure the word start and end times are resolved to the same chunk.
+                    middle = (word["start"] + word["end"]) / 2
+                    chunk_index = ts_map.get_chunk_index(middle)
+                    word["start"] = ts_map.get_original_time(word["start"], chunk_index)
+                    word["end"] = ts_map.get_original_time(word["end"], chunk_index)
+                    words.append(word)
+
+                segment["start"] = words[0]["start"]
+                segment["end"] = words[-1]["end"]
+                segment["words"] = words
+            else:
+                segment["start"] = ts_map.get_original_time(segment["start"])
+                segment["end"] = ts_map.get_original_time(segment["end"])
 
-
+        return segments
```
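Taken together, the flow is: run VAD, transcribe the concatenated speech-only audio, then map the resulting timestamps back onto the original timeline. The midpoint trick above (`middle = (start + end) / 2`) resolves both ends of a word against the same speech chunk, so a word cannot straddle two chunks and come back with its end before its start. A hypothetical round-trip sketch (`transcribe` is a placeholder, and the exact `SileroVAD` method signatures may differ from this):

```python
vad = SileroVAD()

# Hypothetical API: returns trimmed speech-only audio plus the chunk offsets.
audio, speech_chunks = vad.run("input.wav")

segments = transcribe(audio)  # placeholder ASR; times are relative to trimmed audio
segments = vad.restore_speech_timestamps(segments, speech_chunks)
# `segments` now carries start/end (and word) times on the original recording.
```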