Update modules/vad/silero_vad.py

modules/vad/silero_vad.py · CHANGED · +27 -28
```diff
@@ -6,7 +6,7 @@ from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
 import bisect
 import faster_whisper
-from faster_whisper.transcribe import SpeechTimestampsMap
+from faster_whisper.transcribe import SpeechTimestampsMap
 import gradio as gr
 
 class SileroVAD:
```
```diff
@@ -57,7 +57,7 @@ class SileroVAD:
             vad_options=vad_parameters,
             progress=progress
         )
-
+
         audio = self.collect_chunks(audio, speech_chunks)
         duration_after_vad = audio.shape[0] / sampling_rate
 
```
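With the speech chunks in hand, `collect_chunks` concatenates the detected speech regions into one contiguous array, which is why `duration_after_vad` is recomputed from `audio.shape[0]` right after the call. A minimal sketch of that behavior (an assumption about the helper, not the repository's exact implementation):

```python
import numpy as np

def collect_chunks(audio: np.ndarray, chunks: list) -> np.ndarray:
    # Keep only the VAD-detected speech regions; `chunks` is assumed to hold
    # {"start": int, "end": int} sample offsets into `audio`.
    if not chunks:
        return np.array([], dtype=np.float32)
    return np.concatenate([audio[c["start"]:c["end"]] for c in chunks])
```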
```diff
@@ -89,7 +89,7 @@
         vad_options = VadOptions(**kwargs)
 
         threshold = vad_options.threshold
-        neg_threshold = threshold - 0.15
+        neg_threshold = vad_options.neg_threshold
         min_speech_duration_ms = vad_options.min_speech_duration_ms
         max_speech_duration_s = vad_options.max_speech_duration_s
         min_silence_duration_ms = vad_options.min_silence_duration_ms
```
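`neg_threshold` gives the detector hysteresis: a segment opens when the speech probability rises above `threshold` and only closes once it drops below the lower `neg_threshold`, so probabilities hovering around a single cutoff do not toggle speech on and off every window. A simplified sketch of the idea (the real loop also enforces duration limits and silence padding):

```python
def hysteresis_segments(probs, threshold=0.5, neg_threshold=0.35, window=512):
    # probs: one speech probability per `window`-sample frame.
    segments, start, triggered = [], 0, False
    for i, p in enumerate(probs):
        if not triggered and p >= threshold:    # open a segment
            triggered, start = True, i * window
        elif triggered and p < neg_threshold:   # close only well below threshold
            segments.append({"start": start, "end": i * window})
            triggered = False
    if triggered:                               # still speaking at end of audio
        segments.append({"start": start, "end": len(probs) * window})
    return segments
```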
```diff
@@ -106,22 +106,18 @@
         min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
 
         audio_length_samples = len(audio)
-        state, context = self.model.get_initial_states(batch_size=1)
-
-        speech_probs = []
-        for current_start_sample in range(0, audio_length_samples, window_size_samples):
-            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
-
-            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
-            if len(chunk) < window_size_samples:
-                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
-            speech_probs.append(speech_prob)
+
+        padded_audio = np.pad(
+            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
+        )
+        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
 
         triggered = False
         speeches = []
         current_speech = {}
-
+        if neg_threshold is None:
+            neg_threshold = max(threshold - 0.15, 0.01)
+
         # to save potential segment end (and tolerate some silence)
         temp_end = 0
         # to save potential segment limits in case of maximum segment size reached
```
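This hunk replaces the per-window Python loop (one model call per window, threading RNN state by hand) with a single batched call: the waveform is zero-padded up to a multiple of `window_size_samples` so it can be reshaped and scored in one pass. A small sketch of the padding arithmetic, assuming the 512-sample window Silero VAD uses at 16 kHz:

```python
import numpy as np

window_size_samples = 512                      # assumed Silero window at 16 kHz
audio = np.zeros(1300, dtype=np.float32)       # toy length, not a multiple of 512

pad = window_size_samples - audio.shape[0] % window_size_samples
padded = np.pad(audio, (0, pad))               # 1300 -> 1536, i.e. 3 full windows
assert padded.shape[0] % window_size_samples == 0

# The batched model call returns one probability per window.
print(padded.shape[0] // window_size_samples)  # 3
```

One edge case worth noting: when the length is already an exact multiple, the modulo is 0 and the expression appends a full window of zeros; since the segment logic clips end times to `audio_length_samples`, the extra silent window should be harmless.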
```diff
@@ -244,17 +240,6 @@
             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
 
-    def restore_speech_timestamps(
-        self,
-        segments: List[dict],
-        speech_chunks: List[dict],
-        sampling_rate: Optional[int] = None,
-    ) -> List[dict]:
-        if sampling_rate is None:
-            sampling_rate = self.sampling_rate
-
-        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
-
     def restore_speech_timestamps(
         self,
         segments: List[dict],
```
```diff
@@ -267,8 +252,22 @@
         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
 
         for segment in segments:
-            segment["start"] = ts_map.get_original_time(segment["start"])
-            segment["end"] = ts_map.get_original_time(segment["end"])
+            if segment["words"]:
+                words = []
+                for word in segment["words"]:
+                    # Ensure the word start and end times are resolved to the same chunk.
+                    middle = (word["start"] + word["end"]) / 2
+                    chunk_index = ts_map.get_chunk_index(middle)
+                    word["start"] = ts_map.get_original_time(word["start"], chunk_index)
+                    word["end"] = ts_map.get_original_time(word["end"], chunk_index)
+                    words.append(word)
+
+                segment["start"] = words[0]["start"]
+                segment["end"] = words[-1]["end"]
+                segment["words"] = words
+            else:
+                segment["start"] = ts_map.get_original_time(segment["start"])
+                segment["end"] = ts_map.get_original_time(segment["end"])
 
-
+        return segments
```
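Taken together, the flow is: run VAD, transcribe the concatenated speech-only audio, then map the resulting timestamps back onto the original timeline. The midpoint trick above (`middle = (start + end) / 2`) resolves both ends of a word against the same speech chunk, so a word cannot straddle two chunks and come back with its end before its start. A hypothetical round-trip sketch (`transcribe` is a placeholder, and the exact `SileroVAD` method signatures may differ from this):

```python
vad = SileroVAD()

# Hypothetical API: returns trimmed speech-only audio plus the chunk offsets.
audio, speech_chunks = vad.run("input.wav")

segments = transcribe(audio)  # placeholder ASR; times are relative to trimmed audio
segments = vad.restore_speech_timestamps(segments, speech_chunks)
# `segments` now carries start/end (and word) times on the original recording.
```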