jhj0517 committed · Commit 16a0393 · 1 parent: 7386da0

add `restore_speech_timestamps()`
Files changed: modules/vad/silero_vad.py (+25 -4)
modules/vad/silero_vad.py
CHANGED
@@ -2,9 +2,10 @@
 
 from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
-from typing import BinaryIO, Union, List, Optional
+from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
 import faster_whisper
+from faster_whisper.transcribe import SpeechTimestampsMap, Segment
 import gradio as gr
 
 
@@ -17,7 +18,8 @@ class SileroVAD:
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
             vad_parameters: VadOptions,
-            progress: gr.Progress = gr.Progress()
+            progress: gr.Progress = gr.Progress()
+            ) -> Tuple[np.ndarray, List[dict]]:
         """
         Run VAD
 
@@ -32,8 +34,10 @@ class SileroVAD:
 
         Returns
         ----------
-
+        np.ndarray
             Pre-processed audio with VAD
+        List[dict]
+            Chunks of speeches to be used to restore the timestamps later
         """
 
         sampling_rate = self.sampling_rate
@@ -56,7 +60,7 @@ class SileroVAD:
         audio = self.collect_chunks(audio, speech_chunks)
         duration_after_vad = audio.shape[0] / sampling_rate
 
-        return audio
+        return audio, speech_chunks
 
     def get_speech_timestamps(
             self,
@@ -241,3 +245,20 @@ class SileroVAD:
             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
 
+    def restore_speech_timestamps(
+            self,
+            segments: List[dict],
+            speech_chunks: List[dict],
+            sampling_rate: Optional[int] = None,
+            ) -> List[dict]:
+        if sampling_rate is None:
+            sampling_rate = self.sampling_rate
+
+        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
+
+        for segment in segments:
+            segment["start"] = ts_map.get_original_time(segment["start"])
+            segment["end"] = ts_map.get_original_time(segment["end"])
+
+        return segments
+
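For context, here is a minimal, self-contained sketch (not part of the commit) of what the new `restore_speech_timestamps()` does: it uses faster_whisper's `SpeechTimestampsMap` to map segment times measured on the VAD-trimmed audio back onto the original timeline. The sampling rate, speech chunks, and segment below are made-up illustration values; chunk `start`/`end` are sample indices, as produced by `get_speech_timestamps()`.

# Illustration only: values are made up; SpeechTimestampsMap is the faster_whisper
# helper that the new restore_speech_timestamps() method delegates to.
from faster_whisper.transcribe import SpeechTimestampsMap

sampling_rate = 16000

# Speech chunks kept by VAD, as sample indices in the ORIGINAL audio:
# 2.0-4.0 s and 10.0-11.5 s. Everything in between was cut before transcription.
speech_chunks = [
    {"start": 2 * sampling_rate, "end": 4 * sampling_rate},
    {"start": 10 * sampling_rate, "end": int(11.5 * sampling_rate)},
]

# A segment transcribed from the trimmed audio; its times are relative to the
# trimmed timeline (only 3.5 s of kept speech exist there).
segment = {"start": 1.0, "end": 3.0, "text": "example"}

# Map trimmed-timeline times back to the original timeline, exactly as
# restore_speech_timestamps() does for each segment dict.
ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
segment["start"] = ts_map.get_original_time(segment["start"])
segment["end"] = ts_map.get_original_time(segment["end"])

print(segment)  # start becomes 3.0 and end becomes 11.0 (seconds in the original audio)

This is also why `run()` now returns `speech_chunks` alongside the trimmed audio: the caller needs those chunks later to undo the trimming in the reported timestamps.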