LAP-DEV committed on
Commit 6a9efdd · verified · 1 parent: 5101e2a

Update modules/vad/silero_vad.py

Files changed (1): modules/vad/silero_vad.py (+27 -28)
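In short: the unused Segment import is dropped, the per-window streaming VAD loop is replaced by one batched model call over zero-padded audio, neg_threshold is now read from VadOptions with the old max(threshold - 0.15, 0.01) formula kept as a fallback, a duplicated restore_speech_timestamps definition is removed, and timestamp restoration now maps word-level times (not just segment boundaries) back to the original audio.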
modules/vad/silero_vad.py CHANGED
@@ -6,7 +6,7 @@ from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
 import bisect
 import faster_whisper
-from faster_whisper.transcribe import SpeechTimestampsMap, Segment
+from faster_whisper.transcribe import SpeechTimestampsMap
 import gradio as gr
 
 class SileroVAD:
@@ -57,7 +57,7 @@ class SileroVAD:
                 vad_options=vad_parameters,
                 progress=progress
                 )
-
+
             audio = self.collect_chunks(audio, speech_chunks)
             duration_after_vad = audio.shape[0] / sampling_rate
 
@@ -89,7 +89,7 @@ class SileroVAD:
         vad_options = VadOptions(**kwargs)
 
         threshold = vad_options.threshold
-        neg_threshold = max(threshold - 0.15, 0.01)
+        neg_threshold = vad_options.neg_threshold
         min_speech_duration_ms = vad_options.min_speech_duration_ms
         max_speech_duration_s = vad_options.max_speech_duration_s
         min_silence_duration_ms = vad_options.min_silence_duration_ms
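Note: the two thresholds implement hysteresis. A speech segment opens when the per-window probability rises above threshold and closes only once it falls below the lower neg_threshold, so brief dips inside a word do not split the segment. After this change the value is read from VadOptions, and the old max(threshold - 0.15, 0.01) formula survives as a fallback (see the next hunk). A minimal standalone sketch of the rule, not the module's actual loop:

# Hypothetical helper illustrating the hysteresis rule, not part of the module.
def hysteresis_segments(speech_probs, threshold=0.5, neg_threshold=None):
    if neg_threshold is None:
        neg_threshold = max(threshold - 0.15, 0.01)  # same fallback as the commit
    segments = []
    start = None
    for i, prob in enumerate(speech_probs):
        if start is None and prob >= threshold:
            start = i                    # open a segment on the high threshold
        elif start is not None and prob < neg_threshold:
            segments.append((start, i))  # close it only on the lower threshold
            start = None
    if start is not None:
        segments.append((start, len(speech_probs)))
    return segments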
@@ -106,22 +106,18 @@ class SileroVAD:
         min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
 
         audio_length_samples = len(audio)
-        state, context = self.model.get_initial_states(batch_size=1)
-
-        speech_probs = []
-        for current_start_sample in range(0, audio_length_samples, window_size_samples):
-            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
 
-            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
-            if len(chunk) < window_size_samples:
-                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state, context = self.model(chunk, state, context, self.sampling_rate)
-            speech_probs.append(speech_prob)
+        padded_audio = np.pad(
+            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
+        )
+        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
 
         triggered = False
         speeches = []
         current_speech = {}
-
+        if neg_threshold is None:
+            neg_threshold = max(threshold - 0.15, 0.01)
+
         # to save potential segment end (and tolerate some silence)
         temp_end = 0
         # to save potential segment limits in case of maximum segment size reached
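Note: the removed loop fed one window at a time through the model and carried recurrent state between calls; the new code zero-pads the audio to a whole number of windows and scores them in a single batched call. A rough sketch of the padding arithmetic (numbers are illustrative; when the length is already an exact multiple, the expression as written pads one full extra window of zeros, which should simply score as silence):

import numpy as np

window_size_samples = 512
audio = np.zeros(8000, dtype=np.float32)  # stand-in for real audio

# Pad the tail so the signal splits into whole windows.
pad = window_size_samples - audio.shape[0] % window_size_samples
padded_audio = np.pad(audio, (0, pad))
assert padded_audio.shape[0] % window_size_samples == 0

# The model receives shape (1, n_samples) and yields one probability per
# window; the equivalent windowing of the padded signal looks like this:
windows = padded_audio.reshape(-1, window_size_samples)
print(windows.shape)  # (16, 512), i.e. ceil(8000 / 512) windows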
@@ -244,17 +240,6 @@ class SileroVAD:
             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
 
-    def restore_speech_timestamps(
-        self,
-        segments: List[dict],
-        speech_chunks: List[dict],
-        sampling_rate: Optional[int] = None,
-    ) -> List[dict]:
-        if sampling_rate is None:
-            sampling_rate = self.sampling_rate
-
-        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
-
     def restore_speech_timestamps(
         self,
         segments: List[dict],
@@ -267,8 +252,22 @@ class SileroVAD:
         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
 
         for segment in segments:
-            segment["start"] = ts_map.get_original_time(segment["start"])
-            segment["end"] = ts_map.get_original_time(segment["end"])
+            if segment["words"]:
+                words = []
+                for word in segment["words"]:
+                    # Ensure the word start and end times are resolved to the same chunk.
+                    middle = (word["start"] + word["end"]) / 2
+                    chunk_index = ts_map.get_chunk_index(middle)
+                    word["start"] = ts_map.get_original_time(word["start"], chunk_index)
+                    word["end"] = ts_map.get_original_time(word["end"], chunk_index)
+                    words.append(word)
+
+                segment["start"] = words[0]["start"]
+                segment["end"] = words[-1]["end"]
+                segment["words"] = words
 
-        return segments
+            else:
+                segment["start"] = ts_map.get_original_time(segment["start"])
+                segment["end"] = ts_map.get_original_time(segment["end"])
 
+        return segments
 
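Note: transcription runs on the concatenated speech chunks, so segment and word times are in compressed audio time; SpeechTimestampsMap converts them back to positions in the original recording. Resolving a word's start and end against the chunk containing its midpoint keeps both ends in the same chunk even when the word sits near a chunk boundary. A usage sketch with made-up chunk values (the SpeechTimestampsMap calls are the same ones the diff uses):

from faster_whisper.transcribe import SpeechTimestampsMap

sampling_rate = 16000
# Hypothetical VAD output: two speech chunks, in samples of the original audio.
speech_chunks = [
    {"start": 0, "end": 2 * sampling_rate},                    # 0.0s-2.0s
    {"start": 10 * sampling_rate, "end": 12 * sampling_rate},  # 10.0s-12.0s
]
ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

# A word spanning 2.5s-2.7s of compressed time lies in the second chunk,
# which starts at 10.0s in the original audio.
word = {"start": 2.5, "end": 2.7}
chunk_index = ts_map.get_chunk_index((word["start"] + word["end"]) / 2)
print(ts_map.get_original_time(word["start"], chunk_index))  # 10.5
print(ts_map.get_original_time(word["end"], chunk_index))    # 10.7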