LAP-DEV committed
Commit 3824a61 · verified · Parent: 13c185e

Update modules/vad/silero_vad.py

Files changed (1):
  1. modules/vad/silero_vad.py +19 -19
modules/vad/silero_vad.py CHANGED
@@ -4,11 +4,11 @@ from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
 from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
+import bisect
 import faster_whisper
 from faster_whisper.transcribe import SpeechTimestampsMap, Segment
 import gradio as gr
 
-
 class SileroVAD:
     def __init__(self):
         self.sampling_rate = 16000
@@ -57,6 +57,7 @@ class SileroVAD:
                 vad_options=vad_parameters,
                 progress=progress
             )
+
             audio = self.collect_chunks(audio, speech_chunks)
             duration_after_vad = audio.shape[0] / sampling_rate
 
@@ -93,35 +94,27 @@ class SileroVAD:
         min_silence_duration_ms = vad_options.min_silence_duration_ms
         window_size_samples = self.window_size_samples
         speech_pad_ms = vad_options.speech_pad_ms
-        sampling_rate = 16000
-        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
-        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000
+        speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000
         max_speech_samples = (
-            sampling_rate * max_speech_duration_s
+            self.sampling_rate * max_speech_duration_s
             - window_size_samples
             - 2 * speech_pad_samples
         )
-        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+        min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000
+        min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
 
         audio_length_samples = len(audio)
 
-        state, context = self.model.get_initial_states(batch_size=1)
-
-        speech_probs = []
-        for current_start_sample in range(0, audio_length_samples, window_size_samples):
-            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
-
-            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
-            if len(chunk) < window_size_samples:
-                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
-            speech_probs.append(speech_prob)
+        padded_audio = np.pad(
+            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
+        )
+        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
 
         triggered = False
         speeches = []
         current_speech = {}
-        neg_threshold = threshold - 0.15
+        neg_threshold = vad_options.neg_threshold
 
         # to save potential segment end (and tolerate some silence)
         temp_end = 0
@@ -222,6 +215,13 @@ class SileroVAD:
 
         return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
 
+    def get_chunk_index(self, time: float) -> int:
+        sample = int(time * self.sampling_rate)
+        return min(
+            bisect.bisect(self.chunk_end_sample, sample),
+            len(self.chunk_end_sample) - 1,
+        )
+
     @staticmethod
     def format_timestamp(
         seconds: float,
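
For readers scanning the third hunk: the old implementation streamed the audio through the model one window at a time, carrying recurrent state (and a progress callback) across iterations, while the new code pads the audio to a whole number of windows and scores everything in a single batched call. A minimal sketch of the padding arithmetic, with hypothetical sizes (512 is a window size commonly used with Silero VAD at 16 kHz):

import numpy as np

window_size_samples = 512                 # assumed Silero window at 16 kHz
audio = np.zeros(1300, dtype=np.float32)  # hypothetical input length

# Pad up to the next multiple of the window size, as in the new hunk.
# Note: when the length is already an exact multiple, this expression
# still appends one full window of zeros.
pad = window_size_samples - audio.shape[0] % window_size_samples
padded = np.pad(audio, (0, pad))          # 1300 -> 1536 samples (3 windows)
batch = padded.reshape(1, -1)             # shape (1, 1536): one model call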
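
The new get_chunk_index helper maps a timestamp in seconds back to the index of the speech chunk containing it, in the style of faster_whisper's SpeechTimestampsMap. A standalone sketch, assuming chunk_end_sample holds the cumulative end offset of each collected chunk in samples (the values below are hypothetical):

import bisect

chunk_end_sample = [16000, 48000, 80000]  # hypothetical cumulative chunk ends
sampling_rate = 16000

def get_chunk_index(time: float) -> int:
    # bisect returns the index of the first chunk whose end offset exceeds
    # the sample position; min() clamps queries past the last chunk.
    sample = int(time * sampling_rate)
    return min(bisect.bisect(chunk_end_sample, sample),
               len(chunk_end_sample) - 1)

print(get_chunk_index(0.5))   # 0: sample 8000 falls in the first chunk
print(get_chunk_index(1.5))   # 1: sample 24000 falls in the second chunk
print(get_chunk_index(99.0))  # 2: clamped to the last chunk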