LAP-DEV committed
Commit 6c957f2 · verified · 1 Parent(s): b038962

Delete modules/vad

modules/vad/__init__.py DELETED
File without changes (the file was empty)
modules/vad/silero_vad.py DELETED
@@ -1,295 +0,0 @@
# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py

from faster_whisper.vad import VadOptions, get_vad_model
import numpy as np
from typing import BinaryIO, Union, List, Optional, Tuple
import warnings
import bisect
import faster_whisper
from faster_whisper.transcribe import SpeechTimestampsMap
from pydantic import Field  # provides the Field(...) metadata used by Segment and Word below
import gradio as gr

class Segment:
    def __init__(self):
        self.id: Optional[int] = Field(default=None, description="Incremental id for the segment")
        self.seek: Optional[int] = Field(default=None, description="Seek of the segment from chunked audio")
        self.text: Optional[str] = Field(default=None, description="Transcription text of the segment")
        self.start: Optional[float] = Field(default=None, description="Start time of the segment")
        self.end: Optional[float] = Field(default=None, description="End time of the segment")
        self.tokens: Optional[List[int]] = Field(default=None, description="List of token IDs")
        self.temperature: Optional[float] = Field(default=None, description="Temperature used during the decoding process")
        self.avg_logprob: Optional[float] = Field(default=None, description="Average log probability of the tokens")
        self.compression_ratio: Optional[float] = Field(default=None, description="Compression ratio of the segment")
        self.no_speech_prob: Optional[float] = Field(default=None, description="Probability that it's not speech")
        self.words: Optional[List['Word']] = Field(default=None, description="List of words contained in the segment")

class Word:
    def __init__(self):
        self.start: Optional[float] = Field(default=None, description="Start time of the word")
        self.end: Optional[float] = Field(default=None, description="End time of the word")
        self.word: Optional[str] = Field(default=None, description="Word text")
        self.probability: Optional[float] = Field(default=None, description="Probability of the word")

class SileroVAD:
    def __init__(self):
        self.sampling_rate = 16000
        self.window_size_samples = 512
        self.model = None

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            vad_parameters: VadOptions,
            progress: gr.Progress = gr.Progress()
            ) -> Tuple[np.ndarray, List[dict]]:
        """
        Run VAD

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        vad_parameters:
            Options for VAD processing.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        np.ndarray
            Pre-processed audio with VAD
        List[dict]
            Chunks of speeches to be used to restore the timestamps later
        """

        sampling_rate = self.sampling_rate

        if not isinstance(audio, np.ndarray):
            audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)

        duration = audio.shape[0] / sampling_rate
        duration_after_vad = duration

        if vad_parameters is None:
            vad_parameters = VadOptions()
        elif isinstance(vad_parameters, dict):
            vad_parameters = VadOptions(**vad_parameters)
        speech_chunks = self.get_speech_timestamps(
            audio=audio,
            vad_options=vad_parameters,
            progress=progress
        )

        audio = self.collect_chunks(audio, speech_chunks)
        duration_after_vad = audio.shape[0] / sampling_rate

        return audio, speech_chunks

    def get_speech_timestamps(
        self,
        audio: np.ndarray,
        vad_options: Optional[VadOptions] = None,
        progress: gr.Progress = gr.Progress(),
        **kwargs,
    ) -> List[dict]:
        """This method is used for splitting long audios into speech chunks using silero VAD.

        Args:
            audio: One dimensional float array.
            vad_options: Options for VAD processing.
            kwargs: VAD options passed as keyword arguments for backward compatibility.
            progress: Gradio progress to indicate progress.

        Returns:
            List of dicts containing begin and end samples of each speech chunk.
        """

        if self.model is None:
            self.update_model()

        if vad_options is None:
            vad_options = VadOptions(**kwargs)

        threshold = vad_options.threshold
        neg_threshold = vad_options.neg_threshold
        min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
        min_silence_duration_ms = vad_options.min_silence_duration_ms
        window_size_samples = self.window_size_samples
        speech_pad_ms = vad_options.speech_pad_ms
        min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000
        speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000
        max_speech_samples = (
            self.sampling_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
        )
        min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000
        min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000

        audio_length_samples = len(audio)

        padded_audio = np.pad(
            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
        )
        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)

        triggered = False
        speeches = []
        current_speech = {}
        if neg_threshold is None:
            neg_threshold = max(threshold - 0.15, 0.01)

        # to save potential segment end (and tolerate some silence)
        temp_end = 0
        # to save potential segment limits in case of maximum segment size reached
        prev_end = next_start = 0

        for i, speech_prob in enumerate(speech_probs):
            if (speech_prob >= threshold) and temp_end:
                temp_end = 0
                if next_start < prev_end:
                    next_start = window_size_samples * i

            if (speech_prob >= threshold) and not triggered:
                triggered = True
                current_speech["start"] = window_size_samples * i
                continue

            if (
                triggered
                and (window_size_samples * i) - current_speech["start"] > max_speech_samples
            ):
                if prev_end:
                    current_speech["end"] = prev_end
                    speeches.append(current_speech)
                    current_speech = {}
                    # previously reached silence (< neg_thres) and is still not speech (< thres)
                    if next_start < prev_end:
                        triggered = False
                    else:
                        current_speech["start"] = next_start
                    prev_end = next_start = temp_end = 0
                else:
                    current_speech["end"] = window_size_samples * i
                    speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

            if (speech_prob < neg_threshold) and triggered:
                if not temp_end:
                    temp_end = window_size_samples * i
                # condition to avoid cutting in very short silence
                if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
                    prev_end = temp_end
                if (window_size_samples * i) - temp_end < min_silence_samples:
                    continue
                else:
                    current_speech["end"] = temp_end
                    if (
                        current_speech["end"] - current_speech["start"]
                    ) > min_speech_samples:
                        speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

        if (
            current_speech
            and (audio_length_samples - current_speech["start"]) > min_speech_samples
        ):
            current_speech["end"] = audio_length_samples
            speeches.append(current_speech)

        for i, speech in enumerate(speeches):
            if i == 0:
                speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
            if i != len(speeches) - 1:
                silence_duration = speeches[i + 1]["start"] - speech["end"]
                if silence_duration < 2 * speech_pad_samples:
                    speech["end"] += int(silence_duration // 2)
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - silence_duration // 2)
                    )
                else:
                    speech["end"] = int(
                        min(audio_length_samples, speech["end"] + speech_pad_samples)
                    )
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - speech_pad_samples)
                    )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )

        return speeches

    def update_model(self):
        self.model = get_vad_model()

    @staticmethod
    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
        """Collects and concatenates audio chunks."""
        if not chunks:
            return np.array([], dtype=np.float32)

        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])

    @staticmethod
    def format_timestamp(
        seconds: float,
        always_include_hours: bool = False,
        decimal_marker: str = ".",
    ) -> str:
        assert seconds >= 0, "non-negative timestamp expected"
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return (
            f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
        )

    def restore_speech_timestamps(
        self,
        segments: List[Segment],
        speech_chunks: List[dict],
        sampling_rate: Optional[int] = None,
    ) -> List[Segment]:
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

        for segment in segments:
            if segment.words:
                words = []
                for word in segment.words:
                    # Ensure the word start and end times are resolved to the same chunk.
                    middle = (word.start + word.end) / 2
                    chunk_index = ts_map.get_chunk_index(middle)
                    word.start = ts_map.get_original_time(word.start, chunk_index)
                    word.end = ts_map.get_original_time(word.end, chunk_index)
                    words.append(word)

                segment.start = words[0].start
                segment.end = words[-1].end
                segment.words = words

            else:
                segment.start = ts_map.get_original_time(segment.start)
                segment.end = ts_map.get_original_time(segment.end)

        return segments
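For reference, a minimal usage sketch of the SileroVAD class deleted above (not part of this commit). The audio path and the VadOptions values are placeholder assumptions; the snippet only relies on the run, format_timestamp, and sampling_rate members defined in the removed file.

from faster_whisper.vad import VadOptions

# Hypothetical input file and VAD settings, chosen for illustration only.
vad = SileroVAD()
options = VadOptions(threshold=0.5, min_speech_duration_ms=250, speech_pad_ms=400)

# run() decodes the file, keeps only the detected speech, and returns the trimmed
# audio plus the chunk boundaries (in samples) needed later to map timestamps
# back onto the original recording.
speech_audio, speech_chunks = vad.run("sample_audio.wav", vad_parameters=options)

print(f"{len(speech_chunks)} speech chunk(s), "
      f"{speech_audio.shape[0] / vad.sampling_rate:.1f}s of audio kept")
for chunk in speech_chunks:
    start = SileroVAD.format_timestamp(chunk["start"] / vad.sampling_rate)
    end = SileroVAD.format_timestamp(chunk["end"] / vad.sampling_rate)
    print(f"  {start} --> {end}")
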
modules/vad/silero_vad_backup.py DELETED
@@ -1,264 +0,0 @@
# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py

from faster_whisper.vad import VadOptions, get_vad_model
import numpy as np
from typing import BinaryIO, Union, List, Optional, Tuple
import warnings
import faster_whisper
from faster_whisper.transcribe import SpeechTimestampsMap, Segment
import gradio as gr


class SileroVAD:
    def __init__(self):
        self.sampling_rate = 16000
        self.window_size_samples = 512
        self.model = None

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            vad_parameters: VadOptions,
            progress: gr.Progress = gr.Progress()
            ) -> Tuple[np.ndarray, List[dict]]:
        """
        Run VAD

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        vad_parameters:
            Options for VAD processing.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        np.ndarray
            Pre-processed audio with VAD
        List[dict]
            Chunks of speeches to be used to restore the timestamps later
        """

        sampling_rate = self.sampling_rate

        if not isinstance(audio, np.ndarray):
            audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)

        duration = audio.shape[0] / sampling_rate
        duration_after_vad = duration

        if vad_parameters is None:
            vad_parameters = VadOptions()
        elif isinstance(vad_parameters, dict):
            vad_parameters = VadOptions(**vad_parameters)
        speech_chunks = self.get_speech_timestamps(
            audio=audio,
            vad_options=vad_parameters,
            progress=progress
        )
        audio = self.collect_chunks(audio, speech_chunks)
        duration_after_vad = audio.shape[0] / sampling_rate

        return audio, speech_chunks

    def get_speech_timestamps(
        self,
        audio: np.ndarray,
        vad_options: Optional[VadOptions] = None,
        progress: gr.Progress = gr.Progress(),
        **kwargs,
    ) -> List[dict]:
        """This method is used for splitting long audios into speech chunks using silero VAD.

        Args:
            audio: One dimensional float array.
            vad_options: Options for VAD processing.
            kwargs: VAD options passed as keyword arguments for backward compatibility.
            progress: Gradio progress to indicate progress.

        Returns:
            List of dicts containing begin and end samples of each speech chunk.
        """

        if self.model is None:
            self.update_model()

        if vad_options is None:
            vad_options = VadOptions(**kwargs)

        threshold = vad_options.threshold
        min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
        min_silence_duration_ms = vad_options.min_silence_duration_ms
        window_size_samples = self.window_size_samples
        speech_pad_ms = vad_options.speech_pad_ms
        sampling_rate = 16000
        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        max_speech_samples = (
            sampling_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
        )
        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

        audio_length_samples = len(audio)

        state, context = self.model.get_initial_states(batch_size=1)

        speech_probs = []
        for current_start_sample in range(0, audio_length_samples, window_size_samples):
            progress(current_start_sample / audio_length_samples, desc="Detecting speeches only using VAD...")

            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
            if len(chunk) < window_size_samples:
                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
            speech_probs.append(speech_prob)

        triggered = False
        speeches = []
        current_speech = {}
        neg_threshold = threshold - 0.15

        # to save potential segment end (and tolerate some silence)
        temp_end = 0
        # to save potential segment limits in case of maximum segment size reached
        prev_end = next_start = 0

        for i, speech_prob in enumerate(speech_probs):
            if (speech_prob >= threshold) and temp_end:
                temp_end = 0
                if next_start < prev_end:
                    next_start = window_size_samples * i

            if (speech_prob >= threshold) and not triggered:
                triggered = True
                current_speech["start"] = window_size_samples * i
                continue

            if (
                triggered
                and (window_size_samples * i) - current_speech["start"] > max_speech_samples
            ):
                if prev_end:
                    current_speech["end"] = prev_end
                    speeches.append(current_speech)
                    current_speech = {}
                    # previously reached silence (< neg_thres) and is still not speech (< thres)
                    if next_start < prev_end:
                        triggered = False
                    else:
                        current_speech["start"] = next_start
                    prev_end = next_start = temp_end = 0
                else:
                    current_speech["end"] = window_size_samples * i
                    speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

            if (speech_prob < neg_threshold) and triggered:
                if not temp_end:
                    temp_end = window_size_samples * i
                # condition to avoid cutting in very short silence
                if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
                    prev_end = temp_end
                if (window_size_samples * i) - temp_end < min_silence_samples:
                    continue
                else:
                    current_speech["end"] = temp_end
                    if (
                        current_speech["end"] - current_speech["start"]
                    ) > min_speech_samples:
                        speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

        if (
            current_speech
            and (audio_length_samples - current_speech["start"]) > min_speech_samples
        ):
            current_speech["end"] = audio_length_samples
            speeches.append(current_speech)

        for i, speech in enumerate(speeches):
            if i == 0:
                speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
            if i != len(speeches) - 1:
                silence_duration = speeches[i + 1]["start"] - speech["end"]
                if silence_duration < 2 * speech_pad_samples:
                    speech["end"] += int(silence_duration // 2)
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - silence_duration // 2)
                    )
                else:
                    speech["end"] = int(
                        min(audio_length_samples, speech["end"] + speech_pad_samples)
                    )
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - speech_pad_samples)
                    )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )

        return speeches

    def update_model(self):
        self.model = get_vad_model()

    @staticmethod
    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
        """Collects and concatenates audio chunks."""
        if not chunks:
            return np.array([], dtype=np.float32)

        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])

    @staticmethod
    def format_timestamp(
        seconds: float,
        always_include_hours: bool = False,
        decimal_marker: str = ".",
    ) -> str:
        assert seconds >= 0, "non-negative timestamp expected"
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return (
            f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
        )

    def restore_speech_timestamps(
        self,
        segments: List[dict],
        speech_chunks: List[dict],
        sampling_rate: Optional[int] = None,
    ) -> List[dict]:
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

        for segment in segments:
            segment["start"] = ts_map.get_original_time(segment["start"])
            segment["end"] = ts_map.get_original_time(segment["end"])

        return segments