HoneyTian committed
Commit 00e4381 · 1 Parent(s): e90b328
Files changed (2):
  1. main.py +43 -10
  2. toolbox/vad/utils.py +135 -0
main.py CHANGED
@@ -23,6 +23,7 @@ from toolbox.os.command import Command
 from toolbox.torchaudio.models.vad.fsmn_vad.inference_fsmn_vad_onnx import InferenceFSMNVadOnnx
 from toolbox.torchaudio.models.vad.silero_vad.inference_silero_vad import InferenceSileroVad
 from toolbox.torchaudio.utils.visualization import process_speech_probs
+from toolbox.vad.utils import PostProcess
 
 log.setup_size_rotating(log_directory=log_directory, tz_info=time_zone_info)
 
@@ -98,7 +99,12 @@ def generate_image(signal: np.ndarray, speech_probs: np.ndarray, sample_rate: in
     return temp_file.name
 
 
-def when_click_vad_button(audio_file_t = None, audio_microphone_t = None, engine: str = None):
+def when_click_vad_button(audio_file_t = None, audio_microphone_t = None,
+                          start_ring_rate: float = 0.5, end_ring_rate: float = 0.3,
+                          min_silence_length: int = 2,
+                          max_speech_length: int = 10000, min_speech_length: int = 10,
+                          engine: str = None,
+                          ):
     if audio_file_t is None and audio_microphone_t is None:
         raise gr.Error(f"audio file and microphone is null.")
     if audio_file_t is not None and audio_microphone_t is not None:
@@ -136,15 +142,28 @@ def when_click_vad_button(audio_file_t = None, audio_microphone_t = None, engine
         lsnr = lsnr / 30
 
         frame_step = infer_engine.config.hop_size
-        probs = process_speech_probs(audio, probs, frame_step)
-        lsnr = process_speech_probs(audio, lsnr, frame_step)
-        probs_image = generate_image(audio, probs)
-        lsnr_image = generate_image(audio, lsnr)
+        probs_ = process_speech_probs(audio, probs, frame_step)
+        probs_image = generate_image(audio, probs_)
+
+        lsnr_ = process_speech_probs(audio, lsnr, frame_step)
+        lsnr_image = generate_image(audio, lsnr_)
+
+        # post process
+        vad_post_process = PostProcess(
+            start_ring_rate=start_ring_rate,
+            end_ring_rate=end_ring_rate,
+            min_silence_length=min_silence_length,
+            max_speech_length=max_speech_length,
+            min_speech_length=min_speech_length
+        )
+        vad = vad_post_process.post_process(probs)
+        vad_ = process_speech_probs(audio, vad, frame_step)
+        vad_image = generate_image(audio, vad_)
 
     except Exception as e:
         raise gr.Error(f"vad failed, error type: {type(e)}, error text: {str(e)}.")
 
-    return probs_image, lsnr_image, message
+    return vad_image, probs_image, lsnr_image, message
 
 
 def main():
@@ -218,22 +237,36 @@ def main():
             with gr.TabItem("microphone"):
                 vad_audio_microphone = gr.Audio(sources="microphone", label="audio")
 
-            vad_engine = gr.Dropdown(choices=vad_engine_choices, value=vad_engine_choices[0], label="engine")
+            with gr.Row():
+                vad_start_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label="start_ring_rate")
+                vad_end_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="end_ring_rate")
+                vad_min_silence_length = gr.Number(value=2, label="min_silence_length")
+            with gr.Row():
+                vad_max_speech_length = gr.Number(value=100000, label="max_speech_length")
+                vad_min_speech_length = gr.Number(value=10, label="min_speech_length")
+            vad_engine = gr.Dropdown(choices=vad_engine_choices, value=vad_engine_choices[0], label="engine")
             vad_button = gr.Button(variant="primary")
         with gr.Column(variant="panel", scale=5):
             vad_vad_image = gr.Image(label="vad")
+            vad_prob_image = gr.Image(label="prob")
             vad_lsnr_image = gr.Image(label="lsnr")
             vad_message = gr.Textbox(lines=1, max_lines=20, label="message")
 
         vad_button.click(
             when_click_vad_button,
-            inputs=[vad_audio_file, vad_audio_microphone, vad_engine],
-            outputs=[vad_vad_image, vad_lsnr_image, vad_message],
+            inputs=[
+                vad_audio_file, vad_audio_microphone,
+                vad_start_ring_rate, vad_end_ring_rate,
+                vad_min_silence_length,
+                vad_max_speech_length, vad_min_speech_length,
+                vad_engine,
+            ],
+            outputs=[vad_vad_image, vad_prob_image, vad_lsnr_image, vad_message],
         )
         gr.Examples(
             examples=examples,
             inputs=[vad_audio_file, vad_audio_microphone, vad_engine],
-            outputs=[vad_vad_image, vad_lsnr_image, vad_message],
+            outputs=[vad_vad_image, vad_prob_image, vad_lsnr_image, vad_message],
             fn=when_click_vad_button,
             # cache_examples=True,
             # cache_mode="lazy",
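For orientation: the handler turns frame-level model outputs into plot-ready curves by expanding each frame value over its hop (process_speech_probs with frame_step = hop_size) before rendering. The sketch below illustrates that expansion step only; expand_frame_probs is a hypothetical stand-in, not the repo's process_speech_probs, whose implementation is not shown in this diff.

import numpy as np

def expand_frame_probs(signal: np.ndarray, frame_probs, frame_step: int) -> np.ndarray:
    # Repeat each frame-level value frame_step times, then pad/trim to the signal length.
    expanded = np.repeat(np.asarray(frame_probs, dtype=np.float32), frame_step)
    if len(expanded) < len(signal):
        expanded = np.pad(expanded, (0, len(signal) - len(expanded)))
    return expanded[:len(signal)]

signal = np.zeros(16000, dtype=np.float32)   # 1 s of dummy audio at 16 kHz
frame_probs = [0.1] * 20 + [0.9] * 30        # 50 frames of model output
aligned = expand_frame_probs(signal, frame_probs, frame_step=320)
print(aligned.shape)                          # (16000,)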
toolbox/vad/utils.py ADDED
@@ -0,0 +1,135 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import collections
+
+from typing import List, Tuple
+
+
+class PostProcess(object):
+    def __init__(self,
+                 start_ring_rate: float = 0.5,
+                 end_ring_rate: float = 0.5,
+                 min_silence_length: int = 1,
+                 max_speech_length: float = 10,
+                 min_speech_length: float = 2,
+                 ):
+        self.start_ring_rate = start_ring_rate
+        self.end_ring_rate = end_ring_rate
+        self.max_speech_length = max_speech_length
+        self.min_speech_length = min_speech_length
+        self.min_silence_length = min_silence_length
+
+        # segments
+        self.ring_buffer = collections.deque(maxlen=10)
+        self.triggered = False
+
+        # vad segments
+        self.is_first_segment = True
+        self.start_idx: int = -1
+        self.end_idx: int = -1
+
+        # speech probs
+        self.voiced_frames: List[Tuple[int, float]] = list()
+
+    def segments_generator(self, probs: List[float]):
+        for idx, prob in enumerate(probs):
+            if not self.triggered:
+                self.ring_buffer.append((idx, prob))
+                num_voiced = sum([p for _, p in self.ring_buffer])
+
+                if num_voiced > self.start_ring_rate * self.ring_buffer.maxlen:
+                    self.triggered = True
+                    for idx_prob_t in self.ring_buffer:
+                        self.voiced_frames.append(idx_prob_t)
+                continue
+
+            idx_prob_t = (idx, prob)
+            self.voiced_frames.append(idx_prob_t)
+            self.ring_buffer.append(idx_prob_t)
+            num_voiced = sum([p for _, p in self.ring_buffer])
+
+            if num_voiced < self.end_ring_rate * self.ring_buffer.maxlen:
+                segment = [
+                    self.voiced_frames[0][0],
+                    self.voiced_frames[-1][0],
+                ]
+                yield segment
+                self.triggered = False
+                self.ring_buffer.clear()
+                self.voiced_frames: List[Tuple[int, float]] = list()
+                continue
+
+    def vad_segments_generator(self, segments_generator):
+        segments = list(segments_generator)
+
+        for i, segment in enumerate(segments):
+            start = segment[0]
+            end = segment[1]
+
+            if self.start_idx == -1 and self.end_idx == -1:
+                self.start_idx = start
+                self.end_idx = end
+                continue
+
+            if self.end_idx - self.start_idx > self.max_speech_length:
+                end_ = self.start_idx + self.max_speech_length
+                vad_segment = [self.start_idx, end_]
+                yield vad_segment
+                self.start_idx = end_
+
+            silence_length = start - self.end_idx
+            if silence_length < self.min_silence_length:
+                self.end_idx = end
+                continue
+
+            if self.end_idx - self.start_idx < self.min_speech_length:
+                self.start_idx = start
+                self.end_idx = end
+                continue
+
+            vad_segment = [self.start_idx, self.end_idx]
+            yield vad_segment
+            self.start_idx = start
+            self.end_idx = end
+
+    def vad(self, probs: List[float]) -> List[list]:
+        segments = self.segments_generator(probs)
+        vad_segments = self.vad_segments_generator(segments)
+        vad_segments = list(vad_segments)
+        return vad_segments
+
+    def last_vad_segments(self) -> List[list]:
+        # last segments
+        if len(self.voiced_frames) == 0:
+            segments = []
+        else:
+            segment = [
+                self.voiced_frames[0][0],
+                self.voiced_frames[-1][0]
+            ]
+            segments = [segment]
+
+        # last vad segments
+        vad_segments = self.vad_segments_generator(segments)
+        vad_segments = list(vad_segments)
+
+        if self.start_idx > 1e-5 and self.end_idx > 1e-5:
+            vad_segments = vad_segments + [[self.start_idx, self.end_idx]]
+        return vad_segments
+
+    def post_process(self, probs: List[float]):
+        vad_segments = list()
+        segments = self.vad(probs)
+        vad_segments += segments
+        segments = self.last_vad_segments()
+        vad_segments += segments
+
+        result = [0] * len(probs)
+        for begin, end in vad_segments:
+            result[begin: end] = [1] * (end - begin)
+
+        return result
+
+
+if __name__ == "__main__":
+    pass
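PostProcess applies a webrtcvad-style hysteresis to the probability stream: a 10-frame ring buffer opens a segment when the summed probability exceeds start_ring_rate × 10 and closes it when the sum drops below end_ring_rate × 10; vad_segments_generator then merges segments separated by fewer than min_silence_length frames, splits runs longer than max_speech_length frames, and drops pending segments shorter than min_speech_length frames. All five parameters are in frame units. A minimal one-shot usage sketch with synthetic probabilities (illustrative values, not from the repo):

from toolbox.vad.utils import PostProcess

# Synthetic frame-level speech probabilities: silence, speech, silence.
probs = [0.1] * 20 + [0.9] * 50 + [0.1] * 30

post = PostProcess(
    start_ring_rate=0.5,      # open: summed prob over the 10-frame buffer > 0.5 * 10
    end_ring_rate=0.3,        # close: summed prob < 0.3 * 10
    min_silence_length=2,     # gaps shorter than 2 frames are merged
    max_speech_length=10000,  # longer runs would be split
    min_speech_length=10,     # shorter pending segments are dropped
)
mask = post.post_process(probs)  # binary list, same length as probs
print(sum(mask), "of", len(mask), "frames flagged as speech")

Note that the instance is stateful (ring buffer, trigger flag, running segment indices), so a fresh PostProcess should be created per utterance, as when_click_vad_button does.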