Spaces:
Sleeping
Sleeping
Upload applyVad.ipynb
Browse files- applyVad.ipynb +272 -0
applyVad.ipynb
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "cb0d4170-de67-444c-934c-98bfdad9ae97",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"# import webrtcvad\n",
|
11 |
+
"# import numpy as np\n",
|
12 |
+
"# import librosa\n",
|
13 |
+
"# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
|
14 |
+
"# '''\n",
|
15 |
+
"# Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech. \n",
|
16 |
+
"# This is useful in noisy environments where you want to filter out non-speech parts of the audio.\n",
|
17 |
+
"# webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project. \n",
|
18 |
+
"# It helps detect speech in small chunks of audio.\n",
|
19 |
+
"# '''\n",
|
20 |
+
"# vad = webrtcvad.Vad()\n",
|
21 |
+
"# audio_int16 = np.int16(audio * 32767)\n",
|
22 |
+
"# frame_size = int(sr * frame_duration / 1000)\n",
|
23 |
+
"# frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
|
24 |
+
"# voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])\n",
|
25 |
+
"# voiced_audio = np.float32(voiced_audio) / 32767\n",
|
26 |
+
"# return voiced_audio"
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"cell_type": "code",
|
31 |
+
"execution_count": 1,
|
32 |
+
"id": "bbf2e07e-1927-4abd-98a1-8abf8fc591b4",
|
33 |
+
"metadata": {},
|
34 |
+
"outputs": [],
|
35 |
+
"source": [
|
36 |
+
"# import webrtcvad\n",
|
37 |
+
"# import numpy as np\n",
|
38 |
+
"# import librosa\n",
|
39 |
+
"\n",
|
40 |
+
"# def apply_vad(audio, sr):\n",
|
41 |
+
"# # Ensure that sample rate is supported by webrtcvad\n",
|
42 |
+
"# if sr not in [8000, 16000, 32000, 48000]:\n",
|
43 |
+
"# raise ValueError(\"Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz\")\n",
|
44 |
+
"\n",
|
45 |
+
"# vad = webrtcvad.Vad(2) # Aggressiveness mode: 0-3\n",
|
46 |
+
"# frame_duration_ms = 30 # Use 10ms, 20ms, or 30ms frames only\n",
|
47 |
+
"\n",
|
48 |
+
"# # Convert to PCM 16-bit and calculate frame length\n",
|
49 |
+
"# audio_pcm16 = (audio * 32767).astype(np.int16)\n",
|
50 |
+
"# frame_length = int(sr * frame_duration_ms / 1000) * 2 # 2 bytes per sample for 16-bit PCM\n",
|
51 |
+
" \n",
|
52 |
+
"# # Create frames ensuring correct frame size\n",
|
53 |
+
"# frames = [\n",
|
54 |
+
"# audio_pcm16[i:i + frame_length].tobytes()\n",
|
55 |
+
"# for i in range(0, len(audio_pcm16) - frame_length, frame_length)\n",
|
56 |
+
"# ]\n",
|
57 |
+
"\n",
|
58 |
+
"# # Apply VAD\n",
|
59 |
+
"# voiced_frames = []\n",
|
60 |
+
"# for frame in frames:\n",
|
61 |
+
"# try:\n",
|
62 |
+
"# if vad.is_speech(frame, sample_rate=sr):\n",
|
63 |
+
"# voiced_frames.append(frame)\n",
|
64 |
+
"# except Exception as e:\n",
|
65 |
+
"# print(f\"Error during VAD frame processing: {e}\")\n",
|
66 |
+
"\n",
|
67 |
+
"# if not voiced_frames:\n",
|
68 |
+
"# raise Exception(\"No voiced frames detected.\")\n",
|
69 |
+
"\n",
|
70 |
+
"# # Concatenate voiced frames\n",
|
71 |
+
"# voiced_audio = b''.join(voiced_frames)\n",
|
72 |
+
"# return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0\n"
|
73 |
+
]
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"cell_type": "code",
|
77 |
+
"execution_count": null,
|
78 |
+
"id": "a6277ddc-4692-480f-a930-fc70b82f6852",
|
79 |
+
"metadata": {},
|
80 |
+
"outputs": [],
|
81 |
+
"source": [
|
82 |
+
"# import webrtcvad\n",
|
83 |
+
"# import numpy as np\n",
|
84 |
+
"# import librosa\n",
|
85 |
+
"\n",
|
86 |
+
"# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
|
87 |
+
"# '''\n",
|
88 |
+
"# Voice Activity Detection (VAD): Detects speech in audio.\n",
|
89 |
+
"# '''\n",
|
90 |
+
"# vad = webrtcvad.Vad(aggressiveness)\n",
|
91 |
+
" \n",
|
92 |
+
"# # Resample to 16000 Hz if not already (recommended for better compatibility)\n",
|
93 |
+
"# if sr != 16000:\n",
|
94 |
+
"# audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)\n",
|
95 |
+
"# sr = 16000\n",
|
96 |
+
" \n",
|
97 |
+
"# # Convert to 16-bit PCM format expected by webrtcvad\n",
|
98 |
+
"# audio_int16 = np.int16(audio * 32767)\n",
|
99 |
+
" \n",
|
100 |
+
"# # Ensure frame size matches WebRTC's expected lengths\n",
|
101 |
+
"# frame_size = int(sr * frame_duration / 1000)\n",
|
102 |
+
"# if frame_size % 2 != 0:\n",
|
103 |
+
"# frame_size -= 1 # Make sure it's even to avoid processing issues\n",
|
104 |
+
" \n",
|
105 |
+
"# frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
|
106 |
+
" \n",
|
107 |
+
"# # Filter out non-speech frames\n",
|
108 |
+
"# voiced_frames = []\n",
|
109 |
+
"# for frame in frames:\n",
|
110 |
+
"# if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):\n",
|
111 |
+
"# voiced_frames.append(frame)\n",
|
112 |
+
" \n",
|
113 |
+
"# # Concatenate the voiced frames\n",
|
114 |
+
"# voiced_audio = np.concatenate(voiced_frames)\n",
|
115 |
+
"# voiced_audio = np.float32(voiced_audio) / 32767\n",
|
116 |
+
" \n",
|
117 |
+
"# return voiced_audio\n"
|
118 |
+
]
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"cell_type": "code",
|
122 |
+
"execution_count": 3,
|
123 |
+
"id": "06df7fd9-7900-4cd9-9bec-7d2875d8946b",
|
124 |
+
"metadata": {},
|
125 |
+
"outputs": [],
|
126 |
+
"source": [
|
127 |
+
"# import webrtcvad\n",
|
128 |
+
"# import numpy as np\n",
|
129 |
+
"# import librosa\n",
|
130 |
+
"\n",
|
131 |
+
"# def frame_generator(frame_duration_ms, audio, sample_rate):\n",
|
132 |
+
"# \"\"\"\n",
|
133 |
+
"# Generates audio frames from PCM audio data.\n",
|
134 |
+
"# Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.\n",
|
135 |
+
"# \"\"\"\n",
|
136 |
+
"# n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) # Convert to byte length\n",
|
137 |
+
"# offset = 0\n",
|
138 |
+
"# while offset + n < len(audio):\n",
|
139 |
+
"# yield audio[offset:offset + n]\n",
|
140 |
+
"# offset += n\n",
|
141 |
+
"\n",
|
142 |
+
"# def apply_vad(audio, sample_rate):\n",
|
143 |
+
"# vad = webrtcvad.Vad()\n",
|
144 |
+
"# vad.set_mode(1)\n",
|
145 |
+
"# print(\"Applying VAD with mode:\", 1)\n",
|
146 |
+
"# print(\"Audio length:\", len(audio), \"bytes\")\n",
|
147 |
+
"# print(\"Sample rate:\", sample_rate)\n",
|
148 |
+
"\n",
|
149 |
+
"# # Ensure mono and correct sample rate\n",
|
150 |
+
"# if sample_rate != 16000:\n",
|
151 |
+
"# print(\"Sample rate issue detected.\")\n",
|
152 |
+
"# raise ValueError(\"Sample rate must be 16000 Hz\")\n",
|
153 |
+
"\n",
|
154 |
+
"# frames = frame_generator(30, audio, sample_rate)\n",
|
155 |
+
"# frames = list(frames)\n",
|
156 |
+
"\n",
|
157 |
+
"# print(\"Number of frames:\", len(frames))\n",
|
158 |
+
"# try:\n",
|
159 |
+
"# segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]\n",
|
160 |
+
"\n",
|
161 |
+
"# if not segments:\n",
|
162 |
+
"# raise Exception(\"No voiced frames detected.\")\n",
|
163 |
+
"\n",
|
164 |
+
"# return b''.join(segments)\n",
|
165 |
+
"\n",
|
166 |
+
"# except Exception as e:\n",
|
167 |
+
"# print(f\"Error during VAD frame processing: {e}\")\n",
|
168 |
+
"# raise\n"
|
169 |
+
]
|
170 |
+
},
|
171 |
+
{
|
172 |
+
"cell_type": "code",
|
173 |
+
"execution_count": 5,
|
174 |
+
"id": "cc8e709e-2798-40c4-9ad0-03b6ef43ff5a",
|
175 |
+
"metadata": {},
|
176 |
+
"outputs": [
|
177 |
+
{
|
178 |
+
"name": "stderr",
|
179 |
+
"output_type": "stream",
|
180 |
+
"text": [
|
181 |
+
"Downloading: \"https://github.com/snakers4/silero-vad/zipball/master\" to C:\\Users\\WCHL/.cache\\torch\\hub\\master.zip\n"
|
182 |
+
]
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"name": "stdout",
|
186 |
+
"output_type": "stream",
|
187 |
+
"text": [
|
188 |
+
"Error during Silero VAD processing: Failed to open the input \"path_to_your_audio.wav\" (No such file or directory).\n"
|
189 |
+
]
|
190 |
+
}
|
191 |
+
],
|
192 |
+
"source": [
|
193 |
+
"import torch\n",
|
194 |
+
"import torchaudio\n",
|
195 |
+
"from silero_vad import get_speech_timestamps, read_audio, save_audio\n",
|
196 |
+
"\n",
|
197 |
def apply_silero_vad(audio_file_path, output_path='processed_voiced_audio.wav'):
    """
    Apply Silero VAD to an audio file and return only the voiced audio.

    The file is loaded with torchaudio, down-mixed to a single channel,
    resampled to 16 kHz when needed, and every speech segment reported by
    ``get_speech_timestamps`` is concatenated in order.

    Args:
        audio_file_path: Path of the audio file to load.
        output_path: Destination for the voiced-only audio, written at
            16 kHz. Defaults to 'processed_voiced_audio.wav'; pass ``None``
            to skip writing a file.

    Returns:
        bytes: Raw buffer of the concatenated voiced samples, i.e.
        ``voiced_audio.numpy().tobytes()``.
        NOTE(review): presumably float32 samples, since torchaudio.load
        normalizes by default -- confirm before decoding downstream.

    Raises:
        ValueError: If no speech segment is detected.
    """
    sampling_rate = 16000

    # The 'silero_vad' hub entry point returns (model, utils); only the model
    # is needed because get_speech_timestamps is imported at module level.
    # No force_reload: that would re-download the repository on every call.
    loaded = torch.hub.load('snakers4/silero-vad', 'silero_vad')
    model = loaded[0] if isinstance(loaded, (tuple, list)) else loaded

    # torchaudio.load yields (channels, samples); average the channels so the
    # VAD always receives a 1-D signal, including for stereo input.
    wav, sr = torchaudio.load(audio_file_path)
    if wav.dim() > 1:
        wav = wav.mean(dim=0)
    if sr != sampling_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)
        wav = resampler(wav)

    # Each timestamp is a dict with 'start' and 'end' sample indices.
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)
    if not speech_timestamps:
        raise ValueError("No voiced frames detected using Silero VAD.")

    voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])

    # Writing the file is optional; torchaudio.save expects (channels, samples).
    if output_path is not None:
        torchaudio.save(output_path, voiced_audio.unsqueeze(0), sampling_rate)

    # Raw sample buffer for further processing.
    return voiced_audio.numpy().tobytes()
|
233 |
+
"\n",
|
234 |
# Demo run: feed a placeholder path through the VAD and report the outcome.
try:
    processed_audio = apply_silero_vad("path_to_your_audio.wav")
except Exception as err:
    # Any failure (missing file, no speech found, ...) is reported, not raised.
    print(f"Error during Silero VAD processing: {err}")
else:
    print("VAD completed successfully!")
|
240 |
+
]
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"cell_type": "code",
|
244 |
+
"execution_count": null,
|
245 |
+
"id": "7d4ea1a1-9f19-4603-b15f-b1389fddc81b",
|
246 |
+
"metadata": {},
|
247 |
+
"outputs": [],
|
248 |
+
"source": []
|
249 |
+
}
|
250 |
+
],
|
251 |
+
"metadata": {
|
252 |
+
"kernelspec": {
|
253 |
+
"display_name": "Python 3 (ipykernel)",
|
254 |
+
"language": "python",
|
255 |
+
"name": "python3"
|
256 |
+
},
|
257 |
+
"language_info": {
|
258 |
+
"codemirror_mode": {
|
259 |
+
"name": "ipython",
|
260 |
+
"version": 3
|
261 |
+
},
|
262 |
+
"file_extension": ".py",
|
263 |
+
"mimetype": "text/x-python",
|
264 |
+
"name": "python",
|
265 |
+
"nbconvert_exporter": "python",
|
266 |
+
"pygments_lexer": "ipython3",
|
267 |
+
"version": "3.11.7"
|
268 |
+
}
|
269 |
+
},
|
270 |
+
"nbformat": 4,
|
271 |
+
"nbformat_minor": 5
|
272 |
+
}
|