Spaces:
Sleeping
Sleeping
Delete applyVad.ipynb
Browse files- applyVad.ipynb +0 -272
applyVad.ipynb
DELETED
@@ -1,272 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "code",
|
5 |
-
"execution_count": null,
|
6 |
-
"id": "cb0d4170-de67-444c-934c-98bfdad9ae97",
|
7 |
-
"metadata": {},
|
8 |
-
"outputs": [],
|
9 |
-
"source": [
|
10 |
-
"# import webrtcvad\n",
|
11 |
-
"# import numpy as np\n",
|
12 |
-
"# import librosa\n",
|
13 |
-
"# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
|
14 |
-
"# '''\n",
|
15 |
-
"# Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech. \n",
|
16 |
-
"# This is useful in noisy environments where you want to filter out non-speech parts of the audio.\n",
|
17 |
-
"# webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project. \n",
|
18 |
-
"# It helps detect speech in small chunks of audio.\n",
|
19 |
-
"# '''\n",
|
20 |
-
"# vad = webrtcvad.Vad()\n",
|
21 |
-
"# audio_int16 = np.int16(audio * 32767)\n",
|
22 |
-
"# frame_size = int(sr * frame_duration / 1000)\n",
|
23 |
-
"# frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
|
24 |
-
"# voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])\n",
|
25 |
-
"# voiced_audio = np.float32(voiced_audio) / 32767\n",
|
26 |
-
"# return voiced_audio"
|
27 |
-
]
|
28 |
-
},
|
29 |
-
{
|
30 |
-
"cell_type": "code",
|
31 |
-
"execution_count": 1,
|
32 |
-
"id": "bbf2e07e-1927-4abd-98a1-8abf8fc591b4",
|
33 |
-
"metadata": {},
|
34 |
-
"outputs": [],
|
35 |
-
"source": [
|
36 |
-
"# import webrtcvad\n",
|
37 |
-
"# import numpy as np\n",
|
38 |
-
"# import librosa\n",
|
39 |
-
"\n",
|
40 |
-
"# def apply_vad(audio, sr):\n",
|
41 |
-
"# # Ensure that sample rate is supported by webrtcvad\n",
|
42 |
-
"# if sr not in [8000, 16000, 32000, 48000]:\n",
|
43 |
-
"# raise ValueError(\"Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz\")\n",
|
44 |
-
"\n",
|
45 |
-
"# vad = webrtcvad.Vad(2) # Aggressiveness mode: 0-3\n",
|
46 |
-
"# frame_duration_ms = 30 # Use 10ms, 20ms, or 30ms frames only\n",
|
47 |
-
"\n",
|
48 |
-
"# # Convert to PCM 16-bit and calculate frame length\n",
|
49 |
-
"# audio_pcm16 = (audio * 32767).astype(np.int16)\n",
|
50 |
-
"# frame_length = int(sr * frame_duration_ms / 1000) * 2 # 2 bytes per sample for 16-bit PCM\n",
|
51 |
-
" \n",
|
52 |
-
"# # Create frames ensuring correct frame size\n",
|
53 |
-
"# frames = [\n",
|
54 |
-
"# audio_pcm16[i:i + frame_length].tobytes()\n",
|
55 |
-
"# for i in range(0, len(audio_pcm16) - frame_length, frame_length)\n",
|
56 |
-
"# ]\n",
|
57 |
-
"\n",
|
58 |
-
"# # Apply VAD\n",
|
59 |
-
"# voiced_frames = []\n",
|
60 |
-
"# for frame in frames:\n",
|
61 |
-
"# try:\n",
|
62 |
-
"# if vad.is_speech(frame, sample_rate=sr):\n",
|
63 |
-
"# voiced_frames.append(frame)\n",
|
64 |
-
"# except Exception as e:\n",
|
65 |
-
"# print(f\"Error during VAD frame processing: {e}\")\n",
|
66 |
-
"\n",
|
67 |
-
"# if not voiced_frames:\n",
|
68 |
-
"# raise Exception(\"No voiced frames detected.\")\n",
|
69 |
-
"\n",
|
70 |
-
"# # Concatenate voiced frames\n",
|
71 |
-
"# voiced_audio = b''.join(voiced_frames)\n",
|
72 |
-
"# return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0\n"
|
73 |
-
]
|
74 |
-
},
|
75 |
-
{
|
76 |
-
"cell_type": "code",
|
77 |
-
"execution_count": null,
|
78 |
-
"id": "a6277ddc-4692-480f-a930-fc70b82f6852",
|
79 |
-
"metadata": {},
|
80 |
-
"outputs": [],
|
81 |
-
"source": [
|
82 |
-
"# import webrtcvad\n",
|
83 |
-
"# import numpy as np\n",
|
84 |
-
"# import librosa\n",
|
85 |
-
"\n",
|
86 |
-
"# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
|
87 |
-
"# '''\n",
|
88 |
-
"# Voice Activity Detection (VAD): Detects speech in audio.\n",
|
89 |
-
"# '''\n",
|
90 |
-
"# vad = webrtcvad.Vad(aggressiveness)\n",
|
91 |
-
" \n",
|
92 |
-
"# # Resample to 16000 Hz if not already (recommended for better compatibility)\n",
|
93 |
-
"# if sr != 16000:\n",
|
94 |
-
"# audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)\n",
|
95 |
-
"# sr = 16000\n",
|
96 |
-
" \n",
|
97 |
-
"# # Convert to 16-bit PCM format expected by webrtcvad\n",
|
98 |
-
"# audio_int16 = np.int16(audio * 32767)\n",
|
99 |
-
" \n",
|
100 |
-
"# # Ensure frame size matches WebRTC's expected lengths\n",
|
101 |
-
"# frame_size = int(sr * frame_duration / 1000)\n",
|
102 |
-
"# if frame_size % 2 != 0:\n",
|
103 |
-
"# frame_size -= 1 # Make sure it's even to avoid processing issues\n",
|
104 |
-
" \n",
|
105 |
-
"# frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
|
106 |
-
" \n",
|
107 |
-
"# # Filter out non-speech frames\n",
|
108 |
-
"# voiced_frames = []\n",
|
109 |
-
"# for frame in frames:\n",
|
110 |
-
"# if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):\n",
|
111 |
-
"# voiced_frames.append(frame)\n",
|
112 |
-
" \n",
|
113 |
-
"# # Concatenate the voiced frames\n",
|
114 |
-
"# voiced_audio = np.concatenate(voiced_frames)\n",
|
115 |
-
"# voiced_audio = np.float32(voiced_audio) / 32767\n",
|
116 |
-
" \n",
|
117 |
-
"# return voiced_audio\n"
|
118 |
-
]
|
119 |
-
},
|
120 |
-
{
|
121 |
-
"cell_type": "code",
|
122 |
-
"execution_count": 3,
|
123 |
-
"id": "06df7fd9-7900-4cd9-9bec-7d2875d8946b",
|
124 |
-
"metadata": {},
|
125 |
-
"outputs": [],
|
126 |
-
"source": [
|
127 |
-
"# import webrtcvad\n",
|
128 |
-
"# import numpy as np\n",
|
129 |
-
"# import librosa\n",
|
130 |
-
"\n",
|
131 |
-
"# def frame_generator(frame_duration_ms, audio, sample_rate):\n",
|
132 |
-
"# \"\"\"\n",
|
133 |
-
"# Generates audio frames from PCM audio data.\n",
|
134 |
-
"# Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.\n",
|
135 |
-
"# \"\"\"\n",
|
136 |
-
"# n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) # Convert to byte length\n",
|
137 |
-
"# offset = 0\n",
|
138 |
-
"# while offset + n < len(audio):\n",
|
139 |
-
"# yield audio[offset:offset + n]\n",
|
140 |
-
"# offset += n\n",
|
141 |
-
"\n",
|
142 |
-
"# def apply_vad(audio, sample_rate):\n",
|
143 |
-
"# vad = webrtcvad.Vad()\n",
|
144 |
-
"# vad.set_mode(1)\n",
|
145 |
-
"# print(\"Applying VAD with mode:\", 1)\n",
|
146 |
-
"# print(\"Audio length:\", len(audio), \"bytes\")\n",
|
147 |
-
"# print(\"Sample rate:\", sample_rate)\n",
|
148 |
-
"\n",
|
149 |
-
"# # Ensure mono and correct sample rate\n",
|
150 |
-
"# if sample_rate != 16000:\n",
|
151 |
-
"# print(\"Sample rate issue detected.\")\n",
|
152 |
-
"# raise ValueError(\"Sample rate must be 16000 Hz\")\n",
|
153 |
-
"\n",
|
154 |
-
"# frames = frame_generator(30, audio, sample_rate)\n",
|
155 |
-
"# frames = list(frames)\n",
|
156 |
-
"\n",
|
157 |
-
"# print(\"Number of frames:\", len(frames))\n",
|
158 |
-
"# try:\n",
|
159 |
-
"# segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]\n",
|
160 |
-
"\n",
|
161 |
-
"# if not segments:\n",
|
162 |
-
"# raise Exception(\"No voiced frames detected.\")\n",
|
163 |
-
"\n",
|
164 |
-
"# return b''.join(segments)\n",
|
165 |
-
"\n",
|
166 |
-
"# except Exception as e:\n",
|
167 |
-
"# print(f\"Error during VAD frame processing: {e}\")\n",
|
168 |
-
"# raise\n"
|
169 |
-
]
|
170 |
-
},
|
171 |
-
{
|
172 |
-
"cell_type": "code",
|
173 |
-
"execution_count": 5,
|
174 |
-
"id": "cc8e709e-2798-40c4-9ad0-03b6ef43ff5a",
|
175 |
-
"metadata": {},
|
176 |
-
"outputs": [
|
177 |
-
{
|
178 |
-
"name": "stderr",
|
179 |
-
"output_type": "stream",
|
180 |
-
"text": [
|
181 |
-
"Downloading: \"https://github.com/snakers4/silero-vad/zipball/master\" to C:\\Users\\WCHL/.cache\\torch\\hub\\master.zip\n"
|
182 |
-
]
|
183 |
-
},
|
184 |
-
{
|
185 |
-
"name": "stdout",
|
186 |
-
"output_type": "stream",
|
187 |
-
"text": [
|
188 |
-
"Error during Silero VAD processing: Failed to open the input \"path_to_your_audio.wav\" (No such file or directory).\n"
|
189 |
-
]
|
190 |
-
}
|
191 |
-
],
|
192 |
-
"source": [
|
193 |
-
"import torch\n",
|
194 |
-
"import torchaudio\n",
|
195 |
-
"from silero_vad import get_speech_timestamps, read_audio, save_audio\n",
|
196 |
-
"\n",
|
197 |
-
"def apply_silero_vad(audio_file_path):\n",
|
198 |
-
" \"\"\"\n",
|
199 |
-
" Applies Silero VAD to an audio file and returns the processed audio\n",
|
200 |
-
" containing only the voiced segments.\n",
|
201 |
-
" \"\"\"\n",
|
202 |
-
" # Load the Silero VAD model\n",
|
203 |
-
" model = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)\n",
|
204 |
-
" \n",
|
205 |
-
" # Define helper utilities manually\n",
|
206 |
-
" def read_audio(path, sampling_rate=16000):\n",
|
207 |
-
" wav, sr = torchaudio.load(path)\n",
|
208 |
-
" if sr != sampling_rate:\n",
|
209 |
-
" wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)\n",
|
210 |
-
" return wav.squeeze(0)\n",
|
211 |
-
"\n",
|
212 |
-
" def save_audio(path, tensor, sampling_rate=16000):\n",
|
213 |
-
" torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)\n",
|
214 |
-
"\n",
|
215 |
-
" # Read the audio file\n",
|
216 |
-
" wav = read_audio(audio_file_path, sampling_rate=16000)\n",
|
217 |
-
"\n",
|
218 |
-
" # Get timestamps for speech segments\n",
|
219 |
-
" speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)\n",
|
220 |
-
"\n",
|
221 |
-
" # If no speech detected, raise an exception\n",
|
222 |
-
" if not speech_timestamps:\n",
|
223 |
-
" raise Exception(\"No voiced frames detected using Silero VAD.\")\n",
|
224 |
-
"\n",
|
225 |
-
" # Combine the voiced segments\n",
|
226 |
-
" voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])\n",
|
227 |
-
"\n",
|
228 |
-
" # Save the processed audio if needed\n",
|
229 |
-
" save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)\n",
|
230 |
-
"\n",
|
231 |
-
" # Convert to numpy bytes for further processing\n",
|
232 |
-
" return voiced_audio.numpy().tobytes()\n",
|
233 |
-
"\n",
|
234 |
-
"# Example usage\n",
|
235 |
-
"try:\n",
|
236 |
-
" processed_audio = apply_silero_vad(\"path_to_your_audio.wav\")\n",
|
237 |
-
" print(\"VAD completed successfully!\")\n",
|
238 |
-
"except Exception as e:\n",
|
239 |
-
" print(f\"Error during Silero VAD processing: {e}\")\n"
|
240 |
-
]
|
241 |
-
},
|
242 |
-
{
|
243 |
-
"cell_type": "code",
|
244 |
-
"execution_count": null,
|
245 |
-
"id": "7d4ea1a1-9f19-4603-b15f-b1389fddc81b",
|
246 |
-
"metadata": {},
|
247 |
-
"outputs": [],
|
248 |
-
"source": []
|
249 |
-
}
|
250 |
-
],
|
251 |
-
"metadata": {
|
252 |
-
"kernelspec": {
|
253 |
-
"display_name": "Python 3 (ipykernel)",
|
254 |
-
"language": "python",
|
255 |
-
"name": "python3"
|
256 |
-
},
|
257 |
-
"language_info": {
|
258 |
-
"codemirror_mode": {
|
259 |
-
"name": "ipython",
|
260 |
-
"version": 3
|
261 |
-
},
|
262 |
-
"file_extension": ".py",
|
263 |
-
"mimetype": "text/x-python",
|
264 |
-
"name": "python",
|
265 |
-
"nbconvert_exporter": "python",
|
266 |
-
"pygments_lexer": "ipython3",
|
267 |
-
"version": "3.11.7"
|
268 |
-
}
|
269 |
-
},
|
270 |
-
"nbformat": 4,
|
271 |
-
"nbformat_minor": 5
|
272 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|