cdactvm committed
Commit 6c98b22 · verified · 1 Parent(s): 44cdd99

Upload applyVad.ipynb

Files changed (1)
  1. applyVad.ipynb +272 -0
applyVad.ipynb ADDED
@@ -0,0 +1,272 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cb0d4170-de67-444c-934c-98bfdad9ae97",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import webrtcvad\n",
+ "# import numpy as np\n",
+ "# import librosa\n",
+ "# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
+ "# '''\n",
+ "# Voice Activity Detection (VAD): a technique used to determine whether a segment of audio contains speech.\n",
+ "# This is useful in noisy environments where you want to filter out the non-speech parts of the audio.\n",
+ "# webrtcvad: a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project.\n",
+ "# It detects speech in short, fixed-size chunks of audio.\n",
+ "# '''\n",
+ "# vad = webrtcvad.Vad(aggressiveness) # honour the requested aggressiveness (0-3)\n",
+ "# audio_int16 = np.int16(audio * 32767)\n",
+ "# frame_size = int(sr * frame_duration / 1000)\n",
+ "# frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
+ "# voiced_audio = np.concatenate([frame for frame in frames if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr)])\n",
+ "# voiced_audio = np.float32(voiced_audio) / 32767\n",
+ "# return voiced_audio"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "bbf2e07e-1927-4abd-98a1-8abf8fc591b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import webrtcvad\n",
+ "# import numpy as np\n",
+ "# import librosa\n",
+ "\n",
+ "# def apply_vad(audio, sr):\n",
+ "# # Ensure that the sample rate is supported by webrtcvad\n",
+ "# if sr not in [8000, 16000, 32000, 48000]:\n",
+ "# raise ValueError(\"Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz\")\n",
+ "\n",
+ "# vad = webrtcvad.Vad(2) # Aggressiveness mode: 0-3\n",
+ "# frame_duration_ms = 30 # Use 10 ms, 20 ms, or 30 ms frames only\n",
+ "\n",
+ "# # Convert to 16-bit PCM and compute the frame length in samples\n",
+ "# audio_pcm16 = (audio * 32767).astype(np.int16)\n",
+ "# frame_length = int(sr * frame_duration_ms / 1000) # slice in samples, not bytes; tobytes() already yields 2 bytes per int16 sample\n",
+ " \n",
+ "# # Create frames, dropping the ragged final chunk\n",
+ "# frames = [\n",
+ "# audio_pcm16[i:i + frame_length].tobytes()\n",
+ "# for i in range(0, len(audio_pcm16) - frame_length, frame_length)\n",
+ "# ]\n",
+ "\n",
+ "# # Apply VAD\n",
+ "# voiced_frames = []\n",
+ "# for frame in frames:\n",
+ "# try:\n",
+ "# if vad.is_speech(frame, sample_rate=sr):\n",
+ "# voiced_frames.append(frame)\n",
+ "# except Exception as e:\n",
+ "# print(f\"Error during VAD frame processing: {e}\")\n",
+ "\n",
+ "# if not voiced_frames:\n",
+ "# raise Exception(\"No voiced frames detected.\")\n",
+ "\n",
+ "# # Concatenate voiced frames\n",
+ "# voiced_audio = b''.join(voiced_frames)\n",
+ "# return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a6277ddc-4692-480f-a930-fc70b82f6852",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import webrtcvad\n",
+ "# import numpy as np\n",
+ "# import librosa\n",
+ "\n",
+ "# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
+ "# '''\n",
+ "# Voice Activity Detection (VAD): Detects speech in audio.\n",
+ "# '''\n",
+ "# vad = webrtcvad.Vad(aggressiveness)\n",
+ " \n",
+ "# # Resample to 16000 Hz if not already (recommended for better compatibility)\n",
+ "# if sr != 16000:\n",
+ "# audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)\n",
+ "# sr = 16000\n",
+ " \n",
+ "# # Convert to 16-bit PCM format expected by webrtcvad\n",
+ "# audio_int16 = np.int16(audio * 32767)\n",
+ " \n",
+ "# # Ensure frame size matches WebRTC's expected lengths\n",
+ "# frame_size = int(sr * frame_duration / 1000)\n",
+ "# if frame_size % 2 != 0:\n",
+ "# frame_size -= 1 # Make sure it's even to avoid processing issues\n",
+ " \n",
+ "# frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
+ " \n",
+ "# # Filter out non-speech frames\n",
+ "# voiced_frames = []\n",
+ "# for frame in frames:\n",
+ "# if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):\n",
+ "# voiced_frames.append(frame)\n",
+ " \n",
+ "# # Concatenate the voiced frames\n",
+ "# voiced_audio = np.concatenate(voiced_frames)\n",
+ "# voiced_audio = np.float32(voiced_audio) / 32767\n",
+ " \n",
+ "# return voiced_audio\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "06df7fd9-7900-4cd9-9bec-7d2875d8946b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import webrtcvad\n",
+ "# import numpy as np\n",
+ "# import librosa\n",
+ "\n",
+ "# def frame_generator(frame_duration_ms, audio, sample_rate):\n",
+ "# \"\"\"\n",
+ "# Generates audio frames from PCM audio data.\n",
+ "# Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.\n",
+ "# \"\"\"\n",
+ "# n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) # Convert to byte length\n",
+ "# offset = 0\n",
+ "# while offset + n < len(audio):\n",
+ "# yield audio[offset:offset + n]\n",
+ "# offset += n\n",
+ "\n",
+ "# def apply_vad(audio, sample_rate):\n",
+ "# vad = webrtcvad.Vad()\n",
+ "# vad.set_mode(1)\n",
+ "# print(\"Applying VAD with mode:\", 1)\n",
+ "# print(\"Audio length:\", len(audio), \"bytes\")\n",
+ "# print(\"Sample rate:\", sample_rate)\n",
+ "\n",
+ "# # Ensure mono and correct sample rate\n",
+ "# if sample_rate != 16000:\n",
+ "# print(\"Sample rate issue detected.\")\n",
+ "# raise ValueError(\"Sample rate must be 16000 Hz\")\n",
+ "\n",
+ "# frames = frame_generator(30, audio, sample_rate)\n",
+ "# frames = list(frames)\n",
+ "\n",
+ "# print(\"Number of frames:\", len(frames))\n",
+ "# try:\n",
+ "# segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]\n",
+ "\n",
+ "# if not segments:\n",
+ "# raise Exception(\"No voiced frames detected.\")\n",
+ "\n",
+ "# return b''.join(segments)\n",
+ "\n",
+ "# except Exception as e:\n",
+ "# print(f\"Error during VAD frame processing: {e}\")\n",
+ "# raise\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "cc8e709e-2798-40c4-9ad0-03b6ef43ff5a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Downloading: \"https://github.com/snakers4/silero-vad/zipball/master\" to C:\\Users\\WCHL/.cache\\torch\\hub\\master.zip\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Error during Silero VAD processing: Failed to open the input \"path_to_your_audio.wav\" (No such file or directory).\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "import torchaudio\n",
+ "from silero_vad import get_speech_timestamps, read_audio, save_audio\n",
+ "\n",
+ "def apply_silero_vad(audio_file_path):\n",
+ " \"\"\"\n",
+ " Applies Silero VAD to an audio file and returns the processed audio\n",
+ " containing only the voiced segments.\n",
+ " \"\"\"\n",
+ " # Load the Silero VAD model; torch.hub returns a (model, utils) pair\n",
+ " model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)\n",
+ " \n",
+ " # Define helper utilities manually (these shadow the silero_vad imports above)\n",
+ " def read_audio(path, sampling_rate=16000):\n",
+ " wav, sr = torchaudio.load(path)\n",
+ " if sr != sampling_rate:\n",
+ " wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)\n",
+ " return wav.squeeze(0)\n",
+ "\n",
+ " def save_audio(path, tensor, sampling_rate=16000):\n",
+ " torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)\n",
+ "\n",
+ " # Read the audio file\n",
+ " wav = read_audio(audio_file_path, sampling_rate=16000)\n",
+ "\n",
+ " # Get timestamps for speech segments\n",
+ " speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)\n",
+ "\n",
+ " # If no speech is detected, raise an exception\n",
+ " if not speech_timestamps:\n",
+ " raise Exception(\"No voiced frames detected using Silero VAD.\")\n",
+ "\n",
+ " # Combine the voiced segments\n",
+ " voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])\n",
+ "\n",
+ " # Save the processed audio if needed\n",
+ " save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)\n",
+ "\n",
+ " # Convert to numpy bytes for further processing\n",
+ " return voiced_audio.numpy().tobytes()\n",
+ "\n",
+ "# Example usage\n",
+ "try:\n",
+ " processed_audio = apply_silero_vad(\"path_to_your_audio.wav\")\n",
+ " print(\"VAD completed successfully!\")\n",
+ "except Exception as e:\n",
+ " print(f\"Error during Silero VAD processing: {e}\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7d4ea1a1-9f19-4603-b15f-b1389fddc81b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
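
For reference, below is a minimal driver for the webrtcvad-based apply_vad kept (commented out) in the earlier cells, which the notebook never exercises end to end. This is a sketch under assumptions, not part of the commit: it assumes one of the apply_vad definitions above has been uncommented, that the webrtcvad, librosa, and soundfile packages are installed, and that example_speech.wav is a placeholder path.

import librosa
import soundfile as sf

# librosa.load returns float32 samples in [-1, 1]; forcing a
# webrtcvad-supported rate (8/16/32/48 kHz) and mono keeps the frame
# math inside apply_vad valid.
audio, sr = librosa.load("example_speech.wav", sr=16000, mono=True)  # placeholder path

# Keep only the 30 ms frames that webrtcvad classifies as speech.
voiced = apply_vad(audio, sr, frame_duration=30, aggressiveness=3)

# Write the speech-only signal out for a listening check.
sf.write("voiced_only.wav", voiced, sr)

The final executed cell switches to Silero VAD, a neural detector that is generally more robust in noise than webrtcvad's lighter-weight classifier, which is presumably why the notebook converges on it.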