cdactvm committed on
Commit 9667366 · verified · 1 Parent(s): 8b8c613

Delete applyVad.ipynb

Files changed (1)
applyVad.ipynb +0 -272
applyVad.ipynb DELETED
@@ -1,272 +0,0 @@
- {
-  "cells": [
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "cb0d4170-de67-444c-934c-98bfdad9ae97",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "# import webrtcvad\n",
-     "# import numpy as np\n",
-     "# import librosa\n",
-     "# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
-     "#     '''\n",
-     "#     Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech. \n",
-     "#     This is useful in noisy environments where you want to filter out non-speech parts of the audio.\n",
-     "#     webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project. \n",
-     "#     It helps detect speech in small chunks of audio.\n",
-     "#     '''\n",
-     "#     vad = webrtcvad.Vad()\n",
-     "#     audio_int16 = np.int16(audio * 32767)\n",
-     "#     frame_size = int(sr * frame_duration / 1000)\n",
-     "#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
-     "#     voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])\n",
-     "#     voiced_audio = np.float32(voiced_audio) / 32767\n",
-     "#     return voiced_audio"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 1,
-    "id": "bbf2e07e-1927-4abd-98a1-8abf8fc591b4",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "# import webrtcvad\n",
-     "# import numpy as np\n",
-     "# import librosa\n",
-     "\n",
-     "# def apply_vad(audio, sr):\n",
-     "#     # Ensure that sample rate is supported by webrtcvad\n",
-     "#     if sr not in [8000, 16000, 32000, 48000]:\n",
-     "#         raise ValueError(\"Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz\")\n",
-     "\n",
-     "#     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3\n",
-     "#     frame_duration_ms = 30  # Use 10ms, 20ms, or 30ms frames only\n",
-     "\n",
-     "#     # Convert to PCM 16-bit and calculate frame length\n",
-     "#     audio_pcm16 = (audio * 32767).astype(np.int16)\n",
-     "#     frame_length = int(sr * frame_duration_ms / 1000) * 2  # 2 bytes per sample for 16-bit PCM\n",
-     "    \n",
-     "#     # Create frames ensuring correct frame size\n",
-     "#     frames = [\n",
-     "#         audio_pcm16[i:i + frame_length].tobytes()\n",
-     "#         for i in range(0, len(audio_pcm16) - frame_length, frame_length)\n",
-     "#     ]\n",
-     "\n",
-     "#     # Apply VAD\n",
-     "#     voiced_frames = []\n",
-     "#     for frame in frames:\n",
-     "#         try:\n",
-     "#             if vad.is_speech(frame, sample_rate=sr):\n",
-     "#                 voiced_frames.append(frame)\n",
-     "#         except Exception as e:\n",
-     "#             print(f\"Error during VAD frame processing: {e}\")\n",
-     "\n",
-     "#     if not voiced_frames:\n",
-     "#         raise Exception(\"No voiced frames detected.\")\n",
-     "\n",
-     "#     # Concatenate voiced frames\n",
-     "#     voiced_audio = b''.join(voiced_frames)\n",
-     "#     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "a6277ddc-4692-480f-a930-fc70b82f6852",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "# import webrtcvad\n",
-     "# import numpy as np\n",
-     "# import librosa\n",
-     "\n",
-     "# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
-     "#     '''\n",
-     "#     Voice Activity Detection (VAD): Detects speech in audio.\n",
-     "#     '''\n",
-     "#     vad = webrtcvad.Vad(aggressiveness)\n",
-     "    \n",
-     "#     # Resample to 16000 Hz if not already (recommended for better compatibility)\n",
-     "#     if sr != 16000:\n",
-     "#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)\n",
-     "#         sr = 16000\n",
-     "    \n",
-     "#     # Convert to 16-bit PCM format expected by webrtcvad\n",
-     "#     audio_int16 = np.int16(audio * 32767)\n",
-     "    \n",
-     "#     # Ensure frame size matches WebRTC's expected lengths\n",
-     "#     frame_size = int(sr * frame_duration / 1000)\n",
-     "#     if frame_size % 2 != 0:\n",
-     "#         frame_size -= 1  # Make sure it's even to avoid processing issues\n",
-     "    \n",
-     "#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
-     "    \n",
-     "#     # Filter out non-speech frames\n",
-     "#     voiced_frames = []\n",
-     "#     for frame in frames:\n",
-     "#         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):\n",
-     "#             voiced_frames.append(frame)\n",
-     "    \n",
-     "#     # Concatenate the voiced frames\n",
-     "#     voiced_audio = np.concatenate(voiced_frames)\n",
-     "#     voiced_audio = np.float32(voiced_audio) / 32767\n",
-     "    \n",
-     "#     return voiced_audio\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 3,
-    "id": "06df7fd9-7900-4cd9-9bec-7d2875d8946b",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "# import webrtcvad\n",
-     "# import numpy as np\n",
-     "# import librosa\n",
-     "\n",
-     "# def frame_generator(frame_duration_ms, audio, sample_rate):\n",
-     "#     \"\"\"\n",
-     "#     Generates audio frames from PCM audio data.\n",
-     "#     Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.\n",
-     "#     \"\"\"\n",
-     "#     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # Convert to byte length\n",
-     "#     offset = 0\n",
-     "#     while offset + n < len(audio):\n",
-     "#         yield audio[offset:offset + n]\n",
-     "#         offset += n\n",
-     "\n",
-     "# def apply_vad(audio, sample_rate):\n",
-     "#     vad = webrtcvad.Vad()\n",
-     "#     vad.set_mode(1)\n",
-     "#     print(\"Applying VAD with mode:\", 1)\n",
-     "#     print(\"Audio length:\", len(audio), \"bytes\")\n",
-     "#     print(\"Sample rate:\", sample_rate)\n",
-     "\n",
-     "#     # Ensure mono and correct sample rate\n",
-     "#     if sample_rate != 16000:\n",
-     "#         print(\"Sample rate issue detected.\")\n",
-     "#         raise ValueError(\"Sample rate must be 16000 Hz\")\n",
-     "\n",
-     "#     frames = frame_generator(30, audio, sample_rate)\n",
-     "#     frames = list(frames)\n",
-     "\n",
-     "#     print(\"Number of frames:\", len(frames))\n",
-     "#     try:\n",
-     "#         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]\n",
-     "\n",
-     "#         if not segments:\n",
-     "#             raise Exception(\"No voiced frames detected.\")\n",
-     "\n",
-     "#         return b''.join(segments)\n",
-     "\n",
-     "#     except Exception as e:\n",
-     "#         print(f\"Error during VAD frame processing: {e}\")\n",
-     "#         raise\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 5,
-    "id": "cc8e709e-2798-40c4-9ad0-03b6ef43ff5a",
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stderr",
-      "output_type": "stream",
-      "text": [
-       "Downloading: \"https://github.com/snakers4/silero-vad/zipball/master\" to C:\\Users\\WCHL/.cache\\torch\\hub\\master.zip\n"
-      ]
-     },
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "Error during Silero VAD processing: Failed to open the input \"path_to_your_audio.wav\" (No such file or directory).\n"
-      ]
-     }
-    ],
-    "source": [
-     "import torch\n",
-     "import torchaudio\n",
-     "from silero_vad import get_speech_timestamps, read_audio, save_audio\n",
-     "\n",
-     "def apply_silero_vad(audio_file_path):\n",
-     "    \"\"\"\n",
-     "    Applies Silero VAD to an audio file and returns the processed audio\n",
-     "    containing only the voiced segments.\n",
-     "    \"\"\"\n",
-     "    # Load the Silero VAD model\n",
-     "    model = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)\n",
-     "    \n",
-     "    # Define helper utilities manually\n",
-     "    def read_audio(path, sampling_rate=16000):\n",
-     "        wav, sr = torchaudio.load(path)\n",
-     "        if sr != sampling_rate:\n",
-     "            wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)\n",
-     "        return wav.squeeze(0)\n",
-     "\n",
-     "    def save_audio(path, tensor, sampling_rate=16000):\n",
-     "        torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)\n",
-     "\n",
-     "    # Read the audio file\n",
-     "    wav = read_audio(audio_file_path, sampling_rate=16000)\n",
-     "\n",
-     "    # Get timestamps for speech segments\n",
-     "    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)\n",
-     "\n",
-     "    # If no speech detected, raise an exception\n",
-     "    if not speech_timestamps:\n",
-     "        raise Exception(\"No voiced frames detected using Silero VAD.\")\n",
-     "\n",
-     "    # Combine the voiced segments\n",
-     "    voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])\n",
-     "\n",
-     "    # Save the processed audio if needed\n",
-     "    save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)\n",
-     "\n",
-     "    # Convert to numpy bytes for further processing\n",
-     "    return voiced_audio.numpy().tobytes()\n",
-     "\n",
-     "# Example usage\n",
-     "try:\n",
-     "    processed_audio = apply_silero_vad(\"path_to_your_audio.wav\")\n",
-     "    print(\"VAD completed successfully!\")\n",
-     "except Exception as e:\n",
-     "    print(f\"Error during Silero VAD processing: {e}\")\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "7d4ea1a1-9f19-4603-b15f-b1389fddc81b",
-    "metadata": {},
-    "outputs": [],
-    "source": []
-   }
-  ],
-  "metadata": {
-   "kernelspec": {
-    "display_name": "Python 3 (ipykernel)",
-    "language": "python",
-    "name": "python3"
-   },
-   "language_info": {
-    "codemirror_mode": {
-     "name": "ipython",
-     "version": 3
-    },
-    "file_extension": ".py",
-    "mimetype": "text/x-python",
-    "name": "python",
-    "nbconvert_exporter": "python",
-    "pygments_lexer": "ipython3",
-    "version": "3.11.7"
-   }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 5
- }
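
Editor's note: the four webrtcvad cells in the deleted notebook were committed in commented-out form, and each carries at least one latent bug: the first never passes its aggressiveness argument to Vad() and feeds a short final frame to is_speech (which rejects any frame that is not exactly 10, 20, or 30 ms), while the second computes frame_length in bytes yet uses it to slice a sample array, producing frames of twice the intended duration. A minimal corrected sketch, assuming mono float32 input in [-1, 1]; the placeholder path and variable names are illustrative, not from the notebook:

import numpy as np
import webrtcvad

def apply_vad(audio, sr, frame_duration_ms=30, aggressiveness=3):
    """Keep only the voiced frames of a mono float32 signal in [-1, 1]."""
    if sr not in (8000, 16000, 32000, 48000):
        raise ValueError("webrtcvad supports 8000, 16000, 32000, or 48000 Hz")
    if frame_duration_ms not in (10, 20, 30):
        raise ValueError("webrtcvad supports 10, 20, or 30 ms frames")

    vad = webrtcvad.Vad(aggressiveness)
    audio_int16 = np.int16(audio * 32767)
    frame_size = sr * frame_duration_ms // 1000  # frame length in samples, not bytes

    voiced = []
    # Stop before the last partial frame: is_speech() raises on wrong-sized frames.
    for start in range(0, len(audio_int16) - frame_size + 1, frame_size):
        frame = audio_int16[start:start + frame_size]
        if vad.is_speech(frame.tobytes(), sample_rate=sr):
            voiced.append(frame)

    if not voiced:
        raise RuntimeError("No voiced frames detected.")
    return np.concatenate(voiced).astype(np.float32) / 32767

# librosa.load("example.wav", sr=16000) returns the mono float32 (audio, sr) pair
# this expects; "example.wav" is a placeholder path.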
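
The Silero cell is the only one that actually ran, and its stdout shows it failed simply because "path_to_your_audio.wav" is a placeholder that does not exist. There is also a latent problem: the torch.hub 'silero_vad' entry point returns a (model, utils) pair, so binding the whole result to model would make get_speech_timestamps fail even with a valid file. A sketch using the helpers of the silero-vad pip package, which the cell's own import assumes is installed; "example.wav" is again a placeholder:

import torch
from silero_vad import load_silero_vad, read_audio, save_audio, get_speech_timestamps

def apply_silero_vad(audio_file_path, sampling_rate=16000):
    """Return a 1-D tensor holding only the voiced segments of the file."""
    model = load_silero_vad()  # pretrained Silero VAD model
    wav = read_audio(audio_file_path, sampling_rate=sampling_rate)

    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)
    if not speech_timestamps:
        raise RuntimeError("No voiced frames detected using Silero VAD.")

    # Concatenate the voiced segments and keep a copy on disk for inspection.
    voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])
    save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=sampling_rate)
    return voiced_audio

# voiced = apply_silero_vad("example.wav")  # replace with a real audio file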