#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# import webrtcvad
# import numpy as np
# import librosa
# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
#     '''
#     Voice Activity Detection (VAD) determines whether a segment of audio
#     contains speech, which is useful in noisy environments where you want
#     to filter out the non-speech parts. webrtcvad is a Python package built
#     on the VAD from the WebRTC (Web Real-Time Communication) project; it
#     classifies small chunks of audio as speech or non-speech.
#     '''
#     vad = webrtcvad.Vad(aggressiveness)
#     audio_int16 = np.int16(audio * 32767)
#     frame_size = int(sr * frame_duration / 1000)
#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
#     # Keep only full-length speech frames; webrtcvad rejects a short trailing frame
#     voiced_audio = np.concatenate([frame for frame in frames
#                                    if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr)])
#     voiced_audio = np.float32(voiced_audio) / 32767
#     return voiced_audio


# In[1]:


# import webrtcvad
# import numpy as np
# import librosa

# def apply_vad(audio, sr):
#     # Ensure that sample rate is supported by webrtcvad
#     if sr not in [8000, 16000, 32000, 48000]:
#         raise ValueError("Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz")

#     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3
#     frame_duration_ms = 30  # Use 10ms, 20ms, or 30ms frames only

#     # Convert to 16-bit PCM; frame_length is in samples, since we slice the
#     # int16 array (tobytes() already yields 2 bytes per sample)
#     audio_pcm16 = (audio * 32767).astype(np.int16)
#     frame_length = int(sr * frame_duration_ms / 1000)

#     # Create frames, dropping any short trailing frame
#     frames = [
#         audio_pcm16[i:i + frame_length].tobytes()
#         for i in range(0, len(audio_pcm16) - frame_length + 1, frame_length)
#     ]

#     # Apply VAD
#     voiced_frames = []
#     for frame in frames:
#         try:
#             if vad.is_speech(frame, sample_rate=sr):
#                 voiced_frames.append(frame)
#         except Exception as e:
#             print(f"Error during VAD frame processing: {e}")

#     if not voiced_frames:
#         raise Exception("No voiced frames detected.")

#     # Concatenate voiced frames
#     voiced_audio = b''.join(voiced_frames)
#     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0


# In[3]:


# import webrtcvad
# import numpy as np
# import librosa

# def frame_generator(frame_duration_ms, audio, sample_rate):
#     """
#     Generates audio frames from PCM audio data.
#     Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.
#     """
#     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # Convert to byte length
#     offset = 0
#     while offset + n <= len(audio):
#         yield audio[offset:offset + n]
#         offset += n

# def apply_vad(audio, sample_rate):
#     """Applies VAD to raw 16-bit PCM bytes sampled at 16 kHz."""
#     vad = webrtcvad.Vad()
#     vad.set_mode(1)
#     print("Applying VAD with mode:", 1)
#     print("Audio length:", len(audio), "bytes")
#     print("Sample rate:", sample_rate)

#     # Ensure mono and correct sample rate
#     if sample_rate != 16000:
#         print("Sample rate issue detected.")
#         raise ValueError("Sample rate must be 16000 Hz")

#     frames = frame_generator(30, audio, sample_rate)
#     frames = list(frames)

#     print("Number of frames:", len(frames))
#     try:
#         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]

#         if not segments:
#             raise Exception("No voiced frames detected.")

#         return b''.join(segments)

#     except Exception as e:
#         print(f"Error during VAD frame processing: {e}")
#         raise
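
# A minimal usage sketch for this byte-based variant (kept commented like the
# cell above; "speech_sample.wav" is a placeholder path, not a file shipped
# with this notebook). librosa.load resamples to the required 16 kHz, and the
# float samples are packed into raw 16-bit PCM bytes before calling apply_vad:
#
# audio, _ = librosa.load("speech_sample.wav", sr=16000, mono=True)
# pcm_bytes = (audio * 32767).astype(np.int16).tobytes()
# voiced_bytes = apply_vad(pcm_bytes, 16000)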


# In[5]:


# import torch
# import torchaudio
# from silero_vad import get_speech_timestamps

# def apply_silero_vad(audio_file_path):
#     """
#     Applies Silero VAD to an audio file and returns the processed audio
#     containing only the voiced segments.
#     """
#     # Load the Silero VAD model (torch.hub returns a (model, utils) tuple)
#     model, _utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)
    
#     # Define helper utilities manually
#     def read_audio(path, sampling_rate=16000):
#         wav, sr = torchaudio.load(path)
#         if sr != sampling_rate:
#             wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)
#         return wav.squeeze(0)

#     def save_audio(path, tensor, sampling_rate=16000):
#         torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)

#     # Read the audio file
#     wav = read_audio(audio_file_path, sampling_rate=16000)

#     # Get timestamps for speech segments
#     speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)

#     # If no speech detected, raise an exception
#     if not speech_timestamps:
#         raise Exception("No voiced frames detected using Silero VAD.")

#     # Combine the voiced segments
#     voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])

#     # Save the processed audio if needed
#     save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)

#     # Convert to numpy bytes for further processing
#     return voiced_audio.numpy().tobytes()

# # Example usage
# try:
#     processed_audio = apply_silero_vad("path_to_your_audio.wav")
#     print("VAD completed successfully!")
# except Exception as e:
#     print(f"Error during Silero VAD processing: {e}")


import webrtcvad
import numpy as np
import librosa

def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
    '''
    Voice Activity Detection (VAD): keeps only the frames of `audio`
    (float samples in [-1, 1]) that webrtcvad classifies as speech.
    frame_duration must be 10, 20, or 30 ms; aggressiveness runs 0-3.
    '''
    vad = webrtcvad.Vad(aggressiveness)

    # Resample to 16000 Hz if not already (webrtcvad supports only
    # 8000, 16000, 32000, and 48000 Hz)
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Convert to the 16-bit PCM format expected by webrtcvad
    audio_int16 = np.int16(audio * 32767)

    # Frame size in samples; at 16 kHz a 30 ms frame is 480 samples
    frame_size = int(sr * frame_duration / 1000)

    frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]

    # Keep only full-length frames classified as speech
    # (webrtcvad rejects a short trailing frame)
    voiced_frames = [
        frame for frame in frames
        if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr)
    ]

    if not voiced_frames:
        raise ValueError("No voiced frames detected.")

    # Concatenate the voiced frames and convert back to float32 in [-1, 1]
    voiced_audio = np.concatenate(voiced_frames)
    return np.float32(voiced_audio) / 32767
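

# In[ ]:


# Example usage (a minimal sketch, kept commented so the script stays
# import-safe): "speech_sample.wav" is a placeholder path, not a file that
# ships with this notebook. librosa.load returns float samples in [-1, 1];
# sr=None keeps the file's native rate, and apply_vad resamples to 16 kHz
# internally, so the voiced audio is written back out at 16000 Hz.

# import soundfile as sf

# audio, sr = librosa.load("speech_sample.wav", sr=None, mono=True)
# voiced_audio = apply_vad(audio, sr)
# sf.write("voiced_speech_sample.wav", voiced_audio, samplerate=16000)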