File size: 6,943 Bytes
cd1b576
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# import webrtcvad
# import numpy as np
# import librosa
# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
#     '''
#     Voice Activity Detection (VAD) is a technique for determining whether a segment of audio contains speech.
#     It is useful in noisy environments where you want to filter out the non-speech parts of the audio.
#     webrtcvad is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project.
#     It detects speech in small chunks of audio.
#     '''
#     vad = webrtcvad.Vad()
#     audio_int16 = np.int16(audio * 32767)
#     frame_size = int(sr * frame_duration / 1000)
#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
#     voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])
#     voiced_audio = np.float32(voiced_audio) / 32767
#     return voiced_audio


# In[1]:


# import webrtcvad
# import numpy as np
# import librosa

# def apply_vad(audio, sr):
#     # Ensure that sample rate is supported by webrtcvad
#     if sr not in [8000, 16000, 32000, 48000]:
#         raise ValueError("Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz")

#     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3
#     frame_duration_ms = 30  # Use 10ms, 20ms, or 30ms frames only

#     # Convert to PCM 16-bit and calculate frame length
#     audio_pcm16 = (audio * 32767).astype(np.int16)
#     frame_length = int(sr * frame_duration_ms / 1000) * 2  # 2 bytes per sample for 16-bit PCM
    
#     # Create frames ensuring correct frame size
#     frames = [
#         audio_pcm16[i:i + frame_length].tobytes()
#         for i in range(0, len(audio_pcm16) - frame_length, frame_length)
#     ]

#     # Apply VAD
#     voiced_frames = []
#     for frame in frames:
#         try:
#             if vad.is_speech(frame, sample_rate=sr):
#                 voiced_frames.append(frame)
#         except Exception as e:
#             print(f"Error during VAD frame processing: {e}")

#     if not voiced_frames:
#         raise Exception("No voiced frames detected.")

#     # Concatenate voiced frames
#     voiced_audio = b''.join(voiced_frames)
#     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0


# In[ ]:


# import webrtcvad
# import numpy as np
# import librosa

# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
#     '''
#     Voice Activity Detection (VAD): Detects speech in audio.
#     '''
#     vad = webrtcvad.Vad(aggressiveness)
    
#     # Resample to 16000 Hz if not already (recommended for better compatibility)
#     if sr != 16000:
#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
#         sr = 16000
    
#     # Convert to 16-bit PCM format expected by webrtcvad
#     audio_int16 = np.int16(audio * 32767)
    
#     # Ensure frame size matches WebRTC's expected lengths
#     frame_size = int(sr * frame_duration / 1000)
#     if frame_size % 2 != 0:
#         frame_size -= 1  # Make sure it's even to avoid processing issues
    
#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
    
#     # Filter out non-speech frames
#     voiced_frames = []
#     for frame in frames:
#         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):
#             voiced_frames.append(frame)
    
#     # Concatenate the voiced frames
#     voiced_audio = np.concatenate(voiced_frames)
#     voiced_audio = np.float32(voiced_audio) / 32767
    
#     return voiced_audio


# In[3]:


# import webrtcvad
# import numpy as np
# import librosa

# def frame_generator(frame_duration_ms, audio, sample_rate):
#     """
#     Generates audio frames from PCM audio data.
#     Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.
#     """
#     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # Convert to byte length
#     offset = 0
#     while offset + n < len(audio):
#         yield audio[offset:offset + n]
#         offset += n

# def apply_vad(audio, sample_rate):
#     vad = webrtcvad.Vad()
#     vad.set_mode(1)
#     print("Applying VAD with mode:", 1)
#     print("Audio length:", len(audio), "bytes")
#     print("Sample rate:", sample_rate)

#     # Ensure mono and correct sample rate
#     if sample_rate != 16000:
#         print("Sample rate issue detected.")
#         raise ValueError("Sample rate must be 16000 Hz")

#     frames = frame_generator(30, audio, sample_rate)
#     frames = list(frames)

#     print("Number of frames:", len(frames))
#     try:
#         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]

#         if not segments:
#             raise Exception("No voiced frames detected.")

#         return b''.join(segments)

#     except Exception as e:
#         print(f"Error during VAD frame processing: {e}")
#         raise


# In[5]:


import torch
import torchaudio
from silero_vad import get_speech_timestamps, read_audio, save_audio

def apply_silero_vad(audio_file_path, *, output_path='processed_voiced_audio.wav'):
    """Keep only the speech portions of an audio file using Silero VAD.

    The file is loaded with torchaudio, mixed down to one channel if it has
    several, resampled to 16 kHz when needed, and passed to
    ``get_speech_timestamps``. The detected speech segments are concatenated
    in order.

    Args:
        audio_file_path: Path of the audio file to process.
        output_path: Where to write the voiced-only audio as a 16 kHz file.
            Defaults to ``'processed_voiced_audio.wav'`` (the path the
            function has always written to). Pass ``None`` to skip writing.

    Returns:
        The raw bytes of the concatenated voiced samples
        (``tensor.numpy().tobytes()``). NOTE(review): with torchaudio's
        default loading these are float32 samples, not 16-bit PCM -- confirm
        against whatever consumes the bytes.

    Raises:
        Exception: If no speech segment is detected in the audio.
    """
    sampling_rate = 16000

    # The hub entry point returns ``(model, utils)``; binding the whole tuple
    # to ``model`` would hand a tuple to get_speech_timestamps. Accept a bare
    # model as well in case the entry point's return shape differs.
    # ``force_reload`` is left at its default so the cached repository is
    # reused instead of being re-downloaded on every call.
    loaded = torch.hub.load('snakers4/silero-vad', 'silero_vad')
    model = loaded[0] if isinstance(loaded, (tuple, list)) else loaded

    # torchaudio.load yields a (channels, frames) tensor plus its sample rate.
    wav, source_rate = torchaudio.load(audio_file_path)

    # The VAD needs a 1-D signal; squeeze(0) alone leaves multi-channel audio
    # two-dimensional, so average the channels down to mono first.
    if wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)
    if source_rate != sampling_rate:
        resample = torchaudio.transforms.Resample(
            orig_freq=source_rate, new_freq=sampling_rate
        )
        wav = resample(wav)
    wav = wav.squeeze(0)

    # Each timestamp is a dict whose 'start'/'end' values are used below as
    # sample indices into ``wav``.
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)

    if not speech_timestamps:
        raise Exception("No voiced frames detected using Silero VAD.")

    voiced_audio = torch.cat(
        [wav[ts['start']:ts['end']] for ts in speech_timestamps]
    )

    # Persist the voiced-only audio unless the caller opted out.
    if output_path is not None:
        torchaudio.save(output_path, voiced_audio.unsqueeze(0), sampling_rate)

    return voiced_audio.numpy().tobytes()

# Example usage: run the VAD on a placeholder path and report the outcome.
# Any failure (missing file, model download error, no speech found) is
# printed rather than propagated.
try:
    processed_audio = apply_silero_vad("path_to_your_audio.wav")
except Exception as error:
    print(f"Error during Silero VAD processing: {error}")
else:
    print("VAD completed successfully!")


# In[ ]: