#!/usr/bin/env python
# coding: utf-8
# In[ ]:


# import webrtcvad
# import numpy as np
# import librosa


# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
#     '''
#     Voice Activity Detection (VAD) determines whether a segment of audio contains speech.
#     This is useful in noisy environments where you want to filter out the non-speech
#     parts of the audio.
#     webrtcvad is a Python package wrapping the VAD from the WebRTC (Web Real-Time
#     Communication) project; it classifies small, fixed-size chunks of audio.
#     '''
#     vad = webrtcvad.Vad(aggressiveness)
#     audio_int16 = np.int16(audio * 32767)
#     frame_size = int(sr * frame_duration / 1000)
#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
#     voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])
#     voiced_audio = np.float32(voiced_audio) / 32767
#     return voiced_audio
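
# Frame-size arithmetic for reference: at sr = 16000 and frame_duration = 30 ms,
# frame_size = 16000 * 30 / 1000 = 480 samples, i.e. 960 bytes of 16-bit PCM,
# which is one of the exact frame lengths webrtcvad accepts (10, 20, or 30 ms frames
# at 8000, 16000, 32000, or 48000 Hz).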
# In[1]:


# import webrtcvad
# import numpy as np
# import librosa


# def apply_vad(audio, sr):
#     # Ensure that the sample rate is supported by webrtcvad
#     if sr not in [8000, 16000, 32000, 48000]:
#         raise ValueError("Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz")

#     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3
#     frame_duration_ms = 30  # webrtcvad accepts 10 ms, 20 ms, or 30 ms frames only

#     # Convert to 16-bit PCM and compute the frame length in samples.
#     # Each sample is 2 bytes, but the numpy slicing below counts samples, not bytes,
#     # so the frame length must not be multiplied by the sample width.
#     audio_pcm16 = (audio * 32767).astype(np.int16)
#     frame_length = int(sr * frame_duration_ms / 1000)

#     # Create frames, keeping only full-sized frames
#     frames = [
#         audio_pcm16[i:i + frame_length].tobytes()
#         for i in range(0, len(audio_pcm16) - frame_length + 1, frame_length)
#     ]

#     # Apply VAD frame by frame
#     voiced_frames = []
#     for frame in frames:
#         try:
#             if vad.is_speech(frame, sample_rate=sr):
#                 voiced_frames.append(frame)
#         except Exception as e:
#             print(f"Error during VAD frame processing: {e}")

#     if not voiced_frames:
#         raise Exception("No voiced frames detected.")

#     # Concatenate voiced frames and convert back to float32 in [-1, 1]
#     voiced_audio = b''.join(voiced_frames)
#     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0
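
# A minimal usage sketch for the version above, kept commented like the cell itself.
# The filename is a placeholder; loading with librosa at sr=16000 and mono=True
# yields float32 audio at a rate that satisfies the sample-rate check in apply_vad.
# audio, sr = librosa.load('speech.wav', sr=16000, mono=True)
# voiced = apply_vad(audio, sr)
# print(f"Kept {len(voiced) / sr:.2f}s of speech out of {len(audio) / sr:.2f}s")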
# In[ ]:


# import webrtcvad
# import numpy as np
# import librosa


# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
#     '''
#     Voice Activity Detection (VAD): Detects speech in audio.
#     '''
#     vad = webrtcvad.Vad(aggressiveness)

#     # Resample to 16000 Hz if not already (recommended for better compatibility)
#     if sr != 16000:
#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
#         sr = 16000

#     # Convert to the 16-bit PCM format expected by webrtcvad
#     audio_int16 = np.int16(audio * 32767)

#     # Ensure the frame size matches WebRTC's expected lengths
#     frame_size = int(sr * frame_duration / 1000)
#     if frame_size % 2 != 0:
#         frame_size -= 1  # Make sure it's even to avoid processing issues
#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]

#     # Keep only full-sized frames that webrtcvad classifies as speech
#     voiced_frames = []
#     for frame in frames:
#         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):
#             voiced_frames.append(frame)

#     # Concatenate the voiced frames and convert back to float32
#     voiced_audio = np.concatenate(voiced_frames)
#     voiced_audio = np.float32(voiced_audio) / 32767
#     return voiced_audio
# In[3]:


# import webrtcvad
# import numpy as np
# import librosa


# def frame_generator(frame_duration_ms, audio, sample_rate):
#     """
#     Generates audio frames from PCM audio data.
#     Takes the desired frame duration in milliseconds, the PCM data (bytes), and the sample rate.
#     """
#     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # frame length in bytes (2 bytes per 16-bit sample)
#     offset = 0
#     while offset + n < len(audio):
#         yield audio[offset:offset + n]
#         offset += n


# def apply_vad(audio, sample_rate):
#     vad = webrtcvad.Vad()
#     vad.set_mode(1)

#     print("Applying VAD with mode:", 1)
#     print("Audio length:", len(audio), "bytes")
#     print("Sample rate:", sample_rate)

#     # Ensure mono audio at the expected sample rate
#     if sample_rate != 16000:
#         print("Sample rate issue detected.")
#         raise ValueError("Sample rate must be 16000 Hz")

#     frames = list(frame_generator(30, audio, sample_rate))
#     print("Number of frames:", len(frames))

#     try:
#         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]
#         if not segments:
#             raise Exception("No voiced frames detected.")
#         return b''.join(segments)
#     except Exception as e:
#         print(f"Error during VAD frame processing: {e}")
#         raise
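
# A minimal sketch of feeding this bytes-based variant, since it expects raw 16-bit PCM
# rather than a float array. The filename is a placeholder; the file is assumed to be a
# 16 kHz mono 16-bit WAV (the stdlib wave module does no resampling or conversion).
# import wave
# with wave.open('speech_16k_mono.wav', 'rb') as wf:
#     assert wf.getframerate() == 16000 and wf.getnchannels() == 1 and wf.getsampwidth() == 2
#     pcm_bytes = wf.readframes(wf.getnframes())
# voiced_bytes = apply_vad(pcm_bytes, 16000)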
# In[5]:


# import torch
# import torchaudio
# from silero_vad import get_speech_timestamps


# def apply_silero_vad(audio_file_path):
#     """
#     Applies Silero VAD to an audio file and returns the processed audio
#     containing only the voiced segments.
#     """
#     # Load the Silero VAD model; torch.hub returns (model, utils), so unpack both
#     model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)

#     # Define helper utilities manually
#     def read_audio(path, sampling_rate=16000):
#         wav, sr = torchaudio.load(path)
#         if sr != sampling_rate:
#             wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)
#         return wav.squeeze(0)

#     def save_audio(path, tensor, sampling_rate=16000):
#         torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)

#     # Read the audio file
#     wav = read_audio(audio_file_path, sampling_rate=16000)

#     # Get timestamps for speech segments
#     speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)

#     # If no speech is detected, raise an exception
#     if not speech_timestamps:
#         raise Exception("No voiced frames detected using Silero VAD.")

#     # Concatenate the voiced segments
#     voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])

#     # Save the processed audio if needed
#     save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)

#     # Convert to raw bytes for further processing
#     return voiced_audio.numpy().tobytes()


# # Example usage
# try:
#     processed_audio = apply_silero_vad("path_to_your_audio.wav")
#     print("VAD completed successfully!")
# except Exception as e:
#     print(f"Error during Silero VAD processing: {e}")
# In[ ]:


import webrtcvad
import numpy as np
import librosa


def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
    '''
    Voice Activity Detection (VAD): Detects speech in audio.

    audio: float32 waveform in [-1, 1] (e.g. from librosa.load)
    sr: sample rate of the waveform
    frame_duration: frame length in ms; webrtcvad accepts 10, 20, or 30 ms
    aggressiveness: webrtcvad mode 0-3 (3 filters non-speech most aggressively)
    '''
    vad = webrtcvad.Vad(aggressiveness)

    # Resample to 16000 Hz if not already (recommended for better compatibility)
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Convert to the 16-bit PCM format expected by webrtcvad
    audio_int16 = np.int16(audio * 32767)

    # Ensure the frame size matches WebRTC's expected lengths
    frame_size = int(sr * frame_duration / 1000)
    if frame_size % 2 != 0:
        frame_size -= 1  # Make sure it's even to avoid processing issues
    frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]

    # Keep only full-sized frames that webrtcvad classifies as speech
    voiced_frames = []
    for frame in frames:
        if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):
            voiced_frames.append(frame)

    # np.concatenate raises on an empty list, so fail with a clear message instead
    if not voiced_frames:
        raise Exception("No voiced frames detected.")

    # Concatenate the voiced frames and convert back to float32 in [-1, 1]
    voiced_audio = np.concatenate(voiced_frames)
    voiced_audio = np.float32(voiced_audio) / 32767
    return voiced_audio
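

# Example usage: a minimal sketch, assuming a placeholder file path and that the
# soundfile package (not imported above) is available for writing the result.
# librosa.load returns float32 mono audio, which is what apply_vad expects.
if __name__ == "__main__":
    import soundfile as sf

    audio, sr = librosa.load("path_to_your_audio.wav", sr=None, mono=True)
    try:
        voiced = apply_vad(audio, sr)
        sf.write("voiced_only.wav", voiced, 16000)  # apply_vad resamples to 16 kHz
        print(f"VAD kept {len(voiced) / 16000:.2f}s of audio")
    except Exception as e:
        print(f"Error during VAD processing: {e}")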