#!/usr/bin/env python
# coding: utf-8
# In[ ]:
# import webrtcvad
# import numpy as np
# import librosa

# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
#     '''
#     Voice Activity Detection (VAD) is a technique used to determine whether a
#     segment of audio contains speech. It is useful in noisy environments where
#     you want to filter out the non-speech parts of a recording.
#     webrtcvad is a Python package based on the VAD from the WebRTC
#     (Web Real-Time Communication) project; it detects speech in small,
#     fixed-size chunks of audio.
#     '''
#     vad = webrtcvad.Vad(aggressiveness)
#     audio_int16 = np.int16(audio * 32767)
#     frame_size = int(sr * frame_duration / 1000)
#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
#     # Keep only full-length frames that the VAD classifies as speech
#     # (webrtcvad rejects frames shorter than the nominal duration)
#     voiced_audio = np.concatenate([frame for frame in frames
#                                    if len(frame) == frame_size
#                                    and vad.is_speech(frame.tobytes(), sample_rate=sr)])
#     voiced_audio = np.float32(voiced_audio) / 32767
#     return voiced_audio
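
# A minimal usage sketch for the version above ("speech.wav" is a hypothetical
# file name; webrtcvad only accepts 8, 16, 32, or 48 kHz mono input, so we let
# librosa resample on load):
#
# audio, sr = librosa.load("speech.wav", sr=16000, mono=True)
# voiced = apply_vad(audio, sr)
# print(f"Kept {len(voiced) / len(audio):.0%} of samples as speech")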
# In[1]:
# import webrtcvad
# import numpy as np
# import librosa

# def apply_vad(audio, sr):
#     # Ensure the sample rate is supported by webrtcvad
#     if sr not in [8000, 16000, 32000, 48000]:
#         raise ValueError("Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz")
#     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3
#     frame_duration_ms = 30  # Use 10 ms, 20 ms, or 30 ms frames only

#     # Convert to 16-bit PCM; the frame length is counted in samples, since
#     # slicing the int16 array indexes samples (tobytes() then yields
#     # 2 bytes per sample)
#     audio_pcm16 = (audio * 32767).astype(np.int16)
#     frame_length = int(sr * frame_duration_ms / 1000)

#     # Create byte frames, dropping any short trailing frame
#     frames = [
#         audio_pcm16[i:i + frame_length].tobytes()
#         for i in range(0, len(audio_pcm16) - frame_length + 1, frame_length)
#     ]

#     # Apply VAD frame by frame
#     voiced_frames = []
#     for frame in frames:
#         try:
#             if vad.is_speech(frame, sample_rate=sr):
#                 voiced_frames.append(frame)
#         except Exception as e:
#             print(f"Error during VAD frame processing: {e}")

#     if not voiced_frames:
#         raise Exception("No voiced frames detected.")

#     # Concatenate the voiced frames and scale back to floats in [-1, 1]
#     voiced_audio = b''.join(voiced_frames)
#     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0
# In[ ]:
# import webrtcvad
# import numpy as np
# import librosa

# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
#     '''
#     Voice Activity Detection (VAD): detects speech in audio.
#     '''
#     vad = webrtcvad.Vad(aggressiveness)

#     # Resample to 16000 Hz if not already (recommended for better compatibility)
#     if sr != 16000:
#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
#         sr = 16000

#     # Convert to the 16-bit PCM format expected by webrtcvad
#     audio_int16 = np.int16(audio * 32767)

#     # Frame size in samples; at 16 kHz a 30 ms frame is 480 samples,
#     # one of the lengths WebRTC expects
#     frame_size = int(sr * frame_duration / 1000)
#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]

#     # Filter out non-speech frames, skipping the short trailing frame
#     voiced_frames = [frame for frame in frames
#                      if len(frame) == frame_size
#                      and vad.is_speech(frame.tobytes(), sample_rate=sr)]
#     if not voiced_frames:
#         raise Exception("No voiced frames detected.")

#     # Concatenate the voiced frames and scale back to floats in [-1, 1]
#     voiced_audio = np.concatenate(voiced_frames)
#     return np.float32(voiced_audio) / 32767
# In[3]:
# import webrtcvad
# import numpy as np
# import librosa

# def frame_generator(frame_duration_ms, audio, sample_rate):
#     """
#     Generates audio frames from raw 16-bit PCM audio data (bytes).
#     Takes the desired frame duration in milliseconds, the PCM data,
#     and the sample rate.
#     """
#     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # frame length in bytes (2 bytes per sample)
#     offset = 0
#     while offset + n <= len(audio):
#         yield audio[offset:offset + n]
#         offset += n

# def apply_vad(audio, sample_rate):
#     vad = webrtcvad.Vad()
#     vad.set_mode(1)
#     print("Applying VAD with mode:", 1)
#     print("Audio length:", len(audio), "bytes")
#     print("Sample rate:", sample_rate)

#     # This variant expects mono 16-bit PCM bytes at exactly 16000 Hz
#     if sample_rate != 16000:
#         print("Sample rate issue detected.")
#         raise ValueError("Sample rate must be 16000 Hz")

#     frames = list(frame_generator(30, audio, sample_rate))
#     print("Number of frames:", len(frames))

#     try:
#         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]
#         if not segments:
#             raise Exception("No voiced frames detected.")
#         return b''.join(segments)
#     except Exception as e:
#         print(f"Error during VAD frame processing: {e}")
#         raise
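
# Usage sketch for this byte-oriented variant (hypothetical file name; the
# function expects raw 16-bit PCM bytes at 16000 Hz, so we convert first):
#
# audio, sr = librosa.load("speech.wav", sr=16000, mono=True)
# pcm_bytes = (audio * 32767).astype(np.int16).tobytes()
# voiced_bytes = apply_vad(pcm_bytes, sr)
# voiced = np.frombuffer(voiced_bytes, dtype=np.int16) / 32767.0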
# In[5]:
import torch
import torchaudio
from silero_vad import get_speech_timestamps


def apply_silero_vad(audio_file_path):
    """
    Applies Silero VAD to an audio file and returns the processed audio
    containing only the voiced segments.
    """
    # Load the Silero VAD model (torch.hub returns the model plus a tuple of utils)
    model, _ = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)

    # Define helper utilities manually
    def read_audio(path, sampling_rate=16000):
        wav, sr = torchaudio.load(path)
        if wav.size(0) > 1:  # downmix multi-channel audio to mono
            wav = wav.mean(dim=0, keepdim=True)
        if sr != sampling_rate:
            wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)
        return wav.squeeze(0)

    def save_audio(path, tensor, sampling_rate=16000):
        torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)

    # Read the audio file
    wav = read_audio(audio_file_path, sampling_rate=16000)

    # Get timestamps for the speech segments
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)

    # If no speech was detected, raise an exception
    if not speech_timestamps:
        raise Exception("No voiced frames detected using Silero VAD.")

    # Combine the voiced segments
    voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])

    # Save the processed audio if needed
    save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)

    # Return raw float32 bytes for further processing
    return voiced_audio.numpy().tobytes()


# Example usage
try:
    processed_audio = apply_silero_vad("path_to_your_audio.wav")
    print("VAD completed successfully!")
except Exception as e:
    print(f"Error during Silero VAD processing: {e}")
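
# The function returns raw float32 bytes; a sketch for turning them back into
# a NumPy array for further processing (assumes the call above succeeded):
#
# import numpy as np
# voiced = np.frombuffer(processed_audio, dtype=np.float32)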
# In[ ]: