{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "cb0d4170-de67-444c-934c-98bfdad9ae97", "metadata": {}, "outputs": [], "source": [ "# import webrtcvad\n", "# import numpy as np\n", "# import librosa\n", "# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n", "# '''\n", "# Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech. \n", "# This is useful in noisy environments where you want to filter out non-speech parts of the audio.\n", "# webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project. \n", "# It helps detect speech in small chunks of audio.\n", "# '''\n", "# vad = webrtcvad.Vad()\n", "# audio_int16 = np.int16(audio * 32767)\n", "# frame_size = int(sr * frame_duration / 1000)\n", "# frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n", "# voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])\n", "# voiced_audio = np.float32(voiced_audio) / 32767\n", "# return voiced_audio" ] }, { "cell_type": "code", "execution_count": 1, "id": "bbf2e07e-1927-4abd-98a1-8abf8fc591b4", "metadata": {}, "outputs": [], "source": [ "# import webrtcvad\n", "# import numpy as np\n", "# import librosa\n", "\n", "# def apply_vad(audio, sr):\n", "# # Ensure that sample rate is supported by webrtcvad\n", "# if sr not in [8000, 16000, 32000, 48000]:\n", "# raise ValueError(\"Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz\")\n", "\n", "# vad = webrtcvad.Vad(2) # Aggressiveness mode: 0-3\n", "# frame_duration_ms = 30 # Use 10ms, 20ms, or 30ms frames only\n", "\n", "# # Convert to PCM 16-bit and calculate frame length\n", "# audio_pcm16 = (audio * 32767).astype(np.int16)\n", "# frame_length = int(sr * frame_duration_ms / 1000) * 2 # 2 bytes per sample for 16-bit PCM\n", " \n", "# # Create frames ensuring correct frame size\n", "# frames = [\n", "# audio_pcm16[i:i + frame_length].tobytes()\n", "# for i in range(0, len(audio_pcm16) - frame_length, frame_length)\n", "# ]\n", "\n", "# # Apply VAD\n", "# voiced_frames = []\n", "# for frame in frames:\n", "# try:\n", "# if vad.is_speech(frame, sample_rate=sr):\n", "# voiced_frames.append(frame)\n", "# except Exception as e:\n", "# print(f\"Error during VAD frame processing: {e}\")\n", "\n", "# if not voiced_frames:\n", "# raise Exception(\"No voiced frames detected.\")\n", "\n", "# # Concatenate voiced frames\n", "# voiced_audio = b''.join(voiced_frames)\n", "# return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a6277ddc-4692-480f-a930-fc70b82f6852", "metadata": {}, "outputs": [], "source": [ "# import webrtcvad\n", "# import numpy as np\n", "# import librosa\n", "\n", "# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n", "# '''\n", "# Voice Activity Detection (VAD): Detects speech in audio.\n", "# '''\n", "# vad = webrtcvad.Vad(aggressiveness)\n", " \n", "# # Resample to 16000 Hz if not already (recommended for better compatibility)\n", "# if sr != 16000:\n", "# audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)\n", "# sr = 16000\n", " \n", "# # Convert to 16-bit PCM format expected by webrtcvad\n", "# audio_int16 = np.int16(audio * 32767)\n", " \n", "# # Ensure frame size matches WebRTC's expected lengths\n", "# frame_size = int(sr * frame_duration / 1000)\n", "# if frame_size % 2 != 0:\n", "# frame_size -= 1 # Make sure it's even 
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6277ddc-4692-480f-a930-fc70b82f6852",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import webrtcvad\n",
    "# import numpy as np\n",
    "# import librosa\n",
    "\n",
    "# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
    "#     '''\n",
    "#     Voice Activity Detection (VAD): Detects speech in audio.\n",
    "#     '''\n",
    "#     vad = webrtcvad.Vad(aggressiveness)\n",
    "\n",
    "#     # Resample to 16000 Hz if not already (recommended for better compatibility)\n",
    "#     if sr != 16000:\n",
    "#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)\n",
    "#         sr = 16000\n",
    "\n",
    "#     # Convert to the 16-bit PCM format expected by webrtcvad\n",
    "#     audio_int16 = np.int16(audio * 32767)\n",
    "\n",
    "#     # webrtcvad accepts only 10, 20, or 30 ms frames, so the frame size must be\n",
    "#     # exactly sr * frame_duration / 1000 samples; no rounding adjustment is needed\n",
    "#     frame_size = int(sr * frame_duration / 1000)\n",
    "\n",
    "#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
    "\n",
    "#     # Filter out non-speech frames (the length check drops the trailing partial frame)\n",
    "#     voiced_frames = []\n",
    "#     for frame in frames:\n",
    "#         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):\n",
    "#             voiced_frames.append(frame)\n",
    "\n",
    "#     if not voiced_frames:\n",
    "#         raise ValueError(\"No voiced frames detected.\")\n",
    "\n",
    "#     # Concatenate the voiced frames and convert back to float in [-1, 1]\n",
    "#     voiced_audio = np.concatenate(voiced_frames)\n",
    "#     voiced_audio = np.float32(voiced_audio) / 32767\n",
    "\n",
    "#     return voiced_audio\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06df7fd9-7900-4cd9-9bec-7d2875d8946b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import webrtcvad\n",
    "\n",
    "# def frame_generator(frame_duration_ms, audio, sample_rate):\n",
    "#     \"\"\"\n",
    "#     Generates fixed-size frames from raw 16-bit mono PCM bytes.\n",
    "#     Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.\n",
    "#     \"\"\"\n",
    "#     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # frame length in bytes (2 bytes per sample)\n",
    "#     offset = 0\n",
    "#     while offset + n <= len(audio):\n",
    "#         yield audio[offset:offset + n]\n",
    "#         offset += n\n",
    "\n",
    "# def apply_vad(audio, sample_rate):\n",
    "#     vad = webrtcvad.Vad()\n",
    "#     vad.set_mode(1)\n",
    "#     print(\"Applying VAD with mode:\", 1)\n",
    "#     print(\"Audio length:\", len(audio), \"bytes\")\n",
    "#     print(\"Sample rate:\", sample_rate)\n",
    "\n",
    "#     # This version expects 16 kHz mono PCM bytes; resample upstream if necessary\n",
    "#     if sample_rate != 16000:\n",
    "#         print(\"Sample rate issue detected.\")\n",
    "#         raise ValueError(\"Sample rate must be 16000 Hz\")\n",
    "\n",
    "#     frames = list(frame_generator(30, audio, sample_rate))\n",
    "#     print(\"Number of frames:\", len(frames))\n",
    "\n",
    "#     try:\n",
    "#         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]\n",
    "\n",
    "#         if not segments:\n",
    "#             raise Exception(\"No voiced frames detected.\")\n",
    "\n",
    "#         return b''.join(segments)\n",
    "\n",
    "#     except Exception as e:\n",
    "#         print(f\"Error during VAD frame processing: {e}\")\n",
    "#         raise\n"
   ]
  },
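  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7c9b6d2-8a14-4b3d-9c05-2f4a9c31bb02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A short sketch of how one might feed the byte-based apply_vad above, assuming it is\n",
    "# uncommented: load the file as 16 kHz mono floats, convert to raw 16-bit PCM bytes,\n",
    "# then run VAD. \"sample.wav\" is a placeholder path.\n",
    "\n",
    "# import librosa\n",
    "# import numpy as np\n",
    "\n",
    "# audio, sr = librosa.load(\"sample.wav\", sr=16000, mono=True)  # float32 in [-1, 1]\n",
    "# pcm_bytes = (audio * 32767).astype(np.int16).tobytes()       # raw 16-bit PCM\n",
    "# voiced_bytes = apply_vad(pcm_bytes, sr)\n",
    "# print(\"Voiced bytes:\", len(voiced_bytes))"
   ]
  },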
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cc8e709e-2798-40c4-9ad0-03b6ef43ff5a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading: \"https://github.com/snakers4/silero-vad/zipball/master\" to C:\\Users\\WCHL/.cache\\torch\\hub\\master.zip\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error during Silero VAD processing: Failed to open the input \"path_to_your_audio.wav\" (No such file or directory).\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torchaudio\n",
    "from silero_vad import get_speech_timestamps\n",
    "\n",
    "def apply_silero_vad(audio_file_path):\n",
    "    \"\"\"\n",
    "    Applies Silero VAD to an audio file and returns the processed audio\n",
    "    containing only the voiced segments.\n",
    "    \"\"\"\n",
    "    # Load the Silero VAD model; the torch.hub entry point returns (model, utils)\n",
    "    model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)\n",
    "\n",
    "    # Define helper utilities manually (these replace the package versions and\n",
    "    # handle mono downmixing plus resampling to the rate the model expects)\n",
    "    def read_audio(path, sampling_rate=16000):\n",
    "        wav, sr = torchaudio.load(path)\n",
    "        if wav.size(0) > 1:\n",
    "            wav = wav.mean(dim=0, keepdim=True)  # downmix multi-channel audio to mono\n",
    "        if sr != sampling_rate:\n",
    "            wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)\n",
    "        return wav.squeeze(0)\n",
    "\n",
    "    def save_audio(path, tensor, sampling_rate=16000):\n",
    "        torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)\n",
    "\n",
    "    # Read the audio file\n",
    "    wav = read_audio(audio_file_path, sampling_rate=16000)\n",
    "\n",
    "    # Get timestamps for speech segments\n",
    "    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)\n",
    "\n",
    "    # If no speech detected, raise an exception\n",
    "    if not speech_timestamps:\n",
    "        raise Exception(\"No voiced frames detected using Silero VAD.\")\n",
    "\n",
    "    # Combine the voiced segments\n",
    "    voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])\n",
    "\n",
    "    # Save the processed audio if needed\n",
    "    save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)\n",
    "\n",
    "    # Convert to raw bytes for further processing\n",
    "    return voiced_audio.numpy().tobytes()\n",
    "\n",
    "# Example usage\n",
    "try:\n",
    "    processed_audio = apply_silero_vad(\"path_to_your_audio.wav\")\n",
    "    print(\"VAD completed successfully!\")\n",
    "except Exception as e:\n",
    "    print(f\"Error during Silero VAD processing: {e}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d4ea1a1-9f19-4603-b15f-b1389fddc81b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}