{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb0d4170-de67-444c-934c-98bfdad9ae97",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import webrtcvad\n",
    "# import numpy as np\n",
    "# import librosa\n",
    "# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
    "#     '''\n",
    "#      Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech. \n",
    "#      This is useful in noisy environments where you want to filter out non-speech parts of the audio.\n",
    "#      webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project. \n",
    "#      It helps detect speech in small chunks of audio.\n",
    "#      '''\n",
    "#     vad = webrtcvad.Vad()\n",
    "#     audio_int16 = np.int16(audio * 32767)\n",
    "#     frame_size = int(sr * frame_duration / 1000)\n",
    "#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
    "#     voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])\n",
    "#     voiced_audio = np.float32(voiced_audio) / 32767\n",
    "#     return voiced_audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bbf2e07e-1927-4abd-98a1-8abf8fc591b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import webrtcvad\n",
    "# import numpy as np\n",
    "# import librosa\n",
    "\n",
    "# def apply_vad(audio, sr):\n",
    "#     # Ensure that sample rate is supported by webrtcvad\n",
    "#     if sr not in [8000, 16000, 32000, 48000]:\n",
    "#         raise ValueError(\"Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz\")\n",
    "\n",
    "#     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3\n",
    "#     frame_duration_ms = 30  # Use 10ms, 20ms, or 30ms frames only\n",
    "\n",
    "#     # Convert to PCM 16-bit and calculate frame length\n",
    "#     audio_pcm16 = (audio * 32767).astype(np.int16)\n",
    "#     frame_length = int(sr * frame_duration_ms / 1000) * 2  # 2 bytes per sample for 16-bit PCM\n",
    "    \n",
    "#     # Create frames ensuring correct frame size\n",
    "#     frames = [\n",
    "#         audio_pcm16[i:i + frame_length].tobytes()\n",
    "#         for i in range(0, len(audio_pcm16) - frame_length, frame_length)\n",
    "#     ]\n",
    "\n",
    "#     # Apply VAD\n",
    "#     voiced_frames = []\n",
    "#     for frame in frames:\n",
    "#         try:\n",
    "#             if vad.is_speech(frame, sample_rate=sr):\n",
    "#                 voiced_frames.append(frame)\n",
    "#         except Exception as e:\n",
    "#             print(f\"Error during VAD frame processing: {e}\")\n",
    "\n",
    "#     if not voiced_frames:\n",
    "#         raise Exception(\"No voiced frames detected.\")\n",
    "\n",
    "#     # Concatenate voiced frames\n",
    "#     voiced_audio = b''.join(voiced_frames)\n",
    "#     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6277ddc-4692-480f-a930-fc70b82f6852",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import webrtcvad\n",
    "# import numpy as np\n",
    "# import librosa\n",
    "\n",
    "# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):\n",
    "#     '''\n",
    "#     Voice Activity Detection (VAD): Detects speech in audio.\n",
    "#     '''\n",
    "#     vad = webrtcvad.Vad(aggressiveness)\n",
    "    \n",
    "#     # Resample to 16000 Hz if not already (recommended for better compatibility)\n",
    "#     if sr != 16000:\n",
    "#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)\n",
    "#         sr = 16000\n",
    "    \n",
    "#     # Convert to 16-bit PCM format expected by webrtcvad\n",
    "#     audio_int16 = np.int16(audio * 32767)\n",
    "    \n",
    "#     # Ensure frame size matches WebRTC's expected lengths\n",
    "#     frame_size = int(sr * frame_duration / 1000)\n",
    "#     if frame_size % 2 != 0:\n",
    "#         frame_size -= 1  # Make sure it's even to avoid processing issues\n",
    "    \n",
    "#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]\n",
    "    \n",
    "#     # Filter out non-speech frames\n",
    "#     voiced_frames = []\n",
    "#     for frame in frames:\n",
    "#         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):\n",
    "#             voiced_frames.append(frame)\n",
    "    \n",
    "#     # Concatenate the voiced frames\n",
    "#     voiced_audio = np.concatenate(voiced_frames)\n",
    "#     voiced_audio = np.float32(voiced_audio) / 32767\n",
    "    \n",
    "#     return voiced_audio\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06df7fd9-7900-4cd9-9bec-7d2875d8946b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import webrtcvad\n",
    "# import numpy as np\n",
    "# import librosa\n",
    "\n",
    "# def frame_generator(frame_duration_ms, audio, sample_rate):\n",
    "#     \"\"\"\n",
    "#     Generates audio frames from PCM audio data.\n",
    "#     Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.\n",
    "#     \"\"\"\n",
    "#     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # Convert to byte length\n",
    "#     offset = 0\n",
    "#     while offset + n < len(audio):\n",
    "#         yield audio[offset:offset + n]\n",
    "#         offset += n\n",
    "\n",
    "# def apply_vad(audio, sample_rate):\n",
    "#     vad = webrtcvad.Vad()\n",
    "#     vad.set_mode(1)\n",
    "#     print(\"Applying VAD with mode:\", 1)\n",
    "#     print(\"Audio length:\", len(audio), \"bytes\")\n",
    "#     print(\"Sample rate:\", sample_rate)\n",
    "\n",
    "#     # Ensure mono and correct sample rate\n",
    "#     if sample_rate != 16000:\n",
    "#         print(\"Sample rate issue detected.\")\n",
    "#         raise ValueError(\"Sample rate must be 16000 Hz\")\n",
    "\n",
    "#     frames = frame_generator(30, audio, sample_rate)\n",
    "#     frames = list(frames)\n",
    "\n",
    "#     print(\"Number of frames:\", len(frames))\n",
    "#     try:\n",
    "#         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]\n",
    "\n",
    "#         if not segments:\n",
    "#             raise Exception(\"No voiced frames detected.\")\n",
    "\n",
    "#         return b''.join(segments)\n",
    "\n",
    "#     except Exception as e:\n",
    "#         print(f\"Error during VAD frame processing: {e}\")\n",
    "#         raise\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cc8e709e-2798-40c4-9ad0-03b6ef43ff5a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading: \"https://github.com/snakers4/silero-vad/zipball/master\" to C:\\Users\\WCHL/.cache\\torch\\hub\\master.zip\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error during Silero VAD processing: Failed to open the input \"path_to_your_audio.wav\" (No such file or directory).\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torchaudio\n",
    "from silero_vad import get_speech_timestamps, read_audio, save_audio\n",
    "\n",
    "def apply_silero_vad(audio_file_path):\n",
    "    \"\"\"\n",
    "    Applies Silero VAD to an audio file and returns the processed audio\n",
    "    containing only the voiced segments.\n",
    "    \"\"\"\n",
    "    # Load the Silero VAD model\n",
    "    model = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)\n",
    "    \n",
    "    # Define helper utilities manually\n",
    "    def read_audio(path, sampling_rate=16000):\n",
    "        wav, sr = torchaudio.load(path)\n",
    "        if sr != sampling_rate:\n",
    "            wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)\n",
    "        return wav.squeeze(0)\n",
    "\n",
    "    def save_audio(path, tensor, sampling_rate=16000):\n",
    "        torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)\n",
    "\n",
    "    # Read the audio file\n",
    "    wav = read_audio(audio_file_path, sampling_rate=16000)\n",
    "\n",
    "    # Get timestamps for speech segments\n",
    "    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)\n",
    "\n",
    "    # If no speech detected, raise an exception\n",
    "    if not speech_timestamps:\n",
    "        raise Exception(\"No voiced frames detected using Silero VAD.\")\n",
    "\n",
    "    # Combine the voiced segments\n",
    "    voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])\n",
    "\n",
    "    # Save the processed audio if needed\n",
    "    save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)\n",
    "\n",
    "    # Convert to numpy bytes for further processing\n",
    "    return voiced_audio.numpy().tobytes()\n",
    "\n",
    "# Example usage\n",
    "try:\n",
    "    processed_audio = apply_silero_vad(\"path_to_your_audio.wav\")\n",
    "    print(\"VAD completed successfully!\")\n",
    "except Exception as e:\n",
    "    print(f\"Error during Silero VAD processing: {e}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d4ea1a1-9f19-4603-b15f-b1389fddc81b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}