Spaces:

cdactvm
/

Tamil_ASR_Demo

Sleeping

File size: 6,943 Bytes

cd1b576

#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# import webrtcvad
# import numpy as np
# import librosa
# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
#     '''
#      Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech. 
#      This is useful in noisy environments where you want to filter out non-speech parts of the audio.
#      webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project. 
#      It helps detect speech in small chunks of audio.
#      '''
#     vad = webrtcvad.Vad()
#     audio_int16 = np.int16(audio * 32767)
#     frame_size = int(sr * frame_duration / 1000)
#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
#     voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])
#     voiced_audio = np.float32(voiced_audio) / 32767
#     return voiced_audio


# In[1]:


# import webrtcvad
# import numpy as np
# import librosa

# def apply_vad(audio, sr):
#     # Ensure that sample rate is supported by webrtcvad
#     if sr not in [8000, 16000, 32000, 48000]:
#         raise ValueError("Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz")

#     vad = webrtcvad.Vad(2)  # Aggressiveness mode: 0-3
#     frame_duration_ms = 30  # Use 10ms, 20ms, or 30ms frames only

#     # Convert to PCM 16-bit and calculate frame length
#     audio_pcm16 = (audio * 32767).astype(np.int16)
#     frame_length = int(sr * frame_duration_ms / 1000) * 2  # 2 bytes per sample for 16-bit PCM
    
#     # Create frames ensuring correct frame size
#     frames = [
#         audio_pcm16[i:i + frame_length].tobytes()
#         for i in range(0, len(audio_pcm16) - frame_length, frame_length)
#     ]

#     # Apply VAD
#     voiced_frames = []
#     for frame in frames:
#         try:
#             if vad.is_speech(frame, sample_rate=sr):
#                 voiced_frames.append(frame)
#         except Exception as e:
#             print(f"Error during VAD frame processing: {e}")

#     if not voiced_frames:
#         raise Exception("No voiced frames detected.")

#     # Concatenate voiced frames
#     voiced_audio = b''.join(voiced_frames)
#     return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0


# In[ ]:


# import webrtcvad
# import numpy as np
# import librosa

# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
#     '''
#     Voice Activity Detection (VAD): Detects speech in audio.
#     '''
#     vad = webrtcvad.Vad(aggressiveness)
    
#     # Resample to 16000 Hz if not already (recommended for better compatibility)
#     if sr != 16000:
#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
#         sr = 16000
    
#     # Convert to 16-bit PCM format expected by webrtcvad
#     audio_int16 = np.int16(audio * 32767)
    
#     # Ensure frame size matches WebRTC's expected lengths
#     frame_size = int(sr * frame_duration / 1000)
#     if frame_size % 2 != 0:
#         frame_size -= 1  # Make sure it's even to avoid processing issues
    
#     frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
    
#     # Filter out non-speech frames
#     voiced_frames = []
#     for frame in frames:
#         if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):
#             voiced_frames.append(frame)
    
#     # Concatenate the voiced frames
#     voiced_audio = np.concatenate(voiced_frames)
#     voiced_audio = np.float32(voiced_audio) / 32767
    
#     return voiced_audio


# In[3]:


# import webrtcvad
# import numpy as np
# import librosa

# def frame_generator(frame_duration_ms, audio, sample_rate):
#     """
#     Generates audio frames from PCM audio data.
#     Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.
#     """
#     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # Convert to byte length
#     offset = 0
#     while offset + n < len(audio):
#         yield audio[offset:offset + n]
#         offset += n

# def apply_vad(audio, sample_rate):
#     vad = webrtcvad.Vad()
#     vad.set_mode(1)
#     print("Applying VAD with mode:", 1)
#     print("Audio length:", len(audio), "bytes")
#     print("Sample rate:", sample_rate)

#     # Validate the sample rate (mono input is assumed, not checked)
#     if sample_rate != 16000:
#         print("Sample rate issue detected.")
#         raise ValueError("Sample rate must be 16000 Hz")

#     frames = frame_generator(30, audio, sample_rate)
#     frames = list(frames)

#     print("Number of frames:", len(frames))
#     try:
#         segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]

#         if not segments:
#             raise Exception("No voiced frames detected.")

#         return b''.join(segments)

#     except Exception as e:
#         print(f"Error during VAD frame processing: {e}")
#         raise


# In[5]:


import torch
import torchaudio
from silero_vad import get_speech_timestamps, read_audio, save_audio

# Cache for the Silero VAD model so it is loaded at most once per process.
_SILERO_VAD_MODEL = None


def _load_silero_vad_model():
    """Return the Silero VAD model, loading it through torch.hub on first use."""
    global _SILERO_VAD_MODEL
    if _SILERO_VAD_MODEL is None:
        # No force_reload: forcing a re-download on every call is slow and
        # makes the function fail whenever the network is unavailable.
        loaded = torch.hub.load('snakers4/silero-vad', 'silero_vad')
        # The hub entry point returns ``(model, utils)``; keep only the model.
        # A bare model is accepted too in case the entry point changes.
        if isinstance(loaded, (tuple, list)):
            loaded = loaded[0]
        _SILERO_VAD_MODEL = loaded
    return _SILERO_VAD_MODEL


def apply_silero_vad(audio_file_path, output_path='processed_voiced_audio.wav'):
    """Keep only the voiced segments of an audio file using Silero VAD.

    The file is loaded with torchaudio, mixed down to mono, resampled to
    16 kHz when necessary, and passed to ``get_speech_timestamps``. The
    detected speech segments are concatenated in order.

    Args:
        audio_file_path: Path of the audio file to process.
        output_path: Where to write the voiced-only audio as a 16 kHz file.
            Pass ``None`` to skip writing. Defaults to
            ``'processed_voiced_audio.wav'`` in the current directory.

    Returns:
        bytes: The raw buffer of the concatenated voiced samples
        (``tensor.numpy().tobytes()``). The samples keep the dtype produced
        by ``torchaudio.load`` -- presumably float32 with default settings,
        i.e. not 16-bit PCM; confirm before feeding a PCM-only consumer.

    Raises:
        ValueError: If no speech segment is detected in the file.
    """
    target_rate = 16000
    model = _load_silero_vad_model()

    # torchaudio returns a (channels, samples) tensor.
    wav, source_rate = torchaudio.load(audio_file_path)
    if wav.dim() > 1 and wav.size(0) > 1:
        # Average the channels: the VAD and the slicing below need 1-D audio,
        # and squeeze(0) alone would leave stereo input two-dimensional.
        wav = wav.mean(dim=0, keepdim=True)
    if source_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=source_rate, new_freq=target_rate
        )
        wav = resampler(wav)
    wav = wav.squeeze(0)

    # Each timestamp is a dict with 'start' and 'end' sample indices.
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=target_rate)
    if not speech_timestamps:
        raise ValueError("No voiced frames detected using Silero VAD.")

    voiced_audio = torch.cat(
        [wav[ts['start']:ts['end']] for ts in speech_timestamps]
    )

    if output_path is not None:
        # torchaudio.save expects a (channels, samples) tensor.
        torchaudio.save(output_path, voiced_audio.unsqueeze(0), target_rate)

    return voiced_audio.numpy().tobytes()

# Example usage. Guarded so that importing this module does not trigger a
# model download and a failing run on the placeholder path below; it still
# runs when the file is executed directly.
if __name__ == "__main__":
    try:
        processed_audio = apply_silero_vad("path_to_your_audio.wav")
    except Exception as e:
        # Top-level boundary of the demo: report any failure instead of crashing.
        print(f"Error during Silero VAD processing: {e}")
    else:
        print("VAD completed successfully!")


# In[ ]: