Spaces:

cdactvm
/

Tamil_ASR_Demo

Running

App Files Files Community

Tamil_ASR_Demo / applyVad.py

cdactvm

Upload 13 files

cd1b576 verified 7 months ago

raw

history blame

6.94 kB

	#!/usr/bin/env python
	# coding: utf-8

	# In[ ]:


	# import webrtcvad
	# import numpy as np
	# import librosa
	# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
	# '''
	# Voice Activity Detection (VAD): It is a technique used to determine whether a segment of audio contains speech.
	# This is useful in noisy environments where you want to filter out non-speech parts of the audio.
	# webrtcvad: This is a Python package based on the VAD from the WebRTC (Web Real-Time Communication) project.
	# It helps detect speech in small chunks of audio.
	# '''
	# vad = webrtcvad.Vad()
	# audio_int16 = np.int16(audio * 32767)
	# frame_size = int(sr * frame_duration / 1000)
	# frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]
	# voiced_audio = np.concatenate([frame for frame in frames if vad.is_speech(frame.tobytes(), sample_rate=sr)])
	# voiced_audio = np.float32(voiced_audio) / 32767
	# return voiced_audio


	# In[1]:


	# import webrtcvad
	# import numpy as np
	# import librosa

	# def apply_vad(audio, sr):
	# # Ensure that sample rate is supported by webrtcvad
	# if sr not in [8000, 16000, 32000, 48000]:
	# raise ValueError("Sample rate must be one of: 8000, 16000, 32000, or 48000 Hz")

	# vad = webrtcvad.Vad(2) # Aggressiveness mode: 0-3
	# frame_duration_ms = 30 # Use 10ms, 20ms, or 30ms frames only

	# # Convert to PCM 16-bit and calculate frame length
	# audio_pcm16 = (audio * 32767).astype(np.int16)
	# frame_length = int(sr * frame_duration_ms / 1000) * 2 # 2 bytes per sample for 16-bit PCM

	# # Create frames ensuring correct frame size
	# frames = [
	# audio_pcm16[i:i + frame_length].tobytes()
	# for i in range(0, len(audio_pcm16) - frame_length, frame_length)
	# ]

	# # Apply VAD
	# voiced_frames = []
	# for frame in frames:
	# try:
	# if vad.is_speech(frame, sample_rate=sr):
	# voiced_frames.append(frame)
	# except Exception as e:
	# print(f"Error during VAD frame processing: {e}")

	# if not voiced_frames:
	# raise Exception("No voiced frames detected.")

	# # Concatenate voiced frames
	# voiced_audio = b''.join(voiced_frames)
	# return np.frombuffer(voiced_audio, dtype=np.int16) / 32767.0


	# In[ ]:


	# import webrtcvad
	# import numpy as np
	# import librosa

	# def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
	# '''
	# Voice Activity Detection (VAD): Detects speech in audio.
	# '''
	# vad = webrtcvad.Vad(aggressiveness)

	# # Resample to 16000 Hz if not already (recommended for better compatibility)
	# if sr != 16000:
	# audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
	# sr = 16000

	# # Convert to 16-bit PCM format expected by webrtcvad
	# audio_int16 = np.int16(audio * 32767)

	# # Ensure frame size matches WebRTC's expected lengths
	# frame_size = int(sr * frame_duration / 1000)
	# if frame_size % 2 != 0:
	# frame_size -= 1 # Make sure it's even to avoid processing issues

	# frames = [audio_int16[i:i + frame_size] for i in range(0, len(audio_int16), frame_size)]

	# # Filter out non-speech frames
	# voiced_frames = []
	# for frame in frames:
	# if len(frame) == frame_size and vad.is_speech(frame.tobytes(), sample_rate=sr):
	# voiced_frames.append(frame)

	# # Concatenate the voiced frames
	# voiced_audio = np.concatenate(voiced_frames)
	# voiced_audio = np.float32(voiced_audio) / 32767

	# return voiced_audio


	# In[3]:


	# import webrtcvad
	# import numpy as np
	# import librosa

	# def frame_generator(frame_duration_ms, audio, sample_rate):
	# """
	# Generates audio frames from PCM audio data.
	# Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.
	# """
	# n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) # Convert to byte length
	# offset = 0
	# while offset + n < len(audio):
	# yield audio[offset:offset + n]
	# offset += n

	# def apply_vad(audio, sample_rate):
	# vad = webrtcvad.Vad()
	# vad.set_mode(1)
	# print("Applying VAD with mode:", 1)
	# print("Audio length:", len(audio), "bytes")
	# print("Sample rate:", sample_rate)

	# # Ensure mono and correct sample rate
	# if sample_rate != 16000:
	# print("Sample rate issue detected.")
	# raise ValueError("Sample rate must be 16000 Hz")

	# frames = frame_generator(30, audio, sample_rate)
	# frames = list(frames)

	# print("Number of frames:", len(frames))
	# try:
	# segments = [frame for frame in frames if vad.is_speech(frame, sample_rate)]

	# if not segments:
	# raise Exception("No voiced frames detected.")

	# return b''.join(segments)

	# except Exception as e:
	# print(f"Error during VAD frame processing: {e}")
	# raise


	# In[5]:


	import torch
	import torchaudio
	from silero_vad import get_speech_timestamps, read_audio, save_audio

	def apply_silero_vad(audio_file_path, output_path='processed_voiced_audio.wav'):
	    """
	    Apply Silero VAD to an audio file and return only the voiced segments.

	    The file is loaded with torchaudio, down-mixed to mono, resampled to
	    16 kHz when necessary, and passed to ``get_speech_timestamps``. The
	    detected speech segments are concatenated in order.

	    Args:
	        audio_file_path: Path of the audio file to process.
	        output_path: Where to write the voiced audio as a 16 kHz file.
	            Defaults to ``'processed_voiced_audio.wav'`` (the original
	            behaviour). Pass ``None`` to skip writing a file.

	    Returns:
	        bytes: The raw buffer of the concatenated voiced samples, obtained
	        via ``tensor.numpy().tobytes()`` (presumably float32 as produced by
	        ``torchaudio.load`` -- confirm against the consumer).

	    Raises:
	        ValueError: If no speech segment is detected. ``ValueError`` is a
	            subclass of ``Exception``, so existing ``except Exception``
	            handlers keep working.
	    """
	    target_sr = 16000

	    def _load_mono(path, sampling_rate):
	        # torchaudio.load returns a (channels, frames) tensor.
	        waveform, source_sr = torchaudio.load(path)
	        if source_sr != sampling_rate:
	            resampler = torchaudio.transforms.Resample(
	                orig_freq=source_sr, new_freq=sampling_rate
	            )
	            waveform = resampler(waveform)
	        # squeeze(0) is a no-op for multi-channel input, which would hand a
	        # 2-D tensor to the VAD; average the channels down to mono instead.
	        if waveform.dim() > 1 and waveform.shape[0] > 1:
	            return waveform.mean(dim=0)
	        return waveform.squeeze(0)

	    def _write_wav(path, tensor, sampling_rate):
	        torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)

	    # Load the Silero VAD model. The hub entry point returns a
	    # (model, utils) pair, so take the first element rather than passing
	    # the whole tuple on. force_reload is left at its default so the
	    # cached copy is reused instead of re-downloading on every call.
	    loaded = torch.hub.load('snakers4/silero-vad', 'silero_vad')
	    model = loaded[0] if isinstance(loaded, (tuple, list)) else loaded

	    # Read the audio file as a 1-D, 16 kHz waveform
	    wav = _load_mono(audio_file_path, target_sr)

	    # Get timestamps (in samples) for speech segments
	    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=target_sr)

	    # If no speech detected, raise an exception
	    if not speech_timestamps:
	        raise ValueError("No voiced frames detected using Silero VAD.")

	    # Combine the voiced segments
	    voiced_audio = torch.cat(
	        [wav[ts['start']:ts['end']] for ts in speech_timestamps]
	    )

	    # Save the processed audio only when an output path was requested
	    if output_path is not None:
	        _write_wav(output_path, voiced_audio, target_sr)

	    # Convert to numpy bytes for further processing
	    return voiced_audio.numpy().tobytes()

	# Example usage. Guarded so that importing this module from another file
	# does not trigger a model load and a failing run on the placeholder path.
	if __name__ == "__main__":
	    try:
	        processed_audio = apply_silero_vad("path_to_your_audio.wav")
	        print("VAD completed successfully!")
	    except Exception as e:
	        print(f"Error during Silero VAD processing: {e}")


	# In[ ]: