import gradio as gr
import librosa
import numpy as np
import os
import soundfile as sf
import tempfile
from collections import Counter
from speechbrain.inference.interfaces import foreign_class

# Load the pre-trained SpeechBrain classifier (emotion recognition with wav2vec2 on IEMOCAP)
classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    run_opts={"device": "cpu"}  # change to {"device": "cuda"} if a GPU is available
)
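# For reference: classify_file returns four values, namely the class posterior
# probabilities, the top score, the predicted index, and the text label as a
# single-element list. A minimal sketch (the filename is a placeholder):
#   out_prob, score, index, text_lab = classifier.classify_file("anger.wav")
#   print(text_lab)  # e.g. ['ang']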
# Try to import noisereduce (if unavailable, noise reduction is skipped)
try:
    import noisereduce as nr
    NOISEREDUCE_AVAILABLE = True
except ImportError:
    NOISEREDUCE_AVAILABLE = False
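# To enable the noise-reduction option, install the package first:
#   pip install noisereduce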
def preprocess_audio(audio_file, apply_noise_reduction=False):
    """
    Load and preprocess the audio file:
      - Convert to 16 kHz mono.
      - Optionally apply noise reduction.
      - Normalize the audio.
    The processed audio is saved to a temporary WAV file and its path is returned.
    """
    # Load audio, resampled to 16 kHz mono
    y, sr = librosa.load(audio_file, sr=16000, mono=True)

    # Apply noise reduction if requested and available
    if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
        y = nr.reduce_noise(y=y, sr=sr)

    # Peak-normalize the audio to the range [-1, 1]
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))

    # Write the preprocessed audio to a temporary WAV file; close the handle
    # first so soundfile can write to the path on all platforms
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_file.close()
    sf.write(temp_file.name, y, sr)
    return temp_file.name
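# Minimal usage sketch (the filename is a placeholder): preprocess a clip,
# classify the temporary file, then delete it:
#   path = preprocess_audio("clip.wav", apply_noise_reduction=True)
#   _, _, _, label = classifier.classify_file(path)
#   os.remove(path)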
def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    For audio longer than segment_duration, split the file into overlapping
    segments, predict the emotion for each segment, and return the
    majority-voted label.
    """
    # Load audio
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    total_duration = librosa.get_duration(y=y, sr=sr)

    # If the audio is short, just process it directly
    if total_duration <= segment_duration:
        temp_file = preprocess_audio(audio_file, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        os.remove(temp_file)
        return label[0]  # classify_file returns the text label as a single-element list

    # Split the audio into overlapping segments
    step = segment_duration - overlap
    segments = []
    for start in np.arange(0, total_duration - segment_duration + 0.001, step):
        start_sample = int(start * sr)
        end_sample = int((start + segment_duration) * sr)
        segment_audio = y[start_sample:end_sample]
        # Save the segment to a temporary WAV file
        temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        temp_seg.close()
        sf.write(temp_seg.name, segment_audio, sr)
        segments.append(temp_seg.name)

    # Classify each segment and collect the predicted labels
    predictions = []
    for seg in segments:
        temp_file = preprocess_audio(seg, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        predictions.append(label[0])  # unwrap so the labels are hashable strings
        os.remove(temp_file)
        os.remove(seg)

    # Determine the final label via majority vote
    vote = Counter(predictions)
    most_common = vote.most_common(1)[0][0]
    return most_common
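# Worked example: with the defaults (segment_duration=3.0, overlap=1.0) the
# step is 2.0 s, so a 10 s clip yields windows starting at 0, 2, 4, and 6 s;
# each window is classified independently and the most frequent label wins
# (the filename below is a placeholder):
#   label = ensemble_prediction("long_clip.wav", segment_duration=3.0, overlap=1.0)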
def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False):
    """
    Main prediction function.
      - If use_ensemble is True, the audio is split into segments and the
        segment predictions are aggregated by majority vote.
      - Otherwise, the audio is processed as a whole.
    """
    try:
        if use_ensemble:
            label = ensemble_prediction(audio_file, apply_noise_reduction)
        else:
            temp_file = preprocess_audio(audio_file, apply_noise_reduction)
            _, _, _, label = classifier.classify_file(temp_file)
            os.remove(temp_file)
            label = label[0]  # unwrap the single-element list returned by classify_file
        return label
    except Exception as e:
        return f"Error processing file: {str(e)}"
# Define the Gradio interface with options for ensemble prediction and noise reduction
iface = gr.Interface(
    fn=predict_emotion,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False),
        gr.Checkbox(label="Apply Noise Reduction", value=False)
    ],
    outputs="text",
    title="Enhanced Emotion Recognition",
    description=(
        "Upload an audio file (it is resampled to 16 kHz mono) and the model will predict "
        "the emotion using a wav2vec2 model fine-tuned on IEMOCAP data.\n\n"
        "Options:\n"
        " - Use Ensemble Prediction: for long audio, the file is split into segments and the predictions are aggregated.\n"
        " - Apply Noise Reduction: applies a noise reduction filter before classification (requires the noisereduce library)."
    )
)
if __name__ == "__main__":
    iface.launch()
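# launch() serves the app locally; iface.launch(share=True) would additionally
# create a temporary public link. On Hugging Face Spaces, the plain launch()
# call above is all that is needed.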