import gradio as gr
import librosa
import numpy as np
import os
import soundfile as sf
import tempfile
from collections import Counter

from speechbrain.inference.interfaces import foreign_class

# Load the pre-trained SpeechBrain classifier (emotion recognition with wav2vec2 on IEMOCAP)
classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    run_opts={"device": "cpu"},  # Change to {"device": "cuda"} if a GPU is available
)
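# Per the model card, classify_file returns (out_prob, score, index, text_lab),
# where text_lab is the decoded emotion label (SpeechBrain's label encoder
# returns it as a one-element list).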
# Try to import noisereduce (if not available, noise reduction will be skipped)
try:
import noisereduce as nr
NOISEREDUCE_AVAILABLE = True
except ImportError:
NOISEREDUCE_AVAILABLE = False
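# noisereduce performs spectral gating; install it with `pip install noisereduce`
# to enable the "Apply Noise Reduction" option in the UI.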


def preprocess_audio(audio_file, apply_noise_reduction=False):
    """
    Load and preprocess an audio file:
    - Convert to 16 kHz mono.
    - Optionally apply noise reduction.
    - Peak-normalize the audio.
    The processed audio is saved to a temporary WAV file and its path is returned.
    """
    # Load audio, resampled to 16 kHz mono
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    # Apply noise reduction if requested and available
    if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
        y = nr.reduce_noise(y=y, sr=sr)
    # Peak-normalize to the range [-1, 1]
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak
    # Write the preprocessed audio to a temporary WAV file; close the handle
    # first so the write also works on Windows
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_file.close()
    sf.write(temp_file.name, y, sr)
    return temp_file.name
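# Example (hypothetical path): preprocess_audio("clip.wav", apply_noise_reduction=True)
# returns the path of a 16 kHz mono, peak-normalized temporary WAV file that the
# caller is expected to delete.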


def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    For audio longer than segment_duration (seconds), split the file into
    overlapping segments, predict the emotion for each segment, and return
    the majority-voted label.
    """
    # Load audio, resampled to 16 kHz mono
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    total_duration = librosa.get_duration(y=y, sr=sr)
    # If the audio is short enough, process it directly
    if total_duration <= segment_duration:
        temp_file = preprocess_audio(audio_file, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        os.remove(temp_file)
        # Unwrap the one-element list so a plain string is returned
        return label[0] if isinstance(label, list) else label
    # Split the audio into overlapping segments
    step = segment_duration - overlap
    segments = []
    for start in np.arange(0, total_duration - segment_duration + 0.001, step):
        start_sample = int(start * sr)
        end_sample = int((start + segment_duration) * sr)
        segment_audio = y[start_sample:end_sample]
        # Save the segment to a temporary WAV file
        temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        temp_seg.close()
        sf.write(temp_seg.name, segment_audio, sr)
        segments.append(temp_seg.name)
    # Classify each segment and collect predictions
    predictions = []
    for seg in segments:
        temp_file = preprocess_audio(seg, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        # Unwrap the one-element list so labels are hashable for Counter
        predictions.append(label[0] if isinstance(label, list) else label)
        os.remove(temp_file)
        os.remove(seg)
    # Final label via majority vote
    return Counter(predictions).most_common(1)[0][0]
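# Windowing example: with segment_duration=3.0 and overlap=1.0 the step is 2.0 s,
# so a 10 s clip produces windows at 0-3, 2-5, 4-7, and 6-9 s; any tail shorter
# than a full window (here, 9-10 s) is not scored on its own.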


def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False):
    """
    Main prediction function.
    - If use_ensemble is True, the audio is split into segments and the
      per-segment predictions are aggregated by majority vote.
    - Otherwise, the audio is processed as a whole.
    """
    try:
        if use_ensemble:
            label = ensemble_prediction(audio_file, apply_noise_reduction)
        else:
            temp_file = preprocess_audio(audio_file, apply_noise_reduction)
            _, _, _, label = classifier.classify_file(temp_file)
            os.remove(temp_file)
            # Unwrap the one-element list returned by the classifier
            label = label[0] if isinstance(label, list) else label
        return label
    except Exception as e:
        return f"Error processing file: {e}"


# Define the Gradio interface, with options for ensemble prediction and noise reduction
iface = gr.Interface(
    fn=predict_emotion,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False),
        gr.Checkbox(label="Apply Noise Reduction", value=False),
    ],
    outputs="text",
    title="Enhanced Emotion Recognition",
    description=(
        "Upload an audio file (any sample rate; it is resampled to 16 kHz mono) and the model "
        "will predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data.\n\n"
        "Options:\n"
        " - Use Ensemble Prediction: for long audio, the file is split into segments and the predictions are aggregated.\n"
        " - Apply Noise Reduction: applies a noise-reduction filter before classification (requires the noisereduce library)."
    ),
)
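
# launch() accepts standard Gradio options (e.g., share=True for a temporary
# public link); the defaults are used here.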
if __name__ == "__main__":
    iface.launch()