import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import pipeline
import librosa
import soundfile as sf


class EmotionRecognizer:
    def __init__(self):
        # Speech-emotion classifier fine-tuned on English speech; runs on GPU if available
        self.classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1,
        )
        self.target_sr = 16000  # Sample rate expected by the model
        self.max_duration = 10  # Max audio duration in seconds

    def process_audio(self, audio_path):
        try:
            # Load the audio file with soundfile (works reliably in Hugging Face Spaces)
            audio, orig_sr = sf.read(audio_path)

            # Convert stereo to mono if needed
            if audio.ndim > 1:
                audio = np.mean(audio, axis=1)

            # Resample to the model's sample rate if necessary
            if orig_sr != self.target_sr:
                audio = librosa.resample(
                    y=audio.astype(np.float32),
                    orig_sr=orig_sr,
                    target_sr=self.target_sr,
                )
            else:
                audio = audio.astype(np.float32)

            # Peak-normalize the waveform
            audio = librosa.util.normalize(audio)

            # Trim or zero-pad to exactly max_duration seconds
            max_samples = self.max_duration * self.target_sr
            if len(audio) > max_samples:
                audio = audio[:max_samples]
            else:
                audio = np.pad(audio, (0, max_samples - len(audio)))

            # Run the classifier on the preprocessed waveform
            results = self.classifier(
                {"array": audio, "sampling_rate": self.target_sr}
            )

            # Format output: the pipeline returns a list of {"label", "score"} dicts
            labels = [res["label"] for res in results]
            scores = [res["score"] * 100 for res in results]

            text_output = "\n".join(
                f"{label}: {score:.2f}%" for label, score in zip(labels, scores)
            )

            # gr.BarPlot expects a pandas DataFrame rather than a plain dict
            plot_data = pd.DataFrame({"labels": labels, "values": scores})

            return text_output, plot_data

        except Exception as e:
            error_msg = f"Error processing audio: {e}"
            print(error_msg)
            return error_msg, None


def create_interface():
    recognizer = EmotionRecognizer()

    with gr.Blocks(title="Audio Emotion Recognition") as interface:
        gr.Markdown("# 🎙️ Audio Emotion Recognition")
        gr.Markdown("Record or upload audio (English speech, 3-10 seconds)")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Input Audio",
                    waveform_options={"waveform_progress_color": "#FF0066"},
                )
                submit_btn = gr.Button("Analyze", variant="primary")

            with gr.Column():
                text_output = gr.Textbox(
                    label="Emotion Analysis Results",
                    interactive=False,
                )
                plot_output = gr.BarPlot(
                    label="Confidence Scores",
                    x="labels",
                    y="values",
                    color="labels",
                    height=300,
                )

        submit_btn.click(
            fn=recognizer.process_audio,
            inputs=audio_input,
            outputs=[text_output, plot_output],
        )

        gr.Examples(
            examples=[
                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_angry.wav",
                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_happy.wav",
            ],
            inputs=audio_input,
            outputs=[text_output, plot_output],
            fn=recognizer.process_audio,
            cache_examples=True,
        )

    return interface


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()