import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import pipeline
import librosa
import soundfile as sf


class EmotionRecognizer:
    def __init__(self):
        self.classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1,
        )
        self.target_sr = 16000  # Sample rate expected by the model
        self.max_duration = 10  # Max audio duration in seconds

    def process_audio(self, audio_path):
        try:
            # Load audio file with soundfile (works reliably in Hugging Face Spaces)
            audio, orig_sr = sf.read(audio_path)

            # Convert stereo to mono if needed
            if audio.ndim > 1:
                audio = np.mean(audio, axis=1)

            # Resample to the model's sample rate if necessary
            if orig_sr != self.target_sr:
                audio = librosa.resample(
                    y=audio.astype(np.float32),
                    orig_sr=orig_sr,
                    target_sr=self.target_sr,
                )
            else:
                audio = audio.astype(np.float32)

            # Normalize audio to unit peak amplitude
            audio = librosa.util.normalize(audio)

            # Trim or zero-pad audio to the maximum duration
            max_samples = self.max_duration * self.target_sr
            if len(audio) > max_samples:
                audio = audio[:max_samples]
            else:
                audio = np.pad(audio, (0, max_samples - len(audio)))

            # Run classification
            results = self.classifier(
                {"array": audio, "sampling_rate": self.target_sr}
            )

            # Format output
            labels = [res["label"] for res in results]
            scores = [res["score"] * 100 for res in results]

            text_output = "\n".join(
                f"{label}: {score:.2f}%" for label, score in zip(labels, scores)
            )

            # gr.BarPlot expects a DataFrame with the configured x/y columns
            plot_data = pd.DataFrame({"labels": labels, "values": scores})

            return text_output, plot_data

        except Exception as e:
            error_msg = f"Error processing audio: {e}"
            print(error_msg)
            return error_msg, None


def create_interface():
    recognizer = EmotionRecognizer()

    with gr.Blocks(title="Audio Emotion Recognition") as interface:
        gr.Markdown("# 🎙️ Audio Emotion Recognition")
        gr.Markdown("Record or upload audio (English speech, 3-10 seconds)")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Input Audio",
                    waveform_options={"waveform_progress_color": "#FF0066"},
                )
                submit_btn = gr.Button("Analyze", variant="primary")

            with gr.Column():
                text_output = gr.Textbox(
                    label="Emotion Analysis Results",
                    interactive=False,
                )
                plot_output = gr.BarPlot(
                    label="Confidence Scores",
                    x="labels",
                    y="values",
                    color="labels",
                    height=300,
                )

        submit_btn.click(
            fn=recognizer.process_audio,
            inputs=audio_input,
            outputs=[text_output, plot_output],
        )

        gr.Examples(
            examples=[
                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_angry.wav",
                "https://huggingface.co/spaces/echalabres/emotion-recognition/raw/main/example_happy.wav",
            ],
            inputs=audio_input,
            outputs=[text_output, plot_output],
            fn=recognizer.process_audio,
            cache_examples=True,
        )

    return interface


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
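
# ---------------------------------------------------------------------------
# Note: the dependency list below is an assumption, not part of the original
# app; it sketches the requirements.txt this Space would plausibly need so the
# imports above (including pandas, used for the BarPlot DataFrame) resolve.
# Pin versions as appropriate for your environment.
#
#   gradio
#   transformers
#   torch
#   librosa
#   soundfile
#   numpy
#   pandas
# ---------------------------------------------------------------------------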