import gradio as gr
import numpy as np
import pandas as pd
import torch
import librosa
from transformers import pipeline


class EmotionRecognizer:
    def __init__(self):
        # Speech emotion recognition pipeline; runs on GPU when available
        self.classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1,
        )
        self.sample_rate = 16000

    def process_audio(self, audio_input):
        try:
            # Extract sample rate and audio data from the Gradio (sr, array) tuple
            sample_rate, audio_data = audio_input

            # Convert stereo to mono if necessary
            if audio_data.ndim > 1:
                audio_data = np.mean(audio_data, axis=1)

            # Convert to float32 and normalize to [-1, 1]
            audio_data = audio_data.astype(np.float32)
            peak = np.max(np.abs(audio_data))
            if peak > 1.0:
                audio_data = audio_data / peak

            # Resample to the model's expected rate if necessary
            if sample_rate != self.sample_rate:
                audio_data = librosa.resample(
                    y=audio_data,
                    orig_sr=sample_rate,
                    target_sr=self.sample_rate,
                )

            # Pad audio shorter than 1 second; keep only the first 10 seconds otherwise
            if len(audio_data) < self.sample_rate:
                audio_data = np.pad(audio_data, (0, self.sample_rate - len(audio_data)))
            elif len(audio_data) > 10 * self.sample_rate:
                audio_data = audio_data[:10 * self.sample_rate]

            # Make prediction
            result = self.classifier(
                {"array": audio_data, "sampling_rate": self.sample_rate}
            )

            # Format text results
            emotions_text = "\n".join(
                f"{pred['label']}: {pred['score'] * 100:.2f}%" for pred in result
            )

            # Prepare plot data as a DataFrame so gr.BarPlot can render it
            plot_data = pd.DataFrame(
                {
                    "Emotion": [pred["label"] for pred in result],
                    "Confidence": [pred["score"] * 100 for pred in result],
                }
            )

            return emotions_text, plot_data

        except Exception as e:
            print(f"Error details: {e}")
            return f"Error processing audio: {e}", None


def create_interface():
    recognizer = EmotionRecognizer()

    def process_audio_file(audio):
        if audio is None:
            return "Please provide an audio input.", None
        return recognizer.process_audio(audio)

    with gr.Blocks() as interface:
        gr.Markdown("# Audio Emotion Recognition")
        gr.Markdown(
            "Record or upload audio to analyze the emotional content. "
            "The model works best with clear speech in English."
        )

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    type="numpy",
                    sources=["microphone", "upload"],
                )
                analyze_btn = gr.Button("Analyze Emotion")
                gr.Markdown(
                    "Note: Audio will be automatically converted to mono "
                    "and resampled if needed."
                )

            with gr.Column():
                output_text = gr.Textbox(label="Results", lines=5)
                output_plot = gr.BarPlot(
                    x="Emotion",
                    y="Confidence",
                    title="Emotion Confidence Scores",
                    x_title="Emotions",
                    y_title="Confidence (%)",
                )

        analyze_btn.click(
            fn=process_audio_file,
            inputs=[audio_input],
            outputs=[output_text, output_plot],
        )

    return interface


if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)
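
# Usage note (assumptions, not stated in the original script): the app expects
# gradio, transformers, torch, librosa, and pandas to be installed, e.g.
#   pip install gradio transformers torch librosa pandas
# On first run the wav2vec2 emotion model is downloaded from the Hugging Face Hub;
# launch(share=True) serves the interface locally and via a temporary public link.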