import gradio as gr
import numpy as np
import pandas as pd
import torch
import librosa
from transformers import pipeline


class EmotionRecognizer:
    def __init__(self):
        # Speech emotion recognition pipeline; runs on GPU when available
        self.classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1,
        )
        self.sample_rate = 16000

    def process_audio(self, audio_input):
        try:
            # Extract sample rate and audio data from the Gradio (sr, array) tuple
            sample_rate, audio_data = audio_input

            # Convert stereo to mono if necessary
            if audio_data.ndim > 1:
                audio_data = np.mean(audio_data, axis=1)

            # Convert to float32 and normalize to [-1, 1]
            audio_data = audio_data.astype(np.float32)
            peak = np.max(np.abs(audio_data))
            if peak > 1.0:
                audio_data = audio_data / peak

            # Resample to the model's expected rate if necessary
            if sample_rate != self.sample_rate:
                audio_data = librosa.resample(
                    y=audio_data,
                    orig_sr=sample_rate,
                    target_sr=self.sample_rate,
                )

            # Pad audio shorter than 1 second; keep only the first 10 seconds otherwise
            if len(audio_data) < self.sample_rate:
                audio_data = np.pad(audio_data, (0, self.sample_rate - len(audio_data)))
            elif len(audio_data) > 10 * self.sample_rate:
                audio_data = audio_data[:10 * self.sample_rate]

            # Make prediction
            result = self.classifier(
                {"array": audio_data, "sampling_rate": self.sample_rate}
            )

            # Format text results
            emotions_text = "\n".join(
                f"{pred['label']}: {pred['score'] * 100:.2f}%" for pred in result
            )

            # Prepare plot data as a DataFrame so gr.BarPlot can render it
            plot_data = pd.DataFrame(
                {
                    "Emotion": [pred["label"] for pred in result],
                    "Confidence": [pred["score"] * 100 for pred in result],
                }
            )

            return emotions_text, plot_data

        except Exception as e:
            print(f"Error details: {e}")
            return f"Error processing audio: {e}", None


def create_interface():
    recognizer = EmotionRecognizer()

    def process_audio_file(audio):
        if audio is None:
            return "Please provide an audio input.", None
        return recognizer.process_audio(audio)

    with gr.Blocks() as interface:
        gr.Markdown("# Audio Emotion Recognition")
        gr.Markdown(
            "Record or upload audio to analyze the emotional content. "
            "The model works best with clear speech in English."
        )

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    type="numpy",
                    sources=["microphone", "upload"],
                )
                analyze_btn = gr.Button("Analyze Emotion")
                gr.Markdown(
                    "Note: Audio will be automatically converted to mono "
                    "and resampled if needed."
                )

            with gr.Column():
                output_text = gr.Textbox(label="Results", lines=5)
                output_plot = gr.BarPlot(
                    x="Emotion",
                    y="Confidence",
                    title="Emotion Confidence Scores",
                    x_title="Emotions",
                    y_title="Confidence (%)",
                )

        analyze_btn.click(
            fn=process_audio_file,
            inputs=[audio_input],
            outputs=[output_text, output_plot],
        )

    return interface


if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)
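
# Usage note (assumptions, not stated in the original script): the app expects
# gradio, transformers, torch, librosa, and pandas to be installed, e.g.
#   pip install gradio transformers torch librosa pandas
# On first run the wav2vec2 emotion model is downloaded from the Hugging Face Hub;
# launch(share=True) serves the interface locally and via a temporary public link.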