import numpy as np
import librosa
import tensorflow as tf
import gradio as gr


class SpeechEmotionRecognizer:
    def __init__(self, model_path):
        self.model = tf.keras.models.load_model(model_path)
        self.sample_rate = 22050
        self.duration = 4  # seconds
        self.emotion_labels = ['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad']

    def extract_melspectrogram(self, audio_path):
        try:
            # Load and resample audio to the model's expected rate
            audio, _ = librosa.load(audio_path, sr=self.sample_rate)

            # Pad or truncate so the clip is exactly 4 seconds of samples
            target_samples = self.sample_rate * self.duration
            if len(audio) < target_samples:
                audio = np.pad(audio, (0, int(target_samples - len(audio))))
            else:
                audio = audio[:int(target_samples)]

            # Extract the mel-spectrogram
            mel_spec = librosa.feature.melspectrogram(
                y=audio,
                sr=self.sample_rate,
                n_mels=128,
                n_fft=2048,
                hop_length=512,
                win_length=2048,
                fmax=8000
            )
            mel_spec_db = librosa.power_to_db(mel_spec + 1e-10, ref=np.max)

            # Standardize to zero mean, unit variance
            mean = np.mean(mel_spec_db)
            std = np.std(mel_spec_db)
            mel_spec_norm = (mel_spec_db - mean) / (std + 1e-10)

            # Clip extreme values
            mel_spec_norm = np.clip(mel_spec_norm, -5, 5)

            # Pad or truncate the time axis to exactly 173 frames,
            # giving the model's expected input shape (128, 173)
            target_frames = 173
            if mel_spec_norm.shape[1] > target_frames:
                mel_spec_norm = mel_spec_norm[:, :target_frames]
            elif mel_spec_norm.shape[1] < target_frames:
                pad_width = target_frames - mel_spec_norm.shape[1]
                mel_spec_norm = np.pad(mel_spec_norm, ((0, 0), (0, pad_width)), mode='constant')

            # Add batch and channel dimensions: (1, 128, 173, 1)
            return mel_spec_norm.reshape((1, 128, 173, 1))
        except Exception as e:
            raise gr.Error(f"Error processing audio: {str(e)}")

    def predict_emotion(self, audio_path):
        try:
            # Extract features
            mel_spec = self.extract_melspectrogram(audio_path)

            # Make prediction
            prediction = self.model.predict(mel_spec)

            # Map each emotion label to its confidence score
            results = {emotion: float(pred)
                       for emotion, pred in zip(self.emotion_labels, prediction[0])}
            return results
        except Exception as e:
            raise gr.Error(f"Prediction error: {str(e)}")


# Initialize the model
recognizer = SpeechEmotionRecognizer('final_model_conv2d_1K_1.keras')


# Gradio callback: validate the input and run inference
def process_audio(audio):
    if audio is None:
        raise gr.Error("Please provide an audio input")
    results = recognizer.predict_emotion(audio)
    return results


# Create the Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(
            label="Record audio (4 seconds)",
            type="filepath",
            sources=["microphone"]  # Gradio 4.x takes a 'sources' list rather than the older 'source' argument
        )
    ],
    outputs=gr.Label(num_top_classes=6),
    title="Speech Emotion Recognition",
    description="Record a 4-second audio clip to detect the emotion in your voice."
)

# Launch the app
if __name__ == "__main__":
    demo.launch()