import gradio as gr
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio

# Load model and processor
model_name = "Dpngtm/wave2vec2-emotion-recognition"  # Replace with your model's Hugging Face Hub path
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Define device (use GPU if available) and put the model in inference mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Preprocessing and inference function
def recognize_emotion(audio):
    # Load audio and resample to the 16 kHz rate the model expects
    speech_array, sampling_rate = torchaudio.load(audio)
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech_array = resampler(speech_array)
    speech_array = speech_array.mean(dim=0).numpy()  # Convert to mono if multi-channel

    # Process input and run the model
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_id = torch.argmax(logits, dim=-1).item()

    # Emotion labels mapped to indices (must match the order used during training)
    emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
    return emotion_labels[predicted_id]

# Gradio interface
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),  # Gradio 4.x; use source="microphone" on Gradio 3.x
    outputs="text",
    title="Emotion Recognition with Wav2Vec2",
    description="Upload or record audio, and the model will predict the emotion.",
)

# Launch the app
interface.launch()
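
# A minimal sketch of calling the inference function directly, without the Gradio UI
# (handy for a quick local check). "path/to/clip.wav" is a hypothetical audio file path,
# not part of the original app; run this in place of interface.launch() if you only
# need a one-off prediction.
#
#   print(recognize_emotion("path/to/clip.wav"))  # prints one of the eight emotion labels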