import torch
import torch.nn.functional as F
import torchaudio
import gradio as gr
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# Load the pretrained audio-emotion classifier and its feature extractor
model_id = "Hatman/audio-emotion-detection"
extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModelForAudioClassification.from_pretrained(model_id)


def classify_emotion(audio):
    waveform, sr = torchaudio.load(audio)

    # Downmix to mono if the recording has multiple channels
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to the 16 kHz rate the model expects
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

    # Run inference without tracking gradients
    inputs = extractor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = F.softmax(logits, dim=-1)[0]

    # Map class indices to emotion labels and return a score per label
    return {model.config.id2label[i]: float(probs[i]) for i in range(len(probs))}


gr.Interface(
    fn=classify_emotion,
    inputs=gr.Audio(type="filepath", label="Upload or record audio"),
    outputs=gr.Label(num_top_classes=None),
    title="Speech Emotion Detection",
).launch()
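# Quick usage sketch (assumes the script above is saved as app.py and that
# torch, torchaudio, transformers, and gradio are installed):
#
#   python app.py
#
# launch() starts a local Gradio server (http://127.0.0.1:7860 by default);
# passing launch(share=True) instead creates a temporary public link.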