import torch
import torch.nn.functional as F
import torchaudio
import gradio as gr
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# Load the pretrained audio-emotion classifier and its feature extractor
model_id = "Hatman/audio-emotion-detection"
extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModelForAudioClassification.from_pretrained(model_id)

def classify_emotion(audio):
    waveform, sr = torchaudio.load(audio)
    # Downmix to mono if the recording has more than one channel
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to the 16 kHz rate the model expects
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    # Inference
    inputs = extractor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = F.softmax(logits, dim=-1)[0]
    # Return {label: probability} for every emotion class
    return {model.config.id2label[i]: float(probs[i]) for i in range(len(probs))}
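
# Quick sanity check outside the UI (a sketch only; "sample.wav" is a
# hypothetical local file, not part of this script):
# scores = classify_emotion("sample.wav")
# print(max(scores, key=scores.get), scores)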
gr.Interface(
    fn=classify_emotion,
    inputs=gr.Audio(type="filepath", label="Upload or record audio"),
    outputs=gr.Label(num_top_classes=None),
    title="Speech Emotion Detection",
).launch()