import torch
import torch.nn.functional as F
import torchaudio
import gradio as gr
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
# Load the feature extractor and emotion classification model
model_id = "Hatman/audio-emotion-detection"
extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModelForAudioClassification.from_pretrained(model_id)
def classify_emotion(audio):
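    """Classify the emotion in an audio file and return label -> probability scores."""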
    waveform, sr = torchaudio.load(audio)
    # Downmix multi-channel audio to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to the 16 kHz rate the model expects
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    # Inference
    inputs = extractor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = F.softmax(logits, dim=-1)[0]
    # Return scores as a dict of label -> probability for gr.Label
    return {model.config.id2label[i]: float(probs[i]) for i in range(len(probs))}
# Gradio UI: upload or record audio, get per-emotion probabilities
gr.Interface(
    fn=classify_emotion,
    inputs=gr.Audio(type="filepath", label="Upload or record audio"),
    outputs=gr.Label(num_top_classes=None),
    title="Speech Emotion Detection",
).launch()