|
import gradio as gr |
|
import torch |
|
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification |
|
import torchaudio |
|
|
|
|
|
# Run inference on the GPU when PyTorch can see one; otherwise use the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face Hub repository that provides both the classifier weights and
# the matching processor configuration.
# NOTE(review): the repo id is spelled "wave2vec2" here - confirm it matches
# the published repository name.
model_name = "Dpngtm/wave2vec2-emotion-recognition"

# Load the sequence-classification model and place it on the chosen device
# in one step.
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name).to(device)

# The processor converts raw waveforms into the tensors the model consumes.
processor = Wav2Vec2Processor.from_pretrained(model_name)
|
|
|
|
|
def recognize_emotion(audio):
    """Predict the emotion expressed in an audio recording.

    Args:
        audio: Path to an audio file that ``torchaudio.load`` can read. It is
            ``None`` (or empty) when the form is submitted without a
            recording.

    Returns:
        One of the emotion label strings listed in ``emotion_labels`` below,
        or a short explanatory message when no usable audio was supplied.
    """
    no_audio_message = "No audio received. Please record a clip and try again."

    # The UI hands over None when nothing was recorded; bail out early
    # instead of letting torchaudio fail with an opaque error.
    if not audio:
        return no_audio_message

    speech_array, sampling_rate = torchaudio.load(audio)

    # A zero-length recording cannot be fed through the model.
    if speech_array.numel() == 0:
        return no_audio_message

    # The processor is called with sampling_rate=16000 below, so bring the
    # waveform to that rate first.
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech_array = resampler(speech_array)

    # Average over the first (channel) dimension to get a mono 1-D signal.
    speech_array = speech_array.mean(dim=0).numpy()

    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
    # Input tensors must live on the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_id = torch.argmax(logits, dim=-1).item()

    # Index i of this tuple is the label reported for class id i.
    emotion_labels = ("angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised")
    return emotion_labels[predicted_id]
|
|
|
|
|
# Build the microphone input component. Gradio 4 replaced the ``source``
# keyword of ``gr.Audio`` with ``sources`` (a list) and rejects the old name
# with a TypeError, so try the original spelling first and fall back to the
# newer one. This keeps behaviour identical on older Gradio releases.
try:
    _audio_input = gr.Audio(source="microphone", type="filepath")
except TypeError:
    _audio_input = gr.Audio(sources=["microphone"], type="filepath")

# NOTE(review): the description mentions "Upload", but only the microphone
# source is enabled above - confirm which of the two is intended.
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=_audio_input,
    outputs="text",
    title="Emotion Recognition with Wav2Vec2",
    description="Upload or record audio, and the model will predict the emotion.",
)

# Start the web server for the demo.
interface.launch()
|
|