import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

# Load the fine-tuned model and its processor from the Hugging Face Hub
model_name = "Dpngtm/wave2vec2-emotion-recognition"  # Replace with your model's Hugging Face Hub path
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Run on GPU if available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
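
# Optional sketch: rather than hardcoding labels in recognize_emotion below, they
# can be derived from the checkpoint config, assuming this repo's config.json
# defines an id2label mapping (an assumption worth verifying for this checkpoint):
# emotion_labels = [model.config.id2label[i] for i in range(model.config.num_labels)]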

# Preprocessing and inference function
def recognize_emotion(audio):
    # Gradio passes None if the user submits without recording or uploading
    if audio is None:
        return "No audio provided."

    # Load the audio file and resample to the 16 kHz rate Wav2Vec2 expects
    speech_array, sampling_rate = torchaudio.load(audio)
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech_array = resampler(speech_array)
    speech_array = speech_array.mean(dim=0).numpy()  # Average channels to mono

    # Tokenize the waveform and run a forward pass without gradients
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_id = torch.argmax(logits, dim=-1).item()
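    # Optional sketch: softmax converts logits to per-class probabilities, handy
    # if a confidence score should accompany the label (not in the original app):
    # probs = torch.nn.functional.softmax(logits, dim=-1)
    # confidence = probs[0][predicted_id].item()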
    # Emotion labels, in the same index order used during training
    emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
    return emotion_labels[predicted_id]
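
# Quick sanity check without the UI; "sample.wav" is a hypothetical local audio
# file path, not part of the original app:
# print(recognize_emotion("sample.wav"))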

# Gradio interface
interface = gr.Interface(
    fn=recognize_emotion,
    # Gradio 4.x+ uses `sources=[...]`; on Gradio 3.x this was `source="microphone"`
    inputs=gr.Audio(sources=["upload", "microphone"], type="filepath"),
    outputs="text",
    title="Emotion Recognition with Wav2Vec2",
    description="Upload or record audio, and the model will predict the emotion.",
)

# Launch the app
interface.launch()
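
# Optional: Gradio can expose a temporary public URL when running locally,
# e.g. interface.launch(share=True)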