import gradio as gr
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio
# Load model and processor
model_name = "Dpngtm/wave2vec2-emotion-recognition" # Replace with your model's Hugging Face Hub path
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
# Define device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # Inference only, so disable dropout
# Preprocessing and inference function
def recognize_emotion(audio):
    # Load the audio and resample to 16 kHz if needed
    speech_array, sampling_rate = torchaudio.load(audio)
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech_array = resampler(speech_array)
    speech_array = speech_array.mean(dim=0).numpy()  # Average channels to mono

    # Process input and make predictions
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_id = torch.argmax(logits, dim=-1).item()

    # Emotion labels mapped to indices (same order as during training)
    emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
    return emotion_labels[predicted_id]
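
# Optional sketch (not wired into the UI): same pipeline, but the softmax of the logits is
# used to report a confidence alongside the predicted label. The helper name and the
# returned string format are assumptions, not part of the original app.
def recognize_emotion_with_score(audio):
    speech_array, sampling_rate = torchaudio.load(audio)
    if sampling_rate != 16000:
        speech_array = torchaudio.transforms.Resample(sampling_rate, 16000)(speech_array)
    speech_array = speech_array.mean(dim=0).numpy()
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        probs = torch.softmax(model(**inputs).logits, dim=-1)[0]
    emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
    top_id = torch.argmax(probs).item()
    return f"{emotion_labels[top_id]} ({probs[top_id].item():.1%})"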
# Gradio interface
interface = gr.Interface(
    fn=recognize_emotion,
    # `source=` is the Gradio 3.x argument; Gradio 4+ uses `sources=["microphone"]` instead
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Emotion Recognition with Wav2Vec2",
    description="Record audio and the model will predict the emotion.",
)
# Launch the app
interface.launch()
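
# Quick local check (hypothetical path): bypass the UI and call the function directly,
# e.g. uncomment the line below and point it at any audio file on disk.
# print(recognize_emotion("path/to/sample.wav"))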