# Spaces: Sleeping
import gradio as gr | |
import torch | |
import librosa | |
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor | |
# Pretrained checkpoint used for both the classifier and its feature extractor.
model_name = "r-f/wav2vec-english-speech-emotion-recognition"

# Both objects are loaded once at import time and reused by every prediction.
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Emotion names, indexed by the class id predicted by the model.
labels = [
    'angry',
    'disgust',
    'fear',
    'happy',
    'neutral',
    'sad',
    'surprise',
]
def predict_emotion(audio):
    """Classify the emotion expressed in an audio clip.

    Args:
        audio: Path to the audio file, as supplied by the Gradio ``Audio``
            input configured with ``type="filepath"``. Falsy (``None``)
            when the user submits without providing any audio.

    Returns:
        The entry of ``labels`` whose logit is the highest.

    Raises:
        gr.Error: If no audio was provided, so the UI shows a readable
            message instead of a traceback from the audio loader.
    """
    if not audio:
        raise gr.Error("Please record or upload an audio clip first.")

    # Resample to 16 kHz while loading; the same rate is then reported to the
    # feature extractor. The waveform gets its own name so the `audio`
    # argument (a path) is not rebound to a different kind of value.
    waveform, rate = librosa.load(audio, sr=16000)
    inputs = feature_extractor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Reduce over the class axis explicitly; a single clip is processed per
    # call, so the result holds exactly one index.
    predicted_class_id = logits.argmax(dim=-1).item()
    return labels[predicted_class_id]
# Build the Gradio UI: an audio input handed to predict_emotion as a file path,
# with the predicted label shown as plain text.
interface = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
)

# Start serving the app.
interface.launch()