import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

# Load the fine-tuned model and its matching feature extractor
model_name = "r-f/wav2vec-english-speech-emotion-recognition"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Emotion labels, in the order of the model's output classes
labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

def predict_emotion(audio_path):
    # Load the audio and resample it to the 16 kHz rate the model expects
    speech, rate = librosa.load(audio_path, sr=16000)
    inputs = feature_extractor(speech, sampling_rate=rate, return_tensors="pt", padding=True)
    # Run inference without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    return labels[predicted_class_id]

# Create the Gradio interface: upload or record audio, get an emotion label back
interface = gr.Interface(fn=predict_emotion, inputs=gr.Audio(type="filepath"), outputs="text")
interface.launch()
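
# A quick sanity check without the web UI -- a minimal sketch, run before
# interface.launch() (which blocks) or from a REPL after loading this script.
# "sample.wav" is a placeholder path to any local speech recording, not a
# file that ships with the model.
#
#     emotion = predict_emotion("sample.wav")
#     print(f"Predicted emotion: {emotion}")  # e.g. "happy"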