import gradio as gr
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio

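# Emotion classes the classifier can return; the order is assumed to follow the
# model's label indices (index 0 -> "angry", ..., index 7 -> "surprised").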
emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]

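# Load the fine-tuned checkpoint and its processor from the Hugging Face Hub.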
model_name = "Dpngtm/wave2vec2-emotion-recognition"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=len(emotion_labels))
processor = Wav2Vec2Processor.from_pretrained(model_name)

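# Run on GPU when one is available and switch to eval mode for inference.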
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()


def recognize_emotion(audio):
    """
    Predict the emotion in an audio file using the fine-tuned Wav2Vec2 model.

    Args:
        audio (str or file-like object): Path to, or file-like object for, the audio file to classify.

    Returns:
        str: Predicted emotion label, or an error message if prediction fails.
    """
    try:
        # Gradio's "filepath" audio component passes a string path; fall back to
        # .name for file-like objects.
        audio_path = audio if isinstance(audio, str) else audio.name
        print(f"Received audio file: {audio_path}")

        speech_array, sampling_rate = torchaudio.load(audio_path)
        print(f"Loaded audio with sampling rate: {sampling_rate}")

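        # Downmix multi-channel audio to mono: torchaudio returns a [channels, frames]
        # tensor, and a stereo waveform would survive the later .squeeze() as a 2-D
        # array, which would break argmax(...).item() further down.
        if speech_array.shape[0] > 1:
            speech_array = speech_array.mean(dim=0, keepdim=True)
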
        # Wav2Vec2 expects 16 kHz audio; resample if the file uses a different rate.
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
            speech_array = resampler(speech_array).squeeze().numpy()
        else:
            speech_array = speech_array.squeeze().numpy()

        # Normalize the waveform and pack it into model-ready tensors.
        inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
        input_values = inputs.input_values.to(device)

        # Run the classifier without tracking gradients and pick the highest-scoring class.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_label = torch.argmax(logits, dim=1).item()

        emotion = emotion_labels[predicted_label]
        return emotion
    except Exception as e:
        return f"Error during prediction: {str(e)}"


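# Gradio UI: accepts microphone recordings or uploaded files and shows the predicted label.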
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Emotion Recognition with Wav2Vec2",
    description="Upload an audio file or record audio, and the model will predict the emotion.",
)

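# launch() starts the local web server (this is also the entry point on Hugging Face Spaces);
# passing share=True would additionally create a temporary public link when running locally.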
interface.launch()