import gradio as gr
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio

# Define emotion labels (use the same order as during training)
emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]

# Load model and processor
model_name = "Dpngtm/wave2vec2-emotion-recognition"  # Replace with your model's Hugging Face Hub path
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Define device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # Inference only: disable dropout for deterministic predictions

# Preprocessing and inference function
def recognize_emotion(audio):
    """
    Predicts the emotion from an audio file using the fine-tuned Wav2Vec2 model.
    
    Args:
        audio (str or file-like object): Path or file-like object for the audio file to predict emotion for.
    
    Returns:
        str: Predicted emotion label for the given audio file.
    """
    try:
        # Determine if input is a file path or file-like object
        audio_path = audio if isinstance(audio, str) else audio.name
        print(f'Received audio file: {audio_path}')
        
        # Load the audio and collapse multi-channel recordings to mono
        speech_array, sampling_rate = torchaudio.load(audio_path)
        print(f'Loaded audio with sampling rate: {sampling_rate}')
        if speech_array.shape[0] > 1:
            speech_array = speech_array.mean(dim=0, keepdim=True)

        # Resample to the 16 kHz rate expected by Wav2Vec2 if necessary
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
            speech_array = resampler(speech_array)
        speech_array = speech_array.squeeze().numpy()
        
        # Process input for the model
        inputs = processor(speech_array, sampling_rate=16000, return_tensors='pt', padding=True)
        input_values = inputs.input_values.to(device)
        
        # Make predictions
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_label = torch.argmax(logits, dim=1).item()
        
        # Map prediction to emotion label
        emotion = emotion_labels[predicted_label]
        return emotion
    except Exception as e:
        return f'Error during prediction: {str(e)}'
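
# Optional sanity check outside the Gradio UI: call recognize_emotion directly on a
# local clip (the path below is a hypothetical example; substitute any audio file):
# print(recognize_emotion("samples/test_clip.wav"))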

# Gradio interface with both microphone and file upload options
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Emotion Recognition with Wav2Vec2",
    description="Upload an audio file or record audio, and the model will predict the emotion."
)

# Launch the app
interface.launch()
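
# Note: launch() serves on localhost by default. When running outside Hugging Face
# Spaces and a shareable link is needed, Gradio's share flag can be used instead:
# interface.launch(share=True)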