import gradio as gr
import torch
import torch.nn.functional as F
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio
import numpy as np

# Define emotion labels
emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]

# Load model and processor (num_labels belongs on the classification model, not the processor)
model_name = "Dpngtm/wav2vec2-emotion-recognition"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=len(emotion_labels))
processor = Wav2Vec2Processor.from_pretrained(model_name)
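# Note: Wav2Vec2 checkpoints expect 16 kHz mono audio; recognize_emotion below
# resamples and downmixes inputs accordingly.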

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # Set model to evaluation mode

def recognize_emotion(audio):
    """
    Predicts the emotion and confidence scores from an audio file.
    Max duration: 60 seconds
    """
    try:
        if audio is None:
            return {emotion: 0.0 for emotion in emotion_labels}
            
        # Handle audio input
        audio_path = audio if isinstance(audio, str) else audio.name
        
        # Load and resample audio
        speech_array, sampling_rate = torchaudio.load(audio_path)
        
        # Check audio duration
        duration = speech_array.shape[1] / sampling_rate
        if duration > 60:  # 60-second (1-minute) limit
            # gr.Label requires float confidences, so report errors as a plain string label
            return "Error: audio too long (max 1 minute)"
        
        # Resample if needed
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
            speech_array = resampler(speech_array)
        
        # Convert to mono if stereo
        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)
            
        # Normalize audio to [-1, 1]; skip silent clips to avoid dividing by zero
        peak = torch.max(torch.abs(speech_array))
        if peak > 0:
            speech_array = speech_array / peak
        
        # Convert to numpy and squeeze
        speech_array = speech_array.squeeze().numpy()
        
        # Process input
        inputs = processor(
            speech_array, 
            sampling_rate=16000, 
            return_tensors='pt', 
            padding=True
        )
        input_values = inputs.input_values.to(device)
        
        # Get predictions
        with torch.no_grad():
            outputs = model(input_values)
            logits = outputs.logits
            
            # Get probabilities using softmax
            probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
            
            # Get confidence scores for all emotions
            confidence_scores = {
                emotion: round(float(prob) * 100, 2) # Convert to percentage with 2 decimal places
                for emotion, prob in zip(emotion_labels, probs)
            }
            
            # Sort confidence scores by value
            sorted_scores = dict(sorted(
                confidence_scores.items(), 
                key=lambda x: x[1], 
                reverse=True
            ))
            
            return sorted_scores
            
    except Exception as e:
        # Return the error message as a string label (gr.Label accepts str or dict)
        return f"Error: {e}"
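
# Quick smoke test (hypothetical file name; assumes a local 'sample.wav' exists):
#   scores = recognize_emotion("sample.wav")
#   print(scores)  # e.g. {'happy': 92.1, 'calm': 3.4, ...} (illustrative values)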

# Create Gradio interface
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(
        sources=["microphone", "upload"], 
        type="filepath",
        label="Upload audio or record from microphone",
        max_length=60  # Set max length to 60 seconds in Gradio interface
    ),
    outputs=gr.Label(
        num_top_classes=len(emotion_labels),
        label="Emotion Predictions"
    ),
    title="Speech Emotion Recognition",
    description="""
    ## Speech Emotion Recognition using Wav2Vec2
    
    This model recognizes emotions from speech audio in the following categories:
    - Angry 😠
    - Calm 😌
    - Disgust 🀒
    - Fearful 😨
    - Happy 😊
    - Neutral 😐
    - Sad 😒
    - Surprised 😲
    
    ### Instructions:
    1. Upload an audio file or record through the microphone
    2. Wait for processing
    3. View predicted emotions with confidence scores
    
    ### Notes:
    - Maximum audio length: 1 minute
    - Best results with clear speech and minimal background noise
    - Confidence scores are shown as percentages
    """,


# Launch the app
interface.launch(
    share=True, 
    debug=True,
    server_name="0.0.0.0",
    server_port=7860
)