import gradio as gr
import torch
import torch.nn.functional as F
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio
import numpy as np

# Define emotion labels
emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]

# Load model and processor (num_labels belongs on the classification model, not the processor)
model_name = "Dpngtm/wav2vec2-emotion-recognition"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=len(emotion_labels))
processor = Wav2Vec2Processor.from_pretrained(model_name)
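# Note: Wav2Vec2 checkpoints expect 16 kHz mono audio; recognize_emotion below
# resamples and downmixes inputs accordingly.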

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # Set model to evaluation mode

def recognize_emotion(audio):
    """
    Predicts the emotion and confidence scores from an audio file.
    Max duration: 60 seconds
    """
    try:
        if audio is None:
            return {emotion: 0.0 for emotion in emotion_labels}
            
        # Handle audio input
        audio_path = audio if isinstance(audio, str) else audio.name
        
        # Load and resample audio
        speech_array, sampling_rate = torchaudio.load(audio_path)
        
        # Check audio duration
        duration = speech_array.shape[1] / sampling_rate
        if duration > 60:  # 60-second (1-minute) limit
            # gr.Label requires float confidences, so report errors as a plain string label
            return "Error: audio too long (max 1 minute)"
        
        # Resample if needed
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
            speech_array = resampler(speech_array)
        
        # Convert to mono if stereo
        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)
            
        # Normalize audio to [-1, 1]; skip silent clips to avoid dividing by zero
        peak = torch.max(torch.abs(speech_array))
        if peak > 0:
            speech_array = speech_array / peak
        
        # Convert to numpy and squeeze
        speech_array = speech_array.squeeze().numpy()
        
        # Process input
        inputs = processor(
            speech_array, 
            sampling_rate=16000, 
            return_tensors='pt', 
            padding=True
        )
        input_values = inputs.input_values.to(device)
        
        # Get predictions
        with torch.no_grad():
            outputs = model(input_values)
            logits = outputs.logits
            
            # Get probabilities using softmax
            probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
            
            # Get confidence scores for all emotions
            confidence_scores = {
                emotion: round(float(prob) * 100, 2) # Convert to percentage with 2 decimal places
                for emotion, prob in zip(emotion_labels, probs)
            }
            
            # Sort confidence scores by value
            sorted_scores = dict(sorted(
                confidence_scores.items(), 
                key=lambda x: x[1], 
                reverse=True
            ))
            
            return sorted_scores
            
    except Exception as e:
        # Return the error message as a string label (gr.Label accepts str or dict)
        return f"Error: {e}"
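
# Quick smoke test (hypothetical file name; assumes a local 'sample.wav' exists):
#   scores = recognize_emotion("sample.wav")
#   print(scores)  # e.g. {'happy': 92.1, 'calm': 3.4, ...} (illustrative values)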

# Create Gradio interface
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(
        sources=["microphone", "upload"], 
        type="filepath",
        label="Upload audio or record from microphone",
        max_length=60  # Set max length to 60 seconds in Gradio interface
    ),
    outputs=gr.Label(
        num_top_classes=len(emotion_labels),
        label="Emotion Predictions"
    ),
    title="Speech Emotion Recognition",
    description="""
    ## Speech Emotion Recognition using Wav2Vec2
    
    This model recognizes emotions from speech audio in the following categories:
    - Angry 😠
    - Calm 😌
    - Disgust 🀒
    - Fearful 😨
    - Happy 😊
    - Neutral 😐
    - Sad 😒
    - Surprised 😲
    
    ### Instructions:
    1. Upload an audio file or record through the microphone
    2. Wait for processing
    3. View predicted emotions with confidence scores
    
    ### Notes:
    - Maximum audio length: 1 minute
    - Best results with clear speech and minimal background noise
    - Confidence scores are shown as percentages
    """,


# Launch the app
interface.launch(
    share=True, 
    debug=True,
    server_name="0.0.0.0",
    server_port=7860
)