# Captured from a Hugging Face Space file view (Space status: Sleeping; file size: 4,474 bytes).
# The viewer's per-line commit hashes (e111c36, 87f6c9c, fc0b2dd) and its line-number gutter (1-142) were page residue, not code.
import gradio as gr
import torch
import torch.nn.functional as F
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio
import numpy as np
# Emotion classes, in the order this file pairs them with the classifier's
# output probabilities.
# NOTE(review): this order is assumed to match the checkpoint's own
# id-to-label mapping — confirm against model.config.id2label.
emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]

# Load the pretrained classifier and its matching processor from the Hub.
model_name = "Dpngtm/wav2vec2-emotion-recognition"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
# `num_labels` is a model-configuration option and has no meaning for the
# processor, so it is not passed here.
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # Set model to evaluation mode (inference only)
def recognize_emotion(audio):
    """
    Predict per-emotion confidence scores for a speech recording.

    Max duration: 60 seconds.

    Args:
        audio: Path to an audio file (str), or an object exposing that path
            as ``.name``. ``None`` means no audio was provided.

    Returns:
        A dict mapping every label in ``emotion_labels`` to a confidence
        percentage (0-100, rounded to two decimals), ordered from most to
        least likely. When ``audio`` is ``None`` every label maps to 0.0.

    Raises:
        gr.Error: If the audio is empty, longer than one minute, or cannot be
            loaded or classified. The score dict is consumed by a ``gr.Label``
            output, so a failure is reported through Gradio's error channel
            rather than as a string-valued "Error" entry mixed into the
            numeric scores.
    """
    if audio is None:
        return {emotion: 0.0 for emotion in emotion_labels}

    target_rate = 16000  # Sampling rate passed to the processor below.
    max_seconds = 60  # 60 seconds (1 minute) limit

    try:
        # Handle audio input: either a path or an object carrying one.
        audio_path = audio if isinstance(audio, str) else audio.name

        # Load audio; the code below treats dim 0 as channels, dim 1 as samples.
        speech_array, sampling_rate = torchaudio.load(audio_path)

        # Reject recordings with no samples before any further processing.
        if speech_array.shape[1] == 0:
            raise gr.Error("Audio is empty")

        # Check audio duration
        duration = speech_array.shape[1] / sampling_rate
        if duration > max_seconds:
            raise gr.Error("Audio too long (max 1 minute)")

        # Resample if needed
        if sampling_rate != target_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sampling_rate, new_freq=target_rate
            )
            speech_array = resampler(speech_array)

        # Convert to mono if stereo
        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)

        # Peak-normalize. A silent clip has a peak of 0; dividing by it would
        # fill the signal with NaNs, so such clips are left unscaled.
        peak = torch.max(torch.abs(speech_array))
        if peak > 0:
            speech_array = speech_array / peak

        # Drop only the channel axis so a very short clip stays 1-D.
        speech_array = speech_array.squeeze(0).numpy()

        # Process input
        inputs = processor(
            speech_array,
            sampling_rate=target_rate,
            return_tensors='pt',
            padding=True,
        )
        input_values = inputs.input_values.to(device)

        # Get predictions without tracking gradients.
        with torch.no_grad():
            logits = model(input_values).logits

        # Softmax over the class axis; [0] selects the single item in the batch.
        probs = F.softmax(logits, dim=-1)[0].cpu().numpy()

        # Convert to percentages with 2 decimal places.
        # NOTE(review): gr.Label may expect confidences in [0, 1] and scale
        # them itself — confirm these percentages render as intended.
        confidence_scores = {
            emotion: round(float(prob) * 100, 2)
            for emotion, prob in zip(emotion_labels, probs)
        }

        # Sort confidence scores by value, highest first.
        return dict(sorted(
            confidence_scores.items(),
            key=lambda item: item[1],
            reverse=True,
        ))
    except gr.Error:
        # Already a user-facing error raised above; pass it through untouched.
        raise
    except Exception as e:
        # Surface any loading/inference failure to the user instead of
        # returning a score dict that mixes strings and floats.
        raise gr.Error(str(e)) from e
# Create Gradio interface: an audio input (microphone or upload) wired to
# recognize_emotion, with the resulting scores shown in a Label component.
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",  # recognize_emotion receives a path to the audio file
        label="Upload audio or record from microphone",
        max_length=60,  # Set max length to 60 seconds in Gradio interface
    ),
    outputs=gr.Label(
        num_top_classes=len(emotion_labels),  # show every emotion, not just the top few
        label="Emotion Predictions",
    ),
    title="Speech Emotion Recognition",
    # The description text is kept flush-left inside the string literal.
    # The emoji were restored from mis-decoded bytes; those for Angry, Calm,
    # Happy and Neutral are a best-effort reconstruction.
    description="""
## Speech Emotion Recognition using Wav2Vec2
This model recognizes emotions from speech audio in the following categories:
- Angry 😠
- Calm 😌
- Disgust 🤢
- Fearful 😨
- Happy 😊
- Neutral 😐
- Sad 😢
- Surprised 😲
### Instructions:
1. Upload an audio file or record through the microphone
2. Wait for processing
3. View predicted emotions with confidence scores
### Notes:
- Maximum audio length: 1 minute
- Best results with clear speech and minimal background noise
- Confidence scores are shown as percentages
""",
)
# Launch the app on port 7860, listening on 0.0.0.0 rather than only localhost.
# NOTE(review): share=True asks Gradio for a public share link and debug=True
# turns on debug output — confirm both are wanted in the hosted deployment.
interface.launch(
    share=True,
    debug=True,
    server_name="0.0.0.0",
    server_port=7860,
)