import numpy as np
import librosa
import tensorflow as tf
import gradio as gr


class SpeechEmotionRecognizer:
    def __init__(self, model_path):
        self.model = tf.keras.models.load_model(model_path)
        self.sample_rate = 22050
        self.duration = 4  # seconds
        self.emotion_labels = ['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad']

    def extract_melspectrogram(self, audio_path):
        try:
            # Load and resample audio to the model's expected rate
            audio, _ = librosa.load(audio_path, sr=self.sample_rate)

            # Pad or trim so the clip is exactly 4 seconds long
            target_samples = self.sample_rate * self.duration
            if len(audio) < target_samples:
                audio = np.pad(audio, (0, target_samples - len(audio)))
            else:
                audio = audio[:target_samples]

            # Extract the mel-spectrogram
            mel_spec = librosa.feature.melspectrogram(
                y=audio,
                sr=self.sample_rate,
                n_mels=128,
                n_fft=2048,
                hop_length=512,
                win_length=2048,
                fmax=8000,
            )
            # The small epsilon guards against log(0) on silent frames
            mel_spec_db = librosa.power_to_db(mel_spec + 1e-10, ref=np.max)

            # Standardize to zero mean and unit variance
            mean = np.mean(mel_spec_db)
            std = np.std(mel_spec_db)
            mel_spec_norm = (mel_spec_db - mean) / (std + 1e-10)

            # Clip extreme values
            mel_spec_norm = np.clip(mel_spec_norm, -5, 5)

            # Pad or trim the time axis to the model's expected 173 frames
            target_frames = 173
            if mel_spec_norm.shape[1] > target_frames:
                mel_spec_norm = mel_spec_norm[:, :target_frames]
            elif mel_spec_norm.shape[1] < target_frames:
                pad_width = target_frames - mel_spec_norm.shape[1]
                mel_spec_norm = np.pad(mel_spec_norm, ((0, 0), (0, pad_width)), mode='constant')

            # Add batch and channel dimensions for the Conv2D model
            return mel_spec_norm.reshape((1, 128, 173, 1))
        except Exception as e:
            raise gr.Error(f"Error processing audio: {str(e)}")

    def predict_emotion(self, audio_path):
        try:
            # Extract features
            mel_spec = self.extract_melspectrogram(audio_path)

            # Run the model and map each class probability to its label
            prediction = self.model.predict(mel_spec)
            return {emotion: float(score) for emotion, score in zip(self.emotion_labels, prediction[0])}
        except Exception as e:
            raise gr.Error(f"Prediction error: {str(e)}")

# Initialize the model
recognizer = SpeechEmotionRecognizer('final_model_conv2d_1K_1.keras')
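
# Optional local smoke test (a minimal sketch, not part of the original app):
# run 4 seconds of silence through the full pipeline to confirm the model
# loads and the feature shape lines up. The soundfile dependency is an
# assumption, used only to write the temporary WAV file.
def _smoke_test():
    import tempfile
    import soundfile as sf  # assumed extra dependency for this check only
    silence = np.zeros(recognizer.sample_rate * recognizer.duration, dtype=np.float32)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    sf.write(wav_path, silence, recognizer.sample_rate)
    return recognizer.predict_emotion(wav_path)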

# Gradio callback: validate the input, then classify it
def process_audio(audio):
    if audio is None:
        raise gr.Error("Please provide an audio input")
    return recognizer.predict_emotion(audio)

# Create the Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(
            label="Record audio (4 seconds)",
            type="filepath",
            sources=["microphone"],  # Gradio 4.x renamed 'source' to 'sources'
        )
    ],
    outputs=gr.Label(num_top_classes=6),
    title="Speech Emotion Recognition",
    description="Record a 4-second audio clip to detect the emotion in your voice.",
)

# Launch the app
if __name__ == "__main__":
    demo.launch()
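    # For local debugging, demo.launch(share=True) additionally prints a
    # temporary public URL; on Hugging Face Spaces, the plain launch()
    # above is all that is needed.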