import numpy as np
import librosa
import tensorflow as tf
import gradio as gr


class SpeechEmotionRecognizer:
    """Wraps a trained Keras model that predicts emotion from short audio clips."""

    def __init__(self, model_path):
        self.model = tf.keras.models.load_model(model_path)
        self.sample_rate = 22050  # audio is resampled to 22.05 kHz
        self.duration = 4         # clip length in seconds
        self.emotion_labels = ['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad']

    def extract_melspectrogram(self, audio_path):
        """Convert an audio file into a normalized log-mel spectrogram shaped (1, 128, 173, 1)."""
        try:
            # Load the recording and resample it to the target rate.
            audio, sr = librosa.load(audio_path, sr=self.sample_rate)

            # Pad with zeros or trim so the clip is exactly 4 seconds long.
            target_length = self.sample_rate * self.duration
            if len(audio) < target_length:
                audio = np.pad(audio, (0, int(target_length - len(audio))))
            else:
                audio = audio[:int(target_length)]

            # Compute a 128-band mel spectrogram.
            mel_spec = librosa.feature.melspectrogram(
                y=audio,
                sr=self.sample_rate,
                n_mels=128,
                n_fft=2048,
                hop_length=512,
                win_length=2048,
                fmax=8000
            )

            # Convert power to decibels, then standardize to zero mean and unit variance.
            mel_spec_db = librosa.power_to_db(mel_spec + 1e-10, ref=np.max)
            mean = np.mean(mel_spec_db)
            std = np.std(mel_spec_db)
            mel_spec_norm = (mel_spec_db - mean) / (std + 1e-10)

            # Clip outliers so the input stays within a bounded range.
            mel_spec_norm = np.clip(mel_spec_norm, -5, 5)

            # Force a fixed width of 173 frames (4 s at 22050 Hz with hop_length=512).
            target_frames = 173
            if mel_spec_norm.shape[1] > target_frames:
                mel_spec_norm = mel_spec_norm[:, :target_frames]
            elif mel_spec_norm.shape[1] < target_frames:
                pad_width = target_frames - mel_spec_norm.shape[1]
                mel_spec_norm = np.pad(mel_spec_norm, ((0, 0), (0, pad_width)), mode='constant')

            # Add batch and channel dimensions for the Conv2D model.
            return mel_spec_norm.reshape((1, 128, 173, 1))

        except Exception as e:
            raise gr.Error(f"Error processing audio: {str(e)}")

    def predict_emotion(self, audio_path):
        """Return a dictionary mapping each emotion label to its predicted probability."""
        try:
            mel_spec = self.extract_melspectrogram(audio_path)

            # Model output has one probability per emotion label.
            prediction = self.model.predict(mel_spec)

            results = {emotion: float(pred) for emotion, pred in zip(self.emotion_labels, prediction[0])}
            return results

        except Exception as e:
            raise gr.Error(f"Prediction error: {str(e)}")


# Load the trained model once at startup.
recognizer = SpeechEmotionRecognizer('final_model_conv2d_1K_1.keras')


def process_audio(audio):
    """Gradio callback: validate the input and return the emotion probabilities."""
    if audio is None:
        raise gr.Error("Please provide an audio input")

    return recognizer.predict_emotion(audio)


demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(
            label="Record audio (4 seconds)",
            type="filepath",
            sources=["microphone"]
        )
    ],
    outputs=gr.Label(num_top_classes=6),
    title="Speech Emotion Recognition",
    description="Record a 4-second audio clip to detect the emotion in your voice."
)


if __name__ == "__main__":
    demo.launch()