import numpy as np
import librosa
import tensorflow as tf
import gradio as gr


class SpeechEmotionRecognizer:
    def __init__(self, model_path):
        self.model = tf.keras.models.load_model(model_path)
        self.sample_rate = 22050
        self.duration = 4  # seconds
        self.emotion_labels = ['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad']

    def extract_melspectrogram(self, audio_path):
        try:
            # Load and resample audio to the model's expected rate
            audio, _ = librosa.load(audio_path, sr=self.sample_rate)

            # Pad or truncate so the clip is exactly 4 seconds of samples
            target_samples = self.sample_rate * self.duration
            if len(audio) < target_samples:
                audio = np.pad(audio, (0, int(target_samples - len(audio))))
            else:
                audio = audio[:int(target_samples)]

            # Extract the mel-spectrogram
            mel_spec = librosa.feature.melspectrogram(
                y=audio,
                sr=self.sample_rate,
                n_mels=128,
                n_fft=2048,
                hop_length=512,
                win_length=2048,
                fmax=8000
            )
            mel_spec_db = librosa.power_to_db(mel_spec + 1e-10, ref=np.max)

            # Standardize to zero mean, unit variance
            mean = np.mean(mel_spec_db)
            std = np.std(mel_spec_db)
            mel_spec_norm = (mel_spec_db - mean) / (std + 1e-10)

            # Clip extreme values
            mel_spec_norm = np.clip(mel_spec_norm, -5, 5)

            # Pad or truncate the time axis to exactly 173 frames,
            # giving the model's expected input shape (128, 173)
            target_frames = 173
            if mel_spec_norm.shape[1] > target_frames:
                mel_spec_norm = mel_spec_norm[:, :target_frames]
            elif mel_spec_norm.shape[1] < target_frames:
                pad_width = target_frames - mel_spec_norm.shape[1]
                mel_spec_norm = np.pad(mel_spec_norm, ((0, 0), (0, pad_width)), mode='constant')

            # Add batch and channel dimensions: (1, 128, 173, 1)
            return mel_spec_norm.reshape((1, 128, 173, 1))
        except Exception as e:
            raise gr.Error(f"Error processing audio: {str(e)}")

    def predict_emotion(self, audio_path):
        try:
            # Extract features
            mel_spec = self.extract_melspectrogram(audio_path)

            # Make prediction
            prediction = self.model.predict(mel_spec)

            # Map each emotion label to its confidence score
            results = {emotion: float(pred)
                       for emotion, pred in zip(self.emotion_labels, prediction[0])}
            return results
        except Exception as e:
            raise gr.Error(f"Prediction error: {str(e)}")


# Initialize the model
recognizer = SpeechEmotionRecognizer('final_model_conv2d_1K_1.keras')


# Gradio callback: validate the input and run inference
def process_audio(audio):
    if audio is None:
        raise gr.Error("Please provide an audio input")
    results = recognizer.predict_emotion(audio)
    return results


# Create the Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(
            label="Record audio (4 seconds)",
            type="filepath",
            sources=["microphone"]  # Gradio 4.x takes a 'sources' list rather than the older 'source' argument
        )
    ],
    outputs=gr.Label(num_top_classes=6),
    title="Speech Emotion Recognition",
    description="Record a 4-second audio clip to detect the emotion in your voice."
)

# Launch the app
if __name__ == "__main__":
    demo.launch()