import numpy as np
import librosa
import tensorflow as tf
import gradio as gr


class SpeechEmotionRecognizer:
    def __init__(self, model_path):
        self.model = tf.keras.models.load_model(model_path)
        self.sample_rate = 22050
        self.duration = 4  # seconds
        self.emotion_labels = ['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad']

    def extract_melspectrogram(self, audio_path):
        try:
            # Load and resample audio to the model's expected rate
            audio, _ = librosa.load(audio_path, sr=self.sample_rate)

            # Pad or trim so the clip is exactly 4 seconds long
            target_samples = self.sample_rate * self.duration
            if len(audio) < target_samples:
                audio = np.pad(audio, (0, target_samples - len(audio)))
            else:
                audio = audio[:target_samples]

            # Extract the mel-spectrogram
            mel_spec = librosa.feature.melspectrogram(
                y=audio,
                sr=self.sample_rate,
                n_mels=128,
                n_fft=2048,
                hop_length=512,
                win_length=2048,
                fmax=8000,
            )
            # The small epsilon guards against log(0) on silent frames
            mel_spec_db = librosa.power_to_db(mel_spec + 1e-10, ref=np.max)

            # Standardize to zero mean and unit variance
            mean = np.mean(mel_spec_db)
            std = np.std(mel_spec_db)
            mel_spec_norm = (mel_spec_db - mean) / (std + 1e-10)

            # Clip extreme values
            mel_spec_norm = np.clip(mel_spec_norm, -5, 5)

            # Pad or trim the time axis to the model's expected 173 frames
            target_frames = 173
            if mel_spec_norm.shape[1] > target_frames:
                mel_spec_norm = mel_spec_norm[:, :target_frames]
            elif mel_spec_norm.shape[1] < target_frames:
                pad_width = target_frames - mel_spec_norm.shape[1]
                mel_spec_norm = np.pad(mel_spec_norm, ((0, 0), (0, pad_width)), mode='constant')

            # Add batch and channel dimensions for the Conv2D model
            return mel_spec_norm.reshape((1, 128, 173, 1))
        except Exception as e:
            raise gr.Error(f"Error processing audio: {str(e)}")

    def predict_emotion(self, audio_path):
        try:
            # Extract features
            mel_spec = self.extract_melspectrogram(audio_path)

            # Run the model and map each class probability to its label
            prediction = self.model.predict(mel_spec)
            return {emotion: float(score) for emotion, score in zip(self.emotion_labels, prediction[0])}
        except Exception as e:
            raise gr.Error(f"Prediction error: {str(e)}")

# Initialize the model
recognizer = SpeechEmotionRecognizer('final_model_conv2d_1K_1.keras')
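
# Optional local smoke test (a minimal sketch, not part of the original app):
# run 4 seconds of silence through the full pipeline to confirm the model
# loads and the feature shape lines up. The soundfile dependency is an
# assumption, used only to write the temporary WAV file.
def _smoke_test():
    import tempfile
    import soundfile as sf  # assumed extra dependency for this check only
    silence = np.zeros(recognizer.sample_rate * recognizer.duration, dtype=np.float32)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    sf.write(wav_path, silence, recognizer.sample_rate)
    return recognizer.predict_emotion(wav_path)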

# Gradio callback: validate the input, then classify it
def process_audio(audio):
    if audio is None:
        raise gr.Error("Please provide an audio input")
    return recognizer.predict_emotion(audio)

# Create the Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(
            label="Record audio (4 seconds)",
            type="filepath",
            sources=["microphone"],  # Gradio 4.x renamed 'source' to 'sources'
        )
    ],
    outputs=gr.Label(num_top_classes=6),
    title="Speech Emotion Recognition",
    description="Record a 4-second audio clip to detect the emotion in your voice.",
)

# Launch the app
if __name__ == "__main__":
    demo.launch()
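    # For local debugging, demo.launch(share=True) additionally prints a
    # temporary public URL; on Hugging Face Spaces, the plain launch()
    # above is all that is needed.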