# app.py — audio-to-text Gradio demo
# (Hugging Face Space by szili2011, revision f3ecf4e, 1.57 kB;
#  page-scrape header converted to a comment so the file parses.)
import os
import librosa
import numpy as np
import tensorflow as tf
import gradio as gr
from sklearn.preprocessing import LabelEncoder
# --- Model and label-encoder setup (runs once at import time) ---

# Path to the pre-trained Keras model shipped alongside this app.
model_path = 'sound_to_text_model.h5'
model = tf.keras.models.load_model(model_path)

# Label encoder that maps the model's argmax index back to a class name.
# BUG in the original: the encoder was created but never fitted, so
# encoder.inverse_transform() raised NotFittedError on every prediction.
# Fix: restore the fitted classes from disk when available; otherwise fall
# back to generic names sized from the model's output layer so the app
# still runs end-to-end.
encoder = LabelEncoder()
_classes_path = 'label_classes.npy'
if os.path.exists(_classes_path):
    # Classes saved at training time via np.save(encoder.classes_).
    encoder.classes_ = np.load(_classes_path, allow_pickle=True)
else:
    # NOTE(review): placeholder labels — replace label_classes.npy with the
    # real training labels for meaningful output.
    n_classes = model.output_shape[-1]
    encoder.classes_ = np.array([f'class_{i}' for i in range(n_classes)])
def extract_features(file_path, duration=2.0, n_mfcc=13):
    """Extract a fixed-size MFCC feature vector from an audio file.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by librosa.
    duration : float, optional
        Maximum number of seconds of audio to load (default 2.0, the
        value hard-coded in the original implementation).
    n_mfcc : int, optional
        Number of MFCC coefficients to compute (default 13, matching the
        model's expected input size).

    Returns
    -------
    numpy.ndarray
        Shape ``(n_mfcc,)``: the MFCCs averaged over time frames, so the
        output length is fixed regardless of clip duration.
    """
    y_audio, sr = librosa.load(file_path, duration=duration)
    mfccs = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=n_mfcc)
    # Average across the time axis to collapse the (n_mfcc, frames)
    # matrix into a fixed-length vector.
    return np.mean(mfccs.T, axis=0)
def predict_sound_text(audio):
    """Classify an uploaded audio clip and return the predicted label.

    ``audio`` is the temporary file path handed over by the Gradio
    ``Audio(type="filepath")`` component.
    """
    feature_vector = extract_features(audio)
    # The model expects a batch dimension: (1, n_features).
    batch = np.expand_dims(feature_vector, axis=0)
    probabilities = model.predict(batch)
    predicted_index = np.argmax(probabilities)
    return encoder.inverse_transform([predicted_index])[0]
# --- Gradio UI wiring ---
# The Audio component passes a temp-file path straight into
# predict_sound_text; the returned label is rendered as plain text.
interface = gr.Interface(
    fn=predict_sound_text,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio to Text Converter",
    description=(
        "Upload an audio file (MP3 format) and get the textual "
        "representation."
    ),
)

if __name__ == "__main__":
    # Start the web server only when run as a script, not on import.
    interface.launch()