import librosa
import numpy as np
import tensorflow as tf
import gradio as gr
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Load the pre-trained model
model_path = 'sound_to_text_model.h5'
model = tf.keras.models.load_model(model_path)
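
# Assumption: the saved model's output layer produces one probability per class
# listed in labels.csv (the label set the encoder below is fitted on).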
# Load the labels from labels.csv
labels_df = pd.read_csv('labels.csv') # Make sure this path is correct
training_labels = labels_df['Label'].tolist() # Assuming the column name is 'Label'
# Initialize the encoder and fit it to your labels
encoder = LabelEncoder()
encoder.fit(training_labels) # Fit the encoder to your labels
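
# Note: LabelEncoder assigns indices by sorting the labels alphabetically, so
# inverse_transform only recovers the correct class name if the same encoding
# was used when the model was trained (assumed here).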
# Function to extract features from audio
def extract_features(file_path):
    y_audio, sr = librosa.load(file_path, duration=2.0)
    mfccs = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=13)
    return np.mean(mfccs.T, axis=0)  # Average over time to create a fixed-size vector
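
# Assumption: the model was trained on matching 13-coefficient mean-MFCC features;
# a different n_mfcc or pooling at training time would make the predictions meaningless.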
# Function to predict text from audio
def predict_sound_text(audio):
    features = extract_features(audio)  # Use audio directly as the file path
    prediction = model.predict(np.array([features]))
    label = encoder.inverse_transform([np.argmax(prediction)])
    return label[0]
# Define Gradio interface
interface = gr.Interface(
    fn=predict_sound_text,
    inputs=gr.Audio(type="filepath"),  # Pass the uploaded audio to the function as a file path
    outputs="text",
    title="Audio to Text Converter",
    description="Upload an audio file (MP3 format) and get the predicted text label."
)
# Launch the interface
if __name__ == "__main__":
    interface.launch()