"""Gradio app that labels a short audio clip using a pre-trained Keras model.

The script loads the model from ``model_path`` at import time, extracts
time-averaged MFCC features from an uploaded audio file, and decodes the
model's highest-scoring output index to a label through a ``LabelEncoder``.
"""
import os
from typing import Optional

import gradio as gr
import librosa
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# Load the pre-trained model
model_path = 'sound_to_text_model.h5'
model = tf.keras.models.load_model(model_path)

# The encoder turns a predicted class index back into its label, so it must
# hold the same classes (in the same order) as the encoder used for training.
# A fresh LabelEncoder() is NOT fitted and cannot decode anything. Either:
#   * save the training encoder's classes once with
#       np.save('label_classes.npy', training_encoder.classes_)
#     and place that file next to this script, or
#   * call encoder.fit(y) here with the label list used during training.
classes_path = 'label_classes.npy'
encoder = LabelEncoder()
if os.path.exists(classes_path):
    # np.load keeps pickle disabled by default, so the file must contain a
    # plain (non-object) array; this avoids unpickling untrusted data.
    encoder.classes_ = np.load(classes_path)


def extract_features(
    file_path: str,
    duration: Optional[float] = 2.0,
    n_mfcc: int = 13,
) -> np.ndarray:
    """Return a fixed-size MFCC feature vector for an audio file.

    Args:
        file_path: Path of the audio file to read.
        duration: Maximum number of seconds to load from the start of the
            file; ``None`` loads the whole file.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The MFCC matrix averaged over its frame axis, one value per
        coefficient.

    Raises:
        ValueError: If the file decodes to zero audio samples.

    NOTE(review): the defaults presumably match the features the model was
    trained on -- confirm before changing them.
    """
    y_audio, sr = librosa.load(file_path, duration=duration)
    if y_audio.size == 0:
        # Nothing to classify; fail loudly rather than hand the model
        # features derived from an empty signal.
        raise ValueError(f"No audio samples could be read from {file_path!r}")
    mfccs = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=n_mfcc)
    # Average across frames so clips of any length give the same vector size.
    return np.mean(mfccs.T, axis=0)


def predict_sound_text(audio):
    """Predict the label for an uploaded audio file.

    Args:
        audio: File path supplied by the Gradio ``Audio`` input, or ``None``
            when the form is submitted without a file.

    Returns:
        The label decoded from the model's highest-scoring output.

    Raises:
        gr.Error: If no file was provided, or if the label encoder has not
            been fitted / restored and therefore cannot decode predictions.
    """
    if not audio:
        raise gr.Error("Please upload an audio file before submitting.")
    if not hasattr(encoder, "classes_"):
        # Without this check inverse_transform would raise NotFittedError
        # after the (comparatively slow) feature extraction and inference.
        raise gr.Error(
            "The label encoder is not fitted. Provide 'label_classes.npy' or "
            "fit the encoder with the labels used during training."
        )

    features = extract_features(audio)  # Use audio directly as the file path
    # The model is fed a batch, so wrap the single vector in a batch of one.
    prediction = model.predict(features[np.newaxis, :])
    class_index = int(np.argmax(prediction))
    label = encoder.inverse_transform([class_index])
    return label[0]


# Define Gradio interface
interface = gr.Interface(
    fn=predict_sound_text,
    inputs=gr.Audio(type="filepath"),  # Use only the type argument
    outputs="text",
    title="Audio to Text Converter",
    description="Upload an audio file (MP3 format) and get the textual representation."
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()