szili2011's picture
45a8c56 verified
history blame
1.52 kB
import os
import librosa
import numpy as np
import tensorflow as tf
import gradio as gr
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Load the pre-trained model
model_path = 'sound_to_text_model.h5'
model = tf.keras.models.load_model(model_path)

# Load the class names from labels.csv (path is relative to the working directory)
labels_df = pd.read_csv('labels.csv')
training_labels = labels_df['Label'].tolist()  # the CSV must have a 'Label' column

# Initialize the encoder and fit it to the labels. Without fit(), the later
# encoder.inverse_transform() call raises NotFittedError.
# NOTE(review): assumes the model's output indices follow the class order a
# LabelEncoder produces for these same labels -- confirm against training code.
encoder = LabelEncoder()
encoder.fit(training_labels)
# Function to extract features from audio
def extract_features(file_path):
    """Summarise an audio file as one averaged MFCC vector.

    Loads at most the first 2.0 seconds of ``file_path`` with librosa,
    computes 13 MFCCs, and averages them so the result has a fixed size
    regardless of clip length.

    Args:
        file_path: Audio source handed straight to ``librosa.load``.

    Returns:
        The mean of the transposed MFCC matrix along axis 0.
    """
    signal, sample_rate = librosa.load(file_path, duration=2.0)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    # Transpose first, then average along axis 0 to collapse the clip into
    # a single vector.
    transposed = coefficients.T
    return np.mean(transposed, axis=0)
# Function to predict text from audio
def predict_sound_text(audio):
    """Predict the text label for an audio clip.

    Args:
        audio: Path to the audio file to classify, or ``None``/empty when
            no audio was supplied.

    Returns:
        The decoded label of the highest-scoring class, or a short message
        asking for input when ``audio`` is missing.
    """
    # A submission without audio arrives as None; bail out early instead of
    # letting the audio loader fail with an opaque error.
    if not audio:
        return "No audio provided. Please upload an audio file."

    features = extract_features(audio)  # audio is used directly as the file path
    # Wrap the single feature vector in a batch of one for model.predict.
    prediction = model.predict(np.array([features]))
    # argmax over the prediction gives the winning class index, which the
    # encoder maps back to its label.
    label = encoder.inverse_transform([np.argmax(prediction)])
    return label[0]
# Define Gradio interface
interface = gr.Interface(
    fn=predict_sound_text,  # callback run on each submission (required by gr.Interface)
    inputs=gr.Audio(type="filepath"),  # hand the callback a path to the uploaded file
    outputs="text",  # show the predicted label as plain text
    title="Audio to Text Converter",
    description="Upload an audio file (MP3 format) and get the textual representation.",
)
# Launch the interface
if __name__ == "__main__":