Spaces:
Runtime error
Runtime error
File size: 1,519 Bytes
1d0caf2 f3ecf4e 45a8c56 1d0caf2 45a8c56 f3ecf4e 45a8c56 f3ecf4e 1d0caf2 f3ecf4e 1d0caf2 c75b241 f3ecf4e c75b241 1d0caf2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import os
import librosa
import numpy as np
import tensorflow as tf
import gradio as gr
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Load the pre-trained Keras model from an HDF5 file. The path is relative,
# so it is resolved against the working directory of the running process.
model_path = 'sound_to_text_model.h5'
model = tf.keras.models.load_model(model_path)
# Read the class labels from labels.csv (also a relative path). The file must
# contain a column named 'Label'; its values are collected into a plain list.
labels_df = pd.read_csv('labels.csv')
training_labels = labels_df['Label'].tolist()
# Fit a LabelEncoder on those labels so that a predicted class index can be
# converted back into a label with encoder.inverse_transform.
# NOTE(review): this presumably has to reproduce the label-to-index mapping
# that was used when the model was trained -- confirm against the training code.
encoder = LabelEncoder()
encoder.fit(training_labels)
# Turn an audio file into a fixed-length MFCC feature vector.
def extract_features(file_path):
    """Compute time-averaged MFCC features for the audio at ``file_path``.

    The file is loaded with librosa using ``duration=2.0``, 13 MFCCs are
    computed from the loaded signal, and the coefficients are averaged so
    that the result does not depend on how many frames the clip produced.

    Args:
        file_path: Location of the audio file to load.

    Returns:
        A NumPy array holding the mean of each MFCC coefficient.
    """
    signal, sample_rate = librosa.load(file_path, duration=2.0)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    # After the transpose, axis 0 is the frame axis, so the mean collapses
    # time and leaves one value per coefficient.
    frames_first = coefficients.T
    return np.mean(frames_first, axis=0)
# Predict the text label for an uploaded audio clip.
def predict_sound_text(audio):
    """Return the predicted text label for the audio file at ``audio``.

    Args:
        audio: Path to the audio file, as supplied by the Gradio ``Audio``
            input configured with ``type="filepath"``. May be ``None`` when
            the form is submitted without any audio.

    Returns:
        The decoded label for the highest-scoring class, or a short prompt
        asking for input when no audio was supplied.
    """
    # Without this guard a missing upload reaches the feature extractor and
    # fails there with an error that tells the user nothing useful.
    if audio is None:
        return "No audio provided. Please upload an audio file."

    features = extract_features(audio)
    # Wrap the single feature vector in a list so the model receives a
    # batch containing one sample.
    prediction = model.predict(np.array([features]))
    # argmax runs over the flattened scores; with a one-sample batch that is
    # the index of the best class.
    label = encoder.inverse_transform([np.argmax(prediction)])
    return label[0]
# Build the Gradio UI: one audio input, one text output.
interface = gr.Interface(
    title="Audio to Text Converter",
    description="Upload an audio file (MP3 format) and get the textual representation.",
    fn=predict_sound_text,
    # type="filepath" makes Gradio hand the callback a path to the audio
    # file, which is what predict_sound_text forwards to the loader.
    inputs=gr.Audio(type="filepath"),
    outputs="text",
)

# Start the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    interface.launch()
|