Spaces:

szili2011
/

sound-to-text-converter

Runtime error

File size: 1,519 Bytes

1d0caf2
 
 
 
 
f3ecf4e
45a8c56
1d0caf2
 
 
 
 
45a8c56
 
 
 
 
f3ecf4e
45a8c56
f3ecf4e
1d0caf2
 
 
 
 
 
 
 
f3ecf4e
1d0caf2
 
 
 
 
c75b241
 
f3ecf4e
c75b241
 
 
 
1d0caf2

import os
import librosa
import numpy as np
import tensorflow as tf
import gradio as gr
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Restore the trained Keras network from its HDF5 file
model_path = 'sound_to_text_model.h5'
model = tf.keras.models.load_model(model_path)

# Read the class names from the 'Label' column of labels.csv
# (path is relative to the current working directory)
labels_df = pd.read_csv('labels.csv')
training_labels = labels_df['Label'].tolist()

# Build the label encoder and fit it in one step; fit() returns the encoder
# itself, so `encoder` is the fitted instance used to decode class indices
encoder = LabelEncoder().fit(training_labels)

# Turn an audio file into a fixed-length MFCC feature vector
def extract_features(file_path):
    """Compute time-averaged MFCC features for an audio file.

    Args:
        file_path: Audio source handed to ``librosa.load``.

    Returns:
        The mean over the first axis of the transposed MFCC matrix, i.e. one
        averaged value per requested MFCC coefficient (13 are requested).
    """
    # Only the first 2 seconds are read (duration=2.0)
    signal, sample_rate = librosa.load(file_path, duration=2.0)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    # Averaging across the transposed matrix's first axis collapses the
    # per-frame values into a single fixed-size vector
    averaged = np.mean(coefficients.T, axis=0)
    return averaged

# Function to predict text from audio
def predict_sound_text(audio):
    """Predict the text label for an audio file.

    Args:
        audio: Path to the audio file. The Gradio ``Audio`` input is
            configured with ``type="filepath"``, and Gradio passes ``None``
            when the form is submitted without any audio.

    Returns:
        The label decoded from the model's highest-scoring output, or a short
        message asking for audio when ``audio`` is ``None`` or empty.
    """
    # Without this guard a missing upload would reach librosa.load(None) and
    # surface as an opaque error instead of a readable message.
    if not audio:
        return "No audio provided. Please upload an audio file."

    features = extract_features(audio)  # `audio` is used directly as the file path
    # Wrap the single feature vector in a one-element batch for predict()
    prediction = model.predict(np.array([features]))
    # Index of the largest score -> original label via the fitted encoder
    label = encoder.inverse_transform([np.argmax(prediction)])
    return label[0]

# Build the Gradio UI: one audio input wired to the predictor, text output
interface = gr.Interface(
    title="Audio to Text Converter",
    description="Upload an audio file (MP3 format) and get the textual representation.",
    fn=predict_sound_text,
    inputs=gr.Audio(type="filepath"),  # only the `type` argument is set
    outputs="text",
)

# Start the app only when this file is executed as a script, not on import
if __name__ == "__main__":
    interface.launch()