import gradio as gr
import librosa
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


def extract_features(file_path):
    """Load an audio file and return its MFCC frames as (time_steps, features)."""
    try:
        audio, sr = librosa.load(file_path, sr=None)  # keep the original sampling rate
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        return mfccs.T  # transpose to (time_steps, features)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


def create_model(seq_length, vocab_size):
    """Next-frame token predictor: Embedding -> LSTM -> softmax over the vocabulary."""
    model = Sequential()
    # Embedding maps each integer frame token to a dense 16-dim vector
    model.add(Embedding(input_dim=vocab_size, output_dim=16, input_length=seq_length))
    model.add(LSTM(64))  # single LSTM with 64 units
    model.add(Dense(vocab_size, activation='softmax'))  # distribution over frame tokens
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss=CategoricalCrossentropy(),
                  metrics=['accuracy'])
    return model


def prepare_data(mfccs_list, seq_length=10):
    all_mfccs = np.concatenate(mfccs_list, axis=0)
    # Quantize each frame and serialize it to a space-separated string so that
    # recurring frames collapse onto one token; LabelEncoder requires 1-D input.
    frame_strings = [" ".join(f"{v:.1f}" for v in frame) for frame in all_mfccs]
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(frame_strings)
    vocab_size = len(label_encoder.classes_)

    # Build (input sequence, next token) training pairs
    dataX, dataY = [], []
    for i in range(len(integer_encoded) - seq_length):
        dataX.append(integer_encoded[i:i + seq_length])
        dataY.append(integer_encoded[i + seq_length])

    dataX = np.array(dataX)  # shape (n_patterns, seq_length), integer token IDs
    dataY = to_categorical(np.array(dataY), num_classes=vocab_size)
    return dataX, dataY, vocab_size, label_encoder


def train_model(model, dataX, dataY):
    model.fit(dataX, dataY, epochs=10, batch_size=64, verbose=0)


def generate_rap(model, start_seq, label_encoder, seq_length, vocab_size, num_frames=50):
    token_seq = list(start_seq)  # integer token IDs of the seed sequence
    generated_frames = []
    for _ in range(num_frames):
        # Always feed the most recent seq_length tokens, matching input_length
        x_input = np.reshape(token_seq[-seq_length:], (1, seq_length))
        # Predict the next token
        predicted_probabilities = model.predict(x_input, verbose=0)[0]
        predicted_token = int(np.argmax(predicted_probabilities))
        token_seq.append(predicted_token)
        # Decode the token string back into a frame of MFCC coefficients
        frame = np.array(label_encoder.classes_[predicted_token].split(), dtype=float)
        generated_frames.append(frame)
    return np.array(generated_frames)


def train_and_generate(file_path):
    """Train on one uploaded file and return a generated MFCC sequence."""
    # Check the file extension
    if file_path is None or not file_path.lower().endswith(('.mp3', '.wav')):
        return "Invalid file type: please upload an MP3 or WAV file"

    # Extract features and prepare data
    features = extract_features(file_path)
    if features is None:
        return "Error extracting audio features, check input"

    seq_length = 10
    dataX, dataY, vocab_size, label_encoder = prepare_data([features], seq_length)
    if len(dataX) == 0:
        return "Audio too short to build training sequences"

    # Create and train the model
    model = create_model(seq_length, vocab_size)
    train_model(model, dataX, dataY)

    # Generate from a random seed sequence taken from the training data
    rand_index = np.random.randint(0, len(dataX))
    start_seq = dataX[rand_index]
    generated_mfcc_sequence = generate_rap(model, start_seq, label_encoder,
                                           seq_length, vocab_size)
    # Note: the output is a sequence of MFCC frames (numbers), not audio or lyrics
    return str(generated_mfcc_sequence)


# Gradio interface (`source` is the Gradio 3.x argument; Gradio 4+ uses sources=["upload"])
iface = gr.Interface(
    fn=train_and_generate,
    inputs=gr.Audio(source="upload", type="filepath", label="Upload MP3 or WAV File"),
    outputs=gr.Textbox(label="Generated Rap"),
    title="AI Rapper",
    description="Upload a rap song to train the model and generate a new rap verse",
)

if __name__ == "__main__":
    iface.launch()
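
# --- Example usage (a sketch, not part of the original script) ---
# The pipeline can also be exercised headlessly, without launching the
# Gradio UI. "sample.wav" is a hypothetical path and must point to a
# real recording on disk:
#
#     result = train_and_generate("sample.wav")
#     print(result[:200])  # first part of the generated MFCC sequence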