# =================================================================
#
# Merged and Integrated Script for Audio/MIDI Processing and Rendering (Stereo Enhanced)
#
# This script combines two functionalities:
# 1. Transcribing audio to MIDI using two methods:
#    a) A general-purpose model (basic-pitch by Spotify).
#    b) A model specialized for solo piano (ByteDance).
#    - Includes stereo processing by splitting channels, transcribing independently, and merging MIDI.
# 2. Applying advanced transformations and re-rendering MIDI files using:
#    a) Standard SoundFonts via FluidSynth (produces stereo audio).
#    b) A custom 8-bit style synthesizer for a chiptune sound (updated for stereo output).
#
# The user can upload an audio file (e.g., WAV, MP3) or a MIDI file.
# - If an audio file is uploaded, it is first transcribed to MIDI using the selected method.
# - The resulting MIDI (or an uploaded MIDI) can then be processed
#   with various effects and rendered into audio.
#
#================================================================
# Original sources:
# https://huggingface.co/spaces/asigalov61/ByteDance-Solo-Piano-Audio-to-MIDI-Transcription
# https://huggingface.co/spaces/asigalov61/Advanced-MIDI-Renderer
#================================================================
# Packages:
#
# sudo apt install fluidsynth
#
# =================================================================
# Requirements:
#
# pip install gradio torch pytz numpy scipy matplotlib networkx scikit-learn
# pip install piano_transcription_inference huggingface_hub
# pip install basic-pitch pretty_midi librosa soundfile pyloudnorm
#
# =================================================================
# Core modules:
#
# git clone --depth 1 https://github.com/asigalov61/tegridy-tools
#
# =================================================================

import os
import hashlib
import time as reqtime
import copy

import librosa
import pyloudnorm as pyln
import soundfile as sf

import torch
import gradio as gr

from src.piano_transcription.utils import initialize_app

from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate

# --- Import core transcription and MIDI processing libraries ---
from src import TMIDIX, TPLOTS
from src import MIDI
from src.midi_to_colab_audio import midi_to_colab_audio

# --- Imports for General Purpose Transcription (basic-pitch) ---
import basic_pitch
from basic_pitch.inference import predict
from basic_pitch import ICASSP_2022_MODEL_PATH

# --- Imports for 8-bit Synthesizer & MIDI Merging ---
import pretty_midi
import numpy as np
from scipy import signal

# =================================================================================================
# === Hugging Face SoundFont Downloader ===
# =================================================================================================
from huggingface_hub import hf_hub_download
import glob
# --- Define a constant for the 8-bit synthesizer option ---
SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)"


def prepare_soundfonts():
    """
    Make sure the default SoundFonts exist locally and index every available .sf2 file.

    Any default SoundFont missing from 'src/sf2' is downloaded from the
    Hugging Face Space repository. The directory is then scanned recursively
    for all .sf2 files.

    Returns:
        dict: Maps a display name (path relative to 'src/sf2', without the
        extension, using forward slashes) to the file path found by the scan.
        Default SoundFonts that were found come first, in their predefined
        order; all other SoundFonts follow in alphabetical order.
    """
    SF2_REPO_ID = "asigalov61/Advanced-MIDI-Renderer"
    SF2_DIR = "src/sf2"
    # This list only ensures the default files exist; its order is the display priority.
    DEFAULT_SF2_FILENAMES = [
        "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7.sf2",
        "Orpheus_18.06.2020.sf2",
        "Live HQ Natural SoundFont GM.sf2",
        "Nice-Strings-PlusOrchestra-v1.6.sf2",
        "KBH-Real-Choir-V2.5.sf2",
        "SuperGameBoy.sf2",
        "ProtoSquare.sf2",
    ]

    os.makedirs(SF2_DIR, exist_ok=True)

    # --- Step 1: Ensure default SoundFonts are available ---
    print("Checking for SoundFont files...")
    for filename in DEFAULT_SF2_FILENAMES:
        local_path = os.path.join(SF2_DIR, filename)
        if os.path.exists(local_path):
            # Already present locally; avoid re-downloading.
            continue

        print(f"Downloading '{filename}' from Hugging Face Hub...")
        try:
            hf_hub_download(
                repo_id=SF2_REPO_ID,
                repo_type='space',  # The repository is a Space
                filename=filename,  # Path of the file within the repository
                local_dir=SF2_DIR,
            )
            print(f"'{filename}' downloaded successfully.")
        except Exception as e:
            # Best effort: a failed download only means this SoundFont is
            # absent from the returned map (unless the scan still finds it).
            print(f"Error downloading {filename}: {e}")

    # --- Step 2: Scan the entire directory for all .sf2 files ---
    print(f"Scanning '{SF2_DIR}' for all .sf2 files...")
    all_sfs_map = {}
    search_pattern = os.path.join(SF2_DIR, '**', '*.sf2')
    for full_path in glob.glob(search_pattern, recursive=True):
        # Display name keeps the sub-folder (if any) and uses forward slashes for consistency.
        relative_path = os.path.relpath(full_path, SF2_DIR)
        display_name = os.path.splitext(relative_path)[0].replace("\\", "/")
        all_sfs_map[display_name] = full_path

    # --- Step 3: Create the final ordered dictionary based on priority ---
    default_display_names = [os.path.splitext(f)[0] for f in DEFAULT_SF2_FILENAMES]
    default_name_set = set(default_display_names)
    other_display_names = sorted(name for name in all_sfs_map if name not in default_name_set)

    # Defaults first (only those the scanner actually found), then the rest alphabetically.
    ordered_soundfont_map = {
        name: all_sfs_map[name] for name in default_display_names if name in all_sfs_map
    }
    for name in other_display_names:
        ordered_soundfont_map[name] = all_sfs_map[name]

    return ordered_soundfont_map


# =================================================================================================
# === 8-bit Style Synthesizer (Stereo Enabled) ===
# =================================================================================================
def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, pulse_width,
                          vibrato_rate, vibrato_depth, bass_boost_level, fs=44100):
    """
    Synthesize an 8-bit style stereo waveform from a PrettyMIDI-like object.

    Waveforms are generated manually (no FluidSynth). An optional sub-octave
    square wave can be mixed in as a bass boost. Panning follows instrument
    order: when there are two or more instruments the first is hard left and
    the second hard right; every other instrument (and a lone instrument) is
    centered.

    Args:
        midi_data: Object exposing `get_end_time()` and `instruments`; each
            instrument has `notes` with `start`, `end`, `pitch`, `velocity`.
        waveform_type (str): 'Square', 'Sawtooth' or 'Triangle'.
        envelope_type (str): 'Plucky (AD Envelope)' or 'Sustained (Full Decay)'.
            Any other value leaves the envelope at zero, i.e. silent notes.
        decay_time_s (float): Decay length in seconds for the plucky envelope.
        pulse_width (float): Duty cycle of the square wave.
        vibrato_rate (float): Vibrato LFO rate in Hz.
        vibrato_depth (float): Vibrato LFO amplitude.
        bass_boost_level (float): Mix level of the sub-octave oscillator; 0 disables it.
        fs (int): Output sample rate in Hz.

    Returns:
        np.ndarray: Array of shape (2, N); row 0 is left, row 1 is right.

    Raises:
        ValueError: If `waveform_type` is not one of the supported names.
    """
    supported_waveforms = ('Square', 'Sawtooth', 'Triangle')
    if waveform_type not in supported_waveforms:
        # Previously an unknown type left `note_waveform` unbound (NameError)
        # or silently reused the previous note's waveform.
        raise ValueError(
            f"Unsupported waveform_type {waveform_type!r}; expected one of {supported_waveforms}"
        )

    total_duration = midi_data.get_end_time()
    # Stereo buffer (2 channels: Left, Right) with one extra second of headroom.
    waveform = np.zeros((2, int(total_duration * fs) + fs))
    buffer_len = waveform.shape[1]
    num_instruments = len(midi_data.instruments)

    for i, instrument in enumerate(midi_data.instruments):
        # --- Panning Logic ---
        pan_l, pan_r = 0.707, 0.707  # centered by default
        if num_instruments >= 2:
            if i == 0:
                pan_l, pan_r = 1.0, 0.0  # first instrument: left
            elif i == 1:
                pan_l, pan_r = 0.0, 1.0  # second instrument: right

        for note in instrument.notes:
            note_duration = note.end - note.start
            num_samples = int(note_duration * fs)
            if num_samples <= 0:
                # Zero-length or reversed notes produce no audio.
                continue

            start_sample = int(note.start * fs)
            if start_sample >= buffer_len:
                continue

            freq = pretty_midi.note_number_to_hz(note.pitch)
            t = np.linspace(0., note_duration, num_samples, endpoint=False)

            # --- Vibrato LFO ---
            # NOTE(review): the phase below is 2*pi*(freq + lfo(t))*t rather than
            # the integral of the instantaneous frequency, so the effective
            # frequency deviation grows with t on long notes. Kept as-is to
            # preserve the current sound; confirm whether this is intended.
            vibrato_lfo = vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
            phase = 2 * np.pi * (freq + vibrato_lfo) * t

            # --- Waveform Generation (Main Oscillator) ---
            if waveform_type == 'Square':
                note_waveform = signal.square(phase, duty=pulse_width)
            elif waveform_type == 'Sawtooth':
                note_waveform = signal.sawtooth(phase)
            else:  # 'Triangle'
                note_waveform = signal.sawtooth(phase, width=0.5)

            # --- Bass Boost (Sub-Octave Oscillator) ---
            if bass_boost_level > 0:
                bass_freq = freq / 2.0
                # Only add bass if the frequency is reasonably audible.
                if bass_freq > 20:
                    # Plain square wave without vibrato, for stability.
                    bass_sub_waveform = signal.square(2 * np.pi * bass_freq * t, duty=0.5)
                    # Lower the main level as the bass level rises to limit clipping.
                    main_level = 1.0 - (0.5 * bass_boost_level)
                    note_waveform = (note_waveform * main_level) + (bass_sub_waveform * bass_boost_level)

            # --- Envelope ---
            start_amp = note.velocity / 127.0
            envelope = np.zeros(num_samples)
            if envelope_type == 'Plucky (AD Envelope)':
                attack_time_s = 0.005
                attack_samples = min(int(attack_time_s * fs), num_samples)
                decay_samples = min(int(decay_time_s * fs), num_samples - attack_samples)
                envelope[:attack_samples] = np.linspace(0, start_amp, attack_samples)
                if decay_samples > 0:
                    envelope[attack_samples:attack_samples + decay_samples] = np.linspace(
                        start_amp, 0, decay_samples
                    )
            elif envelope_type == 'Sustained (Full Decay)':
                envelope = np.linspace(start_amp, 0, num_samples)

            # Apply the envelope to the (potentially combined) waveform.
            note_waveform *= envelope

            end_sample = start_sample + num_samples
            if end_sample > buffer_len:
                end_sample = buffer_len
                note_waveform = note_waveform[:end_sample - start_sample]

            # Mix the mono note into the stereo buffer with panning.
            waveform[0, start_sample:end_sample] += note_waveform * pan_l
            waveform[1, start_sample:end_sample] += note_waveform * pan_r

    return waveform  # (2, N) numpy array
def analyze_midi_velocity(midi_path):
    """
    Print per-instrument and overall note-velocity statistics for a MIDI file.

    Args:
        midi_path (str): Path of the MIDI file to load with pretty_midi.

    Returns:
        None. The statistics are only printed.
    """
    midi = pretty_midi.PrettyMIDI(midi_path)
    all_velocities = []

    print(f"Analyzing velocity for MIDI: {midi_path}")
    for i, instrument in enumerate(midi.instruments):
        velocities = [note.velocity for note in instrument.notes]
        all_velocities.extend(velocities)

        if velocities:
            print(f"Instrument {i} ({instrument.name}):")
            print(f" Notes count: {len(velocities)}")
            print(f" Velocity min: {min(velocities)}")
            print(f" Velocity max: {max(velocities)}")
            print(f" Velocity mean: {np.mean(velocities):.2f}")
        else:
            print(f"Instrument {i} ({instrument.name}): no notes found.")

    if all_velocities:
        print("\nOverall MIDI velocity stats:")
        print(f" Total notes: {len(all_velocities)}")
        print(f" Velocity min: {min(all_velocities)}")
        print(f" Velocity max: {max(all_velocities)}")
        print(f" Velocity mean: {np.mean(all_velocities):.2f}")
    else:
        print("No notes found in this MIDI.")


def scale_instrument_velocity(instrument, scale=0.8):
    """
    Scale the velocity of every note of an instrument in place.

    The scaled value is truncated to an integer and clamped to 1..127.

    Args:
        instrument: Object with a `notes` sequence whose items have a
            mutable `velocity` attribute.
        scale (float): Multiplier applied to each velocity.
    """
    for note in instrument.notes:
        note.velocity = max(1, min(127, int(note.velocity * scale)))


def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
    """
    Normalize audio to a target integrated loudness (LUFS).

    This gives a more consistent perceived volume than peak normalization.
    If the measurement fails, or the measured loudness is not a finite number,
    the input is returned unchanged.

    Args:
        audio_data (np.ndarray): The audio signal.
        sample_rate (int): The sample rate of the audio.
        target_lufs (float): The target loudness in LUFS. Defaults to -23.0,
            a common standard for broadcast.

    Returns:
        np.ndarray: The loudness-normalized audio, or the original
        `audio_data` when normalization is not possible.
    """
    try:
        # 1. Measure the integrated loudness of the input audio.
        meter = pyln.Meter(sample_rate)
        loudness = meter.integrated_loudness(audio_data)

        # A non-finite measurement (e.g. -inf, which pyloudnorm presumably
        # reports for digital silence) would make the gain infinite and fill
        # the output with NaN/inf that the peak check below cannot catch.
        if not np.isfinite(loudness):
            print(f"Loudness normalization skipped: measured loudness is {loudness} LUFS. Using original audio.")
            return audio_data

        # 2. Gain needed to reach the target, converted from dB to linear.
        loudness_gain_db = target_lufs - loudness
        loudness_gain_linear = 10.0 ** (loudness_gain_db / 20.0)

        # 3. Apply the gain.
        normalized_audio = audio_data * loudness_gain_linear

        # 4. Safety net: peak-normalize if the gain pushed peaks above 1.0.
        peak_val = np.max(np.abs(normalized_audio))
        if peak_val > 1.0:
            normalized_audio /= peak_val
            print("Warning: Loudness normalization resulted in clipping. Audio was peak-normalized as a safeguard.")

        print(f"Audio normalized from {loudness:.2f} LUFS to target {target_lufs} LUFS.")
        return normalized_audio
    except Exception as e:
        # Deliberate best effort: transcription can proceed on the raw audio.
        print(f"Loudness normalization failed: {e}. Falling back to original audio.")
        return audio_data
# =================================================================================================
# === MIDI Merging Function ===
# =================================================================================================
def merge_midis(midi_path_left, midi_path_right, output_path):
    """
    Merge the left-channel and right-channel MIDI files into one panned MIDI file.

    Every instrument of both files is kept. Each instrument has its note
    velocities scaled by 0.8, its name prefixed with "Left - " or "Right - ",
    and a Pan control change (controller 10) inserted at time 0.0 with value
    0 (hard left) or 127 (hard right).

    Args:
        midi_path_left (str): MIDI transcribed from the left audio channel.
        midi_path_right (str): MIDI transcribed from the right audio channel.
        output_path (str): Where the merged MIDI file is written.

    Returns:
        str | None: `output_path` on success. If merging fails, the path of
        whichever source MIDI exists (left preferred), or None if neither does.
    """
    try:
        analyze_midi_velocity(midi_path_left)
        analyze_midi_velocity(midi_path_right)

        midi_left = pretty_midi.PrettyMIDI(midi_path_left)
        midi_right = pretty_midi.PrettyMIDI(midi_path_right)
        merged_midi = pretty_midi.PrettyMIDI()

        # (log label, instrument name prefix, source MIDI, pan value)
        channels = (
            ("left", "Left", midi_left, 0),
            ("right", "Right", midi_right, 127),
        )
        for side, prefix, source_midi, pan_value in channels:
            if source_midi.instruments:
                print(f"Found {len(source_midi.instruments)} instrument(s) in the {side} channel MIDI.")

            for instrument in source_midi.instruments:
                scale_instrument_velocity(instrument, scale=0.8)
                # Prefix the name so the channel of origin stays visible.
                instrument.name = f"{prefix} - {instrument.name if instrument.name else 'Instrument'}"
                # Pan is controller number 10; insert it first, at time 0.0.
                pan_event = pretty_midi.ControlChange(number=10, value=pan_value, time=0.0)
                instrument.control_changes.insert(0, pan_event)
                merged_midi.instruments.append(instrument)

        merged_midi.write(output_path)
        print(f"Successfully merged all instruments and panned into '{os.path.basename(output_path)}'")
        analyze_midi_velocity(output_path)
        return output_path

    except Exception as e:
        # Best effort: a failed merge should still leave something to render.
        print(f"Error merging MIDI files: {e}")
        if os.path.exists(midi_path_left):
            print("Fallback: Using only the left channel MIDI.")
            return midi_path_left
        if os.path.exists(midi_path_right):
            print("Fallback: Using only the right channel MIDI.")
            return midi_path_right
        return None
# =================================================================================================
# === Stage 1: Audio to MIDI Transcription Functions ===
# =================================================================================================
def TranscribePianoAudio(input_file):
    """
    Transcribe a WAV or MP3 file of a SOLO PIANO performance into a MIDI file.

    This uses the ByteDance piano transcription model.

    Args:
        input_file (str): The path to the input audio file.

    Returns:
        str: The file path of the generated MIDI file.
    """
    print('=' * 70)
    print('STAGE 1: Starting Piano-Specific Transcription')
    print('=' * 70)

    # Derive the MIDI name from the audio name. splitext strips only the final
    # extension, so names that differ after an inner dot (e.g. "my.song_left.wav"
    # and "my.song_right.wav") no longer collapse to the same output file.
    fn = os.path.basename(input_file)
    fn1 = os.path.splitext(fn)[0]

    output_dir = os.path.join("output", "transcribed_piano_")
    out_mid_path = os.path.join(output_dir, fn1 + '.mid')
    os.makedirs(output_dir, exist_ok=True)

    print('-' * 70)
    print(f'Input file name: {fn}')
    print(f'Output MIDI path: {out_mid_path}')
    print('-' * 70)

    # Load audio as mono at the sample rate the model expects.
    print('Loading audio...')
    (audio, _) = utilities.load_audio(input_file, sr=transcription_sample_rate, mono=True)
    print('Audio loaded successfully.')
    print('-' * 70)

    # Use 'cuda' if a GPU is available and configured, otherwise 'cpu'.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Loading transcriptor model... device= {device}')
    transcriptor = PianoTranscription(
        device=device,
        checkpoint_path="src/models/CRNN_note_F1=0.9677_pedal_F1=0.9186.pth"
    )
    print('Transcriptor loaded.')
    print('-' * 70)

    print('Transcribing audio to MIDI (Piano-Specific)...')
    # This call saves the MIDI file to the specified path.
    transcriptor.transcribe(audio, out_mid_path)
    print('Piano transcription complete.')
    print('=' * 70)

    return out_mid_path


def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
                           infer_onsets_bool, melodia_trick_bool, multiple_bends_bool):
    """
    Transcribe a general audio file into a MIDI file using basic-pitch.

    This is suitable for various instruments and vocals.

    Args:
        input_file (str): The path to the input audio file.
        onset_thresh: Passed to basic-pitch as `onset_threshold`.
        frame_thresh: Passed as `frame_threshold`.
        min_note_len: Passed as `minimum_note_length`.
        min_freq: Passed as `minimum_frequency`.
        max_freq: Passed as `maximum_frequency`.
        infer_onsets_bool: Passed as `infer_onsets`.
        melodia_trick_bool: Passed as `melodia_trick`.
        multiple_bends_bool: Passed as `multiple_pitch_bends`.

    Returns:
        str: The file path of the generated MIDI file.
    """
    print('=' * 70)
    print('STAGE 1: Starting General Purpose Transcription')
    print('=' * 70)

    # splitext strips only the final extension, so names that differ after an
    # inner dot do not map to the same output file.
    fn = os.path.basename(input_file)
    fn1 = os.path.splitext(fn)[0]
    output_dir = os.path.join("output", "transcribed_general_")
    out_mid_path = os.path.join(output_dir, fn1 + '.mid')
    os.makedirs(output_dir, exist_ok=True)

    print(f'Input file: {fn}\nOutput MIDI: {out_mid_path}')

    # --- Perform transcription using basic-pitch ---
    print('Transcribing audio to MIDI (General Purpose)...')
    # predict() loads the audio itself; only the MIDI object of its result is needed here.
    _, midi_data, _ = basic_pitch.inference.predict(
        audio_path=input_file,
        model_or_model_path=ICASSP_2022_MODEL_PATH,
        onset_threshold=onset_thresh,
        frame_threshold=frame_thresh,
        minimum_note_length=min_note_len,
        minimum_frequency=min_freq,
        maximum_frequency=max_freq,
        infer_onsets=infer_onsets_bool,
        melodia_trick=melodia_trick_bool,
        multiple_pitch_bends=multiple_bends_bool
    )

    # --- Save the MIDI file ---
    midi_data.write(out_mid_path)

    print('General transcription complete.')
    print('=' * 70)

    return out_mid_path
# =================================================================================================
# === Stage 2: MIDI Transformation and Rendering Function ===
# =================================================================================================
def Render_MIDI(input_midi_path,
                render_type,
                soundfont_bank,
                render_sample_rate,
                render_with_sustains,
                merge_misaligned_notes,
                custom_render_patch,
                render_align,
                render_transpose_value,
                render_transpose_to_C4,
                render_output_as_solo_piano,
                render_remove_drums,
                # --- 8-bit synth params ---
                s8bit_waveform_type,
                s8bit_envelope_type,
                s8bit_decay_time_s,
                s8bit_pulse_width,
                s8bit_vibrato_rate,
                s8bit_vibrato_depth,
                s8bit_bass_boost_level
                ):
    """
    Process and render a MIDI file according to user-defined settings.

    The MIDI is read, optionally transformed (depending on `render_type` and
    the other options), written to 'output/rendered_midi/<name>_rendered.mid',
    and rendered to audio with either a SoundFont or the 8-bit synthesizer.

    Args:
        input_midi_path (str): The path to the input MIDI file.
        render_type (str): Transformation to apply; "Render as-is" copies the
            input bytes unchanged.
        soundfont_bank (str): A key of the module-level `soundfonts_dict`, or
            `SYNTH_8_BIT_LABEL` to use the 8-bit synthesizer.
        render_sample_rate: Output sample rate; converted with `int()`.
        All other arguments are rendering options from the Gradio UI.

    Returns:
        A 7-element sequence for the Gradio UI: (md5 hash of the rendered
        MIDI, base file name, MIDI summary text, rendered MIDI path,
        (sample_rate, audio), score plot, song description). A list of seven
        None values is returned when the input file is missing or 8-bit
        synthesis fails; placeholder values are returned when the MIDI has no
        notes.

    Raises:
        gr.Error: If a SoundFont is requested but none is available.
    """
    print('*' * 70)
    print('STAGE 2: Starting MIDI Rendering')
    print('*' * 70)

    # --- File and Settings Setup ---
    fn = os.path.basename(input_midi_path)
    # splitext strips only the final extension, so dotted names such as
    # "my.song_merged.mid" keep their full stem.
    fn1 = os.path.splitext(fn)[0]

    output_dir = os.path.join("output", "rendered_midi")
    os.makedirs(output_dir, exist_ok=True)
    new_fn_path = os.path.join(output_dir, fn1 + '_rendered.mid')

    try:
        with open(input_midi_path, 'rb') as f:
            fdata = f.read()
        input_midi_md5hash = hashlib.md5(fdata).hexdigest()
    except FileNotFoundError:
        print(f"Error: Input MIDI file not found at {input_midi_path}")
        return [None] * 7  # Return empty values for all outputs

    print('=' * 70)
    print('Requested settings:')
    print(f'Input MIDI file name: {fn}')
    print(f'Input MIDI md5 hash: {input_midi_md5hash}')
    print('-' * 70)
    print(f'Render type: {render_type}')
    print(f'Soundfont bank: {soundfont_bank}')
    print(f'Audio render sample rate: {render_sample_rate}')
    print('=' * 70)

    # --- MIDI Processing using TMIDIX ---
    print('Processing MIDI... Please wait...')
    raw_score = MIDI.midi2single_track_ms_score(fdata)
    escore = TMIDIX.advanced_score_processor(raw_score,
                                             return_enhanced_score_notes=True,
                                             apply_sustain=render_with_sustains
                                             )[0]

    # Handle cases where the MIDI might not contain any notes.
    if not escore:
        print("Warning: MIDI file contains no processable notes.")
        return ("N/A", fn1, "MIDI file contains no notes.", None, None, None, "No notes found.")

    # Assumes merge_misaligned_notes is numeric (supplied by the UI) — TODO confirm.
    if merge_misaligned_notes > 0:
        escore = TMIDIX.merge_escore_notes(escore, merge_threshold=merge_misaligned_notes)

    escore = TMIDIX.augment_enhanced_score_notes(escore, timings_divider=1)

    first_note_index = [e[0] for e in raw_score[1]].index('note')
    cscore = TMIDIX.chordify_score([1000, escore])

    # Events before the first note, plus the first and last notes and the last raw event.
    meta_data = raw_score[1][:first_note_index] + [escore[0]] + [escore[-1]] + [raw_score[1][-1]]

    aux_escore_notes = TMIDIX.augment_enhanced_score_notes(escore, sort_drums_last=True)
    song_description = TMIDIX.escore_notes_to_text_description(aux_escore_notes)

    print('Done!')
    print('=' * 70)
    print('Input MIDI metadata:', meta_data[:5])
    print('=' * 70)
    print('Input MIDI song description:', song_description)
    print('=' * 70)
    print('Processing...Please wait...')

    # A deep copy of the score to be modified.
    output_score = copy.deepcopy(escore)

    # Apply transformations based on render_type.
    if render_type == "Extract melody":
        output_score = TMIDIX.add_melody_to_enhanced_score_notes(escore, return_melody=True)
        output_score = TMIDIX.recalculate_score_timings(output_score)
    elif render_type == "Flip":
        output_score = TMIDIX.flip_enhanced_score_notes(escore)
    elif render_type == "Reverse":
        output_score = TMIDIX.reverse_enhanced_score_notes(escore)
    elif render_type == 'Repair Durations':
        output_score = TMIDIX.fix_escore_notes_durations(escore, min_notes_gap=0)
    elif render_type == 'Repair Chords':
        fixed_cscore = TMIDIX.advanced_check_and_fix_chords_in_chordified_score(cscore)[0]
        output_score = TMIDIX.flatten(fixed_cscore)
    elif render_type == 'Remove Duplicate Pitches':
        output_score = TMIDIX.remove_duplicate_pitches_from_escore_notes(escore)
    elif render_type == "Add Drum Track":
        nd_escore = [e for e in escore if e[3] != 9]
        nd_escore = TMIDIX.augment_enhanced_score_notes(nd_escore)
        output_score = TMIDIX.advanced_add_drums_to_escore_notes(nd_escore)
        for e in output_score:
            e[1] *= 16
            e[2] *= 16

    print('MIDI processing complete.')
    print('=' * 70)

    # --- Final Processing and Patching ---
    if render_type != "Render as-is":
        print('Applying final adjustments (transpose, align, patch)...')

        if custom_render_patch != -1:  # -1 indicates no change
            for e in output_score:
                if e[3] != 9:  # not a drum channel
                    e[6] = custom_render_patch

        if render_transpose_value != 0:
            output_score = TMIDIX.transpose_escore_notes(output_score, render_transpose_value)

        if render_transpose_to_C4:
            output_score = TMIDIX.transpose_escore_notes_to_pitch(output_score, 60)  # C4 is MIDI pitch 60

        if render_align == "Start Times":
            output_score = TMIDIX.recalculate_score_timings(output_score)
            output_score = TMIDIX.align_escore_notes_to_bars(output_score)
        elif render_align == "Start Times and Durations":
            output_score = TMIDIX.recalculate_score_timings(output_score)
            output_score = TMIDIX.align_escore_notes_to_bars(output_score, trim_durations=True)
        elif render_align == "Start Times and Split Durations":
            output_score = TMIDIX.recalculate_score_timings(output_score)
            output_score = TMIDIX.align_escore_notes_to_bars(output_score, split_durations=True)

        if render_type == "Longest Repeating Phrase":
            zscore = TMIDIX.recalculate_score_timings(output_score)
            lrno_score = TMIDIX.escore_notes_lrno_pattern_fast(zscore)
            if lrno_score is not None:
                output_score = lrno_score
            else:
                output_score = TMIDIX.recalculate_score_timings(
                    TMIDIX.escore_notes_middle(output_score, 50)
                )

        if render_type == "Multi-Instrumental Summary":
            zscore = TMIDIX.recalculate_score_timings(output_score)
            c_escore_notes = TMIDIX.compress_patches_in_escore_notes_chords(zscore)
            if len(c_escore_notes) > 128:
                cmatrix = TMIDIX.escore_notes_to_image_matrix(c_escore_notes,
                                                              filter_out_zero_rows=True,
                                                              filter_out_duplicate_rows=True)
                smatrix = TPLOTS.square_image_matrix(
                    cmatrix,
                    num_pca_components=max(1, min(5, len(c_escore_notes) // 128))
                )
                output_score = TMIDIX.image_matrix_to_original_escore_notes(smatrix)
                for o in output_score:
                    o[1] *= 250
                    o[2] *= 250

        if render_output_as_solo_piano:
            output_score = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=(not render_remove_drums))

        if render_remove_drums and not render_output_as_solo_piano:
            output_score = TMIDIX.strip_drums_from_escore_notes(output_score)

        if render_type == "Solo Piano Summary":
            sp_escore_notes = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=False)
            zscore = TMIDIX.recalculate_score_timings(sp_escore_notes)
            if len(zscore) > 128:
                bmatrix = TMIDIX.escore_notes_to_binary_matrix(zscore)
                cmatrix = TMIDIX.compress_binary_matrix(bmatrix, only_compress_zeros=True)
                smatrix = TPLOTS.square_binary_matrix(
                    cmatrix,
                    interpolation_order=max(1, min(5, len(zscore) // 128))
                )
                output_score = TMIDIX.binary_matrix_to_original_escore_notes(smatrix)
                for o in output_score:
                    o[1] *= 200
                    o[2] *= 200

        print('Final adjustments complete.')
        print('=' * 70)

        # --- Saving Processed MIDI File ---
        SONG, patches, _ = TMIDIX.patch_enhanced_score_notes(output_score)

        # Per the original author's note, the converter appends '.mid' itself,
        # so it is given the path without the extension.
        path_without_ext = new_fn_path.rsplit('.mid', 1)[0]

        TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(SONG,
                                                 output_signature='Integrated-MIDI-Processor',
                                                 output_file_name=path_without_ext,
                                                 track_name='Processed Track',
                                                 list_of_MIDI_patches=patches
                                                 )
        midi_to_render_path = new_fn_path
    else:
        # "Render as-is": write the original MIDI bytes unchanged.
        with open(new_fn_path, 'wb') as f:
            f.write(fdata)
        midi_to_render_path = new_fn_path

    # --- Audio Rendering ---
    print('Rendering final audio...')
    srate = int(render_sample_rate)

    # --- Conditional Rendering Logic ---
    if soundfont_bank == SYNTH_8_BIT_LABEL:
        print("Using 8-bit style synthesizer...")
        try:
            # Load the MIDI file with pretty_midi for manual synthesis.
            midi_data_for_synth = pretty_midi.PrettyMIDI(midi_to_render_path)
            audio = synthesize_8bit_style(
                midi_data_for_synth,
                s8bit_waveform_type,
                s8bit_envelope_type,
                s8bit_decay_time_s,
                s8bit_pulse_width,
                s8bit_vibrato_rate,
                s8bit_vibrato_depth,
                s8bit_bass_boost_level,
                fs=srate
            )
            # Peak-normalize, then transpose (2, N) -> (N, 2) and convert to int16 for Gradio.
            peak_val = np.max(np.abs(audio))
            if peak_val > 0:
                audio /= peak_val
            audio_out = (audio.T * 32767).astype(np.int16)
        except Exception as e:
            print(f"Error during 8-bit synthesis: {e}")
            return [None] * 7
    else:
        print(f"Using SoundFont: {soundfont_bank}")
        # Resolve the display name through the module-level soundfonts_dict.
        soundfont_path = soundfonts_dict.get(soundfont_bank)

        if not soundfont_path or not os.path.exists(soundfont_path):
            error_msg = f"SoundFont '{soundfont_bank}' not found!"
            print(f"ERROR: {error_msg}")
            # Fall back to the first available SoundFont if possible.
            if soundfonts_dict:
                fallback_key = list(soundfonts_dict.keys())[0]
                soundfont_path = soundfonts_dict[fallback_key]
                print(f"Falling back to '{fallback_key}'.")
            else:
                raise gr.Error("No SoundFonts are available for rendering!")

        with open(midi_to_render_path, 'rb') as f:
            midi_file_content = f.read()

        audio_out = midi_to_colab_audio(midi_file_content,
                                        soundfont_path=soundfont_path,
                                        sample_rate=srate,
                                        output_for_gradio=True
                                        )

    print('Audio rendering complete.')
    print('=' * 70)

    # --- Preparing Outputs for Gradio ---
    with open(midi_to_render_path, 'rb') as f:
        new_md5_hash = hashlib.md5(f.read()).hexdigest()
    output_plot = TPLOTS.plot_ms_SONG(output_score, plot_title=f"Score of {fn1}", return_plt=True)
    output_midi_summary = str(meta_data)

    return new_md5_hash, fn1, output_midi_summary, midi_to_render_path, (srate, audio_out), output_plot, song_description
# =================================================================================================
# === Main Application Logic ===
# =================================================================================================
def process_and_render_file(input_file,
                            # --- Transcription params ---
                            enable_stereo_processing,
                            transcription_method,
                            onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
                            infer_onsets_bool, melodia_trick_bool, multiple_bends_bool,
                            # --- MIDI rendering params ---
                            render_type, soundfont_bank, render_sample_rate,
                            render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
                            render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
                            # --- 8-bit synth params ---
                            s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                            s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level
                            ):
    """
    Main entry point: transcribe the uploaded file if needed, then render it.

    MIDI files ('.mid', '.midi', '.kar') go straight to rendering. Any other
    file is treated as audio: it is loaded, loudness-normalized, transcribed
    to MIDI (per channel when stereo processing is enabled and the audio has
    exactly two channels, otherwise as mono), and the resulting MIDI is
    rendered.

    Args:
        input_file (str | None): Path of the uploaded file, or None.
        enable_stereo_processing (bool): Transcribe left/right channels separately.
        transcription_method (str): "General Purpose" selects basic-pitch;
            any other value selects the piano-specific model.
        All remaining arguments are forwarded to the transcription or
        rendering functions unchanged.

    Returns:
        The 7 outputs produced by `Render_MIDI`, or 7 clearing updates when
        `input_file` is None.

    Raises:
        gr.Error: If the audio cannot be loaded or transcription fails.
    """
    start_time = reqtime.time()
    if input_file is None:
        # Return a list of updates to clear all output fields.
        return [gr.update(value=None)] * 7

    # The input is already a file path (string), not a temporary file object.
    input_file_path = input_file
    filename = os.path.basename(input_file_path)
    print(f"Processing new file: {filename}")

    def transcribe(audio_path):
        """Transcribe one audio file with the selected method and return the MIDI path."""
        if transcription_method == "General Purpose":
            return TranscribeGeneralAudio(
                audio_path, onset_thresh, frame_thresh, min_note_len,
                min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
            )
        return TranscribePianoAudio(audio_path)  # Piano-Specific

    # --- Step 1: Check file type and transcribe if necessary ---
    if filename.lower().endswith(('.mid', '.midi', '.kar')):
        print("MIDI file detected. Proceeding directly to rendering.")
        midi_path_for_rendering = input_file_path
    else:
        print("Audio file detected. Starting transcription...")

        # Decode the audio only on this branch: attempting to decode a MIDI
        # upload as audio would fail before the MIDI branch was ever reached.
        try:
            audio_data, native_sample_rate = librosa.load(input_file_path, sr=None, mono=False)
        except Exception as e:
            raise gr.Error(f"Failed to load audio file: {e}") from e

        base_name = os.path.splitext(filename)[0]
        temp_dir = "output/temp_normalized"
        os.makedirs(temp_dir, exist_ok=True)

        # === STEREO PROCESSING LOGIC ===
        if enable_stereo_processing:
            if audio_data.ndim != 2 or audio_data.shape[0] != 2:
                print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
                enable_stereo_processing = False  # Disable stereo processing if audio is not stereo

        if enable_stereo_processing:
            print("Stereo processing enabled. Splitting channels...")
            try:
                left_channel = audio_data[0]
                right_channel = audio_data[1]

                normalized_left = normalize_loudness(left_channel, native_sample_rate)
                normalized_right = normalize_loudness(right_channel, native_sample_rate)

                temp_left_wav_path = os.path.join(temp_dir, f"{base_name}_left.wav")
                temp_right_wav_path = os.path.join(temp_dir, f"{base_name}_right.wav")

                sf.write(temp_left_wav_path, normalized_left, native_sample_rate)
                sf.write(temp_right_wav_path, normalized_right, native_sample_rate)

                print(f"Saved left channel to: {temp_left_wav_path}")
                print(f"Saved right channel to: {temp_right_wav_path}")

                print("Transcribing left channel...")
                midi_path_left = transcribe(temp_left_wav_path)

                print("Transcribing right channel...")
                midi_path_right = transcribe(temp_right_wav_path)

                if midi_path_left and midi_path_right:
                    merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
                    midi_path_for_rendering = merge_midis(midi_path_left, midi_path_right, merged_midi_path)
                    if midi_path_for_rendering is None:
                        # merge_midis found nothing usable to fall back on.
                        raise gr.Error("Merging the left and right channel MIDI files failed.")
                elif midi_path_left:
                    print("Warning: Right channel transcription failed. Using left channel only.")
                    midi_path_for_rendering = midi_path_left
                elif midi_path_right:
                    print("Warning: Left channel transcription failed. Using right channel only.")
                    midi_path_for_rendering = midi_path_right
                else:
                    raise gr.Error("Both left and right channel transcriptions failed.")

            except Exception as e:
                print(f"An error occurred during stereo processing: {e}")
                raise gr.Error(f"Stereo Processing Failed: {e}")
        else:
            print("Stereo processing disabled. Using standard mono transcription.")
            if audio_data.ndim == 1:
                mono_signal = audio_data
            else:
                # Down-mix by averaging across the channel axis.
                mono_signal = np.mean(audio_data, axis=0)

            normalized_mono = normalize_loudness(mono_signal, native_sample_rate)

            temp_mono_wav_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
            sf.write(temp_mono_wav_path, normalized_mono, native_sample_rate)

            try:
                midi_path_for_rendering = transcribe(temp_mono_wav_path)
                analyze_midi_velocity(midi_path_for_rendering)
            except Exception as e:
                print(f"An error occurred during transcription: {e}")
                raise gr.Error(f"Transcription Failed: {e}")

    # --- Step 2: Render the MIDI file with selected options ---
    print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}")
    results = Render_MIDI(midi_path_for_rendering,
                          render_type, soundfont_bank, render_sample_rate,
                          render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
                          render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
                          s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                          s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level)

    print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
    print('*' * 70)

    return results


# =================================================================================================
# === Gradio UI Setup ===
# =================================================================================================

def update_ui_visibility(transcription_method, soundfont_choice):
    """
    Dynamically updates the visibility of UI components based on user selections.

    Args:
        transcription_method: Current value of the transcription method radio.
        soundfont_choice: Current value of the SoundFont / synthesizer dropdown.

    Returns:
        A dict keyed by the two accordion components (module-level names
        created by the UI script below) with the `gr.update` to apply to each:
        the general transcription settings are shown only for
        "General Purpose", the 8-bit settings only for the 8-bit synthesizer.
    """
    is_general = (transcription_method == "General Purpose")
    is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL)

    return {
        general_transcription_settings: gr.update(visible=is_general),
        synth_8bit_settings: gr.update(visible=is_8bit),
    }


if __name__ == "__main__":
    # Initialize the app before building the UI.
    # NOTE(review): presumably downloads the model (if needed) and applies
    # patches, per the original comment — confirm against initialize_app.
    initialize_app()

    # --- Prepare soundfonts and make the map globally accessible ---
    # This runs at module scope, so the assignment already creates a module
    # global; no `global` statement is needed.
    # On application start, download SoundFonts from Hugging Face Hub if they don't exist.
    soundfonts_dict = prepare_soundfonts()
    print(f"Found {len(soundfonts_dict)} local SoundFonts.")

    if not soundfonts_dict:
        print("\nWARNING: No SoundFonts were found or could be downloaded.")
        print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")

    app = gr.Blocks(theme=gr.themes.Base())

    # NOTE(review): the source as received carries no indentation; the nesting
    # of the layout containers below was reconstructed from statement order and
    # section comments — confirm against the intended layout.
    with app:
        # NOTE(review): this title literal arrived split across raw line breaks
        # (heading markup appears to have been lost); restored as a Markdown
        # heading — confirm the intended markup.
        gr.Markdown("# Audio-to-MIDI & Advanced Renderer")
        gr.Markdown(
            "**Upload a Audio for transcription-then-rendering, or a MIDI for rendering-only.**\n\n"
            "This application combines piano audio transcription with a powerful MIDI transformation and rendering toolkit. "
            "Based on the work of [asigalov61](https://github.com/asigalov61)."
        )

        with gr.Row():
            # Shared by the input and output audio players.
            waveform_options = gr.WaveformOptions(show_recording_waveform=False)

            with gr.Column(scale=1):
                # --- INPUT COLUMN ---
                gr.Markdown("## 1. Upload File")

                # gr.Audio (rather than gr.File) allows an audio preview.
                # type="filepath" ensures the component returns a string path to the uploaded file.
                # The component will show a player for supported audio types (e.g., WAV, MP3).
                input_file = gr.Audio(
                    label="Input Audio or MIDI File",
                    type="filepath",
                    sources=["upload"],
                    waveform_options=waveform_options
                )

                gr.Markdown("## 2. Configure Processing")

                # --- Transcription Method Selector ---
                transcription_method = gr.Radio(
                    ["General Purpose", "Piano-Specific"],
                    label="Audio Transcription Method",
                    value="General Purpose",
                    info="Choose 'General Purpose' for most music (vocals, etc.). Choose 'Piano-Specific' only for solo piano recordings."
                )

                # --- Stereo Processing Checkbox ---
                enable_stereo_processing = gr.Checkbox(
                    label="Enable Stereo Transcription",
                    value=False,
                    info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
                )

                with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
                    onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
                    frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
                    minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.")
                    minimum_frequency = gr.Slider(0, 500, value=60, step=5, label="Minimum Frequency (Hz)", info="Ignores pitches below this frequency.")
                    maximum_frequency = gr.Slider(501, 10000, value=4000, step=10, label="Maximum Frequency (Hz)", info="Ignores pitches above this frequency.")
                    infer_onsets = gr.Checkbox(value=True, label="Infer Onsets (Boost Onsets)")
                    melodia_trick = gr.Checkbox(value=True, label="Melodia Trick (Contour Optimization)")
                    multiple_pitch_bends = gr.Checkbox(value=False, label="Allow Multiple Pitch Bends")

                # --- Rendering Settings ---
                render_type = gr.Radio(
                    ["Render as-is", "Custom render", "Extract melody", "Flip", "Reverse", "Repair Durations", "Repair Chords", "Remove Duplicate Pitches", "Longest Repeating Phrase", "Multi-Instrumental Summary", "Solo Piano Summary", "Add Drum Track"],
                    label="MIDI Transformation Render Type",
                    value="Render as-is",
                    info="Apply transformations to the MIDI before rendering. Select 'Render as-is' for basic rendering or other options for transformations."
                )

                # --- SoundFont Bank with 8-bit option ---
                # --- Dynamically create the list of choices ---
                soundfont_choices = [SYNTH_8_BIT_LABEL] + list(soundfonts_dict.keys())
                # Set a safe default value: the preferred SoundFont when present,
                # otherwise the first choice (always the 8-bit synthesizer, since
                # the list is never empty).
                preferred_soundfont = "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7"
                if preferred_soundfont in soundfonts_dict:
                    default_sf_choice = preferred_soundfont
                else:
                    default_sf_choice = soundfont_choices[0]

                soundfont_bank = gr.Dropdown(
                    soundfont_choices,
                    label="SoundFont / Synthesizer",
                    value=default_sf_choice
                )

                render_sample_rate = gr.Radio(
                    ["16000", "32000", "44100"],
                    label="Audio Sample Rate",
                    value="44100"
                )

                # --- 8-bit Synthesizer Settings ---
                # Initial visibility follows the default dropdown value: when the
                # preferred SoundFont is missing the 8-bit synthesizer is
                # preselected, and its settings must be reachable without first
                # toggling the dropdown.
                with gr.Accordion(
                    "8-bit Synthesizer Settings",
                    open=False,
                    visible=(default_sf_choice == SYNTH_8_BIT_LABEL)
                ) as synth_8bit_settings:
                    s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
                    s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
                    s8bit_decay_time_s = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Decay Time (s)")
                    s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width")
                    s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
                    s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
                    s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")

                # --- Original Advanced Options ---
                # (No listener toggles this accordion; it is always available.)
                with gr.Accordion("Advanced MIDI Rendering Options", open=False) as advanced_rendering_options:
                    render_with_sustains = gr.Checkbox(label="Apply sustain pedal effects (if present)", value=True)
                    render_output_as_solo_piano = gr.Checkbox(label="Convert to Solo Piano (Grand Piano patch)", value=False)
                    render_remove_drums = gr.Checkbox(label="Remove drum track", value=False)
                    render_transpose_to_C4 = gr.Checkbox(label="Transpose entire score to center around C4", value=False)
                    render_transpose_value = gr.Slider(-12, 12, value=0, step=1, label="Transpose (semitones)")
                    custom_render_patch = gr.Slider(-1, 127, value=-1, step=1, label="Force MIDI Patch (-1 to disable)")
                    merge_misaligned_notes = gr.Slider(-1, 127, value=-1, label="Time to merge notes in ms (-1 to disable)")
                    render_align = gr.Radio(
                        ["Do not align", "Start Times", "Start Times and Durations", "Start Times and Split Durations"],
                        label="Align notes to musical bars",
                        value="Do not align"
                    )

                submit_btn = gr.Button("Process and Render", variant="primary")

            with gr.Column(scale=2):
                # --- OUTPUT COLUMN ---
                gr.Markdown("## 3. Results")
                output_midi_title = gr.Textbox(label="MIDI Title")
                output_song_description = gr.Textbox(label="MIDI Description", lines=3)
                output_audio = gr.Audio(label="Rendered Audio Output", format="wav", waveform_options=waveform_options)
                output_plot = gr.Plot(label="MIDI Score Plot")
                # NOTE(review): which outputs share this row is inferred — confirm.
                with gr.Row():
                    output_midi = gr.File(label="Download Processed MIDI File", file_types=[".mid"])
                    output_midi_md5 = gr.Textbox(label="Output MIDI MD5 Hash")
                output_midi_summary = gr.Textbox(label="MIDI metadata summary", lines=4)

        # --- Define all input components for the click event ---
        # Order must match the positional parameters of process_and_render_file.
        all_inputs = [
            input_file,
            enable_stereo_processing,
            transcription_method,
            onset_threshold, frame_threshold, minimum_note_length, minimum_frequency, maximum_frequency,
            infer_onsets, melodia_trick, multiple_pitch_bends,
            render_type, soundfont_bank, render_sample_rate,
            render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
            render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
            s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
            s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level
        ]
        all_outputs = [
            output_midi_md5, output_midi_title, output_midi_summary,
            output_midi, output_audio, output_plot, output_song_description
        ]

        # --- Event Handling ---
        submit_btn.click(
            process_and_render_file,
            inputs=all_inputs,
            outputs=all_outputs
        )

        # --- Listeners for dynamic UI updates ---
        # Both controls feed the same handler so either change refreshes both accordions.
        transcription_method.change(
            fn=update_ui_visibility,
            inputs=[transcription_method, soundfont_bank],
            outputs=[general_transcription_settings, synth_8bit_settings]
        )
        soundfont_bank.change(
            fn=update_ui_visibility,
            inputs=[transcription_method, soundfont_bank],
            outputs=[general_transcription_settings, synth_8bit_settings]
        )

    # Launch the Gradio app
    app.queue().launch(inbrowser=True, debug=True)