diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -44,6 +44,7 @@ import os import hashlib import time as reqtime import copy +import shutil import librosa import pyloudnorm as pyln import soundfile as sf @@ -51,6 +52,7 @@ import soundfile as sf import torch import ffmpeg import gradio as gr +from dataclasses import dataclass, fields # ADDED for the parameter object # --- Imports for Vocal Separation --- import torchaudio @@ -85,6 +87,72 @@ import glob # --- Define a constant for the 8-bit synthesizer option --- SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)" + +# ================================================================================================= +# === NEW: Central Parameter Object === +# ================================================================================================= + +@dataclass +class AppParameters: + """A dataclass to hold all configurable parameters for the application.""" + # This provides type safety and autocomplete, preventing typos from string keys. 
+ + # Input files (not part of the settings panel) + input_file: str = None + batch_input_files: list = None + + # Global Settings + s8bit_preset_selector: str = "Custom" + separate_vocals: bool = False + remerge_vocals: bool = False + transcription_target: str = "Transcribe Music (Accompaniment)" + transcribe_both_stems: bool = False + enable_stereo_processing: bool = False + transcription_method: str = "General Purpose" + + # Basic Pitch Settings + onset_threshold: float = 0.5 + frame_threshold: float = 0.3 + minimum_note_length: int = 128 + minimum_frequency: float = 60.0 + maximum_frequency: float = 4000.0 + infer_onsets: bool = True + melodia_trick: bool = True + multiple_pitch_bends: bool = False + + # Render Settings + render_type: str = "Render as-is" + soundfont_bank: str = "None (8-bit Synthesizer)" + render_sample_rate: str = "44100" + render_with_sustains: bool = True + merge_misaligned_notes: int = -1 + custom_render_patch: int = -1 + render_align: str = "Do not align" + render_transpose_value: int = 0 + render_transpose_to_C4: bool = False + render_output_as_solo_piano: bool = False + render_remove_drums: bool = False + + # 8-bit Synthesizer Settings + s8bit_waveform_type: str = 'Square' + s8bit_pulse_width: float = 0.5 + s8bit_envelope_type: str = 'Plucky (AD Envelope)' + s8bit_decay_time_s: float = 0.1 + s8bit_vibrato_rate: float = 5.0 + s8bit_vibrato_depth: float = 0.0 + s8bit_bass_boost_level: float = 0.0 + s8bit_smooth_notes_level: float = 0.0 + s8bit_continuous_vibrato_level: float = 0.0 + s8bit_noise_level: float = 0.0 + s8bit_distortion_level: float = 0.0 + s8bit_fm_modulation_depth: float = 0.0 + s8bit_fm_modulation_rate: float = 0.0 + + +# ================================================================================================= +# === Helper Functions === +# ================================================================================================= + def prepare_soundfonts(): """ Ensures a default set of SoundFonts are 
downloaded, then scans the 'src/sf2' @@ -171,11 +239,7 @@ def prepare_soundfonts(): # ================================================================================================= # === 8-bit Style Synthesizer (Stereo Enabled) === # ================================================================================================= -def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, pulse_width, - vibrato_rate, vibrato_depth, bass_boost_level, fs=44100, - smooth_notes_level=0.0, continuous_vibrato_level=0.0, - noise_level=0.0, distortion_level=0.0, - fm_modulation_depth=0.0, fm_modulation_rate=0.0): +def synthesize_8bit_style(*, midi_data: pretty_midi.PrettyMIDI, fs: int, params: AppParameters): """ Synthesizes an 8-bit style audio waveform from a PrettyMIDI object. This function generates waveforms manually instead of using a synthesizer like FluidSynth. @@ -225,23 +289,23 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, # --- Graded Continuous Vibrato --- # This now interpolates between a fully reset vibrato and a fully continuous one. 
# Use accumulated phase to avoid vibrato reset per note - vib_phase_inc = 2 * np.pi * vibrato_rate / fs - per_note_vib_phase = 2 * np.pi * vibrato_rate * t + vib_phase_inc = 2 * np.pi * params.s8bit_vibrato_rate / fs + per_note_vib_phase = 2 * np.pi * params.s8bit_vibrato_rate * t continuous_vib_phase = vibrato_phase + np.arange(num_samples) * vib_phase_inc # Weighted average of the two phase types final_vib_phase = ( - per_note_vib_phase * (1 - continuous_vibrato_level) + - continuous_vib_phase * continuous_vibrato_level + per_note_vib_phase * (1 - params.s8bit_continuous_vibrato_level) + + continuous_vib_phase * params.s8bit_continuous_vibrato_level ) - vibrato_lfo = vibrato_depth * np.sin(final_vib_phase) + vibrato_lfo = params.s8bit_vibrato_depth * np.sin(final_vib_phase) # Update the global vibrato phase for the next note if num_samples > 0: vibrato_phase = (continuous_vib_phase[-1] + vib_phase_inc) % (2 * np.pi) # --- Waveform Generation with FM --- - fm_lfo = fm_modulation_depth * np.sin(2 * np.pi * fm_modulation_rate * t) + fm_lfo = params.s8bit_fm_modulation_depth * np.sin(2 * np.pi * params.s8bit_fm_modulation_rate * t) modulated_freq = freq * (1 + fm_lfo) # --- Waveform Generation (Main Oscillator with phase continuity) --- @@ -250,15 +314,15 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, if num_samples > 0: osc_phase[i] = phase[-1] % (2 * np.pi) # Store last phase - if waveform_type == 'Square': - note_waveform = signal.square(phase, duty=pulse_width) - elif waveform_type == 'Sawtooth': + if params.s8bit_waveform_type == 'Square': + note_waveform = signal.square(phase, duty=params.s8bit_pulse_width) + elif params.s8bit_waveform_type == 'Sawtooth': note_waveform = signal.sawtooth(phase) else: # Triangle note_waveform = signal.sawtooth(phase, width=0.5) # --- Bass Boost (Sub-Octave Oscillator) --- - if bass_boost_level > 0: + if params.s8bit_bass_boost_level > 0: bass_freq = freq / 2.0 # Only add bass if the frequency 
is reasonably audible if bass_freq > 20: @@ -268,25 +332,25 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, bass_sub_waveform = signal.square(bass_phase, duty=0.5) # Mix the main and bass waveforms. # As bass level increases, slightly decrease main waveform volume to prevent clipping. - main_level = 1.0 - (0.5 * bass_boost_level) - note_waveform = (note_waveform * main_level) + (bass_sub_waveform * bass_boost_level) + main_level = 1.0 - (0.5 * params.s8bit_bass_boost_level) + note_waveform = (note_waveform * main_level) + (bass_sub_waveform * params.s8bit_bass_boost_level) # --- Noise & Distortion Simulation (White Noise) --- - if noise_level > 0: - note_waveform += np.random.uniform(-1, 1, num_samples) * noise_level + if params.s8bit_noise_level > 0: + note_waveform += np.random.uniform(-1, 1, num_samples) * params.s8bit_noise_level # --- Distortion (Wave Shaping) --- - if distortion_level > 0: + if params.s8bit_distortion_level > 0: # Using a tanh function for a smoother, "warmer" distortion - note_waveform = np.tanh(note_waveform * (1 + distortion_level * 5)) + note_waveform = np.tanh(note_waveform * (1 + params.s8bit_distortion_level * 5)) # --- ADSR Envelope --- start_amp = note.velocity / 127.0 envelope = np.zeros(num_samples) - if envelope_type == 'Plucky (AD Envelope)': + if params.s8bit_envelope_type == 'Plucky (AD Envelope)': attack_samples = min(int(0.005 * fs), num_samples) - decay_samples = min(int(decay_time_s * fs), num_samples - attack_samples) + decay_samples = min(int(params.s8bit_decay_time_s * fs), num_samples - attack_samples) envelope[:attack_samples] = np.linspace(0, start_amp, attack_samples) if decay_samples > 0: @@ -296,8 +360,8 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, # --- Graded Note Smoothing --- # The level controls the length of the fade in/out. Max fade is 10ms. 
- if smooth_notes_level > 0 and num_samples > 10: - fade_length = int(fs * 0.01 * smooth_notes_level) + if params.s8bit_smooth_notes_level > 0 and num_samples > 10: + fade_length = int(fs * 0.01 * params.s8bit_smooth_notes_level) fade_samples = min(fade_length, num_samples // 2) if fade_samples > 0: envelope[:fade_samples] *= np.linspace(0.5, 1.0, fade_samples) @@ -523,7 +587,7 @@ def TranscribePianoAudio(input_file): # Return the path to the newly created MIDI file return out_mid_path -def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool): +def TranscribeGeneralAudio(input_file, onset_threshold, frame_threshold, minimum_note_length, minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_bends): """ Transcribes a general audio file into a MIDI file using basic-pitch. This is suitable for various instruments and vocals. @@ -546,14 +610,14 @@ def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len, model_output, midi_data, note_events = basic_pitch.inference.predict( audio_path=input_file, model_or_model_path=ICASSP_2022_MODEL_PATH, - onset_threshold=onset_thresh, - frame_threshold=frame_thresh, - minimum_note_length=min_note_len, - minimum_frequency=min_freq, - maximum_frequency=max_freq, - infer_onsets=infer_onsets_bool, - melodia_trick=melodia_trick_bool, - multiple_pitch_bends=multiple_bends_bool + onset_threshold=onset_threshold, + frame_threshold=frame_threshold, + minimum_note_length=minimum_note_length, + minimum_frequency=minimum_frequency, + maximum_frequency=maximum_frequency, + infer_onsets=infer_onsets, + melodia_trick=melodia_trick, + multiple_pitch_bends=multiple_bends ) # --- Save the MIDI file --- @@ -567,24 +631,7 @@ def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len, # === Stage 2: MIDI Transformation and Rendering Function === # 
================================================================================================= -def Render_MIDI(input_midi_path, - render_type, - soundfont_bank, - render_sample_rate, - render_with_sustains, - merge_misaligned_notes, - custom_render_patch, - render_align, - render_transpose_value, - render_transpose_to_C4, - render_output_as_solo_piano, - render_remove_drums, - # --- 8-bit synth params --- - s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s, - s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, - s8bit_bass_boost_level, s8bit_smooth_notes_level, s8bit_continuous_vibrato_level, - s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate - ): +def Render_MIDI(*, input_midi_path: str, params: AppParameters): """ Processes and renders a MIDI file according to user-defined settings. Can render using SoundFonts or a custom 8-bit synthesizer. @@ -624,9 +671,9 @@ def Render_MIDI(input_midi_path, print(f'Input MIDI file name: {fn}') print(f'Input MIDI md5 hash: {input_midi_md5hash}') print('-' * 70) - print(f'Render type: {render_type}') - print(f'Soundfont bank: {soundfont_bank}') - print(f'Audio render sample rate: {render_sample_rate}') + print(f"Render type: {params.render_type}") + print(f"Soundfont bank: {params.soundfont_bank}") + print(f"Audio render sample rate: {params.render_sample_rate}") # ... (add other print statements for settings if needed) print('=' * 70) @@ -636,7 +683,7 @@ def Render_MIDI(input_midi_path, # call the function and store the returned list in a variable. processed_scores = TMIDIX.advanced_score_processor(raw_score, return_enhanced_score_notes=True, - apply_sustain=render_with_sustains) + apply_sustain=params.render_with_sustains) # check if the returned list is empty. This happens when transcription finds no notes. # This check prevents the 'IndexError: list index out of range'. 
if not processed_scores: @@ -655,8 +702,8 @@ def Render_MIDI(input_midi_path, return ("N/A", fn1, "MIDI file contains no notes.",None, None, None, "No notes found.") # This line will now work correctly because merge_misaligned_notes is guaranteed to be an integer. - if merge_misaligned_notes > 0: - escore = TMIDIX.merge_escore_notes(escore, merge_threshold=merge_misaligned_notes) + if params.merge_misaligned_notes > 0: + escore = TMIDIX.merge_escore_notes(escore, merge_threshold=params.merge_misaligned_notes) escore = TMIDIX.augment_enhanced_score_notes(escore, timings_divider=1) @@ -680,21 +727,21 @@ def Render_MIDI(input_midi_path, output_score = copy.deepcopy(escore) # Apply transformations based on render_type - if render_type == "Extract melody": + if params.render_type == "Extract melody": output_score = TMIDIX.add_melody_to_enhanced_score_notes(escore, return_melody=True) output_score = TMIDIX.recalculate_score_timings(output_score) - elif render_type == "Flip": + elif params.render_type == "Flip": output_score = TMIDIX.flip_enhanced_score_notes(escore) - elif render_type == "Reverse": + elif params.render_type == "Reverse": output_score = TMIDIX.reverse_enhanced_score_notes(escore) - elif render_type == 'Repair Durations': + elif params.render_type == 'Repair Durations': output_score = TMIDIX.fix_escore_notes_durations(escore, min_notes_gap=0) - elif render_type == 'Repair Chords': + elif params.render_type == 'Repair Chords': fixed_cscore = TMIDIX.advanced_check_and_fix_chords_in_chordified_score(cscore)[0] output_score = TMIDIX.flatten(fixed_cscore) - elif render_type == 'Remove Duplicate Pitches': + elif params.render_type == 'Remove Duplicate Pitches': output_score = TMIDIX.remove_duplicate_pitches_from_escore_notes(escore) - elif render_type == "Add Drum Track": + elif params.render_type == "Add Drum Track": nd_escore = [e for e in escore if e[3] != 9] nd_escore = TMIDIX.augment_enhanced_score_notes(nd_escore) output_score = 
TMIDIX.advanced_add_drums_to_escore_notes(nd_escore) @@ -707,32 +754,32 @@ def Render_MIDI(input_midi_path, print('=' * 70) # --- Final Processing and Patching --- - if render_type != "Render as-is": + if params.render_type != "Render as-is": print('Applying final adjustments (transpose, align, patch)...') - if custom_render_patch != -1: # -1 indicates no change + if params.custom_render_patch != -1: # -1 indicates no change for e in output_score: if e[3] != 9: # not a drum channel - e[6] = custom_render_patch + e[6] = params.custom_render_patch - if render_transpose_value != 0: - output_score = TMIDIX.transpose_escore_notes(output_score, render_transpose_value) + if params.render_transpose_value != 0: + output_score = TMIDIX.transpose_escore_notes(output_score, params.render_transpose_value) - if render_transpose_to_C4: + if params.render_transpose_to_C4: output_score = TMIDIX.transpose_escore_notes_to_pitch(output_score, 60) # C4 is MIDI pitch 60 - if render_align == "Start Times": + if params.render_align == "Start Times": output_score = TMIDIX.recalculate_score_timings(output_score) output_score = TMIDIX.align_escore_notes_to_bars(output_score) - elif render_align == "Start Times and Durations": + elif params.render_align == "Start Times and Durations": output_score = TMIDIX.recalculate_score_timings(output_score) output_score = TMIDIX.align_escore_notes_to_bars(output_score, trim_durations=True) - elif render_align == "Start Times and Split Durations": + elif params.render_align == "Start Times and Split Durations": output_score = TMIDIX.recalculate_score_timings(output_score) output_score = TMIDIX.align_escore_notes_to_bars(output_score, split_durations=True) - if render_type == "Longest Repeating Phrase": + if params.render_type == "Longest Repeating Phrase": zscore = TMIDIX.recalculate_score_timings(output_score) lrno_score = TMIDIX.escore_notes_lrno_pattern_fast(zscore) @@ -742,7 +789,7 @@ def Render_MIDI(input_midi_path, else: output_score = 
TMIDIX.recalculate_score_timings(TMIDIX.escore_notes_middle(output_score, 50)) - if render_type == "Multi-Instrumental Summary": + if params.render_type == "Multi-Instrumental Summary": zscore = TMIDIX.recalculate_score_timings(output_score) c_escore_notes = TMIDIX.compress_patches_in_escore_notes_chords(zscore) @@ -755,13 +802,13 @@ def Render_MIDI(input_midi_path, o[1] *= 250 o[2] *= 250 - if render_output_as_solo_piano: - output_score = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=(not render_remove_drums)) + if params.render_output_as_solo_piano: + output_score = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=(not params.render_remove_drums)) - if render_remove_drums and not render_output_as_solo_piano: + if params.render_remove_drums and not params.render_output_as_solo_piano: output_score = TMIDIX.strip_drums_from_escore_notes(output_score) - if render_type == "Solo Piano Summary": + if params.render_type == "Solo Piano Summary": sp_escore_notes = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=False) zscore = TMIDIX.recalculate_score_timings(sp_escore_notes) @@ -787,7 +834,7 @@ def Render_MIDI(input_midi_path, # We must pass the path without the extension to compensate. 
path_without_ext = new_fn_path.rsplit('.mid', 1)[0] - TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(SONG, + MIDI.Tegridy_ms_SONG_to_MIDI_Converter(SONG, output_signature = 'Integrated-MIDI-Processor', output_file_name = path_without_ext, track_name='Processed Track', @@ -804,29 +851,17 @@ def Render_MIDI(input_midi_path, print('Rendering final audio...') # Select sample rate - srate = int(render_sample_rate) + srate = int(params.render_sample_rate) # --- Conditional Rendering Logic --- - if soundfont_bank == SYNTH_8_BIT_LABEL: + if params.soundfont_bank == SYNTH_8_BIT_LABEL: print("Using 8-bit style synthesizer...") try: # Load the MIDI file with pretty_midi for manual synthesis midi_data_for_synth = pretty_midi.PrettyMIDI(midi_to_render_path) # Synthesize the waveform # --- Passing new FX parameters to the synthesis function --- - audio = synthesize_8bit_style( - midi_data_for_synth, - s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s, - s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, - s8bit_bass_boost_level, - fs=srate, - smooth_notes_level=s8bit_smooth_notes_level, - continuous_vibrato_level=s8bit_continuous_vibrato_level, - noise_level=s8bit_noise_level, - distortion_level=s8bit_distortion_level, - fm_modulation_depth=s8bit_fm_modulation_depth, - fm_modulation_rate=s8bit_fm_modulation_rate - ) + audio = synthesize_8bit_style(midi_data=midi_data_for_synth, fs=srate, params=params) # Normalize and prepare for Gradio peak_val = np.max(np.abs(audio)) if peak_val > 0: @@ -837,23 +872,25 @@ def Render_MIDI(input_midi_path, print(f"Error during 8-bit synthesis: {e}") return [None] * 7 else: - print(f"Using SoundFont: {soundfont_bank}") + print(f"Using SoundFont: {params.soundfont_bank}") # Get the full path from the global dictionary - soundfont_path = soundfonts_dict.get(soundfont_bank) + soundfont_path = soundfonts_dict.get(params.soundfont_bank) # Select soundfont if not soundfont_path or not os.path.exists(soundfont_path): - # Error handling in 
case the selected file is not found - error_msg = f"SoundFont '{soundfont_bank}' not found!" - print(f"ERROR: {error_msg}") - # Fallback to the first available soundfont if possible - if soundfonts_dict: - fallback_key = list(soundfonts_dict.keys())[0] - soundfont_path = soundfonts_dict[fallback_key] - print(f"Falling back to '{fallback_key}'.") - else: - # If no soundfonts are available at all, raise an error - raise gr.Error("No SoundFonts are available for rendering!") + # If the selected soundfont is not found, inform the user directly via the UI. + raise gr.Error(f"SoundFont file '{params.soundfont_bank}' could not be found. Please check your 'src/sf2' directory or select another SoundFont.") + # # Error handling in case the selected file is not found + # error_msg = f"SoundFont '{params.soundfont_bank}' not found!" + # print(f"ERROR: {error_msg}") + # # Fallback to the first available soundfont if possible + # if soundfonts_dict: + # fallback_key = list(soundfonts_dict.keys())[0] + # soundfont_path = soundfonts_dict[fallback_key] + # print(f"Falling back to '{fallback_key}'.") + # else: + # # If no soundfonts are available at all, raise an error + # raise gr.Error("No SoundFonts are available for rendering!") with open(midi_to_render_path, 'rb') as f: midi_file_content = f.read() @@ -1059,11 +1096,7 @@ def recommend_8bit_params(midi_data, default_preset): # --- Helper function to encapsulate the transcription pipeline for a single audio file --- -def _transcribe_stem(audio_path, base_name, temp_dir, - # Pass all transcription-related parameters - enable_stereo, transcription_method, - onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, - infer_onsets_bool, melodia_trick_bool, multiple_bends_bool): +def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: AppParameters): """ Takes a single audio file path and runs the full transcription pipeline on it. This includes stereo/mono handling and normalization. 
@@ -1074,7 +1107,7 @@ def _transcribe_stem(audio_path, base_name, temp_dir, # Load the audio stem to process it audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False) - if enable_stereo and audio_data.ndim == 2 and audio_data.shape[0] == 2: + if params.enable_stereo_processing and audio_data.ndim == 2 and audio_data.shape[0] == 2: print("Stereo processing enabled for stem.") left_channel_np = audio_data[0] right_channel_np = audio_data[1] @@ -1092,9 +1125,9 @@ def _transcribe_stem(audio_path, base_name, temp_dir, print(f"Saved right channel to: {temp_right_path}") print("Transcribing left and right channel...") - if transcription_method == "General Purpose": - midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool) - midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool) + if params.transcription_method == "General Purpose": + midi_path_left = TranscribeGeneralAudio(temp_left_path, params.onset_threshold, params.frame_threshold, params.minimum_note_length, params.minimum_frequency, params.maximum_frequency, params.infer_onsets, params.melodia_trick, params.multiple_pitch_bends) + midi_path_right = TranscribeGeneralAudio(temp_right_path, params.onset_threshold, params.frame_threshold, params.minimum_note_length, params.minimum_frequency, params.maximum_frequency, params.infer_onsets, params.melodia_trick, params.multiple_pitch_bends) else: # Piano-Specific midi_path_left = TranscribePianoAudio(temp_left_path) midi_path_right = TranscribePianoAudio(temp_right_path) @@ -1118,60 +1151,43 @@ def _transcribe_stem(audio_path, base_name, temp_dir, temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.flac") sf.write(temp_mono_path, normalized_mono, native_sample_rate) - if transcription_method == 
"General Purpose": - return TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool) + if params.transcription_method == "General Purpose": + return TranscribeGeneralAudio(temp_mono_path, params.onset_threshold, params.frame_threshold, params.minimum_note_length, params.minimum_frequency, params.maximum_frequency, params.infer_onsets, params.melodia_trick, params.multiple_pitch_bends) else: return TranscribePianoAudio(temp_mono_path) -# --- The main processing function is now significantly refactored --- -def process_and_render_file(input_file, - # --- Pass the preset selector value --- - s8bit_preset_selector, - separate_vocals, - remerge_vocals, - transcription_target, - # --- ADDED: New parameter from UI --- - transcribe_both_stems, - # --- Transcription params --- - enable_stereo_processing, - transcription_method, - onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool, - # --- MIDI rendering params --- - render_type, soundfont_bank, render_sample_rate, - render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align, - render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums, - # --- 8-bit synth params --- - s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s, - s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, - s8bit_bass_boost_level, s8bit_smooth_notes_level, s8bit_continuous_vibrato_level, - s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate - ): + +# --- The core processing engine for a single file --- +def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters): """ - Main function to handle file processing. It determines the file type and calls the - appropriate functions for transcription and/or rendering based on user selections. 
+ This is the main processing engine. It takes a file path, a timestamp, and an AppParameters object with all settings, + and performs the full pipeline: load, separate, transcribe, render, re-merge. + It is UI-agnostic and returns file paths and data, not Gradio updates. """ - start_time = reqtime.time() - if input_file is None: - # Return a list of updates to clear all output fields and UI controls - return [gr.update(value=None)] * (7 + 13) # 7 results + 13 synth controls + # --- Start timer for this specific file --- + file_start_time = reqtime.time() - # The input_file from gr.Audio(type="filepath") is now the direct path (a string), - # not a temporary file object. We no longer need to access the .name attribute. - input_file_path = input_file filename = os.path.basename(input_file_path) - print(f"Processing new file: {filename}") - + base_name = os.path.splitext(filename)[0] + print(f"\n{'='*20} Starting Pipeline for: {filename} {'='*20}") + + # --- Use the provided timestamp for unique filenames --- + timestamped_base_name = f"{base_name}_{timestamp}" + # This will store the other part if separation is performed other_part_tensor = None other_part_sr = None # --- Step 1: Check file type and transcribe if necessary --- if filename.lower().endswith(('.mid', '.midi', '.kar')): - print("MIDI file detected. Cannot perform vocal separation. Proceeding directly to rendering.") + print("MIDI file detected. Skipping transcription. Proceeding directly to rendering.") midi_path_for_rendering = input_file_path else: + temp_dir = "output/temp_transcribe" # Define temp_dir early for the fallback + os.makedirs(temp_dir, exist_ok=True) + + # --- Audio Loading --- print("Audio file detected. Starting pre-processing...") - # --- Robust audio loading with ffmpeg fallback --- try: # Try loading directly with torchaudio (efficient for supported formats). @@ -1182,43 +1198,35 @@ def process_and_render_file(input_file, except Exception as e: print(f"Torchaudio failed: {e}. 
Attempting fallback with ffmpeg...") try: - # Use ffmpeg to convert the audio to WAV in-memory, then load the bytes. - out, err = ( + # Define a path for the temporary converted file + converted_flac_path = os.path.join(temp_dir, f"{timestamped_base_name}_converted.flac") + # Use ffmpeg to convert the input file to a clean FLAC file on disk + ( ffmpeg .input(input_file_path) - .output('pipe:', format='flac') + .output(converted_flac_path, acodec='flac') + .overwrite_output() .run(capture_stdout=True, capture_stderr=True) ) - # Load the WAV data from the in-memory buffer - audio_tensor, native_sample_rate = torchaudio.load(io.BytesIO(out)) - print("FFmpeg fallback successful.") + # Now, load the newly created and guaranteed-to-be-compatible FLAC file + audio_tensor, native_sample_rate = torchaudio.load(converted_flac_path) + print(f"FFmpeg fallback successful. Loaded from: {converted_flac_path}") except Exception as ffmpeg_err: - # If both direct loading and ffmpeg fallback fail, raise an error. - raise gr.Error(f"Failed to load audio file with both torchaudio and ffmpeg.\n" - f"Torchaudio error: {e}\n" - f"FFmpeg error: {ffmpeg_err.decode() if isinstance(ffmpeg_err, bytes) else ffmpeg_err}") + # In batch mode, we just print an error and skip this file + stderr = ffmpeg_err.stderr.decode() if hasattr(ffmpeg_err, 'stderr') else str(ffmpeg_err) + print(f"ERROR: Could not load {filename}. Skipping. FFmpeg error: {stderr}") + return None # Return None to indicate failure - base_name = os.path.splitext(filename)[0] - temp_dir = "output/temp_transcribe" - os.makedirs(temp_dir, exist_ok=True) - # --- Demucs Vocal Separation Logic, now decides which stem to process --- - if not separate_vocals: + if not params.separate_vocals or demucs_model is None: + if params.separate_vocals and demucs_model is None: + print("ERROR: Demucs model not loaded. 
Skipping separation.") # --- Standard Workflow: Transcribe the original full audio --- - print("Standard workflow: No vocal separation.") - audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}_original.flac") + audio_to_transcribe_path = os.path.join(temp_dir, f"{timestamped_base_name}_original.flac") torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate) - midi_path_for_rendering = _transcribe_stem( - audio_to_transcribe_path, f"{base_name}_original", temp_dir, - enable_stereo_processing, transcription_method, - onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, - infer_onsets_bool, melodia_trick_bool, multiple_bends_bool - ) + midi_path_for_rendering = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params) else: # --- Vocal Separation Workflow --- - if demucs_model is None: - raise gr.Error("Demucs model is not loaded. Cannot separate vocals.") - # Convert to a common format (stereo, float32) that demucs expects audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels) @@ -1234,7 +1242,7 @@ def process_and_render_file(input_file, device='cuda' if torch.cuda.is_available() else 'cpu', progress=True, )[0] # Remove the batch dimension from the output - + # --- Clear CUDA cache immediately after use --- if torch.cuda.is_available(): torch.cuda.empty_cache() @@ -1263,56 +1271,41 @@ def process_and_render_file(input_file, torchaudio.save(accompaniment_path, accompaniment_tensor.cpu(), demucs_model.samplerate) # --- Determine which stem is the primary target and which is the "other part" --- - primary_target_path = vocals_path if transcription_target == "Transcribe Vocals" else accompaniment_path - other_part_path = accompaniment_path if transcription_target == "Transcribe Vocals" else vocals_path + primary_target_path = vocals_path if params.transcription_target == "Transcribe Vocals" else accompaniment_path + other_part_path 
= accompaniment_path if params.transcription_target == "Transcribe Vocals" else vocals_path # Store the audio tensor of the "other part" for potential audio re-merging - other_part_tensor = accompaniment_tensor if transcription_target == "Transcribe Vocals" else vocals_tensor + other_part_tensor = accompaniment_tensor if params.transcription_target == "Transcribe Vocals" else vocals_tensor other_part_sr = demucs_model.samplerate print("Separation complete.") # --- Main Branching Logic: Transcribe one or both stems --- - if not transcribe_both_stems: + if not params.transcribe_both_stems: print(f"Transcribing primary target only: {os.path.basename(primary_target_path)}") - midi_path_for_rendering = _transcribe_stem( - primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir, - enable_stereo_processing, transcription_method, - onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, - infer_onsets_bool, melodia_trick_bool, multiple_bends_bool - ) + midi_path_for_rendering = _transcribe_stem(primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir, params) else: print("Transcribing BOTH stems and merging the MIDI results.") - + # Transcribe the primary target - midi_path_primary = _transcribe_stem( - primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir, - enable_stereo_processing, transcription_method, - onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, - infer_onsets_bool, melodia_trick_bool, multiple_bends_bool - ) - + midi_path_primary = _transcribe_stem(primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir, params) + # Transcribe the other part - midi_path_other = _transcribe_stem( - other_part_path, os.path.splitext(os.path.basename(other_part_path))[0], temp_dir, - enable_stereo_processing, transcription_method, - onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, - infer_onsets_bool, 
melodia_trick_bool, multiple_bends_bool - ) - + midi_path_other = _transcribe_stem(other_part_path, os.path.splitext(os.path.basename(other_part_path))[0], temp_dir, params) + # Merge the two resulting MIDI files if midi_path_primary and midi_path_other: final_merged_midi_path = os.path.join(temp_dir, f"{base_name}_full_transcription.mid") print(f"Merging transcribed MIDI files into {os.path.basename(final_merged_midi_path)}") - + # A more robust MIDI merge is needed here primary_midi = pretty_midi.PrettyMIDI(midi_path_primary) other_midi = pretty_midi.PrettyMIDI(midi_path_other) - + # Add all instruments from the other midi to the primary one for instrument in other_midi.instruments: instrument.name = f"Other - {instrument.name}" # Rename to avoid confusion primary_midi.instruments.append(instrument) - + primary_midi.write(final_merged_midi_path) midi_path_for_rendering = final_merged_midi_path elif midi_path_primary: @@ -1320,77 +1313,52 @@ def process_and_render_file(input_file, midi_path_for_rendering = midi_path_primary else: raise gr.Error("Transcription of the primary target failed. Aborting.") - + + if not midi_path_for_rendering or not os.path.exists(midi_path_for_rendering): + print(f"ERROR: Transcription failed for {filename}. Skipping.") + return None + # --- Step 2: Render the FINAL MIDI file with selected options --- # --- Auto-Recommendation Logic --- - # Store the original parameters from the UI sliders into a dictionary. - # The keys in this dictionary match the keys returned by recommend_8bit_params. 
- synth_params = { - 'waveform_type': s8bit_waveform_type, 'pulse_width': s8bit_pulse_width, 'envelope_type': s8bit_envelope_type, - 'decay_time_s': s8bit_decay_time_s, 'vibrato_rate': s8bit_vibrato_rate, 'vibrato_depth': s8bit_vibrato_depth, - 'bass_boost_level': s8bit_bass_boost_level, 'smooth_notes_level': s8bit_smooth_notes_level, 'continuous_vibrato_level': s8bit_continuous_vibrato_level, - 'noise_level': s8bit_noise_level, 'distortion_level': s8bit_distortion_level, - 'fm_modulation_depth': s8bit_fm_modulation_depth, 'fm_modulation_rate': s8bit_fm_modulation_rate, - } - - # This variable will hold the values to update the UI sliders - ui_updates = {} - # If the user selected the auto-recommend option, override the parameters - if s8bit_preset_selector == "Auto-Recommend (Analyze MIDI)": + if params.s8bit_preset_selector == "Auto-Recommend (Analyze MIDI)": print("Auto-Recommendation is enabled. Analyzing MIDI features...") try: midi_to_analyze = pretty_midi.PrettyMIDI(midi_path_for_rendering) - default_params = S8BIT_PRESETS[FALLBACK_PRESET_NAME] - recommended_params = recommend_8bit_params(midi_to_analyze, default_params) - + default_preset = S8BIT_PRESETS[FALLBACK_PRESET_NAME] + recommended_params = recommend_8bit_params(midi_to_analyze, default_preset) + print("Recommended parameters:", recommended_params) - # Both the synthesis parameters and the UI update values are set to the recommendations - synth_params.update(recommended_params) - ui_updates = recommended_params.copy() # Use a copy for UI updates + # Update the params object *before* the main pipeline runs + for key, value in recommended_params.items(): + setattr(params, f"s8bit_{key}", value) + print("Parameters updated with recommendations.") except Exception as e: - print(f"Could not auto-recommend parameters: {e}. 
Using default values from UI.") - + print(f"Could not auto-recommend parameters for {filename}: {e}.") + print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}") - - # --- Correctly pass parameters to Render_MIDI --- - # The Render_MIDI function expects positional arguments, not keyword arguments. - # We must unpack the values from our synth_params dictionary in the correct order. - results = Render_MIDI(midi_path_for_rendering, - render_type, soundfont_bank, render_sample_rate, - render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align, - render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums, - # Unpack the values from the dictionary as positional arguments - synth_params['waveform_type'], - synth_params['envelope_type'], - synth_params['decay_time_s'], - synth_params['pulse_width'], - synth_params['vibrato_rate'], - synth_params['vibrato_depth'], - synth_params['bass_boost_level'], - synth_params['smooth_notes_level'], - synth_params['continuous_vibrato_level'], - synth_params['noise_level'], - synth_params['distortion_level'], - synth_params['fm_modulation_depth'], - synth_params['fm_modulation_rate'] - ) - - # --- Vocal Re-merging Logic now uses the generic "other_part" --- - # IMPORTANT: This only runs if we did NOT transcribe both stems. - if separate_vocals and remerge_vocals and not transcribe_both_stems and other_part_tensor is not None: + + # Call the rendering function, Pass dictionaries directly to Render_MIDI + results_tuple = Render_MIDI(input_midi_path=midi_path_for_rendering, params=params) + + # --- Vocal Re-merging Logic --- + if params.separate_vocals and params.remerge_vocals and not params.transcribe_both_stems and other_part_tensor is not None: print(f"Re-merging the non-transcribed part with newly rendered music...") - rendered_srate, rendered_music_int16 = results[4] + # 1. 
Unpack the original rendered audio from the results + rendered_srate, rendered_music_int16 = results_tuple[4] + # 2. Convert the rendered music to a float tensor rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0 rendered_music_tensor = torch.from_numpy(rendered_music_float).T + # 3. Resample if necessary if rendered_srate != other_part_sr: resampler = torchaudio.transforms.Resample(rendered_srate, other_part_sr) rendered_music_tensor = resampler(rendered_music_tensor) + # 4. Pad to match lengths len_music = rendered_music_tensor.shape[1] len_other = other_part_tensor.shape[1] @@ -1401,45 +1369,179 @@ def process_and_render_file(input_file, padding = len_other - len_music rendered_music_tensor = torch.nn.functional.pad(rendered_music_tensor, (0, padding)) + # 5. Merge and normalize merged_audio_tensor = rendered_music_tensor + other_part_tensor.cpu() - max_abs = torch.max(torch.abs(merged_audio_tensor)) if max_abs > 1.0: merged_audio_tensor /= max_abs + # 6. Convert back to the required format (int16 numpy array) merged_audio_int16 = (merged_audio_tensor.T.numpy() * 32767).astype(np.int16) - new_results = list(results) - new_results[4] = (other_part_sr, merged_audio_int16) - results = tuple(new_results) + # 7. 
Create the new audio tuple and UPDATE the main results_tuple + new_audio_tuple = (other_part_sr, merged_audio_int16) + + temp_results_list = list(results_tuple) + temp_results_list[4] = new_audio_tuple + results_tuple = tuple(temp_results_list) # results_tuple is now updated print("Re-merging complete.") - print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec') - print('*' * 70) + # --- Save final audio and return path --- + final_srate, final_audio_data = results_tuple[4] + final_midi_path_from_render = results_tuple[3] # Get the path of the processed MIDI - # --- Prepare the final return value for Gradio --- + # --- Use timestamped names for final outputs --- + output_audio_dir = "output/final_audio" + output_midi_dir = "output/final_midi" + os.makedirs(output_audio_dir, exist_ok=True) + os.makedirs(output_midi_dir, exist_ok=True) - # This list defines the order of UI components to be updated. - # IT MUST MATCH THE ORDER IN `s8bit_updater_outputs` IN THE MAIN BLOCK. - param_order = [ - 'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate', - 'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level', - 'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate' - ] + final_audio_path = os.path.join(output_audio_dir, f"{timestamped_base_name}_rendered.flac") + # Also, copy the final processed MIDI to a consistent output directory with a timestamped name + final_midi_path = os.path.join(output_midi_dir, f"{timestamped_base_name}_processed.mid") + + + sf.write(final_audio_path, final_audio_data, final_srate) + # Use shutil to copy the final midi file to its new home + shutil.copy(final_midi_path_from_render, final_midi_path) + + + # --- Log the processing time for this specific file at the end --- + file_processing_time = reqtime.time() - file_start_time + print(f"--- Pipeline finished for {filename} in {file_processing_time:.2f} seconds. 
---") + print(f"Output Audio: {final_audio_path}\nOutput MIDI: {final_midi_path}") + + # Return a dictionary of all results for the wrappers to use + results = { + "final_audio_path": final_audio_path, + "final_midi_path": final_midi_path, + "md5_hash": results_tuple[0], + "title": results_tuple[1], + "summary": results_tuple[2], + "plot": results_tuple[5], + "description": results_tuple[6] + } + # Return both the results and the final state of the parameters object + return results, params + + +# ================================================================================================= +# === Gradio UI Wrappers === +# ================================================================================================= + +# --- Thin wrapper for batch processing --- +def batch_process_files(input_files, progress=gr.Progress(), *args): + """ + Gradio wrapper for batch processing. It packs all UI values into an AppParameters object. + It iterates through files, calls the core pipeline, and collects the output file paths. 
+ """ + + if not input_files: + print("No files uploaded for batch processing.") + return [], [] # Return two empty lists + + # --- Start timer for the entire batch --- + batch_start_time = reqtime.time() + # --- Generate a single timestamp for the entire batch job --- + batch_timestamp = reqtime.strftime("%Y%m%d-%H%M%S") + + # Create the AppParameters object from the flat list of UI values + params = AppParameters(**dict(zip(ALL_PARAM_KEYS, args))) + + output_audio_paths = [] + output_midi_paths = [] # List to collect MIDI file paths + total_files = len(input_files) + + # Initialize progress at 0% + progress(0, desc="Starting Batch Process...") + for i, file_obj in enumerate(input_files): + # The input from gr.File is a tempfile object, we need its path + input_path = file_obj.name + progress(i / total_files, desc=f"Processing {os.path.basename(input_path)} ({i+1}/{total_files})") + + # --- Pass the batch_timestamp to the pipeline --- + results, _ = run_single_file_pipeline(input_path, batch_timestamp, params) + + if results: + if results.get("final_audio_path"): + output_audio_paths.append(results["final_audio_path"]) + if results.get("final_midi_path"): + output_midi_paths.append(results["final_midi_path"]) # Collect MIDI path + + # Ensure the progress bar reaches 100% upon completion + progress(1, desc="Batch Process Complete!") + + # --- Calculate and print the total batch time --- + total_batch_time = reqtime.time() - batch_start_time + print(f"\nBatch processing complete. {len(output_audio_paths)} of {total_files} files processed successfully.") + print(f"Total batch execution time: {total_batch_time:.2f} seconds.") + + # --- Return both lists of paths --- + return output_audio_paths, output_midi_paths + + +# --- The original function is now a thin wrapper for the single file UI --- +def process_and_render_file(input_file, *args): + """ + Gradio wrapper for the single file processing UI. Packs UI values into an AppParameters object. 
+ Calls the core pipeline and formats the output for all UI components. + Main function to handle file processing. It determines the file type and calls the + appropriate functions for transcription and/or rendering based on user selections. + """ + if input_file is None: + # Return a list of updates to clear all output fields and UI controls + return [gr.update(value=None)] * (7 + 14) # 7 results + 14 UI controls (13 synth + 1 preset selector) + + # --- Start timer for the single file job --- + job_start_time = reqtime.time() + + # --- Generate a timestamp for this single job --- + single_file_timestamp = reqtime.strftime("%Y%m%d-%H%M%S") + + # Create the AppParameters object from the flat list of UI values + # The first value in *args is s8bit_preset_selector, the rest match the keys + params = AppParameters(input_file=input_file, **dict(zip(ALL_PARAM_KEYS, args))) + + # Run the core pipeline, Pass the timestamp to the pipeline + results, final_params = run_single_file_pipeline(input_file, single_file_timestamp, params) + + if results is None: + raise gr.Error("File processing failed. Check console for details.") + + # --- Calculate and print the total job time --- + total_job_time = reqtime.time() - job_start_time + print(f"Total single-file job execution time: {total_job_time:.2f} seconds.") + + # --- Prepare UI updates using the returned final_params --- + # This ensures the UI always reflects the parameters that were actually used for the render. final_ui_updates = [] - if ui_updates: # If auto-recommendation was successful - # We have new values, so we create a list of these values in the correct order. 
- for param in param_order: - final_ui_updates.append(ui_updates.get(param)) + + # Logic to decide what the preset selector should show after the run + if params.s8bit_preset_selector == "Auto-Recommend (Analyze MIDI)": + # After auto-recommendation, the state becomes "Custom" + final_ui_updates.append(gr.update(value="Custom")) else: - # No auto-recommendation, so we tell Gradio not to change the UI. - # We send a gr.update() for each UI component. - for _ in param_order: - final_ui_updates.append(gr.update()) - - # The final return is a combination of the result values and the UI update values. - return list(results) + final_ui_updates + # Otherwise, just keep the user's current selection + final_ui_updates.append(gr.update(value=final_params.s8bit_preset_selector)) + + # Get the keys for the 13 synthesizer controls (excluding the preset selector itself) + s8bit_control_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_') and key != 's8bit_preset_selector'] + + # Always update all 13 controls to match the final parameters used in the backend + for key in s8bit_control_keys: + final_ui_updates.append(getattr(final_params, key)) + + # Format the main results for the output components + main_results = [ + results['md5_hash'], results['title'], results['summary'], + results['final_midi_path'], results['final_audio_path'], + results['plot'], results['description'] + ] + + # The total return list now has a consistent structure and logic + return main_results + final_ui_updates + # ================================================================================================= # === Gradio UI Setup === @@ -1451,7 +1553,7 @@ if __name__ == "__main__": initialize_app() # --- Prepare soundfonts and make the map globally accessible --- - global soundfonts_dict + global soundfonts_dict, demucs_model # On application start, download SoundFonts from Hugging Face Hub if they don't exist. 
soundfonts_dict = prepare_soundfonts() print(f"Found {len(soundfonts_dict)} local SoundFonts.") @@ -1471,6 +1573,22 @@ if __name__ == "__main__": print(f"Warning: Could not load Demucs model. Vocal separation will not be available. Error: {e}") demucs_model = None + # --- Dictionary containing descriptions for each render type --- + RENDER_TYPE_DESCRIPTIONS = { + "Render as-is": "**Mode: Pass-through.** Renders the MIDI file directly without any modifications. Advanced MIDI options will be ignored.", + "Custom render": "**Mode: Activate Advanced Options.** Applies all settings from the 'Advanced MIDI Rendering Options' accordion without making other structural changes to the MIDI.", + "Extract melody": "**Action: Simplify.** Analyzes all tracks and attempts to isolate and render only the main melody line.", + "Flip": "**Action: Experimental.** Inverts the pitch of each note around the song's average pitch.", + "Reverse": "**Action: Experimental.** Reverses the playback order of all notes in the MIDI file.", + "Repair Durations": "**Action: Fix.** Recalculates note durations to ensure they connect smoothly (legato), filling any small gaps.", + "Repair Chords": "**Action: Fix.** Analyzes and aligns notes that occur at similar times to form cleaner, more structured chords.", + "Remove Duplicate Pitches": "**Action: Simplify.** If multiple instruments play the exact same pitch at the same time, it keeps only one.", + "Longest Repeating Phrase": "**Action: Analyze.** Finds the longest, most-repeated musical phrase (often the chorus) and renders only that section.", + "Multi-Instrumental Summary": "**Action: AI Summary.** Creates a short, compressed summary of a complex, multi-instrument song.", + "Solo Piano Summary": "**Action: AI Summary.** First converts the song to a solo piano arrangement, then creates a short, compressed summary.", + "Add Drum Track": "**Action: Enhance.** Analyzes the rhythm of the MIDI and automatically generates a basic drum track to accompany 
it." + } + # --- Define a constant for the fallback preset name --- # This prevents errors if the preset name is changed in the dictionary. FALLBACK_PRESET_NAME = "Generic Chiptune Loop" @@ -1621,7 +1739,7 @@ if __name__ == "__main__": # --- Epic & Orchestral Pads --- "Dragon Quest (Orchestral Feel / ドラゴンクエスト)": { # Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section. - 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6, + 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6, 'vibrato_rate': 3.0, 'vibrato_depth': 4, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.9, @@ -1801,84 +1919,82 @@ if __name__ == "__main__": # --- General & All-Purpose --- "Default (Balanced)": { 'description': "A good all-around starting point for most music types.", - 'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 128, - 'min_freq': 60, 'max_freq': 4000, - 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False + 'onset_threshold': 0.5, 'frame_threshold': 0.3, 'minimum_note_length': 128, + 'minimum_frequency': 60, 'maximum_frequency': 4000, + 'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': False }, "Anime / J-Pop": { 'description': "For tracks with clear melodies and pop/rock arrangements.", - 'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 150, - 'min_freq': 40, 'max_freq': 2500, - 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True + 'onset_threshold': 0.5, 'frame_threshold': 0.3, 'minimum_note_length': 150, + 'minimum_frequency': 40, 'maximum_frequency': 2500, + 'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True }, # --- Specific Instruments --- "Solo Vocals": { 'description': "Optimized for a single singing voice. 
Sensitive to nuances.", - 'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100, - 'min_freq': 80, 'max_freq': 1200, - 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True + 'onset_threshold': 0.4, 'frame_threshold': 0.3, 'minimum_note_length': 100, + 'minimum_frequency': 80, 'maximum_frequency': 1200, + 'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True }, "Solo Piano": { 'description': "For solo piano with a wide dynamic and frequency range.", - 'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 120, - 'min_freq': 27, 'max_freq': 4200, - 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True + 'onset_threshold': 0.4, 'frame_threshold': 0.3, 'minimum_note_length': 120, + 'minimum_frequency': 27, 'maximum_frequency': 4200, + 'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True }, "Acoustic Guitar": { 'description': "Balanced for picked or strummed acoustic guitar.", - 'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 90, - 'min_freq': 80, 'max_freq': 2500, - 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False + 'onset_threshold': 0.5, 'frame_threshold': 0.3, 'minimum_note_length': 90, + 'minimum_frequency': 80, 'maximum_frequency': 2500, + 'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': False }, "Bass Guitar": { 'description': "Isolates and transcribes only the low frequencies of a bassline.", - 'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100, - 'min_freq': 30, 'max_freq': 400, - 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False + 'onset_threshold': 0.4, 'frame_threshold': 0.3, 'minimum_note_length': 100, + 'minimum_frequency': 30, 'maximum_frequency': 400, + 'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': False }, "Percussion / Drums": { 'description': "For drums and rhythmic elements. 
Catches fast, sharp hits.", - 'onset_thresh': 0.7, 'frame_thresh': 0.6, 'min_note_len': 30, - 'min_freq': 40, 'max_freq': 10000, - 'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False + 'onset_threshold': 0.7, 'frame_threshold': 0.6, 'minimum_note_length': 30, + 'minimum_frequency': 40, 'maximum_frequency': 10000, + 'infer_onsets': True, 'melodia_trick': False, 'multiple_bends': False }, # --- Complex Genres --- "Rock / Metal": { 'description': "Higher thresholds for distorted guitars, bass, and drums in a dense mix.", - 'onset_thresh': 0.6, 'frame_thresh': 0.4, 'min_note_len': 100, - 'min_freq': 50, 'max_freq': 3000, - 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True + 'onset_threshold': 0.6, 'frame_threshold': 0.4, 'minimum_note_length': 100, + 'minimum_frequency': 50, 'maximum_frequency': 3000, + 'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True }, "Jazz (Multi-instrument)": { 'description': "High thresholds to separate notes in complex, improvisational passages.", - 'onset_thresh': 0.7, 'frame_thresh': 0.5, 'min_note_len': 150, - 'min_freq': 55, 'max_freq': 2000, - 'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': True + 'onset_threshold': 0.7, 'frame_threshold': 0.5, 'minimum_note_length': 150, + 'minimum_frequency': 55, 'maximum_frequency': 2000, + 'infer_onsets': True, 'melodia_trick': False, 'multiple_bends': True }, "Classical (Orchestral)": { 'description': "Longer note length to focus on sustained notes and filter out performance noise.", - 'onset_thresh': 0.5, 'frame_thresh': 0.4, 'min_note_len': 200, - 'min_freq': 32, 'max_freq': 4200, - 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True + 'onset_threshold': 0.5, 'frame_threshold': 0.4, 'minimum_note_length': 200, + 'minimum_frequency': 32, 'maximum_frequency': 4200, + 'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True }, "Electronic / Synth": 
{ 'description': "Low thresholds and short note length for sharp, synthetic sounds.", - 'onset_thresh': 0.3, 'frame_thresh': 0.2, 'min_note_len': 50, - 'min_freq': 20, 'max_freq': 8000, - 'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False + 'onset_threshold': 0.3, 'frame_threshold': 0.2, 'minimum_note_length': 50, + 'minimum_frequency': 20, 'maximum_frequency': 8000, + 'infer_onsets': True, 'melodia_trick': False, 'multiple_bends': False } } # --- UI visibility logic now controls three components --- - def update_vocal_ui_visibility(separate_vocals, remerge_audio): + def update_vocal_ui_visibility(separate_vocals): """Shows or hides the separation-related UI controls based on selections.""" is_visible = gr.update(visible=separate_vocals) - # The "Transcribe Both" checkbox is only visible if separation AND re-merging are active - transcribe_both_visible = gr.update(visible=(separate_vocals and remerge_audio)) - return is_visible, is_visible, transcribe_both_visible + return is_visible, is_visible, is_visible def update_ui_visibility(transcription_method, soundfont_choice): """ @@ -1892,6 +2008,22 @@ if __name__ == "__main__": synth_8bit_settings: gr.update(visible=is_8bit), } + # --- Function to control visibility of advanced MIDI rendering options --- + def update_advanced_midi_options_visibility(render_type_choice): + """ + Shows or hides the advanced MIDI rendering options based on the render type. + The options are only visible if the type is NOT 'Render as-is'. + """ + is_visible = (render_type_choice != "Render as-is") + return gr.update(visible=is_visible) + + # --- UI controller function to update the description text --- + def update_render_type_description(render_type_choice): + """ + Returns the description for the selected render type. 
+ """ + return RENDER_TYPE_DESCRIPTIONS.get(render_type_choice, "Select a render type to see its description.") + # --- Controller function to apply basic_pitch presets to the UI --- def apply_basic_pitch_preset(preset_name): if preset_name not in BASIC_PITCH_PRESETS: @@ -1902,14 +2034,14 @@ if __name__ == "__main__": # Return a dictionary that maps each UI component to its new value return { - onset_threshold: gr.update(value=settings['onset_thresh']), - frame_threshold: gr.update(value=settings['frame_thresh']), - minimum_note_length: gr.update(value=settings['min_note_len']), - minimum_frequency: gr.update(value=settings['min_freq']), - maximum_frequency: gr.update(value=settings['max_freq']), - infer_onsets: gr.update(value=settings['infer_onsets_bool']), - melodia_trick: gr.update(value=settings['melodia_trick_bool']), - multiple_pitch_bends: gr.update(value=settings['multiple_bends_bool']) + onset_threshold: gr.update(value=settings['onset_threshold']), + frame_threshold: gr.update(value=settings['frame_threshold']), + minimum_note_length: gr.update(value=settings['minimum_note_length']), + minimum_frequency: gr.update(value=settings['minimum_frequency']), + maximum_frequency: gr.update(value=settings['maximum_frequency']), + infer_onsets: gr.update(value=settings['infer_onsets']), + melodia_trick: gr.update(value=settings['melodia_trick']), + multiple_pitch_bends: gr.update(value=settings['multiple_bends']) } # --- Function to apply 8-bit synthesizer presets --- @@ -1917,33 +2049,34 @@ if __name__ == "__main__": def apply_8bit_preset(preset_name): """ Takes the name of a preset and returns a dictionary of gr.update objects - to set the values of all 13 of the 8-bit synthesizer's UI components. + to set the values of the 13 8-bit synthesizer control components. + This version is more robust as it directly maps keys to UI components. 
""" - # --- Use a list of keys for consistent updates --- - param_keys = [ - 'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate', - 'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level', - 'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate' - ] - - # If the user selects "Custom" or the preset is not found, do not change the values. - if preset_name == "Custom" or preset_name not in S8BIT_PRESETS: - # When switching to custom, don't change any values, just return empty updates. - return {comp: gr.update() for comp in s8bit_ui_components} - + # If a special value is selected or the preset is not found, return empty updates for all controls. + if preset_name in ["Custom", "Auto-Recommend (Analyze MIDI)"] or preset_name not in S8BIT_PRESETS: + # We create a dictionary mapping each control component to an empty update. + s8bit_control_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_') and key != 's8bit_preset_selector'] + return {ui_component_map[key]: gr.update() for key in s8bit_control_keys} + # Get the settings dictionary for the chosen preset. settings = S8BIT_PRESETS[preset_name] + updates = {} + + # Iterate through the KEY-VALUE pairs in the chosen preset's settings. + for simple_key, value in settings.items(): + # Reconstruct the full component key (e.g., 'waveform_type' -> 's8bit_waveform_type') + full_key = f"s8bit_{simple_key}" + + # Check if this key corresponds to a valid UI component + if full_key in ui_component_map: + component = ui_component_map[full_key] + updates[component] = gr.update(value=value) + + return updates - # Create a dictionary mapping UI components to their new values from the preset. 
- update_dict = {} - for i, key in enumerate(param_keys): - component = s8bit_ui_components[i] - value = settings.get(key) - if value is not None: - update_dict[component] = gr.update(value=value) - else: - update_dict[component] = gr.update() - return update_dict + # --- Use the dataclass to define the master list of parameter keys --- + # This is now the single source of truth for parameter order. + ALL_PARAM_KEYS = [field.name for field in fields(AppParameters) if field.name not in ["input_file", "batch_input_files"]] app = gr.Blocks(theme=gr.themes.Base()) @@ -1954,336 +2087,329 @@ if __name__ == "__main__": "This application combines piano audio transcription with a powerful MIDI transformation and rendering toolkit. " "Based on the work of [asigalov61](https://github.com/asigalov61)." ) - - with gr.Row(): - waveform_options = gr.WaveformOptions(show_recording_waveform=False) - with gr.Column(scale=1): - # --- INPUT COLUMN --- - gr.Markdown("## 1. Upload File") - - # Changed from gr.File to gr.Audio to allow for audio preview. - # type="filepath" ensures the component returns a string path to the uploaded file. - # The component will show a player for supported audio types (e.g., WAV, MP3). - input_file = gr.Audio( - label="Input Audio or MIDI File", - type="filepath", - sources=["upload"], waveform_options=waveform_options - ) - - gr.Markdown("## 2. Configure Processing") - - # --- Transcription Method Selector --- - transcription_method = gr.Radio( - ["General Purpose", "Piano-Specific"], - label="Audio Transcription Method", - value="General Purpose", - info="Choose 'General Purpose' for most music (vocals, etc.). Choose 'Piano-Specific' only for solo piano recordings." - ) - - # --- Stereo Processing Checkbox --- - enable_stereo_processing = gr.Checkbox( - label="Enable Stereo Transcription", - value=False, - info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time." 
- ) - - # --- Vocal Separation Checkboxes --- - with gr.Group(): - separate_vocals = gr.Checkbox( - label="Separate Vocals", - value=False, - info="If checked, separates the audio into vocals and music stems before processing." - ) - transcription_target = gr.Radio( - ["Transcribe Music (Accompaniment)", "Transcribe Vocals"], - label="Transcription Target", - value="Transcribe Music (Accompaniment)", - info="Choose which part of the separated audio to transcribe to MIDI.", - visible=False # Initially hidden - ) - remerge_vocals = gr.Checkbox( - label="Re-merge Other Part with Rendered Audio", - value=False, - info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.", - visible=False # Initially hidden - ) - # --- New checkbox for transcribing both stems --- - transcribe_both_stems = gr.Checkbox( - label="Transcribe Both Parts & Merge MIDI", - value=False, - info="If checked, transcribes BOTH vocals and music, then merges them into one MIDI file for rendering. Disables audio re-merging.", - visible=False # Initially hidden - ) - with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings: - # --- Preset dropdown for basic_pitch --- - basic_pitch_preset_selector = gr.Dropdown( - choices=["Custom"] + list(BASIC_PITCH_PRESETS.keys()), - value="Default (Balanced)", - label="Transcription Profile Preset", - info="Select a profile to auto-fill settings for different instrument types." - "For reference only; it is recommended to test and adjust for optimal results." - ) - - # --- The existing basic_pitch components --- - onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.") - frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. 
Higher is stricter.") - minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.") - minimum_frequency = gr.Slider(0, 500, value=60, step=5, label="Minimum Frequency (Hz)", info="Ignores pitches below this frequency.") - maximum_frequency = gr.Slider(501, 10000, value=4000, step=10, label="Maximum Frequency (Hz)", info="Ignores pitches above this frequency.") - infer_onsets = gr.Checkbox(value=True, label="Infer Onsets (Boost Onsets)") - melodia_trick = gr.Checkbox(value=True, label="Melodia Trick (Contour Optimization)") - multiple_pitch_bends = gr.Checkbox(value=False, label="Allow Multiple Pitch Bends") - - # --- Rendering Settings --- - render_type = gr.Radio( - ["Render as-is", "Custom render", "Extract melody", "Flip", "Reverse", "Repair Durations", "Repair Chords", "Remove Duplicate Pitches", "Longest Repeating Phrase", "Multi-Instrumental Summary", "Solo Piano Summary", "Add Drum Track"], - label="MIDI Transformation Render Type", - value="Render as-is", - info="Apply transformations to the MIDI before rendering. Select 'Render as-is' for basic rendering or other options for transformations." - ) + # --- Use Tabs for different workflows --- + with gr.Tabs(): + waveform_options = gr.WaveformOptions(show_recording_waveform=False) + # --- TAB 1: SINGLE FILE PROCESSING --- + with gr.TabItem("Single File Processing"): + # --- All of your existing UI components go inside this Tab --- + with gr.Row(): + with gr.Column(scale=1): + # --- INPUT COLUMN --- + gr.Markdown("## 1. 
Upload File") - # --- SoundFont Bank with 8-bit option --- - # --- Dynamically create the list of choices --- - soundfont_choices = [SYNTH_8_BIT_LABEL] + list(soundfonts_dict.keys()) - # Set a safe default value - default_sf_choice = "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7" if "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7" in soundfonts_dict else (soundfont_choices[0] if soundfont_choices else "") + # Changed from gr.File to gr.Audio to allow for audio preview. + # type="filepath" ensures the component returns a string path to the uploaded file. + # The component will show a player for supported audio types (e.g., WAV, MP3). + input_file = gr.Audio( + label="Input Audio or MIDI File", + type="filepath", + sources=["upload"], waveform_options=waveform_options + ) + # --- The single file processing button --- + submit_btn = gr.Button("Process and Render Single File", variant="primary") + + with gr.Column(scale=2): + # --- OUTPUT COLUMN --- + gr.Markdown("### 2. Results") + output_midi_title = gr.Textbox(label="MIDI Title") + output_song_description = gr.Textbox(label="MIDI Description", lines=3) + output_audio = gr.Audio(label="Rendered Audio Output", format="wav", waveform_options=waveform_options) + output_plot = gr.Plot(label="MIDI Score Plot") + with gr.Row(): + output_midi = gr.File(label="Download Processed MIDI File", file_types=[".mid"]) + output_midi_md5 = gr.Textbox(label="Output MIDI MD5 Hash") + output_midi_summary = gr.Textbox(label="MIDI metadata summary", lines=4) - soundfont_bank = gr.Dropdown( - soundfont_choices, - label="SoundFont / Synthesizer", - value=default_sf_choice - ) - - render_sample_rate = gr.Radio( - ["16000", "32000", "44100"], - label="Audio Sample Rate", - value="44100" - ) - # --- 8-bit Synthesizer Settings --- - # - # ================================================================================= - # === 8-Bit Synthesizer Parameter Guide === - # ================================================================================= - # - # --- 
Basic Tone Shaping --- - # - # Waveform Type: The fundamental timbre of the sound. - # - Square: The classic, bright, somewhat hollow sound of the NES. Its tone is heavily modified by Pulse Width. - # - Sawtooth: Aggressive, buzzy, and rich. Great for intense leads or gritty basslines. - # - Triangle: Soft, pure, and flute-like. Often used for basslines or gentler melodies. - # - # Pulse Width (Square Wave Only): Modifies the character of the Square wave. - # - Low (near 0.1) or High (near 0.9): Creates a thin, sharp, or nasal sound. A common choice for classic leads. - # - Mid (near 0.5): A "perfect" square wave. The sound is full, round, and most robust. - # - # Envelope Type: Shapes the volume of each note over its duration. - # - Plucky (AD): Creates a percussive, short sound that attacks instantly and then fades. Ideal for fast melodies and arpeggios. - # - Sustained (Full Decay): Creates a held-out sound that lasts for the note's full duration. Ideal for pads and atmospheric sounds. - # - # Decay Time (s): Controls how long a note's sound lasts (in the Plucky envelope). - # - Low: Very short, staccato notes. - # - High: Longer, more resonant notes that can bleed into each other. - # - # Bass Boost Level: Mixes in a sub-octave (a square wave one octave lower). - # - Low (or 0): The pure, original waveform. - # - High: Adds significant weight, thickness, and power to the sound. - # - # --- Modulation & Performance --- - # - # Vibrato Rate (Hz): The SPEED of the pitch wobble. - # - Low: A slow, gentle wavering effect. - # - High (8Hz+): A fast, frantic buzzing or trembling effect. Can create "ring-mod" style sounds at extreme values. - # - # Vibrato Depth (Hz): The INTENSITY of the pitch wobble. - # - Low (or 0): A very subtle effect, or no vibrato at all. - # - High: An extreme, dramatic pitch bend. Can sound chaotic or like a siren at extreme values. - # - # Smooth Notes (Checkbox): - # - Enabled: Applies a tiny fade-in/out to reduce clicking artifacts. 
Makes the sound slightly softer but cleaner. - # - Disabled: More abrupt, harsh note onsets. Can be desirable for an aggressive sound. - # - # Continuous Vibrato (Checkbox): - # - Enabled: The vibrato is smooth and connected across a musical phrase, creating a "singing" or legato effect. - # - Disabled: The vibrato resets on each new note, creating a bouncy, per-note, staccato effect (key for the "Mario" style). - # - # --- FX & Advanced Synthesis --- - # - # Noise Level: Mixes in white noise with the main waveform. - # - Low (or 0): No noise. - # - High: Adds "air," "grit," or a "hissing" quality. Essential for simulating percussion or creating wind-like sound effects. - # - # Distortion Level: Applies a wave-shaping algorithm to make the sound harsher. - # - Low (or 0): The clean, original sound. - # - High: Progressively crushes and saturates the waveform, creating a very aggressive, "fuzzy" or "broken" tone. - # - # FM Depth (Frequency Modulation): Controls the intensity of the frequency modulation. - # - Low (or 0): No FM effect. - # - High: The main frequency is more heavily altered by the FM Rate, creating complex, bell-like, metallic, or dissonant tones. - # - # FM Rate (Frequency Modulation): Controls the speed of the modulating oscillator. - # - Low: Creates a slow, vibrato-like or "wobbling" FM effect. - # - High: Creates fast modulation, resulting in bright, complex, often metallic harmonics and sidebands. 
- # ================================================================================= - # - # --- New option for auto-recommendation --- - # Define the 8-bit UI components in one place for easy reference - with gr.Accordion("8-bit Synthesizer Settings", open=False, visible=False) as synth_8bit_settings: - # --- Preset selector dropdown --- - s8bit_preset_selector = gr.Dropdown( - choices=["Custom", "Auto-Recommend (Analyze MIDI)"] + list(S8BIT_PRESETS.keys()), - value="Custom", - label="Style Preset", - info="Select a preset to auto-fill the settings below. Choose 'Custom' for manual control.\nFor reference and entertainment only. These presets are not guaranteed to be perfectly accurate." - ) + # --- TAB 2: BATCH PROCESSING --- + with gr.TabItem("Batch Processing"): + with gr.Row(): + with gr.Column(): + gr.Markdown("### 1. Upload Files") + gr.Markdown("Uses the **global settings** configured above.") + batch_input_files = gr.File( + label="Upload Audio or MIDI Files", + file_count="multiple" + ) + + batch_process_btn = gr.Button("Process Batch", variant="primary") + + with gr.Column(): + gr.Markdown("### 2. Download Results") + batch_output_audio_files = gr.File( + label="Download Rendered FLAC Files", + file_count="multiple", + interactive=False + ) + batch_output_midi_files = gr.File( + label="Download Processed MIDI Files", + file_count="multiple", + interactive=False + ) + # --- Global Settings Accordion, Define all settings in a global, shared accordion --- + with gr.Accordion("▶️ Configure Global Settings (for both Single File and Batch)", open=True): + with gr.Row(): + with gr.Column(scale=1): + gr.Markdown("### Transcription Settings") + # --- Transcription Method Selector --- + transcription_method = gr.Radio(["General Purpose", "Piano-Specific"], label="Audio Transcription Method", value="General Purpose", + info="Choose 'General Purpose' for most music (vocals, etc.). 
Choose 'Piano-Specific' only for solo piano recordings.") + # --- Stereo Processing Checkbox --- + enable_stereo_processing = gr.Checkbox(label="Enable Stereo Transcription", value=False, + info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time.") - s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type") - s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)") - s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type") - s8bit_decay_time_s = gr.Slider(0.01, 1.0, value=0.1, step=0.01, label="Decay Time (s)") # Increased max to 0.6 for DQ style - s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)") - s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)") - s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.") - s8bit_smooth_notes_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Smooth Notes Level", info="Level of fade-in/out to reduce clicks. 0=off, 1=max.") - s8bit_continuous_vibrato_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Continuous Vibrato Level", info="Controls vibrato continuity. 0=resets per note, 1=fully continuous.") - - # --- New accordion for advanced effects --- - with gr.Accordion("Advanced Synthesis & FX", open=False): - s8bit_noise_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Noise Level", info="Mixes in white noise. 
Great for percussion or adding 'air'.") - s8bit_distortion_level = gr.Slider(minimum=0.0, maximum=0.9, value=0.0, step=0.05, label="Distortion Level", info="Applies wave-shaping distortion for a grittier, harsher sound.") - s8bit_fm_modulation_depth = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="FM Depth", info="Depth of Frequency Modulation. Creates complex, metallic, or bell-like tones.") - s8bit_fm_modulation_rate = gr.Slider(minimum=0.0, maximum=500.0, value=0.0, step=1.0, label="FM Rate", info="Rate of Frequency Modulation. Higher values create brighter, more complex harmonics.") - - # --- Original Advanced Options (Now tied to Piano-Specific) --- - with gr.Accordion("Advanced MIDI Rendering Options", open=False) as advanced_rendering_options: - render_with_sustains = gr.Checkbox(label="Apply sustain pedal effects (if present)", value=True) - render_output_as_solo_piano = gr.Checkbox(label="Convert to Solo Piano (Grand Piano patch)", value=False) - render_remove_drums = gr.Checkbox(label="Remove drum track", value=False) - render_transpose_to_C4 = gr.Checkbox(label="Transpose entire score to center around C4", value=False) - render_transpose_value = gr.Slider(-12, 12, value=0, step=1, label="Transpose (semitones)") - custom_render_patch = gr.Slider(-1, 127, value=-1, step=1, label="Force MIDI Patch (-1 to disable)") - merge_misaligned_notes = gr.Slider(-1, 127, value=-1, label="Time to merge notes in ms (-1 to disable)") - render_align = gr.Radio( - ["Do not align", "Start Times", "Start Times and Durations", "Start Times and Split Durations"], - label="Align notes to musical bars", - value="Do not align" + # --- Vocal Separation Checkboxes --- + with gr.Group(): + separate_vocals = gr.Checkbox(label="Separate Vocals", value=False, + info="If checked, separates the audio into vocals and music stems before processing.") + transcription_target = gr.Radio(["Transcribe Music (Accompaniment)", "Transcribe Vocals"], label="Transcription Target", 
value="Transcribe Music (Accompaniment)", visible=False, + info="Choose which part of the separated audio to transcribe to MIDI.") + remerge_vocals = gr.Checkbox(label="Re-merge Other Part with Rendered Audio", value=False, visible=False, + info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.") + transcribe_both_stems = gr.Checkbox(label="Transcribe Both Parts & Merge MIDI", value=False, visible=False, + info="If checked, transcribes BOTH vocals and music, then merges them into one MIDI file for rendering. Disables audio re-merging.") + + with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings: + # --- Preset dropdown for basic_pitch --- + basic_pitch_preset_selector = gr.Dropdown( + choices=["Custom"] + list(BASIC_PITCH_PRESETS.keys()), + value="Default (Balanced)", + label="Transcription Profile Preset", + info="Select a profile to auto-fill settings for different instrument types." + "For reference only; it is recommended to test and adjust for optimal results.") + # --- The existing basic_pitch components --- + onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.") + frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. 
Higher is stricter.") + minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.") + minimum_frequency = gr.Slider(0, 500, value=60, step=5, label="Minimum Frequency (Hz)", info="Ignores pitches below this frequency.") + maximum_frequency = gr.Slider(501, 10000, value=4000, step=10, label="Maximum Frequency (Hz)", info="Ignores pitches above this frequency.") + infer_onsets = gr.Checkbox(value=True, label="Infer Onsets (Boost Onsets)") + melodia_trick = gr.Checkbox(value=True, label="Melodia Trick (Contour Optimization)") + multiple_pitch_bends = gr.Checkbox(value=False, label="Allow Multiple Pitch Bends") + + with gr.Column(scale=1): + # --- Rendering Settings --- + gr.Markdown("### MIDI Transformation & Rendering Settings") + render_type = gr.Radio( + list(RENDER_TYPE_DESCRIPTIONS.keys()), # Use keys from dict for choices + ["Render as-is", "Custom render", "Extract melody", "Flip", "Reverse", "Repair Durations", "Repair Chords", "Remove Duplicate Pitches", "Longest Repeating Phrase", "Multi-Instrumental Summary", "Solo Piano Summary", "Add Drum Track"], + label="MIDI Transformation Render Type", + value="Render as-is", + info="Apply transformations to the MIDI before rendering. 
Select 'Render as-is' for basic rendering or other options for transformations.") + # --- A Markdown box for the dynamic descriptions --- + render_type_info = gr.Markdown( + value=RENDER_TYPE_DESCRIPTIONS["Render as-is"], # Set initial value + elem_classes="description-box" # Optional: for CSS styling ) + # --- SoundFont Bank with 8-bit option --- + soundfont_bank = gr.Dropdown( + [SYNTH_8_BIT_LABEL] + list(soundfonts_dict.keys()), + label="SoundFont / Synthesizer", + value=list(soundfonts_dict.keys())[0] if soundfonts_dict else SYNTH_8_BIT_LABEL) + render_sample_rate = gr.Radio( + ["16000", "32000", "44100"], + label="Audio Sample Rate", + value="44100") + + with gr.Accordion("Advanced MIDI Rendering Options", open=False) as advanced_rendering_options: + render_with_sustains = gr.Checkbox(label="Apply sustain pedal effects (if present)", value=True, + info="Applies sustain pedal effects (CC64) to lengthen notes, creating a more realistic and connected performance, especially for piano.") + render_output_as_solo_piano = gr.Checkbox(label="Convert to Solo Piano (Grand Piano patch)", value=False, + info="Converts all non-drum instruments to a Grand Piano patch, creating a solo piano arrangement of the entire score.") + render_remove_drums = gr.Checkbox(label="Remove drum track", value=False, + info="Removes the entire drum track (typically MIDI Channel 9) from the score. Ideal for creating instrumental or karaoke versions.") + render_transpose_to_C4 = gr.Checkbox(label="Transpose entire score to center around C4", value=False, + info="Transposes the entire score so that its average pitch is centered around C4 (MIDI note 60). 
Useful for standardizing key.") + render_transpose_value = gr.Slider(-12, 12, value=0, step=1, label="Transpose (semitones)", + info="Shifts the pitch of all non-drum notes up (positive values) or down (negative values) by the specified number of semitones.") + custom_render_patch = gr.Slider(-1, 127, value=-1, step=1, label="Force MIDI Patch (-1 to disable)", + info="Forces all non-drum instruments to use a single specified MIDI patch number. Set to -1 to use the original instruments.") + merge_misaligned_notes = gr.Slider(-1, 127, value=-1, label="Time to merge notes in ms (-1 to disable)", + info="Aligns the start times of notes that are played almost simultaneously (within the specified ms threshold). Cleans up sloppy timing. -1 to disable.") + render_align = gr.Radio( + ["Do not align", "Start Times", "Start Times and Durations", "Start Times and Split Durations"], + label="Align notes to musical bars", + value="Do not align", + info="Quantizes the score to a fixed bar length. 'Start Times' aligns onsets. " + "'Durations' trims notes at the bar line. 'Split Durations' splits notes that cross the bar line." + ) + + with gr.Column(scale=1): + # --- 8-bit Synthesizer Settings --- + # + # ================================================================================= + # === 8-Bit Synthesizer Parameter Guide === + # ================================================================================= + # + # --- Basic Tone Shaping --- + # + # Waveform Type: The fundamental timbre of the sound. + # - Square: The classic, bright, somewhat hollow sound of the NES. Its tone is heavily modified by Pulse Width. + # - Sawtooth: Aggressive, buzzy, and rich. Great for intense leads or gritty basslines. + # - Triangle: Soft, pure, and flute-like. Often used for basslines or gentler melodies. + # + # Pulse Width (Square Wave Only): Modifies the character of the Square wave. + # - Low (near 0.1) or High (near 0.9): Creates a thin, sharp, or nasal sound. 
A common choice for classic leads. + # - Mid (near 0.5): A "perfect" square wave. The sound is full, round, and most robust. + # + # Envelope Type: Shapes the volume of each note over its duration. + # - Plucky (AD): Creates a percussive, short sound that attacks instantly and then fades. Ideal for fast melodies and arpeggios. + # - Sustained (Full Decay): Creates a held-out sound that lasts for the note's full duration. Ideal for pads and atmospheric sounds. + # + # Decay Time (s): Controls how long a note's sound lasts (in the Plucky envelope). + # - Low: Very short, staccato notes. + # - High: Longer, more resonant notes that can bleed into each other. + # + # Bass Boost Level: Mixes in a sub-octave (a square wave one octave lower). + # - Low (or 0): The pure, original waveform. + # - High: Adds significant weight, thickness, and power to the sound. + # + # --- Modulation & Performance --- + # + # Vibrato Rate (Hz): The SPEED of the pitch wobble. + # - Low: A slow, gentle wavering effect. + # - High (8Hz+): A fast, frantic buzzing or trembling effect. Can create "ring-mod" style sounds at extreme values. + # + # Vibrato Depth (Hz): The INTENSITY of the pitch wobble. + # - Low (or 0): A very subtle effect, or no vibrato at all. + # - High: An extreme, dramatic pitch bend. Can sound chaotic or like a siren at extreme values. + # + # Smooth Notes Level (slider, 0.0 to 1.0): + # - High (toward 1): Applies more fade-in/out smoothing to reduce clicking artifacts. Makes the sound slightly softer but cleaner. + # - Low (or 0): More abrupt, harsh note onsets. Can be desirable for an aggressive sound. + # + # Continuous Vibrato Level (slider, 0.0 to 1.0): + # - High (toward 1): The vibrato is smooth and connected across a musical phrase, creating a "singing" or legato effect. + # - Low (or 0): The vibrato resets on each new note, creating a bouncy, per-note, staccato effect (key for the "Mario" style). + # + # --- FX & Advanced Synthesis --- + # + # Noise Level: Mixes in white noise with the main waveform.
+ # - Low (or 0): No noise. + # - High: Adds "air," "grit," or a "hissing" quality. Essential for simulating percussion or creating wind-like sound effects. + # + # Distortion Level: Applies a wave-shaping algorithm to make the sound harsher. + # - Low (or 0): The clean, original sound. + # - High: Progressively crushes and saturates the waveform, creating a very aggressive, "fuzzy" or "broken" tone. + # + # FM Depth (Frequency Modulation): Controls the intensity of the frequency modulation. + # - Low (or 0): No FM effect. + # - High: The main frequency is more heavily altered by the FM Rate, creating complex, bell-like, metallic, or dissonant tones. + # + # FM Rate (Frequency Modulation): Controls the speed of the modulating oscillator. + # - Low: Creates a slow, vibrato-like or "wobbling" FM effect. + # - High: Creates fast modulation, resulting in bright, complex, often metallic harmonics and sidebands. + # ================================================================================= + # + # --- New option for auto-recommendation --- + # Define the 8-bit UI components in one place for easy reference + gr.Markdown("### 8-bit Synthesizer Settings") + with gr.Accordion("8-bit Synthesizer Settings", open=True, visible=False) as synth_8bit_settings: + s8bit_preset_selector = gr.Dropdown(choices=["Custom", "Auto-Recommend (Analyze MIDI)"] + list(S8BIT_PRESETS.keys()), value="Custom", label="Style Preset", + info="Select a preset to auto-fill the settings below. Choose 'Custom' for manual control.\nFor reference and entertainment only. 
These presets are not guaranteed to be perfectly accurate.") + s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type") + s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)") + s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type") + s8bit_decay_time_s = gr.Slider(0.01, 1.0, value=0.1, step=0.01, label="Decay Time (s)") + s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)") + s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)") + s8bit_bass_boost_level = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.") + s8bit_smooth_notes_level = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Smooth Notes Level", info="Level of fade-in/out to reduce clicks. 0=off, 1=max.") + s8bit_continuous_vibrato_level = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Continuous Vibrato Level", info="Controls vibrato continuity. 0=resets per note, 1=fully continuous.") + # --- New accordion for advanced effects --- + with gr.Accordion("Advanced Synthesis & FX", open=False): + s8bit_noise_level = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Noise Level", info="Mixes in white noise. Great for percussion or adding 'air'.") + s8bit_distortion_level = gr.Slider(0.0, 0.9, value=0.0, step=0.05, label="Distortion Level", info="Applies wave-shaping distortion for a grittier, harsher sound.") + s8bit_fm_modulation_depth = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="FM Depth", info="Depth of Frequency Modulation. Creates complex, metallic, or bell-like tones.") + s8bit_fm_modulation_rate = gr.Slider(0.0, 500.0, value=0.0, step=1.0, label="FM Rate", info="Rate of Frequency Modulation. 
Higher values create brighter, more complex harmonics.") + + # Create a dictionary mapping key names to the actual Gradio components + ui_component_map = locals() + + # Build the list of all setting components in the correct order using ALL_PARAM_KEYS + all_settings_components = [ui_component_map[key] for key in ALL_PARAM_KEYS] + + # --- FIX START: Isolate the preset selector from the controls it updates --- + # Original list of all 14 synth components + s8bit_ui_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_')] + s8bit_ui_components = [ui_component_map[key] for key in s8bit_ui_keys] - submit_btn = gr.Button("Process and Render", variant="primary") - - with gr.Column(scale=2): - # --- OUTPUT COLUMN --- - gr.Markdown("## 3. Results") - output_midi_title = gr.Textbox(label="MIDI Title") - output_song_description = gr.Textbox(label="MIDI Description", lines=3) - output_audio = gr.Audio(label="Rendered Audio Output", format="wav", waveform_options=waveform_options) - output_plot = gr.Plot(label="MIDI Score Plot") - with gr.Row(): - output_midi = gr.File(label="Download Processed MIDI File", file_types=[".mid"]) - output_midi_md5 = gr.Textbox(label="Output MIDI MD5 Hash") - output_midi_summary = gr.Textbox(label="MIDI metadata summary", lines=4) - - # Define all input components for the click event, excluding the preset selector which is not a direct input to the final processing. 
- # all_inputs now includes the preset selector itself - # Inputs for the main processing function - all_inputs = [ - input_file, - s8bit_preset_selector, - separate_vocals, - remerge_vocals, - transcription_target, - transcribe_both_stems, - enable_stereo_processing, - transcription_method, onset_threshold, frame_threshold, minimum_note_length, - minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends, - render_type, soundfont_bank, render_sample_rate, render_with_sustains, - merge_misaligned_notes, custom_render_patch, render_align, render_transpose_value, - render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums, - s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s, - s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level, - s8bit_smooth_notes_level, s8bit_continuous_vibrato_level, - s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate - ] - - # Outputs for the main results - result_outputs = [ - output_midi_md5, output_midi_title, output_midi_summary, - output_midi, output_audio, output_plot, output_song_description - ] - + # NEW: Create a separate list containing only the 13 controls to be updated + s8bit_control_components = [comp for comp in s8bit_ui_components if comp != s8bit_preset_selector] + # The list of basic_pitch UI components that can be updated by its preset selector. - # This MUST be defined after the components themselves are created in the UI. - basic_pitch_ui_components = [ - onset_threshold, frame_threshold, minimum_note_length, minimum_frequency, - maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends - ] - - # The list of 8-bit UI components that can be updated - # This MUST be defined after the components themselves are created in the UI. 
- s8bit_ui_components = [ - s8bit_waveform_type, s8bit_pulse_width, s8bit_envelope_type, s8bit_decay_time_s, s8bit_vibrato_rate, - s8bit_vibrato_depth, s8bit_bass_boost_level, - s8bit_smooth_notes_level, s8bit_continuous_vibrato_level, - s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate - ] + basic_pitch_keys = ['onset_threshold', 'frame_threshold', 'minimum_note_length', 'minimum_frequency', 'maximum_frequency', + 'infer_onsets', 'melodia_trick', 'multiple_pitch_bends'] + basic_pitch_ui_components = [ui_component_map[key] for key in basic_pitch_keys] - # all_outputs now includes both results AND the UI controls to be updated - all_outputs = result_outputs + s8bit_ui_components + # Define inputs and outputs for Gradio events + single_file_inputs = [input_file] + all_settings_components + result_outputs = [output_midi_md5, output_midi_title, output_midi_summary, output_midi, output_audio, output_plot, output_song_description] + # The output list for the single file process now correctly includes all 14 synth components + single_file_outputs = result_outputs + s8bit_ui_components - # Event Handling + batch_inputs = [batch_input_files] + all_settings_components + batch_outputs = [batch_output_audio_files, batch_output_midi_files] + + # Event Handling for Single File Tab submit_btn.click( - process_and_render_file, - inputs=all_inputs, - outputs=all_outputs # Pass the combined list + fn=process_and_render_file, + inputs=single_file_inputs, + outputs=single_file_outputs + ) + # --- Event Handling for Batch Tab --- + batch_process_btn.click( + fn=batch_process_files, + inputs=batch_inputs, + outputs=batch_outputs ) - # --- Visibility logic is now more complex --- - # A simple lambda function to handle multiple inputs - update_visibility_lambda = lambda sep, rem: update_vocal_ui_visibility(sep, rem) - + # Event listeners for UI visibility and presets separate_vocals.change( - fn=update_visibility_lambda, - 
inputs=[separate_vocals, remerge_vocals], - outputs=[transcription_target, remerge_vocals, transcribe_both_stems] - ) - remerge_vocals.change( - fn=update_visibility_lambda, - inputs=[separate_vocals, remerge_vocals], + fn=update_vocal_ui_visibility, + inputs=separate_vocals, outputs=[transcription_target, remerge_vocals, transcribe_both_stems] ) # --- Listeners for dynamic UI updates --- transcription_method.change( - fn=update_ui_visibility, - inputs=[transcription_method, soundfont_bank], - outputs=[general_transcription_settings, synth_8bit_settings] + fn=lambda x: gr.update(visible=(x == "General Purpose")), + inputs=transcription_method, + outputs=general_transcription_settings ) soundfont_bank.change( - fn=update_ui_visibility, - inputs=[transcription_method, soundfont_bank], - outputs=[general_transcription_settings, synth_8bit_settings] + fn=lambda x: gr.update(visible=(x == SYNTH_8_BIT_LABEL)), + inputs=soundfont_bank, + outputs=synth_8bit_settings ) - + # --- Event listener for the new basic_pitch preset dropdown --- basic_pitch_preset_selector.change( fn=apply_basic_pitch_preset, - inputs=[basic_pitch_preset_selector], + inputs=basic_pitch_preset_selector, outputs=basic_pitch_ui_components ) - # This listener now correctly handles only the named presets, ignoring "Auto-Recommend" - # --- Event listener for the preset selector --- - # When the preset dropdown changes, it calls the `apply_8bit_preset` function. - # The input to the function is the selected preset name. - # The outputs are all the individual 8-bit setting components that need to be updated. 
- # This listener is for manual preset selection (e.g., choosing "Mega Man") + # --- Event listener for the 8-bit preset selector --- s8bit_preset_selector.change( fn=apply_8bit_preset, - inputs=[s8bit_preset_selector], - outputs=s8bit_ui_components # This now correctly targets the new sliders + inputs=s8bit_preset_selector, + outputs=s8bit_control_components ) + # --- New event listener for the render_type radio button --- + # This trigger now updates TWO components via chained events: the advanced options accordion, then the description box + render_type.change( + fn=update_advanced_midi_options_visibility, + inputs=render_type, + outputs=advanced_rendering_options + ).then( # Chain another event to the same trigger + fn=update_render_type_description, + inputs=render_type, + outputs=render_type_info + ) # Launch the Gradio app app.queue().launch(inbrowser=True, debug=True)