Audio-To-MIDI-And-Advanced-Renderer

Running

App Files Files Community

Audio-To-MIDI-And-Advanced-Renderer / app.py

avans06

feat: Implement stereo audio to MIDI transcription

b114cd4 9 days ago

raw

history blame

54.5 kB

	# =================================================================
	#
	# Merged and Integrated Script for Audio/MIDI Processing and Rendering (Stereo Enhanced)
	#
	# This script combines two functionalities:
	# 1. Transcribing audio to MIDI using two methods:
	# a) A general-purpose model (basic-pitch by Spotify).
	# b) A model specialized for solo piano (ByteDance).
	# - Includes stereo processing by splitting channels, transcribing independently, and merging MIDI.
	# 2. Applying advanced transformations and re-rendering MIDI files using:
	# a) Standard SoundFonts via FluidSynth (produces stereo audio).
	# b) A custom 8-bit style synthesizer for a chiptune sound (updated for stereo output).
	#
	# The user can upload an audio file (e.g., WAV, MP3) or a MIDI file.
	# - If an audio file is uploaded, it is first transcribed to MIDI using the selected method.
	# - The resulting MIDI (or an uploaded MIDI) can then be processed
	# with various effects and rendered into audio.
	#
	#================================================================
	# Original sources:
	# https://huggingface.co/spaces/asigalov61/ByteDance-Solo-Piano-Audio-to-MIDI-Transcription
	# https://huggingface.co/spaces/asigalov61/Advanced-MIDI-Renderer
	#================================================================
	# Packages:
	#
	# sudo apt install fluidsynth
	#
	# =================================================================
	# Requirements:
	#
	# pip install gradio torch pytz numpy scipy matplotlib networkx scikit-learn
	# pip install piano_transcription_inference huggingface_hub
	# pip install basic-pitch pretty_midi librosa soundfile
	#
	# =================================================================
	# Core modules:
	#
	# git clone --depth 1 https://github.com/asigalov61/tegridy-tools
	#
	# =================================================================

	import os
	import hashlib
	import time as reqtime
	import copy
	import librosa
	import pyloudnorm as pyln
	import soundfile as sf

	import torch
	import gradio as gr

	from src.piano_transcription.utils import initialize_app

	from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate

	# --- Import core transcription and MIDI processing libraries ---
	from src import TMIDIX, TPLOTS
	from src import MIDI
	from src.midi_to_colab_audio import midi_to_colab_audio

	# --- Imports for General Purpose Transcription (basic-pitch) ---
	import basic_pitch
	from basic_pitch.inference import predict
	from basic_pitch import ICASSP_2022_MODEL_PATH

	# --- Imports for 8-bit Synthesizer & MIDI Merging ---
	import pretty_midi
	import numpy as np
	from scipy import signal

	# =================================================================================================
	# === Hugging Face SoundFont Downloader ===
	# =================================================================================================
	from huggingface_hub import hf_hub_download
	import glob

	# --- Sentinel "soundfont" choice for the 8-bit synthesizer ---
	# Render_MIDI compares the selected soundfont_bank against this label to decide
	# whether to use the built-in 8-bit synthesizer instead of a SoundFont file.
	SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)"

	def prepare_soundfonts():
	    """
	    Ensure the default SoundFonts exist locally, then index every .sf2 file.

	    Any default SoundFont missing from 'src/sf2' is downloaded from the
	    Hugging Face Space repository. Afterwards the directory is scanned
	    recursively for all .sf2 files.

	    Returns:
	        dict: Maps a display name (path relative to 'src/sf2', without the
	        extension, using forward slashes) to the file path found by the scan.
	        Default SoundFonts that were found come first, in the order of
	        DEFAULT_SF2_FILENAMES; all others follow in alphabetical order.
	    """
	    SF2_REPO_ID = "asigalov61/Advanced-MIDI-Renderer"
	    SF2_DIR = "src/sf2"
	    # This list only ensures that the default files exist locally.
	    DEFAULT_SF2_FILENAMES = [
	        "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7.sf2",
	        "Orpheus_18.06.2020.sf2",
	        "Live HQ Natural SoundFont GM.sf2",
	        "Nice-Strings-PlusOrchestra-v1.6.sf2",
	        "KBH-Real-Choir-V2.5.sf2",
	        "SuperGameBoy.sf2",
	        "ProtoSquare.sf2",
	    ]

	    # Create the target directory if it doesn't exist
	    os.makedirs(SF2_DIR, exist_ok=True)

	    # --- Step 1: Ensure default SoundFonts are available ---
	    print("Checking for SoundFont files...")
	    for filename in DEFAULT_SF2_FILENAMES:
	        local_path = os.path.join(SF2_DIR, filename)

	        # Skip files that are already present to avoid re-downloading
	        if os.path.exists(local_path):
	            continue

	        print(f"Downloading '{filename}' from Hugging Face Hub...")
	        try:
	            hf_hub_download(
	                repo_id=SF2_REPO_ID,
	                repo_type='space',  # The repository is a Space
	                filename=filename,  # Path to the file within the repository
	                local_dir=SF2_DIR,
	            )
	            print(f"'{filename}' downloaded successfully.")
	        except Exception as e:
	            # Best effort: a failed download only makes this SoundFont
	            # unavailable; the scan below simply won't find it.
	            print(f"Error downloading {filename}: {e}")

	    # --- Step 2: Scan the entire directory for all .sf2 files ---
	    print(f"Scanning '{SF2_DIR}' for all .sf2 files...")
	    all_sfs_map = {}
	    # '**' together with recursive=True matches SF2_DIR itself and every
	    # nested subdirectory; '*.sf2' then matches the SoundFont files in each.
	    search_pattern = os.path.join(SF2_DIR, '**', '*.sf2')
	    for full_path in glob.glob(search_pattern, recursive=True):
	        # Display name keeps the subfolder (if any) and drops the extension
	        relative_path = os.path.relpath(full_path, SF2_DIR)
	        display_name = os.path.splitext(relative_path)[0].replace("\\", "/")
	        all_sfs_map[display_name] = full_path

	    # --- Step 3: Create the final ordered dictionary based on priority ---
	    default_display_names = [os.path.splitext(f)[0] for f in DEFAULT_SF2_FILENAMES]
	    default_name_set = set(default_display_names)

	    ordered_soundfont_map = {}

	    # Defaults first, in their declared order, but only those actually found
	    for name in default_display_names:
	        if name in all_sfs_map:
	            ordered_soundfont_map[name] = all_sfs_map[name]

	    # Everything else afterwards, sorted alphabetically
	    for name in sorted(n for n in all_sfs_map if n not in default_name_set):
	        ordered_soundfont_map[name] = all_sfs_map[name]

	    return ordered_soundfont_map

	# =================================================================================================
	# === 8-bit Style Synthesizer (Stereo Enabled) ===
	# =================================================================================================
	def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, pulse_width, vibrato_rate, vibrato_depth, bass_boost_level, fs=44100):
	    """
	    Synthesizes an 8-bit style stereo waveform from a PrettyMIDI-like object.

	    Waveforms are generated manually (no FluidSynth). Each note is rendered as
	    a mono oscillator, optionally mixed with a sub-octave square wave, shaped
	    by an amplitude envelope and added to a stereo buffer.

	    Panning depends on instrument order: when there are two or more
	    instruments, the first is hard left and the second hard right; every
	    other instrument (and a lone instrument) is centered.

	    Args:
	        midi_data: Object exposing get_end_time() and .instruments, where each
	            instrument has .notes with pitch, velocity, start and end.
	        waveform_type (str): 'Square', 'Sawtooth' or 'Triangle'.
	        envelope_type (str): 'Plucky (AD Envelope)' or 'Sustained (Full Decay)'.
	            Any other value leaves the envelope at zero (silent notes).
	        decay_time_s (float): Decay time in seconds for the plucky envelope.
	        pulse_width (float): Duty cycle of the square wave.
	        vibrato_rate (float): Vibrato LFO rate in Hz.
	        vibrato_depth (float): Vibrato LFO amplitude, added to the frequency.
	        bass_boost_level (float): Mix level of the sub-octave oscillator;
	            0 disables it.
	        fs (int): Output sample rate.

	    Returns:
	        np.ndarray: Array of shape (2, N) holding left and right channels.

	    Raises:
	        ValueError: If a note has to be rendered with an unsupported
	            waveform_type.
	    """
	    total_duration = midi_data.get_end_time()
	    # Stereo buffer (Left, Right) with one extra second of headroom
	    waveform = np.zeros((2, int(total_duration * fs) + fs))
	    buffer_len = waveform.shape[1]

	    num_instruments = len(midi_data.instruments)

	    for i, instrument in enumerate(midi_data.instruments):
	        # --- Panning Logic ---
	        # Default to center-panned mono
	        pan_l, pan_r = 0.707, 0.707
	        if num_instruments >= 2:
	            if i == 0:  # First instrument panned left
	                pan_l, pan_r = 1.0, 0.0
	            elif i == 1:  # Second instrument panned right
	                pan_l, pan_r = 0.0, 1.0
	            # Other instruments remain centered

	        for note in instrument.notes:
	            freq = pretty_midi.note_number_to_hz(note.pitch)
	            note_duration = note.end - note.start
	            num_samples = int(note_duration * fs)
	            if num_samples == 0:
	                continue

	            t = np.linspace(0., note_duration, num_samples, endpoint=False)

	            # --- Vibrato LFO ---
	            # NOTE(review): the LFO is added to the frequency before multiplying
	            # by t, so the effective deviation grows with time inside a note.
	            # Kept as-is to preserve the existing sound; confirm if intended.
	            vibrato_lfo = vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)
	            phase = 2 * np.pi * (freq + vibrato_lfo) * t

	            # --- Waveform Generation (Main Oscillator) ---
	            if waveform_type == 'Square':
	                note_waveform = signal.square(phase, duty=pulse_width)
	            elif waveform_type == 'Sawtooth':
	                note_waveform = signal.sawtooth(phase)
	            elif waveform_type == 'Triangle':
	                note_waveform = signal.sawtooth(phase, width=0.5)
	            else:
	                # Previously this fell through and failed later with an
	                # unbound-variable error; fail with a clear message instead.
	                raise ValueError(
	                    f"Unsupported waveform_type {waveform_type!r}; "
	                    "expected 'Square', 'Sawtooth' or 'Triangle'."
	                )

	            # --- Bass Boost (Sub-Octave Oscillator) ---
	            if bass_boost_level > 0:
	                bass_freq = freq / 2.0
	                # Only add bass if the frequency is reasonably audible
	                if bass_freq > 20:
	                    # Bass uses a simple square wave, no vibrato, for stability
	                    bass_sub_waveform = signal.square(2 * np.pi * bass_freq * t, duty=0.5)
	                    # As bass level increases, slightly decrease the main
	                    # waveform volume to prevent clipping.
	                    main_level = 1.0 - (0.5 * bass_boost_level)
	                    note_waveform = (note_waveform * main_level) + (bass_sub_waveform * bass_boost_level)

	            # --- Amplitude Envelope ---
	            start_amp = note.velocity / 127.0
	            envelope = np.zeros(num_samples)

	            if envelope_type == 'Plucky (AD Envelope)':
	                attack_time_s = 0.005
	                attack_samples = min(int(attack_time_s * fs), num_samples)
	                decay_samples = min(int(decay_time_s * fs), num_samples - attack_samples)

	                envelope[:attack_samples] = np.linspace(0, start_amp, attack_samples)
	                if decay_samples > 0:
	                    envelope[attack_samples:attack_samples + decay_samples] = np.linspace(start_amp, 0, decay_samples)
	            elif envelope_type == 'Sustained (Full Decay)':
	                envelope = np.linspace(start_amp, 0, num_samples)

	            # Apply envelope to the (potentially combined) waveform
	            note_waveform = note_waveform * envelope

	            start_sample = int(note.start * fs)
	            if start_sample >= buffer_len:
	                # Nothing of this note fits into the buffer
	                continue
	            end_sample = start_sample + num_samples
	            if end_sample > buffer_len:
	                end_sample = buffer_len
	                note_waveform = note_waveform[:end_sample - start_sample]

	            # Add the mono note waveform to the stereo buffer with panning
	            waveform[0, start_sample:end_sample] += note_waveform * pan_l
	            waveform[1, start_sample:end_sample] += note_waveform * pan_r

	    return waveform  # (2, N) numpy array


	def analyze_midi_velocity(midi_path):
	    """
	    Print note-velocity statistics for a MIDI file.

	    Reports count, min, max and mean velocity for each instrument and then
	    for the file as a whole. Nothing is returned.

	    Args:
	        midi_path (str): Path of the MIDI file to inspect.
	    """
	    def _print_range(values):
	        # Shared min / max / mean lines used by both report sections
	        print(f" Velocity min: {min(values)}")
	        print(f" Velocity max: {max(values)}")
	        print(f" Velocity mean: {np.mean(values):.2f}")

	    parsed = pretty_midi.PrettyMIDI(midi_path)
	    combined = []

	    print(f"Analyzing velocity for MIDI: {midi_path}")
	    for index, track in enumerate(parsed.instruments):
	        track_velocities = [n.velocity for n in track.notes]
	        combined += track_velocities

	        if not track_velocities:
	            print(f"Instrument {index} ({track.name}): no notes found.")
	            continue

	        print(f"Instrument {index} ({track.name}):")
	        print(f" Notes count: {len(track_velocities)}")
	        _print_range(track_velocities)

	    if not combined:
	        print("No notes found in this MIDI.")
	        return

	    print("\nOverall MIDI velocity stats:")
	    print(f" Total notes: {len(combined)}")
	    _print_range(combined)


	def scale_instrument_velocity(instrument, scale=0.8):
	    """
	    Scale the velocity of every note of an instrument in place.

	    The scaled value is truncated to an integer and clamped to 1..127.

	    Args:
	        instrument: Object whose .notes each carry a .velocity attribute.
	        scale (float): Multiplier applied to each velocity.
	    """
	    for n in instrument.notes:
	        scaled = int(n.velocity * scale)
	        if scaled > 127:
	            scaled = 127
	        elif scaled < 1:
	            scaled = 1
	        n.velocity = scaled


	def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
	    """
	    Normalizes the audio data to a target integrated loudness (LUFS).
	    This provides more consistent perceived volume than peak normalization.

	    This is a best-effort operation: if the loudness cannot be measured, or
	    the measurement is not finite (e.g. a silent channel measures as -inf),
	    the input is returned unchanged.

	    Args:
	        audio_data (np.ndarray): The audio signal.
	        sample_rate (int): The sample rate of the audio.
	        target_lufs (float): The target loudness in LUFS. Defaults to -23.0,
	            a common standard for broadcast.

	    Returns:
	        np.ndarray: The loudness-normalized audio data, or the original
	        audio_data when normalization is not possible.
	    """
	    try:
	        # 1. Measure the integrated loudness of the input audio
	        meter = pyln.Meter(sample_rate)
	        loudness = meter.integrated_loudness(audio_data)

	        # A non-finite measurement would turn into an infinite/NaN gain and
	        # silently corrupt the whole buffer, so bail out before applying it.
	        if not np.isfinite(loudness):
	            print(f"Loudness normalization skipped: measured loudness is {loudness} LUFS. Falling back to original audio.")
	            return audio_data

	        # 2. Calculate the gain needed to reach the target loudness
	        # The gain is applied in the linear domain, so we convert from dB
	        loudness_gain_db = target_lufs - loudness
	        loudness_gain_linear = 10.0 ** (loudness_gain_db / 20.0)

	        # 3. Apply the gain
	        normalized_audio = audio_data * loudness_gain_linear

	        # 4. Final safety check: peak normalize to prevent clipping, in case
	        # the loudness normalization results in peaks > 1.0
	        peak_val = np.max(np.abs(normalized_audio))
	        if peak_val > 1.0:
	            normalized_audio /= peak_val
	            print(f"Warning: Loudness normalization resulted in clipping. Audio was peak-normalized as a safeguard.")

	        print(f"Audio normalized from {loudness:.2f} LUFS to target {target_lufs} LUFS.")
	        return normalized_audio

	    except Exception as e:
	        # Deliberate catch-all: normalization must never block transcription
	        print(f"Loudness normalization failed: {e}. Falling back to original audio.")
	        return audio_data


	# =================================================================================================
	# === MIDI Merging Function ===
	# =================================================================================================
	def merge_midis(midi_path_left, midi_path_right, output_path):
	    """
	    Combine a left-channel and a right-channel MIDI file into one file.

	    Every instrument of both inputs is carried over (nothing is dropped when a
	    source is multi-instrumental). Each instrument has its note velocities
	    scaled by 0.8, its name prefixed with the side it came from, and a Pan
	    control change (controller 10) inserted first at time 0.0: value 0 (hard
	    left) for the left file and 127 (hard right) for the right file.

	    Args:
	        midi_path_left (str): Path to the left-channel MIDI.
	        midi_path_right (str): Path to the right-channel MIDI.
	        output_path (str): Where the merged MIDI is written.

	    Returns:
	        str | None: output_path on success. If anything fails, the left MIDI
	        path when that file exists, otherwise None.
	    """
	    def _adopt_channel(target, channel_midi, side_label, pan_value):
	        # Move every instrument of one channel into the merged file, panned
	        tracks = channel_midi.instruments
	        if not tracks:
	            return
	        print(f"Found {len(tracks)} instrument(s) in the {side_label.lower()} channel MIDI.")
	        for track in tracks:
	            scale_instrument_velocity(track, scale=0.8)
	            # Prefix the name so the origin of each track stays visible
	            track.name = f"{side_label} - {track.name or 'Instrument'}"
	            # Pan event goes to index 0 so it is the very first control change
	            pan_event = pretty_midi.ControlChange(number=10, value=pan_value, time=0.0)
	            track.control_changes.insert(0, pan_event)
	            target.instruments.append(track)

	    try:
	        for source_path in (midi_path_left, midi_path_right):
	            analyze_midi_velocity(source_path)

	        left_source = pretty_midi.PrettyMIDI(midi_path_left)
	        right_source = pretty_midi.PrettyMIDI(midi_path_right)
	        combined = pretty_midi.PrettyMIDI()

	        _adopt_channel(combined, left_source, "Left", 0)
	        _adopt_channel(combined, right_source, "Right", 127)

	        combined.write(output_path)
	        print(f"Successfully merged all instruments and panned into '{os.path.basename(output_path)}'")
	        analyze_midi_velocity(output_path)
	        return output_path

	    except Exception as e:
	        print(f"Error merging MIDI files: {e}")
	        # Fall back to the left channel alone when it is available
	        if not os.path.exists(midi_path_left):
	            return None
	        print("Fallback: Using only the left channel MIDI.")
	        return midi_path_left


	# =================================================================================================
	# === Stage 1: Audio to MIDI Transcription Functions ===
	# =================================================================================================

	def TranscribePianoAudio(input_file):
	    """
	    Transcribes a WAV or MP3 audio file of a SOLO PIANO performance into a MIDI file.
	    This uses the ByteDance model.

	    Args:
	        input_file (str): The path to the input audio file.

	    Returns:
	        str: The file path of the generated MIDI file.
	    """
	    print('=' * 70)
	    print('STAGE 1: Starting Piano-Specific Transcription')
	    print('=' * 70)

	    # Derive the output name from the input name. splitext() removes only the
	    # final extension, so dotted names such as "my.song_left.wav" and
	    # "my.song_right.wav" keep distinct stems instead of both becoming "my".
	    fn = os.path.basename(input_file)
	    fn1 = os.path.splitext(fn)[0]

	    # Use os.path.join to create a platform-independent directory path
	    output_dir = os.path.join("output", "transcribed_piano_")
	    out_mid_path = os.path.join(output_dir, fn1 + '.mid')

	    # Create the output directory if necessary
	    os.makedirs(output_dir, exist_ok=True)

	    print('-' * 70)
	    print(f'Input file name: {fn}')
	    print(f'Output MIDI path: {out_mid_path}')
	    print('-' * 70)

	    # Load audio using the utility function (mono, at the model's sample rate)
	    print('Loading audio...')
	    (audio, _) = utilities.load_audio(input_file, sr=transcription_sample_rate, mono=True)
	    print('Audio loaded successfully.')
	    print('-' * 70)

	    # Initialize the transcription model
	    # Use 'cuda' if a GPU is available and configured, otherwise 'cpu'
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'
	    print(f'Loading transcriptor model... device= {device}')
	    transcriptor = PianoTranscription(device=device, checkpoint_path="src/models/CRNN_note_F1=0.9677_pedal_F1=0.9186.pth")
	    print('Transcriptor loaded.')
	    print('-' * 70)

	    # Perform transcription
	    print('Transcribing audio to MIDI (Piano-Specific)...')
	    # This function call saves the MIDI file to the specified path
	    transcriptor.transcribe(audio, out_mid_path)
	    print('Piano transcription complete.')
	    print('=' * 70)

	    # Return the path to the newly created MIDI file
	    return out_mid_path

	def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool):
	    """
	    Transcribes a general audio file into a MIDI file using basic-pitch.
	    This is suitable for various instruments and vocals.

	    Args:
	        input_file (str): The path to the input audio file.
	        onset_thresh: Passed to basic-pitch as onset_threshold.
	        frame_thresh: Passed to basic-pitch as frame_threshold.
	        min_note_len: Passed to basic-pitch as minimum_note_length.
	        min_freq: Passed to basic-pitch as minimum_frequency.
	        max_freq: Passed to basic-pitch as maximum_frequency.
	        infer_onsets_bool: Passed to basic-pitch as infer_onsets.
	        melodia_trick_bool: Passed to basic-pitch as melodia_trick.
	        multiple_bends_bool: Passed to basic-pitch as multiple_pitch_bends.

	    Returns:
	        str: The file path of the generated MIDI file.
	    """
	    print('=' * 70)
	    print('STAGE 1: Starting General Purpose Transcription')
	    print('=' * 70)

	    # splitext() removes only the final extension, so dotted names such as
	    # "my.song_left.wav" and "my.song_right.wav" keep distinct stems instead
	    # of both becoming "my" and overwriting each other's MIDI output.
	    fn = os.path.basename(input_file)
	    fn1 = os.path.splitext(fn)[0]
	    output_dir = os.path.join("output", "transcribed_general_")
	    out_mid_path = os.path.join(output_dir, fn1 + '.mid')
	    os.makedirs(output_dir, exist_ok=True)

	    print(f'Input file: {fn}\nOutput MIDI: {out_mid_path}')

	    # --- Perform transcription using basic-pitch ---
	    print('Transcribing audio to MIDI (General Purpose)...')
	    # The predict function handles audio loading internally; only the MIDI
	    # object of its three results is needed here.
	    _, midi_data, _ = basic_pitch.inference.predict(
	        audio_path=input_file,
	        model_or_model_path=ICASSP_2022_MODEL_PATH,
	        onset_threshold=onset_thresh,
	        frame_threshold=frame_thresh,
	        minimum_note_length=min_note_len,
	        minimum_frequency=min_freq,
	        maximum_frequency=max_freq,
	        infer_onsets=infer_onsets_bool,
	        melodia_trick=melodia_trick_bool,
	        multiple_pitch_bends=multiple_bends_bool
	    )

	    # --- Save the MIDI file ---
	    midi_data.write(out_mid_path)
	    print('General transcription complete.')
	    print('=' * 70)

	    return out_mid_path

	# =================================================================================================
	# === Stage 2: MIDI Transformation and Rendering Function ===
	# =================================================================================================

	def Render_MIDI(input_midi_path,
	                render_type,
	                soundfont_bank,
	                render_sample_rate,
	                render_with_sustains,
	                merge_misaligned_notes,
	                custom_render_patch,
	                render_align,
	                render_transpose_value,
	                render_transpose_to_C4,
	                render_output_as_solo_piano,
	                render_remove_drums,
	                # --- 8-bit synth params ---
	                s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
	                s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth,
	                s8bit_bass_boost_level
	                ):
	    """
	    Processes and renders a MIDI file according to user-defined settings.
	    Can render using SoundFonts or the custom 8-bit synthesizer.

	    Args:
	        input_midi_path (str): The path to the input MIDI file.
	        render_type (str): Transformation to apply; "Render as-is" copies the
	            input MIDI bytes unchanged.
	        soundfont_bank (str): Key into the module-level soundfonts_dict, or
	            SYNTH_8_BIT_LABEL to use the 8-bit synthesizer.
	        render_sample_rate: Audio sample rate; converted with int().
	        render_with_sustains: Forwarded to TMIDIX as apply_sustain.
	        merge_misaligned_notes: Merge threshold; merging runs only when > 0.
	        custom_render_patch: Patch forced onto non-drum notes; -1 = no change.
	        render_align (str): "Start Times", "Start Times and Durations" or
	            "Start Times and Split Durations"; other values skip alignment.
	        render_transpose_value: Transposition amount; 0 = none.
	        render_transpose_to_C4: If truthy, transpose the score to pitch 60.
	        render_output_as_solo_piano: If truthy, convert to solo piano.
	        render_remove_drums: If truthy, drop drums.
	        s8bit_*: Parameters forwarded to synthesize_8bit_style.

	    Returns:
	        A 7-element sequence for the Gradio UI: (output MIDI md5 hash, base
	        name, metadata summary, rendered MIDI path, (sample_rate, audio),
	        plot, song description). A list of seven None values is returned when
	        the input file is missing or 8-bit synthesis fails.

	    Raises:
	        gr.Error: If a SoundFont is required but none is available at all.
	    """
	    print('=' * 70)
	    print('STAGE 2: Starting MIDI Rendering')
	    print('=' * 70)

	    # --- File and Settings Setup ---
	    # splitext() removes only the final extension, so dotted file names keep
	    # their full stem and do not collide in the output directory.
	    fn = os.path.basename(input_midi_path)
	    fn1 = os.path.splitext(fn)[0]

	    # Use os.path.join to create a platform-independent directory path
	    output_dir = os.path.join("output", "rendered_midi")
	    os.makedirs(output_dir, exist_ok=True)

	    # Now, join the clean directory path with the filename
	    new_fn_path = os.path.join(output_dir, fn1 + '_rendered.mid')

	    try:
	        with open(input_midi_path, 'rb') as f:
	            fdata = f.read()
	        input_midi_md5hash = hashlib.md5(fdata).hexdigest()
	    except FileNotFoundError:
	        # Handle cases where the input file might not exist
	        print(f"Error: Input MIDI file not found at {input_midi_path}")
	        return [None] * 7  # Return empty values for all outputs

	    print('=' * 70)
	    print('Requested settings:')
	    print(f'Input MIDI file name: {fn}')
	    print(f'Input MIDI md5 hash: {input_midi_md5hash}')
	    print('-' * 70)
	    print(f'Render type: {render_type}')
	    print(f'Soundfont bank: {soundfont_bank}')
	    print(f'Audio render sample rate: {render_sample_rate}')
	    print('=' * 70)

	    # --- MIDI Processing using TMIDIX ---
	    print('Processing MIDI... Please wait...')
	    raw_score = MIDI.midi2single_track_ms_score(fdata)
	    escore = TMIDIX.advanced_score_processor(raw_score,
	                                             return_enhanced_score_notes=True,
	                                             apply_sustain=render_with_sustains
	                                             )[0]

	    # Handle cases where the MIDI might not contain any notes
	    if not escore:
	        print("Warning: MIDI file contains no processable notes.")
	        return ("N/A", fn1, "MIDI file contains no notes.", None, None, None, "No notes found.")

	    # NOTE(review): assumes merge_misaligned_notes arrives as a number - confirm
	    # against the caller.
	    if merge_misaligned_notes > 0:
	        escore = TMIDIX.merge_escore_notes(escore, merge_threshold=merge_misaligned_notes)

	    escore = TMIDIX.augment_enhanced_score_notes(escore, timings_divider=1)

	    first_note_index = [e[0] for e in raw_score[1]].index('note')
	    cscore = TMIDIX.chordify_score([1000, escore])

	    # Events before the first note, first and last score notes, last raw event
	    meta_data = raw_score[1][:first_note_index] + [escore[0]] + [escore[-1]] + [raw_score[1][-1]]

	    aux_escore_notes = TMIDIX.augment_enhanced_score_notes(escore, sort_drums_last=True)
	    song_description = TMIDIX.escore_notes_to_text_description(aux_escore_notes)

	    print('Done!')
	    print('=' * 70)
	    print('Input MIDI metadata:', meta_data[:5])
	    print('=' * 70)
	    print('Input MIDI song description:', song_description)
	    print('=' * 70)
	    print('Processing...Please wait...')

	    # A deep copy of the score to be modified
	    output_score = copy.deepcopy(escore)

	    # Apply transformations based on render_type
	    if render_type == "Extract melody":
	        output_score = TMIDIX.add_melody_to_enhanced_score_notes(escore, return_melody=True)
	        output_score = TMIDIX.recalculate_score_timings(output_score)
	    elif render_type == "Flip":
	        output_score = TMIDIX.flip_enhanced_score_notes(escore)
	    elif render_type == "Reverse":
	        output_score = TMIDIX.reverse_enhanced_score_notes(escore)
	    elif render_type == 'Repair Durations':
	        output_score = TMIDIX.fix_escore_notes_durations(escore, min_notes_gap=0)
	    elif render_type == 'Repair Chords':
	        fixed_cscore = TMIDIX.advanced_check_and_fix_chords_in_chordified_score(cscore)[0]
	        output_score = TMIDIX.flatten(fixed_cscore)
	    elif render_type == 'Remove Duplicate Pitches':
	        output_score = TMIDIX.remove_duplicate_pitches_from_escore_notes(escore)
	    elif render_type == "Add Drum Track":
	        nd_escore = [e for e in escore if e[3] != 9]
	        nd_escore = TMIDIX.augment_enhanced_score_notes(nd_escore)
	        output_score = TMIDIX.advanced_add_drums_to_escore_notes(nd_escore)

	        # NOTE(review): presumably undoes the default timing division applied
	        # by augment_enhanced_score_notes above - confirm against TMIDIX.
	        for e in output_score:
	            e[1] *= 16
	            e[2] *= 16

	    print('MIDI processing complete.')
	    print('=' * 70)

	    # --- Final Processing and Patching ---
	    if render_type != "Render as-is":
	        print('Applying final adjustments (transpose, align, patch)...')
	        if custom_render_patch != -1:  # -1 indicates no change
	            for e in output_score:
	                if e[3] != 9:  # not a drum channel
	                    e[6] = custom_render_patch

	        if render_transpose_value != 0:
	            output_score = TMIDIX.transpose_escore_notes(output_score, render_transpose_value)

	        if render_transpose_to_C4:
	            output_score = TMIDIX.transpose_escore_notes_to_pitch(output_score, 60)  # C4 is MIDI pitch 60

	        if render_align == "Start Times":
	            output_score = TMIDIX.recalculate_score_timings(output_score)
	            output_score = TMIDIX.align_escore_notes_to_bars(output_score)

	        elif render_align == "Start Times and Durations":
	            output_score = TMIDIX.recalculate_score_timings(output_score)
	            output_score = TMIDIX.align_escore_notes_to_bars(output_score, trim_durations=True)

	        elif render_align == "Start Times and Split Durations":
	            output_score = TMIDIX.recalculate_score_timings(output_score)
	            output_score = TMIDIX.align_escore_notes_to_bars(output_score, split_durations=True)

	        if render_type == "Longest Repeating Phrase":
	            zscore = TMIDIX.recalculate_score_timings(output_score)
	            lrno_score = TMIDIX.escore_notes_lrno_pattern_fast(zscore)

	            if lrno_score is not None:
	                output_score = lrno_score

	            else:
	                # No repeating pattern found: fall back to a middle excerpt
	                output_score = TMIDIX.recalculate_score_timings(TMIDIX.escore_notes_middle(output_score, 50))

	        if render_type == "Multi-Instrumental Summary":
	            zscore = TMIDIX.recalculate_score_timings(output_score)
	            c_escore_notes = TMIDIX.compress_patches_in_escore_notes_chords(zscore)

	            if len(c_escore_notes) > 128:
	                cmatrix = TMIDIX.escore_notes_to_image_matrix(c_escore_notes, filter_out_zero_rows=True, filter_out_duplicate_rows=True)
	                smatrix = TPLOTS.square_image_matrix(cmatrix, num_pca_components=max(1, min(5, len(c_escore_notes) // 128)))
	                output_score = TMIDIX.image_matrix_to_original_escore_notes(smatrix)

	                for o in output_score:
	                    o[1] *= 250
	                    o[2] *= 250

	        if render_output_as_solo_piano:
	            output_score = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=(not render_remove_drums))

	        if render_remove_drums and not render_output_as_solo_piano:
	            output_score = TMIDIX.strip_drums_from_escore_notes(output_score)

	        if render_type == "Solo Piano Summary":
	            sp_escore_notes = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=False)
	            zscore = TMIDIX.recalculate_score_timings(sp_escore_notes)

	            if len(zscore) > 128:
	                bmatrix = TMIDIX.escore_notes_to_binary_matrix(zscore)
	                cmatrix = TMIDIX.compress_binary_matrix(bmatrix, only_compress_zeros=True)
	                smatrix = TPLOTS.square_binary_matrix(cmatrix, interpolation_order=max(1, min(5, len(zscore) // 128)))
	                output_score = TMIDIX.binary_matrix_to_original_escore_notes(smatrix)

	                for o in output_score:
	                    o[1] *= 200
	                    o[2] *= 200

	        print('Final adjustments complete.')
	        print('=' * 70)

	        # --- Saving Processed MIDI File ---
	        # Save the transformed MIDI data
	        SONG, patches, _ = TMIDIX.patch_enhanced_score_notes(output_score)

	        # The underlying function adds a '.mid' extension itself, so the path
	        # must be passed without the extension to compensate.
	        path_without_ext = new_fn_path.rsplit('.mid', 1)[0]

	        TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(SONG,
	                                                 output_signature='Integrated-MIDI-Processor',
	                                                 output_file_name=path_without_ext,
	                                                 track_name='Processed Track',
	                                                 list_of_MIDI_patches=patches
	                                                 )
	        midi_to_render_path = new_fn_path
	    else:
	        # If "Render as-is", use the original MIDI data
	        with open(new_fn_path, 'wb') as f:
	            f.write(fdata)
	        midi_to_render_path = new_fn_path

	    # --- Audio Rendering ---
	    print('Rendering final audio...')

	    # Select sample rate
	    srate = int(render_sample_rate)

	    # --- Conditional Rendering Logic ---
	    if soundfont_bank == SYNTH_8_BIT_LABEL:
	        print("Using 8-bit style synthesizer...")
	        try:
	            # Load the MIDI file with pretty_midi for manual synthesis
	            midi_data_for_synth = pretty_midi.PrettyMIDI(midi_to_render_path)
	            # Synthesize the waveform
	            audio = synthesize_8bit_style(
	                midi_data_for_synth,
	                s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
	                s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth,
	                s8bit_bass_boost_level,
	                fs=srate
	            )
	            # Peak-normalize (skipped for pure silence to avoid dividing by 0)
	            peak_val = np.max(np.abs(audio))
	            if peak_val > 0:
	                audio /= peak_val
	            # Transpose from (2, N) to (N, 2) and convert to int16 for Gradio
	            audio_out = (audio.T * 32767).astype(np.int16)
	        except Exception as e:
	            print(f"Error during 8-bit synthesis: {e}")
	            return [None] * 7
	    else:
	        print(f"Using SoundFont: {soundfont_bank}")
	        # Get the full path from the global dictionary
	        soundfont_path = soundfonts_dict.get(soundfont_bank)

	        # Select soundfont
	        if not soundfont_path or not os.path.exists(soundfont_path):
	            # Error handling in case the selected file is not found
	            error_msg = f"SoundFont '{soundfont_bank}' not found!"
	            print(f"ERROR: {error_msg}")
	            # Fallback to the first available soundfont if possible
	            if soundfonts_dict:
	                fallback_key = list(soundfonts_dict.keys())[0]
	                soundfont_path = soundfonts_dict[fallback_key]
	                print(f"Falling back to '{fallback_key}'.")
	            else:
	                # If no soundfonts are available at all, raise an error
	                raise gr.Error("No SoundFonts are available for rendering!")

	        with open(midi_to_render_path, 'rb') as f:
	            midi_file_content = f.read()

	        audio_out = midi_to_colab_audio(midi_file_content,
	                                        soundfont_path=soundfont_path,  # Use the dynamically found path
	                                        sample_rate=srate,
	                                        output_for_gradio=True
	                                        )

	    print('Audio rendering complete.')
	    print('=' * 70)

	    # --- Preparing Outputs for Gradio ---
	    with open(midi_to_render_path, 'rb') as f:
	        new_md5_hash = hashlib.md5(f.read()).hexdigest()
	    output_plot = TPLOTS.plot_ms_SONG(output_score, plot_title=f"Score of {fn1}", return_plt=True)

	    output_midi_summary = str(meta_data)

	    return new_md5_hash, fn1, output_midi_summary, midi_to_render_path, (srate, audio_out), output_plot, song_description

	# =================================================================================================
	# === Main Application Logic ===
	# =================================================================================================

	def process_and_render_file(input_file,
	                            # --- Transcription params ---
	                            enable_stereo_processing,
	                            transcription_method,
	                            onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool,
	                            # --- MIDI rendering params ---
	                            render_type, soundfont_bank, render_sample_rate,
	                            render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
	                            render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
	                            # --- 8-bit synth params ---
	                            s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
	                            s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth,
	                            s8bit_bass_boost_level
	                            ):
	    """
	    Transcribe (when needed) and render an uploaded audio or MIDI file.

	    Files whose name ends in .mid/.midi/.kar are passed straight to Render_MIDI.
	    Any other file is loaded as audio, loudness-normalized, written to a temporary
	    WAV under "output/temp_normalized", transcribed to MIDI with the selected
	    method, and then rendered.

	    Args:
	        input_file: Path string of the uploaded file, or None when nothing was uploaded.
	        enable_stereo_processing: When truthy and the audio loads with exactly two
	            channels, each channel is transcribed separately and the two MIDI files
	            are combined with merge_midis; otherwise a mono mix-down is transcribed.
	        transcription_method: "General Purpose" selects TranscribeGeneralAudio; any
	            other value selects TranscribePianoAudio.
	        onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
	        infer_onsets_bool, melodia_trick_bool, multiple_bends_bool:
	            Forwarded unchanged to TranscribeGeneralAudio.
	        render_type ... s8bit_bass_boost_level: Forwarded unchanged to Render_MIDI.

	    Returns:
	        A list of seven cleared-field updates when input_file is None; otherwise
	        the value returned by Render_MIDI.

	    Raises:
	        gr.Error: If the audio cannot be loaded, or if transcription fails.
	    """
	    start_time = reqtime.time()
	    if input_file is None:
	        # Return a list of updates to clear all output fields
	        return [gr.update(value=None)] * 7

	    # The input_file from gr.Audio(type="filepath") is the direct path (a string),
	    # not a temporary file object, so there is no .name attribute to access.
	    input_file_path = input_file
	    filename = os.path.basename(input_file_path)
	    print(f"Processing new file: {filename}")

	    def _transcribe(wav_path):
	        """Run the user-selected transcription backend on one WAV file."""
	        if transcription_method == "General Purpose":
	            return TranscribeGeneralAudio(
	                wav_path, onset_thresh, frame_thresh, min_note_len,
	                min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
	            )
	        # Piano-Specific
	        return TranscribePianoAudio(wav_path)

	    # --- Step 1: Check file type and transcribe if necessary ---
	    if filename.lower().endswith(('.mid', '.midi', '.kar')):
	        # MIDI must not be decoded as audio: doing so before this check made
	        # every MIDI upload fail with "Failed to load audio file".
	        print("MIDI file detected. Proceeding directly to rendering.")
	        midi_path_for_rendering = input_file_path
	    else:  # e.g. .wav, .mp3
	        try:
	            audio_data, native_sample_rate = librosa.load(input_file_path, sr=None, mono=False)
	        except Exception as e:
	            raise gr.Error(f"Failed to load audio file: {e}") from e

	        print("Audio file detected. Starting transcription...")

	        base_name = os.path.splitext(filename)[0]
	        temp_dir = "output/temp_normalized"
	        os.makedirs(temp_dir, exist_ok=True)

	        # === STEREO PROCESSING LOGIC ===
	        use_stereo = bool(enable_stereo_processing)
	        if use_stereo and (audio_data.ndim != 2 or audio_data.shape[0] != 2):
	            print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
	            use_stereo = False  # Disable stereo processing if audio is not stereo

	        if use_stereo:
	            print("Stereo processing enabled. Splitting channels...")
	            try:
	                left_channel = audio_data[0]
	                right_channel = audio_data[1]

	                normalized_left = normalize_loudness(left_channel, native_sample_rate)
	                normalized_right = normalize_loudness(right_channel, native_sample_rate)

	                temp_left_wav_path = os.path.join(temp_dir, f"{base_name}_left.wav")
	                temp_right_wav_path = os.path.join(temp_dir, f"{base_name}_right.wav")

	                sf.write(temp_left_wav_path, normalized_left, native_sample_rate)
	                sf.write(temp_right_wav_path, normalized_right, native_sample_rate)

	                print(f"Saved left channel to: {temp_left_wav_path}")
	                print(f"Saved right channel to: {temp_right_wav_path}")

	                print("Transcribing left channel...")
	                midi_path_left = _transcribe(temp_left_wav_path)

	                print("Transcribing right channel...")
	                midi_path_right = _transcribe(temp_right_wav_path)

	                if midi_path_left and midi_path_right:
	                    merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
	                    midi_path_for_rendering = merge_midis(midi_path_left, midi_path_right, merged_midi_path)
	                elif midi_path_left:
	                    print("Warning: Right channel transcription failed. Using left channel only.")
	                    midi_path_for_rendering = midi_path_left
	                elif midi_path_right:
	                    print("Warning: Left channel transcription failed. Using right channel only.")
	                    midi_path_for_rendering = midi_path_right
	                else:
	                    raise gr.Error("Both left and right channel transcriptions failed.")

	            except Exception as e:
	                print(f"An error occurred during stereo processing: {e}")
	                raise gr.Error(f"Stereo Processing Failed: {e}") from e
	        else:
	            print("Stereo processing disabled. Using standard mono transcription.")
	            if audio_data.ndim == 1:
	                mono_signal = audio_data
	            else:
	                # Average across the first axis to mix the channels down to one signal.
	                mono_signal = np.mean(audio_data, axis=0)

	            normalized_mono = normalize_loudness(mono_signal, native_sample_rate)

	            temp_mono_wav_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
	            sf.write(temp_mono_wav_path, normalized_mono, native_sample_rate)

	            try:
	                midi_path_for_rendering = _transcribe(temp_mono_wav_path)
	                # NOTE(review): SOURCE had no indentation; this call is assumed to run
	                # for both transcription methods — confirm against the original file.
	                analyze_midi_velocity(midi_path_for_rendering)
	            except Exception as e:
	                print(f"An error occurred during transcription: {e}")
	                raise gr.Error(f"Transcription Failed: {e}") from e

	    # --- Step 2: Render the MIDI file with selected options ---
	    print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}")
	    results = Render_MIDI(midi_path_for_rendering,
	                          render_type, soundfont_bank, render_sample_rate,
	                          render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
	                          render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
	                          s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
	                          s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level)

	    print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
	    print('=' * 70)

	    return results

	# =================================================================================================
	# === Gradio UI Setup ===
	# =================================================================================================

	def update_ui_visibility(transcription_method, soundfont_choice):
	    """
	    Build the visibility updates for the two conditional settings panels.

	    The general transcription panel is shown only when the transcription method
	    is "General Purpose"; the 8-bit synthesizer panel is shown only when the
	    selected SoundFont entry equals SYNTH_8_BIT_LABEL.

	    Returns:
	        A dict mapping each of the two panel components to its gr.update(...).
	    """
	    show_general_panel = transcription_method == "General Purpose"
	    show_8bit_panel = soundfont_choice == SYNTH_8_BIT_LABEL

	    visibility_updates = {}
	    visibility_updates[general_transcription_settings] = gr.update(visible=show_general_panel)
	    visibility_updates[synth_8bit_settings] = gr.update(visible=show_8bit_panel)
	    return visibility_updates

	if __name__ == "__main__":
	    # Script entry point: initialize the app, prepare SoundFonts, build the Gradio UI and launch it.
	    # NOTE(review): SOURCE carried no indentation; the nesting below was rebuilt from
	    # syntax. Confirm the layout nesting (in particular which outputs sit inside the
	    # results Row) against the original file.

	    # Initialize the app: download model (if needed) and apply patches
	    initialize_app()

	    # --- Prepare soundfonts and make the map globally accessible ---
	    # A module-level assignment already creates a global, so no `global` statement is needed.
	    # On application start, download SoundFonts from Hugging Face Hub if they don't exist.
	    soundfonts_dict = prepare_soundfonts()
	    print(f"Found {len(soundfonts_dict)} local SoundFonts.")

	    if not soundfonts_dict:
	        print("\nWARNING: No SoundFonts were found or could be downloaded.")
	        print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")

	    app = gr.Blocks(theme=gr.themes.Base())

	    with app:
	        gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Audio-to-MIDI & Advanced Renderer</h1>")
	        gr.Markdown(
	            "Upload a Audio for transcription-then-rendering, or a MIDI for rendering-only.\n\n"
	            "This application combines piano audio transcription with a powerful MIDI transformation and rendering toolkit. "
	            "Based on the work of [asigalov61](https://github.com/asigalov61)."
	        )

	        with gr.Row():
	            waveform_options = gr.WaveformOptions(show_recording_waveform=False)
	            with gr.Column(scale=1):
	                # --- INPUT COLUMN ---
	                gr.Markdown("## 1. Upload File")

	                # gr.Audio (rather than gr.File) allows an audio preview.
	                # type="filepath" ensures the component returns a string path to the uploaded file.
	                # The component will show a player for supported audio types (e.g., WAV, MP3).
	                input_file = gr.Audio(
	                    label="Input Audio or MIDI File",
	                    type="filepath",
	                    sources=["upload"], waveform_options=waveform_options
	                )

	                gr.Markdown("## 2. Configure Processing")

	                # --- Transcription Method Selector ---
	                transcription_method = gr.Radio(
	                    ["General Purpose", "Piano-Specific"],
	                    label="Audio Transcription Method",
	                    value="General Purpose",
	                    info="Choose 'General Purpose' for most music (vocals, etc.). Choose 'Piano-Specific' only for solo piano recordings."
	                )

	                # --- Stereo Processing Checkbox ---
	                enable_stereo_processing = gr.Checkbox(
	                    label="Enable Stereo Transcription",
	                    value=False,
	                    info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
	                )

	                with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
	                    onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
	                    frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
	                    minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.")
	                    minimum_frequency = gr.Slider(0, 500, value=60, step=5, label="Minimum Frequency (Hz)", info="Ignores pitches below this frequency.")
	                    maximum_frequency = gr.Slider(501, 10000, value=4000, step=10, label="Maximum Frequency (Hz)", info="Ignores pitches above this frequency.")
	                    infer_onsets = gr.Checkbox(value=True, label="Infer Onsets (Boost Onsets)")
	                    melodia_trick = gr.Checkbox(value=True, label="Melodia Trick (Contour Optimization)")
	                    multiple_pitch_bends = gr.Checkbox(value=False, label="Allow Multiple Pitch Bends")

	                # --- Rendering Settings ---
	                render_type = gr.Radio(
	                    ["Render as-is", "Custom render", "Extract melody", "Flip", "Reverse", "Repair Durations", "Repair Chords", "Remove Duplicate Pitches", "Longest Repeating Phrase", "Multi-Instrumental Summary", "Solo Piano Summary", "Add Drum Track"],
	                    label="MIDI Transformation Render Type",
	                    value="Render as-is",
	                    info="Apply transformations to the MIDI before rendering. Select 'Render as-is' for basic rendering or other options for transformations."
	                )

	                # --- SoundFont Bank with 8-bit option ---
	                # --- Dynamically create the list of choices ---
	                soundfont_choices = [SYNTH_8_BIT_LABEL] + list(soundfonts_dict.keys())
	                # Set a safe default value. The list always contains the 8-bit entry,
	                # so index 0 is a valid fallback when the preferred SoundFont is missing.
	                preferred_soundfont = "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7"
	                default_sf_choice = preferred_soundfont if preferred_soundfont in soundfonts_dict else soundfont_choices[0]

	                soundfont_bank = gr.Dropdown(
	                    soundfont_choices,
	                    label="SoundFont / Synthesizer",
	                    value=default_sf_choice
	                )

	                render_sample_rate = gr.Radio(
	                    ["16000", "32000", "44100"],
	                    label="Audio Sample Rate",
	                    value="44100"
	                )

	                # --- 8-bit Synthesizer Settings ---
	                # Start visible when the 8-bit synth is the default selection; otherwise the
	                # panel would stay hidden until the user changed the dropdown, because the
	                # visibility listener only fires on change events.
	                with gr.Accordion("8-bit Synthesizer Settings", open=False, visible=(default_sf_choice == SYNTH_8_BIT_LABEL)) as synth_8bit_settings:
	                    s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
	                    s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
	                    s8bit_decay_time_s = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Decay Time (s)")
	                    s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width")
	                    s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
	                    s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
	                    s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")

	                # --- Advanced MIDI rendering options ---
	                with gr.Accordion("Advanced MIDI Rendering Options", open=False) as advanced_rendering_options:
	                    render_with_sustains = gr.Checkbox(label="Apply sustain pedal effects (if present)", value=True)
	                    render_output_as_solo_piano = gr.Checkbox(label="Convert to Solo Piano (Grand Piano patch)", value=False)
	                    render_remove_drums = gr.Checkbox(label="Remove drum track", value=False)
	                    render_transpose_to_C4 = gr.Checkbox(label="Transpose entire score to center around C4", value=False)
	                    render_transpose_value = gr.Slider(-12, 12, value=0, step=1, label="Transpose (semitones)")
	                    custom_render_patch = gr.Slider(-1, 127, value=-1, step=1, label="Force MIDI Patch (-1 to disable)")
	                    merge_misaligned_notes = gr.Slider(-1, 127, value=-1, label="Time to merge notes in ms (-1 to disable)")
	                    render_align = gr.Radio(
	                        ["Do not align", "Start Times", "Start Times and Durations", "Start Times and Split Durations"],
	                        label="Align notes to musical bars",
	                        value="Do not align"
	                    )

	                submit_btn = gr.Button("Process and Render", variant="primary")

	            with gr.Column(scale=2):
	                # --- OUTPUT COLUMN ---
	                gr.Markdown("## 3. Results")
	                output_midi_title = gr.Textbox(label="MIDI Title")
	                output_song_description = gr.Textbox(label="MIDI Description", lines=3)
	                output_audio = gr.Audio(label="Rendered Audio Output", format="wav", waveform_options=waveform_options)
	                output_plot = gr.Plot(label="MIDI Score Plot")
	                with gr.Row():
	                    output_midi = gr.File(label="Download Processed MIDI File", file_types=[".mid"])
	                    output_midi_md5 = gr.Textbox(label="Output MIDI MD5 Hash")
	                output_midi_summary = gr.Textbox(label="MIDI metadata summary", lines=4)

	        # --- Define all input components for the click event ---
	        # Order must match the positional parameters of process_and_render_file.
	        all_inputs = [
	            input_file,
	            enable_stereo_processing,
	            transcription_method,
	            onset_threshold, frame_threshold, minimum_note_length, minimum_frequency, maximum_frequency,
	            infer_onsets, melodia_trick, multiple_pitch_bends,
	            render_type, soundfont_bank, render_sample_rate,
	            render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
	            render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
	            s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
	            s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level
	        ]
	        all_outputs = [
	            output_midi_md5, output_midi_title, output_midi_summary,
	            output_midi, output_audio, output_plot, output_song_description
	        ]

	        # --- Event Handling ---
	        submit_btn.click(
	            process_and_render_file,
	            inputs=all_inputs,
	            outputs=all_outputs
	        )

	        # --- Listeners for dynamic UI updates ---
	        # Both selectors drive the same visibility callback with the same wiring.
	        for visibility_trigger in (transcription_method, soundfont_bank):
	            visibility_trigger.change(
	                fn=update_ui_visibility,
	                inputs=[transcription_method, soundfont_bank],
	                outputs=[general_transcription_settings, synth_8bit_settings]
	            )

	    # Launch the Gradio app
	    app.queue().launch(inbrowser=True, debug=True)