|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import io |
|
import os |
|
import hashlib |
|
import time as reqtime |
|
import copy |
|
import random |
|
import shutil |
|
import librosa |
|
import pyloudnorm as pyln |
|
import soundfile as sf |
|
from mutagen.flac import FLAC |
|
|
|
import torch |
|
import ffmpeg |
|
import gradio as gr |
|
from dataclasses import dataclass, fields |
|
|
|
|
|
import torchaudio |
|
from demucs.apply import apply_model |
|
from demucs.pretrained import get_model |
|
from demucs.audio import convert_audio |
|
|
|
from src.piano_transcription.utils import initialize_app |
|
from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate |
|
|
|
|
|
from src import TMIDIX, TPLOTS |
|
from src import MIDI |
|
from src.midi_to_colab_audio import midi_to_colab_audio |
|
|
|
|
|
import basic_pitch |
|
from basic_pitch.inference import predict |
|
from basic_pitch import ICASSP_2022_MODEL_PATH |
|
|
|
|
|
import pretty_midi |
|
import numpy as np |
|
from scipy import signal |
|
|
|
|
|
|
|
|
|
from huggingface_hub import hf_hub_download |
|
import glob |
|
|
|
|
|
SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)" |
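
# Sentinel label used throughout the UI: when the soundfont dropdown is set to
# this value, rendering is routed to the internal 8-bit synthesizer instead of
# a FluidSynth-based SoundFont (see preview_sound_source and Render_MIDI).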
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass |
|
class AppParameters: |
|
"""A dataclass to hold all configurable parameters for the application.""" |
|
|
|
|
|
|
|
input_file: str = None |
|
batch_input_files: list = None |
|
|
|
|
|
s8bit_preset_selector: str = "Custom" |
|
separate_vocals: bool = False |
|
|
|
|
|
enable_advanced_separation: bool = False |
|
separate_drums: bool = True |
|
separate_bass: bool = True |
|
separate_other: bool = True |
|
|
|
transcribe_vocals: bool = False |
|
transcribe_drums: bool = False |
|
transcribe_bass: bool = False |
|
transcribe_other_or_accompaniment: bool = True |
|
|
|
merge_vocals_to_render: bool = False |
|
merge_drums_to_render: bool = False |
|
merge_bass_to_render: bool = False |
|
merge_other_or_accompaniment: bool = False |
|
|
|
enable_stereo_processing: bool = False |
|
transcription_method: str = "General Purpose" |
|
basic_pitch_preset_selector: str = "Default (Balanced)" |
|
|
|
|
|
onset_threshold: float = 0.5 |
|
frame_threshold: float = 0.3 |
|
minimum_note_length: int = 128 |
|
minimum_frequency: float = 60.0 |
|
maximum_frequency: float = 4000.0 |
|
infer_onsets: bool = True |
|
melodia_trick: bool = True |
|
multiple_pitch_bends: bool = False |
|
|
|
|
|
render_type: str = "Render as-is" |
|
soundfont_bank: str = "None (8-bit Synthesizer)" |
|
render_sample_rate: str = "44100" |
|
render_with_sustains: bool = True |
|
merge_misaligned_notes: int = -1 |
|
custom_render_patch: int = -1 |
|
render_align: str = "Do not align" |
|
render_transpose_value: int = 0 |
|
render_transpose_to_C4: bool = False |
|
render_output_as_solo_piano: bool = False |
|
render_remove_drums: bool = False |
|
|
|
|
|
s8bit_waveform_type: str = 'Square' |
|
s8bit_pulse_width: float = 0.5 |
|
s8bit_envelope_type: str = 'Plucky (AD Envelope)' |
|
s8bit_decay_time_s: float = 0.1 |
|
s8bit_vibrato_rate: float = 5.0 |
|
s8bit_vibrato_depth: float = 0.0 |
|
s8bit_bass_boost_level: float = 0.0 |
|
s8bit_smooth_notes_level: float = 0.0 |
|
s8bit_continuous_vibrato_level: float = 0.0 |
|
s8bit_noise_level: float = 0.0 |
|
s8bit_distortion_level: float = 0.0 |
|
s8bit_fm_modulation_depth: float = 0.0 |
|
s8bit_fm_modulation_rate: float = 0.0 |
|
s8bit_adaptive_decay: bool = False |
|
s8bit_echo_sustain: bool = False |
|
s8bit_echo_rate_hz: float = 5.0 |
|
s8bit_echo_decay_factor: float = 0.6 |
|
s8bit_echo_trigger_threshold: float = 2.5 |
|
|
|
|
|
s8bit_enable_anti_aliasing: bool = True |
|
s8bit_use_additive_synthesis: bool = False |
|
s8bit_edge_smoothing_ms: float = 0.5 |
|
s8bit_noise_lowpass_hz: float = 9000.0 |
|
s8bit_harmonic_lowpass_factor: float = 12.0 |
|
s8bit_final_gain: float = 0.8 |
|
s8bit_bass_boost_cutoff_hz: float = 200.0 |
|
|
|
|
|
s8bit_enable_midi_preprocessing: bool = True |
|
s8bit_high_pitch_threshold: int = 84 |
|
s8bit_high_pitch_velocity_scale: float = 0.8 |
|
|
|
s8bit_low_pitch_threshold: int = 36 |
|
s8bit_low_pitch_velocity_scale: float = 0.9 |
|
|
|
s8bit_chord_density_threshold: int = 4 |
|
s8bit_chord_velocity_threshold: int = 100 |
|
s8bit_chord_velocity_scale: float = 0.75 |
|
|
|
|
|
s8bit_enable_arpeggiator: bool = False |
|
s8bit_arpeggio_target: str = "Accompaniment Only" |
|
s8bit_arpeggio_velocity_scale: float = 0.7 |
|
s8bit_arpeggio_density: float = 0.5 |
|
s8bit_arpeggio_rhythm: str = "Classic Upbeat (8th)" |
|
s8bit_arpeggio_pattern: str = "Up" |
|
s8bit_arpeggio_octave_range: int = 1 |
|
s8bit_arpeggio_panning: str = "Stereo" |
|
|
|
|
|
s8bit_enable_delay: bool = False |
|
s8bit_delay_on_melody_only: bool = True |
|
s8bit_delay_division: str = "Dotted 8th Note" |
|
s8bit_delay_feedback: float = 0.5 |
|
s8bit_delay_repeats: int = 3 |
|
|
|
s8bit_delay_highpass_cutoff_hz: int = 100 |
|
s8bit_delay_bass_pitch_shift: int = 0 |
|
|
|
s8bit_delay_lowpass_cutoff_hz: int = 5000 |
|
s8bit_delay_treble_pitch_shift: int = 0 |
|
|
|
|
|
|
|
|
|
|
|
def analyze_audio_for_adaptive_params(audio_data: np.ndarray, sample_rate: int): |
|
""" |
|
Analyzes raw audio data to dynamically determine optimal parameters for basic-pitch. |
|
|
|
Args: |
|
audio_data: The audio signal as a NumPy array (can be stereo). |
|
sample_rate: The sample rate of the audio. |
|
|
|
Returns: |
|
A dictionary of recommended parameters for basic_pitch. |
|
""" |
|
print(" - Running adaptive analysis on audio to determine optimal transcription parameters...") |
|
|
|
|
|
if audio_data.ndim > 1: |
|
y_mono = librosa.to_mono(audio_data) |
|
else: |
|
y_mono = audio_data |
|
|
|
params = {} |
|
|
|
|
|
try: |
|
tempo_info = librosa.beat.tempo(y=y_mono, sr=sample_rate, aggregate=np.median) |
|
|
|
|
|
bpm = float(np.median(tempo_info)) |
|
|
|
if bpm <= 0 or np.isnan(bpm): |
|
raise ValueError("Invalid BPM detected") |
|
|
|
|
|
|
|
|
|
min_len_s = (60.0 / bpm) / 16.0 |
|
|
|
params['minimum_note_length'] = max(20, int(min_len_s * 1000)) |
|
print(f" - Detected BPM (median): {bpm:.1f} -> minimum_note_length: {params['minimum_note_length']}ms") |
|
except Exception as e: |
|
print(f" - BPM detection failed, using default minimum_note_length. Error: {e}") |
|
|
|
|
|
try: |
|
spectral_centroid = librosa.feature.spectral_centroid(y=y_mono, sr=sample_rate)[0] |
|
rolloff = librosa.feature.spectral_rolloff(y=y_mono, sr=sample_rate)[0] |
|
avg_centroid = np.mean(spectral_centroid) |
|
avg_rolloff = np.mean(rolloff) |
|
print(f" - Spectral centroid: {avg_centroid:.1f} Hz, rolloff (85%): {avg_rolloff:.1f} Hz") |
|
|
|
|
|
if avg_centroid < 500 and avg_rolloff < 1500: |
|
params['minimum_frequency'] = 30 |
|
params['maximum_frequency'] = 1200 |
|
elif avg_centroid > 2000 or avg_rolloff > 5000: |
|
params['minimum_frequency'] = 100 |
|
params['maximum_frequency'] = 8000 |
|
else: |
|
params['minimum_frequency'] = 50 |
|
params['maximum_frequency'] = 4000 |
|
except Exception as e: |
|
print(f" - Spectral analysis failed, using default frequencies. Error: {e}") |
|
|
|
|
|
try: |
|
y_harmonic, y_percussive = librosa.effects.hpss(y_mono) |
|
percussive_ratio = np.sum(y_percussive**2) / (np.sum(y_harmonic**2) + 1e-10) |
|
|
|
params['onset_threshold'] = 0.6 if percussive_ratio > 0.5 else 0.45 |
|
print(f" - Percussive ratio: {percussive_ratio:.2f} -> onset_threshold: {params['onset_threshold']}") |
|
except Exception as e: |
|
print(f" - Percussiveness analysis failed, using default onset_threshold. Error: {e}") |
|
|
|
|
|
try: |
|
rms = librosa.feature.rms(y=y_mono)[0] |
|
|
|
noise_floor_rms = np.percentile(rms, 10) |
|
|
|
|
|
params['frame_threshold'] = max(0.05, min(0.4, noise_floor_rms * 4)) |
|
print(f" - Noise floor RMS: {noise_floor_rms:.5f} -> frame_threshold: {params['frame_threshold']:.2f}") |
|
except Exception as e: |
|
print(f" - RMS analysis failed, using default frame_threshold. Error: {e}") |
|
|
|
return params |
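
# Illustrative sketch (not wired into the pipeline): the returned dictionary
# can be splatted straight into basic-pitch's predict(); "song.wav" is a
# hypothetical input file.
#
#   audio, sr = librosa.load("song.wav", sr=None, mono=False)
#   adaptive = analyze_audio_for_adaptive_params(audio, sr)
#   _, midi_data, _ = predict(
#       audio_path="song.wav",
#       model_or_model_path=ICASSP_2022_MODEL_PATH,
#       **adaptive,
#   )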
|
|
|
|
|
def format_params_for_metadata(params: AppParameters, transcription_log: dict = None) -> str: |
|
""" |
|
Formats the AppParameters object into a human-readable string |
|
suitable for embedding as metadata in an audio file. |
|
""" |
|
import json |
|
|
|
params_dict = copy.copy(params.__dict__) |
|
|
|
|
|
structured_metadata = { |
|
"main_settings": {}, |
|
"transcription_log": transcription_log if transcription_log else "Not Performed", |
|
"synthesis_settings": {} |
|
} |
|
|
|
|
|
transcription_keys = [ |
|
'transcription_method', 'basic_pitch_preset_selector', 'onset_threshold', |
|
'frame_threshold', 'minimum_note_length', 'minimum_frequency', 'maximum_frequency', |
|
'infer_onsets', 'melodia_trick', 'multiple_pitch_bends' |
|
] |
|
|
|
synthesis_keys = [key for key in params_dict.keys() if key.startswith('s8bit_')] |
|
|
|
|
|
for key, value in params_dict.items(): |
|
if key not in transcription_keys and key not in synthesis_keys: |
|
structured_metadata["main_settings"][key] = value |
|
|
|
for key in synthesis_keys: |
|
structured_metadata["synthesis_settings"][key] = params_dict[key] |
|
|
|
|
|
if not transcription_log: |
|
structured_metadata["transcription_log"] = { |
|
"ui_settings": {key: params_dict[key] for key in transcription_keys} |
|
} |
|
|
|
|
|
|
|
    # Serialize the structured view (main / transcription log / synthesis)
    # rather than the raw parameter dict, so the metadata keeps the grouping
    # built above.
    return json.dumps(structured_metadata, indent=2)
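
# The embedded metadata is a JSON document shaped roughly like this
# (key names come from AppParameters; the values below are examples only):
#
#   {
#     "main_settings":      {"render_type": "Render as-is", ...},
#     "transcription_log":  {"ui_settings": {"onset_threshold": 0.5, ...}},
#     "synthesis_settings": {"s8bit_waveform_type": "Square", ...}
#   }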
|
|
|
|
|
def preprocess_midi_for_harshness(midi_data: pretty_midi.PrettyMIDI, params: AppParameters): |
|
""" |
|
Analyzes and modifies a PrettyMIDI object in-place to reduce characteristics |
|
that can cause harshness or muddiness in simple synthesizers. |
|
Now includes both high and low pitch attenuation. |
|
|
|
Args: |
|
midi_data: The PrettyMIDI object to process. |
|
params: The AppParameters object containing the control thresholds. |
|
""" |
|
print("Running MIDI pre-processing to reduce harshness and muddiness...") |
|
high_notes_tamed = 0 |
|
low_notes_tamed = 0 |
|
chords_tamed = 0 |
|
|
|
|
|
for instrument in midi_data.instruments: |
|
for note in instrument.notes: |
|
|
|
if note.pitch > params.s8bit_high_pitch_threshold: |
|
note.velocity = int(note.velocity * params.s8bit_high_pitch_velocity_scale) |
|
if note.velocity < 1: note.velocity = 1 |
|
high_notes_tamed += 1 |
|
|
|
|
|
if note.pitch < params.s8bit_low_pitch_threshold: |
|
note.velocity = int(note.velocity * params.s8bit_low_pitch_velocity_scale) |
|
if note.velocity < 1: note.velocity = 1 |
|
low_notes_tamed += 1 |
|
|
|
if high_notes_tamed > 0: |
|
print(f" - Tamed {high_notes_tamed} individual high-pitched notes.") |
|
if low_notes_tamed > 0: |
|
print(f" - Tamed {low_notes_tamed} individual low-pitched notes.") |
|
|
|
|
|
|
|
all_notes = sorted([note for instrument in midi_data.instruments for note in instrument.notes], key=lambda x: x.start) |
|
|
|
time_window = 0.02 |
|
i = 0 |
|
while i < len(all_notes): |
|
current_chord = [all_notes[i]] |
|
|
|
j = i + 1 |
|
while j < len(all_notes) and (all_notes[j].start - all_notes[i].start) < time_window: |
|
current_chord.append(all_notes[j]) |
|
j += 1 |
|
|
|
|
|
if len(current_chord) >= params.s8bit_chord_density_threshold: |
|
avg_velocity = sum(n.velocity for n in current_chord) / len(current_chord) |
|
if avg_velocity > params.s8bit_chord_velocity_threshold: |
|
chords_tamed += 1 |
|
for note in current_chord: |
|
note.velocity = int(note.velocity * params.s8bit_chord_velocity_scale) |
|
if note.velocity < 1: note.velocity = 1 |
|
|
|
|
|
i = j |
|
|
|
if chords_tamed > 0: |
|
print(f" - Tamed {chords_tamed} loud, dense chords.") |
|
|
|
return midi_data |
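
# Minimal usage sketch (assumes "input.mid" exists); velocities are scaled
# in-place according to the s8bit_* thresholds on AppParameters:
#
#   pm = pretty_midi.PrettyMIDI("input.mid")
#   pm = preprocess_midi_for_harshness(pm, AppParameters())
#   pm.write("tamed.mid")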
|
|
|
|
|
def arpeggiate_midi(midi_data: pretty_midi.PrettyMIDI, params: AppParameters): |
|
""" |
|
    Applies a tempo-synced, rhythmic arpeggiator with dynamic density, stereo
    layer splitting, micro-randomization, and cross-beat continuity. Instead of
    emitting a continuous stream of notes, it renders selectable rhythmic
    patterns to keep the accompaniment from sounding "stiff".

    Three targets are supported:
|
- Accompaniment Only: The classic approach, arpeggiates harmony. |
|
- Melody Only: A modern approach, adds flair to the lead melody. |
|
- Full Mix: Applies the effect to all notes. |
|
|
|
Args: |
|
midi_data: The original PrettyMIDI object. |
|
params: AppParameters containing arpeggiator settings. |
|
|
|
Returns: |
|
A new PrettyMIDI object with arpeggiated chords. |
|
""" |
|
print(f"Applying arpeggiator with target: {params.s8bit_arpeggio_target}...") |
|
processed_midi = copy.deepcopy(midi_data) |
|
|
|
|
|
all_notes = [] |
|
|
|
for i, instrument in enumerate(processed_midi.instruments): |
|
if not instrument.is_drum: |
|
for note in instrument.notes: |
|
|
|
all_notes.append({'note': note, 'instrument_idx': i}) |
|
|
|
if not all_notes: |
|
return processed_midi |
|
all_notes.sort(key=lambda x: x['note'].start) |
|
|
|
|
|
lead_note_objects = set() |
|
harmony_note_objects = set() |
|
|
|
note_idx = 0 |
|
while note_idx < len(all_notes): |
|
current_slice_start = all_notes[note_idx]['note'].start |
|
notes_in_slice = [item for item in all_notes[note_idx:] if (item['note'].start - current_slice_start) < 0.02] |
|
|
|
if not notes_in_slice: |
|
note_idx += 1 |
|
continue |
|
|
|
notes_in_slice.sort(key=lambda x: x['note'].pitch, reverse=True) |
|
lead_note_objects.add(notes_in_slice[0]['note']) |
|
for item in notes_in_slice[1:]: |
|
harmony_note_objects.add(item['note']) |
|
|
|
note_idx += len(notes_in_slice) |
|
|
|
|
|
notes_to_arpeggiate = set() |
|
notes_to_keep_original = set() |
|
|
|
if params.s8bit_arpeggio_target == "Accompaniment Only": |
|
print(" - Arpeggiating harmony notes.") |
|
notes_to_arpeggiate = harmony_note_objects |
|
notes_to_keep_original = lead_note_objects |
|
elif params.s8bit_arpeggio_target == "Melody Only": |
|
print(" - Arpeggiating lead melody notes.") |
|
notes_to_arpeggiate = lead_note_objects |
|
notes_to_keep_original = harmony_note_objects |
|
else: |
|
print(" - Arpeggiating all non-drum notes.") |
|
notes_to_arpeggiate = lead_note_objects.union(harmony_note_objects) |
|
notes_to_keep_original = set() |
|
|
|
|
|
try: |
|
bpm = midi_data.estimate_tempo() |
|
    except Exception:
|
bpm = 120.0 |
|
beat_duration_s = 60.0 / bpm |
|
|
|
rhythm_patterns = { |
|
"Continuous 16ths": [(0.0, 0.25), (0.25, 0.25), (0.5, 0.25), (0.75, 0.25)], |
|
"Classic Upbeat (8th)": [(0.5, 0.25), (0.75, 0.25)], |
|
"Pulsing 8ths": [(0.0, 0.5), (0.5, 0.5)], |
|
"Pulsing 4ths": [(0.0, 0.5)], |
|
"Galloping": [(0.0, 0.75), (0.75, 0.25)], |
|
"Simple Quarter Notes": [(0.0, 1.0)], |
|
"Triplet 8ths": [(0.0, 1/3), (1/3, 1/3), (2/3, 1/3)], |
|
} |
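    # Each pattern entry is (start_offset_in_beats, duration_in_beats) relative
    # to the start of the current beat, e.g. (0.5, 0.25) is a 16th on the upbeat.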
|
selected_rhythm = rhythm_patterns.get(params.s8bit_arpeggio_rhythm, rhythm_patterns["Classic Upbeat (8th)"]) |
|
|
|
|
|
for instrument in processed_midi.instruments: |
|
if instrument.is_drum: |
|
continue |
|
|
|
new_note_list = [] |
|
|
|
|
|
inst_notes_to_keep = [n for n in instrument.notes if n in notes_to_keep_original] |
|
new_note_list.extend(inst_notes_to_keep) |
|
|
|
|
|
inst_notes_to_arp = [n for n in instrument.notes if n in notes_to_arpeggiate] |
|
processed_arp_notes = set() |
|
|
|
for note1 in inst_notes_to_arp: |
|
if note1 in processed_arp_notes: |
|
continue |
|
|
|
|
|
|
|
chord_notes = [note1] |
|
if params.s8bit_arpeggio_target != "Melody Only": |
|
chord_notes.extend([n2 for n2 in inst_notes_to_arp if n2 != note1 and n2 not in processed_arp_notes and abs(n2.start - note1.start) < 0.02]) |
|
|
|
|
|
for n in chord_notes: |
|
processed_arp_notes.add(n) |
|
|
|
chord_start_time = min(n.start for n in chord_notes) |
|
chord_end_time = max(n.end for n in chord_notes) |
|
avg_velocity = int(np.mean([n.velocity for n in chord_notes])) |
|
|
|
|
|
|
|
|
|
scale = params.s8bit_arpeggio_velocity_scale |
|
|
|
|
|
final_velocity_base = int(avg_velocity * (scale ** 2.5)) |
|
|
|
if final_velocity_base < 1: |
|
final_velocity_base = 1 |
|
|
|
|
|
base_pitches = sorted([n.pitch for n in chord_notes]) |
|
|
|
|
|
if params.s8bit_arpeggio_target == "Melody Only" and len(base_pitches) == 1: |
|
|
|
|
|
root = base_pitches[0] |
|
base_pitches = [root, root + 4, root + 7] |
|
|
|
pattern = [] |
|
for octave in range(params.s8bit_arpeggio_octave_range): |
|
octave_pitches = [p + (12 * octave) for p in base_pitches] |
|
if params.s8bit_arpeggio_pattern == "Up": |
|
pattern.extend(octave_pitches) |
|
elif params.s8bit_arpeggio_pattern == "Down": |
|
pattern.extend(reversed(octave_pitches)) |
|
elif params.s8bit_arpeggio_pattern == "UpDown": |
|
pattern.extend(octave_pitches) |
|
if len(octave_pitches) > 2: |
|
pattern.extend(reversed(octave_pitches[1:-1])) |
|
|
|
if not pattern: |
|
continue |
|
|
|
|
|
            note_base_density = getattr(params, "s8bit_arpeggio_density", 0.5)
|
chord_duration = chord_end_time - chord_start_time |
|
note_duration_factor = min(1.0, chord_duration / (2 * beat_duration_s)) if beat_duration_s > 0 else 1.0 |
|
note_density_factor = note_base_density * note_duration_factor |
|
|
|
current_beat = chord_start_time / beat_duration_s if beat_duration_s > 0 else 0 |
|
current_time = chord_start_time |
|
pattern_index = 0 |
|
while current_time < chord_end_time: |
|
|
|
current_beat_start_time = np.floor(current_beat) * beat_duration_s |
|
|
|
for start_offset, duration_beats in selected_rhythm: |
|
note_start_time = current_beat_start_time + (start_offset * beat_duration_s) |
|
note_duration_s = duration_beats * beat_duration_s * note_density_factor |
|
|
|
|
|
if note_start_time >= chord_end_time: |
|
break |
|
|
|
pitch = pattern[pattern_index % len(pattern)] |
|
|
|
|
|
rand_offset = random.uniform(-0.01, 0.01) |
|
final_velocity = max(1, min(127, final_velocity_base + random.randint(-5, 5))) |
|
|
|
new_note = pretty_midi.Note( |
|
velocity=final_velocity, |
|
pitch=pitch, |
|
start=max(0.0, note_start_time + rand_offset), |
|
end=min(chord_end_time, note_start_time + note_duration_s) |
|
) |
|
new_note_list.append(new_note) |
|
pattern_index += 1 |
|
|
|
current_beat += 1.0 |
|
current_time = current_beat * beat_duration_s if beat_duration_s > 0 else float('inf') |
|
|
|
|
|
instrument.notes = new_note_list |
|
|
|
print("Targeted arpeggiator finished.") |
|
return processed_midi |
|
|
|
|
|
def create_delay_effect(midi_data: pretty_midi.PrettyMIDI, params: AppParameters): |
|
""" |
|
    Creates a delay/echo effect by duplicating notes with delayed start times
    and scaled velocities. The delay time is derived from the MIDI's estimated
    BPM and the user's selected musical division, and the effect can be
    restricted to the lead melody only.
|
""" |
|
print("Applying tempo-synced MIDI delay/echo effect...") |
|
|
|
processed_midi = copy.deepcopy(midi_data) |
|
|
|
|
|
try: |
|
bpm = midi_data.estimate_tempo() |
|
    except Exception:
|
bpm = 120.0 |
|
print(f" - Delay using tempo: {bpm:.2f} BPM") |
|
|
|
|
|
division_map = { |
|
"Quarter Note": 1.0, |
|
"Dotted 8th Note": 0.75, |
|
"8th Note": 0.5, |
|
"Triplet 8th Note": 1.0 / 3.0, |
|
"16th Note": 0.25 |
|
} |
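    # Values are in beats: a dotted 8th is 0.75 of a quarter-note beat, so at
    # 120 BPM (0.5 s per beat) the resulting delay time is 0.375 s.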
|
beat_duration_s = 60.0 / bpm |
|
division_multiplier = division_map.get(params.s8bit_delay_division, 0.75) |
|
delay_time_s = beat_duration_s * division_multiplier |
|
|
|
print(f" - Delay set to {params.s8bit_delay_division}, calculated time: {delay_time_s:.3f}s") |
|
|
|
|
|
notes_to_echo = [] |
|
|
|
if params.s8bit_delay_on_melody_only: |
|
print(" - Delay will be applied to lead melody notes only.") |
|
all_notes = [note for inst in processed_midi.instruments if not inst.is_drum for note in inst.notes] |
|
all_notes.sort(key=lambda n: n.start) |
|
|
|
note_idx = 0 |
|
while note_idx < len(all_notes): |
|
current_slice_start = all_notes[note_idx].start |
|
notes_in_slice = [n for n in all_notes[note_idx:] if (n.start - current_slice_start) < 0.02] |
|
if not notes_in_slice: |
|
note_idx += 1 |
|
continue |
|
|
|
|
|
notes_in_slice.sort(key=lambda n: n.pitch, reverse=True) |
|
notes_to_echo.append(notes_in_slice[0]) |
|
note_idx += len(notes_in_slice) |
|
else: |
|
print(" - Delay will be applied to all non-drum notes.") |
|
notes_to_echo = [note for inst in processed_midi.instruments if not inst.is_drum for note in inst.notes] |
|
|
|
if not notes_to_echo: |
|
print(" - No notes found to apply delay to. Skipping.") |
|
return processed_midi |
|
|
|
|
|
echo_notes = [] |
|
bass_note_threshold = 48 |
|
treble_note_threshold = 84 |
|
|
|
for i in range(1, params.s8bit_delay_repeats + 1): |
|
for original_note in notes_to_echo: |
|
|
|
echo_note = copy.copy(original_note) |
|
|
|
|
|
if params.s8bit_delay_bass_pitch_shift and original_note.pitch < bass_note_threshold: |
|
echo_note.pitch += params.s8bit_delay_bass_pitch_shift |
|
elif params.s8bit_delay_treble_pitch_shift and original_note.pitch > treble_note_threshold: |
|
echo_note.pitch += params.s8bit_delay_treble_pitch_shift |
|
|
|
|
|
time_offset = i * delay_time_s |
|
echo_note.start += time_offset |
|
echo_note.end += time_offset |
|
echo_note.velocity = int(echo_note.velocity * (params.s8bit_delay_feedback ** i)) |
|
|
|
|
|
if echo_note.velocity > 1: |
|
echo_notes.append(echo_note) |
|
|
|
|
|
if echo_notes: |
|
|
|
|
|
|
|
base_program = 0 |
|
for inst in midi_data.instruments: |
|
if not inst.is_drum: |
|
base_program = inst.program |
|
break |
|
|
|
echo_instrument = pretty_midi.Instrument(program=base_program, is_drum=False, name="Echo Layer") |
|
echo_instrument.notes.extend(echo_notes) |
|
processed_midi.instruments.append(echo_instrument) |
|
print(f" - Generated {len(echo_notes)} tempo-synced echo notes on a new track with program {base_program}.") |
|
|
|
return processed_midi |
|
|
|
|
|
def butter_highpass(cutoff, fs, order=5): |
|
nyq = 0.5 * fs |
|
normal_cutoff = cutoff / nyq |
|
b, a = signal.butter(order, normal_cutoff, btype='high', analog=False) |
|
return b, a |
|
|
|
def apply_butter_highpass_filter(data, cutoff, fs, order=5): |
|
"""Applies a Butterworth highpass filter to a stereo audio signal.""" |
|
if cutoff <= 0: |
|
return data |
|
b, a = butter_highpass(cutoff, fs, order=order) |
|
|
|
filtered_data = np.zeros_like(data) |
|
for channel in range(data.shape[1]): |
|
filtered_data[:, channel] = signal.lfilter(b, a, data[:, channel]) |
|
return filtered_data |
|
|
|
|
|
def butter_lowpass(cutoff, fs, order=5): |
|
nyq = 0.5 * fs |
|
normal_cutoff = cutoff / nyq |
|
b, a = signal.butter(order, normal_cutoff, btype='low', analog=False) |
|
return b, a |
|
|
|
def apply_butter_lowpass_filter(data, cutoff, fs, order=5): |
|
"""Applies a Butterworth lowpass filter to a stereo audio signal.""" |
|
|
|
if cutoff >= fs / 2: |
|
return data |
|
b, a = butter_lowpass(cutoff, fs, order=order) |
|
filtered_data = np.zeros_like(data) |
|
for channel in range(data.shape[1]): |
|
filtered_data[:, channel] = signal.lfilter(b, a, data[:, channel]) |
|
return filtered_data |
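
# Note: signal.lfilter is causal and therefore introduces some phase lag;
# signal.filtfilt would give zero-phase filtering at roughly twice the cost.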
|
|
|
|
|
def one_pole_lowpass(x, cutoff_hz, fs): |
|
"""Simple one-pole lowpass filter (causal), stable and cheap.""" |
|
if cutoff_hz <= 0 or cutoff_hz >= fs/2: |
|
return x |
|
dt = 1.0 / fs |
|
rc = 1.0 / (2 * np.pi * cutoff_hz) |
|
alpha = dt / (rc + dt) |
|
y = np.empty_like(x) |
|
y[0] = alpha * x[0] |
|
for n in range(1, len(x)): |
|
y[n] = y[n-1] + alpha * (x[n] - y[n-1]) |
|
return y |
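
# The coefficient comes from discretizing an analog RC lowpass:
# alpha = dt / (RC + dt) with RC = 1 / (2*pi*cutoff_hz), so alpha approaches 1
# as the cutoff rises and the filter passes the input nearly unchanged.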
|
|
|
def smooth_square_or_saw(note_waveform, fs, smooth_ms=0.6): |
|
"""Short triangular smoothing to soften sharp edges (simple anti-alias-ish).""" |
|
if smooth_ms <= 0: |
|
return note_waveform |
|
kernel_len = max(1, int(fs * (smooth_ms/1000.0))) |
|
|
|
k = np.convolve(np.ones(kernel_len), np.ones(kernel_len)) |
|
k = k / k.sum() |
|
|
|
y = np.convolve(note_waveform, k, mode='same') |
|
return y |
|
|
|
def additive_bandlimited_waveform(wave_type, freq, t, fs, max_harmonics_cap=200): |
|
""" |
|
Simple additive band-limited generator: |
|
- saw: sum_{n=1..N} sin(2π n f t)/n |
|
- square: sum odd harmonics sin(2π n f t)/n |
|
N chosen so n*f < fs/2. |
|
This is heavier but yields much less aliasing. |
|
""" |
|
nyq = fs / 2.0 |
|
max_n = int(nyq // freq) |
|
if max_n < 1: |
|
return np.zeros_like(t) |
|
max_n = min(max_n, max_harmonics_cap) |
|
y = np.zeros_like(t) |
|
if wave_type == 'Sawtooth': |
|
|
|
for n in range(1, max_n + 1): |
|
y += np.sin(2*np.pi * n * freq * t) / n |
|
|
|
y = - (2/np.pi) * y |
|
else: |
|
n = 1 |
|
while n <= max_n: |
|
y += np.sin(2*np.pi * n * freq * t) / n |
|
n += 2 |
|
y = (4/np.pi) * y |
|
|
|
y = np.clip(y, -1.0, 1.0) |
|
return y |
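
# Example: at fs = 44100 Hz a 440 Hz square wave uses odd harmonics up to
# n = 49 (max_n = 22050 // 440 = 50), comfortably below the 200-harmonic cap.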
|
|
|
def safe_tanh_distortion(x, strength): |
|
"""Milder soft clipping: scale then tanh, with adjustable drive.""" |
|
|
|
drive = 1.0 + strength * 4.0 |
|
return np.tanh(x * drive) / np.tanh(drive) |
|
|
|
def prepare_soundfonts(): |
|
""" |
|
    Ensures a default set of SoundFonts is downloaded from the specified
    Hugging Face Space repository into a local 'src/sf2' directory, then scans
    that directory recursively for all .sf2 files.

    Returns a dictionary mapping a user-friendly display name to its full file
    path, with the default soundfonts listed first in their specified order.
|
""" |
|
SF2_REPO_ID = "asigalov61/Advanced-MIDI-Renderer" |
|
SF2_DIR = "src/sf2" |
|
|
|
|
|
DEFAULT_SF2_FILENAMES = [ |
|
"SGM-v2.01-YamahaGrand-Guit-Bass-v2.7.sf2", |
|
"Orpheus_18.06.2020.sf2", |
|
"Live HQ Natural SoundFont GM.sf2", |
|
"Nice-Strings-PlusOrchestra-v1.6.sf2", |
|
"KBH-Real-Choir-V2.5.sf2", |
|
"SuperGameBoy.sf2", |
|
"ProtoSquare.sf2" |
|
] |
|
|
|
|
|
os.makedirs(SF2_DIR, exist_ok=True) |
|
|
|
|
|
print("Checking for SoundFont files...") |
|
for filename in DEFAULT_SF2_FILENAMES: |
|
local_path = os.path.join(SF2_DIR, filename) |
|
|
|
|
|
if not os.path.exists(local_path): |
|
print(f"Downloading '{filename}' from Hugging Face Hub...") |
|
try: |
|
|
|
|
|
hf_hub_download( |
|
repo_id=SF2_REPO_ID, |
|
repo_type='space', |
|
                    filename=filename,
|
local_dir=SF2_DIR, |
|
|
|
) |
|
print(f"'{filename}' downloaded successfully.") |
|
except Exception as e: |
|
print(f"Error downloading {filename}: {e}") |
|
|
|
|
|
|
|
print(f"Scanning '{SF2_DIR}' for all .sf2 files...") |
|
all_sfs_map = {} |
|
|
|
search_pattern = os.path.join(SF2_DIR, '**', '*.sf2') |
|
for full_path in glob.glob(search_pattern, recursive=True): |
|
|
|
relative_path = os.path.relpath(full_path, SF2_DIR) |
|
display_name = os.path.splitext(relative_path)[0].replace("\\", "/") |
|
all_sfs_map[display_name] = full_path |
|
|
|
|
|
ordered_soundfont_map = {} |
|
|
|
|
|
default_display_names = [os.path.splitext(f)[0] for f in DEFAULT_SF2_FILENAMES] |
|
|
|
|
|
other_display_names = [name for name in all_sfs_map.keys() if name not in default_display_names] |
|
other_display_names.sort() |
|
|
|
|
|
for name in default_display_names: |
|
if name in all_sfs_map: |
|
ordered_soundfont_map[name] = all_sfs_map[name] |
|
|
|
|
|
for name in other_display_names: |
|
ordered_soundfont_map[name] = all_sfs_map[name] |
|
|
|
return ordered_soundfont_map |
|
|
|
|
|
|
|
|
|
def synthesize_8bit_style(*, midi_data: pretty_midi.PrettyMIDI, fs: int, params: AppParameters, progress: gr.Progress = None): |
|
""" |
|
    Synthesizes an 8-bit style audio waveform from a PrettyMIDI object,
    generating waveforms directly instead of using a synthesizer like
    FluidSynth. Features include an optional sub-octave bass booster with
    adjustable level, graded levels for note smoothing and vibrato continuity,
    and anti-aliasing options for a cleaner, less harsh sound.

    Instruments are panned based on their order in the MIDI file:
    instrument 1 -> left, instrument 2 -> right.
|
""" |
|
total_duration = midi_data.get_end_time() |
|
|
|
waveform = np.zeros((2, int(total_duration * fs) + fs)) |
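    # The extra second of headroom above keeps release tails and echo segments
    # that run past the final note-off from being truncated.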
|
|
|
num_instruments = len(midi_data.instruments) |
|
|
|
|
|
osc_phase = {} |
|
|
|
vibrato_phase = 0.0 |
|
|
|
|
|
use_aa = getattr(params, 's8bit_enable_anti_aliasing', False) |
|
|
|
|
|
|
|
all_notes_with_instrument_info = [] |
|
for i, instrument in enumerate(midi_data.instruments): |
|
|
|
panning_override = getattr(params, '_temp_panning_override', None) |
|
|
|
if panning_override: |
|
if panning_override == "Center": |
|
pan_l, pan_r = 0.707, 0.707 |
|
elif panning_override == "Left": |
|
pan_l, pan_r = 1.0, 0.0 |
|
elif panning_override == "Right": |
|
pan_l, pan_r = 0.0, 1.0 |
|
else: |
|
|
|
if i % 2 == 0: |
|
pan_l, pan_r = 1.0, 0.0 |
|
else: |
|
pan_l, pan_r = 0.0, 1.0 |
|
else: |
|
|
|
|
|
pan_l, pan_r = 0.707, 0.707 |
|
if num_instruments == 2: |
|
if i == 0: |
|
pan_l, pan_r = 1.0, 0.0 |
|
elif i == 1: |
|
pan_l, pan_r = 0.0, 1.0 |
|
elif num_instruments > 2: |
|
if i == 0: |
|
pan_l, pan_r = 1.0, 0.0 |
|
elif i == 1: |
|
pan_l, pan_r = 0.0, 1.0 |
|
|
|
|
|
|
|
for note in instrument.notes: |
|
all_notes_with_instrument_info.append({'note': note, 'instrument_index': i, 'pan_l': pan_l, 'pan_r': pan_r}) |
|
|
|
|
|
osc_phase[i] = 0.0 |
|
|
|
|
|
notes_iterable = all_notes_with_instrument_info |
|
total_notes = len(notes_iterable) |
|
|
|
|
|
if progress and hasattr(progress, 'tqdm'): |
|
notes_iterable = progress.tqdm( |
|
notes_iterable, |
|
desc="Synthesizing Notes...", |
|
total=total_notes |
|
) |
|
|
|
|
|
for item in notes_iterable: |
|
note = item['note'] |
|
i = item['instrument_index'] |
|
pan_l = item['pan_l'] |
|
pan_r = item['pan_r'] |
|
|
|
freq = pretty_midi.note_number_to_hz(note.pitch) |
|
note_duration = note.end - note.start |
|
num_samples = int(note_duration * fs) |
|
if num_samples <= 0: |
|
continue |
|
|
|
t = np.arange(num_samples) / fs |
|
|
|
|
|
|
|
|
|
vib_phase_inc = 2 * np.pi * params.s8bit_vibrato_rate / fs |
|
per_note_vib_phase = 2 * np.pi * params.s8bit_vibrato_rate * t |
|
continuous_vib_phase = vibrato_phase + np.arange(num_samples) * vib_phase_inc |
|
|
|
|
|
final_vib_phase = ( |
|
per_note_vib_phase * (1 - params.s8bit_continuous_vibrato_level) + |
|
continuous_vib_phase * params.s8bit_continuous_vibrato_level |
|
) |
|
vibrato_lfo = params.s8bit_vibrato_depth * np.sin(final_vib_phase) |
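        # The blend above morphs between per-note vibrato that restarts at each
        # onset (level 0.0) and a free-running LFO whose phase carries across
        # notes (level 1.0).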
|
|
|
|
|
if num_samples > 0: |
|
vibrato_phase = (continuous_vib_phase[-1] + vib_phase_inc) % (2 * np.pi) |
|
|
|
|
|
fm_lfo = params.s8bit_fm_modulation_depth * np.sin(2 * np.pi * params.s8bit_fm_modulation_rate * t) |
|
modulated_freq = freq * (1 + fm_lfo) |
|
|
|
|
|
use_additive = use_aa and getattr(params, 's8bit_use_additive_synthesis', False) |
|
if use_additive and params.s8bit_waveform_type in ['Square', 'Sawtooth']: |
|
note_waveform = additive_bandlimited_waveform(params.s8bit_waveform_type, freq, t, fs) |
|
else: |
|
|
|
phase_inc = 2 * np.pi * (modulated_freq + vibrato_lfo) / fs |
|
phase = osc_phase[i] + np.cumsum(phase_inc) |
|
if num_samples > 0: |
|
osc_phase[i] = phase[-1] % (2 * np.pi) |
|
|
|
if params.s8bit_waveform_type == 'Square': |
|
note_waveform = signal.square(phase, duty=params.s8bit_pulse_width) |
|
elif params.s8bit_waveform_type == 'Sawtooth': |
|
note_waveform = signal.sawtooth(phase) |
|
else: |
|
note_waveform = signal.sawtooth(phase, width=0.5) |
|
|
|
if use_aa and params.s8bit_waveform_type in ['Square', 'Sawtooth']: |
|
edge_smooth_ms = getattr(params, 's8bit_edge_smoothing_ms', 0.5) |
|
note_waveform = smooth_square_or_saw(note_waveform, fs, smooth_ms=edge_smooth_ms) |
|
|
|
|
|
if params.s8bit_bass_boost_level > 0: |
|
|
|
cutoff_hz = getattr(params, 's8bit_bass_boost_cutoff_hz', 200.0) |
|
|
|
|
|
|
|
|
|
|
|
dynamic_boost_scale = np.clip((freq - (cutoff_hz / 2)) / (cutoff_hz / 2), 0, 1) |
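            # The scale above fades the sub-octave in between cutoff/2 and
            # cutoff: fundamentals at or below cutoff/2 get no boost, which
            # keeps the low end from doubling up and turning muddy.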
|
|
|
|
|
final_boost_level = params.s8bit_bass_boost_level * dynamic_boost_scale |
|
|
|
|
|
if final_boost_level > 0.01: |
|
bass_freq = freq / 2.0 |
|
|
|
if bass_freq > 20: |
|
|
|
bass_phase_inc = 2 * np.pi * bass_freq / fs |
|
bass_phase = np.cumsum(np.full(num_samples, bass_phase_inc)) |
|
bass_sub_waveform = signal.square(bass_phase, duty=0.5) |
|
|
|
|
|
main_level = 1.0 - (0.5 * final_boost_level) |
|
note_waveform = (note_waveform * main_level) + (bass_sub_waveform * final_boost_level) |
|
|
|
|
|
if params.s8bit_noise_level > 0: |
|
raw_noise = np.random.uniform(-1, 1, num_samples) * params.s8bit_noise_level |
|
if use_aa: |
|
noise_cutoff = getattr(params, 's8bit_noise_lowpass_hz', 9000.0) |
|
raw_noise = one_pole_lowpass(raw_noise, cutoff_hz=noise_cutoff, fs=fs) |
|
note_waveform += raw_noise |
|
|
|
|
|
if params.s8bit_distortion_level > 0: |
|
if use_aa: |
|
note_waveform = safe_tanh_distortion(note_waveform, params.s8bit_distortion_level) |
|
else: |
|
|
|
note_waveform = np.tanh(note_waveform * (1 + params.s8bit_distortion_level * 5)) |
|
|
|
|
|
start_amp = note.velocity / 127.0 |
|
envelope = np.zeros(num_samples) |
|
|
|
min_attack_s = 0.001 |
|
if params.s8bit_envelope_type == 'Plucky (AD Envelope)': |
|
attack_samples = max(int(min_attack_s * fs), min(int(0.005 * fs), num_samples)) |
|
|
|
|
|
|
|
|
|
if params.s8bit_adaptive_decay: |
|
|
|
ideal_decay_samples = int(params.s8bit_decay_time_s * fs) |
|
if ideal_decay_samples <= 0: |
|
ideal_decay_samples = 1 |
|
|
|
|
|
ideal_decay_curve = np.linspace(start_amp, 0, ideal_decay_samples) |
|
|
|
|
|
actual_decay_samples = num_samples - attack_samples |
|
|
|
if actual_decay_samples > 0: |
|
|
|
num_samples_to_take = min(len(ideal_decay_curve), actual_decay_samples) |
|
|
|
|
|
envelope[:attack_samples] = np.linspace(0, start_amp, attack_samples) |
|
|
|
envelope[attack_samples : attack_samples + num_samples_to_take] = ideal_decay_curve[:num_samples_to_take] |
|
|
|
|
|
else: |
|
decay_samples = min(int(params.s8bit_decay_time_s * fs), num_samples - attack_samples) |
|
envelope[:attack_samples] = np.linspace(0, start_amp, attack_samples) |
|
if decay_samples > 0: |
|
envelope[attack_samples:attack_samples+decay_samples] = np.linspace(start_amp, 0, decay_samples) |
|
|
|
else: |
|
envelope = np.linspace(start_amp, 0, num_samples) |
|
if use_aa and num_samples > 20: |
|
release_samples = int(min(0.005*fs, num_samples // 10)) |
|
if release_samples > 0: |
|
envelope[-release_samples:] *= np.linspace(1.0, 0.0, release_samples) |
|
|
|
|
|
|
|
|
|
|
|
if params.s8bit_smooth_notes_level > 0 and num_samples > 10: |
|
|
|
|
|
max_fade_duration_s = 0.03 |
|
|
|
|
|
|
|
fade_percentage = 0.1 * params.s8bit_smooth_notes_level |
|
proportional_fade_samples = int(num_samples * fade_percentage) |
|
|
|
|
|
absolute_max_fade_samples = int(fs * max_fade_duration_s) |
|
|
|
|
|
|
|
|
|
|
|
fade_samples = min(proportional_fade_samples, absolute_max_fade_samples, num_samples // 2) |
|
|
|
if fade_samples > 0: |
|
|
|
envelope[:fade_samples] *= np.linspace(0.5, 1.0, fade_samples) |
|
|
|
envelope[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples) |
|
|
|
|
|
note_waveform *= envelope |
|
|
|
|
|
|
|
|
|
|
|
|
|
if params.s8bit_envelope_type == 'Plucky (AD Envelope)' and params.s8bit_echo_sustain and num_samples > 0: |
|
|
|
|
|
initial_pluck_duration_s = params.s8bit_decay_time_s |
|
initial_pluck_samples = int(initial_pluck_duration_s * fs) |
|
|
|
|
|
if num_samples > initial_pluck_samples * params.s8bit_echo_trigger_threshold: |
|
|
|
|
|
echo_delay_samples = int(fs / params.s8bit_echo_rate_hz) |
|
if echo_delay_samples > 0: |
|
echo_amplitude = start_amp * params.s8bit_echo_decay_factor |
|
|
|
|
|
current_sample_offset = initial_pluck_samples |
|
|
|
while current_sample_offset < num_samples: |
|
|
|
if current_sample_offset + echo_delay_samples <= num_samples: |
|
|
|
|
|
echo_attack_samples = min(int(0.002 * fs), echo_delay_samples) |
|
echo_decay_samples = echo_delay_samples - echo_attack_samples |
|
|
|
if echo_decay_samples > 0: |
|
|
|
echo_envelope = np.zeros(echo_delay_samples) |
|
echo_envelope[:echo_attack_samples] = np.linspace(0, echo_amplitude, echo_attack_samples) |
|
echo_envelope[echo_attack_samples:] = np.linspace(echo_amplitude, 0, echo_decay_samples) |
|
|
|
|
|
|
|
|
|
phase_inc_echo = 2 * np.pi * freq / fs |
|
phase_echo = np.cumsum(np.full(echo_delay_samples, phase_inc_echo)) |
|
|
|
if params.s8bit_waveform_type == 'Square': |
|
echo_waveform_segment = signal.square(phase_echo, duty=params.s8bit_pulse_width) |
|
elif params.s8bit_waveform_type == 'Sawtooth': |
|
echo_waveform_segment = signal.sawtooth(phase_echo) |
|
else: |
|
echo_waveform_segment = signal.sawtooth(phase_echo, width=0.5) |
|
|
|
|
|
note_waveform[current_sample_offset : current_sample_offset + echo_delay_samples] += echo_waveform_segment * echo_envelope |
|
|
|
|
|
echo_amplitude *= params.s8bit_echo_decay_factor |
|
|
|
current_sample_offset += echo_delay_samples |
|
|
|
|
|
|
|
if use_aa: |
|
|
|
harm_limit = getattr(params, 's8bit_harmonic_lowpass_factor', 12.0) |
|
cutoff = min(fs * 0.45, max(3000.0, freq * harm_limit)) |
|
note_waveform = one_pole_lowpass(note_waveform, cutoff_hz=cutoff, fs=fs) |
|
|
|
|
|
final_gain = getattr(params, 's8bit_final_gain', 0.8) |
|
note_waveform *= final_gain |
|
note_waveform = np.tanh(note_waveform) |
|
|
|
|
|
start_sample = int(note.start * fs) |
|
end_sample = start_sample + num_samples |
|
if end_sample > waveform.shape[1]: |
|
end_sample = waveform.shape[1] |
|
note_waveform = note_waveform[:end_sample-start_sample] |
|
|
|
|
|
waveform[0, start_sample:end_sample] += note_waveform * pan_l |
|
waveform[1, start_sample:end_sample] += note_waveform * pan_r |
|
|
|
return waveform |
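
# Minimal rendering sketch (assumes "melody.mid" exists); the result is a
# (2, N) float array that still needs normalization before export:
#
#   pm = pretty_midi.PrettyMIDI("melody.mid")
#   audio = synthesize_8bit_style(midi_data=pm, fs=44100, params=AppParameters())
#   audio /= max(np.max(np.abs(audio)), 1e-9)
#   sf.write("melody_8bit.wav", audio.T, 44100)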
|
|
|
|
|
def analyze_midi_velocity(midi_path): |
|
midi = pretty_midi.PrettyMIDI(midi_path) |
|
all_velocities = [] |
|
|
|
print(f"Analyzing velocity for MIDI: {midi_path}") |
|
for i, instrument in enumerate(midi.instruments): |
|
velocities = [note.velocity for note in instrument.notes] |
|
all_velocities.extend(velocities) |
|
|
|
if velocities: |
|
print(f"Instrument {i} ({instrument.name}):") |
|
print(f" Notes count: {len(velocities)}") |
|
print(f" Velocity min: {min(velocities)}") |
|
print(f" Velocity max: {max(velocities)}") |
|
print(f" Velocity mean: {np.mean(velocities):.2f}") |
|
else: |
|
print(f"Instrument {i} ({instrument.name}): no notes found.") |
|
|
|
if all_velocities: |
|
print("\nOverall MIDI velocity stats:") |
|
print(f" Total notes: {len(all_velocities)}") |
|
print(f" Velocity min: {min(all_velocities)}") |
|
print(f" Velocity max: {max(all_velocities)}") |
|
print(f" Velocity mean: {np.mean(all_velocities):.2f}") |
|
else: |
|
print("No notes found in this MIDI.") |
|
|
|
|
|
def preview_sound_source(sound_source_name: str, *args): |
|
""" |
|
Generates a short audio preview for either a selected SoundFont or the |
|
8-bit Synthesizer, using the Super Mario Bros. theme as a test melody. |
|
|
|
This function acts as a router: |
|
- If a SoundFont is selected, it uses FluidSynth. |
|
- If the 8-bit Synthesizer is selected, it uses the internal `synthesize_8bit_style` |
|
function, capturing the current UI settings for an accurate preview. |
|
|
|
Args: |
|
sound_source_name (str): The name of the SoundFont or the 8-bit synth label. |
|
*args: Captures all current UI settings, which are passed to build an |
|
AppParameters object for the 8-bit synth preview. |
|
|
|
Returns: |
|
A Gradio-compatible audio tuple (sample_rate, numpy_array). |
|
""" |
|
srate = 44100 |
|
|
|
|
|
preview_midi = pretty_midi.PrettyMIDI() |
|
|
|
|
|
instrument = pretty_midi.Instrument(program=81, is_drum=False, name="Preview Lead") |
|
|
|
|
|
|
|
|
|
tempo = 200.0 |
|
time_per_step = 60.0 / tempo / 2 |
|
|
|
|
|
|
|
melody_data = [ |
|
(76, 1), (76, 2), (76, 2), (72, 1), (76, 2), |
|
(79, 4), (67, 4) |
|
] |
|
|
|
current_time = 0.0 |
|
for pitch, duration_steps in melody_data: |
|
start_time = current_time |
|
end_time = start_time + (duration_steps * time_per_step) |
|
|
|
|
|
note_end_time = end_time - 0.01 |
|
|
|
note = pretty_midi.Note( |
|
velocity=120, |
|
pitch=pitch, |
|
start=start_time, |
|
end=note_end_time |
|
) |
|
instrument.notes.append(note) |
|
current_time = end_time |
|
|
|
preview_midi.instruments.append(instrument) |
|
|
|
|
|
|
|
|
|
if sound_source_name == SYNTH_8_BIT_LABEL: |
|
print("Generating preview for: 8-bit Synthesizer") |
|
try: |
|
|
|
params = AppParameters(**dict(zip(ALL_PARAM_KEYS, args))) |
|
|
|
|
|
audio_waveform = synthesize_8bit_style(midi_data=preview_midi, fs=srate, params=params) |
|
|
|
|
|
peak_val = np.max(np.abs(audio_waveform)) |
|
if peak_val > 0: |
|
audio_waveform /= peak_val |
|
|
|
|
|
audio_out = (audio_waveform.T * 32767).astype(np.int16) |
|
|
|
print("8-bit preview generated successfully.") |
|
return (srate, audio_out) |
|
|
|
except Exception as e: |
|
print(f"An error occurred during 8-bit preview generation: {e}") |
|
return None |
|
|
|
|
|
else: |
|
soundfont_path = soundfonts_dict.get(sound_source_name) |
|
if not soundfont_path or not os.path.exists(soundfont_path): |
|
print(f"Preview failed: SoundFont file not found at '{soundfont_path}'") |
|
raise gr.Error(f"Could not find the SoundFont file for '{sound_source_name}'.") |
|
|
|
try: |
|
print(f"Generating preview for: {sound_source_name}") |
|
|
|
midi_io = io.BytesIO() |
|
preview_midi.write(midi_io) |
|
midi_data = midi_io.getvalue() |
|
|
|
|
|
|
|
audio_out = midi_to_colab_audio( |
|
midi_data, |
|
soundfont_path=soundfont_path, |
|
sample_rate=srate, |
|
output_for_gradio=True |
|
) |
|
|
|
|
|
|
|
|
|
if isinstance(audio_out, np.ndarray): |
|
print("SoundFont preview generated successfully.") |
|
return (srate, audio_out) |
|
else: |
|
|
|
|
|
print("Preview failed: Rendering function did not return valid audio data.") |
|
return None |
|
|
|
except Exception as e: |
|
|
|
print(f"An error occurred during SoundFont preview generation: {e}") |
|
|
|
|
|
return None |
|
|
|
|
|
def scale_instrument_velocity(instrument, scale=0.8): |
|
for note in instrument.notes: |
|
note.velocity = max(1, min(127, int(note.velocity * scale))) |
|
|
|
|
|
def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0): |
|
""" |
|
Normalizes the audio data to a target integrated loudness (LUFS). |
|
This provides more consistent perceived volume than peak normalization. |
|
|
|
Args: |
|
audio_data (np.ndarray): The audio signal. |
|
sample_rate (int): The sample rate of the audio. |
|
target_lufs (float): The target loudness in LUFS. Defaults to -23.0, |
|
a common standard for broadcast. |
|
|
|
Returns: |
|
np.ndarray: The loudness-normalized audio data. |
|
""" |
|
try: |
|
|
|
meter = pyln.Meter(sample_rate) |
|
loudness = meter.integrated_loudness(audio_data) |
|
|
|
|
|
|
|
loudness_gain_db = target_lufs - loudness |
|
loudness_gain_linear = 10.0 ** (loudness_gain_db / 20.0) |
|
|
|
|
|
normalized_audio = audio_data * loudness_gain_linear |
|
|
|
|
|
|
|
peak_val = np.max(np.abs(normalized_audio)) |
|
if peak_val > 1.0: |
|
normalized_audio /= peak_val |
|
print(f"Warning: Loudness normalization resulted in clipping. Audio was peak-normalized as a safeguard.") |
|
|
|
print(f"Audio normalized from {loudness:.2f} LUFS to target {target_lufs} LUFS.") |
|
return normalized_audio |
|
|
|
except Exception as e: |
|
print(f"Loudness normalization failed: {e}. Falling back to original audio.") |
|
return audio_data |
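
# Gain math used above: a track measured at -30 LUFS normalized to -23 LUFS
# needs +7 dB, i.e. a linear gain of 10 ** (7 / 20), about 2.24.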
|
|
|
|
|
|
|
|
|
|
|
def merge_midis(midi_path_left: str, midi_path_right: str, output_path: str): |
|
""" |
|
Merges two MIDI files into a single MIDI file. This robust version iterates |
|
through ALL instruments in both MIDI files, ensuring no data is lost if the |
|
source files are multi-instrumental. |
|
|
|
It applies hard-left panning (Pan=0) to every instrument from the left MIDI |
|
and hard-right panning (Pan=127) to every instrument from the right MIDI. |
|
""" |
|
try: |
|
analyze_midi_velocity(midi_path_left) |
|
analyze_midi_velocity(midi_path_right) |
|
midi_left = pretty_midi.PrettyMIDI(midi_path_left) |
|
midi_right = pretty_midi.PrettyMIDI(midi_path_right) |
|
|
|
merged_midi = pretty_midi.PrettyMIDI() |
|
|
|
|
|
if midi_left.instruments: |
|
print(f"Found {len(midi_left.instruments)} instrument(s) in the left channel MIDI.") |
|
|
|
for instrument in midi_left.instruments: |
|
scale_instrument_velocity(instrument, scale=0.8) |
|
|
|
instrument.name = f"Left - {instrument.name if instrument.name else 'Instrument'}" |
|
|
|
|
|
|
|
|
|
|
|
pan_left = pretty_midi.ControlChange(number=10, value=0, time=0.0) |
|
|
|
instrument.control_changes.insert(0, pan_left) |
|
|
|
|
|
merged_midi.instruments.append(instrument) |
|
|
|
|
|
if midi_right.instruments: |
|
print(f"Found {len(midi_right.instruments)} instrument(s) in the right channel MIDI.") |
|
|
|
for instrument in midi_right.instruments: |
|
scale_instrument_velocity(instrument, scale=0.8) |
|
instrument.name = f"Right - {instrument.name if instrument.name else 'Instrument'}" |
|
|
|
|
|
|
|
|
|
|
|
pan_right = pretty_midi.ControlChange(number=10, value=127, time=0.0) |
|
instrument.control_changes.insert(0, pan_right) |
|
|
|
merged_midi.instruments.append(instrument) |
|
|
|
merged_midi.write(output_path) |
|
print(f"Successfully merged all instruments and panned into '{os.path.basename(output_path)}'") |
|
analyze_midi_velocity(output_path) |
|
return output_path |
|
|
|
except Exception as e: |
|
print(f"Error merging MIDI files: {e}") |
|
|
|
if os.path.exists(midi_path_left): |
|
print("Fallback: Using only the left channel MIDI.") |
|
return midi_path_left |
|
return None |
|
|
|
|
|
def is_stereo_midi(midi_path: str) -> bool: |
|
""" |
|
Checks if a MIDI file contains the specific stereo panning control changes |
|
(hard left and hard right) created by the merge_midis function. |
|
|
|
Args: |
|
midi_path (str): The file path to the MIDI file. |
|
|
|
Returns: |
|
bool: True if both hard-left (0) and hard-right (127) pan controls are found, False otherwise. |
|
""" |
|
try: |
|
midi_data = pretty_midi.PrettyMIDI(midi_path) |
|
|
|
found_left_pan = False |
|
found_right_pan = False |
|
|
|
for instrument in midi_data.instruments: |
|
for control_change in instrument.control_changes: |
|
|
|
if control_change.number == 10: |
|
if control_change.value == 0: |
|
found_left_pan = True |
|
elif control_change.value == 127: |
|
found_right_pan = True |
|
|
|
|
|
if found_left_pan and found_right_pan: |
|
return True |
|
|
|
        # Falling out of the loop means both pan extremes were never found.
        return False
|
|
|
except Exception as e: |
|
|
|
print(f"Could not analyze MIDI for stereo info: {e}") |
|
return False |
|
|
|
|
|
|
|
|
|
|
|
|
|
def TranscribePianoAudio(input_file): |
|
""" |
|
Transcribes a WAV or MP3 audio file of a SOLO PIANO performance into a MIDI file. |
|
This uses the ByteDance model. |
|
Args: |
|
        input_file (str): The path to the input audio file.
|
Returns: |
|
str: The file path of the generated MIDI file. |
|
""" |
|
print('=' * 70) |
|
print('STAGE 1: Starting Piano-Specific Transcription') |
|
print('=' * 70) |
|
|
|
|
|
fn = os.path.basename(input_file) |
|
    fn1 = os.path.splitext(fn)[0]
|
|
|
|
|
output_dir = os.path.join("output", "transcribed_piano_") |
|
out_mid_path = os.path.join(output_dir, fn1 + '.mid') |
|
|
|
|
|
if not os.path.exists(output_dir): |
|
os.makedirs(output_dir) |
|
|
|
print('-' * 70) |
|
print(f'Input file name: {fn}') |
|
print(f'Output MIDI path: {out_mid_path}') |
|
print('-' * 70) |
|
|
|
|
|
print('Loading audio...') |
|
(audio, _) = utilities.load_audio(input_file, sr=transcription_sample_rate, mono=True) |
|
print('Audio loaded successfully.') |
|
print('-' * 70) |
|
|
|
|
|
|
|
device = 'cuda' if torch.cuda.is_available() else 'cpu' |
|
print(f'Loading transcriptor model... device= {device}') |
|
transcriptor = PianoTranscription(device=device, checkpoint_path="src/models/CRNN_note_F1=0.9677_pedal_F1=0.9186.pth") |
|
print('Transcriptor loaded.') |
|
print('-' * 70) |
|
|
|
|
|
print('Transcribing audio to MIDI (Piano-Specific)...') |
|
|
|
transcriptor.transcribe(audio, out_mid_path) |
|
print('Piano transcription complete.') |
|
print('=' * 70) |
|
|
|
|
|
return out_mid_path |
|
|
|
def TranscribeGeneralAudio(input_file, **kwargs): |
|
""" |
|
Transcribes a general audio file into a MIDI file using basic-pitch. |
|
This is suitable for various instruments and vocals. |
|
""" |
|
print('=' * 70) |
|
print('STAGE 1: Starting General Purpose Transcription') |
|
print('=' * 70) |
|
|
|
fn = os.path.basename(input_file) |
|
    fn1 = os.path.splitext(fn)[0]
|
output_dir = os.path.join("output", "transcribed_general_") |
|
out_mid_path = os.path.join(output_dir, fn1 + '.mid') |
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
print(f'Input file: {fn}\nOutput MIDI: {out_mid_path}') |
|
|
|
|
|
print('Transcribing audio to MIDI (General Purpose)...') |
|
|
|
model_output, midi_data, note_events = basic_pitch.inference.predict( |
|
audio_path=input_file, |
|
model_or_model_path=ICASSP_2022_MODEL_PATH, |
|
**kwargs |
|
) |
|
|
|
|
|
midi_data.write(out_mid_path) |
|
print('General transcription complete.') |
|
print('=' * 70) |
|
|
|
return out_mid_path |
|
|
|
|
|
|
|
|
|
|
|
def Render_MIDI(*, input_midi_path: str, params: AppParameters, progress: gr.Progress = None): |
|
""" |
|
Processes and renders a MIDI file according to user-defined settings. |
|
Can render using SoundFonts or a custom 8-bit synthesizer. |
|
|
|
This version supports a parallel arpeggiator workflow, where the original MIDI |
|
and an arpeggiated version are synthesized separately and then mixed together. |
|
|
|
Args: |
|
input_midi_path (str): The path to the input MIDI file. |
|
All other arguments are rendering options from the Gradio UI. |
|
Returns: |
|
A tuple containing all the output elements for the Gradio UI. |
|
""" |
|
print('*' * 70) |
|
print('STAGE 2: Starting MIDI Rendering') |
|
print('*' * 70) |
|
|
|
|
|
fn = os.path.basename(input_midi_path) |
|
    fn1 = os.path.splitext(fn)[0]
|
|
|
|
|
output_dir = os.path.join("output", "rendered_midi") |
|
if not os.path.exists(output_dir): |
|
os.makedirs(output_dir) |
|
|
|
|
|
new_fn_path = os.path.join(output_dir, fn1 + '_rendered.mid') |
|
|
|
try: |
|
with open(input_midi_path, 'rb') as f: |
|
fdata = f.read() |
|
input_midi_md5hash = hashlib.md5(fdata).hexdigest() |
|
except FileNotFoundError: |
|
|
|
print(f"Error: Input MIDI file not found at {input_midi_path}") |
|
return [None] * 7 |
|
|
|
print('=' * 70) |
|
print('Requested settings:') |
|
print(f'Input MIDI file name: {fn}') |
|
print(f'Input MIDI md5 hash: {input_midi_md5hash}') |
|
print('-' * 70) |
|
print(f"Render type: {params.render_type}") |
|
print(f"Soundfont bank: {params.soundfont_bank}") |
|
print(f"Audio render sample rate: {params.render_sample_rate}") |
|
|
|
print('=' * 70) |
|
|
|
|
|
print('Processing MIDI... Please wait...') |
|
raw_score = MIDI.midi2single_track_ms_score(fdata) |
|
|
|
processed_scores = TMIDIX.advanced_score_processor(raw_score, |
|
return_enhanced_score_notes=True, |
|
apply_sustain=params.render_with_sustains) |
|
|
|
|
|
if not processed_scores: |
|
|
|
print("Warning: MIDI file contains no processable notes.") |
|
|
|
|
|
return ("N/A", fn1, "MIDI file contains no notes.", None, None, None, "No notes found.") |
|
|
|
|
|
escore = processed_scores[0] |
|
|
|
|
|
if not escore: |
|
print("Warning: MIDI file contains no processable notes.") |
|
return ("N/A", fn1, "MIDI file contains no notes.",None, None, None, "No notes found.") |
|
|
|
|
|
if params.merge_misaligned_notes > 0: |
|
escore = TMIDIX.merge_escore_notes(escore, merge_threshold=params.merge_misaligned_notes) |
|
|
|
escore = TMIDIX.augment_enhanced_score_notes(escore, timings_divider=1) |
|
|
|
first_note_index = [e[0] for e in raw_score[1]].index('note') |
|
cscore = TMIDIX.chordify_score([1000, escore]) |
|
|
|
meta_data = raw_score[1][:first_note_index] + [escore[0]] + [escore[-1]] + [raw_score[1][-1]] |
|
|
|
aux_escore_notes = TMIDIX.augment_enhanced_score_notes(escore, sort_drums_last=True) |
|
song_description = TMIDIX.escore_notes_to_text_description(aux_escore_notes) |
|
|
|
print('Done!') |
|
print('=' * 70) |
|
print('Input MIDI metadata:', meta_data[:5]) |
|
print('=' * 70) |
|
print('Input MIDI song description:', song_description) |
|
print('=' * 70) |
|
print('Processing...Please wait...') |
|
|
|
|
|
output_score = copy.deepcopy(escore) |
|
|
|
|
|
if params.render_type == "Extract melody": |
|
output_score = TMIDIX.add_melody_to_enhanced_score_notes(escore, return_melody=True) |
|
output_score = TMIDIX.recalculate_score_timings(output_score) |
|
elif params.render_type == "Flip": |
|
output_score = TMIDIX.flip_enhanced_score_notes(escore) |
|
elif params.render_type == "Reverse": |
|
output_score = TMIDIX.reverse_enhanced_score_notes(escore) |
|
elif params.render_type == 'Repair Durations': |
|
output_score = TMIDIX.fix_escore_notes_durations(escore, min_notes_gap=0) |
|
elif params.render_type == 'Repair Chords': |
|
fixed_cscore = TMIDIX.advanced_check_and_fix_chords_in_chordified_score(cscore)[0] |
|
output_score = TMIDIX.flatten(fixed_cscore) |
|
elif params.render_type == 'Remove Duplicate Pitches': |
|
output_score = TMIDIX.remove_duplicate_pitches_from_escore_notes(escore) |
|
elif params.render_type == "Add Drum Track": |
|
nd_escore = [e for e in escore if e[3] != 9] |
|
nd_escore = TMIDIX.augment_enhanced_score_notes(nd_escore) |
|
output_score = TMIDIX.advanced_add_drums_to_escore_notes(nd_escore) |
|
|
|
for e in output_score: |
|
e[1] *= 16 |
|
e[2] *= 16 |
|
|
|
print('MIDI processing complete.') |
|
print('=' * 70) |
|
|
|
|
|
if params.render_type != "Render as-is": |
|
print('Applying final adjustments (transpose, align, patch)...') |
|
if params.custom_render_patch != -1: |
|
for e in output_score: |
|
if e[3] != 9: |
|
e[6] = params.custom_render_patch |
|
|
|
if params.render_transpose_value != 0: |
|
output_score = TMIDIX.transpose_escore_notes(output_score, params.render_transpose_value) |
|
|
|
if params.render_transpose_to_C4: |
|
output_score = TMIDIX.transpose_escore_notes_to_pitch(output_score, 60) |
|
|
|
if params.render_align == "Start Times": |
|
output_score = TMIDIX.recalculate_score_timings(output_score) |
|
output_score = TMIDIX.align_escore_notes_to_bars(output_score) |
|
|
|
elif params.render_align == "Start Times and Durations": |
|
output_score = TMIDIX.recalculate_score_timings(output_score) |
|
output_score = TMIDIX.align_escore_notes_to_bars(output_score, trim_durations=True) |
|
|
|
elif params.render_align == "Start Times and Split Durations": |
|
output_score = TMIDIX.recalculate_score_timings(output_score) |
|
output_score = TMIDIX.align_escore_notes_to_bars(output_score, split_durations=True) |
|
|
|
if params.render_type == "Longest Repeating Phrase": |
|
zscore = TMIDIX.recalculate_score_timings(output_score) |
|
lrno_score = TMIDIX.escore_notes_lrno_pattern_fast(zscore) |
|
|
|
if lrno_score is not None: |
|
output_score = lrno_score |
|
|
|
else: |
|
output_score = TMIDIX.recalculate_score_timings(TMIDIX.escore_notes_middle(output_score, 50)) |
|
|
|
if params.render_type == "Multi-Instrumental Summary": |
|
zscore = TMIDIX.recalculate_score_timings(output_score) |
|
c_escore_notes = TMIDIX.compress_patches_in_escore_notes_chords(zscore) |
|
|
|
if len(c_escore_notes) > 128: |
|
cmatrix = TMIDIX.escore_notes_to_image_matrix(c_escore_notes, filter_out_zero_rows=True, filter_out_duplicate_rows=True) |
|
smatrix = TPLOTS.square_image_matrix(cmatrix, num_pca_components=max(1, min(5, len(c_escore_notes) // 128))) |
|
output_score = TMIDIX.image_matrix_to_original_escore_notes(smatrix) |
|
|
|
for o in output_score: |
|
o[1] *= 250 |
|
o[2] *= 250 |
|
|
|
if params.render_output_as_solo_piano: |
|
output_score = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=(not params.render_remove_drums)) |
|
|
|
if params.render_remove_drums and not params.render_output_as_solo_piano: |
|
output_score = TMIDIX.strip_drums_from_escore_notes(output_score) |
|
|
|
if params.render_type == "Solo Piano Summary": |
|
sp_escore_notes = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=False) |
|
zscore = TMIDIX.recalculate_score_timings(sp_escore_notes) |
|
|
|
if len(zscore) > 128: |
|
|
|
bmatrix = TMIDIX.escore_notes_to_binary_matrix(zscore) |
|
cmatrix = TMIDIX.compress_binary_matrix(bmatrix, only_compress_zeros=True) |
|
smatrix = TPLOTS.square_binary_matrix(cmatrix, interpolation_order=max(1, min(5, len(zscore) // 128))) |
|
output_score = TMIDIX.binary_matrix_to_original_escore_notes(smatrix) |
|
|
|
for o in output_score: |
|
o[1] *= 200 |
|
o[2] *= 200 |
|
|
|
print('Final adjustments complete.') |
|
print('=' * 70) |
|
|
|
|
|
|
|
SONG, patches, _ = TMIDIX.patch_enhanced_score_notes(output_score) |
|
|
|
|
|
|
|
path_without_ext = new_fn_path.rsplit('.mid', 1)[0] |
|
|
|
MIDI.Tegridy_ms_SONG_to_MIDI_Converter(SONG, |
|
output_signature = 'Integrated-MIDI-Processor', |
|
output_file_name = path_without_ext, |
|
track_name='Processed Track', |
|
list_of_MIDI_patches=patches |
|
) |
|
midi_to_render_path = new_fn_path |
|
else: |
|
|
|
with open(new_fn_path, 'wb') as f: |
|
f.write(fdata) |
|
midi_to_render_path = new_fn_path |
|
|
|
|
|
print('Rendering final audio...') |
|
|
|
|
|
srate = int(params.render_sample_rate) |
|
|
|
|
|
|
|
if params.soundfont_bank == SYNTH_8_BIT_LABEL: |
|
print("Using 8-bit style synthesizer with parallel processing workflow...") |
|
try: |
|
|
|
base_midi = pretty_midi.PrettyMIDI(midi_to_render_path) |
|
|
|
if getattr(params, 's8bit_enable_midi_preprocessing', False): |
|
base_midi = preprocess_midi_for_harshness(base_midi, params) |
|
|
|
|
|
|
|
|
|
if getattr(params, 's8bit_enable_delay', False): |
|
base_midi = create_delay_effect(base_midi, params) |
|
|
|
|
|
|
|
|
|
arpeggiated_midi = None |
|
if getattr(params, 's8bit_enable_arpeggiator', False): |
|
|
|
arpeggiated_midi = arpeggiate_midi(base_midi, params) |
|
|
|
|
|
print(" - Rendering main synthesis layer (including echoes)...") |
|
|
|
main_and_echo_waveform = synthesize_8bit_style( |
|
midi_data=base_midi, |
|
fs=srate, |
|
params=params, |
|
progress=progress |
|
) |
|
|
|
|
|
echo_instrument = None |
|
for inst in base_midi.instruments: |
|
if inst.name == "Echo Layer": |
|
echo_instrument = inst |
|
break |
|
|
|
|
|
if echo_instrument: |
|
print(" - Processing echo layer audio effects...") |
|
|
|
echo_only_midi = pretty_midi.PrettyMIDI() |
|
echo_only_midi.instruments.append(echo_instrument) |
|
|
|
|
|
echo_waveform_raw = synthesize_8bit_style(midi_data=echo_only_midi, fs=srate, params=params) |
|
|
|
|
|
|
|
unfiltered_echo = echo_waveform_raw |
|
filtered_echo = echo_waveform_raw |
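# The main render above already contains the raw echo. To EQ only the echo, the raw
# echo is subtracted from the mix further down and the filtered copy is added back.
# The filter helpers appear to operate on (samples, channels) arrays, hence the transposes.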
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
temp_filtered_echo = echo_waveform_raw.T |
|
|
|
should_filter = False |
|
|
|
if params.s8bit_delay_highpass_cutoff_hz > 0: |
|
print(f" - Applying high-pass filter at {params.s8bit_delay_highpass_cutoff_hz} Hz...") |
|
temp_filtered_echo = apply_butter_highpass_filter(temp_filtered_echo, params.s8bit_delay_highpass_cutoff_hz, srate) |
|
should_filter = True |
|
|
|
|
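# A low-pass cutoff at or above the Nyquist frequency (srate / 2) would be a no-op, so it is skipped.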
|
if params.s8bit_delay_lowpass_cutoff_hz < srate / 2: |
|
print(f" - Applying low-pass filter at {params.s8bit_delay_lowpass_cutoff_hz} Hz...") |
|
temp_filtered_echo = apply_butter_lowpass_filter(temp_filtered_echo, params.s8bit_delay_lowpass_cutoff_hz, srate) |
|
should_filter = True |
|
|
|
|
|
if should_filter: |
|
filtered_echo = temp_filtered_echo.T |
|
|
|
|
|
|
|
target_length = main_and_echo_waveform.shape[1] |
|
|
|
|
|
len_unfiltered = unfiltered_echo.shape[1] |
|
if len_unfiltered < target_length: |
|
unfiltered_echo = np.pad(unfiltered_echo, ((0, 0), (0, target_length - len_unfiltered))) |
|
|
|
|
|
len_filtered = filtered_echo.shape[1] |
|
if len_filtered < target_length: |
|
filtered_echo = np.pad(filtered_echo, ((0, 0), (0, target_length - len_filtered))) |
|
|
|
|
|
main_and_echo_waveform -= unfiltered_echo[:, :target_length] |
|
main_and_echo_waveform += filtered_echo[:, :target_length] |
|
|
|
final_waveform = main_and_echo_waveform |
|
|
|
|
|
if arpeggiated_midi and arpeggiated_midi.instruments: |
|
print(" - Rendering and mixing arpeggiator layer...") |
|
|
|
arp_params = copy.copy(params) |
|
|
|
|
|
|
|
setattr(arp_params, '_temp_panning_override', params.s8bit_arpeggio_panning) |
|
|
|
arpeggiated_waveform = synthesize_8bit_style( |
|
midi_data=arpeggiated_midi, |
|
fs=srate, |
|
params=arp_params, |
|
progress=None |
|
) |
|
|
|
|
|
|
|
len_main = final_waveform.shape[1] |
|
len_arp = arpeggiated_waveform.shape[1] |
|
if len_arp > len_main: |
|
final_waveform = np.pad(final_waveform, ((0, 0), (0, len_arp - len_main))) |
|
elif len_main > len_arp: |
|
arpeggiated_waveform = np.pad(arpeggiated_waveform, ((0, 0), (0, len_main - len_arp))) |
|
|
|
final_waveform += arpeggiated_waveform |
|
|
|
|
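# Peak-normalize the final mix, then convert to 16-bit PCM for playback.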
|
peak_val = np.max(np.abs(final_waveform)) |
|
if peak_val > 0: |
|
final_waveform /= peak_val |
|
|
|
audio_out = (final_waveform.T * 32767).astype(np.int16) |
|
except Exception as e: |
|
print(f"Error during 8-bit synthesis: {e}") |
|
return [None] * 7 |
|
|
|
else: |
|
print(f"Using SoundFont: {params.soundfont_bank}") |
|
|
|
soundfont_path = soundfonts_dict.get(params.soundfont_bank) |
|
|
|
|
|
if not soundfont_path or not os.path.exists(soundfont_path): |
|
|
|
raise gr.Error(f"SoundFont file '{params.soundfont_bank}' could not be found. Please check your 'src/sf2' directory or select another SoundFont.") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with open(midi_to_render_path, 'rb') as f: |
|
midi_file_content = f.read() |
|
|
|
audio_out = midi_to_colab_audio(midi_file_content, |
|
soundfont_path=soundfont_path, |
|
sample_rate=srate, |
|
output_for_gradio=True |
|
) |
|
|
|
print('Audio rendering complete.') |
|
print('=' * 70) |
|
|
|
|
|
with open(midi_to_render_path, 'rb') as f: |
|
new_md5_hash = hashlib.md5(f.read()).hexdigest() |
|
output_plot = TPLOTS.plot_ms_SONG(output_score, plot_title=f"Score of {fn1}", return_plt=True) |
|
|
|
output_midi_summary = str(meta_data) |
|
|
|
return new_md5_hash, fn1, output_midi_summary, midi_to_render_path, (srate, audio_out), output_plot, song_description |
|
|
|
|
|
def analyze_midi_features(midi_data): |
|
""" |
|
Analyzes a PrettyMIDI object to extract musical features for parameter recommendation. |
|
|
|
Args: |
|
midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze. |
|
|
|
Returns: |
|
dict or None: A dictionary containing features, or None if the MIDI is empty. |
|
Features: 'note_count', 'instruments_count', 'duration', 'note_density',

'avg_velocity', 'pitch_range', 'avg_pitch', 'avg_note_length'.
|
""" |
|
all_notes = [note for instrument in midi_data.instruments for note in instrument.notes] |
|
note_count = len(all_notes) |
|
|
|
|
|
if note_count == 0: |
|
return None |
|
|
|
duration = midi_data.get_end_time() |
|
|
|
if duration == 0: |
|
note_density = 0 |
|
else: |
|
note_density = note_count / duration |
|
|
|
|
|
avg_velocity = sum(note.velocity for note in all_notes) / note_count |
|
avg_pitch = sum(note.pitch for note in all_notes) / note_count |
|
avg_note_length = sum(note.end - note.start for note in all_notes) / note_count |
|
|
|
|
|
if note_count > 1: |
|
min_pitch = min(note.pitch for note in all_notes) |
|
max_pitch = max(note.pitch for note in all_notes) |
|
pitch_range = max_pitch - min_pitch |
|
else: |
|
pitch_range = 0 |
|
|
|
return { |
|
'note_count': note_count, |
|
'instruments_count': len(midi_data.instruments), |
|
'duration': duration, |
|
'note_density': note_density, |
|
'avg_velocity': avg_velocity, |
|
'pitch_range': pitch_range, |
|
'avg_pitch': avg_pitch, |
|
'avg_note_length': avg_note_length, |
|
} |
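# Note: note_density is note_count / duration (notes per second); e.g. a 120-note,
# 60-second clip yields note_density = 2.0.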
|
|
|
def determine_waveform_type(features): |
|
""" |
|
Determines the best waveform type based on analyzed MIDI features. |
|
- Square: Best for most general-purpose, bright melodies. |
|
- Sawtooth: Best for intense, heavy, or powerful leads and basses. |
|
- Triangle: Best for soft, gentle basses or flute-like sounds. |
|
|
|
Args: |
|
features (dict): The dictionary of features from analyze_midi_features. |
|
|
|
Returns: |
|
str: The recommended waveform type ('Square', 'Sawtooth', or 'Triangle'). |
|
""" |
|
|
|
|
|
|
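# Low average pitch, longish notes, and a narrow range suggest a bass or pad line -> Triangle.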
|
if features['avg_pitch'] <= 52 and features['avg_note_length'] >= 0.3 and features['pitch_range'] < 12: |
|
return "Triangle" |
|
|
|
|
|
|
|
|
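# High note density or a wide pitch range suggests an intense lead -> Sawtooth.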
|
if features['note_density'] >= 6 or features['pitch_range'] >= 18: |
|
return "Sawtooth" |
|
|
|
|
|
return "Square" |
|
|
|
def recommend_8bit_params(midi_data, default_preset): |
|
""" |
|
Recommends 8-bit synthesizer parameters using a unified, factor-based model. |
|
This "AI" generates a sound profile based on normalized musical features. |
|
|
|
Args: |
|
midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze. |
|
default_preset (dict): A fallback preset if analysis fails. |
|
|
|
Returns: |
|
dict: A dictionary of recommended synthesizer parameters. |
|
""" |
|
features = analyze_midi_features(midi_data) |
|
if features is None: |
|
|
|
return default_preset |
|
|
|
|
|
params = {} |
|
|
|
|
|
|
|
params['waveform_type'] = determine_waveform_type(features) |
|
|
|
|
|
if params['waveform_type'] == 'Square': |
|
|
|
|
|
|
|
params['pulse_width'] = 0.3 if features['pitch_range'] > 30 else 0.5 |
|
else: |
|
|
|
params['pulse_width'] = 0.5 |
|
|
|
|
|
|
|
is_plucky = features['note_density'] > 10 |
|
params['envelope_type'] = 'Plucky (AD Envelope)' if is_plucky else 'Sustained (Full Decay)' |
|
params['decay_time_s'] = 0.15 if is_plucky else 0.4 |
|
|
|
|
|
|
|
params['vibrato_depth'] = min(max((features['avg_velocity'] - 60) / 20, 0), 10) |
|
if features['note_density'] > 12: |
|
params['vibrato_rate'] = 7.0 |
|
elif features['note_density'] > 6: |
|
params['vibrato_rate'] = 5.0 |
|
else: |
|
params['vibrato_rate'] = 3.0 |
|
|
|
|
|
|
|
|
|
|
|
params['smooth_notes_level'] = min(max((features['note_density'] - 3) / 5.0, 0.0), 1.0) |
|
|
|
|
|
|
|
params['continuous_vibrato_level'] = 1.0 - min(max((features['note_density'] - 5) / 5.0, 0.0), 1.0) |
|
|
|
|
|
|
|
params['noise_level'] = min(max((features['avg_velocity'] - 50) / 40.0, 0.0), 1.0) * 0.1 |
|
|
|
|
|
|
|
if features['avg_note_length'] < 0.25: |
|
params['distortion_level'] = 0.1 |
|
elif features['avg_note_length'] < 0.5: |
|
params['distortion_level'] = 0.05 |
|
else: |
|
params['distortion_level'] = 0.0 |
|
|
|
|
|
|
|
density_factor = min(max((features['note_density'] - 5) / 15, 0), 1) |
|
range_factor = min(max((features['pitch_range'] - 15) / 30, 0), 1) |
|
|
|
|
|
complexity_factor = (density_factor + range_factor) / 2 |
|
params['fm_modulation_depth'] = round(0.3 * complexity_factor, 3) |
|
params['fm_modulation_rate'] = round(200 * complexity_factor, 1) |
|
|
|
|
|
|
|
|
|
|
|
|
|
params['bass_boost_level'] = max(0.2, 1.0 - (features['instruments_count'] - 1) * 0.15) |
|
|
|
|
|
for key, value in params.items(): |
|
if isinstance(value, float): |
|
params[key] = round(value, 3) |
|
|
|
return params |
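# Minimal usage sketch (illustrative; `pm` is any pretty_midi.PrettyMIDI object):
#   rec = recommend_8bit_params(pm, S8BIT_PRESETS[FALLBACK_PRESET_NAME])
# The returned dict maps plain keys such as 'waveform_type' to values; the pipeline
# applies them to AppParameters via setattr(params, f"s8bit_{key}", value).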
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: AppParameters): |
|
""" |
|
Takes a single audio file path and runs the full transcription pipeline on it. |
|
This includes stereo/mono handling and normalization. |
|
Returns: |
|
A tuple containing: |
|
- The file path of the resulting transcribed MIDI. |
|
- The dictionary of the final basic_pitch parameters that were actually used. |
|
""" |
|
print(f"\n--- Transcribing Stem: {os.path.basename(audio_path)} ---") |
|
|
|
|
|
audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False) |
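# sr=None preserves the file's native sample rate; mono=False keeps the channel layout,
# so stereo inputs arrive as a (2, n_samples) array.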
|
|
|
|
|
final_bp_params = { |
|
"onset_threshold": params.onset_threshold, |
|
"frame_threshold": params.frame_threshold, |
|
"minimum_note_length": params.minimum_note_length, |
|
"minimum_frequency": params.minimum_frequency, |
|
"maximum_frequency": params.maximum_frequency, |
|
"infer_onsets": params.infer_onsets, |
|
"melodia_trick": params.melodia_trick, |
|
"multiple_pitch_bends": params.multiple_pitch_bends, |
|
} |
|
|
|
|
|
if params.transcription_method == "General Purpose" and params.basic_pitch_preset_selector == "Auto-Analyze Audio": |
|
adaptive_params = analyze_audio_for_adaptive_params(audio_data, native_sample_rate) |
|
|
|
final_bp_params.update(adaptive_params) |
|
print(f" - Overriding manual settings with auto-analyzed parameters. final_bp_params: {final_bp_params}") |
|
|
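# Stereo path: loudness-normalize and transcribe each channel independently, then merge the two MIDIs.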
|
if params.enable_stereo_processing and audio_data.ndim == 2 and audio_data.shape[0] == 2: |
|
print("Stereo processing enabled for stem.") |
|
left_channel_np = audio_data[0] |
|
right_channel_np = audio_data[1] |
|
|
|
normalized_left = normalize_loudness(left_channel_np, native_sample_rate) |
|
normalized_right = normalize_loudness(right_channel_np, native_sample_rate) |
|
|
|
temp_left_path = os.path.join(temp_dir, f"{base_name}_left.flac") |
|
temp_right_path = os.path.join(temp_dir, f"{base_name}_right.flac") |
|
|
|
sf.write(temp_left_path, normalized_left, native_sample_rate) |
|
sf.write(temp_right_path, normalized_right, native_sample_rate) |
|
|
|
print(f"Saved left channel to: {temp_left_path}") |
|
print(f"Saved right channel to: {temp_right_path}") |
|
|
|
print("Transcribing left and right channel...") |
|
if params.transcription_method == "General Purpose": |
|
midi_path_left = TranscribeGeneralAudio(temp_left_path, **final_bp_params) |
|
midi_path_right = TranscribeGeneralAudio(temp_right_path, **final_bp_params) |
|
else: |
|
midi_path_left = TranscribePianoAudio(temp_left_path) |
|
midi_path_right = TranscribePianoAudio(temp_right_path) |
|
|
|
if midi_path_left and midi_path_right: |
|
merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid") |
|
return merge_midis(midi_path_left, midi_path_right, merged_midi_path), final_bp_params |
|
elif midi_path_left: |
|
print("Warning: Right channel transcription failed. Using left channel only.") |
|
return midi_path_left, final_bp_params |
|
elif midi_path_right: |
|
print("Warning: Left channel transcription failed. Using right channel only.") |
|
return midi_path_right, final_bp_params |
|
else: |
|
print(f"Warning: Stereo transcription failed for stem {base_name}.") |
|
return None, {} |
|
else: |
|
print("Mono processing for stem.") |
|
mono_signal_np = np.mean(audio_data, axis=0) if audio_data.ndim > 1 else audio_data |
|
normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate) |
|
temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.flac") |
|
sf.write(temp_mono_path, normalized_mono, native_sample_rate) |
|
|
|
if params.transcription_method == "General Purpose": |
|
return TranscribeGeneralAudio(temp_mono_path, **final_bp_params), final_bp_params |
|
else: |
|
|
|
return TranscribePianoAudio(temp_mono_path), {} |
|
|
|
|
|
|
|
def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters, progress: gr.Progress = None): |
|
""" |
|
This is the main processing engine. It takes a file path and a dictionary of all settings, |
|
and performs the full pipeline: load, separate, transcribe, render, re-merge. |
|
It is UI-agnostic and returns file paths and data, not Gradio updates. |
|
It now accepts a Gradio Progress object to report granular progress. |
|
""" |
|
|
|
def update_progress(fraction, desc): |
|
if progress: |
|
progress(fraction, desc=desc) |
|
|
|
|
|
file_start_time = reqtime.time() |
|
|
|
filename = os.path.basename(input_file_path) |
|
base_name = os.path.splitext(filename)[0] |
|
|
|
|
|
is_midi_input = filename.lower().endswith(('.mid', '.midi', '.kar')) |
|
|
|
update_progress(0, f"Starting: {filename}") |
|
print(f"\n{'='*20} Starting Pipeline for: {filename} {'='*20}") |
|
|
|
|
|
timestamped_base_name = f"{base_name}_{timestamp}" |
|
|
|
|
|
transcription_params_log = {} |
|
|
|
|
|
if is_midi_input: |
|
|
|
update_progress(0, "MIDI file detected, skipping transcription...") |
|
print("MIDI file detected. Skipping transcription. Proceeding directly to rendering.") |
|
|
|
if is_stereo_midi(input_file_path): |
|
print("\nINFO: Stereo pan information (Left/Right) detected in the input MIDI. It will be rendered in stereo.\n") |
|
|
|
midi_path_for_rendering = input_file_path |
|
else: |
|
temp_dir = "output/temp_transcribe" |
|
os.makedirs(temp_dir, exist_ok=True) |
|
|
|
|
|
update_progress(0.1, "Audio file detected, loading...") |
|
print("Audio file detected. Starting pre-processing...") |
|
|
|
try: |
|
|
|
|
|
print("Attempting to load audio with torchaudio...") |
|
audio_tensor, native_sample_rate = torchaudio.load(input_file_path) |
|
print("Torchaudio loading successful.") |
|
except Exception as e: |
|
update_progress(0.15, "Torchaudio failed, trying ffmpeg...") |
|
print(f"Torchaudio failed: {e}. Attempting fallback with ffmpeg...") |
|
try: |
|
|
|
converted_flac_path = os.path.join(temp_dir, f"{timestamped_base_name}_converted.flac") |
|
|
|
( |
|
ffmpeg |
|
.input(input_file_path) |
|
.output(converted_flac_path, acodec='flac') |
|
.overwrite_output() |
|
.run(capture_stdout=True, capture_stderr=True) |
|
) |
|
|
|
audio_tensor, native_sample_rate = torchaudio.load(converted_flac_path) |
|
print(f"FFmpeg fallback successful. Loaded from: {converted_flac_path}") |
|
except Exception as ffmpeg_err: |
|
|
|
stderr = ffmpeg_err.stderr.decode() if hasattr(ffmpeg_err, 'stderr') else str(ffmpeg_err) |
|
print(f"ERROR: Could not load {filename}. Skipping. FFmpeg error: {stderr}") |
|
return None |
|
|
|
|
|
|
|
separated_stems = {} |
|
|
|
if params.separate_vocals and demucs_model is not None: |
|
|
|
update_progress(0.2, "Separating audio with Demucs...") |
|
|
|
audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels) |
|
|
|
if torch.cuda.is_available(): |
|
audio_tensor = audio_tensor.cuda() |
|
|
|
print("Separating audio with Demucs... This may take some time.") |
|
|
|
with torch.no_grad(): |
|
all_stems = apply_model( |
|
demucs_model, |
|
audio_tensor[None], |
|
device='cuda' if torch.cuda.is_available() else 'cpu', |
|
progress=True |
|
)[0] |
|
|
|
|
|
if torch.cuda.is_available(): |
|
torch.cuda.empty_cache() |
|
print("CUDA cache cleared.") |
|
|
|
sources = {name: stem for name, stem in zip(demucs_model.sources, all_stems)} |
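# demucs_model.sources defines the stem order; for htdemucs_ft this is
# ['drums', 'bass', 'other', 'vocals'].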
|
|
|
|
|
for name, tensor in sources.items(): |
|
separated_stems[name] = (tensor.cpu(), demucs_model.samplerate) |
|
|
|
|
|
stems_to_transcribe = {} |
|
if params.enable_advanced_separation: |
|
|
|
if params.transcribe_vocals: |
|
stems_to_transcribe['vocals'] = sources['vocals'] |
|
if params.transcribe_drums: |
|
stems_to_transcribe['drums'] = sources['drums'] |
|
if params.transcribe_bass: |
|
stems_to_transcribe['bass'] = sources['bass'] |
|
if params.transcribe_other_or_accompaniment: |
|
stems_to_transcribe['other'] = sources['other'] |
|
else: |
|
|
|
accompaniment_tensor = sources['drums'] + sources['bass'] + sources['other'] |
|
if params.transcribe_vocals: |
|
stems_to_transcribe['vocals'] = sources['vocals'] |
|
if params.transcribe_other_or_accompaniment: |
|
stems_to_transcribe['accompaniment'] = accompaniment_tensor |
|
|
|
|
|
transcribed_midi_paths = [] |
|
if stems_to_transcribe: |
|
stem_count = len(stems_to_transcribe) |
|
for i, (name, tensor) in enumerate(stems_to_transcribe.items()): |
|
update_progress(0.3 + (0.3 * (i / stem_count)), f"Transcribing stem: {name}...") |
|
stem_path = os.path.join(temp_dir, f"{timestamped_base_name}_{name}.flac") |
|
torchaudio.save(stem_path, tensor.cpu(), demucs_model.samplerate) |
|
midi_path, used_bp_params = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params) |
|
if midi_path: |
|
transcribed_midi_paths.append((name, midi_path)) |
|
|
|
if used_bp_params: |
|
|
|
used_bp_params['preset_selector_mode'] = params.basic_pitch_preset_selector |
|
transcription_params_log[name] = used_bp_params |
|
|
|
|
|
if not transcribed_midi_paths: |
|
raise gr.Error("Separation was enabled, but no stems were selected for transcription, or transcription failed.") |
|
elif len(transcribed_midi_paths) == 1: |
|
midi_path_for_rendering = transcribed_midi_paths[0][1] |
|
else: |
|
update_progress(0.6, "Merging transcribed MIDIs...") |
|
merged_midi = pretty_midi.PrettyMIDI() |
|
for name, path in transcribed_midi_paths: |
|
try: |
|
midi_stem = pretty_midi.PrettyMIDI(path) |
|
for inst in midi_stem.instruments: |
|
inst.name = f"{name.capitalize()} - {inst.name}" |
|
merged_midi.instruments.append(inst) |
|
except Exception as e: |
|
print(f"Warning: Could not merge MIDI for stem {name}. Error: {e}") |
|
final_merged_midi_path = os.path.join(temp_dir, f"{timestamped_base_name}_full_transcription.mid") |
|
merged_midi.write(final_merged_midi_path) |
|
midi_path_for_rendering = final_merged_midi_path |
|
|
|
else: |
|
|
|
audio_to_transcribe_path = os.path.join(temp_dir, f"{timestamped_base_name}_original.flac") |
|
torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate) |
|
|
|
update_progress(0.2, "Transcribing audio to MIDI...") |
|
midi_path_for_rendering, used_bp_params = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params) |
|
|
|
|
|
if used_bp_params: |
|
used_bp_params['preset_selector_mode'] = params.basic_pitch_preset_selector |
|
|
|
transcription_params_log["full_mix"] = used_bp_params |
|
print(" - Logged transcription parameters for the full mix.") |
|
|
|
if not midi_path_for_rendering or not os.path.exists(midi_path_for_rendering): |
|
print(f"ERROR: Transcription failed for {filename}. Skipping.") |
|
return None |
|
|
|
|
|
|
|
update_progress(0.1 if is_midi_input else 0.6, "Applying MIDI transformations...") |
|
|
|
|
|
|
|
if params.s8bit_preset_selector == "Auto-Recommend (Analyze MIDI)": |
|
update_progress(0.15 if is_midi_input else 0.65, "Auto-recommending 8-bit parameters...") |
|
print("Auto-Recommendation is enabled. Analyzing MIDI features...") |
|
try: |
|
midi_to_analyze = pretty_midi.PrettyMIDI(midi_path_for_rendering) |
|
default_preset = S8BIT_PRESETS[FALLBACK_PRESET_NAME] |
|
recommended_params = recommend_8bit_params(midi_to_analyze, default_preset) |
|
|
|
print("Recommended parameters:", recommended_params) |
|
|
|
for key, value in recommended_params.items(): |
|
setattr(params, f"s8bit_{key}", value) |
|
print("Parameters updated with recommendations.") |
|
except Exception as e: |
|
print(f"Could not auto-recommend parameters for {filename}: {e}.") |
|
|
|
|
|
update_progress(0.2 if is_midi_input else 0.7, "Rendering MIDI to audio...") |
|
print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}") |
|
|
|
|
|
results_tuple = Render_MIDI(input_midi_path=midi_path_for_rendering, params=params, progress=progress) |
|
|
|
|
|
stems_to_merge = [] |
|
if params.separate_vocals: |
|
if params.merge_vocals_to_render and 'vocals' in separated_stems: |
|
stems_to_merge.append(separated_stems['vocals']) |
|
|
|
if params.enable_advanced_separation: |
|
if params.merge_drums_to_render and 'drums' in separated_stems: |
|
stems_to_merge.append(separated_stems['drums']) |
|
if params.merge_bass_to_render and 'bass' in separated_stems: |
|
stems_to_merge.append(separated_stems['bass']) |
|
if params.merge_other_or_accompaniment and 'other' in separated_stems: |
|
stems_to_merge.append(separated_stems['other']) |
|
else: |
|
if params.merge_other_or_accompaniment and all(k in separated_stems for k in ('drums', 'bass', 'other')):

accompaniment_tensor = separated_stems['drums'][0] + separated_stems['bass'][0] + separated_stems['other'][0]

stems_to_merge.append((accompaniment_tensor, demucs_model.samplerate))
|
|
|
if stems_to_merge: |
|
update_progress(0.9, "Re-merging audio stems...") |
|
rendered_srate, rendered_music_int16 = results_tuple[4] |
|
rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0 |
|
final_mix_tensor = torch.from_numpy(rendered_music_float).T |
|
final_srate = rendered_srate |
|
|
|
for stem_tensor, stem_srate in stems_to_merge: |
|
|
|
if stem_srate != final_srate: |
|
|
|
resampler = torchaudio.transforms.Resample(stem_srate, final_srate) |
|
stem_tensor = resampler(stem_tensor) |
|
|
|
|
|
len_mix = final_mix_tensor.shape[1] |
|
len_stem = stem_tensor.shape[1] |
|
if len_mix > len_stem: |
|
stem_tensor = torch.nn.functional.pad(stem_tensor, (0, len_mix - len_stem)) |
|
elif len_stem > len_mix: |
|
final_mix_tensor = torch.nn.functional.pad(final_mix_tensor, (0, len_stem - len_mix)) |
|
|
|
final_mix_tensor += stem_tensor |
|
|
|
|
|
max_abs = torch.max(torch.abs(final_mix_tensor)) |
|
if max_abs > 1.0:

    final_mix_tensor /= max_abs
|
|
|
|
|
merged_audio_int16 = (final_mix_tensor.T.numpy() * 32767).astype(np.int16) |
|
|
|
|
|
temp_results_list = list(results_tuple) |
|
temp_results_list[4] = (final_srate, merged_audio_int16) |
|
results_tuple = tuple(temp_results_list) |
|
print("Re-merging complete.") |
|
|
|
|
|
update_progress(0.95, "Saving final files...") |
|
final_srate, final_audio_data = results_tuple[4] |
|
final_midi_path_from_render = results_tuple[3] |
|
|
|
|
|
output_audio_dir = "output/final_audio" |
|
output_midi_dir = "output/final_midi" |
|
os.makedirs(output_audio_dir, exist_ok=True) |
|
os.makedirs(output_midi_dir, exist_ok=True) |
|
|
|
final_audio_path = os.path.join(output_audio_dir, f"{timestamped_base_name}_rendered.flac") |
|
|
|
final_midi_path = os.path.join(output_midi_dir, f"{timestamped_base_name}_processed.mid") |
|
|
|
|
|
try: |
|
|
|
metadata_string = format_params_for_metadata(params, transcription_params_log) |
|
|
|
sf.write(final_audio_path, final_audio_data, final_srate) |
|
audio = FLAC(final_audio_path) |
|
audio["comment"] = metadata_string |
|
audio.save() |
|
|
|
print(f" - Successfully saved audio with embedded parameters to {os.path.basename(final_audio_path)}") |
|
|
|
except Exception as e: |
|
print(f" - Warning: Could not save audio with metadata. Error: {e}") |
|
print(" - Falling back to standard save method.") |
|
|
|
sf.write(final_audio_path, final_audio_data, final_srate) |
|
|
|
|
|
shutil.copy(final_midi_path_from_render, final_midi_path) |
|
|
|
|
|
|
|
file_processing_time = reqtime.time() - file_start_time |
|
print(f"--- Pipeline finished for {filename} in {file_processing_time:.2f} seconds. ---") |
|
print(f"Output Audio: {final_audio_path}\nOutput MIDI: {final_midi_path}") |
|
|
|
|
|
results = { |
|
"final_audio_path": final_audio_path, |
|
"final_midi_path": final_midi_path, |
|
"md5_hash": results_tuple[0], |
|
"title": results_tuple[1], |
|
"summary": results_tuple[2], |
|
"plot": results_tuple[5], |
|
"description": results_tuple[6] |
|
} |
|
update_progress(1.0, "Done!") |
|
|
|
return results, params |
|
|
|
|
|
|
|
|
|
|
|
|
|
class BatchProgressTracker: |
|
""" |
|
A custom progress tracker for batch processing that can update a main |
|
progress bar and also create its own tqdm-style sub-progress bars. |
|
""" |
|
def __init__(self, main_progress: gr.Progress, total_files: int, current_file_index: int, filename: str): |
|
self._main_progress = main_progress |
|
self._total_files = total_files |
|
self._current_file_index = current_file_index |
|
self._filename = filename |
|
self._progress_per_file = 1 / total_files if total_files > 0 else 0 |
|
|
|
def __call__(self, local_fraction: float, desc: str = ""): |
|
"""Makes the object callable like a function for simple progress updates.""" |
|
overall_fraction = (self._current_file_index + local_fraction) * self._progress_per_file  # safe even when total_files == 0
|
full_desc = f"({self._current_file_index + 1}/{self._total_files}) {self._filename}: {desc}" |
|
|
|
self._main_progress(overall_fraction, desc=full_desc) |
|
|
|
def tqdm(self, iterable, desc="", total=None): |
|
"""Provides a tqdm method that delegates to the original gr.Progress object.""" |
|
|
|
tqdm_desc = f"({self._current_file_index + 1}/{self._total_files}) {self._filename}: {desc}" |
|
|
|
return self._main_progress.tqdm(iterable, desc=tqdm_desc, total=total) |
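# Example: with 10 files, processing index 3 ("song.flac"), tracker(0.5, "Rendering")
# reports overall fraction 3/10 + 0.5 * 0.1 = 0.35 as "(4/10) song.flac: Rendering".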
|
|
|
|
|
def batch_process_files(input_files, progress=gr.Progress(track_tqdm=True), *args): |
|
""" |
|
Gradio wrapper for batch processing. It iterates through files, calls the core pipeline, |
|
and collects the output file paths. It now provides detailed, nested progress updates. |
|
""" |
|
|
|
if not input_files: |
|
print("No files uploaded for batch processing.") |
|
return [], [] |
|
|
|
|
|
batch_start_time = reqtime.time() |
|
|
|
|
|
batch_timestamp = reqtime.strftime("%Y%m%d-%H%M%S") |
|
|
|
|
|
params = AppParameters(**dict(zip(ALL_PARAM_KEYS, args))) |
|
|
|
output_audio_paths = [] |
|
output_midi_paths = [] |
|
total_files = len(input_files) |
|
|
|
|
|
progress(0, desc="Starting Batch Process...") |
|
for i, file_obj in enumerate(input_files): |
|
|
|
input_path = file_obj.name |
|
filename = os.path.basename(input_path) |
|
|
|
|
|
|
|
|
|
batch_progress_tracker = BatchProgressTracker( |
|
main_progress=progress, |
|
total_files=total_files, |
|
current_file_index=i, |
|
filename=filename |
|
) |
|
|
|
|
|
results, _ = run_single_file_pipeline(input_path, batch_timestamp, copy.copy(params), progress=batch_progress_tracker) |
|
|
|
if results: |
|
if results.get("final_audio_path"): |
|
output_audio_paths.append(results["final_audio_path"]) |
|
if results.get("final_midi_path"): |
|
output_midi_paths.append(results["final_midi_path"]) |
|
|
|
|
|
progress(1, desc="Batch Process Complete!") |
|
|
|
|
|
total_batch_time = reqtime.time() - batch_start_time |
|
print(f"\nBatch processing complete. {len(output_audio_paths)} of {total_files} files processed successfully.") |
|
print(f"Total batch execution time: {total_batch_time:.2f} seconds.") |
|
|
|
|
|
return output_audio_paths, output_midi_paths |
|
|
|
|
|
|
|
def process_and_render_file(input_file, *args, progress=gr.Progress()): |
|
""" |
|
Gradio wrapper for the single-file processing UI. Packs the UI values into an

AppParameters object, runs the core pipeline, and formats the outputs for all

UI components, reporting progress along the way.
|
""" |
|
if input_file is None: |
|
|
|
return [gr.update(value=None)] * (7 + 14) |
|
|
|
|
|
job_start_time = reqtime.time() |
|
|
|
|
|
single_file_timestamp = reqtime.strftime("%Y%m%d-%H%M%S") |
|
|
|
|
|
|
|
params = AppParameters(input_file=input_file, **dict(zip(ALL_PARAM_KEYS, args))) |
|
|
|
|
|
results, final_params = run_single_file_pipeline(input_file, single_file_timestamp, params, progress=progress) |
|
|
|
if results is None: |
|
raise gr.Error("File processing failed. Check console for details.") |
|
|
|
|
|
total_job_time = reqtime.time() - job_start_time |
|
print(f"Total single-file job execution time: {total_job_time:.2f} seconds.") |
|
|
|
|
|
|
|
final_ui_updates = [] |
|
|
|
|
|
if params.s8bit_preset_selector == "Auto-Recommend (Analyze MIDI)": |
|
|
|
final_ui_updates.append(gr.update(value="Custom")) |
|
else: |
|
|
|
final_ui_updates.append(gr.update(value=final_params.s8bit_preset_selector)) |
|
|
|
|
|
s8bit_control_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_') and key != 's8bit_preset_selector'] |
|
|
|
|
|
for key in s8bit_control_keys: |
|
value = getattr(final_params, key) |
|
|
|
|
|
|
|
if isinstance(value, np.generic): |
|
value = value.item() |
|
final_ui_updates.append(value) |
|
|
|
|
|
main_results = [ |
|
results['md5_hash'], results['title'], results['summary'], |
|
results['final_midi_path'], results['final_audio_path'], |
|
results['plot'], results['description'] |
|
] |
|
|
|
|
|
return main_results + final_ui_updates |
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
initialize_app() |
|
|
|
|
|
global soundfonts_dict, demucs_model |
|
|
|
soundfonts_dict = prepare_soundfonts() |
|
print(f"Found {len(soundfonts_dict)} local SoundFonts.") |
|
|
|
if not soundfonts_dict: |
|
print("\nWARNING: No SoundFonts were found or could be downloaded.") |
|
print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.") |
|
|
|
|
|
print("Loading Demucs model (htdemucs_ft), this may take a moment on first run...") |
|
try: |
|
demucs_model = get_model(name='htdemucs_ft') |
|
if torch.cuda.is_available(): |
|
demucs_model = demucs_model.cuda() |
|
print("Demucs model loaded successfully.") |
|
except Exception as e: |
|
print(f"Warning: Could not load Demucs model. Vocal separation will not be available. Error: {e}") |
|
demucs_model = None |
|
|
|
|
|
RENDER_TYPE_DESCRIPTIONS = { |
|
"Render as-is": "**Mode: Pass-through.** Renders the MIDI file directly without any modifications. Advanced MIDI options will be ignored.", |
|
"Custom render": "**Mode: Activate Advanced Options.** Applies all settings from the 'Advanced MIDI Rendering Options' accordion without making other structural changes to the MIDI.", |
|
"Extract melody": "**Action: Simplify.** Analyzes all tracks and attempts to isolate and render only the main melody line.", |
|
"Flip": "**Action: Experimental.** Inverts the pitch of each note around the song's average pitch.", |
|
"Reverse": "**Action: Experimental.** Reverses the playback order of all notes in the MIDI file.", |
|
"Repair Durations": "**Action: Fix.** Recalculates note durations to ensure they connect smoothly (legato), filling any small gaps.", |
|
"Repair Chords": "**Action: Fix.** Analyzes and aligns notes that occur at similar times to form cleaner, more structured chords.", |
|
"Remove Duplicate Pitches": "**Action: Simplify.** If multiple instruments play the exact same pitch at the same time, it keeps only one.", |
|
"Longest Repeating Phrase": "**Action: Analyze.** Finds the longest, most-repeated musical phrase (often the chorus) and renders only that section.", |
|
"Multi-Instrumental Summary": "**Action: AI Summary.** Creates a short, compressed summary of a complex, multi-instrument song.", |
|
"Solo Piano Summary": "**Action: AI Summary.** First converts the song to a solo piano arrangement, then creates a short, compressed summary.", |
|
"Add Drum Track": "**Action: Enhance.** Analyzes the rhythm of the MIDI and automatically generates a basic drum track to accompany it." |
|
} |
|
|
|
|
|
|
|
FALLBACK_PRESET_NAME = "Generic Chiptune Loop" |
|
|
|
|
|
|
|
|
|
|
|
S8BIT_PRESETS = { |
|
|
|
"Mario (Super Mario Bros / スーパーマリオブラザーズ)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, |
|
'vibrato_rate': 5.0, 'vibrato_depth': 5, |
|
'smooth_notes_level': 0.8, |
|
'continuous_vibrato_level': 0.25, |
|
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Mega Man (Rockman / ロックマン)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, |
|
'vibrato_rate': 6.0, 'vibrato_depth': 8, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Zelda (The Legend of Zelda / ゼルダの伝説)": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, |
|
'vibrato_rate': 4.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, |
|
'vibrato_rate': 6.0, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.3, |
|
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Pokémon (Game Boy Classics / ポケットモンスター)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, |
|
'vibrato_rate': 5.0, 'vibrato_depth': 5, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, |
|
'vibrato_rate': 6.5, 'vibrato_depth': 6, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Final Fantasy (Arpeggio / ファイナルファンタジー)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, |
|
'vibrato_rate': 5.0, 'vibrato_depth': 0, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.2, |
|
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, |
|
'vibrato_rate': 3.5, 'vibrato_depth': 3, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"Commodore 64 (SID Feel)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, |
|
'vibrato_rate': 8.0, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.3, |
|
'bass_boost_level': 0.2, 'noise_level': 0.05, 'distortion_level': 0.1, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Megadrive/Genesis (FM Grit)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, |
|
'vibrato_rate': 0.0, 'vibrato_depth': 0, |
|
'smooth_notes_level': 0.0, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.4, 'noise_level': 0.1, 'distortion_level': 0.2, |
|
'fm_modulation_depth': 0.2, 'fm_modulation_rate': 150 |
|
}, |
|
"PC-98 (Touhou Feel / 東方Project)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.15, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.12, |
|
'vibrato_rate': 7.5, 'vibrato_depth': 7, |
|
'smooth_notes_level': 0.95, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.1, 'fm_modulation_rate': 200 |
|
}, |
|
"Roland SC-88 (GM Vibe)": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, |
|
'vibrato_rate': 0, 'vibrato_depth': 0, |
|
'smooth_notes_level': 1.0, |
|
'continuous_vibrato_level': 0.0, |
|
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"Falcom Ys (Rock Lead / イース)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, |
|
'vibrato_rate': 5.5, 'vibrato_depth': 6, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.8, |
|
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Arcade Brawler Lead (Street Fighter / ストリートファイター)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, |
|
'vibrato_rate': 5.0, 'vibrato_depth': 6, |
|
'smooth_notes_level': 0.8, |
|
'continuous_vibrato_level': 0.7, |
|
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, |
|
'vibrato_rate': 4.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.8, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"Dragon Quest (Orchestral Feel / ドラゴンクエスト)": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6, |
|
'vibrato_rate': 3.0, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, |
|
'vibrato_rate': 2.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 1.0, |
|
'continuous_vibrato_level': 0.95, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Modern JRPG Pad (Persona / ペルソナ)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, |
|
'vibrato_rate': 2.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 1.0, |
|
'continuous_vibrato_level': 0.95, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Tactical Brass (Fire Emblem / ファイアーエムブレム)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, |
|
'vibrato_rate': 3.5, 'vibrato_depth': 5, |
|
'smooth_notes_level': 0.95, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Mecha & Tactics Brass (Super Robot Wars / スーパーロボット大戦)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, |
|
'vibrato_rate': 3.5, 'vibrato_depth': 5, |
|
'smooth_notes_level': 0.95, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, |
|
'vibrato_rate': 7.0, 'vibrato_depth': 12, |
|
'smooth_notes_level': 0.1, |
|
'continuous_vibrato_level': 0.0, |
|
'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"8-Bit Vocal Lead": { |
|
|
|
'waveform_type': 'Triangle', |
|
'pulse_width': 0.5, |
|
'envelope_type': 'Sustained (Full Decay)', |
|
'decay_time_s': 0.8, |
|
'vibrato_rate': 5.5, |
|
'vibrato_depth': 4, |
|
'bass_boost_level': 0.1, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.9, |
|
'noise_level': 0.02, |
|
'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.05, |
|
'fm_modulation_rate': 20 |
|
}, |
|
"8-Bit Male Vocal": { |
|
|
|
'waveform_type': 'Triangle', |
|
'pulse_width': 0.5, |
|
'envelope_type': 'Sustained (Full Decay)', |
|
'decay_time_s': 1.0, |
|
'vibrato_rate': 5.0, |
|
'vibrato_depth': 3, |
|
'bass_boost_level': 0.3, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.85, |
|
'noise_level': 0.015, |
|
'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.08, |
|
'fm_modulation_rate': 25 |
|
}, |
|
"8-Bit Female Vocal": { |
|
|
|
'waveform_type': 'Triangle', |
|
'pulse_width': 0.5, |
|
'envelope_type': 'Sustained (Full Decay)', |
|
'decay_time_s': 0.7, |
|
'vibrato_rate': 6.0, |
|
'vibrato_depth': 5, |
|
'bass_boost_level': 0.05, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.92, |
|
'noise_level': 0.025, |
|
'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.04, |
|
'fm_modulation_rate': 30 |
|
}, |
|
"Lo-Fi Vocal": { |
|
|
|
'waveform_type': 'Square', |
|
'pulse_width': 0.48, |
|
'envelope_type': 'Plucky (AD Envelope)', |
|
'decay_time_s': 0.4, |
|
'vibrato_rate': 4.8, |
|
'vibrato_depth': 2, |
|
'bass_boost_level': 0.1, |
|
'smooth_notes_level': 0.65, |
|
'continuous_vibrato_level': 0.6, |
|
'noise_level': 0.05, |
|
'distortion_level': 0.05, |
|
'fm_modulation_depth': 0.02, |
|
'fm_modulation_rate': 20 |
|
}, |
|
|
|
"Sci-Fi Energy Field": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, |
|
'vibrato_rate': 10.0, 'vibrato_depth': 3, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.1, 'noise_level': 0.1, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.05, 'fm_modulation_rate': 50 |
|
}, |
|
"Industrial Alarm": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, |
|
'vibrato_rate': 15.0, 'vibrato_depth': 8, |
|
'smooth_notes_level': 0.0, |
|
'continuous_vibrato_level': 0.0, |
|
'bass_boost_level': 0.3, 'noise_level': 0.2, 'distortion_level': 0.3, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Laser Charge-Up": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, |
|
'vibrato_rate': 4.0, 'vibrato_depth': 25, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.95, |
|
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Unstable Machine Core": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, |
|
'vibrato_rate': 1.0, 'vibrato_depth': 50, |
|
'smooth_notes_level': 0.0, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.5, 'noise_level': 0.3, 'distortion_level': 0.4, |
|
'fm_modulation_depth': 0.5, 'fm_modulation_rate': 10 |
|
}, |
|
"Hardcore Gabber Kick": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.1, |
|
'vibrato_rate': 0, 'vibrato_depth': 0, |
|
'smooth_notes_level': 0.0, |
|
'continuous_vibrato_level': 0.0, |
|
'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"Generic Chiptune Loop": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, |
|
'vibrato_rate': 5.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
} |
|
|
|
|
|
BASIC_PITCH_PRESETS = { |
|
|
|
"Default (Balanced)": { |
|
'description': "A good all-around starting point for most music types.", |
|
'onset_threshold': 0.5, 'frame_threshold': 0.3, 'minimum_note_length': 128, |
|
'minimum_frequency': 60, 'maximum_frequency': 4000, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': False |
|
}, |
|
"Anime / J-Pop": { |
|
'description': "For tracks with clear melodies and pop/rock arrangements.", |
|
'onset_threshold': 0.5, 'frame_threshold': 0.3, 'minimum_note_length': 150, |
|
'minimum_frequency': 40, 'maximum_frequency': 2500, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
|
|
|
|
"Solo Vocals": { |
|
'description': "Optimized for a single singing voice. Sensitive to nuances.", |
|
'onset_threshold': 0.4, 'frame_threshold': 0.3, 'minimum_note_length': 100, |
|
'minimum_frequency': 80, 'maximum_frequency': 1200, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
"Solo Piano": { |
|
'description': "For solo piano with a wide dynamic and frequency range.", |
|
'onset_threshold': 0.4, 'frame_threshold': 0.3, 'minimum_note_length': 120, |
|
'minimum_frequency': 27, 'maximum_frequency': 4200, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
"Acoustic Guitar": { |
|
'description': "Balanced for picked or strummed acoustic guitar.", |
|
'onset_threshold': 0.5, 'frame_threshold': 0.3, 'minimum_note_length': 90, |
|
'minimum_frequency': 80, 'maximum_frequency': 2500, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': False |
|
}, |
|
"Bass Guitar": { |
|
'description': "Isolates and transcribes only the low frequencies of a bassline.", |
|
'onset_threshold': 0.4, 'frame_threshold': 0.3, 'minimum_note_length': 100, |
|
'minimum_frequency': 30, 'maximum_frequency': 400, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': False |
|
}, |
|
"Percussion / Drums": { |
|
'description': "For drums and rhythmic elements. Catches fast, sharp hits.", |
|
'onset_threshold': 0.7, 'frame_threshold': 0.6, 'minimum_note_length': 30, |
|
'minimum_frequency': 40, 'maximum_frequency': 10000, |
|
'infer_onsets': True, 'melodia_trick': False, 'multiple_bends': False |
|
}, |
|
|
|
|
|
"Rock / Metal": { |
|
'description': "Higher thresholds for distorted guitars, bass, and drums in a dense mix.", |
|
'onset_threshold': 0.6, 'frame_threshold': 0.4, 'minimum_note_length': 100, |
|
'minimum_frequency': 50, 'maximum_frequency': 3000, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
"Jazz (Multi-instrument)": { |
|
'description': "High thresholds to separate notes in complex, improvisational passages.", |
|
'onset_threshold': 0.7, 'frame_threshold': 0.5, 'minimum_note_length': 150, |
|
'minimum_frequency': 55, 'maximum_frequency': 2000, |
|
'infer_onsets': True, 'melodia_trick': False, 'multiple_bends': True |
|
}, |
|
"Classical (Orchestral)": { |
|
'description': "Longer note length to focus on sustained notes and filter out performance noise.", |
|
'onset_threshold': 0.5, 'frame_threshold': 0.4, 'minimum_note_length': 200, |
|
'minimum_frequency': 32, 'maximum_frequency': 4200, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
"Electronic / Synth": { |
|
'description': "Low thresholds and short note length for sharp, synthetic sounds.", |
|
'onset_threshold': 0.3, 'frame_threshold': 0.2, 'minimum_note_length': 50, |
|
'minimum_frequency': 20, 'maximum_frequency': 8000, |
|
'infer_onsets': True, 'melodia_trick': False, 'multiple_bends': False |
|
} |
|
} |
|
|
|
|
|
|
|
def update_vocal_ui_visibility(separate_vocals): |
|
"""Shows or hides the separation-related UI controls based on selections.""" |
|
is_visible = gr.update(visible=separate_vocals) |
|
return is_visible, is_visible, is_visible |
|
|
|
def update_ui_visibility(transcription_method, soundfont_choice): |
|
""" |
|
Dynamically updates the visibility of UI components based on user selections. |
|
""" |
|
is_general = (transcription_method == "General Purpose") |
|
is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL) |
|
|
|
return { |
|
general_transcription_settings: gr.update(visible=is_general), |
|
synth_8bit_settings: gr.update(visible=is_8bit), |
|
} |
|
|
|
|
|
def update_advanced_midi_options_visibility(render_type_choice): |
|
""" |
|
Shows or hides the advanced MIDI rendering options based on the render type. |
|
The options are only visible if the type is NOT 'Render as-is'. |
|
""" |
|
is_visible = (render_type_choice != "Render as-is") |
|
return gr.update(visible=is_visible) |
|
|
|
|
|
def update_render_type_description(render_type_choice): |
|
""" |
|
Returns the description for the selected render type. |
|
""" |
|
return RENDER_TYPE_DESCRIPTIONS.get(render_type_choice, "Select a render type to see its description.") |
|
|
|
|
|
def apply_basic_pitch_preset(preset_name): |
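"""Maps a transcription preset name to gr.update objects for the Basic Pitch control components."""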
|
if preset_name not in BASIC_PITCH_PRESETS: |
|
|
|
return {comp: gr.update() for comp in basic_pitch_ui_components} |
|
|
|
settings = BASIC_PITCH_PRESETS[preset_name] |
|
|
|
|
|
return { |
|
onset_threshold: gr.update(value=settings['onset_threshold']), |
|
frame_threshold: gr.update(value=settings['frame_threshold']), |
|
minimum_note_length: gr.update(value=settings['minimum_note_length']), |
|
minimum_frequency: gr.update(value=settings['minimum_frequency']), |
|
maximum_frequency: gr.update(value=settings['maximum_frequency']), |
|
infer_onsets: gr.update(value=settings['infer_onsets']), |
|
melodia_trick: gr.update(value=settings['melodia_trick']), |
|
multiple_pitch_bends: gr.update(value=settings['multiple_bends']) |
|
} |
|
|
|
|
|
|
|
def apply_8bit_preset(preset_name): |
|
""" |
|
Takes the name of a preset and returns a dictionary of gr.update objects |
|
to set the values of the 8-bit synthesizer control components.
|
This version is more robust as it directly maps keys to UI components. |
|
""" |
|
|
|
if preset_name in ["Custom", "Auto-Recommend (Analyze MIDI)"] or preset_name not in S8BIT_PRESETS: |
|
|
|
s8bit_control_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_') and key != 's8bit_preset_selector'] |
|
return {ui_component_map[key]: gr.update() for key in s8bit_control_keys} |
|
|
|
|
|
settings = S8BIT_PRESETS[preset_name] |
|
updates = {} |
|
|
|
|
|
for simple_key, value in settings.items(): |
|
|
|
full_key = f"s8bit_{simple_key}" |
|
|
|
|
|
if full_key in ui_component_map: |
|
component = ui_component_map[full_key] |
|
updates[component] = gr.update(value=value) |
|
|
|
return updates |
|
|
|
|
|
def update_separation_mode_ui(is_advanced): |
|
""" |
|
Updates the visibility and labels of UI components based on whether |
|
the advanced separation mode is enabled. |
|
""" |
|
if is_advanced: |
|
|
|
return { |
|
advanced_separation_controls: gr.update(visible=True), |
|
transcribe_drums: gr.update(visible=True), |
|
transcribe_bass: gr.update(visible=True), |
|
transcribe_other_or_accompaniment: gr.update(label="Transcribe Other"), |
|
merge_drums_to_render: gr.update(visible=True), |
|
merge_bass_to_render: gr.update(visible=True), |
|
merge_other_or_accompaniment: gr.update(label="Merge Other") |
|
} |
|
else: |
|
|
|
return { |
|
advanced_separation_controls: gr.update(visible=False), |
|
transcribe_drums: gr.update(visible=False), |
|
transcribe_bass: gr.update(visible=False), |
|
transcribe_other_or_accompaniment: gr.update(label="Transcribe Accompaniment"), |
|
merge_drums_to_render: gr.update(visible=False), |
|
merge_bass_to_render: gr.update(visible=False), |
|
merge_other_or_accompaniment: gr.update(label="Merge Accompaniment") |
|
} |
|
|
|
|
|
|
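# Every AppParameters field except the raw file inputs, in declaration order.
# UI control values are zipped against this list, so the wiring order must match exactly.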
|
ALL_PARAM_KEYS = [field.name for field in fields(AppParameters) if field.name not in ["input_file", "batch_input_files"]] |
|
|
|
app = gr.Blocks(theme=gr.themes.Base()) |
|
|
|
with app: |
|
gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Audio-to-MIDI & Advanced Renderer</h1>") |
|
gr.Markdown( |
|
"**Upload a Audio for transcription-then-rendering, or a MIDI for rendering-only.**\n\n" |
|
"This application combines piano audio transcription with a powerful MIDI transformation and rendering toolkit. " |
|
"Based on the work of [asigalov61](https://github.com/asigalov61)." |
|
) |
|
|
|
|
|
with gr.Tabs(): |
|
waveform_options = gr.WaveformOptions(show_recording_waveform=False) |
|
|
|
with gr.TabItem("Single File Processing"): |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
|
|
gr.Markdown("## 1. Upload File") |
|
|
|
|
|
|
|
|
|
input_file = gr.Audio( |
|
label="Input Audio or MIDI File", |
|
type="filepath", |
|
sources=["upload"], waveform_options=waveform_options |
|
) |
|
|
|
submit_btn = gr.Button("Process and Render Single File", variant="primary") |
|
|
|
with gr.Column(scale=2): |
|
|
|
gr.Markdown("### 2. Results") |
|
output_midi_title = gr.Textbox(label="MIDI Title") |
|
output_song_description = gr.Textbox(label="MIDI Description", lines=3) |
|
output_audio = gr.Audio(label="Rendered Audio Output", format="wav", waveform_options=waveform_options) |
|
output_plot = gr.Plot(label="MIDI Score Plot") |
|
with gr.Row(): |
|
output_midi = gr.File(label="Download Processed MIDI File", file_types=[".mid"]) |
|
output_midi_md5 = gr.Textbox(label="Output MIDI MD5 Hash") |
|
output_midi_summary = gr.Textbox(label="MIDI metadata summary", lines=4) |
|
|
|
|
|
|
|
with gr.TabItem("Batch Processing"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
gr.Markdown("### 1. Upload Files") |
|
gr.Markdown("Uses the **global settings** configured above.") |
|
batch_input_files = gr.File( |
|
label="Upload Audio or MIDI Files", |
|
file_count="multiple" |
|
) |
|
|
|
batch_process_btn = gr.Button("Process Batch", variant="primary") |
|
|
|
with gr.Column(): |
|
gr.Markdown("### 2. Download Results") |
|
batch_output_audio_files = gr.File( |
|
label="Download Rendered FLAC Files", |
|
file_count="multiple", |
|
interactive=False |
|
) |
|
batch_output_midi_files = gr.File( |
|
label="Download Processed MIDI Files", |
|
file_count="multiple", |
|
interactive=False |
|
) |
|
|
|
with gr.Accordion("▶️ Configure Global Settings (for both Single File and Batch)", open=True): |
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
gr.Markdown("### Transcription Settings\n" |
|
"> _**Note:** This entire section is for audio-to-MIDI conversion. All settings here are ignored if a MIDI file is uploaded._" |
|
) |
|
|
|
transcription_method = gr.Radio(["General Purpose", "Piano-Specific"], label="Audio Transcription Method", value="General Purpose", |
|
info="Choose 'General Purpose' for most music (vocals, etc.). Choose 'Piano-Specific' only for solo piano recordings.") |
|
|
|
enable_stereo_processing = gr.Checkbox(label="Enable Stereo Transcription", value=False, |
|
info="For stereo audio files only. When enabled, transcribes left and right channels independently, then merges them. Note: This will double the transcription time.") |
|
|
|
|
|
with gr.Group(): |
|
separate_vocals = gr.Checkbox(label="Enable Source Separation (Demucs)", value=False, |
|
info="If checked, separates the audio into its component stems (vocals, drums, etc.) before processing.") |
|
|
|
|
|
with gr.Group(visible=False) as separation_options_box: |
|
gr.Markdown("#### 1. Stem Separation Options") |
|
enable_advanced_separation = gr.Checkbox(label="Enable Advanced Stem Control (for Accompaniment)", value=False, |
|
info="If checked, you can individually control drums, bass, and other. If unchecked, they are treated as a single 'Accompaniment' track.") |
|
|
|
with gr.Row(visible=False) as advanced_separation_controls: |
|
separate_drums = gr.Checkbox(label="Drums", value=True) |
|
separate_bass = gr.Checkbox(label="Bass", value=True) |
|
separate_other = gr.Checkbox(label="Other", value=True) |
|
|
|
gr.Markdown("#### 2. Transcription Targets") |
|
gr.Markdown("_Select which separated stem(s) to convert to MIDI._") |
|
with gr.Row(): |
|
transcribe_vocals = gr.Checkbox(label="Transcribe Vocals", value=False) |
|
|
|
transcribe_drums = gr.Checkbox(label="Transcribe Drums", value=False, visible=False) |
|
transcribe_bass = gr.Checkbox(label="Transcribe Bass", value=False, visible=False) |
|
|
|
transcribe_other_or_accompaniment = gr.Checkbox(label="Transcribe Accompaniment", value=True) |
|
|
|
gr.Markdown("#### 3. Audio Merging Targets") |
|
gr.Markdown( |
|
""" |
|
_Select which **original, unprocessed** audio stems to merge back into the final output. |
|
This does **not** use the transcribed MIDI; it uses the raw audio from the initial separation. |
|
You can leave all boxes unchecked. This step only affects the final audio file, not the MIDI output._ |
|
""" |
|
) |
|
with gr.Row(): |
|
merge_vocals_to_render = gr.Checkbox(label="Merge Vocals", value=False) |
|
|
|
merge_drums_to_render = gr.Checkbox(label="Merge Drums", value=False, visible=False) |
|
merge_bass_to_render = gr.Checkbox(label="Merge Bass", value=False, visible=False) |
|
|
|
merge_other_or_accompaniment = gr.Checkbox(label="Merge Accompaniment", value=False) |
|
|
|
with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings: |
|
|
|
basic_pitch_preset_selector = gr.Dropdown( |
|
choices=["Auto-Analyze Audio", "Custom"] + list(BASIC_PITCH_PRESETS.keys()), |
|
value="Default (Balanced)", |
|
label="Transcription Profile Preset", |
|
info="Select a profile to auto-fill settings for different instrument types." |
|
"For reference only; it is recommended to test and adjust for optimal results.") |
|
|
|
onset_threshold = gr.Slider( |
|
0.0, 1.0, value=0.5, step=0.05, |
|
label="On-set Threshold", |
|
info="Sensitivity for detecting the start of a new note. Lower values will detect more notes (even faint ones), but may create false positives. Higher values are stricter and cleaner, but might miss subtle notes." |
|
) |
|
frame_threshold = gr.Slider( |
|
0.0, 1.0, value=0.3, step=0.05, |
|
label="Frame Threshold", |
|
info="Sensitivity for determining if a note is 'on' or 'off'. Lower values will sustain notes longer, but can merge distinct notes. Higher values create shorter, more separated notes, but might cut off tails." |
|
) |
|
minimum_note_length = gr.Slider( |
|
10, 500, value=128, step=1, |
|
label="Minimum Note Length (ms)", |
|
info="Filters out notes shorter than this duration. Increase this to remove fast, noisy artifacts or clicks. Decrease it if the transcription is missing very short, staccato notes." |
|
) |
|
minimum_frequency = gr.Slider( |
|
0, 500, value=60, step=5, |
|
label="Minimum Frequency (Hz)", |
|
info="Ignores any detected pitches below this frequency. Increase this to filter out low-frequency noise like rumble or hum. Set it just below your target instrument's lowest note (e.g., ~80Hz for guitar)." |
|
) |
|
maximum_frequency = gr.Slider( |
|
501, 10000, value=4000, step=10, |
|
label="Maximum Frequency (Hz)", |
|
info="Ignores any detected pitches above this frequency. Decrease this to filter out high-frequency noise like hiss or cymbals. Set it just above your target instrument's highest note (e.g., ~1200Hz for vocals)." |
|
) |
|
infer_onsets = gr.Checkbox( |
|
value=True, |
|
label="Infer Onsets (Boost Onsets)", |
|
info="When enabled, the model actively looks for and emphasizes the start of each note (the 'attack'). Recommended for percussive or clear, rhythmic music. Disable for very smooth, legato music like vocal pads." |
|
) |
|
melodia_trick = gr.Checkbox( |
|
value=True, |
|
label="Melodia Trick (Contour Optimization)", |
|
info="When enabled, uses a secondary melody-detection algorithm to refine the main pitch contour. Highly recommended for most melodic content. Disable if you are transcribing non-melodic noise or complex polyphony." |
|
) |
|
multiple_pitch_bends = gr.Checkbox( |
|
value=False, |
|
label="Allow Multiple Pitch Bends", |
|
info="When enabled, allows a single note to have multiple, continuous pitch bends within it. Essential for transcribing vocals, slides, or vibrato-heavy instruments. Disable for clean, discrete notes like a standard piano." |
|
) |
|
|
|
with gr.Column(scale=1): |
|
|
|
gr.Markdown("### MIDI Transformation & Rendering Settings") |
|
render_type = gr.Radio(

list(RENDER_TYPE_DESCRIPTIONS.keys()),

label="MIDI Transformation Render Type",

value="Render as-is",

info="Apply transformations to the MIDI before rendering. Select 'Render as-is' for basic rendering, or another option to transform the score first.")
|
|
|
render_type_info = gr.Markdown( |
|
value=RENDER_TYPE_DESCRIPTIONS["Render as-is"], |
|
elem_classes="description-box" |
|
) |
|
|
|
with gr.Row(elem_id="soundfont_selector_row"): |
|
soundfont_bank = gr.Dropdown( |
|
[SYNTH_8_BIT_LABEL] + list(soundfonts_dict.keys()), |
|
label="SoundFont / Synthesizer", |
|
value=list(soundfonts_dict.keys())[0] if soundfonts_dict else SYNTH_8_BIT_LABEL, |
|
scale=4 |
|
) |
|
|
|
preview_sf_button = gr.Button("🔊 Preview", scale=1) |
|
|
|
|
|
|
|
preview_sf_player = gr.Audio(label="SoundFont Preview", interactive=False, show_label=False) |
|
render_sample_rate = gr.Radio( |
|
["16000", "32000", "44100"], |
|
label="Audio Sample Rate", |
|
value="44100") |
|
|
|
with gr.Accordion("Advanced MIDI Rendering Options", open=False) as advanced_rendering_options: |
|
render_with_sustains = gr.Checkbox(label="Apply sustain pedal effects (if present)", value=True, |
|
info="Applies sustain pedal effects (CC64) to lengthen notes, creating a more realistic and connected performance, especially for piano.") |
|
render_output_as_solo_piano = gr.Checkbox(label="Convert to Solo Piano (Grand Piano patch)", value=False, |
|
info="Converts all non-drum instruments to a Grand Piano patch, creating a solo piano arrangement of the entire score.") |
|
render_remove_drums = gr.Checkbox(label="Remove drum track", value=False, |
|
info="Removes the entire drum track (typically MIDI Channel 9) from the score. Ideal for creating instrumental or karaoke versions.") |
|
render_transpose_to_C4 = gr.Checkbox(label="Transpose entire score to center around C4", value=False, |
|
info="Transposes the entire score so that its average pitch is centered around C4 (MIDI note 60). Useful for standardizing key.") |
|
render_transpose_value = gr.Slider(-12, 12, value=0, step=1, label="Transpose (semitones)", |
|
info="Shifts the pitch of all non-drum notes up (positive values) or down (negative values) by the specified number of semitones.") |
|
custom_render_patch = gr.Slider(-1, 127, value=-1, step=1, label="Force MIDI Patch (-1 to disable)", |
|
info="Forces all non-drum instruments to use a single specified MIDI patch number. Set to -1 to use the original instruments.") |
|
merge_misaligned_notes = gr.Slider(-1, 127, value=-1, step=1, label="Time to merge notes in ms (-1 to disable)",
|
info="Aligns the start times of notes that are played almost simultaneously (within the specified ms threshold). Cleans up sloppy timing. -1 to disable.") |
|
render_align = gr.Radio( |
|
["Do not align", "Start Times", "Start Times and Durations", "Start Times and Split Durations"], |
|
label="Align notes to musical bars", |
|
value="Do not align", |
|
info="Quantizes the score to a fixed bar length. 'Start Times' aligns onsets. " |
|
"'Durations' trims notes at the bar line. 'Split Durations' splits notes that cross the bar line." |
|
) |
|
|
|
with gr.Column(scale=1):
|
with gr.Accordion("8-bit Synthesizer Settings", open=True, visible=False) as synth_8bit_settings: |
|
s8bit_preset_selector = gr.Dropdown( |
|
choices=["Custom", "Auto-Recommend (Analyze MIDI)"] + list(S8BIT_PRESETS.keys()), |
|
value="Custom", |
|
label="Style Preset", |
|
info="Select a preset to auto-fill the settings below. Choose 'Custom' for manual control or 'Auto-Recommend' to analyze the MIDI.\nFor reference and entertainment only. These presets are not guaranteed to be perfectly accurate." |
|
) |
|
s8bit_waveform_type = gr.Dropdown( |
|
['Square', 'Sawtooth', 'Triangle'], |
|
value='Square', |
|
label="Waveform Type", |
|
info="The fundamental timbre of the sound. Square is bright and hollow (classic NES), Sawtooth is aggressive and buzzy, Triangle is soft and flute-like." |
|
) |
|
s8bit_pulse_width = gr.Slider( |
|
0.01, 0.99, value=0.5, step=0.01, |
|
label="Pulse Width (Square Wave Only)", |
|
info="Changes the character of the Square wave. Low values (\~0.1) are thin and nasal, while mid values (\~0.5) are full and round." |
|
) |
|
s8bit_envelope_type = gr.Dropdown( |
|
['Plucky (AD Envelope)', 'Sustained (Full Decay)'], |
|
value='Plucky (AD Envelope)', |
|
label="Envelope Type", |
|
info="Shapes the volume of each note. 'Plucky' is a short, percussive sound. 'Sustained' holds the note for its full duration." |
|
) |
|
s8bit_decay_time_s = gr.Slider( |
|
0.01, 1.0, value=0.1, step=0.01, |
|
label="Decay Time (s)", |
|
info="For the 'Plucky' envelope, this is the time it takes for a note to fade to silence. Low values are short and staccato; high values are longer and more resonant." |
|
) |
|
s8bit_adaptive_decay = gr.Checkbox( |
|
value=True, |
|
label="Enable Adaptive Decay (Fix for Staccato)", |
|
info="Recommended! Fixes low volume on fast/short notes by ensuring a consistent decay rate, regardless of note length. Makes staccato passages sound fuller and more powerful." |
|
) |
|
s8bit_vibrato_rate = gr.Slider( |
|
0, 20, value=5, |
|
label="Vibrato Rate (Hz)", |
|
info="The SPEED of the pitch wobble. Low values create a slow, gentle waver. High values create a fast, frantic buzz." |
|
) |
|
s8bit_vibrato_depth = gr.Slider( |
|
0, 50, value=0, |
|
label="Vibrato Depth (Hz)", |
|
info="The INTENSITY of the pitch wobble. Low values are subtle or off. High values create a dramatic, siren-like pitch bend." |
|
) |
|
s8bit_bass_boost_level = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="Bass Boost Level", |
|
info="Mixes in a sub-octave (a square wave one octave lower). Low values have no effect; high values add significant weight and power." |
|
) |
|
|
|
s8bit_bass_boost_cutoff_hz = gr.Slider( |
|
50.0, 500.0, value=200.0, step=10.0, |
|
label="Bass Boost Cutoff (Hz)", |
|
info="Intelligent Bass Boost: The boost effect will gradually fade out for notes BELOW this frequency, preventing muddiness in the low-end." |
|
) |
|
s8bit_smooth_notes_level = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="Smooth Notes Level", |
|
info="Applies a tiny fade-in/out to reduce clicking. Low values (or 0) give a hard, abrupt attack. High values give a softer, cleaner onset." |
|
) |
|
s8bit_continuous_vibrato_level = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="Continuous Vibrato Level", |
|
info="Controls vibrato continuity across notes. Low values (0) reset vibrato on each note (bouncy). High values (1) create a smooth, connected 'singing' vibrato." |
|
) |
|
|
|
with gr.Accordion("Advanced Synthesis & FX", open=True): |
|
s8bit_noise_level = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="Noise Level", |
|
info="Mixes in white noise with the main waveform. Low values are clean; high values add 'grit', 'air', or a hissing quality, useful for percussion." |
|
) |
|
s8bit_distortion_level = gr.Slider( |
|
0.0, 0.9, value=0.0, step=0.05, |
|
label="Distortion Level", |
|
info="Applies wave-shaping to make the sound harsher. Low values are clean; high values create a crushed, 'fuzzy', and aggressive tone." |
|
) |
|
s8bit_fm_modulation_depth = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="FM Depth", |
|
info="Frequency Modulation intensity. At low values, there is no effect. At high values, it creates complex, metallic, or bell-like tones." |
|
) |
|
s8bit_fm_modulation_rate = gr.Slider( |
|
0.0, 500.0, value=0.0, step=1.0, |
|
label="FM Rate", |
|
info="Frequency Modulation speed. Low values create a slow 'wobble'. High values create fast modulation, resulting in bright, dissonant harmonics." |
|
) |
|
|
|
|
|
with gr.Group(): |
|
s8bit_echo_sustain = gr.Checkbox( |
|
value=False, |
|
label="Enable Echo Sustain for Long Notes", |
|
info="For 'Plucky' envelope only. Fills the silent tail of long, sustained notes with quiet, repeating pulses. Fixes 'choppy' sound on long piano notes." |
|
) |
|
|
|
with gr.Group(visible=False) as echo_sustain_settings: |
|
s8bit_echo_rate_hz = gr.Slider( |
|
1.0, 20.0, value=5.0, step=0.5, |
|
label="Echo Rate (Hz)", |
|
info="How many echoes (pulses) per second. Higher values create a faster, 'tremolo'-like effect." |
|
) |
|
s8bit_echo_decay_factor = gr.Slider( |
|
0.1, 0.95, value=0.45, step=0.05, |
|
label="Echo Decay Factor", |
|
info="How quickly the echoes fade. A value of 0.6 means each echo is 60% of the previous one's volume. Lower is faster." |
|
) |
|
s8bit_echo_trigger_threshold = gr.Slider( |
|
1.1, 30.0, value=20, step=0.1, |
|
label="Echo Trigger Threshold (x Decay Time)", |
|
info="Controls how long a note must be to trigger echoes. This value is a multiplier of the 'Decay Time'. Example: If 'Decay Time' is 0.1s and this threshold is set to 10.0, only notes longer than 1.0s (0.1 * 10.0) will produce echoes." |
|
) |
|
|
|
|
|
with gr.Accordion("Arpeggiator (Creative Tool to Reduce Stiffness)", open=False): |
|
s8bit_enable_arpeggiator = gr.Checkbox( |
|
value=False, |
|
label="Enable Arpeggiator (to reduce stiffness)", |
|
info="Transforms chords into rapid sequences of notes, creating a classic, lively chiptune feel. This is a key technique to make 8-bit music sound more fluid." |
|
) |
|
with gr.Group(visible=False) as arpeggiator_settings_box: |
|
s8bit_arpeggio_target = gr.Dropdown( |
|
["Accompaniment Only", "Melody Only", "Full Mix"], |
|
value="Accompaniment Only", |
|
label="Arpeggiation Target", |
|
info=""" |
|
- **Accompaniment Only (Classic):** Applies arpeggios only to the harmony/chord parts, leaving the lead melody untouched. The classic chiptune style. |
|
- **Melody Only (Modern):** Applies arpeggios as a decorative effect to the lead melody notes, leaving the accompaniment as is. Creates a modern, expressive synth lead sound. |
|
- **Full Mix:** Applies arpeggios to all non-drum tracks. Can create a very dense, complex texture. |
|
""" |
|
) |
|
s8bit_arpeggio_velocity_scale = gr.Slider( |
|
0.1, 1.5, value=0.3, step=0.05, |
|
label="Arpeggio Velocity Scale", |
|
info="A master volume control for the arpeggiator. 0.7 means arpeggiated notes will have 70% of the original chord's velocity." |
|
) |
|
s8bit_arpeggio_density = gr.Slider( |
|
0.1, 1.0, value=0.4, step=0.05, |
|
label="Arpeggio Density Scale", |
|
info="Controls the density/sparseness of arpeggios. Lower values create more silence between notes, making long chords feel more relaxed." |
|
) |
|
s8bit_arpeggio_rhythm = gr.Dropdown( |
|
[ |
|
"Continuous 16ths", |
|
"Classic Upbeat (8th)", |
|
"Pulsing 8ths", |
|
"Triplet 8ths", |
|
"Pulsing 4ths", |
|
"Galloping", |
|
"Simple Quarter Notes" |
|
], |
|
value="Pulsing 8ths", |
|
label="Arpeggio Rhythm Pattern", |
|
info=""" |
|
- **Continuous 16ths:** A constant, driving wall of sound with no breaks. Creates a very dense, high-energy texture. (Sounds like: ta-ta-ta-ta ta-ta-ta-ta) |
|
- **Classic Upbeat (8th):** The quintessential chiptune rhythm. Creates a bouncy, syncopated feel by playing on the off-beats. (Sounds like: _ _ ta-ta _ _ ta-ta) |
|
- **Pulsing 8ths:** A steady, on-beat rhythm playing two notes per beat. Good for a solid, rhythmic foundation. (Sounds like: ta-ta ta-ta) |
|
- **Triplet 8ths:** A rolling, "three-feel" rhythm that creates a swing or shuffle groove. Very common in Blues, Jazz, and Hip-Hop. (Sounds like: ta-ta-ta ta-ta-ta) |
|
- **Pulsing 4ths:** A strong, deliberate pulse on each downbeat, with a clear separation between notes. (Sounds like: ta_ ta_ ta_) |
|
- **Galloping:** A driving, forward-moving rhythm with a distinctive long-short pattern. Excellent for action themes. (Sounds like: ta--ta ta--ta) |
|
- **Simple Quarter Notes:** The most sparse pattern, playing one sustained note per beat. Creates a calm and open feel. (Sounds like: ta _ ta _ ta _ ta _) |
|
""" |
|
) |
|
s8bit_arpeggio_pattern = gr.Dropdown( |
|
["Up", "Down", "UpDown"], |
|
value="Up", |
|
label="Arpeggio Pattern", |
|
info=""" |
|
- **Up:** The classic choice. Ascends from the lowest to the highest note of the chord, then jumps back to the bottom. Creates a feeling of energy, optimism, and forward momentum. |
|
- **Down:** Descends from the highest to the lowest note. Often creates a more melancholic, reflective, or suspenseful mood. |
|
- **UpDown:** Ascends to the highest note, then descends back down without jumping. This is the smoothest and most fluid pattern, creating a gentle, wave-like motion. |
|
""" |
|
) |
|
s8bit_arpeggio_octave_range = gr.Slider( |
|
1, 4, value=1, step=1, |
|
label="Arpeggio Octave Range", |
|
info="How many octaves the arpeggio pattern will span before repeating." |
|
) |
|
s8bit_arpeggio_panning = gr.Dropdown( |
|
["Stereo", "Center", "Left", "Right"], |
|
value="Stereo", |
|
label="Arpeggio Layer Panning", |
|
info=""" |
|
- **Stereo (Recommended):** Creates a wide, immersive sound by alternating arpeggio tracks between the left and right speakers. This provides maximum clarity and separation from the main melody. |
|
- **Center:** Places the arpeggio directly in the middle (mono). Creates a focused, powerful, and retro sound, but may conflict with a centered lead melody. |
|
- **Left / Right:** Places the entire arpeggio layer on only one side. Useful for creative "call and response" effects or special mixing choices. |
|
""" |
|
) |
|
|
|
with gr.Group(): |
|
s8bit_enable_delay = gr.Checkbox( |
|
value=False, |
|
label="Enable Delay / Echo Effect", |
|
info="Adds repeating, decaying echoes to notes, creating a sense of space and rhythmic complexity." |
|
) |
|
with gr.Group(visible=False) as delay_settings_box: |
|
s8bit_delay_on_melody_only = gr.Checkbox( |
|
value=True, |
|
label="Apply Delay to Melody Only", |
|
info="Recommended. Applies the echo effect only to the lead melody notes, keeping the harmony clean." |
|
) |
|
s8bit_delay_division = gr.Dropdown( |
|
["Quarter Note", "Dotted 8th Note", "8th Note", "Triplet 8th Note", "16th Note"], |
|
value="Dotted 8th Note", |
|
label="Delay Time (Tempo Synced)", |
|
info=""" |
|
"The time between echoes, synced to the MIDI's tempo. 'Dotted 8th Note' is a classic rhythmic choice." |
|
- **Quarter Note:** A simple, stable echo on the next beat. (1, 2, 3, 4) |
|
- **Dotted 8th Note (Classic):** Creates a very popular, rolling syncopated rhythm. Highly recommended for adding energy and complexity. |
|
- **8th Note:** A steady, "call and response" echo on the off-beat. Creates a running or swing feel. |
|
- **Triplet 8th Note:** Creates a unique "shuffling" or "bouncing" 3-feel groove over the standard 4/4 beat. |
|
- **16th Note:** A very fast, dense echo. Can act more like a textural effect than a distinct rhythmic delay. |
|
""" |
|
) |
|
s8bit_delay_feedback = gr.Slider( |
|
0.1, 0.9, value=0.5, step=0.05, |
|
label="Delay Feedback (Volume Decay)", |
|
info="Controls how much quieter each echo is. 0.5 means each echo is 50% the volume of the one before it." |
|
) |
|
s8bit_delay_repeats = gr.Slider( |
|
1, 10, value=3, step=1, |
|
label="Number of Repeats", |
|
info="The total number of echoes to generate for each note." |
|
) |
|
|
|
s8bit_delay_highpass_cutoff_hz = gr.Slider( |
|
0, 500, value=100, step=10, |
|
label="Echo High-Pass Filter (Hz)", |
|
info="Filters out low frequencies from the echoes to prevent muddiness. Set to 0 to disable. 80-120Hz is a good range to clean up bass." |
|
) |
|
s8bit_delay_bass_pitch_shift = gr.Slider( |
|
-12, 24, value=12, step=1, |
|
label="Echo Pitch Shift for Low Notes (Semitones)", |
|
info="Shifts the pitch of echoes for very low notes (below C3). +12 is one octave up, +7 is a perfect fifth. 0 to disable." |
|
) |
|
|
|
s8bit_delay_lowpass_cutoff_hz = gr.Slider( |
|
1000, 20000, value=5000, step=500, |
|
label="Echo Low-Pass Filter (Hz)", |
|
info="Filters out high frequencies from the echoes to reduce harshness. Set to 20000 to disable. 4k-8kHz is a good range to make echoes sound 'darker'." |
|
) |
|
s8bit_delay_treble_pitch_shift = gr.Slider( |
|
-24, 12, value=-12, step=1, |
|
label="Echo Pitch Shift for High Notes (Semitones)", |
|
info="Shifts the pitch of echoes for very high notes (above C6). -12 is one octave down. 0 to disable." |
|
) |
|
|
|
|
|
with gr.Accordion("MIDI Pre-processing (Corrective Tool)", open=False): |
|
s8bit_enable_midi_preprocessing = gr.Checkbox( |
|
value=True, |
|
label="Enable MIDI Pre-processing (Anti-Harshness)", |
|
info="Intelligently reduces the velocity of notes that are likely to cause harshness (e.g., very high notes or loud, dense chords) before synthesis begins." |
|
) |
|
with gr.Group(visible=True) as midi_preprocessing_settings_box: |
|
s8bit_high_pitch_threshold = gr.Slider( |
|
60, 108, value=84, step=1, |
|
label="High Pitch Threshold (MIDI Note)", |
|
info="Notes above this pitch will have their velocity reduced. 84 = C6." |
|
) |
|
s8bit_high_pitch_velocity_scale = gr.Slider( |
|
0.1, 1.0, value=0.8, step=0.05, |
|
label="High Pitch Velocity Scale", |
|
info="Multiplier for high notes' velocity (e.g., 0.8 = 80% of original velocity)." |
|
) |
|
|
|
s8bit_low_pitch_threshold = gr.Slider( |
|
21, 60, value=36, step=1, |
|
label="Low Pitch Threshold (MIDI Note)", |
|
info="Notes below this pitch will have their velocity reduced to prevent muddiness. 36 = C2." |
|
) |
|
s8bit_low_pitch_velocity_scale = gr.Slider( |
|
0.1, 1.0, value=0.9, step=0.05, |
|
label="Low Pitch Velocity Scale", |
|
info="Multiplier for low notes' velocity. Use this to gently tame excessive sub-bass." |
|
) |
|
s8bit_chord_density_threshold = gr.Slider( |
|
2, 10, value=4, step=1, |
|
label="Chord Density Threshold", |
|
info="Minimum number of notes to be considered a 'dense' chord." |
|
) |
|
s8bit_chord_velocity_threshold = gr.Slider( |
|
50, 127, value=100, step=1, |
|
label="Chord Velocity Threshold", |
|
info="If a dense chord's average velocity is above this, it will be tamed." |
|
) |
|
s8bit_chord_velocity_scale = gr.Slider( |
|
0.1, 1.0, value=0.75, step=0.05, |
|
label="Chord Velocity Scale", |
|
info="Velocity multiplier for loud, dense chords." |
|
) |
|
|
|
|
|
with gr.Accordion("Audio Quality & Anti-Aliasing (Post-processing)", open=False): |
|
s8bit_enable_anti_aliasing = gr.Checkbox( |
|
value=False, |
|
label="Enable All Audio Quality Enhancements", |
|
info="Master toggle for all settings below. Disabling may slightly speed up rendering but can result in harsher, more aliased sound." |
|
) |
|
with gr.Group(visible=False) as anti_aliasing_settings_box: |
|
s8bit_use_additive_synthesis = gr.Checkbox( |
|
value=False, |
|
label="Use Additive Synthesis (High Quality, High CPU)", |
|
info="Generates band-limited waveforms to drastically reduce aliasing (harshness). Slower to render but produces a much cleaner sound. Note: The other anti-aliasing settings below will still apply even if this is disabled." |
|
) |
|
s8bit_edge_smoothing_ms = gr.Slider( |
|
0.0, 2.0, value=0.5, step=0.1, |
|
label="Waveform Edge Smoothing (ms)", |
|
info="Applies a tiny blur to the sharp edges of standard Square/Sawtooth waves to reduce aliasing. A cheap and effective alternative to Additive Synthesis." |
|
) |
|
s8bit_noise_lowpass_hz = gr.Slider( |
|
1000, 20000, value=9000, step=500, |
|
label="Noise Lowpass Filter (Hz)", |
|
info="Applies a lowpass filter to the white noise, making it sound softer and less harsh. Lower values produce a 'darker' noise." |
|
) |
|
s8bit_harmonic_lowpass_factor = gr.Slider( |
|
4.0, 32.0, value=12.0, step=0.5, |
|
label="Harmonic Lowpass Factor", |
|
info="Controls a dynamic lowpass filter. The cutoff frequency is (Note Frequency * this factor). Lower values create a darker, more muted sound." |
|
) |
|
s8bit_final_gain = gr.Slider( |
|
0.1, 1.5, value=0.8, step=0.05, |
|
label="Final Gain / Limiter Level", |
|
info="A final volume adjustment before adding the sound to the mix. Values > 1.0 can introduce soft clipping (distortion)." |
|
) |
|
|
|
|
|
ui_component_map = locals() |
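
# locals() at module level returns the live module namespace, so every
# component variable defined above can be looked up by its AppParameters
# field name.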
|
|
|
|
|
all_settings_components = [ui_component_map[key] for key in ALL_PARAM_KEYS] |
|
|
|
|
|
|
|
s8bit_ui_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_')] |
|
s8bit_ui_components = [ui_component_map[key] for key in s8bit_ui_keys] |
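
# The preset selector itself is excluded below so that applying a preset
# never overwrites the selector, which could retrigger its own .change
# listener.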
|
|
|
|
|
s8bit_control_components = [comp for comp in s8bit_ui_components if comp != s8bit_preset_selector] |
|
|
|
|
|
basic_pitch_keys = ['onset_threshold', 'frame_threshold', 'minimum_note_length', 'minimum_frequency', 'maximum_frequency', |
|
'infer_onsets', 'melodia_trick', 'multiple_pitch_bends'] |
|
basic_pitch_ui_components = [ui_component_map[key] for key in basic_pitch_keys] |
|
|
|
|
|
single_file_inputs = [input_file] + all_settings_components |
|
result_outputs = [output_midi_md5, output_midi_title, output_midi_summary, output_midi, output_audio, output_plot, output_song_description] |
|
|
|
single_file_outputs = result_outputs + s8bit_ui_components |
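
# single_file_outputs also includes the 8-bit synth controls, presumably so
# that a run using the 'Auto-Recommend (Analyze MIDI)' preset can write the
# analyzed settings back into the UI.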
|
|
|
batch_inputs = [batch_input_files] + all_settings_components |
|
batch_outputs = [batch_output_audio_files, batch_output_midi_files] |
|
|
|
|
|
submit_btn.click( |
|
fn=process_and_render_file, |
|
inputs=single_file_inputs, |
|
outputs=single_file_outputs |
|
) |
|
|
|
batch_process_btn.click( |
|
fn=batch_process_files, |
|
inputs=batch_inputs, |
|
outputs=batch_outputs |
|
) |
|
|
|
|
|
|
|
separate_vocals.change( |
|
fn=lambda x: gr.update(visible=x), |
|
inputs=separate_vocals, |
|
outputs=[separation_options_box] |
|
) |
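
# The same one-liner pattern toggles every optional settings group below:
#
#     checkbox.change(fn=lambda on: gr.update(visible=on),
#                     inputs=checkbox, outputs=group)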
|
|
|
|
|
enable_advanced_separation.change( |
|
fn=update_separation_mode_ui, |
|
inputs=enable_advanced_separation, |
|
outputs=[ |
|
advanced_separation_controls, |
|
transcribe_drums, |
|
transcribe_bass, |
|
transcribe_other_or_accompaniment, |
|
merge_drums_to_render, |
|
merge_bass_to_render, |
|
merge_other_or_accompaniment |
|
] |
|
) |
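
# update_separation_mode_ui returns a dict keyed by these same components,
# so only membership in this outputs list matters, not its order.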
|
|
|
|
|
transcription_method.change( |
|
fn=lambda x: gr.update(visible=(x == "General Purpose")), |
|
inputs=transcription_method, |
|
outputs=general_transcription_settings |
|
) |
|
soundfont_bank.change( |
|
fn=lambda x: gr.update(visible=(x == SYNTH_8_BIT_LABEL)), |
|
inputs=soundfont_bank, |
|
outputs=synth_8bit_settings |
|
) |
|
|
|
|
|
basic_pitch_preset_selector.change( |
|
fn=apply_basic_pitch_preset, |
|
inputs=basic_pitch_preset_selector, |
|
outputs=basic_pitch_ui_components |
|
) |
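
# Selecting a profile overwrites the eight Basic Pitch controls gathered in
# basic_pitch_ui_components; 'Custom' presumably leaves them unchanged.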
|
|
|
|
|
s8bit_preset_selector.change( |
|
fn=apply_8bit_preset, |
|
inputs=s8bit_preset_selector, |
|
outputs=s8bit_control_components |
|
) |
|
|
|
|
|
|
|
render_type.change( |
|
fn=update_advanced_midi_options_visibility, |
|
inputs=render_type, |
|
outputs=advanced_rendering_options |
|
).then( |
|
fn=update_render_type_description, |
|
inputs=render_type, |
|
outputs=render_type_info |
|
) |
|
|
|
s8bit_echo_sustain.change( |
|
fn=lambda x: gr.update(visible=x), |
|
inputs=s8bit_echo_sustain, |
|
outputs=echo_sustain_settings |
|
) |
|
|
|
preview_sf_button.click( |
|
fn=preview_sound_source, |
|
inputs=[soundfont_bank] + all_settings_components, |
|
outputs=[preview_sf_player] |
|
) |
|
|
|
s8bit_enable_anti_aliasing.change( |
|
fn=lambda x: gr.update(visible=x), |
|
inputs=s8bit_enable_anti_aliasing, |
|
outputs=anti_aliasing_settings_box |
|
) |
|
|
|
s8bit_enable_midi_preprocessing.change( |
|
fn=lambda x: gr.update(visible=x), |
|
inputs=s8bit_enable_midi_preprocessing, |
|
outputs=midi_preprocessing_settings_box |
|
) |
|
|
|
s8bit_enable_arpeggiator.change( |
|
fn=lambda x: gr.update(visible=x), |
|
inputs=s8bit_enable_arpeggiator, |
|
outputs=arpeggiator_settings_box |
|
) |
|
|
|
s8bit_enable_delay.change( |
|
fn=lambda x: gr.update(visible=x), |
|
inputs=s8bit_enable_delay, |
|
outputs=delay_settings_box |
|
) |
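
# queue() routes requests through Gradio's queue, which long-running
# transcription/render jobs need; debug=True surfaces server errors in the
# console.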
|
|
|
|
|
app.queue().launch(inbrowser=True, debug=True) |
|
|