import io |
|
import os |
|
import hashlib |
|
import time as reqtime |
|
import copy |
|
import shutil |
|
import librosa |
|
import pyloudnorm as pyln |
|
import soundfile as sf |
|
|
|
import torch |
|
import ffmpeg |
|
import gradio as gr |
|
from dataclasses import dataclass, fields |
|
|
|
|
|
import torchaudio |
|
from demucs.apply import apply_model |
|
from demucs.pretrained import get_model |
|
from demucs.audio import convert_audio |
|
|
|
from src.piano_transcription.utils import initialize_app |
|
from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate |
|
|
|
|
|
from src import TMIDIX, TPLOTS |
|
from src import MIDI |
|
from src.midi_to_colab_audio import midi_to_colab_audio |
|
|
|
|
|
import basic_pitch |
|
from basic_pitch.inference import predict |
|
from basic_pitch import ICASSP_2022_MODEL_PATH |
|
|
|
|
|
import pretty_midi |
|
import numpy as np |
|
from scipy import signal |
|
|
|
|
|
|
|
|
|
from huggingface_hub import hf_hub_download |
|
import glob |
|
|
|
|
|
SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)" |
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass |
|
class AppParameters: |
|
"""A dataclass to hold all configurable parameters for the application.""" |
|
|
|
|
|
|
|
input_file: str = None |
|
batch_input_files: list = None |
|
|
|
|
|
s8bit_preset_selector: str = "Custom" |
|
separate_vocals: bool = False |
|
remerge_vocals: bool = False |
|
transcription_target: str = "Transcribe Music (Accompaniment)" |
|
transcribe_both_stems: bool = False |
|
enable_stereo_processing: bool = False |
|
transcription_method: str = "General Purpose" |
|
|
|
|
|
onset_threshold: float = 0.5 |
|
frame_threshold: float = 0.3 |
|
minimum_note_length: int = 128 |
|
minimum_frequency: float = 60.0 |
|
maximum_frequency: float = 4000.0 |
|
infer_onsets: bool = True |
|
melodia_trick: bool = True |
|
multiple_pitch_bends: bool = False |
|
|
|
|
|
render_type: str = "Render as-is" |
|
soundfont_bank: str = "None (8-bit Synthesizer)" |
|
render_sample_rate: str = "44100" |
|
render_with_sustains: bool = True |
|
merge_misaligned_notes: int = -1 |
|
custom_render_patch: int = -1 |
|
render_align: str = "Do not align" |
|
render_transpose_value: int = 0 |
|
render_transpose_to_C4: bool = False |
|
render_output_as_solo_piano: bool = False |
|
render_remove_drums: bool = False |
|
|
|
|
|
s8bit_waveform_type: str = 'Square' |
|
s8bit_pulse_width: float = 0.5 |
|
s8bit_envelope_type: str = 'Plucky (AD Envelope)' |
|
s8bit_decay_time_s: float = 0.1 |
|
s8bit_vibrato_rate: float = 5.0 |
|
s8bit_vibrato_depth: float = 0.0 |
|
s8bit_bass_boost_level: float = 0.0 |
|
s8bit_smooth_notes_level: float = 0.0 |
|
s8bit_continuous_vibrato_level: float = 0.0 |
|
s8bit_noise_level: float = 0.0 |
|
s8bit_distortion_level: float = 0.0 |
|
s8bit_fm_modulation_depth: float = 0.0 |
|
s8bit_fm_modulation_rate: float = 0.0 |
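
    # Example (illustrative): the Gradio wrappers below build this object via
    # dict(zip(ALL_PARAM_KEYS, args)), where ALL_PARAM_KEYS is defined elsewhere
    # in this module; field defaults cover anything not supplied.
    #
    #   params = AppParameters(soundfont_bank=SYNTH_8_BIT_LABEL,
    #                          s8bit_waveform_type='Triangle')
    #   params.render_sample_rate  # -> "44100"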
|
|
|
|
|
|
|
|
|
|
|
|
|
def prepare_soundfonts(): |
|
""" |
|
Ensures a default set of SoundFonts are downloaded, then scans the 'src/sf2' |
|
directory recursively for all .sf2 files. |
|
Returns a dictionary mapping a user-friendly name to its full file path, with |
|
default soundfonts listed first in their specified order. |
|
|
|
Downloads soundfont files from the specified Hugging Face Space repository |
|
to a local 'src/sf2' directory if they don't already exist. |
|
Returns a list of local paths to the soundfont files. |
|
""" |
|
SF2_REPO_ID = "asigalov61/Advanced-MIDI-Renderer" |
|
SF2_DIR = "src/sf2" |
|
|
|
|
|
DEFAULT_SF2_FILENAMES = [ |
|
"SGM-v2.01-YamahaGrand-Guit-Bass-v2.7.sf2", |
|
"Orpheus_18.06.2020.sf2", |
|
"Live HQ Natural SoundFont GM.sf2", |
|
"Nice-Strings-PlusOrchestra-v1.6.sf2", |
|
"KBH-Real-Choir-V2.5.sf2", |
|
"SuperGameBoy.sf2", |
|
"ProtoSquare.sf2" |
|
] |
|
|
|
|
|
os.makedirs(SF2_DIR, exist_ok=True) |
|
|
|
|
|
print("Checking for SoundFont files...") |
|
for filename in DEFAULT_SF2_FILENAMES: |
|
local_path = os.path.join(SF2_DIR, filename) |
|
|
|
|
|
if not os.path.exists(local_path): |
|
print(f"Downloading '{filename}' from Hugging Face Hub...") |
|
try: |
|
|
|
|
|
hf_hub_download( |
|
repo_id=SF2_REPO_ID, |
|
repo_type='space', |
|
filename=f"{filename}", |
|
local_dir=SF2_DIR, |
|
|
|
) |
|
print(f"'{filename}' downloaded successfully.") |
|
except Exception as e: |
|
print(f"Error downloading {filename}: {e}") |
|
|
|
|
|
|
|
print(f"Scanning '{SF2_DIR}' for all .sf2 files...") |
|
all_sfs_map = {} |
|
|
|
search_pattern = os.path.join(SF2_DIR, '**', '*.sf2') |
|
for full_path in glob.glob(search_pattern, recursive=True): |
|
|
|
relative_path = os.path.relpath(full_path, SF2_DIR) |
|
display_name = os.path.splitext(relative_path)[0].replace("\\", "/") |
|
all_sfs_map[display_name] = full_path |
|
|
|
|
|
ordered_soundfont_map = {} |
|
|
|
|
|
default_display_names = [os.path.splitext(f)[0] for f in DEFAULT_SF2_FILENAMES] |
|
|
|
|
|
other_display_names = [name for name in all_sfs_map.keys() if name not in default_display_names] |
|
other_display_names.sort() |
|
|
|
|
|
for name in default_display_names: |
|
if name in all_sfs_map: |
|
ordered_soundfont_map[name] = all_sfs_map[name] |
|
|
|
|
|
for name in other_display_names: |
|
ordered_soundfont_map[name] = all_sfs_map[name] |
|
|
|
return ordered_soundfont_map |
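

# Example usage (illustrative):
#
#   soundfonts = prepare_soundfonts()            # {display name: path under src/sf2}
#   name, path = next(iter(soundfonts.items()))  # default SoundFonts come first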
|
|
|
|
|
|
|
|
|
def synthesize_8bit_style(*, midi_data: pretty_midi.PrettyMIDI, fs: int, params: AppParameters): |
|
""" |
|
Synthesizes an 8-bit style audio waveform from a PrettyMIDI object. |
|
This function generates waveforms manually instead of using a synthesizer like FluidSynth. |
|
Includes an optional sub-octave bass booster with adjustable level. |
|
Instruments are panned based on their order in the MIDI file. |
|
Instrument 1 -> Left, Instrument 2 -> Right. |
|
Now supports graded levels for smoothing and vibrato continuity. |
|
""" |
|
total_duration = midi_data.get_end_time() |
|
|
|
waveform = np.zeros((2, int(total_duration * fs) + fs)) |
|
|
|
num_instruments = len(midi_data.instruments) |
|
|
|
|
|
osc_phase = {} |
|
|
|
vibrato_phase = 0.0 |
|
|
|
for i, instrument in enumerate(midi_data.instruments): |
|
|
|
|
|
        # Default: equal-power center pan; in multi-instrument files the first
        # two instruments are panned hard left and hard right respectively.
        pan_l, pan_r = 0.707, 0.707
        if num_instruments >= 2:
            if i == 0:
                pan_l, pan_r = 1.0, 0.0
            elif i == 1:
                pan_l, pan_r = 0.0, 1.0
|
|
|
|
|
osc_phase[i] = 0.0 |
|
|
|
for note in instrument.notes: |
|
freq = pretty_midi.note_number_to_hz(note.pitch) |
|
note_duration = note.end - note.start |
|
num_samples = int(note_duration * fs) |
|
if num_samples <= 0: |
|
continue |
|
|
|
t = np.arange(num_samples) / fs |
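
            # Vibrato LFO: blend a per-note phase (restarts at zero on each note)
            # with a continuous phase (carried across notes via `vibrato_phase`),
            # weighted by s8bit_continuous_vibrato_level:
            # 0.0 = fully per-note, 1.0 = fully continuous.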
|
|
|
|
|
|
|
|
|
vib_phase_inc = 2 * np.pi * params.s8bit_vibrato_rate / fs |
|
per_note_vib_phase = 2 * np.pi * params.s8bit_vibrato_rate * t |
|
continuous_vib_phase = vibrato_phase + np.arange(num_samples) * vib_phase_inc |
|
|
|
|
|
final_vib_phase = ( |
|
per_note_vib_phase * (1 - params.s8bit_continuous_vibrato_level) + |
|
continuous_vib_phase * params.s8bit_continuous_vibrato_level |
|
) |
|
vibrato_lfo = params.s8bit_vibrato_depth * np.sin(final_vib_phase) |
|
|
|
|
|
if num_samples > 0: |
|
vibrato_phase = (continuous_vib_phase[-1] + vib_phase_inc) % (2 * np.pi) |
|
|
|
|
|
fm_lfo = params.s8bit_fm_modulation_depth * np.sin(2 * np.pi * params.s8bit_fm_modulation_rate * t) |
|
modulated_freq = freq * (1 + fm_lfo) |
|
|
|
|
|
phase_inc = 2 * np.pi * (modulated_freq + vibrato_lfo) / fs |
|
phase = osc_phase[i] + np.cumsum(phase_inc) |
|
if num_samples > 0: |
|
osc_phase[i] = phase[-1] % (2 * np.pi) |
|
|
|
if params.s8bit_waveform_type == 'Square': |
|
note_waveform = signal.square(phase, duty=params.s8bit_pulse_width) |
|
elif params.s8bit_waveform_type == 'Sawtooth': |
|
note_waveform = signal.sawtooth(phase) |
|
else: |
|
note_waveform = signal.sawtooth(phase, width=0.5) |
|
|
|
|
|
if params.s8bit_bass_boost_level > 0: |
|
bass_freq = freq / 2.0 |
|
|
|
if bass_freq > 20: |
|
|
|
bass_phase_inc = 2 * np.pi * bass_freq / fs |
|
bass_phase = np.cumsum(np.full(num_samples, bass_phase_inc)) |
|
bass_sub_waveform = signal.square(bass_phase, duty=0.5) |
|
|
|
|
|
main_level = 1.0 - (0.5 * params.s8bit_bass_boost_level) |
|
note_waveform = (note_waveform * main_level) + (bass_sub_waveform * params.s8bit_bass_boost_level) |
|
|
|
|
|
if params.s8bit_noise_level > 0: |
|
note_waveform += np.random.uniform(-1, 1, num_samples) * params.s8bit_noise_level |
|
|
|
|
|
if params.s8bit_distortion_level > 0: |
|
|
|
note_waveform = np.tanh(note_waveform * (1 + params.s8bit_distortion_level * 5)) |
|
|
|
|
|
start_amp = note.velocity / 127.0 |
|
envelope = np.zeros(num_samples) |
|
|
|
if params.s8bit_envelope_type == 'Plucky (AD Envelope)': |
|
attack_samples = min(int(0.005 * fs), num_samples) |
|
decay_samples = min(int(params.s8bit_decay_time_s * fs), num_samples - attack_samples) |
|
|
|
envelope[:attack_samples] = np.linspace(0, start_amp, attack_samples) |
|
if decay_samples > 0: |
|
envelope[attack_samples:attack_samples+decay_samples] = np.linspace(start_amp, 0, decay_samples) |
|
else: |
|
envelope = np.linspace(start_amp, 0, num_samples) |
|
|
|
|
|
|
|
if params.s8bit_smooth_notes_level > 0 and num_samples > 10: |
|
fade_length = int(fs * 0.01 * params.s8bit_smooth_notes_level) |
|
fade_samples = min(fade_length, num_samples // 2) |
|
if fade_samples > 0: |
|
envelope[:fade_samples] *= np.linspace(0.5, 1.0, fade_samples) |
|
envelope[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples) |
|
|
|
|
|
note_waveform *= envelope |
|
|
|
start_sample = int(note.start * fs) |
|
end_sample = start_sample + num_samples |
|
if end_sample > waveform.shape[1]: |
|
end_sample = waveform.shape[1] |
|
note_waveform = note_waveform[:end_sample-start_sample] |
|
|
|
|
|
waveform[0, start_sample:end_sample] += note_waveform * pan_l |
|
waveform[1, start_sample:end_sample] += note_waveform * pan_r |
|
|
|
return waveform |
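

# Minimal usage sketch (illustrative; assumes the defaults in AppParameters):
#
#   pm = pretty_midi.PrettyMIDI("song.mid")
#   audio = synthesize_8bit_style(midi_data=pm, fs=44100, params=AppParameters())
#   peak = np.max(np.abs(audio))
#   if peak > 0:
#       audio /= peak  # peak-normalize before int16 conversion
#   sf.write("song_8bit.flac", (audio.T * 32767).astype(np.int16), 44100)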
|
|
|
|
|
def analyze_midi_velocity(midi_path): |
|
midi = pretty_midi.PrettyMIDI(midi_path) |
|
all_velocities = [] |
|
|
|
print(f"Analyzing velocity for MIDI: {midi_path}") |
|
for i, instrument in enumerate(midi.instruments): |
|
velocities = [note.velocity for note in instrument.notes] |
|
all_velocities.extend(velocities) |
|
|
|
if velocities: |
|
print(f"Instrument {i} ({instrument.name}):") |
|
print(f" Notes count: {len(velocities)}") |
|
print(f" Velocity min: {min(velocities)}") |
|
print(f" Velocity max: {max(velocities)}") |
|
print(f" Velocity mean: {np.mean(velocities):.2f}") |
|
else: |
|
print(f"Instrument {i} ({instrument.name}): no notes found.") |
|
|
|
if all_velocities: |
|
print("\nOverall MIDI velocity stats:") |
|
print(f" Total notes: {len(all_velocities)}") |
|
print(f" Velocity min: {min(all_velocities)}") |
|
print(f" Velocity max: {max(all_velocities)}") |
|
print(f" Velocity mean: {np.mean(all_velocities):.2f}") |
|
else: |
|
print("No notes found in this MIDI.") |
|
|
|
|
|
def scale_instrument_velocity(instrument, scale=0.8): |
|
for note in instrument.notes: |
|
note.velocity = max(1, min(127, int(note.velocity * scale))) |
|
|
|
|
|
def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0): |
|
""" |
|
Normalizes the audio data to a target integrated loudness (LUFS). |
|
This provides more consistent perceived volume than peak normalization. |
|
|
|
Args: |
|
audio_data (np.ndarray): The audio signal. |
|
sample_rate (int): The sample rate of the audio. |
|
target_lufs (float): The target loudness in LUFS. Defaults to -23.0, |
|
a common standard for broadcast. |
|
|
|
Returns: |
|
np.ndarray: The loudness-normalized audio data. |
|
""" |
|
try: |
|
|
|
meter = pyln.Meter(sample_rate) |
|
loudness = meter.integrated_loudness(audio_data) |
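
        # The required gain is the dB difference between the target and the measured
        # loudness, converted to linear: gain_linear = 10 ** (gain_db / 20).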
|
|
|
|
|
|
|
loudness_gain_db = target_lufs - loudness |
|
loudness_gain_linear = 10.0 ** (loudness_gain_db / 20.0) |
|
|
|
|
|
normalized_audio = audio_data * loudness_gain_linear |
|
|
|
|
|
|
|
peak_val = np.max(np.abs(normalized_audio)) |
|
if peak_val > 1.0: |
|
normalized_audio /= peak_val |
|
print(f"Warning: Loudness normalization resulted in clipping. Audio was peak-normalized as a safeguard.") |
|
|
|
print(f"Audio normalized from {loudness:.2f} LUFS to target {target_lufs} LUFS.") |
|
return normalized_audio |
|
|
|
except Exception as e: |
|
print(f"Loudness normalization failed: {e}. Falling back to original audio.") |
|
return audio_data |
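

# Example (illustrative): bring a mono signal to the -23 LUFS broadcast target.
#
#   y, sr = librosa.load("input.flac", sr=None, mono=True)
#   y_norm = normalize_loudness(y, sr, target_lufs=-23.0)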
|
|
|
|
|
|
|
|
|
|
|
def merge_midis(midi_path_left, midi_path_right, output_path): |
|
""" |
|
Merges two MIDI files into a single MIDI file. This robust version iterates |
|
through ALL instruments in both MIDI files, ensuring no data is lost if the |
|
source files are multi-instrumental. |
|
|
|
It applies hard-left panning (Pan=0) to every instrument from the left MIDI |
|
and hard-right panning (Pan=127) to every instrument from the right MIDI. |
|
""" |
|
try: |
|
analyze_midi_velocity(midi_path_left) |
|
analyze_midi_velocity(midi_path_right) |
|
midi_left = pretty_midi.PrettyMIDI(midi_path_left) |
|
midi_right = pretty_midi.PrettyMIDI(midi_path_right) |
|
|
|
merged_midi = pretty_midi.PrettyMIDI() |
|
|
|
|
|
if midi_left.instruments: |
|
print(f"Found {len(midi_left.instruments)} instrument(s) in the left channel MIDI.") |
|
|
|
for instrument in midi_left.instruments: |
|
scale_instrument_velocity(instrument, scale=0.8) |
|
|
|
instrument.name = f"Left - {instrument.name if instrument.name else 'Instrument'}" |
|
|
|
|
|
|
|
|
|
|
|
pan_left = pretty_midi.ControlChange(number=10, value=0, time=0.0) |
|
|
|
instrument.control_changes.insert(0, pan_left) |
|
|
|
|
|
merged_midi.instruments.append(instrument) |
|
|
|
|
|
if midi_right.instruments: |
|
print(f"Found {len(midi_right.instruments)} instrument(s) in the right channel MIDI.") |
|
|
|
for instrument in midi_right.instruments: |
|
scale_instrument_velocity(instrument, scale=0.8) |
|
instrument.name = f"Right - {instrument.name if instrument.name else 'Instrument'}" |
|
|
|
|
|
|
|
|
|
|
|
pan_right = pretty_midi.ControlChange(number=10, value=127, time=0.0) |
|
instrument.control_changes.insert(0, pan_right) |
|
|
|
merged_midi.instruments.append(instrument) |
|
|
|
merged_midi.write(output_path) |
|
print(f"Successfully merged all instruments and panned into '{os.path.basename(output_path)}'") |
|
analyze_midi_velocity(output_path) |
|
return output_path |
|
|
|
except Exception as e: |
|
print(f"Error merging MIDI files: {e}") |
|
|
|
if os.path.exists(midi_path_left): |
|
print("Fallback: Using only the left channel MIDI.") |
|
return midi_path_left |
|
return None |
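

# Example (illustrative): merge per-channel transcriptions into one stereo MIDI.
#
#   stereo_mid = merge_midis("left.mid", "right.mid", "merged_stereo.mid")
#   is_stereo_midi(stereo_mid)  # -> True when both hard pans were written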
|
|
|
|
|
def is_stereo_midi(midi_path: str) -> bool: |
|
""" |
|
Checks if a MIDI file contains the specific stereo panning control changes |
|
(hard left and hard right) created by the merge_midis function. |
|
|
|
Args: |
|
midi_path (str): The file path to the MIDI file. |
|
|
|
Returns: |
|
bool: True if both hard-left (0) and hard-right (127) pan controls are found, False otherwise. |
|
""" |
|
try: |
|
midi_data = pretty_midi.PrettyMIDI(midi_path) |
|
|
|
found_left_pan = False |
|
found_right_pan = False |
|
|
|
for instrument in midi_data.instruments: |
|
for control_change in instrument.control_changes: |
|
|
|
if control_change.number == 10: |
|
if control_change.value == 0: |
|
found_left_pan = True |
|
elif control_change.value == 127: |
|
found_right_pan = True |
|
|
|
|
|
        # True only when both hard-left (0) and hard-right (127) pans were found.
        return found_left_pan and found_right_pan
|
|
|
except Exception as e: |
|
|
|
print(f"Could not analyze MIDI for stereo info: {e}") |
|
return False |
|
|
|
|
|
|
|
|
|
|
|
|
|
def TranscribePianoAudio(input_file): |
|
""" |
|
Transcribes a WAV or MP3 audio file of a SOLO PIANO performance into a MIDI file. |
|
This uses the ByteDance model. |
|
Args: |
|
input_file_path (str): The path to the input audio file. |
|
Returns: |
|
str: The file path of the generated MIDI file. |
|
""" |
|
print('=' * 70) |
|
print('STAGE 1: Starting Piano-Specific Transcription') |
|
print('=' * 70) |
|
|
|
|
|
fn = os.path.basename(input_file) |
|
    fn1 = os.path.splitext(fn)[0]
|
|
|
|
|
output_dir = os.path.join("output", "transcribed_piano_") |
|
out_mid_path = os.path.join(output_dir, fn1 + '.mid') |
|
|
|
|
|
if not os.path.exists(output_dir): |
|
os.makedirs(output_dir) |
|
|
|
print('-' * 70) |
|
print(f'Input file name: {fn}') |
|
print(f'Output MIDI path: {out_mid_path}') |
|
print('-' * 70) |
|
|
|
|
|
print('Loading audio...') |
|
(audio, _) = utilities.load_audio(input_file, sr=transcription_sample_rate, mono=True) |
|
print('Audio loaded successfully.') |
|
print('-' * 70) |
|
|
|
|
|
|
|
device = 'cuda' if torch.cuda.is_available() else 'cpu' |
|
print(f'Loading transcriptor model... device= {device}') |
|
transcriptor = PianoTranscription(device=device, checkpoint_path="src/models/CRNN_note_F1=0.9677_pedal_F1=0.9186.pth") |
|
print('Transcriptor loaded.') |
|
print('-' * 70) |
|
|
|
|
|
print('Transcribing audio to MIDI (Piano-Specific)...') |
|
|
|
transcriptor.transcribe(audio, out_mid_path) |
|
print('Piano transcription complete.') |
|
print('=' * 70) |
|
|
|
|
|
return out_mid_path |
|
|
|
def TranscribeGeneralAudio(input_file, onset_threshold, frame_threshold, minimum_note_length, minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_bends): |
|
""" |
|
Transcribes a general audio file into a MIDI file using basic-pitch. |
|
This is suitable for various instruments and vocals. |
|
""" |
|
print('=' * 70) |
|
print('STAGE 1: Starting General Purpose Transcription') |
|
print('=' * 70) |
|
|
|
fn = os.path.basename(input_file) |
|
    fn1 = os.path.splitext(fn)[0]
|
output_dir = os.path.join("output", "transcribed_general_") |
|
out_mid_path = os.path.join(output_dir, fn1 + '.mid') |
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
print(f'Input file: {fn}\nOutput MIDI: {out_mid_path}') |
|
|
|
|
|
print('Transcribing audio to MIDI (General Purpose)...') |
|
|
|
    model_output, midi_data, note_events = predict(
|
audio_path=input_file, |
|
model_or_model_path=ICASSP_2022_MODEL_PATH, |
|
onset_threshold=onset_threshold, |
|
frame_threshold=frame_threshold, |
|
minimum_note_length=minimum_note_length, |
|
minimum_frequency=minimum_frequency, |
|
maximum_frequency=maximum_frequency, |
|
infer_onsets=infer_onsets, |
|
melodia_trick=melodia_trick, |
|
multiple_pitch_bends=multiple_bends |
|
) |
|
|
|
|
|
midi_data.write(out_mid_path) |
|
print('General transcription complete.') |
|
print('=' * 70) |
|
|
|
return out_mid_path |
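

# Example (illustrative): a vocals-oriented pass might narrow the frequency range;
# the values below are hypothetical starting points, not tuned settings.
#
#   TranscribeGeneralAudio("vocals.flac",
#                          onset_threshold=0.6, frame_threshold=0.3,
#                          minimum_note_length=100,
#                          minimum_frequency=80.0, maximum_frequency=1200.0,
#                          infer_onsets=True, melodia_trick=True,
#                          multiple_bends=False)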
|
|
|
|
|
|
|
|
|
|
|
def Render_MIDI(*, input_midi_path: str, params: AppParameters): |
|
""" |
|
Processes and renders a MIDI file according to user-defined settings. |
|
Can render using SoundFonts or a custom 8-bit synthesizer. |
|
Args: |
|
input_midi_path (str): The path to the input MIDI file. |
|
All other arguments are rendering options from the Gradio UI. |
|
Returns: |
|
A tuple containing all the output elements for the Gradio UI. |
|
""" |
|
print('*' * 70) |
|
print('STAGE 2: Starting MIDI Rendering') |
|
print('*' * 70) |
|
|
|
|
|
fn = os.path.basename(input_midi_path) |
|
    fn1 = os.path.splitext(fn)[0]
|
|
|
|
|
output_dir = os.path.join("output", "rendered_midi") |
|
if not os.path.exists(output_dir): |
|
os.makedirs(output_dir) |
|
|
|
|
|
new_fn_path = os.path.join(output_dir, fn1 + '_rendered.mid') |
|
|
|
try: |
|
with open(input_midi_path, 'rb') as f: |
|
fdata = f.read() |
|
input_midi_md5hash = hashlib.md5(fdata).hexdigest() |
|
except FileNotFoundError: |
|
|
|
print(f"Error: Input MIDI file not found at {input_midi_path}") |
|
return [None] * 7 |
|
|
|
print('=' * 70) |
|
print('Requested settings:') |
|
print(f'Input MIDI file name: {fn}') |
|
print(f'Input MIDI md5 hash: {input_midi_md5hash}') |
|
print('-' * 70) |
|
print(f"Render type: {params.render_type}") |
|
print(f"Soundfont bank: {params.soundfont_bank}") |
|
print(f"Audio render sample rate: {params.render_sample_rate}") |
|
|
|
print('=' * 70) |
|
|
|
|
|
print('Processing MIDI... Please wait...') |
|
raw_score = MIDI.midi2single_track_ms_score(fdata) |
|
|
|
processed_scores = TMIDIX.advanced_score_processor(raw_score, |
|
return_enhanced_score_notes=True, |
|
apply_sustain=params.render_with_sustains) |
|
|
|
|
|
if not processed_scores: |
|
|
|
print("Warning: MIDI file contains no processable notes.") |
|
|
|
|
|
return ("N/A", fn1, "MIDI file contains no notes.", None, None, None, "No notes found.") |
|
|
|
|
|
escore = processed_scores[0] |
|
|
|
|
|
if not escore: |
|
print("Warning: MIDI file contains no processable notes.") |
|
return ("N/A", fn1, "MIDI file contains no notes.",None, None, None, "No notes found.") |
|
|
|
|
|
if params.merge_misaligned_notes > 0: |
|
escore = TMIDIX.merge_escore_notes(escore, merge_threshold=params.merge_misaligned_notes) |
|
|
|
escore = TMIDIX.augment_enhanced_score_notes(escore, timings_divider=1) |
|
|
|
first_note_index = [e[0] for e in raw_score[1]].index('note') |
|
cscore = TMIDIX.chordify_score([1000, escore]) |
|
|
|
meta_data = raw_score[1][:first_note_index] + [escore[0]] + [escore[-1]] + [raw_score[1][-1]] |
|
|
|
aux_escore_notes = TMIDIX.augment_enhanced_score_notes(escore, sort_drums_last=True) |
|
song_description = TMIDIX.escore_notes_to_text_description(aux_escore_notes) |
|
|
|
print('Done!') |
|
print('=' * 70) |
|
print('Input MIDI metadata:', meta_data[:5]) |
|
print('=' * 70) |
|
print('Input MIDI song description:', song_description) |
|
print('=' * 70) |
|
print('Processing...Please wait...') |
|
|
|
|
|
output_score = copy.deepcopy(escore) |
|
|
|
|
|
if params.render_type == "Extract melody": |
|
output_score = TMIDIX.add_melody_to_enhanced_score_notes(escore, return_melody=True) |
|
output_score = TMIDIX.recalculate_score_timings(output_score) |
|
elif params.render_type == "Flip": |
|
output_score = TMIDIX.flip_enhanced_score_notes(escore) |
|
elif params.render_type == "Reverse": |
|
output_score = TMIDIX.reverse_enhanced_score_notes(escore) |
|
elif params.render_type == 'Repair Durations': |
|
output_score = TMIDIX.fix_escore_notes_durations(escore, min_notes_gap=0) |
|
elif params.render_type == 'Repair Chords': |
|
fixed_cscore = TMIDIX.advanced_check_and_fix_chords_in_chordified_score(cscore)[0] |
|
output_score = TMIDIX.flatten(fixed_cscore) |
|
elif params.render_type == 'Remove Duplicate Pitches': |
|
output_score = TMIDIX.remove_duplicate_pitches_from_escore_notes(escore) |
|
elif params.render_type == "Add Drum Track": |
|
nd_escore = [e for e in escore if e[3] != 9] |
|
nd_escore = TMIDIX.augment_enhanced_score_notes(nd_escore) |
|
output_score = TMIDIX.advanced_add_drums_to_escore_notes(nd_escore) |
|
|
|
for e in output_score: |
|
e[1] *= 16 |
|
e[2] *= 16 |
|
|
|
print('MIDI processing complete.') |
|
print('=' * 70) |
|
|
|
|
|
if params.render_type != "Render as-is": |
|
print('Applying final adjustments (transpose, align, patch)...') |
|
if params.custom_render_patch != -1: |
|
for e in output_score: |
|
if e[3] != 9: |
|
e[6] = params.custom_render_patch |
|
|
|
if params.render_transpose_value != 0: |
|
output_score = TMIDIX.transpose_escore_notes(output_score, params.render_transpose_value) |
|
|
|
if params.render_transpose_to_C4: |
|
output_score = TMIDIX.transpose_escore_notes_to_pitch(output_score, 60) |
|
|
|
if params.render_align == "Start Times": |
|
output_score = TMIDIX.recalculate_score_timings(output_score) |
|
output_score = TMIDIX.align_escore_notes_to_bars(output_score) |
|
|
|
elif params.render_align == "Start Times and Durations": |
|
output_score = TMIDIX.recalculate_score_timings(output_score) |
|
output_score = TMIDIX.align_escore_notes_to_bars(output_score, trim_durations=True) |
|
|
|
elif params.render_align == "Start Times and Split Durations": |
|
output_score = TMIDIX.recalculate_score_timings(output_score) |
|
output_score = TMIDIX.align_escore_notes_to_bars(output_score, split_durations=True) |
|
|
|
if params.render_type == "Longest Repeating Phrase": |
|
zscore = TMIDIX.recalculate_score_timings(output_score) |
|
lrno_score = TMIDIX.escore_notes_lrno_pattern_fast(zscore) |
|
|
|
if lrno_score is not None: |
|
output_score = lrno_score |
|
|
|
else: |
|
output_score = TMIDIX.recalculate_score_timings(TMIDIX.escore_notes_middle(output_score, 50)) |
|
|
|
if params.render_type == "Multi-Instrumental Summary": |
|
zscore = TMIDIX.recalculate_score_timings(output_score) |
|
c_escore_notes = TMIDIX.compress_patches_in_escore_notes_chords(zscore) |
|
|
|
if len(c_escore_notes) > 128: |
|
cmatrix = TMIDIX.escore_notes_to_image_matrix(c_escore_notes, filter_out_zero_rows=True, filter_out_duplicate_rows=True) |
|
smatrix = TPLOTS.square_image_matrix(cmatrix, num_pca_components=max(1, min(5, len(c_escore_notes) // 128))) |
|
output_score = TMIDIX.image_matrix_to_original_escore_notes(smatrix) |
|
|
|
for o in output_score: |
|
o[1] *= 250 |
|
o[2] *= 250 |
|
|
|
if params.render_output_as_solo_piano: |
|
output_score = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=(not params.render_remove_drums)) |
|
|
|
if params.render_remove_drums and not params.render_output_as_solo_piano: |
|
output_score = TMIDIX.strip_drums_from_escore_notes(output_score) |
|
|
|
if params.render_type == "Solo Piano Summary": |
|
sp_escore_notes = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=False) |
|
zscore = TMIDIX.recalculate_score_timings(sp_escore_notes) |
|
|
|
if len(zscore) > 128: |
|
|
|
bmatrix = TMIDIX.escore_notes_to_binary_matrix(zscore) |
|
cmatrix = TMIDIX.compress_binary_matrix(bmatrix, only_compress_zeros=True) |
|
smatrix = TPLOTS.square_binary_matrix(cmatrix, interpolation_order=max(1, min(5, len(zscore) // 128))) |
|
output_score = TMIDIX.binary_matrix_to_original_escore_notes(smatrix) |
|
|
|
for o in output_score: |
|
o[1] *= 200 |
|
o[2] *= 200 |
|
|
|
print('Final adjustments complete.') |
|
print('=' * 70) |
|
|
|
|
|
|
|
SONG, patches, _ = TMIDIX.patch_enhanced_score_notes(output_score) |
|
|
|
|
|
|
|
path_without_ext = new_fn_path.rsplit('.mid', 1)[0] |
|
|
|
MIDI.Tegridy_ms_SONG_to_MIDI_Converter(SONG, |
|
output_signature = 'Integrated-MIDI-Processor', |
|
output_file_name = path_without_ext, |
|
track_name='Processed Track', |
|
list_of_MIDI_patches=patches |
|
) |
|
midi_to_render_path = new_fn_path |
|
else: |
|
|
|
with open(new_fn_path, 'wb') as f: |
|
f.write(fdata) |
|
midi_to_render_path = new_fn_path |
|
|
|
|
|
print('Rendering final audio...') |
|
|
|
|
|
srate = int(params.render_sample_rate) |
|
|
|
|
|
if params.soundfont_bank == SYNTH_8_BIT_LABEL: |
|
print("Using 8-bit style synthesizer...") |
|
try: |
|
|
|
midi_data_for_synth = pretty_midi.PrettyMIDI(midi_to_render_path) |
|
|
|
|
|
audio = synthesize_8bit_style(midi_data=midi_data_for_synth, fs=srate, params=params) |
|
|
|
peak_val = np.max(np.abs(audio)) |
|
if peak_val > 0: |
|
audio /= peak_val |
|
|
|
audio_out = (audio.T * 32767).astype(np.int16) |
|
except Exception as e: |
|
print(f"Error during 8-bit synthesis: {e}") |
|
return [None] * 7 |
|
else: |
|
print(f"Using SoundFont: {params.soundfont_bank}") |
|
|
|
soundfont_path = soundfonts_dict.get(params.soundfont_bank) |
|
|
|
|
|
if not soundfont_path or not os.path.exists(soundfont_path): |
|
|
|
raise gr.Error(f"SoundFont file '{params.soundfont_bank}' could not be found. Please check your 'src/sf2' directory or select another SoundFont.") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with open(midi_to_render_path, 'rb') as f: |
|
midi_file_content = f.read() |
|
|
|
audio_out = midi_to_colab_audio(midi_file_content, |
|
soundfont_path=soundfont_path, |
|
sample_rate=srate, |
|
output_for_gradio=True |
|
) |
|
|
|
print('Audio rendering complete.') |
|
print('=' * 70) |
|
|
|
|
|
with open(midi_to_render_path, 'rb') as f: |
|
new_md5_hash = hashlib.md5(f.read()).hexdigest() |
|
output_plot = TPLOTS.plot_ms_SONG(output_score, plot_title=f"Score of {fn1}", return_plt=True) |
|
|
|
output_midi_summary = str(meta_data) |
|
|
|
return new_md5_hash, fn1, output_midi_summary, midi_to_render_path, (srate, audio_out), output_plot, song_description |
|
|
|
|
|
def analyze_midi_features(midi_data): |
|
""" |
|
Analyzes a PrettyMIDI object to extract musical features for parameter recommendation. |
|
|
|
Args: |
|
midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze. |
|
|
|
Returns: |
|
dict or None: A dictionary containing features, or None if the MIDI is empty. |
|
Features: 'note_count', 'instruments_count', 'duration', |
|
'note_density', 'avg_velocity', 'pitch_range'. |
|
""" |
|
all_notes = [note for instrument in midi_data.instruments for note in instrument.notes] |
|
note_count = len(all_notes) |
|
|
|
|
|
if note_count == 0: |
|
return None |
|
|
|
duration = midi_data.get_end_time() |
|
|
|
if duration == 0: |
|
note_density = 0 |
|
else: |
|
note_density = note_count / duration |
|
|
|
|
|
avg_velocity = sum(note.velocity for note in all_notes) / note_count |
|
avg_pitch = sum(note.pitch for note in all_notes) / note_count |
|
avg_note_length = sum(note.end - note.start for note in all_notes) / note_count |
|
|
|
|
|
if note_count > 1: |
|
min_pitch = min(note.pitch for note in all_notes) |
|
max_pitch = max(note.pitch for note in all_notes) |
|
pitch_range = max_pitch - min_pitch |
|
else: |
|
pitch_range = 0 |
|
|
|
return { |
|
'note_count': note_count, |
|
'instruments_count': len(midi_data.instruments), |
|
'duration': duration, |
|
'note_density': note_density, |
|
'avg_velocity': avg_velocity, |
|
'pitch_range': pitch_range, |
|
'avg_pitch': avg_pitch, |
|
'avg_note_length': avg_note_length, |
|
} |
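

# Example (illustrative):
#
#   feats = analyze_midi_features(pretty_midi.PrettyMIDI("song.mid"))
#   if feats:
#       print(feats['note_density'], feats['pitch_range'])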
|
|
|
def determine_waveform_type(features): |
|
""" |
|
Determines the best waveform type based on analyzed MIDI features. |
|
- Square: Best for most general-purpose, bright melodies. |
|
- Sawtooth: Best for intense, heavy, or powerful leads and basses. |
|
- Triangle: Best for soft, gentle basses or flute-like sounds. |
|
|
|
Args: |
|
features (dict): The dictionary of features from analyze_midi_features. |
|
|
|
Returns: |
|
str: The recommended waveform type ('Square', 'Sawtooth', or 'Triangle'). |
|
""" |
|
|
|
|
|
|
|
if features['avg_pitch'] <= 52 and features['avg_note_length'] >= 0.3 and features['pitch_range'] < 12: |
|
return "Triangle" |
|
|
|
|
|
|
|
|
|
if features['note_density'] >= 6 or features['pitch_range'] >= 18: |
|
return "Sawtooth" |
|
|
|
|
|
return "Square" |
|
|
|
def recommend_8bit_params(midi_data, default_preset): |
|
""" |
|
Recommends 8-bit synthesizer parameters using a unified, factor-based model. |
|
This "AI" generates a sound profile based on normalized musical features. |
|
|
|
Args: |
|
midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze. |
|
default_preset (dict): A fallback preset if analysis fails. |
|
|
|
Returns: |
|
dict: A dictionary of recommended synthesizer parameters. |
|
""" |
|
features = analyze_midi_features(midi_data) |
|
if features is None: |
|
|
|
return default_preset |
|
|
|
|
|
params = {} |
|
|
|
|
|
|
|
params['waveform_type'] = determine_waveform_type(features) |
|
|
|
|
|
if params['waveform_type'] == 'Square': |
|
|
|
|
|
|
|
params['pulse_width'] = 0.3 if features['pitch_range'] > 30 else 0.5 |
|
else: |
|
|
|
params['pulse_width'] = 0.5 |
|
|
|
|
|
|
|
is_plucky = features['note_density'] > 10 |
|
params['envelope_type'] = 'Plucky (AD Envelope)' if is_plucky else 'Sustained (Full Decay)' |
|
params['decay_time_s'] = 0.15 if is_plucky else 0.4 |
|
|
|
|
|
|
|
params['vibrato_depth'] = min(max((features['avg_velocity'] - 60) / 20, 0), 10) |
|
if features['note_density'] > 12: |
|
params['vibrato_rate'] = 7.0 |
|
elif features['note_density'] > 6: |
|
params['vibrato_rate'] = 5.0 |
|
else: |
|
params['vibrato_rate'] = 3.0 |
|
|
|
|
|
|
|
|
|
|
|
params['smooth_notes_level'] = min(max((features['note_density'] - 3) / 5.0, 0.0), 1.0) |
|
|
|
|
|
|
|
params['continuous_vibrato_level'] = 1.0 - min(max((features['note_density'] - 5) / 5.0, 0.0), 1.0) |
|
|
|
|
|
|
|
params['noise_level'] = min(max((features['avg_velocity'] - 50) / 40.0, 0.0), 1.0) * 0.1 |
|
|
|
|
|
|
|
if features['avg_note_length'] < 0.25: |
|
params['distortion_level'] = 0.1 |
|
elif features['avg_note_length'] < 0.5: |
|
params['distortion_level'] = 0.05 |
|
else: |
|
params['distortion_level'] = 0.0 |
|
|
|
|
|
|
|
density_factor = min(max((features['note_density'] - 5) / 15, 0), 1) |
|
range_factor = min(max((features['pitch_range'] - 15) / 30, 0), 1) |
|
|
|
|
|
complexity_factor = (density_factor + range_factor) / 2 |
|
params['fm_modulation_depth'] = round(0.3 * complexity_factor, 3) |
|
params['fm_modulation_rate'] = round(200 * complexity_factor, 1) |
|
|
|
|
|
|
|
|
|
|
|
|
|
params['bass_boost_level'] = max(0.2, 1.0 - (features['instruments_count'] - 1) * 0.15) |
|
|
|
|
|
for key, value in params.items(): |
|
if isinstance(value, float): |
|
params[key] = round(value, 3) |
|
|
|
return params |
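

# Example (illustrative), assuming S8BIT_PRESETS and FALLBACK_PRESET_NAME are in
# scope (they are defined in the __main__ block below):
#
#   params = AppParameters()
#   pm = pretty_midi.PrettyMIDI("song.mid")
#   rec = recommend_8bit_params(pm, S8BIT_PRESETS[FALLBACK_PRESET_NAME])
#   for key, value in rec.items():
#       setattr(params, f"s8bit_{key}", value)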
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: AppParameters): |
|
""" |
|
Takes a single audio file path and runs the full transcription pipeline on it. |
|
This includes stereo/mono handling and normalization. |
|
Returns the file path of the resulting transcribed MIDI. |
|
""" |
|
print(f"\n--- Transcribing Stem: {os.path.basename(audio_path)} ---") |
|
|
|
|
|
audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False) |
|
|
|
if params.enable_stereo_processing and audio_data.ndim == 2 and audio_data.shape[0] == 2: |
|
print("Stereo processing enabled for stem.") |
|
left_channel_np = audio_data[0] |
|
right_channel_np = audio_data[1] |
|
|
|
normalized_left = normalize_loudness(left_channel_np, native_sample_rate) |
|
normalized_right = normalize_loudness(right_channel_np, native_sample_rate) |
|
|
|
temp_left_path = os.path.join(temp_dir, f"{base_name}_left.flac") |
|
temp_right_path = os.path.join(temp_dir, f"{base_name}_right.flac") |
|
|
|
sf.write(temp_left_path, normalized_left, native_sample_rate) |
|
sf.write(temp_right_path, normalized_right, native_sample_rate) |
|
|
|
print(f"Saved left channel to: {temp_left_path}") |
|
print(f"Saved right channel to: {temp_right_path}") |
|
|
|
print("Transcribing left and right channel...") |
|
if params.transcription_method == "General Purpose": |
|
midi_path_left = TranscribeGeneralAudio(temp_left_path, params.onset_threshold, params.frame_threshold, params.minimum_note_length, params.minimum_frequency, params.maximum_frequency, params.infer_onsets, params.melodia_trick, params.multiple_pitch_bends) |
|
midi_path_right = TranscribeGeneralAudio(temp_right_path, params.onset_threshold, params.frame_threshold, params.minimum_note_length, params.minimum_frequency, params.maximum_frequency, params.infer_onsets, params.melodia_trick, params.multiple_pitch_bends) |
|
else: |
|
midi_path_left = TranscribePianoAudio(temp_left_path) |
|
midi_path_right = TranscribePianoAudio(temp_right_path) |
|
|
|
if midi_path_left and midi_path_right: |
|
merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid") |
|
return merge_midis(midi_path_left, midi_path_right, merged_midi_path) |
|
elif midi_path_left: |
|
print("Warning: Right channel transcription failed. Using left channel only.") |
|
return midi_path_left |
|
elif midi_path_right: |
|
print("Warning: Left channel transcription failed. Using right channel only.") |
|
return midi_path_right |
|
else: |
|
print(f"Warning: Stereo transcription failed for stem {base_name}.") |
|
return None |
|
else: |
|
print("Mono processing for stem.") |
|
mono_signal_np = np.mean(audio_data, axis=0) if audio_data.ndim > 1 else audio_data |
|
normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate) |
|
temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.flac") |
|
sf.write(temp_mono_path, normalized_mono, native_sample_rate) |
|
|
|
if params.transcription_method == "General Purpose": |
|
return TranscribeGeneralAudio(temp_mono_path, params.onset_threshold, params.frame_threshold, params.minimum_note_length, params.minimum_frequency, params.maximum_frequency, params.infer_onsets, params.melodia_trick, params.multiple_pitch_bends) |
|
else: |
|
return TranscribePianoAudio(temp_mono_path) |
|
|
|
|
|
|
|
def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters, progress: gr.Progress = None): |
|
""" |
|
This is the main processing engine. It takes a file path and a dictionary of all settings, |
|
and performs the full pipeline: load, separate, transcribe, render, re-merge. |
|
It is UI-agnostic and returns file paths and data, not Gradio updates. |
|
It now accepts a Gradio Progress object to report granular progress. |
|
""" |
|
|
|
def update_progress(fraction, desc): |
|
if progress: |
|
progress(fraction, desc=desc) |
|
|
|
|
|
file_start_time = reqtime.time() |
|
|
|
filename = os.path.basename(input_file_path) |
|
base_name = os.path.splitext(filename)[0] |
|
|
|
|
|
is_midi_input = filename.lower().endswith(('.mid', '.midi', '.kar')) |
|
|
|
update_progress(0, f"Starting: {filename}") |
|
print(f"\n{'='*20} Starting Pipeline for: {filename} {'='*20}") |
|
|
|
|
|
timestamped_base_name = f"{base_name}_{timestamp}" |
|
|
|
|
|
other_part_tensor = None |
|
other_part_sr = None |
|
|
|
|
|
if is_midi_input: |
|
|
|
update_progress(0, "MIDI file detected, skipping transcription...") |
|
print("MIDI file detected. Skipping transcription. Proceeding directly to rendering.") |
|
|
|
if is_stereo_midi(input_file_path): |
|
print("\nINFO: Stereo pan information (Left/Right) detected in the input MIDI. It will be rendered in stereo.\n") |
|
|
|
midi_path_for_rendering = input_file_path |
|
else: |
|
temp_dir = "output/temp_transcribe" |
|
os.makedirs(temp_dir, exist_ok=True) |
|
|
|
|
|
update_progress(0.1, "Audio file detected, loading...") |
|
print("Audio file detected. Starting pre-processing...") |
|
|
|
try: |
|
|
|
|
|
print("Attempting to load audio with torchaudio...") |
|
audio_tensor, native_sample_rate = torchaudio.load(input_file_path) |
|
print("Torchaudio loading successful.") |
|
except Exception as e: |
|
update_progress(0.15, "Torchaudio failed, trying ffmpeg...") |
|
print(f"Torchaudio failed: {e}. Attempting fallback with ffmpeg...") |
|
try: |
|
|
|
converted_flac_path = os.path.join(temp_dir, f"{timestamped_base_name}_converted.flac") |
|
|
|
( |
|
ffmpeg |
|
.input(input_file_path) |
|
.output(converted_flac_path, acodec='flac') |
|
.overwrite_output() |
|
.run(capture_stdout=True, capture_stderr=True) |
|
) |
|
|
|
audio_tensor, native_sample_rate = torchaudio.load(converted_flac_path) |
|
print(f"FFmpeg fallback successful. Loaded from: {converted_flac_path}") |
|
except Exception as ffmpeg_err: |
|
|
|
stderr = ffmpeg_err.stderr.decode() if hasattr(ffmpeg_err, 'stderr') else str(ffmpeg_err) |
|
print(f"ERROR: Could not load {filename}. Skipping. FFmpeg error: {stderr}") |
|
return None |
|
|
|
|
|
if not params.separate_vocals or demucs_model is None: |
|
if params.separate_vocals and demucs_model is None: |
|
print("ERROR: Demucs model not loaded. Skipping separation.") |
|
|
|
audio_to_transcribe_path = os.path.join(temp_dir, f"{timestamped_base_name}_original.flac") |
|
torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate) |
|
|
|
update_progress(0.2, "Transcribing audio to MIDI...") |
|
midi_path_for_rendering = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params) |
|
else: |
|
|
|
update_progress(0.2, "Separating vocals with Demucs...") |
|
|
|
audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels) |
|
|
|
if torch.cuda.is_available(): |
|
audio_tensor = audio_tensor.cuda() |
|
|
|
print("Separating audio with Demucs... This may take some time.") |
|
|
|
with torch.no_grad(): |
|
all_stems = apply_model( |
|
demucs_model, |
|
audio_tensor[None], |
|
device='cuda' if torch.cuda.is_available() else 'cpu', |
|
progress=True, |
|
)[0] |
|
|
|
|
|
if torch.cuda.is_available(): |
|
torch.cuda.empty_cache() |
|
print("CUDA cache cleared.") |
|
|
|
|
|
|
|
|
|
sources = {} |
|
for i, source_name in enumerate(demucs_model.sources): |
|
sources[source_name] = all_stems[i] |
|
|
|
vocals_tensor = sources['vocals'] |
|
|
|
|
|
|
|
accompaniment_tensor = torch.zeros_like(vocals_tensor) |
|
for source_name, stem_tensor in sources.items(): |
|
if source_name != 'vocals': |
|
accompaniment_tensor += stem_tensor |
|
|
|
|
|
vocals_path = os.path.join(temp_dir, f"{base_name}_vocals.flac") |
|
accompaniment_path = os.path.join(temp_dir, f"{base_name}_accompaniment.flac") |
|
torchaudio.save(vocals_path, vocals_tensor.cpu(), demucs_model.samplerate) |
|
torchaudio.save(accompaniment_path, accompaniment_tensor.cpu(), demucs_model.samplerate) |
|
|
|
|
|
primary_target_path = vocals_path if params.transcription_target == "Transcribe Vocals" else accompaniment_path |
|
other_part_path = accompaniment_path if params.transcription_target == "Transcribe Vocals" else vocals_path |
|
|
|
|
|
other_part_tensor = accompaniment_tensor if params.transcription_target == "Transcribe Vocals" else vocals_tensor |
|
other_part_sr = demucs_model.samplerate |
|
print("Separation complete.") |
|
|
|
|
|
if not params.transcribe_both_stems: |
|
print(f"Transcribing primary target only: {os.path.basename(primary_target_path)}") |
|
update_progress(0.4, f"Transcribing primary target: {os.path.basename(primary_target_path)}") |
|
midi_path_for_rendering = _transcribe_stem(primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir, params) |
|
else: |
|
print("Transcribing BOTH stems and merging the MIDI results.") |
|
|
|
|
|
update_progress(0.4, "Transcribing primary stem...") |
|
midi_path_primary = _transcribe_stem(primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir, params) |
|
|
|
|
|
update_progress(0.5, "Transcribing second stem...") |
|
midi_path_other = _transcribe_stem(other_part_path, os.path.splitext(os.path.basename(other_part_path))[0], temp_dir, params) |
|
|
|
|
|
if midi_path_primary and midi_path_other: |
|
update_progress(0.55, "Merging transcribed MIDIs...") |
|
final_merged_midi_path = os.path.join(temp_dir, f"{base_name}_full_transcription.mid") |
|
print(f"Merging transcribed MIDI files into {os.path.basename(final_merged_midi_path)}") |
|
|
|
|
|
primary_midi = pretty_midi.PrettyMIDI(midi_path_primary) |
|
other_midi = pretty_midi.PrettyMIDI(midi_path_other) |
|
|
|
|
|
for instrument in other_midi.instruments: |
|
instrument.name = f"Other - {instrument.name}" |
|
primary_midi.instruments.append(instrument) |
|
|
|
primary_midi.write(final_merged_midi_path) |
|
midi_path_for_rendering = final_merged_midi_path |
|
elif midi_path_primary: |
|
print("Warning: Transcription of the 'other' part failed. Using primary transcription only.") |
|
midi_path_for_rendering = midi_path_primary |
|
else: |
|
raise gr.Error("Transcription of the primary target failed. Aborting.") |
|
|
|
if not midi_path_for_rendering or not os.path.exists(midi_path_for_rendering): |
|
print(f"ERROR: Transcription failed for {filename}. Skipping.") |
|
return None |
|
|
|
|
|
|
|
update_progress(0.1 if is_midi_input else 0.6, "Applying MIDI transformations...") |
|
|
|
|
|
|
|
if params.s8bit_preset_selector == "Auto-Recommend (Analyze MIDI)": |
|
update_progress(0.15 if is_midi_input else 0.65, "Auto-recommending 8-bit parameters...") |
|
print("Auto-Recommendation is enabled. Analyzing MIDI features...") |
|
try: |
|
midi_to_analyze = pretty_midi.PrettyMIDI(midi_path_for_rendering) |
|
default_preset = S8BIT_PRESETS[FALLBACK_PRESET_NAME] |
|
recommended_params = recommend_8bit_params(midi_to_analyze, default_preset) |
|
|
|
print("Recommended parameters:", recommended_params) |
|
|
|
for key, value in recommended_params.items(): |
|
setattr(params, f"s8bit_{key}", value) |
|
print("Parameters updated with recommendations.") |
|
except Exception as e: |
|
print(f"Could not auto-recommend parameters for {filename}: {e}.") |
|
|
|
update_progress(0.2 if is_midi_input else 0.7, "Rendering MIDI to audio...") |
|
print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}") |
|
|
|
|
|
results_tuple = Render_MIDI(input_midi_path=midi_path_for_rendering, params=params) |
|
|
|
|
|
|
|
if params.separate_vocals and params.remerge_vocals and not params.transcribe_both_stems and other_part_tensor is not None: |
|
update_progress(0.8, "Re-merging rendered audio with vocals...") |
|
print(f"Re-merging the non-transcribed part with newly rendered music...") |
|
|
|
|
|
rendered_srate, rendered_music_int16 = results_tuple[4] |
|
|
|
|
|
rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0 |
|
rendered_music_tensor = torch.from_numpy(rendered_music_float).T |
|
|
|
|
|
if rendered_srate != other_part_sr: |
|
resampler = torchaudio.transforms.Resample(rendered_srate, other_part_sr) |
|
rendered_music_tensor = resampler(rendered_music_tensor) |
|
|
|
|
|
len_music = rendered_music_tensor.shape[1] |
|
len_other = other_part_tensor.shape[1] |
|
|
|
if len_music > len_other: |
|
padding = len_music - len_other |
|
other_part_tensor = torch.nn.functional.pad(other_part_tensor, (0, padding)) |
|
elif len_other > len_music: |
|
padding = len_other - len_music |
|
rendered_music_tensor = torch.nn.functional.pad(rendered_music_tensor, (0, padding)) |
|
|
|
|
|
merged_audio_tensor = rendered_music_tensor + other_part_tensor.cpu() |
|
max_abs = torch.max(torch.abs(merged_audio_tensor)) |
|
if max_abs > 1.0: |
|
merged_audio_tensor /= max_abs |
|
|
|
|
|
merged_audio_int16 = (merged_audio_tensor.T.numpy() * 32767).astype(np.int16) |
|
|
|
|
|
new_audio_tuple = (other_part_sr, merged_audio_int16) |
|
|
|
temp_results_list = list(results_tuple) |
|
temp_results_list[4] = new_audio_tuple |
|
results_tuple = tuple(temp_results_list) |
|
print("Re-merging complete.") |
|
|
|
|
|
update_progress(0.9, "Saving final files...") |
|
final_srate, final_audio_data = results_tuple[4] |
|
final_midi_path_from_render = results_tuple[3] |
|
|
|
|
|
output_audio_dir = "output/final_audio" |
|
output_midi_dir = "output/final_midi" |
|
os.makedirs(output_audio_dir, exist_ok=True) |
|
os.makedirs(output_midi_dir, exist_ok=True) |
|
|
|
final_audio_path = os.path.join(output_audio_dir, f"{timestamped_base_name}_rendered.flac") |
|
|
|
final_midi_path = os.path.join(output_midi_dir, f"{timestamped_base_name}_processed.mid") |
|
|
|
|
|
sf.write(final_audio_path, final_audio_data, final_srate) |
|
|
|
shutil.copy(final_midi_path_from_render, final_midi_path) |
|
|
|
|
|
|
|
file_processing_time = reqtime.time() - file_start_time |
|
print(f"--- Pipeline finished for {filename} in {file_processing_time:.2f} seconds. ---") |
|
print(f"Output Audio: {final_audio_path}\nOutput MIDI: {final_midi_path}") |
|
|
|
|
|
results = { |
|
"final_audio_path": final_audio_path, |
|
"final_midi_path": final_midi_path, |
|
"md5_hash": results_tuple[0], |
|
"title": results_tuple[1], |
|
"summary": results_tuple[2], |
|
"plot": results_tuple[5], |
|
"description": results_tuple[6] |
|
} |
|
update_progress(1.0, "Done!") |
|
|
|
return results, params |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def batch_process_files(input_files, progress=gr.Progress(track_tqdm=True), *args): |
|
""" |
|
Gradio wrapper for batch processing. It iterates through files, calls the core pipeline, |
|
and collects the output file paths. It now provides detailed, nested progress updates. |
|
""" |
|
|
|
if not input_files: |
|
print("No files uploaded for batch processing.") |
|
return [], [] |
|
|
|
|
|
batch_start_time = reqtime.time() |
|
|
|
|
|
batch_timestamp = reqtime.strftime("%Y%m%d-%H%M%S") |
|
|
|
|
|
params = AppParameters(**dict(zip(ALL_PARAM_KEYS, args))) |
|
|
|
output_audio_paths = [] |
|
output_midi_paths = [] |
|
total_files = len(input_files) |
|
|
|
|
|
progress(0, desc="Starting Batch Process...") |
|
for i, file_obj in enumerate(input_files): |
|
|
|
input_path = file_obj.name |
|
filename = os.path.basename(input_path) |
|
|
|
|
|
|
|
|
|
def batch_progress_updater(local_fraction, desc): |
|
|
|
|
|
progress_per_file = 1 / total_files |
|
overall_fraction = (i / total_files) + (local_fraction * progress_per_file) |
|
progress(overall_fraction, desc=f"({i+1}/{total_files}) {filename}: {desc}") |
|
progress(i / total_files, desc=f"Processing {os.path.basename(input_path)} ({i+1}/{total_files})") |
|
|
|
|
|
        # run_single_file_pipeline returns None on failure, so guard before unpacking.
        pipeline_output = run_single_file_pipeline(input_path, batch_timestamp, params, progress=batch_progress_updater)
        results = pipeline_output[0] if pipeline_output else None
|
|
|
if results: |
|
if results.get("final_audio_path"): |
|
output_audio_paths.append(results["final_audio_path"]) |
|
if results.get("final_midi_path"): |
|
output_midi_paths.append(results["final_midi_path"]) |
|
|
|
|
|
progress(1, desc="Batch Process Complete!") |
|
|
|
|
|
total_batch_time = reqtime.time() - batch_start_time |
|
print(f"\nBatch processing complete. {len(output_audio_paths)} of {total_files} files processed successfully.") |
|
print(f"Total batch execution time: {total_batch_time:.2f} seconds.") |
|
|
|
|
|
return output_audio_paths, output_midi_paths |
|
|
|
|
|
|
|
def process_and_render_file(input_file, *args, progress=gr.Progress()): |
|
""" |
|
Gradio wrapper for the single file processing UI. Packs UI values into an AppParameters object. |
|
Calls the core pipeline and formats the output for all UI components. |
|
Main function to handle file processing. It determines the file type and calls the |
|
appropriate functions for transcription and/or rendering based on user selections. |
|
Now includes a progress bar. |
|
""" |
|
if input_file is None: |
|
|
|
return [gr.update(value=None)] * (7 + 14) |
|
|
|
|
|
job_start_time = reqtime.time() |
|
|
|
|
|
single_file_timestamp = reqtime.strftime("%Y%m%d-%H%M%S") |
|
|
|
|
|
|
|
params = AppParameters(input_file=input_file, **dict(zip(ALL_PARAM_KEYS, args))) |
|
|
|
|
|
    pipeline_output = run_single_file_pipeline(input_file, single_file_timestamp, params, progress=progress)

    # run_single_file_pipeline returns None on failure, so check before unpacking.
    if pipeline_output is None:
        raise gr.Error("File processing failed. Check console for details.")

    results, final_params = pipeline_output
|
|
|
|
|
total_job_time = reqtime.time() - job_start_time |
|
print(f"Total single-file job execution time: {total_job_time:.2f} seconds.") |
|
|
|
|
|
|
|
final_ui_updates = [] |
|
|
|
|
|
if params.s8bit_preset_selector == "Auto-Recommend (Analyze MIDI)": |
|
|
|
final_ui_updates.append(gr.update(value="Custom")) |
|
else: |
|
|
|
final_ui_updates.append(gr.update(value=final_params.s8bit_preset_selector)) |
|
|
|
|
|
s8bit_control_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_') and key != 's8bit_preset_selector'] |
|
|
|
|
|
for key in s8bit_control_keys: |
|
final_ui_updates.append(getattr(final_params, key)) |
|
|
|
|
|
main_results = [ |
|
results['md5_hash'], results['title'], results['summary'], |
|
results['final_midi_path'], results['final_audio_path'], |
|
results['plot'], results['description'] |
|
] |
|
|
|
|
|
return main_results + final_ui_updates |
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
initialize_app() |
|
|
|
|
|
global soundfonts_dict, demucs_model |
|
|
|
soundfonts_dict = prepare_soundfonts() |
|
print(f"Found {len(soundfonts_dict)} local SoundFonts.") |
|
|
|
if not soundfonts_dict: |
|
print("\nWARNING: No SoundFonts were found or could be downloaded.") |
|
print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.") |
|
|
|
|
|
print("Loading Demucs model (htdemucs_ft), this may take a moment on first run...") |
|
try: |
|
demucs_model = get_model(name='htdemucs_ft') |
|
if torch.cuda.is_available(): |
|
demucs_model = demucs_model.cuda() |
|
print("Demucs model loaded successfully.") |
|
except Exception as e: |
|
print(f"Warning: Could not load Demucs model. Vocal separation will not be available. Error: {e}") |
|
demucs_model = None |
|
|
|
|
|
RENDER_TYPE_DESCRIPTIONS = { |
|
"Render as-is": "**Mode: Pass-through.** Renders the MIDI file directly without any modifications. Advanced MIDI options will be ignored.", |
|
"Custom render": "**Mode: Activate Advanced Options.** Applies all settings from the 'Advanced MIDI Rendering Options' accordion without making other structural changes to the MIDI.", |
|
"Extract melody": "**Action: Simplify.** Analyzes all tracks and attempts to isolate and render only the main melody line.", |
|
"Flip": "**Action: Experimental.** Inverts the pitch of each note around the song's average pitch.", |
|
"Reverse": "**Action: Experimental.** Reverses the playback order of all notes in the MIDI file.", |
|
"Repair Durations": "**Action: Fix.** Recalculates note durations to ensure they connect smoothly (legato), filling any small gaps.", |
|
"Repair Chords": "**Action: Fix.** Analyzes and aligns notes that occur at similar times to form cleaner, more structured chords.", |
|
"Remove Duplicate Pitches": "**Action: Simplify.** If multiple instruments play the exact same pitch at the same time, it keeps only one.", |
|
"Longest Repeating Phrase": "**Action: Analyze.** Finds the longest, most-repeated musical phrase (often the chorus) and renders only that section.", |
|
"Multi-Instrumental Summary": "**Action: AI Summary.** Creates a short, compressed summary of a complex, multi-instrument song.", |
|
"Solo Piano Summary": "**Action: AI Summary.** First converts the song to a solo piano arrangement, then creates a short, compressed summary.", |
|
"Add Drum Track": "**Action: Enhance.** Analyzes the rhythm of the MIDI and automatically generates a basic drum track to accompany it." |
|
} |
|
|
|
|
|
|
|
FALLBACK_PRESET_NAME = "Generic Chiptune Loop" |
|
|
|
|
|
|
|
|
|
|
|
S8BIT_PRESETS = { |
|
|
|
"Mario (Super Mario Bros / スーパーマリオブラザーズ)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, |
|
'vibrato_rate': 5.0, 'vibrato_depth': 5, |
|
'smooth_notes_level': 0.8, |
|
'continuous_vibrato_level': 0.25, |
|
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Mega Man (Rockman / ロックマン)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, |
|
'vibrato_rate': 6.0, 'vibrato_depth': 8, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Zelda (The Legend of Zelda / ゼルダの伝説)": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, |
|
'vibrato_rate': 4.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, |
|
'vibrato_rate': 6.0, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.3, |
|
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Pokémon (Game Boy Classics / ポケットモンスター)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, |
|
'vibrato_rate': 5.0, 'vibrato_depth': 5, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, |
|
'vibrato_rate': 6.5, 'vibrato_depth': 6, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Final Fantasy (Arpeggio / ファイナルファンタジー)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, |
|
'vibrato_rate': 5.0, 'vibrato_depth': 0, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.2, |
|
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, |
|
'vibrato_rate': 3.5, 'vibrato_depth': 3, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"Commodore 64 (SID Feel)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, |
|
'vibrato_rate': 8.0, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.3, |
|
'bass_boost_level': 0.2, 'noise_level': 0.05, 'distortion_level': 0.1, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Megadrive/Genesis (FM Grit)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, |
|
'vibrato_rate': 0.0, 'vibrato_depth': 0, |
|
'smooth_notes_level': 0.0, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.4, 'noise_level': 0.1, 'distortion_level': 0.2, |
|
'fm_modulation_depth': 0.2, 'fm_modulation_rate': 150 |
|
}, |
|
"PC-98 (Touhou Feel / 東方Project)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.15, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.12, |
|
'vibrato_rate': 7.5, 'vibrato_depth': 7, |
|
'smooth_notes_level': 0.95, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.1, 'fm_modulation_rate': 200 |
|
}, |
|
"Roland SC-88 (GM Vibe)": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, |
|
'vibrato_rate': 0, 'vibrato_depth': 0, |
|
'smooth_notes_level': 1.0, |
|
'continuous_vibrato_level': 0.0, |
|
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"Falcom Ys (Rock Lead / イース)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, |
|
'vibrato_rate': 5.5, 'vibrato_depth': 6, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.8, |
|
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Arcade Brawler Lead (Street Fighter / ストリートファイター)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, |
|
'vibrato_rate': 5.0, 'vibrato_depth': 6, |
|
'smooth_notes_level': 0.8, |
|
'continuous_vibrato_level': 0.7, |
|
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, |
|
'vibrato_rate': 4.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.8, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"Dragon Quest (Orchestral Feel / ドラゴンクエスト)": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6, |
|
'vibrato_rate': 3.0, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, |
|
'vibrato_rate': 2.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 1.0, |
|
'continuous_vibrato_level': 0.95, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Modern JRPG Pad (Persona / ペルソナ)": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, |
|
'vibrato_rate': 2.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 1.0, |
|
'continuous_vibrato_level': 0.95, |
|
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Tactical Brass (Fire Emblem / ファイアーエムブレム)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, |
|
'vibrato_rate': 3.5, 'vibrato_depth': 5, |
|
'smooth_notes_level': 0.95, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Mecha & Tactics Brass (Super Robot Wars / スーパーロボット大戦)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, |
|
'vibrato_rate': 3.5, 'vibrato_depth': 5, |
|
'smooth_notes_level': 0.95, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, |
|
'vibrato_rate': 7.0, 'vibrato_depth': 12, |
|
'smooth_notes_level': 0.1, |
|
'continuous_vibrato_level': 0.0, |
|
'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"8-Bit Vocal Lead": { |
|
|
|
'waveform_type': 'Triangle', |
|
'pulse_width': 0.5, |
|
'envelope_type': 'Sustained (Full Decay)', |
|
'decay_time_s': 0.8, |
|
'vibrato_rate': 5.5, |
|
'vibrato_depth': 4, |
|
'bass_boost_level': 0.1, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.9, |
|
'noise_level': 0.02, |
|
'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.05, |
|
'fm_modulation_rate': 20 |
|
}, |
|
"8-Bit Male Vocal": { |
|
|
|
'waveform_type': 'Triangle', |
|
'pulse_width': 0.5, |
|
'envelope_type': 'Sustained (Full Decay)', |
|
'decay_time_s': 1.0, |
|
'vibrato_rate': 5.0, |
|
'vibrato_depth': 3, |
|
'bass_boost_level': 0.3, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.85, |
|
'noise_level': 0.015, |
|
'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.08, |
|
'fm_modulation_rate': 25 |
|
}, |
|
"8-Bit Female Vocal": { |
|
|
|
'waveform_type': 'Triangle', |
|
'pulse_width': 0.5, |
|
'envelope_type': 'Sustained (Full Decay)', |
|
'decay_time_s': 0.7, |
|
'vibrato_rate': 6.0, |
|
'vibrato_depth': 5, |
|
'bass_boost_level': 0.05, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.92, |
|
'noise_level': 0.025, |
|
'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.04, |
|
'fm_modulation_rate': 30 |
|
}, |
|
"Lo-Fi Vocal": { |
|
|
|
'waveform_type': 'Square', |
|
'pulse_width': 0.48, |
|
'envelope_type': 'Plucky (AD Envelope)', |
|
'decay_time_s': 0.4, |
|
'vibrato_rate': 4.8, |
|
'vibrato_depth': 2, |
|
'bass_boost_level': 0.1, |
|
'smooth_notes_level': 0.65, |
|
'continuous_vibrato_level': 0.6, |
|
'noise_level': 0.05, |
|
'distortion_level': 0.05, |
|
'fm_modulation_depth': 0.02, |
|
'fm_modulation_rate': 20 |
|
}, |
|
|
|
"Sci-Fi Energy Field": { |
|
|
|
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, |
|
'vibrato_rate': 10.0, 'vibrato_depth': 3, |
|
'smooth_notes_level': 0.85, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.1, 'noise_level': 0.1, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.05, 'fm_modulation_rate': 50 |
|
}, |
|
"Industrial Alarm": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, |
|
'vibrato_rate': 15.0, 'vibrato_depth': 8, |
|
'smooth_notes_level': 0.0, |
|
'continuous_vibrato_level': 0.0, |
|
'bass_boost_level': 0.3, 'noise_level': 0.2, 'distortion_level': 0.3, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Laser Charge-Up": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, |
|
'vibrato_rate': 4.0, 'vibrato_depth': 25, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.95, |
|
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
"Unstable Machine Core": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, |
|
'vibrato_rate': 1.0, 'vibrato_depth': 50, |
|
'smooth_notes_level': 0.0, |
|
'continuous_vibrato_level': 0.9, |
|
'bass_boost_level': 0.5, 'noise_level': 0.3, 'distortion_level': 0.4, |
|
'fm_modulation_depth': 0.5, 'fm_modulation_rate': 10 |
|
}, |
|
"Hardcore Gabber Kick": { |
|
|
|
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.1, |
|
'vibrato_rate': 0, 'vibrato_depth': 0, |
|
'smooth_notes_level': 0.0, |
|
'continuous_vibrato_level': 0.0, |
|
'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
|
|
"Generic Chiptune Loop": { |
|
|
|
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, |
|
'vibrato_rate': 5.5, 'vibrato_depth': 4, |
|
'smooth_notes_level': 0.9, |
|
'continuous_vibrato_level': 0.85, |
|
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, |
|
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 |
|
}, |
|
} |
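
# Illustrative sanity check (a sketch, not called anywhere by the app): verifies
# that every entry in S8BIT_PRESETS supplies exactly the synth keys that
# AppParameters declares with the 's8bit_' prefix, so apply_8bit_preset() can
# map them 1:1 onto the UI controls. The name _validate_s8bit_presets is
# introduced here for illustration only.
def _validate_s8bit_presets():
    expected = {
        f.name.removeprefix('s8bit_')
        for f in fields(AppParameters)
        if f.name.startswith('s8bit_') and f.name != 's8bit_preset_selector'
    }
    for name, settings in S8BIT_PRESETS.items():
        missing = expected - settings.keys()
        unknown = settings.keys() - expected
        if missing or unknown:
            print(f"Preset '{name}': missing={sorted(missing)}, unknown={sorted(unknown)}")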
|
|
|
|
|
BASIC_PITCH_PRESETS = { |
|
|
|
"Default (Balanced)": { |
|
'description': "A good all-around starting point for most music types.", |
|
'onset_threshold': 0.5, 'frame_threshold': 0.3, 'minimum_note_length': 128, |
|
'minimum_frequency': 60, 'maximum_frequency': 4000, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': False |
|
}, |
|
"Anime / J-Pop": { |
|
'description': "For tracks with clear melodies and pop/rock arrangements.", |
|
'onset_threshold': 0.5, 'frame_threshold': 0.3, 'minimum_note_length': 150, |
|
'minimum_frequency': 40, 'maximum_frequency': 2500, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
|
|
|
|
"Solo Vocals": { |
|
'description': "Optimized for a single singing voice. Sensitive to nuances.", |
|
'onset_threshold': 0.4, 'frame_threshold': 0.3, 'minimum_note_length': 100, |
|
'minimum_frequency': 80, 'maximum_frequency': 1200, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
"Solo Piano": { |
|
'description': "For solo piano with a wide dynamic and frequency range.", |
|
'onset_threshold': 0.4, 'frame_threshold': 0.3, 'minimum_note_length': 120, |
|
'minimum_frequency': 27, 'maximum_frequency': 4200, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
"Acoustic Guitar": { |
|
'description': "Balanced for picked or strummed acoustic guitar.", |
|
'onset_threshold': 0.5, 'frame_threshold': 0.3, 'minimum_note_length': 90, |
|
'minimum_frequency': 80, 'maximum_frequency': 2500, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': False |
|
}, |
|
"Bass Guitar": { |
|
'description': "Isolates and transcribes only the low frequencies of a bassline.", |
|
'onset_threshold': 0.4, 'frame_threshold': 0.3, 'minimum_note_length': 100, |
|
'minimum_frequency': 30, 'maximum_frequency': 400, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': False |
|
}, |
|
"Percussion / Drums": { |
|
'description': "For drums and rhythmic elements. Catches fast, sharp hits.", |
|
'onset_threshold': 0.7, 'frame_threshold': 0.6, 'minimum_note_length': 30, |
|
'minimum_frequency': 40, 'maximum_frequency': 10000, |
|
'infer_onsets': True, 'melodia_trick': False, 'multiple_bends': False |
|
}, |
|
|
|
|
|
"Rock / Metal": { |
|
'description': "Higher thresholds for distorted guitars, bass, and drums in a dense mix.", |
|
'onset_threshold': 0.6, 'frame_threshold': 0.4, 'minimum_note_length': 100, |
|
'minimum_frequency': 50, 'maximum_frequency': 3000, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
"Jazz (Multi-instrument)": { |
|
'description': "High thresholds to separate notes in complex, improvisational passages.", |
|
'onset_threshold': 0.7, 'frame_threshold': 0.5, 'minimum_note_length': 150, |
|
'minimum_frequency': 55, 'maximum_frequency': 2000, |
|
'infer_onsets': True, 'melodia_trick': False, 'multiple_bends': True |
|
}, |
|
"Classical (Orchestral)": { |
|
'description': "Longer note length to focus on sustained notes and filter out performance noise.", |
|
'onset_threshold': 0.5, 'frame_threshold': 0.4, 'minimum_note_length': 200, |
|
'minimum_frequency': 32, 'maximum_frequency': 4200, |
|
'infer_onsets': True, 'melodia_trick': True, 'multiple_bends': True |
|
}, |
|
"Electronic / Synth": { |
|
'description': "Low thresholds and short note length for sharp, synthetic sounds.", |
|
'onset_threshold': 0.3, 'frame_threshold': 0.2, 'minimum_note_length': 50, |
|
'minimum_frequency': 20, 'maximum_frequency': 8000, |
|
'infer_onsets': True, 'melodia_trick': False, 'multiple_bends': False |
|
} |
|
} |
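
# Illustrative helper (a sketch, not used by the app): shows how one preset row
# maps onto the transcription parameter names, mirroring what
# apply_basic_pitch_preset() does for the UI below. The helper name
# _preset_to_params is introduced here for illustration only.
def _preset_to_params(preset_name: str) -> dict:
    settings = dict(BASIC_PITCH_PRESETS[preset_name])
    settings.pop('description', None)  # UI-only text, not a transcription parameter
    # The preset key 'multiple_bends' corresponds to the 'multiple_pitch_bends' control.
    settings['multiple_pitch_bends'] = settings.pop('multiple_bends')
    return settings

# e.g. _preset_to_params("Solo Piano") -> {'onset_threshold': 0.4, 'frame_threshold': 0.3, ...}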
|
|
|
|
|
|
|
def update_vocal_ui_visibility(separate_vocals): |
|
"""Shows or hides the separation-related UI controls based on selections.""" |
|
is_visible = gr.update(visible=separate_vocals) |
|
return is_visible, is_visible, is_visible |
|
|
|
def update_ui_visibility(transcription_method, soundfont_choice): |
|
""" |
|
Dynamically updates the visibility of UI components based on user selections. |
|
""" |
|
is_general = (transcription_method == "General Purpose") |
|
is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL) |
|
|
|
return { |
|
general_transcription_settings: gr.update(visible=is_general), |
|
synth_8bit_settings: gr.update(visible=is_8bit), |
|
} |
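
# Note: the .change handlers wired up at the bottom of this script use
# equivalent inline lambdas; this helper bundles the same logic into a single
# callback keyed by component.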
|
|
|
|
|
def update_advanced_midi_options_visibility(render_type_choice): |
|
""" |
|
Shows or hides the advanced MIDI rendering options based on the render type. |
|
The options are only visible if the type is NOT 'Render as-is'. |
|
""" |
|
is_visible = (render_type_choice != "Render as-is") |
|
return gr.update(visible=is_visible) |
|
|
|
|
|
def update_render_type_description(render_type_choice): |
|
""" |
|
Returns the description for the selected render type. |
|
""" |
|
return RENDER_TYPE_DESCRIPTIONS.get(render_type_choice, "Select a render type to see its description.") |
|
|
|
|
|
def apply_basic_pitch_preset(preset_name):

"""Returns gr.update objects that fill the General Purpose transcription controls with the values of the selected profile preset."""

if preset_name not in BASIC_PITCH_PRESETS:

# "Custom" (or any unknown name) leaves the current control values untouched.

return {comp: gr.update() for comp in basic_pitch_ui_components}
|
|
|
settings = BASIC_PITCH_PRESETS[preset_name] |
|
|
|
|
|
return { |
|
onset_threshold: gr.update(value=settings['onset_threshold']), |
|
frame_threshold: gr.update(value=settings['frame_threshold']), |
|
minimum_note_length: gr.update(value=settings['minimum_note_length']), |
|
minimum_frequency: gr.update(value=settings['minimum_frequency']), |
|
maximum_frequency: gr.update(value=settings['maximum_frequency']), |
|
infer_onsets: gr.update(value=settings['infer_onsets']), |
|
melodia_trick: gr.update(value=settings['melodia_trick']), |
|
multiple_pitch_bends: gr.update(value=settings['multiple_bends']) |
|
} |
|
|
|
|
|
|
|
def apply_8bit_preset(preset_name): |
|
""" |
|
Takes the name of a preset and returns a dictionary of gr.update objects |
|
to set the values of the 13 8-bit synthesizer control components. |
|
This version is more robust as it directly maps keys to UI components. |
|
""" |
|
|
|
if preset_name in ["Custom", "Auto-Recommend (Analyze MIDI)"] or preset_name not in S8BIT_PRESETS: |
|
|
|
s8bit_control_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_') and key != 's8bit_preset_selector'] |
|
return {ui_component_map[key]: gr.update() for key in s8bit_control_keys} |
|
|
|
|
|
settings = S8BIT_PRESETS[preset_name] |
|
updates = {} |
|
|
|
|
|
for simple_key, value in settings.items(): |
|
|
|
full_key = f"s8bit_{simple_key}" |
|
|
|
|
|
if full_key in ui_component_map: |
|
component = ui_component_map[full_key] |
|
updates[component] = gr.update(value=value) |
|
|
|
return updates |
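
# Example (illustrative): apply_8bit_preset("Mega Man (Rockman / ロックマン)")
# returns {s8bit_waveform_type: gr.update(value='Square'), ...} with one update
# per synth control, while "Custom" returns no-op updates so the user's
# manually entered values are preserved.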
|
|
|
|
|
|
|
ALL_PARAM_KEYS = [field.name for field in fields(AppParameters) if field.name not in ["input_file", "batch_input_files"]] |
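
# Because Gradio passes component values to callbacks positionally, a handler
# can rebuild the dataclass by zipping ALL_PARAM_KEYS with its *args, in the
# same order as all_settings_components below. A minimal sketch (the actual
# signature of process_and_render_file is assumed, not shown in this section):
#
#     def process_and_render_file(input_file, *values):
#         params = AppParameters(input_file=input_file,
#                                **dict(zip(ALL_PARAM_KEYS, values)))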
|
|
|
app = gr.Blocks(theme=gr.themes.Base()) |
|
|
|
with app: |
|
gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Audio-to-MIDI & Advanced Renderer</h1>") |
|
gr.Markdown( |
|
"**Upload a Audio for transcription-then-rendering, or a MIDI for rendering-only.**\n\n" |
|
"This application combines piano audio transcription with a powerful MIDI transformation and rendering toolkit. " |
|
"Based on the work of [asigalov61](https://github.com/asigalov61)." |
|
) |
|
|
|
|
|
with gr.Tabs(): |
|
waveform_options = gr.WaveformOptions(show_recording_waveform=False) |
|
|
|
with gr.TabItem("Single File Processing"): |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
|
|
gr.Markdown("## 1. Upload File") |
|
|
|
|
|
|
|
|
|
input_file = gr.Audio( |
|
label="Input Audio or MIDI File", |
|
type="filepath", |
|
sources=["upload"], waveform_options=waveform_options |
|
) |
|
|
|
submit_btn = gr.Button("Process and Render Single File", variant="primary") |
|
|
|
with gr.Column(scale=2): |
|
|
|
gr.Markdown("### 2. Results") |
|
output_midi_title = gr.Textbox(label="MIDI Title") |
|
output_song_description = gr.Textbox(label="MIDI Description", lines=3) |
|
output_audio = gr.Audio(label="Rendered Audio Output", format="wav", waveform_options=waveform_options) |
|
output_plot = gr.Plot(label="MIDI Score Plot") |
|
with gr.Row(): |
|
output_midi = gr.File(label="Download Processed MIDI File", file_types=[".mid"]) |
|
output_midi_md5 = gr.Textbox(label="Output MIDI MD5 Hash") |
|
output_midi_summary = gr.Textbox(label="MIDI metadata summary", lines=4) |
|
|
|
|
|
|
|
with gr.TabItem("Batch Processing"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
gr.Markdown("### 1. Upload Files") |
|
gr.Markdown("Uses the **global settings** configured above.") |
|
batch_input_files = gr.File( |
|
label="Upload Audio or MIDI Files", |
|
file_count="multiple" |
|
) |
|
|
|
batch_process_btn = gr.Button("Process Batch", variant="primary") |
|
|
|
with gr.Column(): |
|
gr.Markdown("### 2. Download Results") |
|
batch_output_audio_files = gr.File( |
|
label="Download Rendered FLAC Files", |
|
file_count="multiple", |
|
interactive=False |
|
) |
|
batch_output_midi_files = gr.File( |
|
label="Download Processed MIDI Files", |
|
file_count="multiple", |
|
interactive=False |
|
) |
|
|
|
with gr.Accordion("▶️ Configure Global Settings (for both Single File and Batch)", open=True): |
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
gr.Markdown("### Transcription Settings") |
|
|
|
transcription_method = gr.Radio(["General Purpose", "Piano-Specific"], label="Audio Transcription Method", value="General Purpose", |
|
info="Choose 'General Purpose' for most music (vocals, etc.). Choose 'Piano-Specific' only for solo piano recordings.") |
|
|
|
enable_stereo_processing = gr.Checkbox(label="Enable Stereo Transcription", value=False, |
|
info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time.") |
|
|
|
|
|
with gr.Group(): |
|
separate_vocals = gr.Checkbox(label="Separate Vocals", value=False, |
|
info="If checked, separates the audio into vocals and music stems before processing.") |
|
transcription_target = gr.Radio(["Transcribe Music (Accompaniment)", "Transcribe Vocals"], label="Transcription Target", value="Transcribe Music (Accompaniment)", visible=False, |
|
info="Choose which part of the separated audio to transcribe to MIDI.") |
|
remerge_vocals = gr.Checkbox(label="Re-merge Other Part with Rendered Audio", value=False, visible=False, |
|
info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.") |
|
transcribe_both_stems = gr.Checkbox(label="Transcribe Both Parts & Merge MIDI", value=False, visible=False, |
|
info="If checked, transcribes BOTH vocals and music, then merges them into one MIDI file for rendering. Disables audio re-merging.") |
|
|
|
with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings: |
|
|
|
basic_pitch_preset_selector = gr.Dropdown( |
|
choices=["Custom"] + list(BASIC_PITCH_PRESETS.keys()), |
|
value="Default (Balanced)", |
|
label="Transcription Profile Preset", |
|
info="Select a profile to auto-fill settings for different instrument types." |
|
"For reference only; it is recommended to test and adjust for optimal results.") |
|
|
|
onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Onset Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
|
frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.") |
|
minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.") |
|
minimum_frequency = gr.Slider(0, 500, value=60, step=5, label="Minimum Frequency (Hz)", info="Ignores pitches below this frequency.") |
|
maximum_frequency = gr.Slider(501, 10000, value=4000, step=10, label="Maximum Frequency (Hz)", info="Ignores pitches above this frequency.") |
|
infer_onsets = gr.Checkbox(value=True, label="Infer Onsets (Boost Onsets)") |
|
melodia_trick = gr.Checkbox(value=True, label="Melodia Trick (Contour Optimization)") |
|
multiple_pitch_bends = gr.Checkbox(value=False, label="Allow Multiple Pitch Bends") |
|
|
|
with gr.Column(scale=1): |
|
|
|
gr.Markdown("### MIDI Transformation & Rendering Settings") |
|
render_type = gr.Radio(

list(RENDER_TYPE_DESCRIPTIONS.keys()),

label="MIDI Transformation Render Type",

value="Render as-is",

info="Apply transformations to the MIDI before rendering. Select 'Render as-is' for basic rendering or other options for transformations.")
|
|
|
render_type_info = gr.Markdown( |
|
value=RENDER_TYPE_DESCRIPTIONS["Render as-is"], |
|
elem_classes="description-box" |
|
) |
|
|
|
soundfont_bank = gr.Dropdown( |
|
[SYNTH_8_BIT_LABEL] + list(soundfonts_dict.keys()), |
|
label="SoundFont / Synthesizer", |
|
value=list(soundfonts_dict.keys())[0] if soundfonts_dict else SYNTH_8_BIT_LABEL) |
|
render_sample_rate = gr.Radio( |
|
["16000", "32000", "44100"], |
|
label="Audio Sample Rate", |
|
value="44100") |
|
|
|
with gr.Accordion("Advanced MIDI Rendering Options", open=False) as advanced_rendering_options: |
|
render_with_sustains = gr.Checkbox(label="Apply sustain pedal effects (if present)", value=True, |
|
info="Applies sustain pedal effects (CC64) to lengthen notes, creating a more realistic and connected performance, especially for piano.") |
|
render_output_as_solo_piano = gr.Checkbox(label="Convert to Solo Piano (Grand Piano patch)", value=False, |
|
info="Converts all non-drum instruments to a Grand Piano patch, creating a solo piano arrangement of the entire score.") |
|
render_remove_drums = gr.Checkbox(label="Remove drum track", value=False, |
|
info="Removes the entire drum track (typically MIDI Channel 9) from the score. Ideal for creating instrumental or karaoke versions.") |
|
render_transpose_to_C4 = gr.Checkbox(label="Transpose entire score to center around C4", value=False, |
|
info="Transposes the entire score so that its average pitch is centered around C4 (MIDI note 60). Useful for standardizing key.") |
|
render_transpose_value = gr.Slider(-12, 12, value=0, step=1, label="Transpose (semitones)", |
|
info="Shifts the pitch of all non-drum notes up (positive values) or down (negative values) by the specified number of semitones.") |
|
custom_render_patch = gr.Slider(-1, 127, value=-1, step=1, label="Force MIDI Patch (-1 to disable)", |
|
info="Forces all non-drum instruments to use a single specified MIDI patch number. Set to -1 to use the original instruments.") |
|
merge_misaligned_notes = gr.Slider(-1, 127, value=-1, label="Note merge window in ms (-1 to disable)",

info="Aligns the start times of notes that are played almost simultaneously (within the specified ms threshold). Cleans up sloppy timing. -1 to disable.")
|
render_align = gr.Radio( |
|
["Do not align", "Start Times", "Start Times and Durations", "Start Times and Split Durations"], |
|
label="Align notes to musical bars", |
|
value="Do not align", |
|
info="Quantizes the score to a fixed bar length. 'Start Times' aligns onsets. " |
|
"'Durations' trims notes at the bar line. 'Split Durations' splits notes that cross the bar line." |
|
) |
|
|
|
with gr.Column(scale=1): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gr.Markdown("### 8-bit Synthesizer Settings") |
|
with gr.Accordion("8-bit Synthesizer Settings", open=True, visible=False) as synth_8bit_settings: |
|
s8bit_preset_selector = gr.Dropdown( |
|
choices=["Custom", "Auto-Recommend (Analyze MIDI)"] + list(S8BIT_PRESETS.keys()), |
|
value="Custom", |
|
label="Style Preset", |
|
info="Select a preset to auto-fill the settings below. Choose 'Custom' for manual control or 'Auto-Recommend' to analyze the MIDI.\nFor reference and entertainment only. These presets are not guaranteed to be perfectly accurate." |
|
) |
|
s8bit_waveform_type = gr.Dropdown( |
|
['Square', 'Sawtooth', 'Triangle'], |
|
value='Square', |
|
label="Waveform Type", |
|
info="The fundamental timbre of the sound. Square is bright and hollow (classic NES), Sawtooth is aggressive and buzzy, Triangle is soft and flute-like." |
|
) |
|
s8bit_pulse_width = gr.Slider( |
|
0.01, 0.99, value=0.5, step=0.01, |
|
label="Pulse Width (Square Wave Only)", |
|
info="Changes the character of the Square wave. Low values (~0.1) are thin and nasal, while mid values (~0.5) are full and round." |
|
) |
|
s8bit_envelope_type = gr.Dropdown( |
|
['Plucky (AD Envelope)', 'Sustained (Full Decay)'], |
|
value='Plucky (AD Envelope)', |
|
label="Envelope Type", |
|
info="Shapes the volume of each note. 'Plucky' is a short, percussive sound. 'Sustained' holds the note for its full duration." |
|
) |
|
s8bit_decay_time_s = gr.Slider( |
|
0.01, 1.0, value=0.1, step=0.01, |
|
label="Decay Time (s)", |
|
info="For the 'Plucky' envelope, this is the time it takes for a note to fade to silence. Low values are short and staccato; high values are longer and more resonant." |
|
) |
|
s8bit_vibrato_rate = gr.Slider( |
|
0, 20, value=5, |
|
label="Vibrato Rate (Hz)", |
|
info="The SPEED of the pitch wobble. Low values create a slow, gentle waver. High values create a fast, frantic buzz." |
|
) |
|
s8bit_vibrato_depth = gr.Slider( |
|
0, 50, value=0, |
|
label="Vibrato Depth (Hz)", |
|
info="The INTENSITY of the pitch wobble. Low values are subtle or off. High values create a dramatic, siren-like pitch bend." |
|
) |
|
s8bit_bass_boost_level = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="Bass Boost Level", |
|
info="Mixes in a sub-octave (a square wave one octave lower). Low values have no effect; high values add significant weight and power." |
|
) |
|
s8bit_smooth_notes_level = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="Smooth Notes Level", |
|
info="Applies a tiny fade-in/out to reduce clicking. Low values (or 0) give a hard, abrupt attack. High values give a softer, cleaner onset." |
|
) |
|
s8bit_continuous_vibrato_level = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="Continuous Vibrato Level", |
|
info="Controls vibrato continuity across notes. Low values (0) reset vibrato on each note (bouncy). High values (1) create a smooth, connected 'singing' vibrato." |
|
) |
|
|
|
with gr.Accordion("Advanced Synthesis & FX", open=False): |
|
s8bit_noise_level = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="Noise Level", |
|
info="Mixes in white noise with the main waveform. Low values are clean; high values add 'grit', 'air', or a hissing quality, useful for percussion." |
|
) |
|
s8bit_distortion_level = gr.Slider( |
|
0.0, 0.9, value=0.0, step=0.05, |
|
label="Distortion Level", |
|
info="Applies wave-shaping to make the sound harsher. Low values are clean; high values create a crushed, 'fuzzy', and aggressive tone." |
|
) |
|
s8bit_fm_modulation_depth = gr.Slider( |
|
0.0, 1.0, value=0.0, step=0.05, |
|
label="FM Depth", |
|
info="Frequency Modulation intensity. At low values, there is no effect. At high values, it creates complex, metallic, or bell-like tones." |
|
) |
|
s8bit_fm_modulation_rate = gr.Slider( |
|
0.0, 500.0, value=0.0, step=1.0, |
|
label="FM Rate", |
|
info="Frequency Modulation speed. Low values create a slow 'wobble'. High values create fast modulation, resulting in bright, dissonant harmonics." |
|
) |
|
|
|
|
|
# Capture the current namespace so AppParameters field names (e.g. 's8bit_waveform_type') can be looked up as the UI components of the same name.

ui_component_map = locals()
|
|
|
|
|
all_settings_components = [ui_component_map[key] for key in ALL_PARAM_KEYS] |
|
|
|
|
|
|
|
s8bit_ui_keys = [key for key in ALL_PARAM_KEYS if key.startswith('s8bit_')] |
|
s8bit_ui_components = [ui_component_map[key] for key in s8bit_ui_keys] |
|
|
|
|
|
s8bit_control_components = [comp for comp in s8bit_ui_components if comp != s8bit_preset_selector] |
|
|
|
|
|
basic_pitch_keys = ['onset_threshold', 'frame_threshold', 'minimum_note_length', 'minimum_frequency', 'maximum_frequency', |
|
'infer_onsets', 'melodia_trick', 'multiple_pitch_bends'] |
|
basic_pitch_ui_components = [ui_component_map[key] for key in basic_pitch_keys] |
|
|
|
|
|
single_file_inputs = [input_file] + all_settings_components |
|
result_outputs = [output_midi_md5, output_midi_title, output_midi_summary, output_midi, output_audio, output_plot, output_song_description] |
|
|
|
single_file_outputs = result_outputs + s8bit_ui_components |
|
|
|
batch_inputs = [batch_input_files] + all_settings_components |
|
batch_outputs = [batch_output_audio_files, batch_output_midi_files] |
|
|
|
|
|
submit_btn.click( |
|
fn=process_and_render_file, |
|
inputs=single_file_inputs, |
|
outputs=single_file_outputs |
|
) |
|
|
|
batch_process_btn.click( |
|
fn=batch_process_files, |
|
inputs=batch_inputs, |
|
outputs=batch_outputs |
|
) |
|
|
|
|
|
separate_vocals.change( |
|
fn=update_vocal_ui_visibility, |
|
inputs=separate_vocals, |
|
outputs=[transcription_target, remerge_vocals, transcribe_both_stems] |
|
) |
|
|
|
|
|
transcription_method.change( |
|
fn=lambda x: gr.update(visible=(x == "General Purpose")), |
|
inputs=transcription_method, |
|
outputs=general_transcription_settings |
|
) |
|
soundfont_bank.change( |
|
fn=lambda x: gr.update(visible=(x == SYNTH_8_BIT_LABEL)), |
|
inputs=soundfont_bank, |
|
outputs=synth_8bit_settings |
|
) |
|
|
|
|
|
basic_pitch_preset_selector.change( |
|
fn=apply_basic_pitch_preset, |
|
inputs=basic_pitch_preset_selector, |
|
outputs=basic_pitch_ui_components |
|
) |
|
|
|
|
|
s8bit_preset_selector.change( |
|
fn=apply_8bit_preset, |
|
inputs=s8bit_preset_selector, |
|
outputs=s8bit_control_components |
|
) |
|
|
|
|
|
|
|
render_type.change( |
|
fn=update_advanced_midi_options_visibility, |
|
inputs=render_type, |
|
outputs=advanced_rendering_options |
|
).then( |
|
fn=update_render_type_description, |
|
inputs=render_type, |
|
outputs=render_type_info |
|
) |
|
|
|
|
|
app.queue().launch(inbrowser=True, debug=True) |
|
|