feat: Add dual-stem MIDI transcription and `basic-pitch` profiles
**1. Dual-Stem MIDI Transcription:**
- When using vocal separation, a new option allows transcribing **both** the vocal and accompaniment stems independently.
- The two resulting MIDI files are then automatically merged into a single, more complete MIDI file for rendering, which improves transcription quality on complex tracks (see the sketch after this list).
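
For reference, a minimal sketch of the MIDI-merge step, mirroring the new dual-stem logic in `app.py` (the helper name `merge_stem_midis` is illustrative, not part of the app):

```python
import pretty_midi

def merge_stem_midis(midi_path_primary: str, midi_path_other: str, out_path: str) -> str:
    """Combine two per-stem transcriptions into one multi-track MIDI file."""
    primary = pretty_midi.PrettyMIDI(midi_path_primary)
    other = pretty_midi.PrettyMIDI(midi_path_other)
    # Carry every instrument over, renaming so the source stem stays identifiable.
    for instrument in other.instruments:
        instrument.name = f"Other - {instrument.name}"
        primary.instruments.append(instrument)
    primary.write(out_path)
    return out_path
```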
**2. `basic-pitch` Profile Presets:**
- A "Transcription Profile Preset" dropdown has been added to the UI for the general-purpose transcription method.
- Includes a library of presets optimized for different instruments (Vocals, Piano, Drums) and genres (Rock, Jazz, Classical).
- Selecting a profile automatically configures all `basic-pitch` parameters for better results on specific audio types; a minimal sketch of the mechanism follows below.
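
In miniature, a preset is just a dictionary of `basic-pitch` arguments that gets written into the UI controls. The "Solo Vocals" values below are copied from the new `BASIC_PITCH_PRESETS` table in `app.py`; the lookup helper is illustrative:

```python
BASIC_PITCH_PRESETS = {
    "Solo Vocals": {
        'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100,
        'min_freq': 80, 'max_freq': 1200,
        'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True,
    },
    # ... other instrument and genre profiles ...
}

def preset_values(preset_name: str) -> dict:
    # "Custom" (or any unknown name) returns {} so the current settings stay untouched.
    return BASIC_PITCH_PRESETS.get(preset_name, {})
```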
- app.py +411 -177
- requirements.txt +1 -1
app.py
CHANGED
@@ -1,4 +1,4 @@
-# =================================================================
+# =================================================================
 #
 # Merged and Integrated Script for Audio/MIDI Processing and Rendering (Stereo Enhanced)
 #
@@ -39,6 +39,7 @@
 #
 # =================================================================
 
+import io
 import os
 import hashlib
 import time as reqtime
@@ -48,6 +49,7 @@ import pyloudnorm as pyln
 import soundfile as sf
 
 import torch
+import ffmpeg
 import gradio as gr
 
 # --- Imports for Vocal Separation ---
@@ -185,7 +187,7 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s,
     total_duration = midi_data.get_end_time()
     # Initialize a stereo waveform buffer (2 channels: Left, Right)
     waveform = np.zeros((2, int(total_duration * fs) + fs))
-
+
     num_instruments = len(midi_data.instruments)
 
     # Phase tracking: main oscillator phase for each instrument
@@ -320,7 +322,7 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s,
 def analyze_midi_velocity(midi_path):
     midi = pretty_midi.PrettyMIDI(midi_path)
     all_velocities = []
-
+
     print(f"Analyzing velocity for MIDI: {midi_path}")
     for i, instrument in enumerate(midi.instruments):
         velocities = [note.velocity for note in instrument.notes]
@@ -348,13 +350,13 @@ def analyze_midi_velocity(midi_path):
 def scale_instrument_velocity(instrument, scale=0.8):
     for note in instrument.notes:
         note.velocity = max(1, min(127, int(note.velocity * scale)))
-
+
 
 def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
     """
     Normalizes the audio data to a target integrated loudness (LUFS).
     This provides more consistent perceived volume than peak normalization.
-
+
     Args:
         audio_data (np.ndarray): The audio signal.
         sample_rate (int): The sample rate of the audio.
@@ -400,7 +402,7 @@ def merge_midis(midi_path_left, midi_path_right, output_path):
     Merges two MIDI files into a single MIDI file. This robust version iterates
     through ALL instruments in both MIDI files, ensuring no data is lost if the
     source files are multi-instrumental.
-
+
     It applies hard-left panning (Pan=0) to every instrument from the left MIDI
     and hard-right panning (Pan=127) to every instrument from the right MIDI.
     """
@@ -479,7 +481,7 @@ def TranscribePianoAudio(input_file):
     print('=' * 70)
     print('STAGE 1: Starting Piano-Specific Transcription')
     print('=' * 70)
-
+
     # Generate a unique output filename for the MIDI
     fn = os.path.basename(input_file)
     fn1 = fn.split('.')[0]
@@ -529,7 +531,7 @@ def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len,
     print('=' * 70)
     print('STAGE 1: Starting General Purpose Transcription')
     print('=' * 70)
-
+
     fn = os.path.basename(input_file)
     fn1 = fn.split('.')[0]
     output_dir = os.path.join("output", "transcribed_general_")
@@ -867,7 +869,7 @@ def Render_MIDI(input_midi_path,
 def analyze_midi_features(midi_data):
     """
     Analyzes a PrettyMIDI object to extract musical features for parameter recommendation.
-
+
     Args:
         midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze.
 
@@ -1044,12 +1046,81 @@
 # === Main Application Logic ===
 # =================================================================================================
 
+
+# --- Helper function to encapsulate the transcription pipeline for a single audio file ---
+def _transcribe_stem(audio_path, base_name, temp_dir,
+                     # Pass all transcription-related parameters
+                     enable_stereo, transcription_method,
+                     onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                     infer_onsets_bool, melodia_trick_bool, multiple_bends_bool):
+    """
+    Takes a single audio file path and runs the full transcription pipeline on it.
+    This includes stereo/mono handling and normalization.
+    Returns the file path of the resulting transcribed MIDI.
+    """
+    print(f"\n--- Transcribing Stem: {os.path.basename(audio_path)} ---")
+
+    # Load the audio stem to process it
+    audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False)
+
+    if enable_stereo and audio_data.ndim == 2 and audio_data.shape[0] == 2:
+        print("Stereo processing enabled for stem.")
+        left_channel_np = audio_data[0]
+        right_channel_np = audio_data[1]
+
+        normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
+        normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
+
+        temp_left_path = os.path.join(temp_dir, f"{base_name}_left.flac")
+        temp_right_path = os.path.join(temp_dir, f"{base_name}_right.flac")
+
+        sf.write(temp_left_path, normalized_left, native_sample_rate)
+        sf.write(temp_right_path, normalized_right, native_sample_rate)
+
+        print(f"Saved left channel to: {temp_left_path}")
+        print(f"Saved right channel to: {temp_right_path}")
+
+        print("Transcribing left and right channel...")
+        if transcription_method == "General Purpose":
+            midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+            midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+        else:  # Piano-Specific
+            midi_path_left = TranscribePianoAudio(temp_left_path)
+            midi_path_right = TranscribePianoAudio(temp_right_path)
+
+        if midi_path_left and midi_path_right:
+            merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
+            return merge_midis(midi_path_left, midi_path_right, merged_midi_path)
+        elif midi_path_left:
+            print("Warning: Right channel transcription failed. Using left channel only.")
+            return midi_path_left
+        elif midi_path_right:
+            print("Warning: Left channel transcription failed. Using right channel only.")
+            return midi_path_right
+        else:
+            print(f"Warning: Stereo transcription failed for stem {base_name}.")
+            return None
+    else:
+        print("Mono processing for stem.")
+        mono_signal_np = np.mean(audio_data, axis=0) if audio_data.ndim > 1 else audio_data
+        normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
+        temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.flac")
+        sf.write(temp_mono_path, normalized_mono, native_sample_rate)
+
+        if transcription_method == "General Purpose":
+            return TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+        else:
+            return TranscribePianoAudio(temp_mono_path)
+
+# --- The main processing function is now significantly refactored ---
 def process_and_render_file(input_file,
                             # --- Pass the preset selector value ---
                             s8bit_preset_selector,
                             separate_vocals,
                             remerge_vocals,
                             transcription_target,
+                            # --- ADDED: New parameter from UI ---
+                            transcribe_both_stems,
                             # --- Transcription params ---
                             enable_stereo_processing,
                             transcription_method,
@@ -1082,140 +1153,164 @@ def process_and_render_file(input_file,
     # This will store the other part if separation is performed
     other_part_tensor = None
     other_part_sr = None
-
+
     # --- Step 1: Check file type and transcribe if necessary ---
     if filename.lower().endswith(('.mid', '.midi', '.kar')):
-        print("MIDI file detected. Proceeding directly to rendering.")
+        print("MIDI file detected. Cannot perform vocal separation. Proceeding directly to rendering.")
         midi_path_for_rendering = input_file_path
-    else:
-        print("Audio file detected. Starting
-
+    else:
+        print("Audio file detected. Starting pre-processing...")
+
+        # --- Robust audio loading with ffmpeg fallback ---
         try:
-            #
-            # This
+            # Try loading directly with torchaudio (efficient for supported formats).
+            # This works for formats like WAV, MP3, FLAC, OGG, etc.
+            print("Attempting to load audio with torchaudio...")
             audio_tensor, native_sample_rate = torchaudio.load(input_file_path)
+            print("Torchaudio loading successful.")
         except Exception as e:
-
+            print(f"Torchaudio failed: {e}. Attempting fallback with ffmpeg...")
+            try:
+                # Use ffmpeg to convert the audio to WAV in-memory, then load the bytes.
+                out, err = (
+                    ffmpeg
+                    .input(input_file_path)
+                    .output('pipe:', format='flac')
+                    .run(capture_stdout=True, capture_stderr=True)
+                )
+                # Load the WAV data from the in-memory buffer
+                audio_tensor, native_sample_rate = torchaudio.load(io.BytesIO(out))
+                print("FFmpeg fallback successful.")
+            except Exception as ffmpeg_err:
+                # If both direct loading and ffmpeg fallback fail, raise an error.
+                raise gr.Error(f"Failed to load audio file with both torchaudio and ffmpeg.\n"
+                               f"Torchaudio error: {e}\n"
+                               f"FFmpeg error: {ffmpeg_err.decode() if isinstance(ffmpeg_err, bytes) else ffmpeg_err}")
 
+        base_name = os.path.splitext(filename)[0]
+        temp_dir = "output/temp_transcribe"
+        os.makedirs(temp_dir, exist_ok=True)
+
         # --- Demucs Vocal Separation Logic, now decides which stem to process ---
-        if separate_vocals:
+        if not separate_vocals:
+            # --- Standard Workflow: Transcribe the original full audio ---
+            print("Standard workflow: No vocal separation.")
+            audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}_original.flac")
+            torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate)
+            midi_path_for_rendering = _transcribe_stem(
+                audio_to_transcribe_path, f"{base_name}_original", temp_dir,
+                enable_stereo_processing, transcription_method,
+                onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
+            )
+        else:
+            # --- Vocal Separation Workflow ---
             if demucs_model is None:
                 raise gr.Error("Demucs model is not loaded. Cannot separate vocals.")
 
             # Convert to a common format (stereo, float32) that demucs expects
             audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
-
+
             if torch.cuda.is_available():
                 audio_tensor = audio_tensor.cuda()
 
             print("Separating audio with Demucs... This may take some time.")
-
+            # --- Wrap the model call in a no_grad() context ---
+            with torch.no_grad():
+                all_stems = apply_model(
+                    demucs_model,
+                    audio_tensor[None],  # The input shape is [batch, channels, samples]
+                    device='cuda' if torch.cuda.is_available() else 'cpu',
+                    progress=True,
+                )[0]  # Remove the batch dimension from the output
+
+            # --- Clear CUDA cache immediately after use ---
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                print("CUDA cache cleared.")
 
-
-            #
-
+            # --- Robust stem handling to prevent CUDA errors ---
+            # Instead of complex GPU indexing, we create a dictionary of stems on the CPU.
+            # This is safer and more robust across different hardware.
+            sources = {}
+            for i, source_name in enumerate(demucs_model.sources):
+                sources[source_name] = all_stems[i]
+
+            vocals_tensor = sources['vocals']
 
-
-
+            # Sum the other stems to create the accompaniment.
+            # This loop is safer than a single complex indexing operation.
+            accompaniment_tensor = torch.zeros_like(vocals_tensor)
+            for source_name, stem_tensor in sources.items():
+                if source_name != 'vocals':
+                    accompaniment_tensor += stem_tensor
+
+            # --- Save both stems to temporary files ---
+            vocals_path = os.path.join(temp_dir, f"{base_name}_vocals.flac")
+            accompaniment_path = os.path.join(temp_dir, f"{base_name}_accompaniment.flac")
+            torchaudio.save(vocals_path, vocals_tensor.cpu(), demucs_model.samplerate)
+            torchaudio.save(accompaniment_path, accompaniment_tensor.cpu(), demucs_model.samplerate)
 
-            # ---
-            if transcription_target == "Transcribe Vocals"
-
-
-
-
-                print("Target: Transcribing MUSIC (ACCOMPANIMENT).")
-                tensor_to_process = accompaniment_tensor
-                other_part_tensor = vocals_tensor # Save vocals for re-merging
-
+            # --- Determine which stem is the primary target and which is the "other part" ---
+            primary_target_path = vocals_path if transcription_target == "Transcribe Vocals" else accompaniment_path
+            other_part_path = accompaniment_path if transcription_target == "Transcribe Vocals" else vocals_path
+
+            # Store the audio tensor of the "other part" for potential audio re-merging
+            other_part_tensor = accompaniment_tensor if transcription_target == "Transcribe Vocals" else vocals_tensor
             other_part_sr = demucs_model.samplerate
-            audio_tensor = tensor_to_process # The audio to be processed is now the chosen stem
-            native_sample_rate = demucs_model.samplerate # Update sample rate to match demucs output
             print("Separation complete.")
-
-
-
-
-
-
-
-
-
-
-
-
-            # We work with a CPU copy of the tensor.
-            audio_data_np = audio_tensor.cpu().numpy()
-
-            # === STEREO PROCESSING LOGIC ===
-            if enable_stereo_processing:
-                if audio_data_np.ndim != 2 or audio_data_np.shape[0] != 2:
-                    print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
-                    enable_stereo_processing = False # Disable stereo processing if audio is not stereo
-
-            if enable_stereo_processing:
-                print("Stereo processing enabled. Splitting, normalizing, and transcribing channels...")
-                try:
-                    left_channel_np = audio_data_np[0]
-                    right_channel_np = audio_data_np[1]
-
-                    normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
-                    normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
-
-                    temp_left_path = os.path.join(temp_dir, f"{base_name}_left.wav")
-                    temp_right_path = os.path.join(temp_dir, f"{base_name}_right.wav")
+
+            # --- Main Branching Logic: Transcribe one or both stems ---
+            if not transcribe_both_stems:
+                print(f"Transcribing primary target only: {os.path.basename(primary_target_path)}")
+                midi_path_for_rendering = _transcribe_stem(
+                    primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir,
+                    enable_stereo_processing, transcription_method,
+                    onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                    infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
+                )
+            else:
+                print("Transcribing BOTH stems and merging the MIDI results.")
 
-
-
-
-
-
+                # Transcribe the primary target
+                midi_path_primary = _transcribe_stem(
+                    primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir,
+                    enable_stereo_processing, transcription_method,
+                    onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                    infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
+                )
 
-
-
-
-
-
-
-
+                # Transcribe the other part
+                midi_path_other = _transcribe_stem(
+                    other_part_path, os.path.splitext(os.path.basename(other_part_path))[0], temp_dir,
+                    enable_stereo_processing, transcription_method,
+                    onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                    infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
+                )
 
-
-
-
-
-
-
-
-
-
+                # Merge the two resulting MIDI files
+                if midi_path_primary and midi_path_other:
+                    final_merged_midi_path = os.path.join(temp_dir, f"{base_name}_full_transcription.mid")
+                    print(f"Merging transcribed MIDI files into {os.path.basename(final_merged_midi_path)}")
+
+                    # A more robust MIDI merge is needed here
+                    primary_midi = pretty_midi.PrettyMIDI(midi_path_primary)
+                    other_midi = pretty_midi.PrettyMIDI(midi_path_other)
+
+                    # Add all instruments from the other midi to the primary one
+                    for instrument in other_midi.instruments:
+                        instrument.name = f"Other - {instrument.name}"  # Rename to avoid confusion
+                        primary_midi.instruments.append(instrument)
+
+                    primary_midi.write(final_merged_midi_path)
+                    midi_path_for_rendering = final_merged_midi_path
+                elif midi_path_primary:
+                    print("Warning: Transcription of the 'other' part failed. Using primary transcription only.")
+                    midi_path_for_rendering = midi_path_primary
                 else:
-
-
-
-                    print(f"An error occurred during stereo processing: {e}")
-                    raise gr.Error(f"Stereo Processing Failed: {e}")
-            else: # Standard mono transcription
-                print("Mono processing. Normalizing and transcribing audio...")
-                # If the audio is stereo but stereo processing is disabled, convert to mono.
-                if audio_data_np.shape[0] == 2:
-                    mono_signal_np = np.mean(audio_data_np, axis=0)
-                else:
-                    mono_signal_np = audio_data_np[0]
-
-                normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
-                temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
-                sf.write(temp_mono_path, normalized_mono, native_sample_rate)
-
-                try:
-                    if transcription_method == "General Purpose":
-                        midi_path_for_rendering = TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
-                    else: # Piano-Specific
-                        midi_path_for_rendering = TranscribePianoAudio(temp_mono_path)
-                except Exception as e:
-                    print(f"An error occurred during transcription: {e}")
-                    raise gr.Error(f"Transcription Failed: {e}")
-
-        # --- Step 2: Render the MIDI file with selected options ---
+                    raise gr.Error("Transcription of the primary target failed. Aborting.")
+
+        # --- Step 2: Render the FINAL MIDI file with selected options ---
 
     # --- Auto-Recommendation Logic ---
     # Store the original parameters from the UI sliders into a dictionary.
@@ -1272,7 +1367,8 @@ def process_and_render_file(input_file,
     )
 
     # --- Vocal Re-merging Logic now uses the generic "other_part" ---
-
+    # IMPORTANT: This only runs if we did NOT transcribe both stems.
+    if separate_vocals and remerge_vocals and not transcribe_both_stems and other_part_tensor is not None:
         print(f"Re-merging the non-transcribed part with newly rendered music...")
 
         rendered_srate, rendered_music_int16 = results[4]
@@ -1330,7 +1426,7 @@ def process_and_render_file(input_file,
     # We send a gr.update() for each UI component.
     for _ in param_order:
        final_ui_updates.append(gr.update())
-
+
    # The final return is a combination of the result values and the UI update values.
    return list(results) + final_ui_updates
 
@@ -1338,52 +1434,6 @@
 # === Gradio UI Setup ===
 # =================================================================================================
 
-def update_ui_visibility(transcription_method, soundfont_choice):
-    """
-    Dynamically updates the visibility of UI components based on user selections.
-    """
-    is_general = (transcription_method == "General Purpose")
-    is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL)
-
-    return {
-        general_transcription_settings: gr.update(visible=is_general),
-        synth_8bit_settings: gr.update(visible=is_8bit),
-    }
-
-# --- Function to apply 8-bit synthesizer presets ---
-# --- This function must be defined before the UI components that use it ---
-def apply_8bit_preset(preset_name):
-    """
-    Takes the name of a preset and returns a dictionary of gr.update objects
-    to set the values of all 13 of the 8-bit synthesizer's UI components.
-    """
-    # --- Use a list of keys for consistent updates ---
-    param_keys = [
-        'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate',
-        'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level',
-        'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate'
-    ]
-
-    # If the user selects "Custom" or the preset is not found, do not change the values.
-    if preset_name == "Custom" or preset_name not in S8BIT_PRESETS:
-        # When switching to custom, don't change any values, just return empty updates.
-        return {comp: gr.update() for comp in s8bit_ui_components}
-
-    # Get the settings dictionary for the chosen preset.
-    settings = S8BIT_PRESETS[preset_name]
-
-    # Create a dictionary mapping UI components to their new values from the preset.
-    update_dict = {}
-    for i, key in enumerate(param_keys):
-        component = s8bit_ui_components[i]
-        value = settings.get(key)
-        if value is not None:
-            update_dict[component] = gr.update(value=value)
-        else:
-            update_dict[component] = gr.update()
-    return update_dict
-
-
 if __name__ == "__main__":
     # Initialize the app: download model (if needed) and apply patches
     # Set to False if you don't have 'requests' or 'tqdm' installed
@@ -1735,11 +1785,154 @@ if __name__ == "__main__":
     },
     }
 
-    # ---
-
-
+    # --- Data structure for basic_pitch transcription presets ---
+    BASIC_PITCH_PRESETS = {
+        # --- General & All-Purpose ---
+        "Default (Balanced)": {
+            'description': "A good all-around starting point for most music types.",
+            'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 128,
+            'min_freq': 60, 'max_freq': 4000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False
+        },
+        "Anime / J-Pop": {
+            'description': "For tracks with clear melodies and pop/rock arrangements.",
+            'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 150,
+            'min_freq': 40, 'max_freq': 2500,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+
+        # --- Specific Instruments ---
+        "Solo Vocals": {
+            'description': "Optimized for a single singing voice. Sensitive to nuances.",
+            'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100,
+            'min_freq': 80, 'max_freq': 1200,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+        "Solo Piano": {
+            'description': "For solo piano with a wide dynamic and frequency range.",
+            'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 120,
+            'min_freq': 27, 'max_freq': 4200,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+        "Acoustic Guitar": {
+            'description': "Balanced for picked or strummed acoustic guitar.",
+            'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 90,
+            'min_freq': 80, 'max_freq': 2500,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False
+        },
+        "Bass Guitar": {
+            'description': "Isolates and transcribes only the low frequencies of a bassline.",
+            'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100,
+            'min_freq': 30, 'max_freq': 400,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False
+        },
+        "Percussion / Drums": {
+            'description': "For drums and rhythmic elements. Catches fast, sharp hits.",
+            'onset_thresh': 0.7, 'frame_thresh': 0.6, 'min_note_len': 30,
+            'min_freq': 40, 'max_freq': 10000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False
+        },
+
+        # --- Complex Genres ---
+        "Rock / Metal": {
+            'description': "Higher thresholds for distorted guitars, bass, and drums in a dense mix.",
+            'onset_thresh': 0.6, 'frame_thresh': 0.4, 'min_note_len': 100,
+            'min_freq': 50, 'max_freq': 3000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+        "Jazz (Multi-instrument)": {
+            'description': "High thresholds to separate notes in complex, improvisational passages.",
+            'onset_thresh': 0.7, 'frame_thresh': 0.5, 'min_note_len': 150,
+            'min_freq': 55, 'max_freq': 2000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': True
+        },
+        "Classical (Orchestral)": {
+            'description': "Longer note length to focus on sustained notes and filter out performance noise.",
+            'onset_thresh': 0.5, 'frame_thresh': 0.4, 'min_note_len': 200,
+            'min_freq': 32, 'max_freq': 4200,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+        "Electronic / Synth": {
+            'description': "Low thresholds and short note length for sharp, synthetic sounds.",
+            'onset_thresh': 0.3, 'frame_thresh': 0.2, 'min_note_len': 50,
+            'min_freq': 20, 'max_freq': 8000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False
+        }
+    }
+
+
+    # --- UI visibility logic now controls three components ---
+    def update_vocal_ui_visibility(separate_vocals, remerge_audio):
+        """Shows or hides the separation-related UI controls based on selections."""
         is_visible = gr.update(visible=separate_vocals)
-
+        # The "Transcribe Both" checkbox is only visible if separation AND re-merging are active
+        transcribe_both_visible = gr.update(visible=(separate_vocals and remerge_audio))
+        return is_visible, is_visible, transcribe_both_visible
+
+    def update_ui_visibility(transcription_method, soundfont_choice):
+        """
+        Dynamically updates the visibility of UI components based on user selections.
+        """
+        is_general = (transcription_method == "General Purpose")
+        is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL)
+
+        return {
+            general_transcription_settings: gr.update(visible=is_general),
+            synth_8bit_settings: gr.update(visible=is_8bit),
+        }
+
+    # --- Controller function to apply basic_pitch presets to the UI ---
+    def apply_basic_pitch_preset(preset_name):
+        if preset_name not in BASIC_PITCH_PRESETS:
+            # If "Custom" is selected or name is invalid, don't change anything
+            return {comp: gr.update() for comp in basic_pitch_ui_components}
+
+        settings = BASIC_PITCH_PRESETS[preset_name]
+
+        # Return a dictionary that maps each UI component to its new value
+        return {
+            onset_threshold: gr.update(value=settings['onset_thresh']),
+            frame_threshold: gr.update(value=settings['frame_thresh']),
+            minimum_note_length: gr.update(value=settings['min_note_len']),
+            minimum_frequency: gr.update(value=settings['min_freq']),
+            maximum_frequency: gr.update(value=settings['max_freq']),
+            infer_onsets: gr.update(value=settings['infer_onsets_bool']),
+            melodia_trick: gr.update(value=settings['melodia_trick_bool']),
+            multiple_pitch_bends: gr.update(value=settings['multiple_bends_bool'])
+        }
+
+    # --- Function to apply 8-bit synthesizer presets ---
+    # --- This function must be defined before the UI components that use it ---
+    def apply_8bit_preset(preset_name):
+        """
+        Takes the name of a preset and returns a dictionary of gr.update objects
+        to set the values of all 13 of the 8-bit synthesizer's UI components.
+        """
+        # --- Use a list of keys for consistent updates ---
+        param_keys = [
+            'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate',
+            'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level',
+            'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate'
+        ]
+
+        # If the user selects "Custom" or the preset is not found, do not change the values.
+        if preset_name == "Custom" or preset_name not in S8BIT_PRESETS:
+            # When switching to custom, don't change any values, just return empty updates.
+            return {comp: gr.update() for comp in s8bit_ui_components}
+
+        # Get the settings dictionary for the chosen preset.
+        settings = S8BIT_PRESETS[preset_name]
+
+        # Create a dictionary mapping UI components to their new values from the preset.
+        update_dict = {}
+        for i, key in enumerate(param_keys):
+            component = s8bit_ui_components[i]
+            value = settings.get(key)
+            if value is not None:
+                update_dict[component] = gr.update(value=value)
+            else:
+                update_dict[component] = gr.update()
+        return update_dict
 
     app = gr.Blocks(theme=gr.themes.Base())
 
@@ -1803,8 +1996,25 @@ if __name__ == "__main__":
                 info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.",
                 visible=False # Initially hidden
             )
+            # --- New checkbox for transcribing both stems ---
+            transcribe_both_stems = gr.Checkbox(
+                label="Transcribe Both Parts & Merge MIDI",
+                value=False,
+                info="If checked, transcribes BOTH vocals and music, then merges them into one MIDI file for rendering. Disables audio re-merging.",
+                visible=False # Initially hidden
+            )
 
             with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
+                # --- Preset dropdown for basic_pitch ---
+                basic_pitch_preset_selector = gr.Dropdown(
+                    choices=["Custom"] + list(BASIC_PITCH_PRESETS.keys()),
+                    value="Default (Balanced)",
+                    label="Transcription Profile Preset",
+                    info="Select a profile to auto-fill settings for different instrument types. "
+                         "For reference only; it is recommended to test and adjust for optimal results."
+                )
+
+                # --- The existing basic_pitch components ---
                 onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
                 frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
                 minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.")
@@ -1967,10 +2177,12 @@ if __name__ == "__main__":
     # all_inputs now includes the preset selector itself
     # Inputs for the main processing function
     all_inputs = [
-        input_file,
-
+        input_file,
+        s8bit_preset_selector,
+        separate_vocals,
         remerge_vocals,
         transcription_target,
+        transcribe_both_stems,
         enable_stereo_processing,
         transcription_method, onset_threshold, frame_threshold, minimum_note_length,
         minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
@@ -1989,6 +2201,13 @@ if __name__ == "__main__":
         output_midi, output_audio, output_plot, output_song_description
     ]
 
+    # The list of basic_pitch UI components that can be updated by its preset selector.
+    # This MUST be defined after the components themselves are created in the UI.
+    basic_pitch_ui_components = [
+        onset_threshold, frame_threshold, minimum_note_length, minimum_frequency,
+        maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends
+    ]
+
     # The list of 8-bit UI components that can be updated
     # This MUST be defined after the components themselves are created in the UI.
     s8bit_ui_components = [
@@ -2007,12 +2226,20 @@ if __name__ == "__main__":
         inputs=all_inputs,
         outputs=all_outputs # Pass the combined list
     )
-
-    # ---
+
+    # --- Visibility logic is now more complex ---
+    # A simple lambda function to handle multiple inputs
+    update_visibility_lambda = lambda sep, rem: update_vocal_ui_visibility(sep, rem)
+
     separate_vocals.change(
-        fn=
-        inputs=separate_vocals,
-        outputs=[transcription_target, remerge_vocals]
+        fn=update_visibility_lambda,
+        inputs=[separate_vocals, remerge_vocals],
+        outputs=[transcription_target, remerge_vocals, transcribe_both_stems]
+    )
+    remerge_vocals.change(
+        fn=update_visibility_lambda,
+        inputs=[separate_vocals, remerge_vocals],
+        outputs=[transcription_target, remerge_vocals, transcribe_both_stems]
    )
 
     # --- Listeners for dynamic UI updates ---
@@ -2026,6 +2253,13 @@ if __name__ == "__main__":
         inputs=[transcription_method, soundfont_bank],
         outputs=[general_transcription_settings, synth_8bit_settings]
     )
+
+    # --- Event listener for the new basic_pitch preset dropdown ---
+    basic_pitch_preset_selector.change(
+        fn=apply_basic_pitch_preset,
+        inputs=[basic_pitch_preset_selector],
+        outputs=basic_pitch_ui_components
+    )
 
     # This listener now correctly handles only the named presets, ignoring "Auto-Recommend"
     # --- Event listener for the preset selector ---
@@ -2038,7 +2272,7 @@ if __name__ == "__main__":
         inputs=[s8bit_preset_selector],
         outputs=s8bit_ui_components # This now correctly targets the new sliders
    )
-
+
 
    # Launch the Gradio app
    app.queue().launch(inbrowser=True, debug=True)
requirements.txt
CHANGED
@@ -19,7 +19,7 @@ psutil
 pretty_midi
 soundfile
 pyloudnorm
-
+ffmpeg-python
 piano_transcription_inference
 
 basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
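
The new `ffmpeg-python` dependency backs the audio-loading fallback added in `app.py`: when `torchaudio.load` cannot read a container, the file is decoded to FLAC in memory and the bytes are handed back to torchaudio. A standalone sketch of that pattern (the function name is illustrative):

```python
import io

import ffmpeg
import torchaudio

def load_audio_with_fallback(path: str):
    """Return (waveform_tensor, sample_rate), decoding via ffmpeg if needed."""
    try:
        return torchaudio.load(path)  # fast path for natively supported formats
    except Exception:
        out, _err = (
            ffmpeg
            .input(path)
            .output('pipe:', format='flac')  # decode to FLAC entirely in memory
            .run(capture_stdout=True, capture_stderr=True)
        )
        return torchaudio.load(io.BytesIO(out))
```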