avans06 committed
Commit 2923df9 · 1 Parent(s): 22dd15a

feat: Add dual-stem MIDI transcription and `basic-pitch` profiles


**1. Dual-Stem MIDI Transcription:**
- When using vocal separation, a new option allows transcribing **both** the vocal and accompaniment stems independently.
- The two resulting MIDI files are then automatically merged into a single, more complete MIDI file for rendering, which improves transcription quality on complex tracks (see the sketch below).
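
In essence, the MIDI merge added in `process_and_render_file` is an instrument-level append via `pretty_midi`. A minimal sketch of that logic (the function name and file paths here are illustrative, not part of the commit):

```python
import pretty_midi

def merge_transcribed_midis(primary_path, other_path, output_path):
    """Append every instrument from the second MIDI into the first, then save."""
    primary = pretty_midi.PrettyMIDI(primary_path)
    other = pretty_midi.PrettyMIDI(other_path)
    for instrument in other.instruments:
        # Rename so the two stems stay distinguishable in the merged file
        instrument.name = f"Other - {instrument.name}"
        primary.instruments.append(instrument)
    primary.write(output_path)
    return output_path
```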

**2. `basic-pitch` Profile Presets:**
- A "Transcription Profile Preset" dropdown has been added to the UI for the general-purpose transcription method.
- It includes a library of presets optimized for different instruments (Vocals, Piano, Drums) and genres (Rock, Jazz, Classical).
- Selecting a profile automatically configures all `basic-pitch` parameters for better results on specific audio types, as in the sketch below.
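
Roughly, selecting a preset just feeds its stored values into the existing transcription call; a sketch using the names defined in `app.py` (`BASIC_PITCH_PRESETS`, `TranscribeGeneralAudio`), with a placeholder input path:

```python
# Hypothetical usage: transcribe one file with the "Solo Vocals" profile.
preset = BASIC_PITCH_PRESETS["Solo Vocals"]
midi_path = TranscribeGeneralAudio(
    "input.flac",  # placeholder path
    preset['onset_thresh'], preset['frame_thresh'], preset['min_note_len'],
    preset['min_freq'], preset['max_freq'],
    preset['infer_onsets_bool'], preset['melodia_trick_bool'], preset['multiple_bends_bool'],
)
```

In the UI, this mapping is handled by `apply_basic_pitch_preset`, which returns a `gr.update(value=...)` for each of the eight `basic-pitch` controls.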

Files changed (2)
  1. app.py +411 -177
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # =================================================================
2
  #
3
  # Merged and Integrated Script for Audio/MIDI Processing and Rendering (Stereo Enhanced)
4
  #
@@ -39,6 +39,7 @@
39
  #
40
  # =================================================================
41
 
 
42
  import os
43
  import hashlib
44
  import time as reqtime
@@ -48,6 +49,7 @@ import pyloudnorm as pyln
48
  import soundfile as sf
49
 
50
  import torch
 
51
  import gradio as gr
52
 
53
  # --- Imports for Vocal Separation ---
@@ -185,7 +187,7 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s,
185
  total_duration = midi_data.get_end_time()
186
  # Initialize a stereo waveform buffer (2 channels: Left, Right)
187
  waveform = np.zeros((2, int(total_duration * fs) + fs))
188
-
189
  num_instruments = len(midi_data.instruments)
190
 
191
  # Phase tracking: main oscillator phase for each instrument
@@ -320,7 +322,7 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s,
320
  def analyze_midi_velocity(midi_path):
321
  midi = pretty_midi.PrettyMIDI(midi_path)
322
  all_velocities = []
323
-
324
  print(f"Analyzing velocity for MIDI: {midi_path}")
325
  for i, instrument in enumerate(midi.instruments):
326
  velocities = [note.velocity for note in instrument.notes]
@@ -348,13 +350,13 @@ def analyze_midi_velocity(midi_path):
348
  def scale_instrument_velocity(instrument, scale=0.8):
349
  for note in instrument.notes:
350
  note.velocity = max(1, min(127, int(note.velocity * scale)))
351
-
352
 
353
  def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
354
  """
355
  Normalizes the audio data to a target integrated loudness (LUFS).
356
  This provides more consistent perceived volume than peak normalization.
357
-
358
  Args:
359
  audio_data (np.ndarray): The audio signal.
360
  sample_rate (int): The sample rate of the audio.
@@ -400,7 +402,7 @@ def merge_midis(midi_path_left, midi_path_right, output_path):
400
  Merges two MIDI files into a single MIDI file. This robust version iterates
401
  through ALL instruments in both MIDI files, ensuring no data is lost if the
402
  source files are multi-instrumental.
403
-
404
  It applies hard-left panning (Pan=0) to every instrument from the left MIDI
405
  and hard-right panning (Pan=127) to every instrument from the right MIDI.
406
  """
@@ -479,7 +481,7 @@ def TranscribePianoAudio(input_file):
479
  print('=' * 70)
480
  print('STAGE 1: Starting Piano-Specific Transcription')
481
  print('=' * 70)
482
-
483
  # Generate a unique output filename for the MIDI
484
  fn = os.path.basename(input_file)
485
  fn1 = fn.split('.')[0]
@@ -529,7 +531,7 @@ def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len,
529
  print('=' * 70)
530
  print('STAGE 1: Starting General Purpose Transcription')
531
  print('=' * 70)
532
-
533
  fn = os.path.basename(input_file)
534
  fn1 = fn.split('.')[0]
535
  output_dir = os.path.join("output", "transcribed_general_")
@@ -867,7 +869,7 @@ def Render_MIDI(input_midi_path,
867
  def analyze_midi_features(midi_data):
868
  """
869
  Analyzes a PrettyMIDI object to extract musical features for parameter recommendation.
870
-
871
  Args:
872
  midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze.
873
 
@@ -1044,12 +1046,81 @@ def recommend_8bit_params(midi_data, default_preset):
1044
  # === Main Application Logic ===
1045
  # =================================================================================================
1046
 
1047
  def process_and_render_file(input_file,
1048
  # --- Pass the preset selector value ---
1049
  s8bit_preset_selector,
1050
  separate_vocals,
1051
  remerge_vocals,
1052
  transcription_target,
1053
  # --- Transcription params ---
1054
  enable_stereo_processing,
1055
  transcription_method,
@@ -1082,140 +1153,164 @@ def process_and_render_file(input_file,
1082
  # This will store the other part if separation is performed
1083
  other_part_tensor = None
1084
  other_part_sr = None
1085
-
1086
  # --- Step 1: Check file type and transcribe if necessary ---
1087
  if filename.lower().endswith(('.mid', '.midi', '.kar')):
1088
- print("MIDI file detected. Proceeding directly to rendering.")
1089
  midi_path_for_rendering = input_file_path
1090
- else: #if filename.lower().endswith(('.wav', '.mp3'))
1091
- print("Audio file detected. Starting transcription...")
1092
-
1093
  try:
1094
- # Use torchaudio to load directly into a tensor, as demucs needs it.
1095
- # This is more efficient than loading with librosa then converting.
1096
  audio_tensor, native_sample_rate = torchaudio.load(input_file_path)
1097
  except Exception as e:
1098
- raise gr.Error(f"Failed to load audio file: {e}")
1099
 
1100
  # --- Demucs Vocal Separation Logic, now decides which stem to process ---
1101
- if separate_vocals:
1102
  if demucs_model is None:
1103
  raise gr.Error("Demucs model is not loaded. Cannot separate vocals.")
1104
 
1105
  # Convert to a common format (stereo, float32) that demucs expects
1106
  audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
1107
-
1108
  if torch.cuda.is_available():
1109
  audio_tensor = audio_tensor.cuda()
1110
 
1111
  print("Separating audio with Demucs... This may take some time.")
1112
- all_stems = apply_model(demucs_model, audio_tensor[None], device='cuda' if torch.cuda.is_available() else 'cpu', progress=True)[0]
1113
 
1114
- vocals_idx = demucs_model.sources.index('vocals')
1115
- # Sum all stems that are NOT vocals to get the accompaniment
1116
- accompaniment_indices = [i for i, source in enumerate(demucs_model.sources) if source != 'vocals']
1117
 
1118
- vocals_tensor = all_stems[vocals_idx]
1119
- accompaniment_tensor = all_stems[accompaniment_indices].sum(0)
1120
 
1121
- # --- The new core branching logic ---
1122
- if transcription_target == "Transcribe Vocals":
1123
- print("Target: Transcribing VOCALS.")
1124
- tensor_to_process = vocals_tensor
1125
- other_part_tensor = accompaniment_tensor # Save accompaniment for re-merging
1126
- else: # Default to "Transcribe Music (Accompaniment)"
1127
- print("Target: Transcribing MUSIC (ACCOMPANIMENT).")
1128
- tensor_to_process = accompaniment_tensor
1129
- other_part_tensor = vocals_tensor # Save vocals for re-merging
1130
-
1131
  other_part_sr = demucs_model.samplerate
1132
- audio_tensor = tensor_to_process # The audio to be processed is now the chosen stem
1133
- native_sample_rate = demucs_model.samplerate # Update sample rate to match demucs output
1134
  print("Separation complete.")
1135
-
1136
- # --- Prepare audio for transcription (saving to a temp file) ---
1137
- # This part of the logic now works on whichever stem was selected above
1138
- base_name = os.path.splitext(filename)[0]
1139
- temp_dir = "output/temp_transcribe"
1140
- os.makedirs(temp_dir, exist_ok=True)
1141
- suffix = f"_{transcription_target.split(' ')[1].lower()}" if separate_vocals else "_original"
1142
- audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}{suffix}.wav")
1143
-
1144
- torchaudio.save(audio_to_transcribe_path, audio_tensor.cpu(), native_sample_rate)
1145
-
1146
- # Convert tensor to numpy array (channels, samples) for librosa/pyloudnorm compatibility
1147
- # We work with a CPU copy of the tensor.
1148
- audio_data_np = audio_tensor.cpu().numpy()
1149
-
1150
- # === STEREO PROCESSING LOGIC ===
1151
- if enable_stereo_processing:
1152
- if audio_data_np.ndim != 2 or audio_data_np.shape[0] != 2:
1153
- print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
1154
- enable_stereo_processing = False # Disable stereo processing if audio is not stereo
1155
-
1156
- if enable_stereo_processing:
1157
- print("Stereo processing enabled. Splitting, normalizing, and transcribing channels...")
1158
- try:
1159
- left_channel_np = audio_data_np[0]
1160
- right_channel_np = audio_data_np[1]
1161
-
1162
- normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
1163
- normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
1164
-
1165
- temp_left_path = os.path.join(temp_dir, f"{base_name}_left.wav")
1166
- temp_right_path = os.path.join(temp_dir, f"{base_name}_right.wav")
1167
 
1168
- sf.write(temp_left_path, normalized_left, native_sample_rate)
1169
- sf.write(temp_right_path, normalized_right, native_sample_rate)
1170
-
1171
- print(f"Saved left channel to: {temp_left_path}")
1172
- print(f"Saved right channel to: {temp_right_path}")
1173
 
1174
- print("Transcribing left and right channel...")
1175
- if transcription_method == "General Purpose":
1176
- midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1177
- midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1178
- else: # Piano-Specific
1179
- midi_path_left = TranscribePianoAudio(temp_left_path)
1180
- midi_path_right = TranscribePianoAudio(temp_right_path)
1181
 
1182
- if midi_path_left and midi_path_right:
1183
- merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
1184
- midi_path_for_rendering = merge_midis(midi_path_left, midi_path_right, merged_midi_path)
1185
- elif midi_path_left:
1186
- print("Warning: Right channel transcription failed. Using left channel only.")
1187
- midi_path_for_rendering = midi_path_left
1188
- elif midi_path_right:
1189
- print("Warning: Left channel transcription failed. Using right channel only.")
1190
- midi_path_for_rendering = midi_path_right
1191
  else:
1192
- raise gr.Error("Both left and right channel transcriptions failed.")
1193
-
1194
- except Exception as e:
1195
- print(f"An error occurred during stereo processing: {e}")
1196
- raise gr.Error(f"Stereo Processing Failed: {e}")
1197
- else: # Standard mono transcription
1198
- print("Mono processing. Normalizing and transcribing audio...")
1199
- # If the audio is stereo but stereo processing is disabled, convert to mono.
1200
- if audio_data_np.shape[0] == 2:
1201
- mono_signal_np = np.mean(audio_data_np, axis=0)
1202
- else:
1203
- mono_signal_np = audio_data_np[0]
1204
-
1205
- normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
1206
- temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
1207
- sf.write(temp_mono_path, normalized_mono, native_sample_rate)
1208
-
1209
- try:
1210
- if transcription_method == "General Purpose":
1211
- midi_path_for_rendering = TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1212
- else: # Piano-Specific
1213
- midi_path_for_rendering = TranscribePianoAudio(temp_mono_path)
1214
- except Exception as e:
1215
- print(f"An error occurred during transcription: {e}")
1216
- raise gr.Error(f"Transcription Failed: {e}")
1217
-
1218
- # --- Step 2: Render the MIDI file with selected options ---
1219
 
1220
  # --- Auto-Recommendation Logic ---
1221
  # Store the original parameters from the UI sliders into a dictionary.
@@ -1272,7 +1367,8 @@ def process_and_render_file(input_file,
1272
  )
1273
 
1274
  # --- Vocal Re-merging Logic now uses the generic "other_part" ---
1275
- if separate_vocals and remerge_vocals and other_part_tensor is not None:
1276
  print(f"Re-merging the non-transcribed part with newly rendered music...")
1277
 
1278
  rendered_srate, rendered_music_int16 = results[4]
@@ -1330,7 +1426,7 @@ def process_and_render_file(input_file,
1330
  # We send a gr.update() for each UI component.
1331
  for _ in param_order:
1332
  final_ui_updates.append(gr.update())
1333
-
1334
  # The final return is a combination of the result values and the UI update values.
1335
  return list(results) + final_ui_updates
1336
 
@@ -1338,52 +1434,6 @@ def process_and_render_file(input_file,
1338
  # === Gradio UI Setup ===
1339
  # =================================================================================================
1340
 
1341
- def update_ui_visibility(transcription_method, soundfont_choice):
1342
- """
1343
- Dynamically updates the visibility of UI components based on user selections.
1344
- """
1345
- is_general = (transcription_method == "General Purpose")
1346
- is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL)
1347
-
1348
- return {
1349
- general_transcription_settings: gr.update(visible=is_general),
1350
- synth_8bit_settings: gr.update(visible=is_8bit),
1351
- }
1352
-
1353
- # --- Function to apply 8-bit synthesizer presets ---
1354
- # --- This function must be defined before the UI components that use it ---
1355
- def apply_8bit_preset(preset_name):
1356
- """
1357
- Takes the name of a preset and returns a dictionary of gr.update objects
1358
- to set the values of all 13 of the 8-bit synthesizer's UI components.
1359
- """
1360
- # --- Use a list of keys for consistent updates ---
1361
- param_keys = [
1362
- 'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate',
1363
- 'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level',
1364
- 'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate'
1365
- ]
1366
-
1367
- # If the user selects "Custom" or the preset is not found, do not change the values.
1368
- if preset_name == "Custom" or preset_name not in S8BIT_PRESETS:
1369
- # When switching to custom, don't change any values, just return empty updates.
1370
- return {comp: gr.update() for comp in s8bit_ui_components}
1371
-
1372
- # Get the settings dictionary for the chosen preset.
1373
- settings = S8BIT_PRESETS[preset_name]
1374
-
1375
- # Create a dictionary mapping UI components to their new values from the preset.
1376
- update_dict = {}
1377
- for i, key in enumerate(param_keys):
1378
- component = s8bit_ui_components[i]
1379
- value = settings.get(key)
1380
- if value is not None:
1381
- update_dict[component] = gr.update(value=value)
1382
- else:
1383
- update_dict[component] = gr.update()
1384
- return update_dict
1385
-
1386
-
1387
  if __name__ == "__main__":
1388
  # Initialize the app: download model (if needed) and apply patches
1389
  # Set to False if you don't have 'requests' or 'tqdm' installed
@@ -1735,11 +1785,154 @@ if __name__ == "__main__":
1735
  },
1736
  }
1737
 
1738
- # --- Function to control visibility of BOTH new UI elements ---
1739
- def update_vocal_ui_visibility(separate_vocals):
1740
- """Shows or hides the separation-related UI controls."""
1741
  is_visible = gr.update(visible=separate_vocals)
1742
- return is_visible, is_visible # Return two updates
1743
 
1744
  app = gr.Blocks(theme=gr.themes.Base())
1745
 
@@ -1803,8 +1996,25 @@ if __name__ == "__main__":
1803
  info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.",
1804
  visible=False # Initially hidden
1805
  )
1806
 
1807
  with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
1808
  onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Onset Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
1809
  frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
1810
  minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.")
@@ -1967,10 +2177,12 @@ if __name__ == "__main__":
1967
  # all_inputs now includes the preset selector itself
1968
  # Inputs for the main processing function
1969
  all_inputs = [
1970
- input_file, s8bit_preset_selector,
1971
- separate_vocals,
1972
  remerge_vocals,
1973
  transcription_target,
1974
  enable_stereo_processing,
1975
  transcription_method, onset_threshold, frame_threshold, minimum_note_length,
1976
  minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
@@ -1989,6 +2201,13 @@ if __name__ == "__main__":
1989
  output_midi, output_audio, output_plot, output_song_description
1990
  ]
1991
 
1992
  # The list of 8-bit UI components that can be updated
1993
  # This MUST be defined after the components themselves are created in the UI.
1994
  s8bit_ui_components = [
@@ -2007,12 +2226,20 @@ if __name__ == "__main__":
2007
  inputs=all_inputs,
2008
  outputs=all_outputs # Pass the combined list
2009
  )
2010
-
2011
- # --- The change event now controls TWO components ---
2012
  separate_vocals.change(
2013
- fn=update_vocal_ui_visibility,
2014
- inputs=separate_vocals,
2015
- outputs=[transcription_target, remerge_vocals] # Update both components
2016
  )
2017
 
2018
  # --- Listeners for dynamic UI updates ---
@@ -2026,6 +2253,13 @@ if __name__ == "__main__":
2026
  inputs=[transcription_method, soundfont_bank],
2027
  outputs=[general_transcription_settings, synth_8bit_settings]
2028
  )
2029
 
2030
  # This listener now correctly handles only the named presets, ignoring "Auto-Recommend"
2031
  # --- Event listener for the preset selector ---
@@ -2038,7 +2272,7 @@ if __name__ == "__main__":
2038
  inputs=[s8bit_preset_selector],
2039
  outputs=s8bit_ui_components # This now correctly targets the new sliders
2040
  )
2041
-
2042
 
2043
  # Launch the Gradio app
2044
  app.queue().launch(inbrowser=True, debug=True)
 
1
+ # =================================================================
2
  #
3
  # Merged and Integrated Script for Audio/MIDI Processing and Rendering (Stereo Enhanced)
4
  #
 
39
  #
40
  # =================================================================
41
 
42
+ import io
43
  import os
44
  import hashlib
45
  import time as reqtime
 
49
  import soundfile as sf
50
 
51
  import torch
52
+ import ffmpeg
53
  import gradio as gr
54
 
55
  # --- Imports for Vocal Separation ---
 
187
  total_duration = midi_data.get_end_time()
188
  # Initialize a stereo waveform buffer (2 channels: Left, Right)
189
  waveform = np.zeros((2, int(total_duration * fs) + fs))
190
+
191
  num_instruments = len(midi_data.instruments)
192
 
193
  # Phase tracking: main oscillator phase for each instrument
 
322
  def analyze_midi_velocity(midi_path):
323
  midi = pretty_midi.PrettyMIDI(midi_path)
324
  all_velocities = []
325
+
326
  print(f"Analyzing velocity for MIDI: {midi_path}")
327
  for i, instrument in enumerate(midi.instruments):
328
  velocities = [note.velocity for note in instrument.notes]
 
350
  def scale_instrument_velocity(instrument, scale=0.8):
351
  for note in instrument.notes:
352
  note.velocity = max(1, min(127, int(note.velocity * scale)))
353
+
354
 
355
  def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
356
  """
357
  Normalizes the audio data to a target integrated loudness (LUFS).
358
  This provides more consistent perceived volume than peak normalization.
359
+
360
  Args:
361
  audio_data (np.ndarray): The audio signal.
362
  sample_rate (int): The sample rate of the audio.
 
402
  Merges two MIDI files into a single MIDI file. This robust version iterates
403
  through ALL instruments in both MIDI files, ensuring no data is lost if the
404
  source files are multi-instrumental.
405
+
406
  It applies hard-left panning (Pan=0) to every instrument from the left MIDI
407
  and hard-right panning (Pan=127) to every instrument from the right MIDI.
408
  """
 
481
  print('=' * 70)
482
  print('STAGE 1: Starting Piano-Specific Transcription')
483
  print('=' * 70)
484
+
485
  # Generate a unique output filename for the MIDI
486
  fn = os.path.basename(input_file)
487
  fn1 = fn.split('.')[0]
 
531
  print('=' * 70)
532
  print('STAGE 1: Starting General Purpose Transcription')
533
  print('=' * 70)
534
+
535
  fn = os.path.basename(input_file)
536
  fn1 = fn.split('.')[0]
537
  output_dir = os.path.join("output", "transcribed_general_")
 
869
  def analyze_midi_features(midi_data):
870
  """
871
  Analyzes a PrettyMIDI object to extract musical features for parameter recommendation.
872
+
873
  Args:
874
  midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze.
875
 
 
1046
  # === Main Application Logic ===
1047
  # =================================================================================================
1048
 
1049
+
1050
+ # --- Helper function to encapsulate the transcription pipeline for a single audio file ---
1051
+ def _transcribe_stem(audio_path, base_name, temp_dir,
1052
+ # Pass all transcription-related parameters
1053
+ enable_stereo, transcription_method,
1054
+ onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
1055
+ infer_onsets_bool, melodia_trick_bool, multiple_bends_bool):
1056
+ """
1057
+ Takes a single audio file path and runs the full transcription pipeline on it.
1058
+ This includes stereo/mono handling and normalization.
1059
+ Returns the file path of the resulting transcribed MIDI.
1060
+ """
1061
+ print(f"\n--- Transcribing Stem: {os.path.basename(audio_path)} ---")
1062
+
1063
+ # Load the audio stem to process it
1064
+ audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False)
1065
+
1066
+ if enable_stereo and audio_data.ndim == 2 and audio_data.shape[0] == 2:
1067
+ print("Stereo processing enabled for stem.")
1068
+ left_channel_np = audio_data[0]
1069
+ right_channel_np = audio_data[1]
1070
+
1071
+ normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
1072
+ normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
1073
+
1074
+ temp_left_path = os.path.join(temp_dir, f"{base_name}_left.flac")
1075
+ temp_right_path = os.path.join(temp_dir, f"{base_name}_right.flac")
1076
+
1077
+ sf.write(temp_left_path, normalized_left, native_sample_rate)
1078
+ sf.write(temp_right_path, normalized_right, native_sample_rate)
1079
+
1080
+ print(f"Saved left channel to: {temp_left_path}")
1081
+ print(f"Saved right channel to: {temp_right_path}")
1082
+
1083
+ print("Transcribing left and right channel...")
1084
+ if transcription_method == "General Purpose":
1085
+ midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1086
+ midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1087
+ else: # Piano-Specific
1088
+ midi_path_left = TranscribePianoAudio(temp_left_path)
1089
+ midi_path_right = TranscribePianoAudio(temp_right_path)
1090
+
1091
+ if midi_path_left and midi_path_right:
1092
+ merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
1093
+ return merge_midis(midi_path_left, midi_path_right, merged_midi_path)
1094
+ elif midi_path_left:
1095
+ print("Warning: Right channel transcription failed. Using left channel only.")
1096
+ return midi_path_left
1097
+ elif midi_path_right:
1098
+ print("Warning: Left channel transcription failed. Using right channel only.")
1099
+ return midi_path_right
1100
+ else:
1101
+ print(f"Warning: Stereo transcription failed for stem {base_name}.")
1102
+ return None
1103
+ else:
1104
+ print("Mono processing for stem.")
1105
+ mono_signal_np = np.mean(audio_data, axis=0) if audio_data.ndim > 1 else audio_data
1106
+ normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
1107
+ temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.flac")
1108
+ sf.write(temp_mono_path, normalized_mono, native_sample_rate)
1109
+
1110
+ if transcription_method == "General Purpose":
1111
+ return TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1112
+ else:
1113
+ return TranscribePianoAudio(temp_mono_path)
1114
+
1115
+ # --- The main processing function is now significantly refactored ---
1116
  def process_and_render_file(input_file,
1117
  # --- Pass the preset selector value ---
1118
  s8bit_preset_selector,
1119
  separate_vocals,
1120
  remerge_vocals,
1121
  transcription_target,
1122
+ # --- ADDED: New parameter from UI ---
1123
+ transcribe_both_stems,
1124
  # --- Transcription params ---
1125
  enable_stereo_processing,
1126
  transcription_method,
 
1153
  # This will store the other part if separation is performed
1154
  other_part_tensor = None
1155
  other_part_sr = None
1156
+
1157
  # --- Step 1: Check file type and transcribe if necessary ---
1158
  if filename.lower().endswith(('.mid', '.midi', '.kar')):
1159
+ print("MIDI file detected. Cannot perform vocal separation. Proceeding directly to rendering.")
1160
  midi_path_for_rendering = input_file_path
1161
+ else:
1162
+ print("Audio file detected. Starting pre-processing...")
1163
+
1164
+ # --- Robust audio loading with ffmpeg fallback ---
1165
  try:
1166
+ # Try loading directly with torchaudio (efficient for supported formats).
1167
+ # This works for formats like WAV, MP3, FLAC, OGG, etc.
1168
+ print("Attempting to load audio with torchaudio...")
1169
  audio_tensor, native_sample_rate = torchaudio.load(input_file_path)
1170
+ print("Torchaudio loading successful.")
1171
  except Exception as e:
1172
+ print(f"Torchaudio failed: {e}. Attempting fallback with ffmpeg...")
1173
+ try:
1174
+ # Use ffmpeg to decode the audio to FLAC in-memory, then load the bytes.
1175
+ out, err = (
1176
+ ffmpeg
1177
+ .input(input_file_path)
1178
+ .output('pipe:', format='flac')
1179
+ .run(capture_stdout=True, capture_stderr=True)
1180
+ )
1181
+ # Load the FLAC data from the in-memory buffer
1182
+ audio_tensor, native_sample_rate = torchaudio.load(io.BytesIO(out))
1183
+ print("FFmpeg fallback successful.")
1184
+ except Exception as ffmpeg_err:
1185
+ # If both direct loading and ffmpeg fallback fail, raise an error.
1186
+ raise gr.Error(f"Failed to load audio file with both torchaudio and ffmpeg.\n"
1187
+ f"Torchaudio error: {e}\n"
1188
+ f"FFmpeg error: {ffmpeg_err.decode() if isinstance(ffmpeg_err, bytes) else ffmpeg_err}")
1189
 
1190
+ base_name = os.path.splitext(filename)[0]
1191
+ temp_dir = "output/temp_transcribe"
1192
+ os.makedirs(temp_dir, exist_ok=True)
1193
+
1194
  # --- Demucs Vocal Separation Logic, now decides which stem to process ---
1195
+ if not separate_vocals:
1196
+ # --- Standard Workflow: Transcribe the original full audio ---
1197
+ print("Standard workflow: No vocal separation.")
1198
+ audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}_original.flac")
1199
+ torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate)
1200
+ midi_path_for_rendering = _transcribe_stem(
1201
+ audio_to_transcribe_path, f"{base_name}_original", temp_dir,
1202
+ enable_stereo_processing, transcription_method,
1203
+ onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
1204
+ infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
1205
+ )
1206
+ else:
1207
+ # --- Vocal Separation Workflow ---
1208
  if demucs_model is None:
1209
  raise gr.Error("Demucs model is not loaded. Cannot separate vocals.")
1210
 
1211
  # Convert to a common format (stereo, float32) that demucs expects
1212
  audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
1213
+
1214
  if torch.cuda.is_available():
1215
  audio_tensor = audio_tensor.cuda()
1216
 
1217
  print("Separating audio with Demucs... This may take some time.")
1218
+ # --- Wrap the model call in a no_grad() context ---
1219
+ with torch.no_grad():
1220
+ all_stems = apply_model(
1221
+ demucs_model,
1222
+ audio_tensor[None], # The input shape is [batch, channels, samples]
1223
+ device='cuda' if torch.cuda.is_available() else 'cpu',
1224
+ progress=True,
1225
+ )[0] # Remove the batch dimension from the output
1226
+
1227
+ # --- Clear CUDA cache immediately after use ---
1228
+ if torch.cuda.is_available():
1229
+ torch.cuda.empty_cache()
1230
+ print("CUDA cache cleared.")
1231
 
1232
+ # --- Robust stem handling to prevent CUDA errors ---
1233
+ # Instead of complex GPU indexing, we create a dictionary of stems on the CPU.
1234
+ # This is safer and more robust across different hardware.
1235
+ sources = {}
1236
+ for i, source_name in enumerate(demucs_model.sources):
1237
+ sources[source_name] = all_stems[i]
1238
+
1239
+ vocals_tensor = sources['vocals']
1240
 
1241
+ # Sum the other stems to create the accompaniment.
1242
+ # This loop is safer than a single complex indexing operation.
1243
+ accompaniment_tensor = torch.zeros_like(vocals_tensor)
1244
+ for source_name, stem_tensor in sources.items():
1245
+ if source_name != 'vocals':
1246
+ accompaniment_tensor += stem_tensor
1247
+
1248
+ # --- Save both stems to temporary files ---
1249
+ vocals_path = os.path.join(temp_dir, f"{base_name}_vocals.flac")
1250
+ accompaniment_path = os.path.join(temp_dir, f"{base_name}_accompaniment.flac")
1251
+ torchaudio.save(vocals_path, vocals_tensor.cpu(), demucs_model.samplerate)
1252
+ torchaudio.save(accompaniment_path, accompaniment_tensor.cpu(), demucs_model.samplerate)
1253
 
1254
+ # --- Determine which stem is the primary target and which is the "other part" ---
1255
+ primary_target_path = vocals_path if transcription_target == "Transcribe Vocals" else accompaniment_path
1256
+ other_part_path = accompaniment_path if transcription_target == "Transcribe Vocals" else vocals_path
1257
+
1258
+ # Store the audio tensor of the "other part" for potential audio re-merging
1259
+ other_part_tensor = accompaniment_tensor if transcription_target == "Transcribe Vocals" else vocals_tensor
1260
  other_part_sr = demucs_model.samplerate
1261
  print("Separation complete.")
1262
+
1263
+ # --- Main Branching Logic: Transcribe one or both stems ---
1264
+ if not transcribe_both_stems:
1265
+ print(f"Transcribing primary target only: {os.path.basename(primary_target_path)}")
1266
+ midi_path_for_rendering = _transcribe_stem(
1267
+ primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir,
1268
+ enable_stereo_processing, transcription_method,
1269
+ onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
1270
+ infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
1271
+ )
1272
+ else:
1273
+ print("Transcribing BOTH stems and merging the MIDI results.")
1274
 
1275
+ # Transcribe the primary target
1276
+ midi_path_primary = _transcribe_stem(
1277
+ primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir,
1278
+ enable_stereo_processing, transcription_method,
1279
+ onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
1280
+ infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
1281
+ )
1282
 
1283
+ # Transcribe the other part
1284
+ midi_path_other = _transcribe_stem(
1285
+ other_part_path, os.path.splitext(os.path.basename(other_part_path))[0], temp_dir,
1286
+ enable_stereo_processing, transcription_method,
1287
+ onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
1288
+ infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
1289
+ )
1290
 
1291
+ # Merge the two resulting MIDI files
1292
+ if midi_path_primary and midi_path_other:
1293
+ final_merged_midi_path = os.path.join(temp_dir, f"{base_name}_full_transcription.mid")
1294
+ print(f"Merging transcribed MIDI files into {os.path.basename(final_merged_midi_path)}")
1295
+
1296
+ # A more robust MIDI merge is needed here
1297
+ primary_midi = pretty_midi.PrettyMIDI(midi_path_primary)
1298
+ other_midi = pretty_midi.PrettyMIDI(midi_path_other)
1299
+
1300
+ # Add all instruments from the other midi to the primary one
1301
+ for instrument in other_midi.instruments:
1302
+ instrument.name = f"Other - {instrument.name}" # Rename to avoid confusion
1303
+ primary_midi.instruments.append(instrument)
1304
+
1305
+ primary_midi.write(final_merged_midi_path)
1306
+ midi_path_for_rendering = final_merged_midi_path
1307
+ elif midi_path_primary:
1308
+ print("Warning: Transcription of the 'other' part failed. Using primary transcription only.")
1309
+ midi_path_for_rendering = midi_path_primary
1310
  else:
1311
+ raise gr.Error("Transcription of the primary target failed. Aborting.")
1312
+
1313
+ # --- Step 2: Render the FINAL MIDI file with selected options ---
1314
 
1315
  # --- Auto-Recommendation Logic ---
1316
  # Store the original parameters from the UI sliders into a dictionary.
 
1367
  )
1368
 
1369
  # --- Vocal Re-merging Logic now uses the generic "other_part" ---
1370
+ # IMPORTANT: This only runs if we did NOT transcribe both stems.
1371
+ if separate_vocals and remerge_vocals and not transcribe_both_stems and other_part_tensor is not None:
1372
  print(f"Re-merging the non-transcribed part with newly rendered music...")
1373
 
1374
  rendered_srate, rendered_music_int16 = results[4]
 
1426
  # We send a gr.update() for each UI component.
1427
  for _ in param_order:
1428
  final_ui_updates.append(gr.update())
1429
+
1430
  # The final return is a combination of the result values and the UI update values.
1431
  return list(results) + final_ui_updates
1432
 
 
1434
  # === Gradio UI Setup ===
1435
  # =================================================================================================
1436
 
1437
  if __name__ == "__main__":
1438
  # Initialize the app: download model (if needed) and apply patches
1439
  # Set to False if you don't have 'requests' or 'tqdm' installed
 
1785
  },
1786
  }
1787
 
1788
+ # --- Data structure for basic_pitch transcription presets ---
1789
+ BASIC_PITCH_PRESETS = {
1790
+ # --- General & All-Purpose ---
1791
+ "Default (Balanced)": {
1792
+ 'description': "A good all-around starting point for most music types.",
1793
+ 'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 128,
1794
+ 'min_freq': 60, 'max_freq': 4000,
1795
+ 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False
1796
+ },
1797
+ "Anime / J-Pop": {
1798
+ 'description': "For tracks with clear melodies and pop/rock arrangements.",
1799
+ 'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 150,
1800
+ 'min_freq': 40, 'max_freq': 2500,
1801
+ 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
1802
+ },
1803
+
1804
+ # --- Specific Instruments ---
1805
+ "Solo Vocals": {
1806
+ 'description': "Optimized for a single singing voice. Sensitive to nuances.",
1807
+ 'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100,
1808
+ 'min_freq': 80, 'max_freq': 1200,
1809
+ 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
1810
+ },
1811
+ "Solo Piano": {
1812
+ 'description': "For solo piano with a wide dynamic and frequency range.",
1813
+ 'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 120,
1814
+ 'min_freq': 27, 'max_freq': 4200,
1815
+ 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
1816
+ },
1817
+ "Acoustic Guitar": {
1818
+ 'description': "Balanced for picked or strummed acoustic guitar.",
1819
+ 'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 90,
1820
+ 'min_freq': 80, 'max_freq': 2500,
1821
+ 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False
1822
+ },
1823
+ "Bass Guitar": {
1824
+ 'description': "Isolates and transcribes only the low frequencies of a bassline.",
1825
+ 'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100,
1826
+ 'min_freq': 30, 'max_freq': 400,
1827
+ 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False
1828
+ },
1829
+ "Percussion / Drums": {
1830
+ 'description': "For drums and rhythmic elements. Catches fast, sharp hits.",
1831
+ 'onset_thresh': 0.7, 'frame_thresh': 0.6, 'min_note_len': 30,
1832
+ 'min_freq': 40, 'max_freq': 10000,
1833
+ 'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False
1834
+ },
1835
+
1836
+ # --- Complex Genres ---
1837
+ "Rock / Metal": {
1838
+ 'description': "Higher thresholds for distorted guitars, bass, and drums in a dense mix.",
1839
+ 'onset_thresh': 0.6, 'frame_thresh': 0.4, 'min_note_len': 100,
1840
+ 'min_freq': 50, 'max_freq': 3000,
1841
+ 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
1842
+ },
1843
+ "Jazz (Multi-instrument)": {
1844
+ 'description': "High thresholds to separate notes in complex, improvisational passages.",
1845
+ 'onset_thresh': 0.7, 'frame_thresh': 0.5, 'min_note_len': 150,
1846
+ 'min_freq': 55, 'max_freq': 2000,
1847
+ 'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': True
1848
+ },
1849
+ "Classical (Orchestral)": {
1850
+ 'description': "Longer note length to focus on sustained notes and filter out performance noise.",
1851
+ 'onset_thresh': 0.5, 'frame_thresh': 0.4, 'min_note_len': 200,
1852
+ 'min_freq': 32, 'max_freq': 4200,
1853
+ 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
1854
+ },
1855
+ "Electronic / Synth": {
1856
+ 'description': "Low thresholds and short note length for sharp, synthetic sounds.",
1857
+ 'onset_thresh': 0.3, 'frame_thresh': 0.2, 'min_note_len': 50,
1858
+ 'min_freq': 20, 'max_freq': 8000,
1859
+ 'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False
1860
+ }
1861
+ }
1862
+
1863
+
1864
+ # --- UI visibility logic now controls three components ---
1865
+ def update_vocal_ui_visibility(separate_vocals, remerge_audio):
1866
+ """Shows or hides the separation-related UI controls based on selections."""
1867
  is_visible = gr.update(visible=separate_vocals)
1868
+ # The "Transcribe Both" checkbox is only visible if separation AND re-merging are active
1869
+ transcribe_both_visible = gr.update(visible=(separate_vocals and remerge_audio))
1870
+ return is_visible, is_visible, transcribe_both_visible
1871
+
1872
+ def update_ui_visibility(transcription_method, soundfont_choice):
1873
+ """
1874
+ Dynamically updates the visibility of UI components based on user selections.
1875
+ """
1876
+ is_general = (transcription_method == "General Purpose")
1877
+ is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL)
1878
+
1879
+ return {
1880
+ general_transcription_settings: gr.update(visible=is_general),
1881
+ synth_8bit_settings: gr.update(visible=is_8bit),
1882
+ }
1883
+
1884
+ # --- Controller function to apply basic_pitch presets to the UI ---
1885
+ def apply_basic_pitch_preset(preset_name):
1886
+ if preset_name not in BASIC_PITCH_PRESETS:
1887
+ # If "Custom" is selected or name is invalid, don't change anything
1888
+ return {comp: gr.update() for comp in basic_pitch_ui_components}
1889
+
1890
+ settings = BASIC_PITCH_PRESETS[preset_name]
1891
+
1892
+ # Return a dictionary that maps each UI component to its new value
1893
+ return {
1894
+ onset_threshold: gr.update(value=settings['onset_thresh']),
1895
+ frame_threshold: gr.update(value=settings['frame_thresh']),
1896
+ minimum_note_length: gr.update(value=settings['min_note_len']),
1897
+ minimum_frequency: gr.update(value=settings['min_freq']),
1898
+ maximum_frequency: gr.update(value=settings['max_freq']),
1899
+ infer_onsets: gr.update(value=settings['infer_onsets_bool']),
1900
+ melodia_trick: gr.update(value=settings['melodia_trick_bool']),
1901
+ multiple_pitch_bends: gr.update(value=settings['multiple_bends_bool'])
1902
+ }
1903
+
1904
+ # --- Function to apply 8-bit synthesizer presets ---
1905
+ # --- This function must be defined before the UI components that use it ---
1906
+ def apply_8bit_preset(preset_name):
1907
+ """
1908
+ Takes the name of a preset and returns a dictionary of gr.update objects
1909
+ to set the values of all 13 of the 8-bit synthesizer's UI components.
1910
+ """
1911
+ # --- Use a list of keys for consistent updates ---
1912
+ param_keys = [
1913
+ 'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate',
1914
+ 'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level',
1915
+ 'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate'
1916
+ ]
1917
+
1918
+ # If the user selects "Custom" or the preset is not found, do not change the values.
1919
+ if preset_name == "Custom" or preset_name not in S8BIT_PRESETS:
1920
+ # When switching to custom, don't change any values, just return empty updates.
1921
+ return {comp: gr.update() for comp in s8bit_ui_components}
1922
+
1923
+ # Get the settings dictionary for the chosen preset.
1924
+ settings = S8BIT_PRESETS[preset_name]
1925
+
1926
+ # Create a dictionary mapping UI components to their new values from the preset.
1927
+ update_dict = {}
1928
+ for i, key in enumerate(param_keys):
1929
+ component = s8bit_ui_components[i]
1930
+ value = settings.get(key)
1931
+ if value is not None:
1932
+ update_dict[component] = gr.update(value=value)
1933
+ else:
1934
+ update_dict[component] = gr.update()
1935
+ return update_dict
1936
 
1937
  app = gr.Blocks(theme=gr.themes.Base())
1938
 
 
1996
  info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.",
1997
  visible=False # Initially hidden
1998
  )
1999
+ # --- New checkbox for transcribing both stems ---
2000
+ transcribe_both_stems = gr.Checkbox(
2001
+ label="Transcribe Both Parts & Merge MIDI",
2002
+ value=False,
2003
+ info="If checked, transcribes BOTH vocals and music, then merges them into one MIDI file for rendering. Disables audio re-merging.",
2004
+ visible=False # Initially hidden
2005
+ )
2006
 
2007
  with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
2008
+ # --- Preset dropdown for basic_pitch ---
2009
+ basic_pitch_preset_selector = gr.Dropdown(
2010
+ choices=["Custom"] + list(BASIC_PITCH_PRESETS.keys()),
2011
+ value="Default (Balanced)",
2012
+ label="Transcription Profile Preset",
2013
+ info="Select a profile to auto-fill settings for different instrument types."
2014
+ "For reference only; it is recommended to test and adjust for optimal results."
2015
+ )
2016
+
2017
+ # --- The existing basic_pitch components ---
2018
  onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Onset Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
2019
  frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
2020
  minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.")
 
2177
  # all_inputs now includes the preset selector itself
2178
  # Inputs for the main processing function
2179
  all_inputs = [
2180
+ input_file,
2181
+ s8bit_preset_selector,
2182
+ separate_vocals,
2183
  remerge_vocals,
2184
  transcription_target,
2185
+ transcribe_both_stems,
2186
  enable_stereo_processing,
2187
  transcription_method, onset_threshold, frame_threshold, minimum_note_length,
2188
  minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
 
2201
  output_midi, output_audio, output_plot, output_song_description
2202
  ]
2203
 
2204
+ # The list of basic_pitch UI components that can be updated by its preset selector.
2205
+ # This MUST be defined after the components themselves are created in the UI.
2206
+ basic_pitch_ui_components = [
2207
+ onset_threshold, frame_threshold, minimum_note_length, minimum_frequency,
2208
+ maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends
2209
+ ]
2210
+
2211
  # The list of 8-bit UI components that can be updated
2212
  # This MUST be defined after the components themselves are created in the UI.
2213
  s8bit_ui_components = [
 
2226
  inputs=all_inputs,
2227
  outputs=all_outputs # Pass the combined list
2228
  )
2229
+
2230
+ # --- Visibility logic is now more complex ---
2231
+ # A simple lambda function to handle multiple inputs
2232
+ update_visibility_lambda = lambda sep, rem: update_vocal_ui_visibility(sep, rem)
2233
+
2234
  separate_vocals.change(
2235
+ fn=update_visibility_lambda,
2236
+ inputs=[separate_vocals, remerge_vocals],
2237
+ outputs=[transcription_target, remerge_vocals, transcribe_both_stems]
2238
+ )
2239
+ remerge_vocals.change(
2240
+ fn=update_visibility_lambda,
2241
+ inputs=[separate_vocals, remerge_vocals],
2242
+ outputs=[transcription_target, remerge_vocals, transcribe_both_stems]
2243
  )
2244
 
2245
  # --- Listeners for dynamic UI updates ---
 
2253
  inputs=[transcription_method, soundfont_bank],
2254
  outputs=[general_transcription_settings, synth_8bit_settings]
2255
  )
2256
+
2257
+ # --- Event listener for the new basic_pitch preset dropdown ---
2258
+ basic_pitch_preset_selector.change(
2259
+ fn=apply_basic_pitch_preset,
2260
+ inputs=[basic_pitch_preset_selector],
2261
+ outputs=basic_pitch_ui_components
2262
+ )
2263
 
2264
  # This listener now correctly handles only the named presets, ignoring "Auto-Recommend"
2265
  # --- Event listener for the preset selector ---
 
2272
  inputs=[s8bit_preset_selector],
2273
  outputs=s8bit_ui_components # This now correctly targets the new sliders
2274
  )
2275
+
2276
 
2277
  # Launch the Gradio app
2278
  app.queue().launch(inbrowser=True, debug=True)
requirements.txt CHANGED
@@ -19,7 +19,7 @@ psutil
19
  pretty_midi
20
  soundfile
21
  pyloudnorm
22
-
23
  piano_transcription_inference
24
 
25
  basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
 
19
  pretty_midi
20
  soundfile
21
  pyloudnorm
22
+ ffmpeg-python
23
  piano_transcription_inference
24
 
25
  basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
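
The new `ffmpeg-python` dependency backs the audio-loading fallback added in `app.py`. Condensed to its essence, the fallback looks like this sketch (assuming a system `ffmpeg` binary is on the PATH; the real code also surfaces both errors through `gr.Error`):

```python
import io

import ffmpeg
import torchaudio

def load_audio_any(path):
    """Try torchaudio first; fall back to decoding via ffmpeg to in-memory FLAC."""
    try:
        return torchaudio.load(path)
    except Exception:
        # Decode any ffmpeg-readable format to FLAC bytes on stdout
        out, _ = (
            ffmpeg.input(path)
            .output('pipe:', format='flac')
            .run(capture_stdout=True, capture_stderr=True)
        )
        return torchaudio.load(io.BytesIO(out))
```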