avans06 committed
Commit 2a4c238 · 1 Parent(s): 5700ed3

fix(metadata): Correctly log auto-analyzed transcription parameters


Resolves an issue where parameters generated by the "Auto-Analyze Audio" mode were not being saved in the output file's metadata.
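For context, once the fix is in place the embedded log can be checked by reading the tag back from an output file. A minimal read-back sketch, assuming the JSON is stored under a Vorbis comment field named "comment" (the exact tag name is not visible in this diff):

```python
import json
from mutagen.flac import FLAC  # the FLAC class in the diff appears to be mutagen's

audio = FLAC("output.flac")                 # illustrative path
metadata = json.loads(audio["comment"][0])  # FLAC tag values are lists of strings
print(metadata["transcription_log"])        # per-stem parameters, or "Not Performed"
```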

Files changed (1)
  1. app.py +64 -15
app.py CHANGED
@@ -294,14 +294,45 @@ def analyze_audio_for_adaptive_params(audio_data: np.ndarray, sample_rate: int):
     return params
 
 
-def format_params_for_metadata(params: AppParameters) -> str:
+def format_params_for_metadata(params: AppParameters, transcription_log: dict = None) -> str:
     """
     Formats the AppParameters object into a human-readable string
     suitable for embedding as metadata in an audio file.
     """
     import json
-    # Convert the dataclass to a dictionary
-    params_dict = params.__dict__
+    # Start with a clean dictionary of the main parameters
+    params_dict = copy.copy(params.__dict__)
+
+    # Create a structured dictionary for the final metadata
+    structured_metadata = {
+        "main_settings": {},
+        "transcription_log": transcription_log if transcription_log else "Not Performed",
+        "synthesis_settings": {}
+    }
+
+    # Separate parameters into logical groups
+    transcription_keys = [
+        'transcription_method', 'basic_pitch_preset_selector', 'onset_threshold',
+        'frame_threshold', 'minimum_note_length', 'minimum_frequency', 'maximum_frequency',
+        'infer_onsets', 'melodia_trick', 'multiple_pitch_bends'
+    ]
+
+    synthesis_keys = [key for key in params_dict.keys() if key.startswith('s8bit_')]
+
+    # Populate the structured dictionary
+    for key, value in params_dict.items():
+        if key not in transcription_keys and key not in synthesis_keys:
+            structured_metadata["main_settings"][key] = value
+
+    for key in synthesis_keys:
+        structured_metadata["synthesis_settings"][key] = params_dict[key]
+
+    # If transcription log is empty, we still want to record the UI settings for transcription
+    if not transcription_log:
+        structured_metadata["transcription_log"] = {
+            "ui_settings": {key: params_dict[key] for key in transcription_keys}
+        }
+
     # Use json.dumps for clean, well-formatted, multi-line string representation
     # indent=2 makes it look nice when read back
-    return json.dumps(params_dict, indent=2)
+    return json.dumps(structured_metadata, indent=2)
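To make the new grouping concrete, here is a self-contained sketch of the same three-way split; the FakeParams fields are invented stand-ins for the real AppParameters dataclass:

```python
import json
from dataclasses import dataclass

@dataclass
class FakeParams:                       # stand-in for AppParameters
    transcription_method: str = "General Purpose"
    onset_threshold: float = 0.5
    s8bit_waveform: str = "square"      # synthesis keys share the "s8bit_" prefix
    output_format: str = "flac"         # everything else lands in main_settings

params_dict = dict(FakeParams().__dict__)
transcription_keys = ['transcription_method', 'onset_threshold']
synthesis_keys = [k for k in params_dict if k.startswith('s8bit_')]

structured = {
    "main_settings": {k: v for k, v in params_dict.items()
                      if k not in transcription_keys and k not in synthesis_keys},
    "transcription_log": {"ui_settings": {k: params_dict[k] for k in transcription_keys}},
    "synthesis_settings": {k: params_dict[k] for k in synthesis_keys},
}
print(json.dumps(structured, indent=2))  # mirrors the metadata layout built above
```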
@@ -1310,7 +1341,7 @@ def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
 # =================================================================================================
 # === MIDI Merging Function ===
 # =================================================================================================
-def merge_midis(midi_path_left, midi_path_right, output_path):
+def merge_midis(midi_path_left: str, midi_path_right: str, output_path: str):
     """
     Merges two MIDI files into a single MIDI file. This robust version iterates
     through ALL instruments in both MIDI files, ensuring no data is lost if the
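For illustration, the "iterate through ALL instruments" strategy the docstring describes could look like the following pretty_midi sketch; this is an assumed reconstruction, not the app's actual merge_midis body:

```python
import pretty_midi

def merge_midis_sketch(midi_path_left: str, midi_path_right: str, output_path: str) -> str:
    merged = pretty_midi.PrettyMIDI()
    for path, channel_tag in ((midi_path_left, "L"), (midi_path_right, "R")):
        source = pretty_midi.PrettyMIDI(path)
        for inst in source.instruments:   # copy every instrument, not just the first
            inst.name = f"{inst.name or 'track'} ({channel_tag})"
            merged.instruments.append(inst)
    merged.write(output_path)
    return output_path
```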
@@ -2041,7 +2072,10 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
     """
     Takes a single audio file path and runs the full transcription pipeline on it.
     This includes stereo/mono handling and normalization.
-    Returns the file path of the resulting transcribed MIDI.
+    Returns:
+        A tuple containing:
+        - The file path of the resulting transcribed MIDI.
+        - The dictionary of the final basic_pitch parameters that were actually used.
     """
     print(f"\n--- Transcribing Stem: {os.path.basename(audio_path)} ---")
 
@@ -2094,16 +2128,16 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
 
         if midi_path_left and midi_path_right:
             merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
-            return merge_midis(midi_path_left, midi_path_right, merged_midi_path)
+            return merge_midis(midi_path_left, midi_path_right, merged_midi_path), final_bp_params
         elif midi_path_left:
             print("Warning: Right channel transcription failed. Using left channel only.")
-            return midi_path_left
+            return midi_path_left, final_bp_params
         elif midi_path_right:
             print("Warning: Left channel transcription failed. Using right channel only.")
-            return midi_path_right
+            return midi_path_right, final_bp_params
         else:
             print(f"Warning: Stereo transcription failed for stem {base_name}.")
-            return None
+            return None, {}
     else:
         print("Mono processing for stem.")
         mono_signal_np = np.mean(audio_data, axis=0) if audio_data.ndim > 1 else audio_data
@@ -2112,9 +2146,10 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
         sf.write(temp_mono_path, normalized_mono, native_sample_rate)
 
         if params.transcription_method == "General Purpose":
-            return TranscribeGeneralAudio(temp_mono_path, **final_bp_params)
+            return TranscribeGeneralAudio(temp_mono_path, **final_bp_params), final_bp_params
         else:
-            return TranscribePianoAudio(temp_mono_path)
+            # For piano, there are no bp_params, so we return an empty dict
+            return TranscribePianoAudio(temp_mono_path), {}
 
 
 # --- The core processing engine for a single file ---
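With the hunks above, every return path in _transcribe_stem now yields a 2-tuple, so call sites unpack both values; a usage sketch (variable names illustrative):

```python
midi_path, used_bp_params = _transcribe_stem(stem_path, stem_base_name, temp_dir, params)
if not midi_path:
    # Failure paths return (None, {}), so there is nothing to log either.
    print(f"Transcription failed for {stem_base_name}")
```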
@@ -2145,6 +2180,8 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
     # --- Use the provided timestamp for unique filenames ---
     timestamped_base_name = f"{base_name}_{timestamp}"
 
+    # --- Dictionary to log parameters for each transcribed stem ---
+    transcription_params_log = {}
 
     # --- Step 1: Check file type and transcribe if necessary ---
     if is_midi_input:
@@ -2196,7 +2233,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
     # --- Demucs Vocal Separation Logic ---
     # This block now handles multi-stem separation, transcription, and merging logic.
     separated_stems = {} # This will store the audio tensors for merging
-
+
     if params.separate_vocals and demucs_model is not None:
         # --- Vocal Separation Workflow ---
         update_progress(0.2, "Separating audio with Demucs...")
@@ -2255,9 +2292,14 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
             update_progress(0.3 + (0.3 * (i / stem_count)), f"Transcribing stem: {name}...")
             stem_path = os.path.join(temp_dir, f"{timestamped_base_name}_{name}.flac")
             torchaudio.save(stem_path, tensor.cpu(), demucs_model.samplerate)
-            midi_path = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params)
+            midi_path, used_bp_params = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params)
             if midi_path:
                 transcribed_midi_paths.append((name, midi_path))
+                # --- Log the used parameters for this specific stem ---
+                if used_bp_params:
+                    # Also log which preset was active for this stem
+                    used_bp_params['preset_selector_mode'] = params.basic_pitch_preset_selector
+                    transcription_params_log[name] = used_bp_params
 
         # --- Merge Transcribed MIDIs ---
         if not transcribed_midi_paths:
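After this loop, transcription_params_log maps each stem name to the basic_pitch parameters that were actually applied. An illustrative shape, with invented values and typical Demucs stem names:

```python
# Illustrative only; real values come from the auto-analysis or the selected preset.
transcription_params_log = {
    "vocals":    {"onset_threshold": 0.6, "frame_threshold": 0.3,
                  "preset_selector_mode": "Auto-Analyze Audio"},
    "no_vocals": {"onset_threshold": 0.5, "frame_threshold": 0.3,
                  "preset_selector_mode": "Auto-Analyze Audio"},
}
```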
@@ -2285,7 +2327,14 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
         torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate)
 
         update_progress(0.2, "Transcribing audio to MIDI...")
-        midi_path_for_rendering = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params)
+        midi_path_for_rendering, used_bp_params = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params)
+
+        # --- Populate the log in this workflow as well ---
+        if used_bp_params:
+            used_bp_params['preset_selector_mode'] = params.basic_pitch_preset_selector
+            # Use a standard key like "full_mix" for the log
+            transcription_params_log["full_mix"] = used_bp_params
+            print(" - Logged transcription parameters for the full mix.")
 
         if not midi_path_for_rendering or not os.path.exists(midi_path_for_rendering):
             print(f"ERROR: Transcription failed for {filename}. Skipping.")
@@ -2393,7 +2442,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
     # --- Save audio with embedded parameter metadata ---
     try:
         # Generate the metadata string from the final parameters used for the render.
-        metadata_string = format_params_for_metadata(params)
+        metadata_string = format_params_for_metadata(params, transcription_params_log)
 
         sf.write(final_audio_path, final_audio_data, final_srate)
         audio = FLAC(final_audio_path)
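The tag write itself happens just past this hunk's context (audio = FLAC(final_audio_path)). A minimal sketch of how the string is presumably attached with mutagen, assuming a "comment" field name that this diff does not show:

```python
from mutagen.flac import FLAC

audio = FLAC(final_audio_path)        # final_audio_path comes from the surrounding code
audio["comment"] = metadata_string    # mutagen wraps a plain string into a tag list
audio.save()
```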
 