avans06 committed
Commit 2a4c238 · 1 Parent(s): 5700ed3

fix(metadata): Correctly log auto-analyzed transcription parameters


Resolves an issue where parameters generated by the "Auto-Analyze Audio" mode were not being saved in the output file's metadata.
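For context, once the fix is in place the embedded log can be checked by reading the tag back from an output file. A minimal read-back sketch, assuming the JSON is stored under a Vorbis comment field named "comment" (the exact tag name is not visible in this diff):

```python
import json
from mutagen.flac import FLAC  # the FLAC class in the diff appears to be mutagen's

audio = FLAC("output.flac")                 # illustrative path
metadata = json.loads(audio["comment"][0])  # FLAC tag values are lists of strings
print(metadata["transcription_log"])        # per-stem parameters, or "Not Performed"
```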

Files changed (1)
  1. app.py +64 -15
app.py CHANGED
@@ -294,14 +294,45 @@ def analyze_audio_for_adaptive_params(audio_data: np.ndarray, sample_rate: int):
     return params
 
 
-def format_params_for_metadata(params: AppParameters) -> str:
+def format_params_for_metadata(params: AppParameters, transcription_log: dict = None) -> str:
     """
     Formats the AppParameters object into a human-readable string
     suitable for embedding as metadata in an audio file.
     """
     import json
-    # Convert the dataclass to a dictionary
-    params_dict = params.__dict__
+    # Start with a clean dictionary of the main parameters
+    params_dict = copy.copy(params.__dict__)
+
+    # Create a structured dictionary for the final metadata
+    structured_metadata = {
+        "main_settings": {},
+        "transcription_log": transcription_log if transcription_log else "Not Performed",
+        "synthesis_settings": {}
+    }
+
+    # Separate parameters into logical groups
+    transcription_keys = [
+        'transcription_method', 'basic_pitch_preset_selector', 'onset_threshold',
+        'frame_threshold', 'minimum_note_length', 'minimum_frequency', 'maximum_frequency',
+        'infer_onsets', 'melodia_trick', 'multiple_pitch_bends'
+    ]
+
+    synthesis_keys = [key for key in params_dict.keys() if key.startswith('s8bit_')]
+
+    # Populate the structured dictionary
+    for key, value in params_dict.items():
+        if key not in transcription_keys and key not in synthesis_keys:
+            structured_metadata["main_settings"][key] = value
+
+    for key in synthesis_keys:
+        structured_metadata["synthesis_settings"][key] = params_dict[key]
+
+    # If transcription log is empty, we still want to record the UI settings for transcription
+    if not transcription_log:
+        structured_metadata["transcription_log"] = {
+            "ui_settings": {key: params_dict[key] for key in transcription_keys}
+        }
+
     # Use json.dumps for clean, well-formatted, multi-line string representation
     # indent=2 makes it look nice when read back
-    return json.dumps(params_dict, indent=2)
+    return json.dumps(structured_metadata, indent=2)
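To make the new grouping concrete, here is a self-contained sketch of the same three-way split; the FakeParams fields are invented stand-ins for the real AppParameters dataclass:

```python
import json
from dataclasses import dataclass

@dataclass
class FakeParams:                       # stand-in for AppParameters
    transcription_method: str = "General Purpose"
    onset_threshold: float = 0.5
    s8bit_waveform: str = "square"      # synthesis keys share the "s8bit_" prefix
    output_format: str = "flac"         # everything else lands in main_settings

params_dict = dict(FakeParams().__dict__)
transcription_keys = ['transcription_method', 'onset_threshold']
synthesis_keys = [k for k in params_dict if k.startswith('s8bit_')]

structured = {
    "main_settings": {k: v for k, v in params_dict.items()
                      if k not in transcription_keys and k not in synthesis_keys},
    "transcription_log": {"ui_settings": {k: params_dict[k] for k in transcription_keys}},
    "synthesis_settings": {k: params_dict[k] for k in synthesis_keys},
}
print(json.dumps(structured, indent=2))  # mirrors the metadata layout built above
```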
@@ -1310,7 +1341,7 @@ def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
 # =================================================================================================
 # === MIDI Merging Function ===
 # =================================================================================================
-def merge_midis(midi_path_left, midi_path_right, output_path):
+def merge_midis(midi_path_left: str, midi_path_right: str, output_path: str):
     """
     Merges two MIDI files into a single MIDI file. This robust version iterates
     through ALL instruments in both MIDI files, ensuring no data is lost if the
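For illustration, the "iterate through ALL instruments" strategy the docstring describes could look like the following pretty_midi sketch; this is an assumed reconstruction, not the app's actual merge_midis body:

```python
import pretty_midi

def merge_midis_sketch(midi_path_left: str, midi_path_right: str, output_path: str) -> str:
    merged = pretty_midi.PrettyMIDI()
    for path, channel_tag in ((midi_path_left, "L"), (midi_path_right, "R")):
        source = pretty_midi.PrettyMIDI(path)
        for inst in source.instruments:   # copy every instrument, not just the first
            inst.name = f"{inst.name or 'track'} ({channel_tag})"
            merged.instruments.append(inst)
    merged.write(output_path)
    return output_path
```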
@@ -2041,7 +2072,10 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
     """
     Takes a single audio file path and runs the full transcription pipeline on it.
     This includes stereo/mono handling and normalization.
-    Returns the file path of the resulting transcribed MIDI.
+    Returns:
+        A tuple containing:
+        - The file path of the resulting transcribed MIDI.
+        - The dictionary of the final basic_pitch parameters that were actually used.
     """
     print(f"\n--- Transcribing Stem: {os.path.basename(audio_path)} ---")
 
@@ -2094,16 +2128,16 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
 
         if midi_path_left and midi_path_right:
             merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
-            return merge_midis(midi_path_left, midi_path_right, merged_midi_path)
+            return merge_midis(midi_path_left, midi_path_right, merged_midi_path), final_bp_params
         elif midi_path_left:
             print("Warning: Right channel transcription failed. Using left channel only.")
-            return midi_path_left
+            return midi_path_left, final_bp_params
         elif midi_path_right:
             print("Warning: Left channel transcription failed. Using right channel only.")
-            return midi_path_right
+            return midi_path_right, final_bp_params
         else:
             print(f"Warning: Stereo transcription failed for stem {base_name}.")
-            return None
+            return None, {}
     else:
         print("Mono processing for stem.")
         mono_signal_np = np.mean(audio_data, axis=0) if audio_data.ndim > 1 else audio_data
@@ -2112,9 +2146,10 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
         sf.write(temp_mono_path, normalized_mono, native_sample_rate)
 
         if params.transcription_method == "General Purpose":
-            return TranscribeGeneralAudio(temp_mono_path, **final_bp_params)
+            return TranscribeGeneralAudio(temp_mono_path, **final_bp_params), final_bp_params
         else:
-            return TranscribePianoAudio(temp_mono_path)
+            # For piano, there are no bp_params, so we return an empty dict
+            return TranscribePianoAudio(temp_mono_path), {}
 
 
 # --- The core processing engine for a single file ---
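With the hunks above, every return path in _transcribe_stem now yields a 2-tuple, so call sites unpack both values; a usage sketch (variable names illustrative):

```python
midi_path, used_bp_params = _transcribe_stem(stem_path, stem_base_name, temp_dir, params)
if not midi_path:
    # Failure paths return (None, {}), so there is nothing to log either.
    print(f"Transcription failed for {stem_base_name}")
```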
@@ -2145,6 +2180,8 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
     # --- Use the provided timestamp for unique filenames ---
     timestamped_base_name = f"{base_name}_{timestamp}"
 
+    # --- Dictionary to log parameters for each transcribed stem ---
+    transcription_params_log = {}
 
     # --- Step 1: Check file type and transcribe if necessary ---
     if is_midi_input:
@@ -2196,7 +2233,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
     # --- Demucs Vocal Separation Logic ---
     # This block now handles multi-stem separation, transcription, and merging logic.
     separated_stems = {} # This will store the audio tensors for merging
-
+
     if params.separate_vocals and demucs_model is not None:
         # --- Vocal Separation Workflow ---
         update_progress(0.2, "Separating audio with Demucs...")
@@ -2255,9 +2292,14 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
             update_progress(0.3 + (0.3 * (i / stem_count)), f"Transcribing stem: {name}...")
             stem_path = os.path.join(temp_dir, f"{timestamped_base_name}_{name}.flac")
             torchaudio.save(stem_path, tensor.cpu(), demucs_model.samplerate)
-            midi_path = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params)
+            midi_path, used_bp_params = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params)
             if midi_path:
                 transcribed_midi_paths.append((name, midi_path))
+                # --- Log the used parameters for this specific stem ---
+                if used_bp_params:
+                    # Also log which preset was active for this stem
+                    used_bp_params['preset_selector_mode'] = params.basic_pitch_preset_selector
+                    transcription_params_log[name] = used_bp_params
 
         # --- Merge Transcribed MIDIs ---
         if not transcribed_midi_paths:
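After this loop, transcription_params_log maps each stem name to the basic_pitch parameters that were actually applied. An illustrative shape, with invented values and typical Demucs stem names:

```python
# Illustrative only; real values come from the auto-analysis or the selected preset.
transcription_params_log = {
    "vocals":    {"onset_threshold": 0.6, "frame_threshold": 0.3,
                  "preset_selector_mode": "Auto-Analyze Audio"},
    "no_vocals": {"onset_threshold": 0.5, "frame_threshold": 0.3,
                  "preset_selector_mode": "Auto-Analyze Audio"},
}
```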
@@ -2285,7 +2327,14 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
         torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate)
 
         update_progress(0.2, "Transcribing audio to MIDI...")
-        midi_path_for_rendering = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params)
+        midi_path_for_rendering, used_bp_params = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params)
+
+        # --- Populate the log in this workflow as well ---
+        if used_bp_params:
+            used_bp_params['preset_selector_mode'] = params.basic_pitch_preset_selector
+            # Use a standard key like "full_mix" for the log
+            transcription_params_log["full_mix"] = used_bp_params
+            print(" - Logged transcription parameters for the full mix.")
 
         if not midi_path_for_rendering or not os.path.exists(midi_path_for_rendering):
             print(f"ERROR: Transcription failed for {filename}. Skipping.")
@@ -2393,7 +2442,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
     # --- Save audio with embedded parameter metadata ---
     try:
         # Generate the metadata string from the final parameters used for the render.
-        metadata_string = format_params_for_metadata(params)
+        metadata_string = format_params_for_metadata(params, transcription_params_log)
 
         sf.write(final_audio_path, final_audio_data, final_srate)
         audio = FLAC(final_audio_path)
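The tag write itself happens just past this hunk's context (audio = FLAC(final_audio_path)). A minimal sketch of how the string is presumably attached with mutagen, assuming a "comment" field name that this diff does not show:

```python
from mutagen.flac import FLAC

audio = FLAC(final_audio_path)        # final_audio_path comes from the surrounding code
audio["comment"] = metadata_string    # mutagen wraps a plain string into a tag list
audio.save()
```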
 