fix(metadata): Correctly log auto-analyzed transcription parameters
Resolves an issue where parameters generated by the "Auto-Analyze Audio" mode were not being saved in the output file's metadata.
app.py
CHANGED
@@ -294,14 +294,45 @@ def analyze_audio_for_adaptive_params(audio_data: np.ndarray, sample_rate: int):
     return params
 
 
-def format_params_for_metadata(params: AppParameters) -> str:
+def format_params_for_metadata(params: AppParameters, transcription_log: dict = None) -> str:
     """
     Formats the AppParameters object into a human-readable string
     suitable for embedding as metadata in an audio file.
     """
     import json
-    #
-    params_dict = params.__dict__
+    # Start with a clean dictionary of the main parameters
+    params_dict = copy.copy(params.__dict__)
+
+    # Create a structured dictionary for the final metadata
+    structured_metadata = {
+        "main_settings": {},
+        "transcription_log": transcription_log if transcription_log else "Not Performed",
+        "synthesis_settings": {}
+    }
+
+    # Separate parameters into logical groups
+    transcription_keys = [
+        'transcription_method', 'basic_pitch_preset_selector', 'onset_threshold',
+        'frame_threshold', 'minimum_note_length', 'minimum_frequency', 'maximum_frequency',
+        'infer_onsets', 'melodia_trick', 'multiple_pitch_bends'
+    ]
+
+    synthesis_keys = [key for key in params_dict.keys() if key.startswith('s8bit_')]
+
+    # Populate the structured dictionary
+    for key, value in params_dict.items():
+        if key not in transcription_keys and key not in synthesis_keys:
+            structured_metadata["main_settings"][key] = value
+
+    for key in synthesis_keys:
+        structured_metadata["synthesis_settings"][key] = params_dict[key]
+
+    # If transcription log is empty, we still want to record the UI settings for transcription
+    if not transcription_log:
+        structured_metadata["transcription_log"] = {
+            "ui_settings": {key: params_dict[key] for key in transcription_keys}
+        }
+
     # Use json.dumps for clean, well-formatted, multi-line string representation
     # indent=2 makes it look nice when read back
-    return json.dumps(params_dict, indent=2)
+    return json.dumps(structured_metadata, indent=2)
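Note: with this change the embedded metadata becomes a structured JSON document instead of a flat parameter dump. A minimal sketch of the resulting shape, using hypothetical parameter names and values rather than output from a real run:

import json

# Hypothetical stand-ins illustrating the three groups the function sorts keys into.
transcription_log = {"vocals": {"onset_threshold": 0.42, "frame_threshold": 0.35}}
structured_metadata = {
    "main_settings": {"separate_vocals": True},          # everything not transcription- or synthesis-related
    "transcription_log": transcription_log,              # or "Not Performed" / {"ui_settings": {...}} fallback
    "synthesis_settings": {"s8bit_waveform": "square"},  # every key prefixed with "s8bit_"
}
print(json.dumps(structured_metadata, indent=2))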
@@ -1310,7 +1341,7 @@ def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
 # =================================================================================================
 # === MIDI Merging Function ===
 # =================================================================================================
-def merge_midis(midi_path_left, midi_path_right, output_path):
+def merge_midis(midi_path_left: str, midi_path_right: str, output_path: str):
     """
     Merges two MIDI files into a single MIDI file. This robust version iterates
     through ALL instruments in both MIDI files, ensuring no data is lost if the
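merge_midis only gains type hints here, but the all-instruments strategy its docstring describes is the reason stereo transcription loses no data. A minimal sketch of that strategy, assuming pretty_midi (the diff does not show the function body, so this is illustrative, not the project's actual implementation):

import pretty_midi

def merge_midis_sketch(midi_path_left: str, midi_path_right: str, output_path: str) -> str:
    # Load both single-channel transcriptions.
    left = pretty_midi.PrettyMIDI(midi_path_left)
    right = pretty_midi.PrettyMIDI(midi_path_right)

    merged = pretty_midi.PrettyMIDI()
    # Copy every instrument from both files, so a multi-instrument
    # transcription is never silently collapsed into a single track.
    for source in (left, right):
        for instrument in source.instruments:
            merged.instruments.append(instrument)

    merged.write(output_path)
    return output_path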
@@ -2041,7 +2072,10 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: AppParameters):
     """
     Takes a single audio file path and runs the full transcription pipeline on it.
     This includes stereo/mono handling and normalization.
-    Returns
+    Returns:
+        A tuple containing:
+        - The file path of the resulting transcribed MIDI.
+        - The dictionary of the final basic_pitch parameters that were actually used.
     """
     print(f"\n--- Transcribing Stem: {os.path.basename(audio_path)} ---")
 
@@ -2094,16 +2128,16 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: AppParameters):
 
         if midi_path_left and midi_path_right:
             merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
-            return merge_midis(midi_path_left, midi_path_right, merged_midi_path)
+            return merge_midis(midi_path_left, midi_path_right, merged_midi_path), final_bp_params
         elif midi_path_left:
             print("Warning: Right channel transcription failed. Using left channel only.")
-            return midi_path_left
+            return midi_path_left, final_bp_params
         elif midi_path_right:
             print("Warning: Left channel transcription failed. Using right channel only.")
-            return midi_path_right
+            return midi_path_right, final_bp_params
         else:
             print(f"Warning: Stereo transcription failed for stem {base_name}.")
-            return None
+            return None, {}
     else:
         print("Mono processing for stem.")
         mono_signal_np = np.mean(audio_data, axis=0) if audio_data.ndim > 1 else audio_data
@@ -2112,9 +2146,10 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: AppParameters):
         sf.write(temp_mono_path, normalized_mono, native_sample_rate)
 
         if params.transcription_method == "General Purpose":
-            return TranscribeGeneralAudio(temp_mono_path, **final_bp_params)
+            return TranscribeGeneralAudio(temp_mono_path, **final_bp_params), final_bp_params
         else:
-            return TranscribePianoAudio(temp_mono_path)
+            # For piano, there are no bp_params, so we return an empty dict
+            return TranscribePianoAudio(temp_mono_path), {}
 
 
 # --- The core processing engine for a single file ---
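Every exit path of _transcribe_stem now returns a (midi_path, bp_params) pair, so a bare truthiness check on the result would be wrong: (None, {}) is a truthy tuple. Callers must unpack first, as the updated call sites below do. A caller-side sketch (stem_path, stem_name, temp_dir, and params stand in for the pipeline's real variables):

# After this change, _transcribe_stem always returns a 2-tuple.
midi_path, used_bp_params = _transcribe_stem(stem_path, stem_name, temp_dir, params)

if midi_path is None:
    # Transcription failed; used_bp_params is {} on this path.
    print(f"Warning: transcription failed for {stem_name}.")
elif used_bp_params:
    # Only the General Purpose (basic_pitch) path reports parameters;
    # the piano path intentionally returns an empty dict.
    transcription_params_log[stem_name] = used_bp_params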
@@ -2145,6 +2180,8 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters):
     # --- Use the provided timestamp for unique filenames ---
     timestamped_base_name = f"{base_name}_{timestamp}"
 
+    # --- Dictionary to log parameters for each transcribed stem ---
+    transcription_params_log = {}
 
     # --- Step 1: Check file type and transcribe if necessary ---
     if is_midi_input:
@@ -2196,7 +2233,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters):
     # --- Demucs Vocal Separation Logic ---
     # This block now handles multi-stem separation, transcription, and merging logic.
     separated_stems = {} # This will store the audio tensors for merging
-
+
     if params.separate_vocals and demucs_model is not None:
         # --- Vocal Separation Workflow ---
         update_progress(0.2, "Separating audio with Demucs...")
@@ -2255,9 +2292,14 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters):
             update_progress(0.3 + (0.3 * (i / stem_count)), f"Transcribing stem: {name}...")
             stem_path = os.path.join(temp_dir, f"{timestamped_base_name}_{name}.flac")
             torchaudio.save(stem_path, tensor.cpu(), demucs_model.samplerate)
-            midi_path = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params)
+            midi_path, used_bp_params = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params)
             if midi_path:
                 transcribed_midi_paths.append((name, midi_path))
+                # --- Log the used parameters for this specific stem ---
+                if used_bp_params:
+                    # Also log which preset was active for this stem
+                    used_bp_params['preset_selector_mode'] = params.basic_pitch_preset_selector
+                    transcription_params_log[name] = used_bp_params
 
         # --- Merge Transcribed MIDIs ---
         if not transcribed_midi_paths:
@@ -2285,7 +2327,14 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters):
         torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate)
 
         update_progress(0.2, "Transcribing audio to MIDI...")
-        midi_path_for_rendering = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params)
+        midi_path_for_rendering, used_bp_params = _transcribe_stem(audio_to_transcribe_path, f"{timestamped_base_name}_original", temp_dir, params)
+
+        # --- Populate the log in this workflow as well ---
+        if used_bp_params:
+            used_bp_params['preset_selector_mode'] = params.basic_pitch_preset_selector
+            # Use a standard key like "full_mix" for the log
+            transcription_params_log["full_mix"] = used_bp_params
+            print(" - Logged transcription parameters for the full mix.")
 
         if not midi_path_for_rendering or not os.path.exists(midi_path_for_rendering):
             print(f"ERROR: Transcription failed for {filename}. Skipping.")
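After either workflow, transcription_params_log holds one entry per transcribed source, keyed by stem name or "full_mix". Illustrative contents for a two-stem Demucs run (stem names and values are hypothetical, not captured output):

transcription_params_log = {
    "vocals": {
        "onset_threshold": 0.42,   # auto-analyzed per stem
        "frame_threshold": 0.35,
        "preset_selector_mode": "Auto-Analyze Audio",
    },
    "no_vocals": {
        "onset_threshold": 0.55,
        "frame_threshold": 0.30,
        "preset_selector_mode": "Auto-Analyze Audio",
    },
}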
@@ -2393,7 +2442,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppParameters):
         # --- Save audio with embedded parameter metadata ---
         try:
             # Generate the metadata string from the final parameters used for the render.
-            metadata_string = format_params_for_metadata(params)
+            metadata_string = format_params_for_metadata(params, transcription_params_log)
 
             sf.write(final_audio_path, final_audio_data, final_srate)
             audio = FLAC(final_audio_path)
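The hunk ends before the tag assignment, so the exact Vorbis comment field app.py writes is not shown; assuming the string lands in a tag such as "comment", the fix can be verified by reading the FLAC back and parsing the JSON:

import json
from mutagen.flac import FLAC

audio = FLAC("output/song_rendered.flac")  # hypothetical output path
# "comment" is an assumed tag name; use whichever key app.py actually sets.
metadata = json.loads(audio["comment"][0])

# With the fix, the auto-analyzed parameters survive the round trip.
print(metadata["transcription_log"])   # per-stem basic_pitch parameters, or UI settings fallback
print(metadata["synthesis_settings"])  # every s8bit_* parameter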