avans06 committed on
Commit
5700ed3
·
1 Parent(s): 487757b

feat(transcription): Implement adaptive parameters for basic-pitch

Browse files

Adds a new "Auto-Analyze Audio" mode for the "General Purpose" transcription.

When this mode is enabled, the system analyzes the input audio's tempo, spectral content, and dynamics to automatically determine optimal parameters for `basic-pitch`.

Files changed (1) hide show
  1. app.py +154 -22
app.py CHANGED
@@ -91,7 +91,7 @@ SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)"
91
 
92
 
93
  # =================================================================================================
94
- # === NEW: Central Parameter Object ===
95
  # =================================================================================================
96
 
97
  @dataclass
@@ -125,6 +125,7 @@ class AppParameters:
125
 
126
  enable_stereo_processing: bool = False
127
  transcription_method: str = "General Purpose"
 
128
 
129
  # Basic Pitch Settings
130
  onset_threshold: float = 0.5
@@ -206,6 +207,93 @@ class AppParameters:
206
  # === Helper Functions ===
207
  # =================================================================================================
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  def format_params_for_metadata(params: AppParameters) -> str:
210
  """
211
  Formats the AppParameters object into a human-readable string
@@ -1386,7 +1474,7 @@ def TranscribePianoAudio(input_file):
1386
  # Return the path to the newly created MIDI file
1387
  return out_mid_path
1388
 
1389
- def TranscribeGeneralAudio(input_file, onset_threshold, frame_threshold, minimum_note_length, minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_bends):
1390
  """
1391
  Transcribes a general audio file into a MIDI file using basic-pitch.
1392
  This is suitable for various instruments and vocals.
@@ -1409,14 +1497,7 @@ def TranscribeGeneralAudio(input_file, onset_threshold, frame_threshold, minimum
1409
  model_output, midi_data, note_events = basic_pitch.inference.predict(
1410
  audio_path=input_file,
1411
  model_or_model_path=ICASSP_2022_MODEL_PATH,
1412
- onset_threshold=onset_threshold,
1413
- frame_threshold=frame_threshold,
1414
- minimum_note_length=minimum_note_length,
1415
- minimum_frequency=minimum_frequency,
1416
- maximum_frequency=maximum_frequency,
1417
- infer_onsets=infer_onsets,
1418
- melodia_trick=melodia_trick,
1419
- multiple_pitch_bends=multiple_bends
1420
  )
1421
 
1422
  # --- Save the MIDI file ---
@@ -1967,6 +2048,25 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
1967
  # Load the audio stem to process it
1968
  audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False)
1969
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1970
  if params.enable_stereo_processing and audio_data.ndim == 2 and audio_data.shape[0] == 2:
1971
  print("Stereo processing enabled for stem.")
1972
  left_channel_np = audio_data[0]
@@ -1986,8 +2086,8 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
1986
 
1987
  print("Transcribing left and right channel...")
1988
  if params.transcription_method == "General Purpose":
1989
- midi_path_left = TranscribeGeneralAudio(temp_left_path, params.onset_threshold, params.frame_threshold, params.minimum_note_length, params.minimum_frequency, params.maximum_frequency, params.infer_onsets, params.melodia_trick, params.multiple_pitch_bends)
1990
- midi_path_right = TranscribeGeneralAudio(temp_right_path, params.onset_threshold, params.frame_threshold, params.minimum_note_length, params.minimum_frequency, params.maximum_frequency, params.infer_onsets, params.melodia_trick, params.multiple_pitch_bends)
1991
  else: # Piano-Specific
1992
  midi_path_left = TranscribePianoAudio(temp_left_path)
1993
  midi_path_right = TranscribePianoAudio(temp_right_path)
@@ -2012,7 +2112,7 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
2012
  sf.write(temp_mono_path, normalized_mono, native_sample_rate)
2013
 
2014
  if params.transcription_method == "General Purpose":
2015
- return TranscribeGeneralAudio(temp_mono_path, params.onset_threshold, params.frame_threshold, params.minimum_note_length, params.minimum_frequency, params.maximum_frequency, params.infer_onsets, params.melodia_trick, params.multiple_pitch_bends)
2016
  else:
2017
  return TranscribePianoAudio(temp_mono_path)
2018
 
@@ -3186,20 +3286,52 @@ if __name__ == "__main__":
3186
  with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
3187
  # --- Preset dropdown for basic_pitch ---
3188
  basic_pitch_preset_selector = gr.Dropdown(
3189
- choices=["Custom"] + list(BASIC_PITCH_PRESETS.keys()),
3190
  value="Default (Balanced)",
3191
  label="Transcription Profile Preset",
3192
  info="Select a profile to auto-fill settings for different instrument types."
3193
  "For reference only; it is recommended to test and adjust for optimal results.")
3194
  # --- The existing basic_pitch components ---
3195
- onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
3196
- frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
3197
- minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.")
3198
- minimum_frequency = gr.Slider(0, 500, value=60, step=5, label="Minimum Frequency (Hz)", info="Ignores pitches below this frequency.")
3199
- maximum_frequency = gr.Slider(501, 10000, value=4000, step=10, label="Maximum Frequency (Hz)", info="Ignores pitches above this frequency.")
3200
- infer_onsets = gr.Checkbox(value=True, label="Infer Onsets (Boost Onsets)")
3201
- melodia_trick = gr.Checkbox(value=True, label="Melodia Trick (Contour Optimization)")
3202
- multiple_pitch_bends = gr.Checkbox(value=False, label="Allow Multiple Pitch Bends")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3203
 
3204
  with gr.Column(scale=1):
3205
  # --- Rendering Settings ---
 
91
 
92
 
93
  # =================================================================================================
94
+ # === Central Parameter Object ===
95
  # =================================================================================================
96
 
97
  @dataclass
 
125
 
126
  enable_stereo_processing: bool = False
127
  transcription_method: str = "General Purpose"
128
+ basic_pitch_preset_selector: str = "Default (Balanced)"
129
 
130
  # Basic Pitch Settings
131
  onset_threshold: float = 0.5
 
207
  # === Helper Functions ===
208
  # =================================================================================================
209
 
210
def analyze_audio_for_adaptive_params(audio_data: np.ndarray, sample_rate: int) -> dict:
    """
    Analyzes raw audio data to dynamically determine optimal parameters for basic-pitch.

    Each analysis step is best-effort: if a step raises, its parameters are simply
    omitted from the returned dict, so the caller's manual/default settings remain
    in effect for those keys.

    Args:
        audio_data: The audio signal as a NumPy array (can be stereo).
        sample_rate: The sample rate of the audio.

    Returns:
        A dictionary of recommended parameters for basic_pitch. Only the keys whose
        analysis succeeded are present.
    """
    print(" - Running adaptive analysis on audio to determine optimal transcription parameters...")

    # Ensure audio is mono for most feature extractions
    y_mono = librosa.to_mono(audio_data) if audio_data.ndim > 1 else audio_data

    params = {}

    # 1. Tempo detection with enhanced stability
    try:
        # librosa >= 0.10 moved tempo() from librosa.beat to librosa.feature.rhythm
        # (the old location is deprecated); prefer the new API and fall back for
        # older installs.
        try:
            tempo_fn = librosa.feature.rhythm.tempo
        except AttributeError:
            tempo_fn = librosa.beat.tempo
        tempo_info = tempo_fn(y=y_mono, sr=sample_rate, aggregate=np.median)

        # Ensure BPM is a scalar float (tempo() may return a length-1 array)
        bpm = float(np.median(tempo_info))

        if bpm <= 0 or np.isnan(bpm):
            raise ValueError("Invalid BPM detected")

        # A 64th note is a reasonable shortest note length for most music
        # Duration of a beat (quarter note) in seconds = 60 / BPM
        # Duration of a 64th note = (60 / BPM) / 16
        min_len_s = (60.0 / bpm) / 16.0
        # basic-pitch expects milliseconds; clamp to a 20 ms floor to avoid
        # pathological values at very high detected tempi
        params['minimum_note_length'] = max(20, int(min_len_s * 1000))
        print(f" - Detected BPM (median): {bpm:.1f} -> minimum_note_length: {params['minimum_note_length']}ms")
    except Exception as e:
        print(f" - BPM detection failed, using default minimum_note_length. Error: {e}")

    # 2. Spectral analysis: centroid + rolloff for richer info
    try:
        spectral_centroid = librosa.feature.spectral_centroid(y=y_mono, sr=sample_rate)[0]
        rolloff = librosa.feature.spectral_rolloff(y=y_mono, sr=sample_rate)[0]
        avg_centroid = float(np.mean(spectral_centroid))
        avg_rolloff = float(np.mean(rolloff))
        print(f" - Spectral centroid: {avg_centroid:.1f} Hz, rolloff (85%): {avg_rolloff:.1f} Hz")
        # Simple logic: if the 'center of mass' of the spectrum is low, it's bass-heavy.
        # If it's high, it contains high-frequency content.
        if avg_centroid < 500 and avg_rolloff < 1500:
            params['minimum_frequency'] = 30
            params['maximum_frequency'] = 1200
        elif avg_centroid > 2000 or avg_rolloff > 5000:  # Likely bright, high-frequency content (cymbals, flutes)
            params['minimum_frequency'] = 100
            params['maximum_frequency'] = 8000
        else:
            params['minimum_frequency'] = 50
            params['maximum_frequency'] = 4000
    except Exception as e:
        print(f" - Spectral analysis failed, using default frequencies. Error: {e}")

    # 3. Onset threshold based on percussiveness
    try:
        y_harmonic, y_percussive = librosa.effects.hpss(y_mono)
        # Small epsilon guards against division by zero on a silent harmonic part
        percussive_ratio = float(np.sum(y_percussive**2) / (np.sum(y_harmonic**2) + 1e-10))
        # If the percussive energy is high, we need a higher onset threshold to be stricter
        params['onset_threshold'] = 0.6 if percussive_ratio > 0.5 else 0.45
        print(f" - Percussive ratio: {percussive_ratio:.2f} -> onset_threshold: {params['onset_threshold']}")
    except Exception as e:
        print(f" - Percussiveness analysis failed, using default onset_threshold. Error: {e}")

    # 4. Frame threshold from RMS
    try:
        rms = librosa.feature.rms(y=y_mono)[0]
        # Use the 10th percentile of energy as a proxy for the noise floor
        noise_floor_rms = np.percentile(rms, 10)
        # Set the frame_threshold to be slightly above this noise floor.
        # The scaling factor here is empirical and can be tuned; the result is
        # clamped to [0.05, 0.4] and cast to a plain Python float so the dict
        # serializes/prints cleanly downstream.
        params['frame_threshold'] = float(max(0.05, min(0.4, noise_floor_rms * 4)))
        print(f" - Noise floor RMS: {noise_floor_rms:.5f} -> frame_threshold: {params['frame_threshold']:.2f}")
    except Exception as e:
        print(f" - RMS analysis failed, using default frame_threshold. Error: {e}")

    return params
295
+
296
+
297
  def format_params_for_metadata(params: AppParameters) -> str:
298
  """
299
  Formats the AppParameters object into a human-readable string
 
1474
  # Return the path to the newly created MIDI file
1475
  return out_mid_path
1476
 
1477
+ def TranscribeGeneralAudio(input_file, **kwargs):
1478
  """
1479
  Transcribes a general audio file into a MIDI file using basic-pitch.
1480
  This is suitable for various instruments and vocals.
 
1497
  model_output, midi_data, note_events = basic_pitch.inference.predict(
1498
  audio_path=input_file,
1499
  model_or_model_path=ICASSP_2022_MODEL_PATH,
1500
+ **kwargs
 
 
 
 
 
 
 
1501
  )
1502
 
1503
  # --- Save the MIDI file ---
 
2048
  # Load the audio stem to process it
2049
  audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False)
2050
 
2051
+ # --- Adaptive Parameter Logic ---
2052
+ final_bp_params = {
2053
+ "onset_threshold": params.onset_threshold,
2054
+ "frame_threshold": params.frame_threshold,
2055
+ "minimum_note_length": params.minimum_note_length,
2056
+ "minimum_frequency": params.minimum_frequency,
2057
+ "maximum_frequency": params.maximum_frequency,
2058
+ "infer_onsets": params.infer_onsets,
2059
+ "melodia_trick": params.melodia_trick,
2060
+ "multiple_pitch_bends": params.multiple_pitch_bends,
2061
+ }
2062
+
2063
+ # Check if the user has selected the auto-analysis option from the dropdown.
2064
+ if params.transcription_method == "General Purpose" and params.basic_pitch_preset_selector == "Auto-Analyze Audio":
2065
+ adaptive_params = analyze_audio_for_adaptive_params(audio_data, native_sample_rate)
2066
+ # Update the final_bp_params dictionary with the new adaptive values
2067
+ final_bp_params.update(adaptive_params)
2068
+ print(f" - Overriding manual settings with auto-analyzed parameters. final_bp_params: {final_bp_params}")
2069
+
2070
  if params.enable_stereo_processing and audio_data.ndim == 2 and audio_data.shape[0] == 2:
2071
  print("Stereo processing enabled for stem.")
2072
  left_channel_np = audio_data[0]
 
2086
 
2087
  print("Transcribing left and right channel...")
2088
  if params.transcription_method == "General Purpose":
2089
+ midi_path_left = TranscribeGeneralAudio(temp_left_path, **final_bp_params)
2090
+ midi_path_right = TranscribeGeneralAudio(temp_right_path, **final_bp_params)
2091
  else: # Piano-Specific
2092
  midi_path_left = TranscribePianoAudio(temp_left_path)
2093
  midi_path_right = TranscribePianoAudio(temp_right_path)
 
2112
  sf.write(temp_mono_path, normalized_mono, native_sample_rate)
2113
 
2114
  if params.transcription_method == "General Purpose":
2115
+ return TranscribeGeneralAudio(temp_mono_path, **final_bp_params)
2116
  else:
2117
  return TranscribePianoAudio(temp_mono_path)
2118
 
 
3286
  with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
3287
  # --- Preset dropdown for basic_pitch ---
3288
  basic_pitch_preset_selector = gr.Dropdown(
3289
+ choices=["Auto-Analyze Audio", "Custom"] + list(BASIC_PITCH_PRESETS.keys()),
3290
  value="Default (Balanced)",
3291
  label="Transcription Profile Preset",
3292
  info="Select a profile to auto-fill settings for different instrument types."
3293
  "For reference only; it is recommended to test and adjust for optimal results.")
3294
  # --- The existing basic_pitch components ---
3295
+ onset_threshold = gr.Slider(
3296
+ 0.0, 1.0, value=0.5, step=0.05,
3297
+ label="On-set Threshold",
3298
+ info="Sensitivity for detecting the start of a new note. Lower values will detect more notes (even faint ones), but may create false positives. Higher values are stricter and cleaner, but might miss subtle notes."
3299
+ )
3300
+ frame_threshold = gr.Slider(
3301
+ 0.0, 1.0, value=0.3, step=0.05,
3302
+ label="Frame Threshold",
3303
+ info="Sensitivity for determining if a note is 'on' or 'off'. Lower values will sustain notes longer, but can merge distinct notes. Higher values create shorter, more separated notes, but might cut off tails."
3304
+ )
3305
+ minimum_note_length = gr.Slider(
3306
+ 10, 500, value=128, step=1,
3307
+ label="Minimum Note Length (ms)",
3308
+ info="Filters out notes shorter than this duration. Increase this to remove fast, noisy artifacts or clicks. Decrease it if the transcription is missing very short, staccato notes."
3309
+ )
3310
+ minimum_frequency = gr.Slider(
3311
+ 0, 500, value=60, step=5,
3312
+ label="Minimum Frequency (Hz)",
3313
+ info="Ignores any detected pitches below this frequency. Increase this to filter out low-frequency noise like rumble or hum. Set it just below your target instrument's lowest note (e.g., ~80Hz for guitar)."
3314
+ )
3315
+ maximum_frequency = gr.Slider(
3316
+ 501, 10000, value=4000, step=10,
3317
+ label="Maximum Frequency (Hz)",
3318
+ info="Ignores any detected pitches above this frequency. Decrease this to filter out high-frequency noise like hiss or cymbals. Set it just above your target instrument's highest note (e.g., ~1200Hz for vocals)."
3319
+ )
3320
+ infer_onsets = gr.Checkbox(
3321
+ value=True,
3322
+ label="Infer Onsets (Boost Onsets)",
3323
+ info="When enabled, the model actively looks for and emphasizes the start of each note (the 'attack'). Recommended for percussive or clear, rhythmic music. Disable for very smooth, legato music like vocal pads."
3324
+ )
3325
+ melodia_trick = gr.Checkbox(
3326
+ value=True,
3327
+ label="Melodia Trick (Contour Optimization)",
3328
+ info="When enabled, uses a secondary melody-detection algorithm to refine the main pitch contour. Highly recommended for most melodic content. Disable if you are transcribing non-melodic noise or complex polyphony."
3329
+ )
3330
+ multiple_pitch_bends = gr.Checkbox(
3331
+ value=False,
3332
+ label="Allow Multiple Pitch Bends",
3333
+ info="When enabled, allows a single note to have multiple, continuous pitch bends within it. Essential for transcribing vocals, slides, or vibrato-heavy instruments. Disable for clean, discrete notes like a standard piano."
3334
+ )
3335
 
3336
  with gr.Column(scale=1):
3337
  # --- Rendering Settings ---