feat(transcription): Implement adaptive parameters for basic-pitch
Adds a new "Auto-Analyze Audio" mode for the "General Purpose" transcription.
When this mode is enabled, the system analyzes the input audio's tempo, spectral content, and dynamics to automatically determine optimal parameters for `basic-pitch`.
app.py
CHANGED
@@ -91,7 +91,7 @@ SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)"
|
|
91 |
|
92 |
|
93 |
# =================================================================================================
|
94 |
-
# ===
|
95 |
# =================================================================================================
|
96 |
|
97 |
@dataclass
|
@@ -125,6 +125,7 @@ class AppParameters:
|
|
125 |
|
126 |
enable_stereo_processing: bool = False
|
127 |
transcription_method: str = "General Purpose"
|
|
|
128 |
|
129 |
# Basic Pitch Settings
|
130 |
onset_threshold: float = 0.5
|
@@ -206,6 +207,93 @@ class AppParameters:
|
|
206 |
# === Helper Functions ===
|
207 |
# =================================================================================================
|
208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
def format_params_for_metadata(params: AppParameters) -> str:
|
210 |
"""
|
211 |
Formats the AppParameters object into a human-readable string
|
@@ -1386,7 +1474,7 @@ def TranscribePianoAudio(input_file):
|
|
1386 |
# Return the path to the newly created MIDI file
|
1387 |
return out_mid_path
|
1388 |
|
1389 |
-
def TranscribeGeneralAudio(input_file,
|
1390 |
"""
|
1391 |
Transcribes a general audio file into a MIDI file using basic-pitch.
|
1392 |
This is suitable for various instruments and vocals.
|
@@ -1409,14 +1497,7 @@ def TranscribeGeneralAudio(input_file, onset_threshold, frame_threshold, minimum
|
|
1409 |
model_output, midi_data, note_events = basic_pitch.inference.predict(
|
1410 |
audio_path=input_file,
|
1411 |
model_or_model_path=ICASSP_2022_MODEL_PATH,
|
1412 |
-
|
1413 |
-
frame_threshold=frame_threshold,
|
1414 |
-
minimum_note_length=minimum_note_length,
|
1415 |
-
minimum_frequency=minimum_frequency,
|
1416 |
-
maximum_frequency=maximum_frequency,
|
1417 |
-
infer_onsets=infer_onsets,
|
1418 |
-
melodia_trick=melodia_trick,
|
1419 |
-
multiple_pitch_bends=multiple_bends
|
1420 |
)
|
1421 |
|
1422 |
# --- Save the MIDI file ---
|
@@ -1967,6 +2048,25 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
|
|
1967 |
# Load the audio stem to process it
|
1968 |
audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False)
|
1969 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1970 |
if params.enable_stereo_processing and audio_data.ndim == 2 and audio_data.shape[0] == 2:
|
1971 |
print("Stereo processing enabled for stem.")
|
1972 |
left_channel_np = audio_data[0]
|
@@ -1986,8 +2086,8 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
|
|
1986 |
|
1987 |
print("Transcribing left and right channel...")
|
1988 |
if params.transcription_method == "General Purpose":
|
1989 |
-
midi_path_left = TranscribeGeneralAudio(temp_left_path,
|
1990 |
-
midi_path_right = TranscribeGeneralAudio(temp_right_path,
|
1991 |
else: # Piano-Specific
|
1992 |
midi_path_left = TranscribePianoAudio(temp_left_path)
|
1993 |
midi_path_right = TranscribePianoAudio(temp_right_path)
|
@@ -2012,7 +2112,7 @@ def _transcribe_stem(audio_path: str, base_name: str, temp_dir: str, params: App
|
|
2012 |
sf.write(temp_mono_path, normalized_mono, native_sample_rate)
|
2013 |
|
2014 |
if params.transcription_method == "General Purpose":
|
2015 |
-
return TranscribeGeneralAudio(temp_mono_path,
|
2016 |
else:
|
2017 |
return TranscribePianoAudio(temp_mono_path)
|
2018 |
|
@@ -3186,20 +3286,52 @@ if __name__ == "__main__":
|
|
3186 |
with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
|
3187 |
# --- Preset dropdown for basic_pitch ---
|
3188 |
basic_pitch_preset_selector = gr.Dropdown(
|
3189 |
-
choices=["Custom"] + list(BASIC_PITCH_PRESETS.keys()),
|
3190 |
value="Default (Balanced)",
|
3191 |
label="Transcription Profile Preset",
|
3192 |
info="Select a profile to auto-fill settings for different instrument types. "
|
3193 |
"For reference only; it is recommended to test and adjust for optimal results.")
|
3194 |
# --- The existing basic_pitch components ---
|
3195 |
-
onset_threshold = gr.Slider(
|
3196 |
-
|
3197 |
-
|
3198 |
-
|
3199 |
-
|
3200 |
-
|
3201 |
-
|
3202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3203 |
|
3204 |
with gr.Column(scale=1):
|
3205 |
# --- Rendering Settings ---
|
|
|
91 |
|
92 |
|
93 |
# =================================================================================================
|
94 |
+
# === Central Parameter Object ===
|
95 |
# =================================================================================================
|
96 |
|
97 |
@dataclass
|
|
|
125 |
|
126 |
enable_stereo_processing: bool = False
|
127 |
transcription_method: str = "General Purpose"
|
128 |
+
basic_pitch_preset_selector: str = "Default (Balanced)"
|
129 |
|
130 |
# Basic Pitch Settings
|
131 |
onset_threshold: float = 0.5
|
|
|
207 |
# === Helper Functions ===
|
208 |
# =================================================================================================
|
209 |
|
210 |
+
def analyze_audio_for_adaptive_params(audio_data: np.ndarray, sample_rate: int):
|
211 |
+
"""
|
212 |
+
Analyzes raw audio data to dynamically determine optimal parameters for basic-pitch.
|
213 |
+
|
214 |
+
Args:
|
215 |
+
audio_data: The audio signal as a NumPy array (can be stereo).
|
216 |
+
sample_rate: The sample rate of the audio.
|
217 |
+
|
218 |
+
Returns:
|
219 |
+
A dictionary of recommended parameters for basic_pitch.
|
220 |
+
"""
|
221 |
+
print(" - Running adaptive analysis on audio to determine optimal transcription parameters...")
|
222 |
+
|
223 |
+
# Ensure audio is mono for most feature extractions
|
224 |
+
if audio_data.ndim > 1:
|
225 |
+
y_mono = librosa.to_mono(audio_data)
|
226 |
+
else:
|
227 |
+
y_mono = audio_data
|
228 |
+
|
229 |
+
params = {}
|
230 |
+
|
231 |
+
# 1. Tempo detection with enhanced stability
|
232 |
+
try:
|
233 |
+
tempo_info = librosa.beat.tempo(y=y_mono, sr=sample_rate, aggregate=np.median)
|
234 |
+
|
235 |
+
# Ensure BPM is a scalar float
|
236 |
+
bpm = float(np.median(tempo_info))
|
237 |
+
|
238 |
+
if bpm <= 0 or np.isnan(bpm):
|
239 |
+
raise ValueError("Invalid BPM detected")
|
240 |
+
|
241 |
+
# A 64th note is a reasonable shortest note length for most music
|
242 |
+
# Duration of a beat (quarter note) in seconds = 60 / BPM
|
243 |
+
# Duration of a 64th note = (60 / BPM) / 16
|
244 |
+
min_len_s = (60.0 / bpm) / 16.0
|
245 |
+
# basic-pitch expects milliseconds
|
246 |
+
params['minimum_note_length'] = max(20, int(min_len_s * 1000))
|
247 |
+
print(f" - Detected BPM (median): {bpm:.1f} -> minimum_note_length: {params['minimum_note_length']}ms")
|
248 |
+
except Exception as e:
|
249 |
+
print(f" - BPM detection failed, using default minimum_note_length. Error: {e}")
|
250 |
+
|
251 |
+
# 2. Spectral analysis: centroid + rolloff for richer info
|
252 |
+
try:
|
253 |
+
spectral_centroid = librosa.feature.spectral_centroid(y=y_mono, sr=sample_rate)[0]
|
254 |
+
rolloff = librosa.feature.spectral_rolloff(y=y_mono, sr=sample_rate)[0]
|
255 |
+
avg_centroid = np.mean(spectral_centroid)
|
256 |
+
avg_rolloff = np.mean(rolloff)
|
257 |
+
print(f" - Spectral centroid: {avg_centroid:.1f} Hz, rolloff (85%): {avg_rolloff:.1f} Hz")
|
258 |
+
# Simple logic: if the 'center of mass' of the spectrum is low, it's bass-heavy.
|
259 |
+
# If it's high, it contains high-frequency content.
|
260 |
+
if avg_centroid < 500 and avg_rolloff < 1500:
|
261 |
+
params['minimum_frequency'] = 30
|
262 |
+
params['maximum_frequency'] = 1200
|
263 |
+
elif avg_centroid > 2000 or avg_rolloff > 5000: # Likely bright, high-frequency content (cymbals, flutes)
|
264 |
+
params['minimum_frequency'] = 100
|
265 |
+
params['maximum_frequency'] = 8000
|
266 |
+
else:
|
267 |
+
params['minimum_frequency'] = 50
|
268 |
+
params['maximum_frequency'] = 4000
|
269 |
+
except Exception as e:
|
270 |
+
print(f" - Spectral analysis failed, using default frequencies. Error: {e}")
|
271 |
+
|
272 |
+
# 3. Onset threshold based on percussiveness
|
273 |
+
try:
|
274 |
+
y_harmonic, y_percussive = librosa.effects.hpss(y_mono)
|
275 |
+
percussive_ratio = np.sum(y_percussive**2) / (np.sum(y_harmonic**2) + 1e-10)
|
276 |
+
# If the percussive energy is high, we need a higher onset threshold to be stricter
|
277 |
+
params['onset_threshold'] = 0.6 if percussive_ratio > 0.5 else 0.45
|
278 |
+
print(f" - Percussive ratio: {percussive_ratio:.2f} -> onset_threshold: {params['onset_threshold']}")
|
279 |
+
except Exception as e:
|
280 |
+
print(f" - Percussiveness analysis failed, using default onset_threshold. Error: {e}")
|
281 |
+
|
282 |
+
# 4. Frame threshold from RMS
|
283 |
+
try:
|
284 |
+
rms = librosa.feature.rms(y=y_mono)[0]
|
285 |
+
# Use the 10th percentile of energy as a proxy for the noise floor
|
286 |
+
noise_floor_rms = np.percentile(rms, 10)
|
287 |
+
# Set the frame_threshold to be slightly above this noise floor
|
288 |
+
# The scaling factor here is empirical and can be tuned
|
289 |
+
params['frame_threshold'] = max(0.05, min(0.4, noise_floor_rms * 4))
|
290 |
+
print(f" - Noise floor RMS: {noise_floor_rms:.5f} -> frame_threshold: {params['frame_threshold']:.2f}")
|
291 |
+
except Exception as e:
|
292 |
+
print(f" - RMS analysis failed, using default frame_threshold. Error: {e}")
|
293 |
+
|
294 |
+
return params
|
295 |
+
|
296 |
+
|
297 |
def format_params_for_metadata(params: AppParameters) -> str:
|
298 |
"""
|
299 |
Formats the AppParameters object into a human-readable string
|
|
|
1474 |
# Return the path to the newly created MIDI file
|
1475 |
return out_mid_path
|
1476 |
|
1477 |
+
def TranscribeGeneralAudio(input_file, **kwargs):
|
1478 |
"""
|
1479 |
Transcribes a general audio file into a MIDI file using basic-pitch.
|
1480 |
This is suitable for various instruments and vocals.
|
|
|
1497 |
model_output, midi_data, note_events = basic_pitch.inference.predict(
|
1498 |
audio_path=input_file,
|
1499 |
model_or_model_path=ICASSP_2022_MODEL_PATH,
|
1500 |
+
**kwargs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1501 |
)
|
1502 |
|
1503 |
# --- Save the MIDI file ---
|
|
|
2048 |
# Load the audio stem to process it
|
2049 |
audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False)
|
2050 |
|
2051 |
+
# --- Adaptive Parameter Logic ---
|
2052 |
+
final_bp_params = {
|
2053 |
+
"onset_threshold": params.onset_threshold,
|
2054 |
+
"frame_threshold": params.frame_threshold,
|
2055 |
+
"minimum_note_length": params.minimum_note_length,
|
2056 |
+
"minimum_frequency": params.minimum_frequency,
|
2057 |
+
"maximum_frequency": params.maximum_frequency,
|
2058 |
+
"infer_onsets": params.infer_onsets,
|
2059 |
+
"melodia_trick": params.melodia_trick,
|
2060 |
+
"multiple_pitch_bends": params.multiple_pitch_bends,
|
2061 |
+
}
|
2062 |
+
|
2063 |
+
# Check if the user has selected the auto-analysis option from the dropdown.
|
2064 |
+
if params.transcription_method == "General Purpose" and params.basic_pitch_preset_selector == "Auto-Analyze Audio":
|
2065 |
+
adaptive_params = analyze_audio_for_adaptive_params(audio_data, native_sample_rate)
|
2066 |
+
# Update the final_bp_params dictionary with the new adaptive values
|
2067 |
+
final_bp_params.update(adaptive_params)
|
2068 |
+
print(f" - Overriding manual settings with auto-analyzed parameters. final_bp_params: {final_bp_params}")
|
2069 |
+
|
2070 |
if params.enable_stereo_processing and audio_data.ndim == 2 and audio_data.shape[0] == 2:
|
2071 |
print("Stereo processing enabled for stem.")
|
2072 |
left_channel_np = audio_data[0]
|
|
|
2086 |
|
2087 |
print("Transcribing left and right channel...")
|
2088 |
if params.transcription_method == "General Purpose":
|
2089 |
+
midi_path_left = TranscribeGeneralAudio(temp_left_path, **final_bp_params)
|
2090 |
+
midi_path_right = TranscribeGeneralAudio(temp_right_path, **final_bp_params)
|
2091 |
else: # Piano-Specific
|
2092 |
midi_path_left = TranscribePianoAudio(temp_left_path)
|
2093 |
midi_path_right = TranscribePianoAudio(temp_right_path)
|
|
|
2112 |
sf.write(temp_mono_path, normalized_mono, native_sample_rate)
|
2113 |
|
2114 |
if params.transcription_method == "General Purpose":
|
2115 |
+
return TranscribeGeneralAudio(temp_mono_path, **final_bp_params)
|
2116 |
else:
|
2117 |
return TranscribePianoAudio(temp_mono_path)
|
2118 |
|
|
|
3286 |
with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
|
3287 |
# --- Preset dropdown for basic_pitch ---
|
3288 |
basic_pitch_preset_selector = gr.Dropdown(
|
3289 |
+
choices=["Auto-Analyze Audio", "Custom"] + list(BASIC_PITCH_PRESETS.keys()),
|
3290 |
value="Default (Balanced)",
|
3291 |
label="Transcription Profile Preset",
|
3292 |
info="Select a profile to auto-fill settings for different instrument types. "
|
3293 |
"For reference only; it is recommended to test and adjust for optimal results.")
|
3294 |
# --- The existing basic_pitch components ---
|
3295 |
+
onset_threshold = gr.Slider(
|
3296 |
+
0.0, 1.0, value=0.5, step=0.05,
|
3297 |
+
label="Onset Threshold",
|
3298 |
+
info="Sensitivity for detecting the start of a new note. Lower values will detect more notes (even faint ones), but may create false positives. Higher values are stricter and cleaner, but might miss subtle notes."
|
3299 |
+
)
|
3300 |
+
frame_threshold = gr.Slider(
|
3301 |
+
0.0, 1.0, value=0.3, step=0.05,
|
3302 |
+
label="Frame Threshold",
|
3303 |
+
info="Sensitivity for determining if a note is 'on' or 'off'. Lower values will sustain notes longer, but can merge distinct notes. Higher values create shorter, more separated notes, but might cut off tails."
|
3304 |
+
)
|
3305 |
+
minimum_note_length = gr.Slider(
|
3306 |
+
10, 500, value=128, step=1,
|
3307 |
+
label="Minimum Note Length (ms)",
|
3308 |
+
info="Filters out notes shorter than this duration. Increase this to remove fast, noisy artifacts or clicks. Decrease it if the transcription is missing very short, staccato notes."
|
3309 |
+
)
|
3310 |
+
minimum_frequency = gr.Slider(
|
3311 |
+
0, 500, value=60, step=5,
|
3312 |
+
label="Minimum Frequency (Hz)",
|
3313 |
+
info="Ignores any detected pitches below this frequency. Increase this to filter out low-frequency noise like rumble or hum. Set it just below your target instrument's lowest note (e.g., ~80Hz for guitar)."
|
3314 |
+
)
|
3315 |
+
maximum_frequency = gr.Slider(
|
3316 |
+
501, 10000, value=4000, step=10,
|
3317 |
+
label="Maximum Frequency (Hz)",
|
3318 |
+
info="Ignores any detected pitches above this frequency. Decrease this to filter out high-frequency noise like hiss or cymbals. Set it just above your target instrument's highest note (e.g., ~1200Hz for vocals)."
|
3319 |
+
)
|
3320 |
+
infer_onsets = gr.Checkbox(
|
3321 |
+
value=True,
|
3322 |
+
label="Infer Onsets (Boost Onsets)",
|
3323 |
+
info="When enabled, the model actively looks for and emphasizes the start of each note (the 'attack'). Recommended for percussive or clear, rhythmic music. Disable for very smooth, legato music like vocal pads."
|
3324 |
+
)
|
3325 |
+
melodia_trick = gr.Checkbox(
|
3326 |
+
value=True,
|
3327 |
+
label="Melodia Trick (Contour Optimization)",
|
3328 |
+
info="When enabled, uses a secondary melody-detection algorithm to refine the main pitch contour. Highly recommended for most melodic content. Disable if you are transcribing non-melodic noise or complex polyphony."
|
3329 |
+
)
|
3330 |
+
multiple_pitch_bends = gr.Checkbox(
|
3331 |
+
value=False,
|
3332 |
+
label="Allow Multiple Pitch Bends",
|
3333 |
+
info="When enabled, allows a single note to have multiple, continuous pitch bends within it. Essential for transcribing vocals, slides, or vibrato-heavy instruments. Disable for clean, discrete notes like a standard piano."
|
3334 |
+
)
|
3335 |
|
3336 |
with gr.Column(scale=1):
|
3337 |
# --- Rendering Settings ---
|