Add experimental vocal separation and selective transcription features
Browse files
- Added **Separate Vocals** option (experimental): Splits input audio into vocals and music stems before processing.
- Added **Transcription Target** setting: Allows choosing which stem (vocals or music) to transcribe to MIDI.
- Added option to **Re-merge Other Part with Rendered Audio**: After rendering, merges the non-transcribed stem (e.g., original vocals) back with the new music.
- app.py +364 -160
- requirements.txt +6 -2
app.py
CHANGED
@@ -50,8 +50,13 @@ import soundfile as sf
|
|
50 |
import torch
|
51 |
import gradio as gr
|
52 |
|
53 |
-
|
|
|
|
|
|
|
|
|
54 |
|
|
|
55 |
from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate
|
56 |
|
57 |
# --- Import core transcription and MIDI processing libraries ---
|
@@ -1042,6 +1047,9 @@ def recommend_8bit_params(midi_data, default_preset):
|
|
1042 |
def process_and_render_file(input_file,
|
1043 |
# --- Pass the preset selector value ---
|
1044 |
s8bit_preset_selector,
|
|
|
|
|
|
|
1045 |
# --- Transcription params ---
|
1046 |
enable_stereo_processing,
|
1047 |
transcription_method,
|
@@ -1071,14 +1079,9 @@ def process_and_render_file(input_file,
|
|
1071 |
filename = os.path.basename(input_file_path)
|
1072 |
print(f"Processing new file: {filename}")
|
1073 |
|
1074 |
-
|
1075 |
-
|
1076 |
-
|
1077 |
-
except Exception as e:
|
1078 |
-
# If loading fails, it might be a MIDI file, which librosa cannot handle.
|
1079 |
-
# We will proceed, assuming it's a MIDI, and let pretty_midi handle it later.
|
1080 |
-
print(f"Could not load as audio: {e}. Assuming it is a MIDI file.")
|
1081 |
-
pass
|
1082 |
|
1083 |
# --- Step 1: Check file type and transcribe if necessary ---
|
1084 |
if filename.lower().endswith(('.mid', '.midi', '.kar')):
|
@@ -1086,42 +1089,95 @@ def process_and_render_file(input_file,
|
|
1086 |
midi_path_for_rendering = input_file_path
|
1087 |
else: #if filename.lower().endswith(('.wav', '.mp3'))
|
1088 |
print("Audio file detected. Starting transcription...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1089 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1090 |
base_name = os.path.splitext(filename)[0]
|
1091 |
-
temp_dir = "output/
|
1092 |
os.makedirs(temp_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1093 |
|
1094 |
# === STEREO PROCESSING LOGIC ===
|
1095 |
if enable_stereo_processing:
|
1096 |
-
if
|
1097 |
print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
|
1098 |
enable_stereo_processing = False # Disable stereo processing if audio is not stereo
|
1099 |
|
1100 |
if enable_stereo_processing:
|
1101 |
-
print("Stereo processing enabled. Splitting channels...")
|
1102 |
try:
|
1103 |
-
|
1104 |
-
|
1105 |
|
1106 |
-
normalized_left = normalize_loudness(
|
1107 |
-
normalized_right = normalize_loudness(
|
1108 |
|
1109 |
-
|
1110 |
-
|
1111 |
|
1112 |
-
sf.write(
|
1113 |
-
sf.write(
|
1114 |
|
1115 |
-
print(f"Saved left channel to: {
|
1116 |
-
print(f"Saved right channel to: {
|
1117 |
|
1118 |
print("Transcribing left and right channel...")
|
1119 |
if transcription_method == "General Purpose":
|
1120 |
-
midi_path_left = TranscribeGeneralAudio(
|
1121 |
-
midi_path_right = TranscribeGeneralAudio(
|
1122 |
-
else:
|
1123 |
-
midi_path_left = TranscribePianoAudio(
|
1124 |
-
midi_path_right = TranscribePianoAudio(
|
1125 |
|
1126 |
if midi_path_left and midi_path_right:
|
1127 |
merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
|
@@ -1139,24 +1195,22 @@ def process_and_render_file(input_file,
|
|
1139 |
print(f"An error occurred during stereo processing: {e}")
|
1140 |
raise gr.Error(f"Stereo Processing Failed: {e}")
|
1141 |
else: # Standard mono transcription
|
1142 |
-
print("
|
1143 |
-
|
1144 |
-
|
1145 |
-
|
1146 |
-
|
1147 |
-
|
1148 |
-
|
1149 |
-
normalized_mono = normalize_loudness(mono_signal, native_sample_rate)
|
1150 |
|
1151 |
-
|
1152 |
-
|
1153 |
-
|
1154 |
|
1155 |
try:
|
1156 |
if transcription_method == "General Purpose":
|
1157 |
-
midi_path_for_rendering = TranscribeGeneralAudio(
|
1158 |
else: # Piano-Specific
|
1159 |
-
midi_path_for_rendering = TranscribePianoAudio(
|
1160 |
except Exception as e:
|
1161 |
print(f"An error occurred during transcription: {e}")
|
1162 |
raise gr.Error(f"Transcription Failed: {e}")
|
@@ -1216,7 +1270,43 @@ def process_and_render_file(input_file,
|
|
1216 |
synth_params['fm_modulation_depth'],
|
1217 |
synth_params['fm_modulation_rate']
|
1218 |
)
|
1219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1220 |
print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
|
1221 |
print('*' * 70)
|
1222 |
|
@@ -1308,7 +1398,18 @@ if __name__ == "__main__":
|
|
1308 |
if not soundfonts_dict:
|
1309 |
print("\nWARNING: No SoundFonts were found or could be downloaded.")
|
1310 |
print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")
|
1311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1312 |
# --- Define a constant for the fallback preset name ---
|
1313 |
# This prevents errors if the preset name is changed in the dictionary.
|
1314 |
FALLBACK_PRESET_NAME = "Generic Chiptune Loop"
|
@@ -1318,43 +1419,7 @@ if __name__ == "__main__":
|
|
1318 |
# Comprehensive preset dictionary including new JRPG and Handheld classics
|
1319 |
# Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider.
|
1320 |
S8BIT_PRESETS = {
|
1321 |
-
# ---
|
1322 |
-
"Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": {
|
1323 |
-
# Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games.
|
1324 |
-
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18,
|
1325 |
-
'vibrato_rate': 4.5, 'vibrato_depth': 4,
|
1326 |
-
'smooth_notes_level': 0.9, # Formerly True -> 1.0; slightly reduced for a bit more attack.
|
1327 |
-
'continuous_vibrato_level': 0.8, # Formerly True -> 1.0; slightly weakened for more defined note transitions.
|
1328 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1329 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1330 |
-
},
|
1331 |
-
"Arcade Brawler Lead (Street Fighter / ストリートファイター)": {
|
1332 |
-
# Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games.
|
1333 |
-
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
1334 |
-
'vibrato_rate': 5.0, 'vibrato_depth': 6,
|
1335 |
-
'smooth_notes_level': 0.8,
|
1336 |
-
'continuous_vibrato_level': 0.7,
|
1337 |
-
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1,
|
1338 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1339 |
-
},
|
1340 |
-
"Mega Man (Rockman / ロックマン)": {
|
1341 |
-
# Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies.
|
1342 |
-
'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
1343 |
-
'vibrato_rate': 6.0, 'vibrato_depth': 8,
|
1344 |
-
'smooth_notes_level': 0.9,
|
1345 |
-
'continuous_vibrato_level': 0.85,
|
1346 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05,
|
1347 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1348 |
-
},
|
1349 |
-
"Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": {
|
1350 |
-
# Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound.
|
1351 |
-
'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
|
1352 |
-
'vibrato_rate': 6.0, 'vibrato_depth': 4,
|
1353 |
-
'smooth_notes_level': 0.85,
|
1354 |
-
'continuous_vibrato_level': 0.3, # Formerly False (0.0); adds a hint of continuity for more liveliness.
|
1355 |
-
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1356 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1357 |
-
},
|
1358 |
"Mario (Super Mario Bros / スーパーマリオブラザーズ)": {
|
1359 |
# Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound.
|
1360 |
'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25,
|
@@ -1364,41 +1429,13 @@ if __name__ == "__main__":
|
|
1364 |
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1365 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1366 |
},
|
1367 |
-
|
1368 |
-
|
1369 |
-
|
1370 |
-
'
|
1371 |
-
'vibrato_rate': 3.5, 'vibrato_depth': 5,
|
1372 |
-
'smooth_notes_level': 0.95,
|
1373 |
-
'continuous_vibrato_level': 0.9,
|
1374 |
-
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
|
1375 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1376 |
-
},
|
1377 |
-
"Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": {
|
1378 |
-
# Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder.
|
1379 |
-
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
|
1380 |
-
'vibrato_rate': 2.5, 'vibrato_depth': 4,
|
1381 |
-
'smooth_notes_level': 1.0,
|
1382 |
-
'continuous_vibrato_level': 0.95,
|
1383 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1384 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1385 |
-
},
|
1386 |
-
"Dragon Quest (ドラゴンクエスト)": {
|
1387 |
-
# Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section.
|
1388 |
-
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6,
|
1389 |
-
'vibrato_rate': 3.0, 'vibrato_depth': 4,
|
1390 |
-
'smooth_notes_level': 0.9,
|
1391 |
-
'continuous_vibrato_level': 0.9,
|
1392 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1393 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1394 |
-
},
|
1395 |
-
"ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": {
|
1396 |
-
# Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore.
|
1397 |
-
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
1398 |
-
'vibrato_rate': 3.5, 'vibrato_depth': 3,
|
1399 |
'smooth_notes_level': 0.9,
|
1400 |
'continuous_vibrato_level': 0.85,
|
1401 |
-
'bass_boost_level': 0.
|
1402 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1403 |
},
|
1404 |
"Zelda (The Legend of Zelda / ゼルダの伝説)": {
|
@@ -1410,23 +1447,22 @@ if __name__ == "__main__":
|
|
1410 |
'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1411 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1412 |
},
|
1413 |
-
|
1414 |
-
|
1415 |
-
|
1416 |
-
'
|
1417 |
-
'vibrato_rate': 5.5, 'vibrato_depth': 6,
|
1418 |
'smooth_notes_level': 0.85,
|
1419 |
-
'continuous_vibrato_level': 0.
|
1420 |
-
'bass_boost_level': 0.
|
1421 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1422 |
},
|
1423 |
-
"
|
1424 |
-
# Description: A
|
1425 |
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
|
1426 |
-
'vibrato_rate': 5.0, 'vibrato_depth':
|
1427 |
'smooth_notes_level': 0.9,
|
1428 |
-
'continuous_vibrato_level': 0.
|
1429 |
-
'bass_boost_level': 0.
|
1430 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1431 |
},
|
1432 |
"Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": {
|
@@ -1438,13 +1474,22 @@ if __name__ == "__main__":
|
|
1438 |
'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1439 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1440 |
},
|
1441 |
-
"
|
1442 |
-
# Description: A
|
1443 |
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
|
1444 |
-
'vibrato_rate': 5.0, 'vibrato_depth':
|
1445 |
'smooth_notes_level': 0.9,
|
1446 |
-
'continuous_vibrato_level': 0.
|
1447 |
-
'bass_boost_level': 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1448 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1449 |
},
|
1450 |
# --- Advanced System Impressions ---
|
@@ -1484,7 +1529,155 @@ if __name__ == "__main__":
|
|
1484 |
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1485 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1486 |
},
|
1487 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1488 |
"Sci-Fi Energy Field": {
|
1489 |
# Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields.
|
1490 |
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
@@ -1530,7 +1723,7 @@ if __name__ == "__main__":
|
|
1530 |
'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5,
|
1531 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1532 |
},
|
1533 |
-
# --- Utility ---
|
1534 |
"Generic Chiptune Loop": {
|
1535 |
# Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds.
|
1536 |
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
|
@@ -1540,35 +1733,14 @@ if __name__ == "__main__":
|
|
1540 |
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1541 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1542 |
},
|
1543 |
-
"Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": {
|
1544 |
-
# Description: An aggressive sawtooth, inspired by the dark, rock-infused themes of SMT.
|
1545 |
-
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35,
|
1546 |
-
'vibrato_rate': 7.0, 'vibrato_depth': 12,
|
1547 |
-
'smooth_notes_level': 0.1,
|
1548 |
-
'continuous_vibrato_level': 0.0,
|
1549 |
-
'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25,
|
1550 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1551 |
-
},
|
1552 |
-
"Modern JRPG Pad (Persona / ペルソナ)": {
|
1553 |
-
# Description: A warm, stylish square wave pad, capturing the modern, pop/jazz-infused feel of the Persona series.
|
1554 |
-
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
|
1555 |
-
'vibrato_rate': 2.5, 'vibrato_depth': 4,
|
1556 |
-
'smooth_notes_level': 1.0,
|
1557 |
-
'continuous_vibrato_level': 0.95,
|
1558 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1559 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1560 |
-
},
|
1561 |
-
"Tactical Brass (Fire Emblem / ファイアーエムブレム)": {
|
1562 |
-
# Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of Fire Emblem's tactical themes.
|
1563 |
-
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
1564 |
-
'vibrato_rate': 3.5, 'vibrato_depth': 5,
|
1565 |
-
'smooth_notes_level': 0.95,
|
1566 |
-
'continuous_vibrato_level': 0.9,
|
1567 |
-
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
|
1568 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1569 |
-
}
|
1570 |
}
|
1571 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1572 |
app = gr.Blocks(theme=gr.themes.Base())
|
1573 |
|
1574 |
with app:
|
@@ -1611,6 +1783,27 @@ if __name__ == "__main__":
|
|
1611 |
info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
|
1612 |
)
|
1613 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1614 |
with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
|
1615 |
onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
|
1616 |
frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
|
@@ -1727,7 +1920,7 @@ if __name__ == "__main__":
|
|
1727 |
s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
|
1728 |
s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
|
1729 |
s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
|
1730 |
-
s8bit_decay_time_s = gr.Slider(0.01, 0
|
1731 |
s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
|
1732 |
s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
|
1733 |
s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
|
@@ -1774,7 +1967,11 @@ if __name__ == "__main__":
|
|
1774 |
# all_inputs now includes the preset selector itself
|
1775 |
# Inputs for the main processing function
|
1776 |
all_inputs = [
|
1777 |
-
input_file, s8bit_preset_selector,
|
|
|
|
|
|
|
|
|
1778 |
transcription_method, onset_threshold, frame_threshold, minimum_note_length,
|
1779 |
minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
|
1780 |
render_type, soundfont_bank, render_sample_rate, render_with_sustains,
|
@@ -1810,6 +2007,13 @@ if __name__ == "__main__":
|
|
1810 |
inputs=all_inputs,
|
1811 |
outputs=all_outputs # Pass the combined list
|
1812 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1813 |
|
1814 |
# --- Listeners for dynamic UI updates ---
|
1815 |
transcription_method.change(
|
|
|
50 |
import torch
|
51 |
import gradio as gr
|
52 |
|
53 |
+
# --- Imports for Vocal Separation ---
|
54 |
+
import torchaudio
|
55 |
+
from demucs.apply import apply_model
|
56 |
+
from demucs.pretrained import get_model
|
57 |
+
from demucs.audio import convert_audio
|
58 |
|
59 |
+
from src.piano_transcription.utils import initialize_app
|
60 |
from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate
|
61 |
|
62 |
# --- Import core transcription and MIDI processing libraries ---
|
|
|
1047 |
def process_and_render_file(input_file,
|
1048 |
# --- Pass the preset selector value ---
|
1049 |
s8bit_preset_selector,
|
1050 |
+
separate_vocals,
|
1051 |
+
remerge_vocals,
|
1052 |
+
transcription_target,
|
1053 |
# --- Transcription params ---
|
1054 |
enable_stereo_processing,
|
1055 |
transcription_method,
|
|
|
1079 |
filename = os.path.basename(input_file_path)
|
1080 |
print(f"Processing new file: {filename}")
|
1081 |
|
1082 |
+
# This will store the other part if separation is performed
|
1083 |
+
other_part_tensor = None
|
1084 |
+
other_part_sr = None
|
|
|
|
|
|
|
|
|
|
|
1085 |
|
1086 |
# --- Step 1: Check file type and transcribe if necessary ---
|
1087 |
if filename.lower().endswith(('.mid', '.midi', '.kar')):
|
|
|
1089 |
midi_path_for_rendering = input_file_path
|
1090 |
else: #if filename.lower().endswith(('.wav', '.mp3'))
|
1091 |
print("Audio file detected. Starting transcription...")
|
1092 |
+
|
1093 |
+
try:
|
1094 |
+
# Use torchaudio to load directly into a tensor, as demucs needs it.
|
1095 |
+
# This is more efficient than loading with librosa then converting.
|
1096 |
+
audio_tensor, native_sample_rate = torchaudio.load(input_file_path)
|
1097 |
+
except Exception as e:
|
1098 |
+
raise gr.Error(f"Failed to load audio file: {e}")
|
1099 |
|
1100 |
+
# --- Demucs Vocal Separation Logic, now decides which stem to process ---
|
1101 |
+
if separate_vocals:
|
1102 |
+
if demucs_model is None:
|
1103 |
+
raise gr.Error("Demucs model is not loaded. Cannot separate vocals.")
|
1104 |
+
|
1105 |
+
# Convert to a common format (stereo, float32) that demucs expects
|
1106 |
+
audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
|
1107 |
+
|
1108 |
+
if torch.cuda.is_available():
|
1109 |
+
audio_tensor = audio_tensor.cuda()
|
1110 |
+
|
1111 |
+
print("Separating audio with Demucs... This may take some time.")
|
1112 |
+
all_stems = apply_model(demucs_model, audio_tensor[None], device='cuda' if torch.cuda.is_available() else 'cpu', progress=True)[0]
|
1113 |
+
|
1114 |
+
vocals_idx = demucs_model.sources.index('vocals')
|
1115 |
+
# Sum all stems that are NOT vocals to get the accompaniment
|
1116 |
+
accompaniment_indices = [i for i, source in enumerate(demucs_model.sources) if source != 'vocals']
|
1117 |
+
|
1118 |
+
vocals_tensor = all_stems[vocals_idx]
|
1119 |
+
accompaniment_tensor = all_stems[accompaniment_indices].sum(0)
|
1120 |
+
|
1121 |
+
# --- The new core branching logic ---
|
1122 |
+
if transcription_target == "Transcribe Vocals":
|
1123 |
+
print("Target: Transcribing VOCALS.")
|
1124 |
+
tensor_to_process = vocals_tensor
|
1125 |
+
other_part_tensor = accompaniment_tensor # Save accompaniment for re-merging
|
1126 |
+
else: # Default to "Transcribe Music (Accompaniment)"
|
1127 |
+
print("Target: Transcribing MUSIC (ACCOMPANIMENT).")
|
1128 |
+
tensor_to_process = accompaniment_tensor
|
1129 |
+
other_part_tensor = vocals_tensor # Save vocals for re-merging
|
1130 |
+
|
1131 |
+
other_part_sr = demucs_model.samplerate
|
1132 |
+
audio_tensor = tensor_to_process # The audio to be processed is now the chosen stem
|
1133 |
+
native_sample_rate = demucs_model.samplerate # Update sample rate to match demucs output
|
1134 |
+
print("Separation complete.")
|
1135 |
+
|
1136 |
+
# --- Prepare audio for transcription (saving to a temp file) ---
|
1137 |
+
# This part of the logic now works on whichever stem was selected above
|
1138 |
base_name = os.path.splitext(filename)[0]
|
1139 |
+
temp_dir = "output/temp_transcribe"
|
1140 |
os.makedirs(temp_dir, exist_ok=True)
|
1141 |
+
suffix = f"_{transcription_target.split(' ')[1].lower()}" if separate_vocals else "_original"
|
1142 |
+
audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}{suffix}.wav")
|
1143 |
+
|
1144 |
+
torchaudio.save(audio_to_transcribe_path, audio_tensor.cpu(), native_sample_rate)
|
1145 |
+
|
1146 |
+
# Convert tensor to numpy array (channels, samples) for librosa/pyloudnorm compatibility
|
1147 |
+
# We work with a CPU copy of the tensor.
|
1148 |
+
audio_data_np = audio_tensor.cpu().numpy()
|
1149 |
|
1150 |
# === STEREO PROCESSING LOGIC ===
|
1151 |
if enable_stereo_processing:
|
1152 |
+
if audio_data_np.ndim != 2 or audio_data_np.shape[0] != 2:
|
1153 |
print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
|
1154 |
enable_stereo_processing = False # Disable stereo processing if audio is not stereo
|
1155 |
|
1156 |
if enable_stereo_processing:
|
1157 |
+
print("Stereo processing enabled. Splitting, normalizing, and transcribing channels...")
|
1158 |
try:
|
1159 |
+
left_channel_np = audio_data_np[0]
|
1160 |
+
right_channel_np = audio_data_np[1]
|
1161 |
|
1162 |
+
normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
|
1163 |
+
normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
|
1164 |
|
1165 |
+
temp_left_path = os.path.join(temp_dir, f"{base_name}_left.wav")
|
1166 |
+
temp_right_path = os.path.join(temp_dir, f"{base_name}_right.wav")
|
1167 |
|
1168 |
+
sf.write(temp_left_path, normalized_left, native_sample_rate)
|
1169 |
+
sf.write(temp_right_path, normalized_right, native_sample_rate)
|
1170 |
|
1171 |
+
print(f"Saved left channel to: {temp_left_path}")
|
1172 |
+
print(f"Saved right channel to: {temp_right_path}")
|
1173 |
|
1174 |
print("Transcribing left and right channel...")
|
1175 |
if transcription_method == "General Purpose":
|
1176 |
+
midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
|
1177 |
+
midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
|
1178 |
+
else: # Piano-Specific
|
1179 |
+
midi_path_left = TranscribePianoAudio(temp_left_path)
|
1180 |
+
midi_path_right = TranscribePianoAudio(temp_right_path)
|
1181 |
|
1182 |
if midi_path_left and midi_path_right:
|
1183 |
merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
|
|
|
1195 |
print(f"An error occurred during stereo processing: {e}")
|
1196 |
raise gr.Error(f"Stereo Processing Failed: {e}")
|
1197 |
else: # Standard mono transcription
|
1198 |
+
print("Mono processing. Normalizing and transcribing audio...")
|
1199 |
+
# If the audio is stereo but stereo processing is disabled, convert to mono.
|
1200 |
+
if audio_data_np.shape[0] == 2:
|
1201 |
+
mono_signal_np = np.mean(audio_data_np, axis=0)
|
1202 |
+
else:
|
1203 |
+
mono_signal_np = audio_data_np[0]
|
|
|
|
|
1204 |
|
1205 |
+
normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
|
1206 |
+
temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
|
1207 |
+
sf.write(temp_mono_path, normalized_mono, native_sample_rate)
|
1208 |
|
1209 |
try:
|
1210 |
if transcription_method == "General Purpose":
|
1211 |
+
midi_path_for_rendering = TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
|
1212 |
else: # Piano-Specific
|
1213 |
+
midi_path_for_rendering = TranscribePianoAudio(temp_mono_path)
|
1214 |
except Exception as e:
|
1215 |
print(f"An error occurred during transcription: {e}")
|
1216 |
raise gr.Error(f"Transcription Failed: {e}")
|
|
|
1270 |
synth_params['fm_modulation_depth'],
|
1271 |
synth_params['fm_modulation_rate']
|
1272 |
)
|
1273 |
+
|
1274 |
+
# --- Vocal Re-merging Logic now uses the generic "other_part" ---
|
1275 |
+
if separate_vocals and remerge_vocals and other_part_tensor is not None:
|
1276 |
+
print(f"Re-merging the non-transcribed part with newly rendered music...")
|
1277 |
+
|
1278 |
+
rendered_srate, rendered_music_int16 = results[4]
|
1279 |
+
|
1280 |
+
rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0
|
1281 |
+
rendered_music_tensor = torch.from_numpy(rendered_music_float).T
|
1282 |
+
|
1283 |
+
if rendered_srate != other_part_sr:
|
1284 |
+
resampler = torchaudio.transforms.Resample(rendered_srate, other_part_sr)
|
1285 |
+
rendered_music_tensor = resampler(rendered_music_tensor)
|
1286 |
+
|
1287 |
+
len_music = rendered_music_tensor.shape[1]
|
1288 |
+
len_other = other_part_tensor.shape[1]
|
1289 |
+
|
1290 |
+
if len_music > len_other:
|
1291 |
+
padding = len_music - len_other
|
1292 |
+
other_part_tensor = torch.nn.functional.pad(other_part_tensor, (0, padding))
|
1293 |
+
elif len_other > len_music:
|
1294 |
+
padding = len_other - len_music
|
1295 |
+
rendered_music_tensor = torch.nn.functional.pad(rendered_music_tensor, (0, padding))
|
1296 |
+
|
1297 |
+
merged_audio_tensor = rendered_music_tensor + other_part_tensor.cpu()
|
1298 |
+
|
1299 |
+
max_abs = torch.max(torch.abs(merged_audio_tensor))
|
1300 |
+
if max_abs > 1.0:
|
1301 |
+
merged_audio_tensor /= max_abs
|
1302 |
+
|
1303 |
+
merged_audio_int16 = (merged_audio_tensor.T.numpy() * 32767).astype(np.int16)
|
1304 |
+
|
1305 |
+
new_results = list(results)
|
1306 |
+
new_results[4] = (other_part_sr, merged_audio_int16)
|
1307 |
+
results = tuple(new_results)
|
1308 |
+
print("Re-merging complete.")
|
1309 |
+
|
1310 |
print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
|
1311 |
print('*' * 70)
|
1312 |
|
|
|
1398 |
if not soundfonts_dict:
|
1399 |
print("\nWARNING: No SoundFonts were found or could be downloaded.")
|
1400 |
print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")
|
1401 |
+
|
1402 |
+
# --- Pre-load the Demucs model on startup for efficiency ---
|
1403 |
+
print("Loading Demucs model (htdemucs_ft), this may take a moment on first run...")
|
1404 |
+
try:
|
1405 |
+
demucs_model = get_model(name='htdemucs_ft')
|
1406 |
+
if torch.cuda.is_available():
|
1407 |
+
demucs_model = demucs_model.cuda()
|
1408 |
+
print("Demucs model loaded successfully.")
|
1409 |
+
except Exception as e:
|
1410 |
+
print(f"Warning: Could not load Demucs model. Vocal separation will not be available. Error: {e}")
|
1411 |
+
demucs_model = None
|
1412 |
+
|
1413 |
# --- Define a constant for the fallback preset name ---
|
1414 |
# This prevents errors if the preset name is changed in the dictionary.
|
1415 |
FALLBACK_PRESET_NAME = "Generic Chiptune Loop"
|
|
|
1419 |
# Comprehensive preset dictionary including new JRPG and Handheld classics
|
1420 |
# Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider.
|
1421 |
S8BIT_PRESETS = {
|
1422 |
+
# --- Classic Chiptune ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1423 |
"Mario (Super Mario Bros / スーパーマリオブラザーズ)": {
|
1424 |
# Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound.
|
1425 |
'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25,
|
|
|
1429 |
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1430 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1431 |
},
|
1432 |
+
"Mega Man (Rockman / ロックマン)": {
|
1433 |
+
# Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies.
|
1434 |
+
'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
1435 |
+
'vibrato_rate': 6.0, 'vibrato_depth': 8,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1436 |
'smooth_notes_level': 0.9,
|
1437 |
'continuous_vibrato_level': 0.85,
|
1438 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05,
|
1439 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1440 |
},
|
1441 |
"Zelda (The Legend of Zelda / ゼルダの伝説)": {
|
|
|
1447 |
'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1448 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1449 |
},
|
1450 |
+
"Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": {
|
1451 |
+
# Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound.
|
1452 |
+
'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
|
1453 |
+
'vibrato_rate': 6.0, 'vibrato_depth': 4,
|
|
|
1454 |
'smooth_notes_level': 0.85,
|
1455 |
+
'continuous_vibrato_level': 0.3, # Formerly False (0.0); adds a hint of continuity for more liveliness.
|
1456 |
+
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1457 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1458 |
},
|
1459 |
+
"Pokémon (Game Boy Classics / ポケットモンスター)": {
|
1460 |
+
# Description: A full, friendly square wave sound, capturing the cheerful and adventurous spirit of early handheld RPGs.
|
1461 |
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
|
1462 |
+
'vibrato_rate': 5.0, 'vibrato_depth': 5,
|
1463 |
'smooth_notes_level': 0.9,
|
1464 |
+
'continuous_vibrato_level': 0.9,
|
1465 |
+
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1466 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1467 |
},
|
1468 |
"Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": {
|
|
|
1474 |
'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1475 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1476 |
},
|
1477 |
+
"Final Fantasy (Arpeggio / ファイナルファンタジー)": {
|
1478 |
+
# Description: A perfect, clean square wave with zero vibrato, creating the iconic, crystal-clear arpeggio sound.
|
1479 |
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
|
1480 |
+
'vibrato_rate': 5.0, 'vibrato_depth': 0,
|
1481 |
'smooth_notes_level': 0.9,
|
1482 |
+
'continuous_vibrato_level': 0.2,
|
1483 |
+
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1484 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1485 |
+
},
|
1486 |
+
"ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": {
|
1487 |
+
# Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore.
|
1488 |
+
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
1489 |
+
'vibrato_rate': 3.5, 'vibrato_depth': 3,
|
1490 |
+
'smooth_notes_level': 0.9,
|
1491 |
+
'continuous_vibrato_level': 0.85,
|
1492 |
+
'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1493 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1494 |
},
|
1495 |
# --- Advanced System Impressions ---
|
|
|
1529 |
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1530 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1531 |
},
|
1532 |
+
# --- Action & Rock Leads ---
|
1533 |
+
"Falcom Ys (Rock Lead / イース)": {
|
1534 |
+
# Description: A powerful sawtooth with slight distortion, emulating the driving rock organ and guitar leads of action JRPGs.
|
1535 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
1536 |
+
'vibrato_rate': 5.5, 'vibrato_depth': 6,
|
1537 |
+
'smooth_notes_level': 0.85,
|
1538 |
+
'continuous_vibrato_level': 0.8,
|
1539 |
+
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15,
|
1540 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1541 |
+
},
|
1542 |
+
"Arcade Brawler Lead (Street Fighter / ストリートファイター)": {
|
1543 |
+
# Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games.
|
1544 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
1545 |
+
'vibrato_rate': 5.0, 'vibrato_depth': 6,
|
1546 |
+
'smooth_notes_level': 0.8,
|
1547 |
+
'continuous_vibrato_level': 0.7,
|
1548 |
+
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1,
|
1549 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1550 |
+
},
|
1551 |
+
"Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": {
|
1552 |
+
# Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games.
|
1553 |
+
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18,
|
1554 |
+
'vibrato_rate': 4.5, 'vibrato_depth': 4,
|
1555 |
+
'smooth_notes_level': 0.9, # Formerly True -> 1.0; slightly reduced for a bit more attack.
|
1556 |
+
'continuous_vibrato_level': 0.8, # Formerly True -> 1.0; slightly weakened for more defined note transitions.
|
1557 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1558 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1559 |
+
},
|
1560 |
+
# --- Epic & Orchestral Pads ---
|
1561 |
+
"Dragon Quest (Orchestral Feel / ドラゴンクエスト)": {
|
1562 |
+
# Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section.
|
1563 |
+
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6,
|
1564 |
+
'vibrato_rate': 3.0, 'vibrato_depth': 4,
|
1565 |
+
'smooth_notes_level': 0.9,
|
1566 |
+
'continuous_vibrato_level': 0.9,
|
1567 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1568 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1569 |
+
},
|
1570 |
+
"Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": {
|
1571 |
+
# Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder.
|
1572 |
+
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
|
1573 |
+
'vibrato_rate': 2.5, 'vibrato_depth': 4,
|
1574 |
+
'smooth_notes_level': 1.0,
|
1575 |
+
'continuous_vibrato_level': 0.95,
|
1576 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1577 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1578 |
+
},
|
1579 |
+
"Modern JRPG Pad (Persona / ペルソナ)": {
|
1580 |
+
# Description: A warm, stylish square wave pad, capturing the modern, pop/jazz-infused feel of the Persona series.
|
1581 |
+
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
|
1582 |
+
'vibrato_rate': 2.5, 'vibrato_depth': 4,
|
1583 |
+
'smooth_notes_level': 1.0,
|
1584 |
+
'continuous_vibrato_level': 0.95,
|
1585 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1586 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1587 |
+
},
|
1588 |
+
"Tactical Brass (Fire Emblem / ファイアーエムブレム)": {
|
1589 |
+
# Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of Fire Emblem's tactical themes.
|
1590 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
1591 |
+
'vibrato_rate': 3.5, 'vibrato_depth': 5,
|
1592 |
+
'smooth_notes_level': 0.95,
|
1593 |
+
'continuous_vibrato_level': 0.9,
|
1594 |
+
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
|
1595 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1596 |
+
},
|
1597 |
+
"Mecha & Tactics Brass (Super Robot Wars / スーパーロボット大戦)": {
|
1598 |
+
# Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of strategy and mecha anime themes.
|
1599 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
1600 |
+
'vibrato_rate': 3.5, 'vibrato_depth': 5,
|
1601 |
+
'smooth_notes_level': 0.95,
|
1602 |
+
'continuous_vibrato_level': 0.9,
|
1603 |
+
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
|
1604 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1605 |
+
},
|
1606 |
+
"Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": {
|
1607 |
+
# Description: An aggressive sawtooth, inspired by the dark, rock-infused themes of SMT.
|
1608 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35,
|
1609 |
+
'vibrato_rate': 7.0, 'vibrato_depth': 12,
|
1610 |
+
'smooth_notes_level': 0.1,
|
1611 |
+
'continuous_vibrato_level': 0.0,
|
1612 |
+
'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25,
|
1613 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1614 |
+
},
|
1615 |
+
# --- Vocal Synthesis ---
|
1616 |
+
"8-Bit Vocal Lead": {
|
1617 |
+
# Description: A soft, sustained triangle wave with gentle vibrato to mimic a singing voice.
|
1618 |
+
'waveform_type': 'Triangle',
|
1619 |
+
'pulse_width': 0.5,
|
1620 |
+
'envelope_type': 'Sustained (Full Decay)',
|
1621 |
+
'decay_time_s': 0.8,
|
1622 |
+
'vibrato_rate': 5.5,
|
1623 |
+
'vibrato_depth': 4, # Mapped from the suggested 0.15 range
|
1624 |
+
'bass_boost_level': 0.1,
|
1625 |
+
'smooth_notes_level': 0.85,
|
1626 |
+
'continuous_vibrato_level': 0.9,
|
1627 |
+
'noise_level': 0.02,
|
1628 |
+
'distortion_level': 0.0,
|
1629 |
+
'fm_modulation_depth': 0.05,
|
1630 |
+
'fm_modulation_rate': 20
|
1631 |
+
},
|
1632 |
+
"8-Bit Male Vocal": {
|
1633 |
+
# Description: A deeper, fuller triangle wave with more bass and slower vibrato for a masculine feel.
|
1634 |
+
'waveform_type': 'Triangle',
|
1635 |
+
'pulse_width': 0.5,
|
1636 |
+
'envelope_type': 'Sustained (Full Decay)',
|
1637 |
+
'decay_time_s': 1.0,
|
1638 |
+
'vibrato_rate': 5.0,
|
1639 |
+
'vibrato_depth': 3, # Mapped from the suggested 0.12 range
|
1640 |
+
'bass_boost_level': 0.3,
|
1641 |
+
'smooth_notes_level': 0.9,
|
1642 |
+
'continuous_vibrato_level': 0.85,
|
1643 |
+
'noise_level': 0.015,
|
1644 |
+
'distortion_level': 0.0,
|
1645 |
+
'fm_modulation_depth': 0.08,
|
1646 |
+
'fm_modulation_rate': 25
|
1647 |
+
},
|
1648 |
+
"8-Bit Female Vocal": {
|
1649 |
+
# Description: A brighter, lighter triangle wave with faster vibrato and less bass for a feminine feel.
|
1650 |
+
'waveform_type': 'Triangle',
|
1651 |
+
'pulse_width': 0.5,
|
1652 |
+
'envelope_type': 'Sustained (Full Decay)',
|
1653 |
+
'decay_time_s': 0.7,
|
1654 |
+
'vibrato_rate': 6.0,
|
1655 |
+
'vibrato_depth': 5, # Mapped from the suggested 0.18 range
|
1656 |
+
'bass_boost_level': 0.05,
|
1657 |
+
'smooth_notes_level': 0.85,
|
1658 |
+
'continuous_vibrato_level': 0.92,
|
1659 |
+
'noise_level': 0.025,
|
1660 |
+
'distortion_level': 0.0,
|
1661 |
+
'fm_modulation_depth': 0.04,
|
1662 |
+
'fm_modulation_rate': 30
|
1663 |
+
},
|
1664 |
+
"Lo-Fi Vocal": {
|
1665 |
+
# Description: A gritty, noisy square wave with a short decay to simulate a low-resolution vocal sample.
|
1666 |
+
'waveform_type': 'Square',
|
1667 |
+
'pulse_width': 0.48,
|
1668 |
+
'envelope_type': 'Plucky (AD Envelope)', # "Short" implies a plucky, not sustained, envelope
|
1669 |
+
'decay_time_s': 0.4,
|
1670 |
+
'vibrato_rate': 4.8,
|
1671 |
+
'vibrato_depth': 2, # Mapped from the suggested 0.10 range
|
1672 |
+
'bass_boost_level': 0.1,
|
1673 |
+
'smooth_notes_level': 0.65,
|
1674 |
+
'continuous_vibrato_level': 0.6,
|
1675 |
+
'noise_level': 0.05,
|
1676 |
+
'distortion_level': 0.05,
|
1677 |
+
'fm_modulation_depth': 0.02,
|
1678 |
+
'fm_modulation_rate': 20
|
1679 |
+
},
|
1680 |
+
# --- Sound FX & Experimental ---
|
1681 |
"Sci-Fi Energy Field": {
|
1682 |
# Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields.
|
1683 |
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
|
|
1723 |
'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5,
|
1724 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1725 |
},
|
1726 |
+
# --- Utility & Starting Points ---
|
1727 |
"Generic Chiptune Loop": {
|
1728 |
# Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds.
|
1729 |
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
|
|
|
1733 |
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
|
1734 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
1735 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1736 |
}
|
1737 |
|
1738 |
+
# --- Function to control visibility of BOTH new UI elements ---
|
1739 |
+
def update_vocal_ui_visibility(separate_vocals):
    """Build the visibility updates for the separation-dependent controls.

    Args:
        separate_vocals: Value passed in by the caller; it is forwarded
            unchanged as the ``visible`` flag of the update.

    Returns:
        A 2-tuple containing the same ``gr.update`` payload twice, one entry
        per output component wired to this callback.
    """
    visibility_update = gr.update(visible=separate_vocals)
    # Both outputs receive the identical update object.
    return (visibility_update,) * 2
|
1743 |
+
|
1744 |
app = gr.Blocks(theme=gr.themes.Base())
|
1745 |
|
1746 |
with app:
|
|
|
1783 |
info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
|
1784 |
)
|
1785 |
|
1786 |
+
# --- Vocal Separation Checkboxes ---
|
1787 |
+
with gr.Group():
|
1788 |
+
separate_vocals = gr.Checkbox(
|
1789 |
+
label="Separate Vocals",
|
1790 |
+
value=False,
|
1791 |
+
info="If checked, separates the audio into vocals and music stems before processing."
|
1792 |
+
)
|
1793 |
+
transcription_target = gr.Radio(
|
1794 |
+
["Transcribe Music (Accompaniment)", "Transcribe Vocals"],
|
1795 |
+
label="Transcription Target",
|
1796 |
+
value="Transcribe Music (Accompaniment)",
|
1797 |
+
info="Choose which part of the separated audio to transcribe to MIDI.",
|
1798 |
+
visible=False # Initially hidden
|
1799 |
+
)
|
1800 |
+
remerge_vocals = gr.Checkbox(
|
1801 |
+
label="Re-merge Other Part with Rendered Audio",
|
1802 |
+
value=False,
|
1803 |
+
info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.",
|
1804 |
+
visible=False # Initially hidden
|
1805 |
+
)
|
1806 |
+
|
1807 |
with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
|
1808 |
onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
|
1809 |
frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
|
|
|
1920 |
s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
|
1921 |
s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
|
1922 |
s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
|
1923 |
+
s8bit_decay_time_s = gr.Slider(0.01, 1.0, value=0.1, step=0.01, label="Decay Time (s)") # Max is 1.0 so the longest preset decays (e.g. 0.6 for Dragon Quest, 1.0 for 8-Bit Male Vocal) fit on the slider
|
1924 |
s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
|
1925 |
s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
|
1926 |
s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
|
|
|
1967 |
# all_inputs now includes the preset selector itself
|
1968 |
# Inputs for the main processing function
|
1969 |
all_inputs = [
|
1970 |
+
input_file, s8bit_preset_selector,
|
1971 |
+
separate_vocals,
|
1972 |
+
remerge_vocals,
|
1973 |
+
transcription_target,
|
1974 |
+
enable_stereo_processing,
|
1975 |
transcription_method, onset_threshold, frame_threshold, minimum_note_length,
|
1976 |
minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
|
1977 |
render_type, soundfont_bank, render_sample_rate, render_with_sustains,
|
|
|
2007 |
inputs=all_inputs,
|
2008 |
outputs=all_outputs # Pass the combined list
|
2009 |
)
|
2010 |
+
|
2011 |
+
# --- The change event now controls TWO components ---
|
2012 |
+
separate_vocals.change(
|
2013 |
+
fn=update_vocal_ui_visibility,
|
2014 |
+
inputs=separate_vocals,
|
2015 |
+
outputs=[transcription_target, remerge_vocals] # Update both components
|
2016 |
+
)
|
2017 |
|
2018 |
# --- Listeners for dynamic UI updates ---
|
2019 |
transcription_method.change(
|
requirements.txt
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
--extra-index-url https://download.pytorch.org/whl/cu128
|
2 |
|
3 |
torch
|
|
|
4 |
numpy
|
5 |
-
gradio
|
6 |
mido
|
7 |
librosa
|
8 |
torchlibrosa
|
@@ -18,9 +19,12 @@ psutil
|
|
18 |
pretty_midi
|
19 |
soundfile
|
20 |
pyloudnorm
|
|
|
21 |
piano_transcription_inference
|
22 |
|
23 |
basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
|
24 |
basic-pitch[tf] @ git+https://github.com/avan06/basic-pitch; sys_platform == 'linux'
|
25 |
|
26 |
-
git+https://github.com/avan06/pyfluidsynth
|
|
|
|
|
|
1 |
--extra-index-url https://download.pytorch.org/whl/cu128
|
2 |
|
3 |
torch
|
4 |
+
torchaudio
|
5 |
numpy
|
6 |
+
gradio >= 5.42.0
|
7 |
mido
|
8 |
librosa
|
9 |
torchlibrosa
|
|
|
19 |
pretty_midi
|
20 |
soundfile
|
21 |
pyloudnorm
|
22 |
+
|
23 |
piano_transcription_inference
|
24 |
|
25 |
basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
|
26 |
basic-pitch[tf] @ git+https://github.com/avan06/basic-pitch; sys_platform == 'linux'
|
27 |
|
28 |
+
git+https://github.com/avan06/pyfluidsynth
|
29 |
+
|
30 |
+
demucs
|