avans06 committed on
Commit
22dd15a
·
1 Parent(s): 58195d3

Add experimental vocal separation and selective transcription features

Browse files

- Added **Separate Vocals** option (experimental): Splits input audio into vocals and music stems before processing.
- Added **Transcription Target** setting: Allows choosing which stem (vocals or music) to transcribe to MIDI.
- Added option to **Re-merge Other Part with Rendered Audio**: After rendering, merges the non-transcribed stem (e.g., original vocals) back with the new music.

Files changed (2) hide show
  1. app.py +364 -160
  2. requirements.txt +6 -2
app.py CHANGED
@@ -50,8 +50,13 @@ import soundfile as sf
50
  import torch
51
  import gradio as gr
52
 
53
- from src.piano_transcription.utils import initialize_app
 
 
 
 
54
 
 
55
  from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate
56
 
57
  # --- Import core transcription and MIDI processing libraries ---
@@ -1042,6 +1047,9 @@ def recommend_8bit_params(midi_data, default_preset):
1042
  def process_and_render_file(input_file,
1043
  # --- Pass the preset selector value ---
1044
  s8bit_preset_selector,
 
 
 
1045
  # --- Transcription params ---
1046
  enable_stereo_processing,
1047
  transcription_method,
@@ -1071,14 +1079,9 @@ def process_and_render_file(input_file,
1071
  filename = os.path.basename(input_file_path)
1072
  print(f"Processing new file: {filename}")
1073
 
1074
- try:
1075
- # Mono=False is required to correctly detect stereo channels
1076
- audio_data, native_sample_rate = librosa.load(input_file_path, sr=None, mono=False)
1077
- except Exception as e:
1078
- # If loading fails, it might be a MIDI file, which librosa cannot handle.
1079
- # We will proceed, assuming it's a MIDI, and let pretty_midi handle it later.
1080
- print(f"Could not load as audio: {e}. Assuming it is a MIDI file.")
1081
- pass
1082
 
1083
  # --- Step 1: Check file type and transcribe if necessary ---
1084
  if filename.lower().endswith(('.mid', '.midi', '.kar')):
@@ -1086,42 +1089,95 @@ def process_and_render_file(input_file,
1086
  midi_path_for_rendering = input_file_path
1087
  else: #if filename.lower().endswith(('.wav', '.mp3'))
1088
  print("Audio file detected. Starting transcription...")
 
 
 
 
 
 
 
1089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1090
  base_name = os.path.splitext(filename)[0]
1091
- temp_dir = "output/temp_normalized"
1092
  os.makedirs(temp_dir, exist_ok=True)
 
 
 
 
 
 
 
 
1093
 
1094
  # === STEREO PROCESSING LOGIC ===
1095
  if enable_stereo_processing:
1096
- if 'audio_data' not in locals() or audio_data.ndim != 2 or audio_data.shape[0] != 2:
1097
  print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
1098
  enable_stereo_processing = False # Disable stereo processing if audio is not stereo
1099
 
1100
  if enable_stereo_processing:
1101
- print("Stereo processing enabled. Splitting channels...")
1102
  try:
1103
- left_channel = audio_data[0]
1104
- right_channel = audio_data[1]
1105
 
1106
- normalized_left = normalize_loudness(left_channel, native_sample_rate)
1107
- normalized_right = normalize_loudness(right_channel, native_sample_rate)
1108
 
1109
- temp_left_wav_path = os.path.join(temp_dir, f"{base_name}_left.wav")
1110
- temp_right_wav_path = os.path.join(temp_dir, f"{base_name}_right.wav")
1111
 
1112
- sf.write(temp_left_wav_path, normalized_left, native_sample_rate)
1113
- sf.write(temp_right_wav_path, normalized_right, native_sample_rate)
1114
 
1115
- print(f"Saved left channel to: {temp_left_wav_path}")
1116
- print(f"Saved right channel to: {temp_right_wav_path}")
1117
 
1118
  print("Transcribing left and right channel...")
1119
  if transcription_method == "General Purpose":
1120
- midi_path_left = TranscribeGeneralAudio(temp_left_wav_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1121
- midi_path_right = TranscribeGeneralAudio(temp_right_wav_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1122
- else:
1123
- midi_path_left = TranscribePianoAudio(temp_left_wav_path)
1124
- midi_path_right = TranscribePianoAudio(temp_right_wav_path)
1125
 
1126
  if midi_path_left and midi_path_right:
1127
  merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
@@ -1139,24 +1195,22 @@ def process_and_render_file(input_file,
1139
  print(f"An error occurred during stereo processing: {e}")
1140
  raise gr.Error(f"Stereo Processing Failed: {e}")
1141
  else: # Standard mono transcription
1142
- print("Stereo processing disabled. Using standard mono transcription.")
1143
- if 'audio_data' in locals():
1144
- if audio_data.ndim == 1:
1145
- mono_signal = audio_data
1146
- else:
1147
- mono_signal = np.mean(audio_data, axis=0)
1148
-
1149
- normalized_mono = normalize_loudness(mono_signal, native_sample_rate)
1150
 
1151
- temp_mono_wav_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
1152
- sf.write(temp_mono_wav_path, normalized_mono, native_sample_rate)
1153
- input_file_path = temp_mono_wav_path # Use the normalized mono file for transcription
1154
 
1155
  try:
1156
  if transcription_method == "General Purpose":
1157
- midi_path_for_rendering = TranscribeGeneralAudio(input_file_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1158
  else: # Piano-Specific
1159
- midi_path_for_rendering = TranscribePianoAudio(input_file_path)
1160
  except Exception as e:
1161
  print(f"An error occurred during transcription: {e}")
1162
  raise gr.Error(f"Transcription Failed: {e}")
@@ -1216,7 +1270,43 @@ def process_and_render_file(input_file,
1216
  synth_params['fm_modulation_depth'],
1217
  synth_params['fm_modulation_rate']
1218
  )
1219
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1220
  print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
1221
  print('*' * 70)
1222
 
@@ -1308,7 +1398,18 @@ if __name__ == "__main__":
1308
  if not soundfonts_dict:
1309
  print("\nWARNING: No SoundFonts were found or could be downloaded.")
1310
  print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")
1311
-
 
 
 
 
 
 
 
 
 
 
 
1312
  # --- Define a constant for the fallback preset name ---
1313
  # This prevents errors if the preset name is changed in the dictionary.
1314
  FALLBACK_PRESET_NAME = "Generic Chiptune Loop"
@@ -1318,43 +1419,7 @@ if __name__ == "__main__":
1318
  # Comprehensive preset dictionary including new JRPG and Handheld classics
1319
  # Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider.
1320
  S8BIT_PRESETS = {
1321
- # --- Rhythmic & Action ---
1322
- "Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": {
1323
- # Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games.
1324
- 'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18,
1325
- 'vibrato_rate': 4.5, 'vibrato_depth': 4,
1326
- 'smooth_notes_level': 0.9, # Formerly True -> 1.0; slightly reduced for a bit more attack.
1327
- 'continuous_vibrato_level': 0.8, # Formerly True -> 1.0; slightly weakened for more defined note transitions.
1328
- 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
1329
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1330
- },
1331
- "Arcade Brawler Lead (Street Fighter / ストリートファイター)": {
1332
- # Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games.
1333
- 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
1334
- 'vibrato_rate': 5.0, 'vibrato_depth': 6,
1335
- 'smooth_notes_level': 0.8,
1336
- 'continuous_vibrato_level': 0.7,
1337
- 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1,
1338
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1339
- },
1340
- "Mega Man (Rockman / ロックマン)": {
1341
- # Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies.
1342
- 'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
1343
- 'vibrato_rate': 6.0, 'vibrato_depth': 8,
1344
- 'smooth_notes_level': 0.9,
1345
- 'continuous_vibrato_level': 0.85,
1346
- 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05,
1347
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1348
- },
1349
- "Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": {
1350
- # Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound.
1351
- 'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
1352
- 'vibrato_rate': 6.0, 'vibrato_depth': 4,
1353
- 'smooth_notes_level': 0.85,
1354
- 'continuous_vibrato_level': 0.3, # Formerly False (0.0); adds a hint of continuity for more liveliness.
1355
- 'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
1356
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1357
- },
1358
  "Mario (Super Mario Bros / スーパーマリオブラザーズ)": {
1359
  # Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound.
1360
  'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25,
@@ -1364,41 +1429,13 @@ if __name__ == "__main__":
1364
  'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
1365
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1366
  },
1367
- # --- Epic & Atmospheric ---
1368
- "Mecha & Tactics Brass (Super Robot Wars / スーパーロボット大戦)": {
1369
- # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of strategy and mecha anime themes.
1370
- 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
1371
- 'vibrato_rate': 3.5, 'vibrato_depth': 5,
1372
- 'smooth_notes_level': 0.95,
1373
- 'continuous_vibrato_level': 0.9,
1374
- 'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
1375
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1376
- },
1377
- "Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": {
1378
- # Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder.
1379
- 'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
1380
- 'vibrato_rate': 2.5, 'vibrato_depth': 4,
1381
- 'smooth_notes_level': 1.0,
1382
- 'continuous_vibrato_level': 0.95,
1383
- 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
1384
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1385
- },
1386
- "Dragon Quest (ドラゴンクエスト)": {
1387
- # Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section.
1388
- 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6,
1389
- 'vibrato_rate': 3.0, 'vibrato_depth': 4,
1390
- 'smooth_notes_level': 0.9,
1391
- 'continuous_vibrato_level': 0.9,
1392
- 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
1393
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1394
- },
1395
- "ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": {
1396
- # Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore.
1397
- 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
1398
- 'vibrato_rate': 3.5, 'vibrato_depth': 3,
1399
  'smooth_notes_level': 0.9,
1400
  'continuous_vibrato_level': 0.85,
1401
- 'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0,
1402
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1403
  },
1404
  "Zelda (The Legend of Zelda / ゼルダの伝説)": {
@@ -1410,23 +1447,22 @@ if __name__ == "__main__":
1410
  'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0,
1411
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1412
  },
1413
- # --- JRPG & System Classics ---
1414
- "Falcom Ys (Ys / イース)": {
1415
- # Description: A powerful sawtooth with slight distortion, emulating the driving rock organ and guitar leads of action JRPGs.
1416
- 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
1417
- 'vibrato_rate': 5.5, 'vibrato_depth': 6,
1418
  'smooth_notes_level': 0.85,
1419
- 'continuous_vibrato_level': 0.8,
1420
- 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15,
1421
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1422
  },
1423
- "Final Fantasy (ファイナルファンタジー)": {
1424
- # Description: A perfect, clean square wave with zero vibrato, creating the iconic, crystal-clear arpeggio sound.
1425
  'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
1426
- 'vibrato_rate': 5.0, 'vibrato_depth': 0,
1427
  'smooth_notes_level': 0.9,
1428
- 'continuous_vibrato_level': 0.2,
1429
- 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
1430
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1431
  },
1432
  "Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": {
@@ -1438,13 +1474,22 @@ if __name__ == "__main__":
1438
  'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0,
1439
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1440
  },
1441
- "Pokémon (Game Boy Classics / ポケットモンスター)": {
1442
- # Description: A full, friendly square wave sound, capturing the cheerful and adventurous spirit of early handheld RPGs.
1443
  'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
1444
- 'vibrato_rate': 5.0, 'vibrato_depth': 5,
1445
  'smooth_notes_level': 0.9,
1446
- 'continuous_vibrato_level': 0.9,
1447
- 'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
 
 
 
 
 
 
 
 
 
1448
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1449
  },
1450
  # --- Advanced System Impressions ---
@@ -1484,7 +1529,155 @@ if __name__ == "__main__":
1484
  'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
1485
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1486
  },
1487
- # --- Experimental & Sound FX ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1488
  "Sci-Fi Energy Field": {
1489
  # Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields.
1490
  'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
@@ -1530,7 +1723,7 @@ if __name__ == "__main__":
1530
  'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5,
1531
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1532
  },
1533
- # --- Utility ---
1534
  "Generic Chiptune Loop": {
1535
  # Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds.
1536
  'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
@@ -1540,35 +1733,14 @@ if __name__ == "__main__":
1540
  'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
1541
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1542
  },
1543
- "Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": {
1544
- # Description: An aggressive sawtooth, inspired by the dark, rock-infused themes of SMT.
1545
- 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35,
1546
- 'vibrato_rate': 7.0, 'vibrato_depth': 12,
1547
- 'smooth_notes_level': 0.1,
1548
- 'continuous_vibrato_level': 0.0,
1549
- 'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25,
1550
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1551
- },
1552
- "Modern JRPG Pad (Persona / ペルソナ)": {
1553
- # Description: A warm, stylish square wave pad, capturing the modern, pop/jazz-infused feel of the Persona series.
1554
- 'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
1555
- 'vibrato_rate': 2.5, 'vibrato_depth': 4,
1556
- 'smooth_notes_level': 1.0,
1557
- 'continuous_vibrato_level': 0.95,
1558
- 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
1559
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1560
- },
1561
- "Tactical Brass (Fire Emblem / ファイアーエムブレム)": {
1562
- # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of Fire Emblem's tactical themes.
1563
- 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
1564
- 'vibrato_rate': 3.5, 'vibrato_depth': 5,
1565
- 'smooth_notes_level': 0.95,
1566
- 'continuous_vibrato_level': 0.9,
1567
- 'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
1568
- 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1569
- }
1570
  }
1571
 
 
 
 
 
 
 
1572
  app = gr.Blocks(theme=gr.themes.Base())
1573
 
1574
  with app:
@@ -1611,6 +1783,27 @@ if __name__ == "__main__":
1611
  info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
1612
  )
1613
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1614
  with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
1615
  onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
1616
  frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
@@ -1727,7 +1920,7 @@ if __name__ == "__main__":
1727
  s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
1728
  s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
1729
  s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
1730
- s8bit_decay_time_s = gr.Slider(0.01, 0.6, value=0.1, step=0.01, label="Decay Time (s)") # Increased max to 0.6 for DQ style
1731
  s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
1732
  s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
1733
  s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
@@ -1774,7 +1967,11 @@ if __name__ == "__main__":
1774
  # all_inputs now includes the preset selector itself
1775
  # Inputs for the main processing function
1776
  all_inputs = [
1777
- input_file, s8bit_preset_selector, enable_stereo_processing,
 
 
 
 
1778
  transcription_method, onset_threshold, frame_threshold, minimum_note_length,
1779
  minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
1780
  render_type, soundfont_bank, render_sample_rate, render_with_sustains,
@@ -1810,6 +2007,13 @@ if __name__ == "__main__":
1810
  inputs=all_inputs,
1811
  outputs=all_outputs # Pass the combined list
1812
  )
 
 
 
 
 
 
 
1813
 
1814
  # --- Listeners for dynamic UI updates ---
1815
  transcription_method.change(
 
50
  import torch
51
  import gradio as gr
52
 
53
+ # --- Imports for Vocal Separation ---
54
+ import torchaudio
55
+ from demucs.apply import apply_model
56
+ from demucs.pretrained import get_model
57
+ from demucs.audio import convert_audio
58
 
59
+ from src.piano_transcription.utils import initialize_app
60
  from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate
61
 
62
  # --- Import core transcription and MIDI processing libraries ---
 
1047
  def process_and_render_file(input_file,
1048
  # --- Pass the preset selector value ---
1049
  s8bit_preset_selector,
1050
+ separate_vocals,
1051
+ remerge_vocals,
1052
+ transcription_target,
1053
  # --- Transcription params ---
1054
  enable_stereo_processing,
1055
  transcription_method,
 
1079
  filename = os.path.basename(input_file_path)
1080
  print(f"Processing new file: {filename}")
1081
 
1082
+ # This will store the other part if separation is performed
1083
+ other_part_tensor = None
1084
+ other_part_sr = None
 
 
 
 
 
1085
 
1086
  # --- Step 1: Check file type and transcribe if necessary ---
1087
  if filename.lower().endswith(('.mid', '.midi', '.kar')):
 
1089
  midi_path_for_rendering = input_file_path
1090
  else: #if filename.lower().endswith(('.wav', '.mp3'))
1091
  print("Audio file detected. Starting transcription...")
1092
+
1093
+ try:
1094
+ # Use torchaudio to load directly into a tensor, as demucs needs it.
1095
+ # This is more efficient than loading with librosa then converting.
1096
+ audio_tensor, native_sample_rate = torchaudio.load(input_file_path)
1097
+ except Exception as e:
1098
+ raise gr.Error(f"Failed to load audio file: {e}")
1099
 
1100
+ # --- Demucs Vocal Separation Logic, now decides which stem to process ---
1101
+ if separate_vocals:
1102
+ if demucs_model is None:
1103
+ raise gr.Error("Demucs model is not loaded. Cannot separate vocals.")
1104
+
1105
+ # Convert to a common format (stereo, float32) that demucs expects
1106
+ audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
1107
+
1108
+ if torch.cuda.is_available():
1109
+ audio_tensor = audio_tensor.cuda()
1110
+
1111
+ print("Separating audio with Demucs... This may take some time.")
1112
+ all_stems = apply_model(demucs_model, audio_tensor[None], device='cuda' if torch.cuda.is_available() else 'cpu', progress=True)[0]
1113
+
1114
+ vocals_idx = demucs_model.sources.index('vocals')
1115
+ # Sum all stems that are NOT vocals to get the accompaniment
1116
+ accompaniment_indices = [i for i, source in enumerate(demucs_model.sources) if source != 'vocals']
1117
+
1118
+ vocals_tensor = all_stems[vocals_idx]
1119
+ accompaniment_tensor = all_stems[accompaniment_indices].sum(0)
1120
+
1121
+ # --- The new core branching logic ---
1122
+ if transcription_target == "Transcribe Vocals":
1123
+ print("Target: Transcribing VOCALS.")
1124
+ tensor_to_process = vocals_tensor
1125
+ other_part_tensor = accompaniment_tensor # Save accompaniment for re-merging
1126
+ else: # Default to "Transcribe Music (Accompaniment)"
1127
+ print("Target: Transcribing MUSIC (ACCOMPANIMENT).")
1128
+ tensor_to_process = accompaniment_tensor
1129
+ other_part_tensor = vocals_tensor # Save vocals for re-merging
1130
+
1131
+ other_part_sr = demucs_model.samplerate
1132
+ audio_tensor = tensor_to_process # The audio to be processed is now the chosen stem
1133
+ native_sample_rate = demucs_model.samplerate # Update sample rate to match demucs output
1134
+ print("Separation complete.")
1135
+
1136
+ # --- Prepare audio for transcription (saving to a temp file) ---
1137
+ # This part of the logic now works on whichever stem was selected above
1138
  base_name = os.path.splitext(filename)[0]
1139
+ temp_dir = "output/temp_transcribe"
1140
  os.makedirs(temp_dir, exist_ok=True)
1141
+ suffix = f"_{transcription_target.split(' ')[1].lower()}" if separate_vocals else "_original"
1142
+ audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}{suffix}.wav")
1143
+
1144
+ torchaudio.save(audio_to_transcribe_path, audio_tensor.cpu(), native_sample_rate)
1145
+
1146
+ # Convert tensor to numpy array (channels, samples) for librosa/pyloudnorm compatibility
1147
+ # We work with a CPU copy of the tensor.
1148
+ audio_data_np = audio_tensor.cpu().numpy()
1149
 
1150
  # === STEREO PROCESSING LOGIC ===
1151
  if enable_stereo_processing:
1152
+ if audio_data_np.ndim != 2 or audio_data_np.shape[0] != 2:
1153
  print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
1154
  enable_stereo_processing = False # Disable stereo processing if audio is not stereo
1155
 
1156
  if enable_stereo_processing:
1157
+ print("Stereo processing enabled. Splitting, normalizing, and transcribing channels...")
1158
  try:
1159
+ left_channel_np = audio_data_np[0]
1160
+ right_channel_np = audio_data_np[1]
1161
 
1162
+ normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
1163
+ normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
1164
 
1165
+ temp_left_path = os.path.join(temp_dir, f"{base_name}_left.wav")
1166
+ temp_right_path = os.path.join(temp_dir, f"{base_name}_right.wav")
1167
 
1168
+ sf.write(temp_left_path, normalized_left, native_sample_rate)
1169
+ sf.write(temp_right_path, normalized_right, native_sample_rate)
1170
 
1171
+ print(f"Saved left channel to: {temp_left_path}")
1172
+ print(f"Saved right channel to: {temp_right_path}")
1173
 
1174
  print("Transcribing left and right channel...")
1175
  if transcription_method == "General Purpose":
1176
+ midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1177
+ midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1178
+ else: # Piano-Specific
1179
+ midi_path_left = TranscribePianoAudio(temp_left_path)
1180
+ midi_path_right = TranscribePianoAudio(temp_right_path)
1181
 
1182
  if midi_path_left and midi_path_right:
1183
  merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
 
1195
  print(f"An error occurred during stereo processing: {e}")
1196
  raise gr.Error(f"Stereo Processing Failed: {e}")
1197
  else: # Standard mono transcription
1198
+ print("Mono processing. Normalizing and transcribing audio...")
1199
+ # If the audio is stereo but stereo processing is disabled, convert to mono.
1200
+ if audio_data_np.shape[0] == 2:
1201
+ mono_signal_np = np.mean(audio_data_np, axis=0)
1202
+ else:
1203
+ mono_signal_np = audio_data_np[0]
 
 
1204
 
1205
+ normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
1206
+ temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
1207
+ sf.write(temp_mono_path, normalized_mono, native_sample_rate)
1208
 
1209
  try:
1210
  if transcription_method == "General Purpose":
1211
+ midi_path_for_rendering = TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
1212
  else: # Piano-Specific
1213
+ midi_path_for_rendering = TranscribePianoAudio(temp_mono_path)
1214
  except Exception as e:
1215
  print(f"An error occurred during transcription: {e}")
1216
  raise gr.Error(f"Transcription Failed: {e}")
 
1270
  synth_params['fm_modulation_depth'],
1271
  synth_params['fm_modulation_rate']
1272
  )
1273
+
1274
+ # --- Vocal Re-merging Logic now uses the generic "other_part" ---
1275
+ if separate_vocals and remerge_vocals and other_part_tensor is not None:
1276
+ print(f"Re-merging the non-transcribed part with newly rendered music...")
1277
+
1278
+ rendered_srate, rendered_music_int16 = results[4]
1279
+
1280
+ rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0
1281
+ rendered_music_tensor = torch.from_numpy(rendered_music_float).T
1282
+
1283
+ if rendered_srate != other_part_sr:
1284
+ resampler = torchaudio.transforms.Resample(rendered_srate, other_part_sr)
1285
+ rendered_music_tensor = resampler(rendered_music_tensor)
1286
+
1287
+ len_music = rendered_music_tensor.shape[1]
1288
+ len_other = other_part_tensor.shape[1]
1289
+
1290
+ if len_music > len_other:
1291
+ padding = len_music - len_other
1292
+ other_part_tensor = torch.nn.functional.pad(other_part_tensor, (0, padding))
1293
+ elif len_other > len_music:
1294
+ padding = len_other - len_music
1295
+ rendered_music_tensor = torch.nn.functional.pad(rendered_music_tensor, (0, padding))
1296
+
1297
+ merged_audio_tensor = rendered_music_tensor + other_part_tensor.cpu()
1298
+
1299
+ max_abs = torch.max(torch.abs(merged_audio_tensor))
1300
+ if max_abs > 1.0:
1301
+ merged_audio_tensor /= max_abs
1302
+
1303
+ merged_audio_int16 = (merged_audio_tensor.T.numpy() * 32767).astype(np.int16)
1304
+
1305
+ new_results = list(results)
1306
+ new_results[4] = (other_part_sr, merged_audio_int16)
1307
+ results = tuple(new_results)
1308
+ print("Re-merging complete.")
1309
+
1310
  print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
1311
  print('*' * 70)
1312
 
 
1398
  if not soundfonts_dict:
1399
  print("\nWARNING: No SoundFonts were found or could be downloaded.")
1400
  print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")
1401
+
1402
+ # --- Pre-load the Demucs model on startup for efficiency ---
1403
+ print("Loading Demucs model (htdemucs_ft), this may take a moment on first run...")
1404
+ try:
1405
+ demucs_model = get_model(name='htdemucs_ft')
1406
+ if torch.cuda.is_available():
1407
+ demucs_model = demucs_model.cuda()
1408
+ print("Demucs model loaded successfully.")
1409
+ except Exception as e:
1410
+ print(f"Warning: Could not load Demucs model. Vocal separation will not be available. Error: {e}")
1411
+ demucs_model = None
1412
+
1413
  # --- Define a constant for the fallback preset name ---
1414
  # This prevents errors if the preset name is changed in the dictionary.
1415
  FALLBACK_PRESET_NAME = "Generic Chiptune Loop"
 
1419
  # Comprehensive preset dictionary including new JRPG and Handheld classics
1420
  # Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider.
1421
  S8BIT_PRESETS = {
1422
+ # --- Classic Chiptune ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1423
  "Mario (Super Mario Bros / スーパーマリオブラザーズ)": {
1424
  # Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound.
1425
  'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25,
 
1429
  'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
1430
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1431
  },
1432
+ "Mega Man (Rockman / ロックマン)": {
1433
+ # Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies.
1434
+ 'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
1435
+ 'vibrato_rate': 6.0, 'vibrato_depth': 8,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1436
  'smooth_notes_level': 0.9,
1437
  'continuous_vibrato_level': 0.85,
1438
+ 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05,
1439
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1440
  },
1441
  "Zelda (The Legend of Zelda / ゼルダの伝説)": {
 
1447
  'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0,
1448
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1449
  },
1450
+ "Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": {
1451
+ # Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound.
1452
+ 'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
1453
+ 'vibrato_rate': 6.0, 'vibrato_depth': 4,
 
1454
  'smooth_notes_level': 0.85,
1455
+ 'continuous_vibrato_level': 0.3, # Formerly False (0.0); adds a hint of continuity for more liveliness.
1456
+ 'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
1457
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1458
  },
1459
+ "Pokémon (Game Boy Classics / ポケットモンスター)": {
1460
+ # Description: A full, friendly square wave sound, capturing the cheerful and adventurous spirit of early handheld RPGs.
1461
  'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
1462
+ 'vibrato_rate': 5.0, 'vibrato_depth': 5,
1463
  'smooth_notes_level': 0.9,
1464
+ 'continuous_vibrato_level': 0.9,
1465
+ 'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
1466
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1467
  },
1468
  "Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": {
 
1474
  'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0,
1475
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1476
  },
1477
+ "Final Fantasy (Arpeggio / ファイナルファンタジー)": {
1478
+ # Description: A perfect, clean square wave with zero vibrato, creating the iconic, crystal-clear arpeggio sound.
1479
  'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
1480
+ 'vibrato_rate': 5.0, 'vibrato_depth': 0,
1481
  'smooth_notes_level': 0.9,
1482
+ 'continuous_vibrato_level': 0.2,
1483
+ 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
1484
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1485
+ },
1486
+ "ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": {
1487
+ # Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore.
1488
+ 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
1489
+ 'vibrato_rate': 3.5, 'vibrato_depth': 3,
1490
+ 'smooth_notes_level': 0.9,
1491
+ 'continuous_vibrato_level': 0.85,
1492
+ 'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0,
1493
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1494
  },
1495
  # --- Advanced System Impressions ---
 
1529
  'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
1530
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1531
  },
1532
+ # --- Action & Rock Leads ---
1533
+ "Falcom Ys (Rock Lead / イース)": {
1534
+ # Description: A powerful sawtooth with slight distortion, emulating the driving rock organ and guitar leads of action JRPGs.
1535
+ 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
1536
+ 'vibrato_rate': 5.5, 'vibrato_depth': 6,
1537
+ 'smooth_notes_level': 0.85,
1538
+ 'continuous_vibrato_level': 0.8,
1539
+ 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15,
1540
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1541
+ },
1542
+ "Arcade Brawler Lead (Street Fighter / ストリートファイター)": {
1543
+ # Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games.
1544
+ 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
1545
+ 'vibrato_rate': 5.0, 'vibrato_depth': 6,
1546
+ 'smooth_notes_level': 0.8,
1547
+ 'continuous_vibrato_level': 0.7,
1548
+ 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1,
1549
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1550
+ },
1551
+ "Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": {
1552
+ # Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games.
1553
+ 'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18,
1554
+ 'vibrato_rate': 4.5, 'vibrato_depth': 4,
1555
+ 'smooth_notes_level': 0.9, # Formerly True -> 1.0; slightly reduced for a bit more attack.
1556
+ 'continuous_vibrato_level': 0.8, # Formerly True -> 1.0; slightly weakened for more defined note transitions.
1557
+ 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
1558
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1559
+ },
1560
+ # --- Epic & Orchestral Pads ---
1561
+ "Dragon Quest (Orchestral Feel / ドラゴンクエスト)": {
1562
+ # Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section.
1563
+ 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6,
1564
+ 'vibrato_rate': 3.0, 'vibrato_depth': 4,
1565
+ 'smooth_notes_level': 0.9,
1566
+ 'continuous_vibrato_level': 0.9,
1567
+ 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
1568
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1569
+ },
1570
+ "Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": {
1571
+ # Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder.
1572
+ 'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
1573
+ 'vibrato_rate': 2.5, 'vibrato_depth': 4,
1574
+ 'smooth_notes_level': 1.0,
1575
+ 'continuous_vibrato_level': 0.95,
1576
+ 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
1577
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1578
+ },
1579
+ "Modern JRPG Pad (Persona / ペルソナ)": {
1580
+ # Description: A warm, stylish square wave pad, capturing the modern, pop/jazz-infused feel of the Persona series.
1581
+ 'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
1582
+ 'vibrato_rate': 2.5, 'vibrato_depth': 4,
1583
+ 'smooth_notes_level': 1.0,
1584
+ 'continuous_vibrato_level': 0.95,
1585
+ 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
1586
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1587
+ },
1588
+ "Tactical Brass (Fire Emblem / ファイアーエムブレム)": {
1589
+ # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of Fire Emblem's tactical themes.
1590
+ 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
1591
+ 'vibrato_rate': 3.5, 'vibrato_depth': 5,
1592
+ 'smooth_notes_level': 0.95,
1593
+ 'continuous_vibrato_level': 0.9,
1594
+ 'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
1595
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1596
+ },
1597
+ "Mecha & Tactics Brass (Super Robot Wars / スーパーロボット大戦)": {
1598
+ # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of strategy and mecha anime themes.
1599
+ 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
1600
+ 'vibrato_rate': 3.5, 'vibrato_depth': 5,
1601
+ 'smooth_notes_level': 0.95,
1602
+ 'continuous_vibrato_level': 0.9,
1603
+ 'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
1604
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1605
+ },
1606
+ "Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": {
1607
+ # Description: An aggressive sawtooth, inspired by the dark, rock-infused themes of SMT.
1608
+ 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35,
1609
+ 'vibrato_rate': 7.0, 'vibrato_depth': 12,
1610
+ 'smooth_notes_level': 0.1,
1611
+ 'continuous_vibrato_level': 0.0,
1612
+ 'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25,
1613
+ 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1614
+ },
1615
+ # --- Vocal Synthesis ---
1616
+ "8-Bit Vocal Lead": {
1617
+ # Description: A soft, sustained triangle wave with gentle vibrato to mimic a singing voice.
1618
+ 'waveform_type': 'Triangle',
1619
+ 'pulse_width': 0.5,
1620
+ 'envelope_type': 'Sustained (Full Decay)',
1621
+ 'decay_time_s': 0.8,
1622
+ 'vibrato_rate': 5.5,
1623
+ 'vibrato_depth': 4, # Mapped from the suggested 0.15 range
1624
+ 'bass_boost_level': 0.1,
1625
+ 'smooth_notes_level': 0.85,
1626
+ 'continuous_vibrato_level': 0.9,
1627
+ 'noise_level': 0.02,
1628
+ 'distortion_level': 0.0,
1629
+ 'fm_modulation_depth': 0.05,
1630
+ 'fm_modulation_rate': 20
1631
+ },
1632
+ "8-Bit Male Vocal": {
1633
+ # Description: A deeper, fuller triangle wave with more bass and slower vibrato for a masculine feel.
1634
+ 'waveform_type': 'Triangle',
1635
+ 'pulse_width': 0.5,
1636
+ 'envelope_type': 'Sustained (Full Decay)',
1637
+ 'decay_time_s': 1.0,
1638
+ 'vibrato_rate': 5.0,
1639
+ 'vibrato_depth': 3, # Mapped from the suggested 0.12 range
1640
+ 'bass_boost_level': 0.3,
1641
+ 'smooth_notes_level': 0.9,
1642
+ 'continuous_vibrato_level': 0.85,
1643
+ 'noise_level': 0.015,
1644
+ 'distortion_level': 0.0,
1645
+ 'fm_modulation_depth': 0.08,
1646
+ 'fm_modulation_rate': 25
1647
+ },
1648
+ "8-Bit Female Vocal": {
1649
+ # Description: A brighter, lighter triangle wave with faster vibrato and less bass for a feminine feel.
1650
+ 'waveform_type': 'Triangle',
1651
+ 'pulse_width': 0.5,
1652
+ 'envelope_type': 'Sustained (Full Decay)',
1653
+ 'decay_time_s': 0.7,
1654
+ 'vibrato_rate': 6.0,
1655
+ 'vibrato_depth': 5, # Mapped from the suggested 0.18 range
1656
+ 'bass_boost_level': 0.05,
1657
+ 'smooth_notes_level': 0.85,
1658
+ 'continuous_vibrato_level': 0.92,
1659
+ 'noise_level': 0.025,
1660
+ 'distortion_level': 0.0,
1661
+ 'fm_modulation_depth': 0.04,
1662
+ 'fm_modulation_rate': 30
1663
+ },
1664
+ "Lo-Fi Vocal": {
1665
+ # Description: A gritty, noisy square wave with a short decay to simulate a low-resolution vocal sample.
1666
+ 'waveform_type': 'Square',
1667
+ 'pulse_width': 0.48,
1668
+ 'envelope_type': 'Plucky (AD Envelope)', # "Short" implies a plucky, not sustained, envelope
1669
+ 'decay_time_s': 0.4,
1670
+ 'vibrato_rate': 4.8,
1671
+ 'vibrato_depth': 2, # Mapped from the suggested 0.10 range
1672
+ 'bass_boost_level': 0.1,
1673
+ 'smooth_notes_level': 0.65,
1674
+ 'continuous_vibrato_level': 0.6,
1675
+ 'noise_level': 0.05,
1676
+ 'distortion_level': 0.05,
1677
+ 'fm_modulation_depth': 0.02,
1678
+ 'fm_modulation_rate': 20
1679
+ },
1680
+ # --- Sound FX & Experimental ---
1681
  "Sci-Fi Energy Field": {
1682
  # Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields.
1683
  'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
 
1723
  'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5,
1724
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1725
  },
1726
+ # --- Utility & Starting Points ---
1727
  "Generic Chiptune Loop": {
1728
  # Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds.
1729
  'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
 
1733
  'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
1734
  'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
1735
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1736
  }
1737
 
1738
+ # --- Function to control visibility of BOTH new UI elements ---
1739
def update_vocal_ui_visibility(separate_vocals):
    """Toggle the controls that only apply when vocal separation is on.

    Args:
        separate_vocals: Current value of the "Separate Vocals" checkbox.

    Returns:
        A 2-tuple holding the same ``gr.update(visible=...)`` object twice,
        one entry per output component wired to this callback.
    """
    # One update object is built and reused for both dependent controls.
    visibility_update = gr.update(visible=separate_vocals)
    return (visibility_update,) * 2
1743
+
1744
  app = gr.Blocks(theme=gr.themes.Base())
1745
 
1746
  with app:
 
1783
  info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
1784
  )
1785
 
1786
+ # --- Vocal Separation Checkboxes ---
1787
+ with gr.Group():
1788
+ separate_vocals = gr.Checkbox(
1789
+ label="Separate Vocals",
1790
+ value=False,
1791
+ info="If checked, separates the audio into vocals and music stems before processing."
1792
+ )
1793
+ transcription_target = gr.Radio(
1794
+ ["Transcribe Music (Accompaniment)", "Transcribe Vocals"],
1795
+ label="Transcription Target",
1796
+ value="Transcribe Music (Accompaniment)",
1797
+ info="Choose which part of the separated audio to transcribe to MIDI.",
1798
+ visible=False # Initially hidden
1799
+ )
1800
+ remerge_vocals = gr.Checkbox(
1801
+ label="Re-merge Other Part with Rendered Audio",
1802
+ value=False,
1803
+ info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.",
1804
+ visible=False # Initially hidden
1805
+ )
1806
+
1807
  with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
1808
  onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
1809
  frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
 
1920
  s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
1921
  s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
1922
  s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
1923
+ s8bit_decay_time_s = gr.Slider(0.01, 1.0, value=0.1, step=0.01, label="Decay Time (s)") # Max is 1.0 s so long-decay presets (e.g. DQ style at 0.6 s, vocal presets up to 1.0 s) fit on the slider
1924
  s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
1925
  s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
1926
  s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
 
1967
  # all_inputs now includes the preset selector itself
1968
  # Inputs for the main processing function
1969
  all_inputs = [
1970
+ input_file, s8bit_preset_selector,
1971
+ separate_vocals,
1972
+ remerge_vocals,
1973
+ transcription_target,
1974
+ enable_stereo_processing,
1975
  transcription_method, onset_threshold, frame_threshold, minimum_note_length,
1976
  minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
1977
  render_type, soundfont_bank, render_sample_rate, render_with_sustains,
 
2007
  inputs=all_inputs,
2008
  outputs=all_outputs # Pass the combined list
2009
  )
2010
+
2011
+ # --- The change event now controls TWO components ---
2012
+ separate_vocals.change(
2013
+ fn=update_vocal_ui_visibility,
2014
+ inputs=separate_vocals,
2015
+ outputs=[transcription_target, remerge_vocals] # Update both components
2016
+ )
2017
 
2018
  # --- Listeners for dynamic UI updates ---
2019
  transcription_method.change(
requirements.txt CHANGED
@@ -1,8 +1,9 @@
1
  --extra-index-url https://download.pytorch.org/whl/cu128
2
 
3
  torch
 
4
  numpy
5
- gradio
6
  mido
7
  librosa
8
  torchlibrosa
@@ -18,9 +19,12 @@ psutil
18
  pretty_midi
19
  soundfile
20
  pyloudnorm
 
21
  piano_transcription_inference
22
 
23
  basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
24
  basic-pitch[tf] @ git+https://github.com/avan06/basic-pitch; sys_platform == 'linux'
25
 
26
- git+https://github.com/avan06/pyfluidsynth
 
 
 
1
  --extra-index-url https://download.pytorch.org/whl/cu128
2
 
3
  torch
4
+ torchaudio
5
  numpy
6
+ gradio >= 5.42.0
7
  mido
8
  librosa
9
  torchlibrosa
 
19
  pretty_midi
20
  soundfile
21
  pyloudnorm
22
+
23
  piano_transcription_inference
24
 
25
  basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
26
  basic-pitch[tf] @ git+https://github.com/avan06/basic-pitch; sys_platform == 'linux'
27
 
28
+ git+https://github.com/avan06/pyfluidsynth
29
+
30
+ demucs