feat: Add dual-stem MIDI transcription and `basic-pitch` profiles
**1. Dual-Stem MIDI Transcription:**
- When using vocal separation, a new option allows transcribing **both** the vocal and accompaniment stems independently.
- The two resulting MIDI files are then automatically merged into a single, more complete MIDI file for rendering, which improves transcription quality on complex tracks (see the sketch after this list).
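
For reference, a minimal sketch of the MIDI-merge step, mirroring the new dual-stem logic in `app.py` (the helper name `merge_stem_midis` is illustrative, not part of the app):

```python
import pretty_midi

def merge_stem_midis(midi_path_primary: str, midi_path_other: str, out_path: str) -> str:
    """Combine two per-stem transcriptions into one multi-track MIDI file."""
    primary = pretty_midi.PrettyMIDI(midi_path_primary)
    other = pretty_midi.PrettyMIDI(midi_path_other)
    # Carry every instrument over, renaming so the source stem stays identifiable.
    for instrument in other.instruments:
        instrument.name = f"Other - {instrument.name}"
        primary.instruments.append(instrument)
    primary.write(out_path)
    return out_path
```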
**2. `basic-pitch` Profile Presets:**
- A "Transcription Profile Preset" dropdown has been added to the UI for the general-purpose transcription method.
- Includes a library of presets optimized for different instruments (Vocals, Piano, Drums) and genres (Rock, Jazz, Classical).
- Selecting a profile automatically configures all `basic-pitch` parameters for better results on specific audio types; a minimal sketch of the mechanism follows below.
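
In miniature, a preset is just a dictionary of `basic-pitch` arguments that gets written into the UI controls. The "Solo Vocals" values below are copied from the new `BASIC_PITCH_PRESETS` table in `app.py`; the lookup helper is illustrative:

```python
BASIC_PITCH_PRESETS = {
    "Solo Vocals": {
        'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100,
        'min_freq': 80, 'max_freq': 1200,
        'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True,
    },
    # ... other instrument and genre profiles ...
}

def preset_values(preset_name: str) -> dict:
    # "Custom" (or any unknown name) returns {} so the current settings stay untouched.
    return BASIC_PITCH_PRESETS.get(preset_name, {})
```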
- app.py +411 -177
- requirements.txt +1 -1
app.py
CHANGED
@@ -1,4 +1,4 @@
-# =================================================================
+# =================================================================
 #
 # Merged and Integrated Script for Audio/MIDI Processing and Rendering (Stereo Enhanced)
 #
@@ -39,6 +39,7 @@
 #
 # =================================================================
 
+import io
 import os
 import hashlib
 import time as reqtime
@@ -48,6 +49,7 @@ import pyloudnorm as pyln
 import soundfile as sf
 
 import torch
+import ffmpeg
 import gradio as gr
 
 # --- Imports for Vocal Separation ---
@@ -185,7 +187,7 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s,
     total_duration = midi_data.get_end_time()
     # Initialize a stereo waveform buffer (2 channels: Left, Right)
     waveform = np.zeros((2, int(total_duration * fs) + fs))
-
+
     num_instruments = len(midi_data.instruments)
 
     # Phase tracking: main oscillator phase for each instrument
@@ -320,7 +322,7 @@ def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s,
 def analyze_midi_velocity(midi_path):
     midi = pretty_midi.PrettyMIDI(midi_path)
     all_velocities = []
-
+
     print(f"Analyzing velocity for MIDI: {midi_path}")
     for i, instrument in enumerate(midi.instruments):
         velocities = [note.velocity for note in instrument.notes]
@@ -348,13 +350,13 @@ def analyze_midi_velocity(midi_path):
 def scale_instrument_velocity(instrument, scale=0.8):
     for note in instrument.notes:
         note.velocity = max(1, min(127, int(note.velocity * scale)))
-
+
 
 def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
     """
     Normalizes the audio data to a target integrated loudness (LUFS).
     This provides more consistent perceived volume than peak normalization.
-
+
     Args:
         audio_data (np.ndarray): The audio signal.
         sample_rate (int): The sample rate of the audio.
@@ -400,7 +402,7 @@ def merge_midis(midi_path_left, midi_path_right, output_path):
     Merges two MIDI files into a single MIDI file. This robust version iterates
     through ALL instruments in both MIDI files, ensuring no data is lost if the
     source files are multi-instrumental.
-
+
     It applies hard-left panning (Pan=0) to every instrument from the left MIDI
     and hard-right panning (Pan=127) to every instrument from the right MIDI.
     """
@@ -479,7 +481,7 @@ def TranscribePianoAudio(input_file):
     print('=' * 70)
     print('STAGE 1: Starting Piano-Specific Transcription')
     print('=' * 70)
-
+
     # Generate a unique output filename for the MIDI
     fn = os.path.basename(input_file)
     fn1 = fn.split('.')[0]
@@ -529,7 +531,7 @@ def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len,
     print('=' * 70)
     print('STAGE 1: Starting General Purpose Transcription')
     print('=' * 70)
-
+
     fn = os.path.basename(input_file)
     fn1 = fn.split('.')[0]
     output_dir = os.path.join("output", "transcribed_general_")
@@ -867,7 +869,7 @@ def Render_MIDI(input_midi_path,
 def analyze_midi_features(midi_data):
     """
     Analyzes a PrettyMIDI object to extract musical features for parameter recommendation.
-
+
     Args:
         midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze.
 
@@ -1044,12 +1046,81 @@
 # === Main Application Logic ===
 # =================================================================================================
 
+
+# --- Helper function to encapsulate the transcription pipeline for a single audio file ---
+def _transcribe_stem(audio_path, base_name, temp_dir,
+                     # Pass all transcription-related parameters
+                     enable_stereo, transcription_method,
+                     onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                     infer_onsets_bool, melodia_trick_bool, multiple_bends_bool):
+    """
+    Takes a single audio file path and runs the full transcription pipeline on it.
+    This includes stereo/mono handling and normalization.
+    Returns the file path of the resulting transcribed MIDI.
+    """
+    print(f"\n--- Transcribing Stem: {os.path.basename(audio_path)} ---")
+
+    # Load the audio stem to process it
+    audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False)
+
+    if enable_stereo and audio_data.ndim == 2 and audio_data.shape[0] == 2:
+        print("Stereo processing enabled for stem.")
+        left_channel_np = audio_data[0]
+        right_channel_np = audio_data[1]
+
+        normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
+        normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
+
+        temp_left_path = os.path.join(temp_dir, f"{base_name}_left.flac")
+        temp_right_path = os.path.join(temp_dir, f"{base_name}_right.flac")
+
+        sf.write(temp_left_path, normalized_left, native_sample_rate)
+        sf.write(temp_right_path, normalized_right, native_sample_rate)
+
+        print(f"Saved left channel to: {temp_left_path}")
+        print(f"Saved right channel to: {temp_right_path}")
+
+        print("Transcribing left and right channel...")
+        if transcription_method == "General Purpose":
+            midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+            midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+        else:  # Piano-Specific
+            midi_path_left = TranscribePianoAudio(temp_left_path)
+            midi_path_right = TranscribePianoAudio(temp_right_path)
+
+        if midi_path_left and midi_path_right:
+            merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
+            return merge_midis(midi_path_left, midi_path_right, merged_midi_path)
+        elif midi_path_left:
+            print("Warning: Right channel transcription failed. Using left channel only.")
+            return midi_path_left
+        elif midi_path_right:
+            print("Warning: Left channel transcription failed. Using right channel only.")
+            return midi_path_right
+        else:
+            print(f"Warning: Stereo transcription failed for stem {base_name}.")
+            return None
+    else:
+        print("Mono processing for stem.")
+        mono_signal_np = np.mean(audio_data, axis=0) if audio_data.ndim > 1 else audio_data
+        normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
+        temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.flac")
+        sf.write(temp_mono_path, normalized_mono, native_sample_rate)
+
+        if transcription_method == "General Purpose":
+            return TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+        else:
+            return TranscribePianoAudio(temp_mono_path)
+
+# --- The main processing function is now significantly refactored ---
 def process_and_render_file(input_file,
                             # --- Pass the preset selector value ---
                             s8bit_preset_selector,
                             separate_vocals,
                             remerge_vocals,
                             transcription_target,
+                            # --- ADDED: New parameter from UI ---
+                            transcribe_both_stems,
                             # --- Transcription params ---
                             enable_stereo_processing,
                             transcription_method,
@@ -1082,140 +1153,164 @@ def process_and_render_file(input_file,
     # This will store the other part if separation is performed
     other_part_tensor = None
     other_part_sr = None
-
+
     # --- Step 1: Check file type and transcribe if necessary ---
     if filename.lower().endswith(('.mid', '.midi', '.kar')):
-        print("MIDI file detected. Proceeding directly to rendering.")
+        print("MIDI file detected. Cannot perform vocal separation. Proceeding directly to rendering.")
         midi_path_for_rendering = input_file_path
-    else:
-        print("Audio file detected. Starting
-
+    else:
+        print("Audio file detected. Starting pre-processing...")
+
+        # --- Robust audio loading with ffmpeg fallback ---
         try:
-            #
-            # This
+            # Try loading directly with torchaudio (efficient for supported formats).
+            # This works for formats like WAV, MP3, FLAC, OGG, etc.
+            print("Attempting to load audio with torchaudio...")
             audio_tensor, native_sample_rate = torchaudio.load(input_file_path)
+            print("Torchaudio loading successful.")
         except Exception as e:
-
+            print(f"Torchaudio failed: {e}. Attempting fallback with ffmpeg...")
+            try:
+                # Use ffmpeg to convert the audio to WAV in-memory, then load the bytes.
+                out, err = (
+                    ffmpeg
+                    .input(input_file_path)
+                    .output('pipe:', format='flac')
+                    .run(capture_stdout=True, capture_stderr=True)
+                )
+                # Load the WAV data from the in-memory buffer
+                audio_tensor, native_sample_rate = torchaudio.load(io.BytesIO(out))
+                print("FFmpeg fallback successful.")
+            except Exception as ffmpeg_err:
+                # If both direct loading and ffmpeg fallback fail, raise an error.
+                raise gr.Error(f"Failed to load audio file with both torchaudio and ffmpeg.\n"
+                               f"Torchaudio error: {e}\n"
+                               f"FFmpeg error: {ffmpeg_err.decode() if isinstance(ffmpeg_err, bytes) else ffmpeg_err}")
 
+        base_name = os.path.splitext(filename)[0]
+        temp_dir = "output/temp_transcribe"
+        os.makedirs(temp_dir, exist_ok=True)
+
         # --- Demucs Vocal Separation Logic, now decides which stem to process ---
-        if separate_vocals:
+        if not separate_vocals:
+            # --- Standard Workflow: Transcribe the original full audio ---
+            print("Standard workflow: No vocal separation.")
+            audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}_original.flac")
+            torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate)
+            midi_path_for_rendering = _transcribe_stem(
+                audio_to_transcribe_path, f"{base_name}_original", temp_dir,
+                enable_stereo_processing, transcription_method,
+                onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
+            )
+        else:
+            # --- Vocal Separation Workflow ---
             if demucs_model is None:
                 raise gr.Error("Demucs model is not loaded. Cannot separate vocals.")
 
             # Convert to a common format (stereo, float32) that demucs expects
             audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
-
+
             if torch.cuda.is_available():
                 audio_tensor = audio_tensor.cuda()
 
             print("Separating audio with Demucs... This may take some time.")
-
+            # --- Wrap the model call in a no_grad() context ---
+            with torch.no_grad():
+                all_stems = apply_model(
+                    demucs_model,
+                    audio_tensor[None],  # The input shape is [batch, channels, samples]
+                    device='cuda' if torch.cuda.is_available() else 'cpu',
+                    progress=True,
+                )[0]  # Remove the batch dimension from the output
+
+            # --- Clear CUDA cache immediately after use ---
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                print("CUDA cache cleared.")
 
-
-            #
-
+            # --- Robust stem handling to prevent CUDA errors ---
+            # Instead of complex GPU indexing, we create a dictionary of stems on the CPU.
+            # This is safer and more robust across different hardware.
+            sources = {}
+            for i, source_name in enumerate(demucs_model.sources):
+                sources[source_name] = all_stems[i]
+
+            vocals_tensor = sources['vocals']
 
-
-
+            # Sum the other stems to create the accompaniment.
+            # This loop is safer than a single complex indexing operation.
+            accompaniment_tensor = torch.zeros_like(vocals_tensor)
+            for source_name, stem_tensor in sources.items():
+                if source_name != 'vocals':
+                    accompaniment_tensor += stem_tensor
+
+            # --- Save both stems to temporary files ---
+            vocals_path = os.path.join(temp_dir, f"{base_name}_vocals.flac")
+            accompaniment_path = os.path.join(temp_dir, f"{base_name}_accompaniment.flac")
+            torchaudio.save(vocals_path, vocals_tensor.cpu(), demucs_model.samplerate)
+            torchaudio.save(accompaniment_path, accompaniment_tensor.cpu(), demucs_model.samplerate)
 
-            # ---
-            if transcription_target == "Transcribe Vocals"
-
-
-
-
-                print("Target: Transcribing MUSIC (ACCOMPANIMENT).")
-                tensor_to_process = accompaniment_tensor
-                other_part_tensor = vocals_tensor # Save vocals for re-merging
-
+            # --- Determine which stem is the primary target and which is the "other part" ---
+            primary_target_path = vocals_path if transcription_target == "Transcribe Vocals" else accompaniment_path
+            other_part_path = accompaniment_path if transcription_target == "Transcribe Vocals" else vocals_path
+
+            # Store the audio tensor of the "other part" for potential audio re-merging
+            other_part_tensor = accompaniment_tensor if transcription_target == "Transcribe Vocals" else vocals_tensor
             other_part_sr = demucs_model.samplerate
-            audio_tensor = tensor_to_process # The audio to be processed is now the chosen stem
-            native_sample_rate = demucs_model.samplerate # Update sample rate to match demucs output
             print("Separation complete.")
-
-
-
-
-
-
-
-
-
-
-
-
-            # We work with a CPU copy of the tensor.
-            audio_data_np = audio_tensor.cpu().numpy()
-
-            # === STEREO PROCESSING LOGIC ===
-            if enable_stereo_processing:
-                if audio_data_np.ndim != 2 or audio_data_np.shape[0] != 2:
-                    print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
-                    enable_stereo_processing = False # Disable stereo processing if audio is not stereo
-
-            if enable_stereo_processing:
-                print("Stereo processing enabled. Splitting, normalizing, and transcribing channels...")
-                try:
-                    left_channel_np = audio_data_np[0]
-                    right_channel_np = audio_data_np[1]
-
-                    normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
-                    normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
-
-                    temp_left_path = os.path.join(temp_dir, f"{base_name}_left.wav")
-                    temp_right_path = os.path.join(temp_dir, f"{base_name}_right.wav")
+
+            # --- Main Branching Logic: Transcribe one or both stems ---
+            if not transcribe_both_stems:
+                print(f"Transcribing primary target only: {os.path.basename(primary_target_path)}")
+                midi_path_for_rendering = _transcribe_stem(
+                    primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir,
+                    enable_stereo_processing, transcription_method,
+                    onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                    infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
+                )
+            else:
+                print("Transcribing BOTH stems and merging the MIDI results.")
 
-
-
-
-
-
+                # Transcribe the primary target
+                midi_path_primary = _transcribe_stem(
+                    primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir,
+                    enable_stereo_processing, transcription_method,
+                    onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                    infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
+                )
 
-
-
-
-
-
-
-
+                # Transcribe the other part
+                midi_path_other = _transcribe_stem(
+                    other_part_path, os.path.splitext(os.path.basename(other_part_path))[0], temp_dir,
+                    enable_stereo_processing, transcription_method,
+                    onset_thresh, frame_thresh, min_note_len, min_freq, max_freq,
+                    infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
+                )
 
-
-
-
-
-
-
-
-
-
+                # Merge the two resulting MIDI files
+                if midi_path_primary and midi_path_other:
+                    final_merged_midi_path = os.path.join(temp_dir, f"{base_name}_full_transcription.mid")
+                    print(f"Merging transcribed MIDI files into {os.path.basename(final_merged_midi_path)}")
+
+                    # A more robust MIDI merge is needed here
+                    primary_midi = pretty_midi.PrettyMIDI(midi_path_primary)
+                    other_midi = pretty_midi.PrettyMIDI(midi_path_other)
+
+                    # Add all instruments from the other midi to the primary one
+                    for instrument in other_midi.instruments:
+                        instrument.name = f"Other - {instrument.name}"  # Rename to avoid confusion
+                        primary_midi.instruments.append(instrument)
+
+                    primary_midi.write(final_merged_midi_path)
+                    midi_path_for_rendering = final_merged_midi_path
+                elif midi_path_primary:
+                    print("Warning: Transcription of the 'other' part failed. Using primary transcription only.")
+                    midi_path_for_rendering = midi_path_primary
                 else:
-
-
-
-                    print(f"An error occurred during stereo processing: {e}")
-                    raise gr.Error(f"Stereo Processing Failed: {e}")
-            else: # Standard mono transcription
-                print("Mono processing. Normalizing and transcribing audio...")
-                # If the audio is stereo but stereo processing is disabled, convert to mono.
-                if audio_data_np.shape[0] == 2:
-                    mono_signal_np = np.mean(audio_data_np, axis=0)
-                else:
-                    mono_signal_np = audio_data_np[0]
-
-                normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
-                temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
-                sf.write(temp_mono_path, normalized_mono, native_sample_rate)
-
-                try:
-                    if transcription_method == "General Purpose":
-                        midi_path_for_rendering = TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
-                    else: # Piano-Specific
-                        midi_path_for_rendering = TranscribePianoAudio(temp_mono_path)
-                except Exception as e:
-                    print(f"An error occurred during transcription: {e}")
-                    raise gr.Error(f"Transcription Failed: {e}")
-
-        # --- Step 2: Render the MIDI file with selected options ---
+                    raise gr.Error("Transcription of the primary target failed. Aborting.")
+
+        # --- Step 2: Render the FINAL MIDI file with selected options ---
 
     # --- Auto-Recommendation Logic ---
     # Store the original parameters from the UI sliders into a dictionary.
@@ -1272,7 +1367,8 @@ def process_and_render_file(input_file,
     )
 
     # --- Vocal Re-merging Logic now uses the generic "other_part" ---
-
+    # IMPORTANT: This only runs if we did NOT transcribe both stems.
+    if separate_vocals and remerge_vocals and not transcribe_both_stems and other_part_tensor is not None:
         print(f"Re-merging the non-transcribed part with newly rendered music...")
 
         rendered_srate, rendered_music_int16 = results[4]
@@ -1330,7 +1426,7 @@ def process_and_render_file(input_file,
     # We send a gr.update() for each UI component.
     for _ in param_order:
        final_ui_updates.append(gr.update())
-
+
    # The final return is a combination of the result values and the UI update values.
    return list(results) + final_ui_updates
 
@@ -1338,52 +1434,6 @@
 # === Gradio UI Setup ===
 # =================================================================================================
 
-def update_ui_visibility(transcription_method, soundfont_choice):
-    """
-    Dynamically updates the visibility of UI components based on user selections.
-    """
-    is_general = (transcription_method == "General Purpose")
-    is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL)
-
-    return {
-        general_transcription_settings: gr.update(visible=is_general),
-        synth_8bit_settings: gr.update(visible=is_8bit),
-    }
-
-# --- Function to apply 8-bit synthesizer presets ---
-# --- This function must be defined before the UI components that use it ---
-def apply_8bit_preset(preset_name):
-    """
-    Takes the name of a preset and returns a dictionary of gr.update objects
-    to set the values of all 13 of the 8-bit synthesizer's UI components.
-    """
-    # --- Use a list of keys for consistent updates ---
-    param_keys = [
-        'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate',
-        'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level',
-        'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate'
-    ]
-
-    # If the user selects "Custom" or the preset is not found, do not change the values.
-    if preset_name == "Custom" or preset_name not in S8BIT_PRESETS:
-        # When switching to custom, don't change any values, just return empty updates.
-        return {comp: gr.update() for comp in s8bit_ui_components}
-
-    # Get the settings dictionary for the chosen preset.
-    settings = S8BIT_PRESETS[preset_name]
-
-    # Create a dictionary mapping UI components to their new values from the preset.
-    update_dict = {}
-    for i, key in enumerate(param_keys):
-        component = s8bit_ui_components[i]
-        value = settings.get(key)
-        if value is not None:
-            update_dict[component] = gr.update(value=value)
-        else:
-            update_dict[component] = gr.update()
-    return update_dict
-
-
 if __name__ == "__main__":
     # Initialize the app: download model (if needed) and apply patches
     # Set to False if you don't have 'requests' or 'tqdm' installed
@@ -1735,11 +1785,154 @@ if __name__ == "__main__":
     },
     }
 
-    # ---
-
-
+    # --- Data structure for basic_pitch transcription presets ---
+    BASIC_PITCH_PRESETS = {
+        # --- General & All-Purpose ---
+        "Default (Balanced)": {
+            'description': "A good all-around starting point for most music types.",
+            'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 128,
+            'min_freq': 60, 'max_freq': 4000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False
+        },
+        "Anime / J-Pop": {
+            'description': "For tracks with clear melodies and pop/rock arrangements.",
+            'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 150,
+            'min_freq': 40, 'max_freq': 2500,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+
+        # --- Specific Instruments ---
+        "Solo Vocals": {
+            'description': "Optimized for a single singing voice. Sensitive to nuances.",
+            'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100,
+            'min_freq': 80, 'max_freq': 1200,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+        "Solo Piano": {
+            'description': "For solo piano with a wide dynamic and frequency range.",
+            'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 120,
+            'min_freq': 27, 'max_freq': 4200,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+        "Acoustic Guitar": {
+            'description': "Balanced for picked or strummed acoustic guitar.",
+            'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 90,
+            'min_freq': 80, 'max_freq': 2500,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False
+        },
+        "Bass Guitar": {
+            'description': "Isolates and transcribes only the low frequencies of a bassline.",
+            'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100,
+            'min_freq': 30, 'max_freq': 400,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False
+        },
+        "Percussion / Drums": {
+            'description': "For drums and rhythmic elements. Catches fast, sharp hits.",
+            'onset_thresh': 0.7, 'frame_thresh': 0.6, 'min_note_len': 30,
+            'min_freq': 40, 'max_freq': 10000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False
+        },
+
+        # --- Complex Genres ---
+        "Rock / Metal": {
+            'description': "Higher thresholds for distorted guitars, bass, and drums in a dense mix.",
+            'onset_thresh': 0.6, 'frame_thresh': 0.4, 'min_note_len': 100,
+            'min_freq': 50, 'max_freq': 3000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+        "Jazz (Multi-instrument)": {
+            'description': "High thresholds to separate notes in complex, improvisational passages.",
+            'onset_thresh': 0.7, 'frame_thresh': 0.5, 'min_note_len': 150,
+            'min_freq': 55, 'max_freq': 2000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': True
+        },
+        "Classical (Orchestral)": {
+            'description': "Longer note length to focus on sustained notes and filter out performance noise.",
+            'onset_thresh': 0.5, 'frame_thresh': 0.4, 'min_note_len': 200,
+            'min_freq': 32, 'max_freq': 4200,
+            'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True
+        },
+        "Electronic / Synth": {
+            'description': "Low thresholds and short note length for sharp, synthetic sounds.",
+            'onset_thresh': 0.3, 'frame_thresh': 0.2, 'min_note_len': 50,
+            'min_freq': 20, 'max_freq': 8000,
+            'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False
+        }
+    }
+
+
+    # --- UI visibility logic now controls three components ---
+    def update_vocal_ui_visibility(separate_vocals, remerge_audio):
+        """Shows or hides the separation-related UI controls based on selections."""
         is_visible = gr.update(visible=separate_vocals)
-
+        # The "Transcribe Both" checkbox is only visible if separation AND re-merging are active
+        transcribe_both_visible = gr.update(visible=(separate_vocals and remerge_audio))
+        return is_visible, is_visible, transcribe_both_visible
+
+    def update_ui_visibility(transcription_method, soundfont_choice):
+        """
+        Dynamically updates the visibility of UI components based on user selections.
+        """
+        is_general = (transcription_method == "General Purpose")
+        is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL)
+
+        return {
+            general_transcription_settings: gr.update(visible=is_general),
+            synth_8bit_settings: gr.update(visible=is_8bit),
+        }
+
+    # --- Controller function to apply basic_pitch presets to the UI ---
+    def apply_basic_pitch_preset(preset_name):
+        if preset_name not in BASIC_PITCH_PRESETS:
+            # If "Custom" is selected or name is invalid, don't change anything
+            return {comp: gr.update() for comp in basic_pitch_ui_components}
+
+        settings = BASIC_PITCH_PRESETS[preset_name]
+
+        # Return a dictionary that maps each UI component to its new value
+        return {
+            onset_threshold: gr.update(value=settings['onset_thresh']),
+            frame_threshold: gr.update(value=settings['frame_thresh']),
+            minimum_note_length: gr.update(value=settings['min_note_len']),
+            minimum_frequency: gr.update(value=settings['min_freq']),
+            maximum_frequency: gr.update(value=settings['max_freq']),
+            infer_onsets: gr.update(value=settings['infer_onsets_bool']),
+            melodia_trick: gr.update(value=settings['melodia_trick_bool']),
+            multiple_pitch_bends: gr.update(value=settings['multiple_bends_bool'])
+        }
+
+    # --- Function to apply 8-bit synthesizer presets ---
+    # --- This function must be defined before the UI components that use it ---
+    def apply_8bit_preset(preset_name):
+        """
+        Takes the name of a preset and returns a dictionary of gr.update objects
+        to set the values of all 13 of the 8-bit synthesizer's UI components.
+        """
+        # --- Use a list of keys for consistent updates ---
+        param_keys = [
+            'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate',
+            'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level',
+            'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate'
+        ]
+
+        # If the user selects "Custom" or the preset is not found, do not change the values.
+        if preset_name == "Custom" or preset_name not in S8BIT_PRESETS:
+            # When switching to custom, don't change any values, just return empty updates.
+            return {comp: gr.update() for comp in s8bit_ui_components}
+
+        # Get the settings dictionary for the chosen preset.
+        settings = S8BIT_PRESETS[preset_name]
+
+        # Create a dictionary mapping UI components to their new values from the preset.
+        update_dict = {}
+        for i, key in enumerate(param_keys):
+            component = s8bit_ui_components[i]
+            value = settings.get(key)
+            if value is not None:
+                update_dict[component] = gr.update(value=value)
+            else:
+                update_dict[component] = gr.update()
+        return update_dict
 
     app = gr.Blocks(theme=gr.themes.Base())
 
@@ -1803,8 +1996,25 @@ if __name__ == "__main__":
                 info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.",
                 visible=False # Initially hidden
             )
+            # --- New checkbox for transcribing both stems ---
+            transcribe_both_stems = gr.Checkbox(
+                label="Transcribe Both Parts & Merge MIDI",
+                value=False,
+                info="If checked, transcribes BOTH vocals and music, then merges them into one MIDI file for rendering. Disables audio re-merging.",
+                visible=False # Initially hidden
+            )
 
             with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
+                # --- Preset dropdown for basic_pitch ---
+                basic_pitch_preset_selector = gr.Dropdown(
+                    choices=["Custom"] + list(BASIC_PITCH_PRESETS.keys()),
+                    value="Default (Balanced)",
+                    label="Transcription Profile Preset",
+                    info="Select a profile to auto-fill settings for different instrument types. "
+                         "For reference only; it is recommended to test and adjust for optimal results."
+                )
+
+                # --- The existing basic_pitch components ---
                 onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
                 frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
                 minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.")
@@ -1967,10 +2177,12 @@ if __name__ == "__main__":
     # all_inputs now includes the preset selector itself
     # Inputs for the main processing function
     all_inputs = [
-        input_file,
-
+        input_file,
+        s8bit_preset_selector,
+        separate_vocals,
         remerge_vocals,
         transcription_target,
+        transcribe_both_stems,
         enable_stereo_processing,
         transcription_method, onset_threshold, frame_threshold, minimum_note_length,
         minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
@@ -1989,6 +2201,13 @@ if __name__ == "__main__":
         output_midi, output_audio, output_plot, output_song_description
     ]
 
+    # The list of basic_pitch UI components that can be updated by its preset selector.
+    # This MUST be defined after the components themselves are created in the UI.
+    basic_pitch_ui_components = [
+        onset_threshold, frame_threshold, minimum_note_length, minimum_frequency,
+        maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends
+    ]
+
     # The list of 8-bit UI components that can be updated
     # This MUST be defined after the components themselves are created in the UI.
     s8bit_ui_components = [
@@ -2007,12 +2226,20 @@ if __name__ == "__main__":
         inputs=all_inputs,
         outputs=all_outputs # Pass the combined list
     )
-
-    # ---
+
+    # --- Visibility logic is now more complex ---
+    # A simple lambda function to handle multiple inputs
+    update_visibility_lambda = lambda sep, rem: update_vocal_ui_visibility(sep, rem)
+
     separate_vocals.change(
-        fn=
-        inputs=separate_vocals,
-        outputs=[transcription_target, remerge_vocals]
+        fn=update_visibility_lambda,
+        inputs=[separate_vocals, remerge_vocals],
+        outputs=[transcription_target, remerge_vocals, transcribe_both_stems]
+    )
+    remerge_vocals.change(
+        fn=update_visibility_lambda,
+        inputs=[separate_vocals, remerge_vocals],
+        outputs=[transcription_target, remerge_vocals, transcribe_both_stems]
    )
 
     # --- Listeners for dynamic UI updates ---
@@ -2026,6 +2253,13 @@ if __name__ == "__main__":
         inputs=[transcription_method, soundfont_bank],
         outputs=[general_transcription_settings, synth_8bit_settings]
     )
+
+    # --- Event listener for the new basic_pitch preset dropdown ---
+    basic_pitch_preset_selector.change(
+        fn=apply_basic_pitch_preset,
+        inputs=[basic_pitch_preset_selector],
+        outputs=basic_pitch_ui_components
+    )
 
     # This listener now correctly handles only the named presets, ignoring "Auto-Recommend"
     # --- Event listener for the preset selector ---
@@ -2038,7 +2272,7 @@ if __name__ == "__main__":
         inputs=[s8bit_preset_selector],
         outputs=s8bit_ui_components # This now correctly targets the new sliders
    )
-
+
 
    # Launch the Gradio app
    app.queue().launch(inbrowser=True, debug=True)
requirements.txt
CHANGED
@@ -19,7 +19,7 @@ psutil
 pretty_midi
 soundfile
 pyloudnorm
-
+ffmpeg-python
 piano_transcription_inference
 
 basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
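
The new `ffmpeg-python` dependency backs the audio-loading fallback added in `app.py`: when `torchaudio.load` cannot read a container, the file is decoded to FLAC in memory and the bytes are handed back to torchaudio. A standalone sketch of that pattern (the function name is illustrative):

```python
import io

import ffmpeg
import torchaudio

def load_audio_with_fallback(path: str):
    """Return (waveform_tensor, sample_rate), decoding via ffmpeg if needed."""
    try:
        return torchaudio.load(path)  # fast path for natively supported formats
    except Exception:
        out, _err = (
            ffmpeg
            .input(path)
            .output('pipe:', format='flac')  # decode to FLAC entirely in memory
            .run(capture_stdout=True, capture_stderr=True)
        )
        return torchaudio.load(io.BytesIO(out))
```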