feat(separation): Integrate BS-RoFormer & Mel-RoFormer models
This commit introduces support for two additional audio separation models,
BS-RoFormer and Mel-RoFormer, giving users more specialized options for
vocal and instrumental separation.
- app.py +166 -44
- requirements.txt +4 -1
app.py
CHANGED
@@ -61,6 +61,7 @@ import torchaudio
 from demucs.apply import apply_model
 from demucs.pretrained import get_model
 from demucs.audio import convert_audio
+from audio_separator.separator import Separator

 from src.piano_transcription.utils import initialize_app
 from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate
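For reviewers unfamiliar with the package: this import is the only API surface the commit takes from audio-separator. A minimal usage sketch, assuming (as the code below does) that model files are resolved by filename and that separate() returns the names of the files it wrote under output_dir:

    from audio_separator.separator import Separator

    # Paths and the model filename mirror the ones used later in this commit.
    separator = Separator(output_dir="output/temp_transcribe", output_format="flac")
    separator.load_model("model_bs_roformer_ep_317_sdr_12.9755.ckpt")
    stem_files = separator.separate("song.flac")
    print(stem_files)  # e.g. ["song_(Vocals)_....flac", "song_(Instrumental)_....flac"]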
@@ -106,6 +107,7 @@ class AppParameters:
     # Global Settings
     s8bit_preset_selector: str = "Custom"
     separate_vocals: bool = False
+    separation_model: str = "Demucs (4-stem)"

     # --- Advanced Separation and Merging Controls ---
     enable_advanced_separation: bool = False # Controls visibility of advanced options
@@ -1609,7 +1611,7 @@ def TranscribePianoAudio(input_file):
     # Use 'cuda' if a GPU is available and configured, otherwise 'cpu'
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(f'Loading transcriptor model... device= {device}')
-    transcriptor = PianoTranscription(device=device, checkpoint_path="src
+    transcriptor = PianoTranscription(device=device, checkpoint_path=os.path.join("src", "models", "CRNN_note_F1=0.9677_pedal_F1=0.9186.pth"))
     print('Transcriptor loaded.')
     print('-' * 70)
@@ -2377,7 +2379,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa

         midi_path_for_rendering = input_file_path
     else:
-        temp_dir = "output
+        temp_dir = os.path.join("output", "temp_transcribe") # Define temp_dir early for the fallback
         os.makedirs(temp_dir, exist_ok=True)

         # --- Audio Loading ---
@@ -2413,44 +2415,98 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
             print(f"ERROR: Could not load {filename}. Skipping. FFmpeg error: {stderr}")
             return None # Return None to indicate failure

-        # ---
+        # --- Vocal Separation Logic ---
         # This block now handles multi-stem separation, transcription, and merging logic.
         separated_stems = {} # This will store the audio tensors for merging
+        sources = {} # This will hold the tensors for transcription processing

-        if params.separate_vocals
-            update_progress(0.2, "Separating audio with Demucs...")
-            # Convert to the format Demucs expects (e.g., 44.1kHz, stereo)
-            audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
-            # Move tensor to GPU if available for faster processing
-            if torch.cuda.is_available():
-                audio_tensor = audio_tensor.cuda()
-
-            print("Separating audio with Demucs... This may take some time.")
-            # --- Wrap the model call in a no_grad() context ---
-            with torch.no_grad():
-                all_stems = apply_model(
-                    demucs_model,
-                    audio_tensor[None], # The input shape is [batch, channels, samples]
-                    device='cuda' if torch.cuda.is_available() else 'cpu',
-                    progress=True
-                )[0] # Remove the batch dimension from the output
-
-            # ---
-            if
-            separated_stems[name] = (tensor.cpu(), demucs_model.samplerate)
+        if params.separate_vocals:
+            model_name = params.separation_model
+
+            # --- Demucs Separation Workflow (4-stem) ---
+            if 'Demucs' in model_name and demucs_model is not None:
+                update_progress(0.2, "Separating audio with Demucs...")
+                # Convert to the format Demucs expects (e.g., 44.1kHz, stereo)
+                audio_tensor_demucs = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
+                # Move tensor to GPU if available for faster processing
+                if torch.cuda.is_available():
+                    audio_tensor_demucs = audio_tensor_demucs.cuda()
+
+                print("Separating audio with Demucs... This may take some time.")
+                # --- Wrap the model call in a no_grad() context ---
+                with torch.no_grad():
+                    all_stems = apply_model(
+                        demucs_model,
+                        audio_tensor_demucs[None], # The input shape is [batch, channels, samples]
+                        device='cuda' if torch.cuda.is_available() else 'cpu',
+                        progress=True
+                    )[0] # Remove the batch dimension from the output
+
+                # --- Clear CUDA cache immediately after use ---
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                    print("CUDA cache cleared.")
+
+                # Populate sources for transcription and separated_stems for merging
+                sources = {name: stem for name, stem in zip(demucs_model.sources, all_stems)}
+
+                # --- Store original stems for potential re-merging ---
+                for name, tensor in sources.items():
+                    separated_stems[name] = (tensor.cpu(), demucs_model.samplerate)
+
+            # --- RoFormer Separation Workflow (2-stem) ---
+            elif ('BS-RoFormer' in model_name or 'Mel-RoFormer' in model_name):
+                if not separator_models:
+                    print("Warning: RoFormer models are not loaded. Skipping separation.")
+                    params.separate_vocals = False
+                else:
+                    roformer_key = 'BS-RoFormer' if 'BS-RoFormer' in model_name else 'Mel-RoFormer'
+                    update_progress(0.2, f"Separating audio with {roformer_key}...")
+
+                    temp_input_path = os.path.join(temp_dir, f"{timestamped_base_name}_roformer_in.flac")
+                    torchaudio.save(temp_input_path, audio_tensor.cpu(), native_sample_rate)
+
+                    try:
+                        separator = separator_models[roformer_key]
+                        output_paths = separator.separate(temp_input_path)
+
+                        vocals_path, accompaniment_path = None, None
+                        for path in output_paths:
+                            basename = os.path.basename(path).lower()
+                            path = os.path.join(temp_dir, path)
+                            if '(vocals)' in basename:
+                                vocals_path = path
+                            elif '(instrumental)' in basename:
+                                accompaniment_path = path
+
+                        if not vocals_path or not accompaniment_path:
+                            raise RuntimeError(f"Could not find expected vocal/instrumental stems in output: {output_paths}")
+
+                        print(f"Separation complete. Vocals: {os.path.basename(vocals_path)}, Accompaniment: {os.path.basename(accompaniment_path)}")
+
+                        vocals_tensor, stem_sr = torchaudio.load(vocals_path)
+                        accompaniment_tensor, stem_sr = torchaudio.load(accompaniment_path)
+
+                        # Populate 'sources' and 'separated_stems' to match Demucs structure
+                        # This ensures compatibility with downstream logic
+                        sources['vocals'] = vocals_tensor
+                        sources['other'] = accompaniment_tensor # The entire accompaniment
+                        sources['drums'] = torch.zeros_like(accompaniment_tensor) # Dummy tensor
+                        sources['bass'] = torch.zeros_like(accompaniment_tensor) # Dummy tensor

+                        for name, tensor in sources.items():
+                            separated_stems[name] = (tensor.cpu(), stem_sr)

+                    except Exception as e:
+                        print(f"ERROR: {roformer_key} separation failed: {e}. Skipping separation.")
+                        params.separate_vocals = False
+
+        # --- Prepare Stems for Transcription ---
+        if params.separate_vocals and sources: # Check if separation was successful
             stems_to_transcribe = {}
+            # NOTE: When a 2-stem model is used, the UI should ensure 'enable_advanced_separation' is False.
             if params.enable_advanced_separation:
-                # User is in advanced mode
+                # User is in advanced mode (Demucs only)
                 if params.transcribe_vocals:
                     stems_to_transcribe['vocals'] = sources['vocals']
                 if params.transcribe_drums:
@@ -2460,7 +2516,9 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
                 if params.transcribe_other_or_accompaniment:
                     stems_to_transcribe['other'] = sources['other']
             else:
-                #
+                # Simple mode (Demucs) or RoFormer mode
+                # This logic correctly combines drums, bass, and other. For RoFormer, drums/bass are zero,
+                # so this correctly results in just the accompaniment tensor.
                 accompaniment_tensor = sources['drums'] + sources['bass'] + sources['other']
                 if params.transcribe_vocals:
                     stems_to_transcribe['vocals'] = sources['vocals']
@@ -2471,10 +2529,13 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
         transcribed_midi_paths = []
         if stems_to_transcribe:
             stem_count = len(stems_to_transcribe)
+            # The samplerate of all stems from a single separation will be the same
+            stem_samplerate = separated_stems.get('vocals', (None, native_sample_rate))[1]
+
             for i, (name, tensor) in enumerate(stems_to_transcribe.items()):
                 update_progress(0.3 + (0.3 * (i / stem_count)), f"Transcribing stem: {name}...")
                 stem_path = os.path.join(temp_dir, f"{timestamped_base_name}_{name}.flac")
-                torchaudio.save(stem_path, tensor.cpu(),
+                torchaudio.save(stem_path, tensor.cpu(), stem_samplerate)
                 midi_path, used_bp_params = _transcribe_stem(stem_path, f"{timestamped_base_name}_{name}", temp_dir, params)
                 if midi_path:
                     transcribed_midi_paths.append((name, midi_path))
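One subtlety: the tuple default passed to dict.get() keeps the trailing [1] index safe even when no 'vocals' stem exists:

    separated_stems = {}  # e.g. separation was skipped or failed
    native_sample_rate = 44100
    # Falls back to (None, native_sample_rate), so [1] always yields a rate:
    stem_samplerate = separated_stems.get('vocals', (None, native_sample_rate))[1]
    assert stem_samplerate == 44100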
@@ -2554,7 +2615,7 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa

         # --- Final Audio Merging Logic ---
         stems_to_merge = []
-        if params.separate_vocals:
+        if params.separate_vocals and separated_stems:
             if params.merge_vocals_to_render and 'vocals' in separated_stems:
                 stems_to_merge.append(separated_stems['vocals'])

@@ -2565,10 +2626,12 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
             stems_to_merge.append(separated_stems['bass'])
             if params.merge_other_or_accompaniment and 'other' in separated_stems:
                 stems_to_merge.append(separated_stems['other'])
-        else: # Simple mode
-            if params.merge_other_or_accompaniment:
+        else: # Simple mode or RoFormer
+            if params.merge_other_or_accompaniment:
+                # This correctly combines the accompaniment, which for RoFormer is just the 'other' stem.
                 accompaniment_tensor = separated_stems['drums'][0] + separated_stems['bass'][0] + separated_stems['other'][0]
-
+                accompaniment_sr = separated_stems['other'][1]
+                stems_to_merge.append((accompaniment_tensor, accompaniment_sr))

         if stems_to_merge:
             update_progress(0.9, "Re-merging audio stems...")
@@ -2584,6 +2647,10 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
                 resampler = torchaudio.transforms.Resample(stem_srate, final_srate)
                 stem_tensor = resampler(stem_tensor)

+                # Ensure stem is stereo if mix is stereo
+                if final_mix_tensor.shape[0] == 2 and stem_tensor.shape[0] == 1:
+                    stem_tensor = stem_tensor.repeat(2, 1)
+
                 # Pad and add to the final mix
                 len_mix = final_mix_tensor.shape[1]
                 len_stem = stem_tensor.shape[1]
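The new channel check matters because RoFormer stems can come back mono while the rendered mix is stereo; repeat(2, 1) duplicates the single channel. A sketch of the upmix feeding into a pad-and-add merge (the F.pad call is an assumption standing in for the surrounding merge code, which this hunk does not show in full):

    import torch
    import torch.nn.functional as F

    final_mix_tensor = torch.zeros(2, 1000)    # stereo render
    stem_tensor = torch.randn(1, 800)          # mono stem

    if final_mix_tensor.shape[0] == 2 and stem_tensor.shape[0] == 1:
        stem_tensor = stem_tensor.repeat(2, 1) # -> shape (2, 800)

    # Right-pad the shorter stem with zeros, then mix by addition:
    pad = final_mix_tensor.shape[1] - stem_tensor.shape[1]
    final_mix_tensor += F.pad(stem_tensor, (0, pad))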
@@ -2613,8 +2680,8 @@ def run_single_file_pipeline(input_file_path: str, timestamp: str, params: AppPa
         final_midi_path_from_render = results_tuple[3] # Get the path of the processed MIDI

         # --- Use timestamped names for final outputs ---
-        output_audio_dir = "output
-        output_midi_dir = "output
+        output_audio_dir = os.path.join("output", "final_audio")
+        output_midi_dir = os.path.join("output", "final_midi")
         os.makedirs(output_audio_dir, exist_ok=True)
         os.makedirs(output_midi_dir, exist_ok=True)

@@ -2835,7 +2902,7 @@ if __name__ == "__main__":
     initialize_app()

     # --- Prepare soundfonts and make the map globally accessible ---
-    global soundfonts_dict, demucs_model
+    global soundfonts_dict, demucs_model, separator_models
     # On application start, download SoundFonts from Hugging Face Hub if they don't exist.
     soundfonts_dict = prepare_soundfonts()
     print(f"Found {len(soundfonts_dict)} local SoundFonts.")
@@ -2855,6 +2922,25 @@ if __name__ == "__main__":
         print(f"Warning: Could not load Demucs model. Vocal separation will not be available. Error: {e}")
         demucs_model = None

+    # --- Pre-load BS-RoFormer and Mel-RoFormer models ---
+    separator_models: dict[str, Separator] = {}
+    try:
+        temp_dir = os.path.join("output", "temp_transcribe")
+        print("Loading BS-RoFormer model...")
+        bs_roformer = Separator(output_dir=temp_dir, output_format='flac', model_file_dir=os.path.join("src", "models"))
+        bs_roformer.load_model("model_bs_roformer_ep_317_sdr_12.9755.ckpt")
+        separator_models['BS-RoFormer'] = bs_roformer
+        print("BS-RoFormer model loaded successfully.")
+
+        print("Loading Mel-RoFormer model...")
+        mel_roformer = Separator(output_dir=temp_dir, output_format='flac', model_file_dir=os.path.join("src", "models"))
+        mel_roformer.load_model("model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt")
+        separator_models['Mel-RoFormer'] = mel_roformer
+        print("Mel-RoFormer model loaded successfully.")
+
+    except Exception as e:
+        print(f"Warning: Could not load RoFormer models. They will not be available for separation. Error: {e}")
+
     # --- Dictionary containing descriptions for each render type ---
     RENDER_TYPE_DESCRIPTIONS = {
         "Render as-is": "**Mode: Pass-through.** Renders the MIDI file directly without any modifications. Advanced MIDI options will be ignored.",
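Loading both RoFormer models eagerly at startup means the first separation request pays no download or initialization cost, and a load failure degrades to a warning rather than a crash. The pattern in miniature (load_roformer() is a hypothetical stand-in for the Separator setup above):

    separator_models = {}
    try:
        separator_models['BS-RoFormer'] = load_roformer()  # hypothetical loader
    except Exception as e:
        print(f"Warning: could not load RoFormer models: {e}")

    # At request time, the empty dict doubles as an availability flag:
    if not separator_models:
        print("Warning: RoFormer models are not loaded. Skipping separation.")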
@@ -3385,6 +3471,19 @@ if __name__ == "__main__":
             merge_other_or_accompaniment: gr.update(label="Merge Accompaniment")
         }

+    # --- UI controller for handling model selection ---
+    def on_separation_model_change(model_choice):
+        """
+        Update the UI when the separation model changes.
+        If a 2-stem model (RoFormer) is selected, hide advanced (4-stem) controls.
+        """
+        is_demucs = 'Demucs' in model_choice
+        # For 2-stem models, we force simple mode (is_advanced=False)
+        updates = update_separation_mode_ui(is_advanced=False)
+        # Also hide the checkbox that allows switching to advanced mode
+        updates[enable_advanced_separation] = gr.update(visible=is_demucs, value=False)
+        return updates
+
     # --- Use the dataclass to define the master list of parameter keys ---
     # This is now the single source of truth for parameter order.
     ALL_PARAM_KEYS = [field.name for field in fields(AppParameters) if field.name not in ["input_file", "batch_input_files"]]
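on_separation_model_change relies on Gradio's dict-of-updates convention: an event handler may return a dict keyed by output components, and only those components are updated. A runnable miniature of the same wiring (component names here are illustrative, not the app's):

    import gradio as gr

    with gr.Blocks() as demo:
        model = gr.Radio(["Demucs (4-stem)", "BS-RoFormer (Vocals/Instrumental)"],
                         value="Demucs (4-stem)", label="Separation Model")
        advanced = gr.Checkbox(label="Enable Advanced Stem Control (Demucs Only)")

        def on_change(choice):
            is_demucs = 'Demucs' in choice
            # Dict return: only the components listed as keys are updated.
            return {advanced: gr.update(visible=is_demucs, value=False)}

        model.change(fn=on_change, inputs=model, outputs=[advanced])

    demo.launch()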
@@ -3475,14 +3574,21 @@ if __name__ == "__main__":

                 # --- Vocal Separation Group ---
                 with gr.Group():
-                    separate_vocals = gr.Checkbox(label="Enable Source Separation
+                    separate_vocals = gr.Checkbox(label="Enable Source Separation", value=False,
                         info="If checked, separates the audio into its component stems (vocals, drums, etc.) before processing.")

                     # --- Container for all separation options, visible only when enabled ---
                     with gr.Group(visible=False) as separation_options_box:
+                        separation_model = gr.Radio(
+                            ["Demucs (4-stem)", "BS-RoFormer (Vocals/Instrumental)", "Mel-RoFormer (Vocals/Instrumental)"],
+                            label="Separation Model",
+                            value="Demucs (4-stem)",
+                            info="Select the separation model. Demucs provides 4 stems (vocals, drums, bass, other). RoFormer models are specialized for 2-stem (vocals/instrumental) separation.",
+                        )
+
                         gr.Markdown("#### 1. Stem Separation Options")
-                        enable_advanced_separation = gr.Checkbox(label="Enable Advanced Stem Control (
-                            info="If checked, you can individually control drums, bass, and other. If unchecked, they are treated as a single 'Accompaniment' track.")
+                        enable_advanced_separation = gr.Checkbox(label="Enable Advanced Stem Control (Demucs Only)", value=False,
+                            info="If checked, you can individually control drums, bass, and other. If unchecked, they are treated as a single 'Accompaniment' track. This option is only available for the Demucs model.")

                         with gr.Row(visible=False) as advanced_separation_controls:
                             separate_drums = gr.Checkbox(label="Drums", value=True)
@@ -4066,7 +4172,23 @@ if __name__ == "__main__":
         outputs=[separation_options_box]
     )

-    # When the
+    # When the model selection changes, trigger UI update
+    separation_model.change(
+        fn=on_separation_model_change,
+        inputs=separation_model,
+        outputs=[
+            enable_advanced_separation,
+            advanced_separation_controls,
+            transcribe_drums,
+            transcribe_bass,
+            transcribe_other_or_accompaniment,
+            merge_drums_to_render,
+            merge_bass_to_render,
+            merge_other_or_accompaniment
+        ]
+    )
+
+    # When the advanced stem control checkbox is toggled, update all related UI parts
     enable_advanced_separation.change(
         fn=update_separation_mode_ui,
         inputs=enable_advanced_separation,
requirements.txt
CHANGED
@@ -28,4 +28,7 @@ basic-pitch[tf] @ git+https://github.com/avan06/basic-pitch; sys_platform == 'li

 git+https://github.com/avan06/pyfluidsynth

-demucs
+demucs
+
+audio-separator[gpu]; sys_platform != 'darwin'
+audio-separator[cpu]; sys_platform == 'darwin'
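The two audio-separator lines are mutually exclusive per platform thanks to their PEP 508 environment markers: the [gpu] extra installs everywhere except macOS, which gets the [cpu] extra instead. How pip evaluates such a marker, sketched with the packaging library:

    from packaging.markers import Marker

    marker = Marker("sys_platform != 'darwin'")
    print(marker.evaluate())  # True on Linux/Windows, False on macOS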