LAP-DEV committed (verified)
Commit bbcf404 · 1 parent: d4fb1d5

Upload 3 files

modules/uvr/music_separator.py ADDED
@@ -0,0 +1,183 @@
+ from typing import Optional, Union, List, Dict
+ import numpy as np
+ import torchaudio
+ import soundfile as sf
+ import os
+ import torch
+ import gc
+ import gradio as gr
+ from datetime import datetime
+
+ from uvr.models import MDX, Demucs, VrNetwork, MDXC
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
+ from modules.utils.files_manager import load_yaml, save_yaml, is_video
+ from modules.diarize.audio_loader import load_audio
+
+ class MusicSeparator:
+     def __init__(self,
+                  model_dir: Optional[str] = None,
+                  output_dir: Optional[str] = None):
+         self.model = None
+         self.device = self.get_device()
+         self.available_devices = ["cpu", "cuda"]
+         self.model_dir = model_dir
+         self.output_dir = output_dir
+         instrumental_output_dir = os.path.join(self.output_dir, "instrumental")
+         vocals_output_dir = os.path.join(self.output_dir, "vocals")
+         os.makedirs(instrumental_output_dir, exist_ok=True)
+         os.makedirs(vocals_output_dir, exist_ok=True)
+         self.audio_info = None
+         self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
+         self.default_model = self.available_models[0]
+         self.current_model_size = self.default_model
+         self.model_config = {
+             "segment": 256,
+             "split": True
+         }
+
+     def update_model(self,
+                      model_name: str = "UVR-MDX-NET-Inst_1",
+                      device: Optional[str] = None,
+                      segment_size: int = 256):
+         """
+         Update model with the given model name
+
+         Args:
+             model_name (str): Model name.
+             device (str): Device to use for the model.
+             segment_size (int): Segment size for the prediction.
+         """
+         if device is None:
+             device = self.device
+
+         self.device = device
+         self.model_config = {
+             "segment": segment_size,
+             "split": True
+         }
+         self.model = MDX(name=model_name,
+                          other_metadata=self.model_config,
+                          device=self.device,
+                          logger=None,
+                          model_dir=self.model_dir)
+
+     def separate(self,
+                  audio: Union[str, np.ndarray],
+                  model_name: str,
+                  device: Optional[str] = None,
+                  segment_size: int = 256,
+                  save_file: bool = False,
+                  progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray, List]:
+         """
+         Separate the background music from the audio.
+
+         Args:
+             audio (Union[str, np.ndarray]): Audio path or numpy array.
+             model_name (str): Model name.
+             device (str): Device to use for the model.
+             segment_size (int): Segment size for the prediction.
+             save_file (bool): Whether to save the separated audio to output path or not.
+             progress (gr.Progress): Gradio progress indicator.
+
+         Returns:
+             A Tuple of
+             np.ndarray: Instrumental numpy arrays.
+             np.ndarray: Vocals numpy arrays.
+             file_paths: List of file paths where the separated audio is saved. Returns an empty list when save_file is False.
+         """
+         if isinstance(audio, str):
+             output_filename, ext = os.path.basename(audio), ".wav"
+             output_filename, orig_ext = os.path.splitext(output_filename)
+
+             if is_video(audio):
+                 audio = load_audio(audio)
+                 sample_rate = 16000
+             else:
+                 self.audio_info = torchaudio.info(audio)
+                 sample_rate = self.audio_info.sample_rate
+         else:
+             timestamp = datetime.now().strftime("%m%d%H%M%S")
+             output_filename, ext = f"UVR-{timestamp}", ".wav"
+             sample_rate = 16000
+
+         model_config = {
+             "segment": segment_size,
+             "split": True
+         }
+
+         if (self.model is None or
+                 self.current_model_size != model_name or
+                 self.model_config != model_config or
+                 self.model.sample_rate != sample_rate or
+                 self.device != device):
+             progress(0, desc="Initializing UVR Model...")
+             self.update_model(
+                 model_name=model_name,
+                 device=device,
+                 segment_size=segment_size
+             )
+             self.model.sample_rate = sample_rate
+
+         progress(0, desc="Separating background music from the audio...")
+         result = self.model(audio)
+         instrumental, vocals = result["instrumental"].T, result["vocals"].T
+
+         file_paths = []
+         if save_file:
+             instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
+             vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
+             sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
+             sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
+             file_paths += [instrumental_output_path, vocals_output_path]
+
+         return instrumental, vocals, file_paths
+
+     def separate_files(self,
+                        files: List,
+                        model_name: str,
+                        device: Optional[str] = None,
+                        segment_size: int = 256,
+                        save_file: bool = True,
+                        progress: gr.Progress = gr.Progress()) -> List[str]:
+         """Separate the background music from the audio files. Returns only the last instrumental and vocals file paths,
+         to display in gr.Audio()"""
+         self.cache_parameters(model_size=model_name, segment_size=segment_size)
+
+         for file_path in files:
+             instrumental, vocals, file_paths = self.separate(
+                 audio=file_path,
+                 model_name=model_name,
+                 device=device,
+                 segment_size=segment_size,
+                 save_file=save_file,
+                 progress=progress
+             )
+         return file_paths
+
+     @staticmethod
+     def get_device():
+         """Get device for the model"""
+         return "cuda" if torch.cuda.is_available() else "cpu"
+
+     def offload(self):
+         """Offload the model and free up the memory"""
+         if self.model is not None:
+             del self.model
+             self.model = None
+         if self.device == "cuda":
+             torch.cuda.empty_cache()
+             gc.collect()
+         self.audio_info = None
+
+     @staticmethod
+     def cache_parameters(model_size: str,
+                          segment_size: int):
+         cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+         cached_uvr_params = cached_params["bgm_separation"]
+         uvr_params_to_cache = {
+             "model_size": model_size,
+             "segment_size": segment_size
+         }
+         cached_uvr_params = {**cached_uvr_params, **uvr_params_to_cache}
+         cached_params["bgm_separation"] = cached_uvr_params
+         save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
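
Usage note (not part of the commit): a minimal sketch of how MusicSeparator above might be driven. The model/output directories and the audio path are placeholder values, and it assumes the `uvr` package and the MDX weights resolve locally.

    from modules.uvr.music_separator import MusicSeparator

    # Placeholder directories; adjust to the local setup.
    separator = MusicSeparator(model_dir="models/UVR", output_dir="outputs")

    # Separate one file; instrumental/vocals WAVs are written under outputs/.
    instrumental, vocals, paths = separator.separate(
        audio="example.wav",                 # videos are decoded with load_audio()
        model_name="UVR-MDX-NET-Inst_HQ_4",  # one of self.available_models
        device=None,                         # falls back to "cuda" if available, else "cpu"
        segment_size=256,
        save_file=True,
    )
    print(paths)  # [outputs/instrumental/example-instrumental.wav, outputs/vocals/example-vocals.wav]

    separator.offload()  # release the model and GPU memory when done
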
modules/vad/__init__.py ADDED
File without changes
modules/vad/silero_vad.py ADDED
@@ -0,0 +1,264 @@
+ # Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
+
+ from faster_whisper.vad import VadOptions, get_vad_model
+ import numpy as np
+ from typing import BinaryIO, Union, List, Optional, Tuple
+ import warnings
+ import faster_whisper
+ from faster_whisper.transcribe import SpeechTimestampsMap, Segment
+ import gradio as gr
+
+
+ class SileroVAD:
+     def __init__(self):
+         self.sampling_rate = 16000
+         self.window_size_samples = 512
+         self.model = None
+
+     def run(self,
+             audio: Union[str, BinaryIO, np.ndarray],
+             vad_parameters: VadOptions,
+             progress: gr.Progress = gr.Progress()
+             ) -> Tuple[np.ndarray, List[dict]]:
+         """
+         Run VAD
+
+         Parameters
+         ----------
+         audio: Union[str, BinaryIO, np.ndarray]
+             Audio path or file binary or Audio numpy array
+         vad_parameters:
+             Options for VAD processing.
+         progress: gr.Progress
+             Indicator to show progress directly in gradio.
+
+         Returns
+         ----------
+         np.ndarray
+             Pre-processed audio with VAD
+         List[dict]
+             Chunks of speeches to be used to restore the timestamps later
+         """
+
+         sampling_rate = self.sampling_rate
+
+         if not isinstance(audio, np.ndarray):
+             audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)
+
+         duration = audio.shape[0] / sampling_rate
+         duration_after_vad = duration
+
+         if vad_parameters is None:
+             vad_parameters = VadOptions()
+         elif isinstance(vad_parameters, dict):
+             vad_parameters = VadOptions(**vad_parameters)
+         speech_chunks = self.get_speech_timestamps(
+             audio=audio,
+             vad_options=vad_parameters,
+             progress=progress
+         )
+         audio = self.collect_chunks(audio, speech_chunks)
+         duration_after_vad = audio.shape[0] / sampling_rate
+
+         return audio, speech_chunks
+
+     def get_speech_timestamps(
+         self,
+         audio: np.ndarray,
+         vad_options: Optional[VadOptions] = None,
+         progress: gr.Progress = gr.Progress(),
+         **kwargs,
+     ) -> List[dict]:
+         """This method is used for splitting long audios into speech chunks using silero VAD.
+
+         Args:
+             audio: One dimensional float array.
+             vad_options: Options for VAD processing.
+             kwargs: VAD options passed as keyword arguments for backward compatibility.
+             progress: Gradio progress to indicate progress.
+
+         Returns:
+             List of dicts containing begin and end samples of each speech chunk.
+         """
+
+         if self.model is None:
+             self.update_model()
+
+         if vad_options is None:
+             vad_options = VadOptions(**kwargs)
+
+         threshold = vad_options.threshold
+         min_speech_duration_ms = vad_options.min_speech_duration_ms
+         max_speech_duration_s = vad_options.max_speech_duration_s
+         min_silence_duration_ms = vad_options.min_silence_duration_ms
+         window_size_samples = self.window_size_samples
+         speech_pad_ms = vad_options.speech_pad_ms
+         sampling_rate = 16000
+         min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
+         speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+         max_speech_samples = (
+             sampling_rate * max_speech_duration_s
+             - window_size_samples
+             - 2 * speech_pad_samples
+         )
+         min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+         min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+
+         audio_length_samples = len(audio)
+
+         state, context = self.model.get_initial_states(batch_size=1)
+
+         speech_probs = []
+         for current_start_sample in range(0, audio_length_samples, window_size_samples):
+             progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
+
+             chunk = audio[current_start_sample: current_start_sample + window_size_samples]
+             if len(chunk) < window_size_samples:
+                 chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
+             speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
+             speech_probs.append(speech_prob)
+
+         triggered = False
+         speeches = []
+         current_speech = {}
+         neg_threshold = threshold - 0.15
+
+         # to save potential segment end (and tolerate some silence)
+         temp_end = 0
+         # to save potential segment limits in case of maximum segment size reached
+         prev_end = next_start = 0
+
+         for i, speech_prob in enumerate(speech_probs):
+             if (speech_prob >= threshold) and temp_end:
+                 temp_end = 0
+                 if next_start < prev_end:
+                     next_start = window_size_samples * i
+
+             if (speech_prob >= threshold) and not triggered:
+                 triggered = True
+                 current_speech["start"] = window_size_samples * i
+                 continue
+
+             if (
+                 triggered
+                 and (window_size_samples * i) - current_speech["start"] > max_speech_samples
+             ):
+                 if prev_end:
+                     current_speech["end"] = prev_end
+                     speeches.append(current_speech)
+                     current_speech = {}
+                     # previously reached silence (< neg_thres) and is still not speech (< thres)
+                     if next_start < prev_end:
+                         triggered = False
+                     else:
+                         current_speech["start"] = next_start
+                     prev_end = next_start = temp_end = 0
+                 else:
+                     current_speech["end"] = window_size_samples * i
+                     speeches.append(current_speech)
+                     current_speech = {}
+                     prev_end = next_start = temp_end = 0
+                     triggered = False
+                     continue
+
+             if (speech_prob < neg_threshold) and triggered:
+                 if not temp_end:
+                     temp_end = window_size_samples * i
+                 # condition to avoid cutting in very short silence
+                 if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
+                     prev_end = temp_end
+                 if (window_size_samples * i) - temp_end < min_silence_samples:
+                     continue
+                 else:
+                     current_speech["end"] = temp_end
+                     if (
+                         current_speech["end"] - current_speech["start"]
+                     ) > min_speech_samples:
+                         speeches.append(current_speech)
+                     current_speech = {}
+                     prev_end = next_start = temp_end = 0
+                     triggered = False
+                     continue
+
+         if (
+             current_speech
+             and (audio_length_samples - current_speech["start"]) > min_speech_samples
+         ):
+             current_speech["end"] = audio_length_samples
+             speeches.append(current_speech)
+
+         for i, speech in enumerate(speeches):
+             if i == 0:
+                 speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
+             if i != len(speeches) - 1:
+                 silence_duration = speeches[i + 1]["start"] - speech["end"]
+                 if silence_duration < 2 * speech_pad_samples:
+                     speech["end"] += int(silence_duration // 2)
+                     speeches[i + 1]["start"] = int(
+                         max(0, speeches[i + 1]["start"] - silence_duration // 2)
+                     )
+                 else:
+                     speech["end"] = int(
+                         min(audio_length_samples, speech["end"] + speech_pad_samples)
+                     )
+                     speeches[i + 1]["start"] = int(
+                         max(0, speeches[i + 1]["start"] - speech_pad_samples)
+                     )
+             else:
+                 speech["end"] = int(
+                     min(audio_length_samples, speech["end"] + speech_pad_samples)
+                 )
+
+         return speeches
+
+     def update_model(self):
+         self.model = get_vad_model()
+
+     @staticmethod
+     def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
+         """Collects and concatenates audio chunks."""
+         if not chunks:
+             return np.array([], dtype=np.float32)
+
+         return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
+
+     @staticmethod
+     def format_timestamp(
+         seconds: float,
+         always_include_hours: bool = False,
+         decimal_marker: str = ".",
+     ) -> str:
+         assert seconds >= 0, "non-negative timestamp expected"
+         milliseconds = round(seconds * 1000.0)
+
+         hours = milliseconds // 3_600_000
+         milliseconds -= hours * 3_600_000
+
+         minutes = milliseconds // 60_000
+         milliseconds -= minutes * 60_000
+
+         seconds = milliseconds // 1_000
+         milliseconds -= seconds * 1_000
+
+         hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+         return (
+             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
+         )
+
+     def restore_speech_timestamps(
+         self,
+         segments: List[dict],
+         speech_chunks: List[dict],
+         sampling_rate: Optional[int] = None,
+     ) -> List[dict]:
+         if sampling_rate is None:
+             sampling_rate = self.sampling_rate
+
+         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
+
+         for segment in segments:
+             segment["start"] = ts_map.get_original_time(segment["start"])
+             segment["end"] = ts_map.get_original_time(segment["end"])
+
+         return segments
+
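
Usage note (not part of the commit): a minimal sketch of running the SileroVAD wrapper above and restoring timestamps afterwards. The audio path, option values, and the placeholder segment list are illustrative; the VadOptions fields follow faster-whisper's API, which this module already relies on.

    from faster_whisper.vad import VadOptions
    from modules.vad.silero_vad import SileroVAD

    vad = SileroVAD()

    # Drop non-speech regions; audio can be a path, a binary file, or a 16 kHz float array.
    options = VadOptions(threshold=0.5, min_silence_duration_ms=1000, speech_pad_ms=400)
    trimmed_audio, speech_chunks = vad.run("example.wav", vad_parameters=options)

    # After transcribing trimmed_audio, map segment times back to the original timeline.
    segments = [{"start": 0.0, "end": 3.2}]  # placeholder transcription result (seconds)
    restored = vad.restore_speech_timestamps(segments, speech_chunks)
    print(vad.format_timestamp(restored[0]["end"], always_include_hours=True))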