LAP-DEV committed (verified)
Commit bbcf404 · 1 parent: d4fb1d5

Upload 3 files

modules/uvr/music_separator.py ADDED
@@ -0,0 +1,183 @@
+ from typing import Optional, Union, List, Dict
+ import numpy as np
+ import torchaudio
+ import soundfile as sf
+ import os
+ import torch
+ import gc
+ import gradio as gr
+ from datetime import datetime
+
+ from uvr.models import MDX, Demucs, VrNetwork, MDXC
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
+ from modules.utils.files_manager import load_yaml, save_yaml, is_video
+ from modules.diarize.audio_loader import load_audio
+
+ class MusicSeparator:
+     def __init__(self,
+                  model_dir: Optional[str] = None,
+                  output_dir: Optional[str] = None):
+         self.model = None
+         self.device = self.get_device()
+         self.available_devices = ["cpu", "cuda"]
+         self.model_dir = model_dir
+         self.output_dir = output_dir
+         instrumental_output_dir = os.path.join(self.output_dir, "instrumental")
+         vocals_output_dir = os.path.join(self.output_dir, "vocals")
+         os.makedirs(instrumental_output_dir, exist_ok=True)
+         os.makedirs(vocals_output_dir, exist_ok=True)
+         self.audio_info = None
+         self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
+         self.default_model = self.available_models[0]
+         self.current_model_size = self.default_model
+         self.model_config = {
+             "segment": 256,
+             "split": True
+         }
+
+     def update_model(self,
+                      model_name: str = "UVR-MDX-NET-Inst_1",
+                      device: Optional[str] = None,
+                      segment_size: int = 256):
+         """
+         Update model with the given model name
+
+         Args:
+             model_name (str): Model name.
+             device (str): Device to use for the model.
+             segment_size (int): Segment size for the prediction.
+         """
+         if device is None:
+             device = self.device
+
+         self.device = device
+         self.model_config = {
+             "segment": segment_size,
+             "split": True
+         }
+         self.model = MDX(name=model_name,
+                          other_metadata=self.model_config,
+                          device=self.device,
+                          logger=None,
+                          model_dir=self.model_dir)
+
+     def separate(self,
+                  audio: Union[str, np.ndarray],
+                  model_name: str,
+                  device: Optional[str] = None,
+                  segment_size: int = 256,
+                  save_file: bool = False,
+                  progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray, List]:
+         """
+         Separate the background music from the audio.
+
+         Args:
+             audio (Union[str, np.ndarray]): Audio path or numpy array.
+             model_name (str): Model name.
+             device (str): Device to use for the model.
+             segment_size (int): Segment size for the prediction.
+             save_file (bool): Whether to save the separated audio to output path or not.
+             progress (gr.Progress): Gradio progress indicator.
+
+         Returns:
+             A Tuple of
+             np.ndarray: Instrumental numpy arrays.
+             np.ndarray: Vocals numpy arrays.
+             file_paths: List of file paths where the separated audio is saved. Returns an empty list when save_file is False.
+         """
+         if isinstance(audio, str):
+             output_filename, ext = os.path.basename(audio), ".wav"
+             output_filename, orig_ext = os.path.splitext(output_filename)
+
+             if is_video(audio):
+                 audio = load_audio(audio)
+                 sample_rate = 16000
+             else:
+                 self.audio_info = torchaudio.info(audio)
+                 sample_rate = self.audio_info.sample_rate
+         else:
+             timestamp = datetime.now().strftime("%m%d%H%M%S")
+             output_filename, ext = f"UVR-{timestamp}", ".wav"
+             sample_rate = 16000
+
+         model_config = {
+             "segment": segment_size,
+             "split": True
+         }
+
+         if (self.model is None or
+                 self.current_model_size != model_name or
+                 self.model_config != model_config or
+                 self.model.sample_rate != sample_rate or
+                 self.device != device):
+             progress(0, desc="Initializing UVR Model...")
+             self.update_model(
+                 model_name=model_name,
+                 device=device,
+                 segment_size=segment_size
+             )
+             self.model.sample_rate = sample_rate
+
+         progress(0, desc="Separating background music from the audio...")
+         result = self.model(audio)
+         instrumental, vocals = result["instrumental"].T, result["vocals"].T
+
+         file_paths = []
+         if save_file:
+             instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
+             vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
+             sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
+             sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
+             file_paths += [instrumental_output_path, vocals_output_path]
+
+         return instrumental, vocals, file_paths
+
+     def separate_files(self,
+                        files: List,
+                        model_name: str,
+                        device: Optional[str] = None,
+                        segment_size: int = 256,
+                        save_file: bool = True,
+                        progress: gr.Progress = gr.Progress()) -> List[str]:
+         """Separate the background music from the audio files. Returns only the last instrumental and vocals file paths,
+         to display in gr.Audio()"""
+         self.cache_parameters(model_size=model_name, segment_size=segment_size)
+
+         for file_path in files:
+             instrumental, vocals, file_paths = self.separate(
+                 audio=file_path,
+                 model_name=model_name,
+                 device=device,
+                 segment_size=segment_size,
+                 save_file=save_file,
+                 progress=progress
+             )
+         return file_paths
+
+     @staticmethod
+     def get_device():
+         """Get device for the model"""
+         return "cuda" if torch.cuda.is_available() else "cpu"
+
+     def offload(self):
+         """Offload the model and free up the memory"""
+         if self.model is not None:
+             del self.model
+             self.model = None
+         if self.device == "cuda":
+             torch.cuda.empty_cache()
+             gc.collect()
+         self.audio_info = None
+
+     @staticmethod
+     def cache_parameters(model_size: str,
+                          segment_size: int):
+         cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+         cached_uvr_params = cached_params["bgm_separation"]
+         uvr_params_to_cache = {
+             "model_size": model_size,
+             "segment_size": segment_size
+         }
+         cached_uvr_params = {**cached_uvr_params, **uvr_params_to_cache}
+         cached_params["bgm_separation"] = cached_uvr_params
+         save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
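
Usage note (not part of the commit): a minimal sketch of how MusicSeparator above might be driven. The model/output directories and the audio path are placeholder values, and it assumes the `uvr` package and the MDX weights resolve locally.

    from modules.uvr.music_separator import MusicSeparator

    # Placeholder directories; adjust to the local setup.
    separator = MusicSeparator(model_dir="models/UVR", output_dir="outputs")

    # Separate one file; instrumental/vocals WAVs are written under outputs/.
    instrumental, vocals, paths = separator.separate(
        audio="example.wav",                 # videos are decoded with load_audio()
        model_name="UVR-MDX-NET-Inst_HQ_4",  # one of self.available_models
        device=None,                         # falls back to "cuda" if available, else "cpu"
        segment_size=256,
        save_file=True,
    )
    print(paths)  # [outputs/instrumental/example-instrumental.wav, outputs/vocals/example-vocals.wav]

    separator.offload()  # release the model and GPU memory when done
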
modules/vad/__init__.py ADDED
File without changes
modules/vad/silero_vad.py ADDED
@@ -0,0 +1,264 @@
+ # Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
+
+ from faster_whisper.vad import VadOptions, get_vad_model
+ import numpy as np
+ from typing import BinaryIO, Union, List, Optional, Tuple
+ import warnings
+ import faster_whisper
+ from faster_whisper.transcribe import SpeechTimestampsMap, Segment
+ import gradio as gr
+
+
+ class SileroVAD:
+     def __init__(self):
+         self.sampling_rate = 16000
+         self.window_size_samples = 512
+         self.model = None
+
+     def run(self,
+             audio: Union[str, BinaryIO, np.ndarray],
+             vad_parameters: VadOptions,
+             progress: gr.Progress = gr.Progress()
+             ) -> Tuple[np.ndarray, List[dict]]:
+         """
+         Run VAD
+
+         Parameters
+         ----------
+         audio: Union[str, BinaryIO, np.ndarray]
+             Audio path or file binary or Audio numpy array
+         vad_parameters:
+             Options for VAD processing.
+         progress: gr.Progress
+             Indicator to show progress directly in gradio.
+
+         Returns
+         ----------
+         np.ndarray
+             Pre-processed audio with VAD
+         List[dict]
+             Chunks of speeches to be used to restore the timestamps later
+         """
+
+         sampling_rate = self.sampling_rate
+
+         if not isinstance(audio, np.ndarray):
+             audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)
+
+         duration = audio.shape[0] / sampling_rate
+         duration_after_vad = duration
+
+         if vad_parameters is None:
+             vad_parameters = VadOptions()
+         elif isinstance(vad_parameters, dict):
+             vad_parameters = VadOptions(**vad_parameters)
+         speech_chunks = self.get_speech_timestamps(
+             audio=audio,
+             vad_options=vad_parameters,
+             progress=progress
+         )
+         audio = self.collect_chunks(audio, speech_chunks)
+         duration_after_vad = audio.shape[0] / sampling_rate
+
+         return audio, speech_chunks
+
+     def get_speech_timestamps(
+         self,
+         audio: np.ndarray,
+         vad_options: Optional[VadOptions] = None,
+         progress: gr.Progress = gr.Progress(),
+         **kwargs,
+     ) -> List[dict]:
+         """This method is used for splitting long audios into speech chunks using silero VAD.
+
+         Args:
+             audio: One dimensional float array.
+             vad_options: Options for VAD processing.
+             kwargs: VAD options passed as keyword arguments for backward compatibility.
+             progress: Gradio progress to indicate progress.
+
+         Returns:
+             List of dicts containing begin and end samples of each speech chunk.
+         """
+
+         if self.model is None:
+             self.update_model()
+
+         if vad_options is None:
+             vad_options = VadOptions(**kwargs)
+
+         threshold = vad_options.threshold
+         min_speech_duration_ms = vad_options.min_speech_duration_ms
+         max_speech_duration_s = vad_options.max_speech_duration_s
+         min_silence_duration_ms = vad_options.min_silence_duration_ms
+         window_size_samples = self.window_size_samples
+         speech_pad_ms = vad_options.speech_pad_ms
+         sampling_rate = 16000
+         min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
+         speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+         max_speech_samples = (
+             sampling_rate * max_speech_duration_s
+             - window_size_samples
+             - 2 * speech_pad_samples
+         )
+         min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+         min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+
+         audio_length_samples = len(audio)
+
+         state, context = self.model.get_initial_states(batch_size=1)
+
+         speech_probs = []
+         for current_start_sample in range(0, audio_length_samples, window_size_samples):
+             progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
+
+             chunk = audio[current_start_sample: current_start_sample + window_size_samples]
+             if len(chunk) < window_size_samples:
+                 chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
+             speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
+             speech_probs.append(speech_prob)
+
+         triggered = False
+         speeches = []
+         current_speech = {}
+         neg_threshold = threshold - 0.15
+
+         # to save potential segment end (and tolerate some silence)
+         temp_end = 0
+         # to save potential segment limits in case of maximum segment size reached
+         prev_end = next_start = 0
+
+         for i, speech_prob in enumerate(speech_probs):
+             if (speech_prob >= threshold) and temp_end:
+                 temp_end = 0
+                 if next_start < prev_end:
+                     next_start = window_size_samples * i
+
+             if (speech_prob >= threshold) and not triggered:
+                 triggered = True
+                 current_speech["start"] = window_size_samples * i
+                 continue
+
+             if (
+                 triggered
+                 and (window_size_samples * i) - current_speech["start"] > max_speech_samples
+             ):
+                 if prev_end:
+                     current_speech["end"] = prev_end
+                     speeches.append(current_speech)
+                     current_speech = {}
+                     # previously reached silence (< neg_thres) and is still not speech (< thres)
+                     if next_start < prev_end:
+                         triggered = False
+                     else:
+                         current_speech["start"] = next_start
+                     prev_end = next_start = temp_end = 0
+                 else:
+                     current_speech["end"] = window_size_samples * i
+                     speeches.append(current_speech)
+                     current_speech = {}
+                     prev_end = next_start = temp_end = 0
+                     triggered = False
+                     continue
+
+             if (speech_prob < neg_threshold) and triggered:
+                 if not temp_end:
+                     temp_end = window_size_samples * i
+                 # condition to avoid cutting in very short silence
+                 if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
+                     prev_end = temp_end
+                 if (window_size_samples * i) - temp_end < min_silence_samples:
+                     continue
+                 else:
+                     current_speech["end"] = temp_end
+                     if (
+                         current_speech["end"] - current_speech["start"]
+                     ) > min_speech_samples:
+                         speeches.append(current_speech)
+                     current_speech = {}
+                     prev_end = next_start = temp_end = 0
+                     triggered = False
+                     continue
+
+         if (
+             current_speech
+             and (audio_length_samples - current_speech["start"]) > min_speech_samples
+         ):
+             current_speech["end"] = audio_length_samples
+             speeches.append(current_speech)
+
+         for i, speech in enumerate(speeches):
+             if i == 0:
+                 speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
+             if i != len(speeches) - 1:
+                 silence_duration = speeches[i + 1]["start"] - speech["end"]
+                 if silence_duration < 2 * speech_pad_samples:
+                     speech["end"] += int(silence_duration // 2)
+                     speeches[i + 1]["start"] = int(
+                         max(0, speeches[i + 1]["start"] - silence_duration // 2)
+                     )
+                 else:
+                     speech["end"] = int(
+                         min(audio_length_samples, speech["end"] + speech_pad_samples)
+                     )
+                     speeches[i + 1]["start"] = int(
+                         max(0, speeches[i + 1]["start"] - speech_pad_samples)
+                     )
+             else:
+                 speech["end"] = int(
+                     min(audio_length_samples, speech["end"] + speech_pad_samples)
+                 )
+
+         return speeches
+
+     def update_model(self):
+         self.model = get_vad_model()
+
+     @staticmethod
+     def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
+         """Collects and concatenates audio chunks."""
+         if not chunks:
+             return np.array([], dtype=np.float32)
+
+         return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
+
+     @staticmethod
+     def format_timestamp(
+         seconds: float,
+         always_include_hours: bool = False,
+         decimal_marker: str = ".",
+     ) -> str:
+         assert seconds >= 0, "non-negative timestamp expected"
+         milliseconds = round(seconds * 1000.0)
+
+         hours = milliseconds // 3_600_000
+         milliseconds -= hours * 3_600_000
+
+         minutes = milliseconds // 60_000
+         milliseconds -= minutes * 60_000
+
+         seconds = milliseconds // 1_000
+         milliseconds -= seconds * 1_000
+
+         hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+         return (
+             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
+         )
+
+     def restore_speech_timestamps(
+         self,
+         segments: List[dict],
+         speech_chunks: List[dict],
+         sampling_rate: Optional[int] = None,
+     ) -> List[dict]:
+         if sampling_rate is None:
+             sampling_rate = self.sampling_rate
+
+         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
+
+         for segment in segments:
+             segment["start"] = ts_map.get_original_time(segment["start"])
+             segment["end"] = ts_map.get_original_time(segment["end"])
+
+         return segments
+
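
Usage note (not part of the commit): a minimal sketch of running the SileroVAD wrapper above and restoring timestamps afterwards. The audio path, option values, and the placeholder segment list are illustrative; the VadOptions fields follow faster-whisper's API, which this module already relies on.

    from faster_whisper.vad import VadOptions
    from modules.vad.silero_vad import SileroVAD

    vad = SileroVAD()

    # Drop non-speech regions; audio can be a path, a binary file, or a 16 kHz float array.
    options = VadOptions(threshold=0.5, min_silence_duration_ms=1000, speech_pad_ms=400)
    trimmed_audio, speech_chunks = vad.run("example.wav", vad_parameters=options)

    # After transcribing trimmed_audio, map segment times back to the original timeline.
    segments = [{"start": 0.0, "end": 3.2}]  # placeholder transcription result (seconds)
    restored = vad.restore_speech_timestamps(segments, speech_chunks)
    print(vad.format_timestamp(restored[0]["end"], always_include_hours=True))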