VOICEVN

Build error

App Files Files Community

VOICEVN / main /library /uvr5_separator /common_separator.py

AnhP

Upload 65 files

98bb602 verified 8 months ago

raw

history blame

11.7 kB

	import os
	import gc
	import sys
	import torch
	import librosa

	import numpy as np
	import soundfile as sf

	from logging import Logger
	from pydub import AudioSegment

	# Append the current working directory to the module search path before the
	# project imports below run. Presumably this lets the absolute `main.` import
	# resolve when the program is started from the project root -- TODO confirm.
	now_dir = os.getcwd()
	sys.path.append(now_dir)

	from . import spec_utils
	from main.configs.config import Config

	# Message table read from a Config instance constructed at import time; the
	# code below indexes it by key for every log and error message.
	translations = Config().translations

	class CommonSeparator:
	    """Common state and helpers shared by audio stem separators.

	    The class defines the stem-name constants, copies the run configuration
	    onto the instance, and offers helpers for loading a mix, caching separated
	    sources, writing stems to disk and releasing accelerator memory.
	    ``separate`` does nothing here (presumably it is overridden by the
	    architecture-specific subclasses -- confirm against the callers).
	    """

	    ALL_STEMS = "All Stems"
	    VOCAL_STEM = "Vocals"
	    INST_STEM = "Instrumental"
	    OTHER_STEM = "Other"
	    BASS_STEM = "Bass"
	    DRUM_STEM = "Drums"
	    GUITAR_STEM = "Guitar"
	    PIANO_STEM = "Piano"
	    SYNTH_STEM = "Synthesizer"
	    STRINGS_STEM = "Strings"
	    WOODWINDS_STEM = "Woodwinds"
	    BRASS_STEM = "Brass"
	    WIND_INST_STEM = "Wind Inst"
	    NO_OTHER_STEM = "No Other"
	    NO_BASS_STEM = "No Bass"
	    NO_DRUM_STEM = "No Drums"
	    NO_GUITAR_STEM = "No Guitar"
	    NO_PIANO_STEM = "No Piano"
	    NO_SYNTH_STEM = "No Synthesizer"
	    NO_STRINGS_STEM = "No Strings"
	    NO_WOODWINDS_STEM = "No Woodwinds"
	    NO_WIND_INST_STEM = "No Wind Inst"
	    NO_BRASS_STEM = "No Brass"
	    PRIMARY_STEM = "Primary Stem"
	    SECONDARY_STEM = "Secondary Stem"
	    LEAD_VOCAL_STEM = "lead_only"
	    BV_VOCAL_STEM = "backing_only"
	    LEAD_VOCAL_STEM_I = "with_lead_vocals"
	    BV_VOCAL_STEM_I = "with_backing_vocals"
	    LEAD_VOCAL_STEM_LABEL = "Lead Vocals"
	    BV_VOCAL_STEM_LABEL = "Backing Vocals"
	    NO_STEM = "No "

	    # Explicit primary -> secondary pairings; any stem not listed here is
	    # paired by adding or removing the NO_STEM prefix (see secondary_stem).
	    STEM_PAIR_MAPPER = {
	        VOCAL_STEM: INST_STEM,
	        INST_STEM: VOCAL_STEM,
	        LEAD_VOCAL_STEM: BV_VOCAL_STEM,
	        BV_VOCAL_STEM: LEAD_VOCAL_STEM,
	        PRIMARY_STEM: SECONDARY_STEM,
	    }

	    NON_ACCOM_STEMS = (
	        VOCAL_STEM,
	        OTHER_STEM,
	        BASS_STEM,
	        DRUM_STEM,
	        GUITAR_STEM,
	        PIANO_STEM,
	        SYNTH_STEM,
	        STRINGS_STEM,
	        WOODWINDS_STEM,
	        BRASS_STEM,
	        WIND_INST_STEM,
	    )

	    def __init__(self, config):
	        """Copy the settings out of *config* and derive the two stem names.

	        Args:
	            config: Mapping read with ``.get``. Keys consumed: ``logger``,
	                ``log_level``, ``torch_device``, ``torch_device_cpu``,
	                ``torch_device_mps``, ``onnx_execution_provider``,
	                ``model_name``, ``model_path``, ``model_data``,
	                ``output_dir``, ``output_format``, ``output_bitrate``,
	                ``normalization_threshold``, ``enable_denoise``,
	                ``output_single_stem``, ``invert_using_spec`` and
	                ``sample_rate``.
	        """
	        self.logger: Logger = config.get("logger")
	        self.log_level: int = config.get("log_level")
	        self.torch_device = config.get("torch_device")
	        self.torch_device_cpu = config.get("torch_device_cpu")
	        self.torch_device_mps = config.get("torch_device_mps")
	        self.onnx_execution_provider = config.get("onnx_execution_provider")
	        self.model_name = config.get("model_name")
	        self.model_path = config.get("model_path")
	        # A missing model description is treated as empty so that the key
	        # lookups below do not fail on None.
	        self.model_data = config.get("model_data") or {}
	        self.output_dir = config.get("output_dir")
	        self.output_format = config.get("output_format")
	        self.output_bitrate = config.get("output_bitrate")
	        self.normalization_threshold = config.get("normalization_threshold")
	        self.enable_denoise = config.get("enable_denoise")
	        self.output_single_stem = config.get("output_single_stem")
	        self.invert_using_spec = config.get("invert_using_spec")
	        self.sample_rate = config.get("sample_rate")

	        self.primary_stem_name = None
	        self.secondary_stem_name = None

	        # Preferred source of stem names: the model's training instrument list.
	        training = self.model_data.get("training") or {}
	        if "instruments" in training:
	            instruments = training["instruments"]

	            if instruments:
	                self.primary_stem_name = instruments[0]
	                if len(instruments) > 1:
	                    self.secondary_stem_name = instruments[1]
	                else:
	                    self.secondary_stem_name = self.secondary_stem(self.primary_stem_name)

	        # Fallback: the "primary_stem" entry, defaulting to vocals.
	        if self.primary_stem_name is None:
	            self.primary_stem_name = self.model_data.get("primary_stem", "Vocals")
	            self.secondary_stem_name = self.secondary_stem(self.primary_stem_name)

	        self.is_karaoke = self.model_data.get("is_karaoke", False)
	        self.is_bv_model = self.model_data.get("is_bv_model", False)
	        self.bv_model_rebalance = self.model_data.get("is_bv_model_rebalanced", 0)

	        self.logger.debug(translations["info"].format(model_name=self.model_name, model_path=self.model_path))
	        self.logger.debug(translations["info_2"].format(output_dir=self.output_dir, output_format=self.output_format))
	        self.logger.debug(translations["info_3"].format(normalization_threshold=self.normalization_threshold))
	        self.logger.debug(translations["info_4"].format(enable_denoise=self.enable_denoise, output_single_stem=self.output_single_stem))
	        self.logger.debug(translations["info_5"].format(invert_using_spec=self.invert_using_spec, sample_rate=self.sample_rate))
	        self.logger.debug(translations["info_6"].format(primary_stem_name=self.primary_stem_name, secondary_stem_name=self.secondary_stem_name))
	        self.logger.debug(translations["info_7"].format(is_karaoke=self.is_karaoke, is_bv_model=self.is_bv_model, bv_model_rebalance=self.bv_model_rebalance))

	        # Per-file state, reset by clear_file_specific_paths().
	        self.audio_file_path = None
	        self.audio_file_base = None
	        self.primary_source = None
	        self.secondary_source = None
	        self.primary_stem_output_path = None
	        self.secondary_stem_output_path = None
	        self.cached_sources_map = {}

	    def secondary_stem(self, primary_stem: str):
	        """Return the name of the stem that complements *primary_stem*.

	        Explicit pairs come from ``STEM_PAIR_MAPPER``. Otherwise a name that
	        contains ``NO_STEM`` has it removed, and any other name gets it
	        prepended. A falsy *primary_stem* is treated as ``NO_STEM`` itself,
	        which yields an empty string.
	        """
	        stem = primary_stem if primary_stem else self.NO_STEM

	        if stem in self.STEM_PAIR_MAPPER:
	            return self.STEM_PAIR_MAPPER[stem]
	        if self.NO_STEM in stem:
	            return stem.replace(self.NO_STEM, "")
	        return f"{self.NO_STEM}{stem}"

	    def separate(self, audio_file_path):
	        """Placeholder that does nothing and returns ``None``."""
	        pass

	    def final_process(self, stem_path, source, stem_name):
	        """Write *source* to *stem_path* and return ``{stem_name: source}``."""
	        self.logger.debug(translations["success_process"].format(stem_name=stem_name))
	        self.write_audio(stem_path, source)

	        return {stem_name: source}

	    def cached_sources_clear(self):
	        """Drop every cached source."""
	        self.cached_sources_map = {}

	    def cached_source_callback(self, model_architecture, model_name=None):
	        """Look up cached sources for an architecture and model name.

	        A cache key matches when *model_name* is a substring of it; when
	        either side is ``None`` they match only if both are ``None``. If
	        several keys match, the last one in insertion order wins.

	        Returns:
	            ``(key, sources)`` for the match, or ``(None, None)`` when the
	            architecture has no cache entry or nothing matches.
	        """
	        model, sources = None, None
	        # .get avoids a KeyError for an architecture that was never cached.
	        mapper = self.cached_sources_map.get(model_architecture, {})

	        for key, value in mapper.items():
	            if key is None or model_name is None:
	                # `None in "abc"` raises TypeError, so compare identity instead.
	                matched = key is model_name
	            else:
	                matched = model_name in key

	            if matched:
	                model, sources = key, value

	        return model, sources

	    def cached_model_source_holder(self, model_architecture, sources, model_name=None):
	        """Store *sources* under *model_name* in the cache of *model_architecture*.

	        Entries already cached for the architecture are kept; an entry with
	        the same *model_name* is replaced.
	        """
	        existing = self.cached_sources_map.get(model_architecture, {})
	        self.cached_sources_map[model_architecture] = {**existing, model_name: sources}

	    def prepare_mix(self, mix):
	        """Return *mix* as an array ready for separation.

	        Anything that is not a NumPy array is loaded with ``librosa.load``
	        (``mono=False``, resampled to ``self.sample_rate``). An array is
	        transposed. A one-dimensional result is duplicated into two rows.

	        Raises:
	            ValueError: If *mix* was given as a ``str`` path and the loaded
	                audio contains only zeros.
	        """
	        audio_path = mix

	        if isinstance(mix, np.ndarray):
	            self.logger.debug(translations["convert_mix"])
	            mix = mix.T
	            self.logger.debug(translations["convert_shape"].format(shape=mix.shape))
	        else:
	            self.logger.debug(f"{translations['load_audio']}: {mix}")
	            mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate)
	            self.logger.debug(translations["load_audio_success"].format(sr=sr, shape=mix.shape))

	        if isinstance(audio_path, str):
	            if not np.any(mix):
	                error_msg = translations["audio_not_valid"].format(audio_path=audio_path)
	                self.logger.error(error_msg)
	                raise ValueError(error_msg)

	            self.logger.debug(translations["audio_valid"])

	        if mix.ndim == 1:
	            self.logger.debug(translations["mix_single"])
	            mix = np.asfortranarray([mix, mix])
	            self.logger.debug(translations["convert_mix_audio"])

	        self.logger.debug(translations["mix_success_2"])
	        return mix

	    def write_audio(self, stem_path: str, stem_source):
	        """Write a stem, choosing the writer by the input file's duration.

	        Input of one hour or more goes through soundfile, anything shorter
	        through pydub. The duration is read from ``self.audio_file_path``.
	        """
	        try:
	            duration_seconds = librosa.get_duration(path=self.audio_file_path)
	        except TypeError:
	            # Older librosa releases only accept the former keyword name.
	            duration_seconds = librosa.get_duration(filename=self.audio_file_path)

	        duration_hours = duration_seconds / 3600
	        self.logger.info(translations["duration"].format(duration_hours=f"{duration_hours:.2f}", duration_seconds=f"{duration_seconds:.2f}"))

	        if duration_hours >= 1:
	            self.logger.warning(translations["write"].format(name="soundfile"))
	            self.write_audio_soundfile(stem_path, stem_source)
	        else:
	            self.logger.info(translations["write"].format(name="pydub"))
	            self.write_audio_pydub(stem_path, stem_source)

	    def _prepare_stem_for_writing(self, stem_path, stem_source):
	        """Normalize a stem and resolve its output path for either writer.

	        Returns:
	            ``(stem_path, stem_source)`` with the path joined onto
	            ``self.output_dir`` when that is set, or ``None`` when the stem
	            is empty or near-silent and should not be written.
	        """
	        # Checked first because a peak cannot be taken from an empty array.
	        if np.size(stem_source) == 0:
	            self.logger.warning(translations["original_not_valid"])
	            return None

	        stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold)

	        if np.max(np.abs(stem_source)) < 1e-6:
	            self.logger.warning(translations["original_not_valid"])
	            return None

	        if self.output_dir:
	            os.makedirs(self.output_dir, exist_ok=True)
	            stem_path = os.path.join(self.output_dir, stem_path)

	        return stem_path, stem_source

	    def write_audio_pydub(self, stem_path: str, stem_source):
	        """Encode a stem with pydub; the format is taken from the file extension.

	        Near-silent or empty stems are skipped with a warning. Failures to
	        build the segment or to export (``IOError``/``ValueError``) are
	        logged and not raised.
	        """
	        self.logger.debug(f"{translations['write_audio'].format(name='write_audio_pydub')} {stem_path}")

	        prepared = self._prepare_stem_for_writing(stem_path, stem_source)
	        if prepared is None:
	            return
	        stem_path, stem_source = prepared

	        self.logger.debug(f"{translations['shape_audio']}: {stem_source.shape}")
	        self.logger.debug(f"{translations['convert_data']}: {stem_source.dtype}")

	        if stem_source.dtype != np.int16:
	            # Clip first so samples outside [-1, 1] saturate instead of
	            # wrapping around in the int16 cast.
	            stem_source = (np.clip(stem_source, -1.0, 1.0) * 32767).astype(np.int16)
	            self.logger.debug(translations["original_source_to_int16"])

	        # Row-major flattening of a (frames, channels) array yields the
	        # interleaved sample order; a 1-D array is written as one channel.
	        channels = stem_source.shape[1] if stem_source.ndim > 1 else 1
	        stem_source_interleaved = np.ascontiguousarray(stem_source).reshape(-1)

	        self.logger.debug(f"{translations['shape_audio_2']}: {stem_source_interleaved.shape}")

	        try:
	            audio_segment = AudioSegment(stem_source_interleaved.tobytes(), frame_rate=self.sample_rate, sample_width=stem_source.dtype.itemsize, channels=channels)
	            self.logger.debug(translations["create_audiosegment"])
	        except (IOError, ValueError) as e:
	            self.logger.error(f"{translations['create_audiosegment_error']}: {e}")
	            return

	        file_format = stem_path.lower().split(".")[-1]

	        # Map extensions to the container names used for export.
	        if file_format == "m4a":
	            file_format = "mp4"
	        elif file_format == "mka":
	            file_format = "matroska"

	        bitrate = "320k" if file_format == "mp3" and self.output_bitrate is None else self.output_bitrate

	        try:
	            audio_segment.export(stem_path, format=file_format, bitrate=bitrate)
	            self.logger.debug(f"{translations['export_success']} {stem_path}")
	        except (IOError, ValueError) as e:
	            self.logger.error(f"{translations['export_error']}: {e}")

	    def write_audio_soundfile(self, stem_path: str, stem_source):
	        """Write a stem with soundfile.

	        Applies the same normalization, silence check and ``output_dir``
	        handling as ``write_audio_pydub``. Any exception raised by the write
	        is logged and not raised.
	        """
	        self.logger.debug(f"{translations['write_audio'].format(name='write_audio_soundfile')}: {stem_path}")

	        prepared = self._prepare_stem_for_writing(stem_path, stem_source)
	        if prepared is None:
	            return
	        stem_path, stem_source = prepared

	        # Keep the array's shape and dtype and only make it C-contiguous.
	        # Flattening or casting floats to int16 here would produce a
	        # mis-shaped, silent file.
	        stem_source = np.ascontiguousarray(stem_source)

	        self.logger.debug(f"{translations['shape_audio_2']}: {stem_source.shape}")

	        try:
	            sf.write(stem_path, stem_source, self.sample_rate)
	            self.logger.debug(f"{translations['export_success']} {stem_path}")
	        except Exception as e:
	            self.logger.error(f"{translations['export_error']}: {e}")

	    def clear_gpu_cache(self):
	        """Run garbage collection and empty the MPS or CUDA cache if in use."""
	        self.logger.debug(translations["clean"])
	        gc.collect()

	        # Compare by device type so that indexed devices ("cuda:0") and plain
	        # strings are recognised as well as torch.device("cuda").
	        device = self.torch_device
	        device_type = getattr(device, "type", None)
	        if device_type is None and device is not None:
	            device_type = torch.device(device).type

	        if device_type == "mps":
	            self.logger.debug(translations["clean_cache"].format(name="MPS"))
	            torch.mps.empty_cache()
	        elif device_type == "cuda":
	            self.logger.debug(translations["clean_cache"].format(name="CUDA"))
	            torch.cuda.empty_cache()

	    def clear_file_specific_paths(self):
	        """Reset the per-file paths and sources to ``None``."""
	        self.logger.info(translations["del_path"])
	        self.audio_file_path = None
	        self.audio_file_base = None

	        self.primary_source = None
	        self.secondary_source = None

	        self.primary_stem_output_path = None
	        self.secondary_stem_output_path = None