# VOICEVN — main/inference/convert.py
# (Removed non-Python Hugging Face file-viewer residue: "AnhP's picture /
#  Upload 65 files / 98bb602 verified / raw / history blame / 47.8 kB")
import gc
import re
import os
import sys
import time
import torch
import faiss
import shutil
import codecs
import pyworld
import librosa
import logging
import argparse
import warnings
import traceback
import torchcrepe
import subprocess
import parselmouth
import logging.handlers
import numpy as np
import soundfile as sf
import noisereduce as nr
import torch.nn.functional as F
import torch.multiprocessing as mp
from tqdm import tqdm
from scipy import signal
from torch import Tensor
from scipy.io import wavfile
from audio_upscaler import upscale
from distutils.util import strtobool
from fairseq import checkpoint_utils
from pydub import AudioSegment, silence
now_dir = os.getcwd()
sys.path.append(now_dir)
from main.configs.config import Config
from main.library.predictors.FCPE import FCPE
from main.library.predictors.RMVPE import RMVPE
from main.library.algorithm.synthesizers import Synthesizer
# Silence deprecation chatter and noisy third-party loggers.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("wget").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)
logging.getLogger("faiss").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("fairseq").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)
logging.getLogger("faiss.loader").setLevel(logging.ERROR)
# High-pass Butterworth filter applied to input audio before f0/feature extraction.
FILTER_ORDER = 5
CUTOFF_FREQUENCY = 48  # Hz
SAMPLE_RATE = 16000  # Hz — analysis rate used throughout the pipeline
bh, ah = signal.butter(N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE)
input_audio_path2wav = {}  # cache of raw waveforms keyed by input path (written in VC.get_f0)
log_file = os.path.join("assets", "logs", "convert.log")
logger = logging.getLogger(__name__)
logger.propagate = False
translations = Config().translations
# Configure console + rotating-file logging once per process.
# NOTE(review): when handlers already exist they are cleared and NOT re-installed,
# leaving the logger handler-less — confirm this is intended.
if logger.hasHandlers(): logger.handlers.clear()
else:
    console_handler = logging.StreamHandler()
    console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
    console_handler.setFormatter(console_formatter)
    console_handler.setLevel(logging.INFO)
    file_handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
    file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
    file_handler.setFormatter(file_formatter)
    file_handler.setLevel(logging.DEBUG)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(logging.DEBUG)
def _str_to_bool(value) -> bool:
    """Parse a textual boolean flag ("y/yes/t/true/on/1" or "n/no/f/false/off/0").

    Local replacement for distutils.util.strtobool, which was removed from the
    standard library in Python 3.12 (PEP 632). Raises ArgumentTypeError on
    anything else so argparse reports a clean usage error.
    """
    normalized = str(value).strip().lower()
    if normalized in ("y", "yes", "t", "true", "on", "1"):
        return True
    if normalized in ("n", "no", "f", "false", "off", "0"):
        return False
    raise argparse.ArgumentTypeError(f"invalid boolean value: {value!r}")

def parse_arguments() -> argparse.Namespace:
    """Define and parse the conversion CLI.

    Returns the parsed argparse.Namespace (the original annotation said
    ``tuple``, but ``parse_args`` returns a Namespace — fixed).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--pitch", type=int, default=0)
    parser.add_argument("--filter_radius", type=int, default=3)
    parser.add_argument("--index_rate", type=float, default=0.5)
    parser.add_argument("--volume_envelope", type=float, default=1)
    parser.add_argument("--protect", type=float, default=0.33)
    parser.add_argument("--hop_length", type=int, default=64)
    parser.add_argument("--f0_method", type=str, default="rmvpe")
    parser.add_argument("--input_path", type=str, required=True)
    parser.add_argument("--output_path", type=str, default="./audios/output.wav")
    parser.add_argument("--pth_path", type=str, required=True)
    parser.add_argument("--index_path", type=str, required=True)
    parser.add_argument("--f0_autotune", type=_str_to_bool, default=False)
    parser.add_argument("--f0_autotune_strength", type=float, default=1)
    parser.add_argument("--clean_audio", type=_str_to_bool, default=False)
    parser.add_argument("--clean_strength", type=float, default=0.7)
    parser.add_argument("--export_format", type=str, default="wav")
    parser.add_argument("--embedder_model", type=str, default="contentvec_base")
    parser.add_argument("--upscale_audio", type=_str_to_bool, default=False)
    parser.add_argument("--resample_sr", type=int, default=0)
    parser.add_argument("--batch_process", type=_str_to_bool, default=False)
    parser.add_argument("--batch_size", type=int, default=2)
    parser.add_argument("--split_audio", type=_str_to_bool, default=False)
    return parser.parse_args()
def main():
    """CLI entry point: parse arguments, fetch any missing model files, run the conversion."""
    args = parse_arguments()
    # Log every effective setting for debugging.
    logger.debug(f"{translations['pitch']}: {args.pitch}")
    logger.debug(f"{translations['filter_radius']}: {args.filter_radius}")
    logger.debug(f"{translations['index_strength']} {args.index_rate}")
    logger.debug(f"{translations['volume_envelope']}: {args.volume_envelope}")
    logger.debug(f"{translations['protect']}: {args.protect}")
    if args.f0_method in ("crepe", "crepe-tiny"): logger.debug(f"Hop length: {args.hop_length}")
    logger.debug(f"{translations['f0_method']}: {args.f0_method}")
    # NOTE: the original also logged the input path mislabeled as "f0_method"
    # here, duplicating the audio_path line below — removed as a copy-paste bug.
    logger.debug(f"{translations['audio_path']}: {args.input_path}")
    logger.debug(f"{translations['output_path']}: {args.output_path.replace('.wav', f'.{args.export_format}')}")
    logger.debug(f"{translations['model_path']}: {args.pth_path}")
    logger.debug(f"{translations['indexpath']}: {args.index_path}")
    logger.debug(f"{translations['autotune']}: {args.f0_autotune}")
    logger.debug(f"{translations['clear_audio']}: {args.clean_audio}")
    if args.clean_audio: logger.debug(f"{translations['clean_strength']}: {args.clean_strength}")
    logger.debug(f"{translations['export_format']}: {args.export_format}")
    logger.debug(f"{translations['hubert_model']}: {args.embedder_model}")
    logger.debug(f"{translations['upscale_audio']}: {args.upscale_audio}")
    if args.resample_sr != 0: logger.debug(f"{translations['sample_rate']}: {args.resample_sr}")
    if args.split_audio: logger.debug(f"{translations['batch_process']}: {args.batch_process}")
    if args.batch_process and args.split_audio: logger.debug(f"{translations['batch_size']}: {args.batch_size}")
    logger.debug(f"{translations['split_audio']}: {args.split_audio}")
    if args.f0_autotune: logger.debug(f"{translations['autotune_rate_info']}: {args.f0_autotune_strength}")
    # Download predictor / embedder checkpoints if they are missing locally.
    check_rmvpe_fcpe(args.f0_method)
    check_hubert(args.embedder_model)
    run_convert_script(pitch=args.pitch, filter_radius=args.filter_radius, index_rate=args.index_rate, volume_envelope=args.volume_envelope, protect=args.protect, hop_length=args.hop_length, f0_method=args.f0_method, input_path=args.input_path, output_path=args.output_path, pth_path=args.pth_path, index_path=args.index_path, f0_autotune=args.f0_autotune, f0_autotune_strength=args.f0_autotune_strength, clean_audio=args.clean_audio, clean_strength=args.clean_strength, export_format=args.export_format, embedder_model=args.embedder_model, upscale_audio=args.upscale_audio, resample_sr=args.resample_sr, batch_process=args.batch_process, batch_size=args.batch_size, split_audio=args.split_audio)
def check_rmvpe_fcpe(method):
    """Ensure the f0-predictor checkpoints required by ``method`` exist locally.

    Downloads rmvpe.pt / fcpe.pt into assets/model/predictors when missing.
    The download base URL is stored rot13-encoded in the source.

    Fixes over the original: raw regex string (invalid escape), no NameError on
    a malformed "hybrid[...]" spec, no shadowing of the ``method`` parameter,
    and a single shared download helper.
    """
    def _download(filename):
        # Only fetch when the checkpoint is not already on disk.
        target_dir = os.path.join("assets", "model", "predictors")
        if not os.path.exists(os.path.join(target_dir, filename)):
            subprocess.run(["wget", "-q", "--show-progress", "--no-check-certificate", codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Pbyno_EIP_Cebwrpg_2/erfbyir/znva/", "rot13") + filename, "-P", target_dir], check=True)

    if method == "rmvpe":
        _download("rmvpe.pt")
    elif method == "fcpe":
        _download("fcpe.pt")
    elif "hybrid" in method:
        match = re.search(r"hybrid\[(.+)\]", method)
        if not match:
            # Original raised NameError here (unbound `methods`); malformed
            # specs are now ignored and reported by the pipeline later.
            return
        for sub_method in (m.strip() for m in match.group(1).split("+")):
            if sub_method == "rmvpe":
                _download("rmvpe.pt")
            elif sub_method == "fcpe":
                _download("fcpe.pt")
def check_hubert(hubert):
    """Ensure the requested HuBERT/ContentVec embedder checkpoint exists locally.

    Downloads ``<hubert>.pt`` into assets/model/embedders when it is one of the
    supported embedders and the file is missing. Unknown names are ignored.
    """
    supported = ("contentvec_base", "hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base")
    if hubert in supported:
        model_path = os.path.join(now_dir, "assets", "model", "embedders", hubert + '.pt')
        if not os.path.exists(model_path):
            # URL is rot13-encoded in the source; wget places the file under assets/model/embedders.
            subprocess.run(["wget", "-q", "--show-progress", "--no-check-certificate", codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Pbyno_EIP_Cebwrpg_2/erfbyir/znva/", "rot13") + f"{hubert}.pt", "-P", os.path.join("assets", "model", "embedders")], check=True)
def load_audio_infer(file, sample_rate):
    """Read an audio file, downmix to mono, resample to ``sample_rate``, and
    return the samples as a flat float array.

    Raises RuntimeError (wrapping the original error) on any failure.
    """
    try:
        # Remove stray spaces/quotes/newlines that drag-and-dropped paths often carry.
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        if not os.path.isfile(file):
            raise FileNotFoundError(translations["not_found"].format(name=file))
        data, native_sr = sf.read(file)
        if len(data.shape) > 1:
            data = librosa.to_mono(data.T)
        if native_sr != sample_rate:
            data = librosa.resample(data, orig_sr=native_sr, target_sr=sample_rate)
    except Exception as e:
        raise RuntimeError(f"{translations['errors_loading_audio']}: {e}")
    return data.flatten()
def process_audio(file_path, output_path):
    """Split ``file_path`` on silence and export each non-silent chunk as wav
    into ``output_path``.

    Returns (chunk_file_paths, [(start_ms, end_ms), ...]) so the converted
    chunks can later be merged back onto the original timeline.
    """
    try:
        song = AudioSegment.from_file(file_path)
        detected = silence.detect_nonsilent(song, min_silence_len=750, silence_thresh=-70)
        min_chunk_duration = 30  # chunks shorter than this (ms) are skipped
        cut_files, time_stamps = [], []
        for i, (start_ms, end_ms) in enumerate(detected):
            segment = song[start_ms:end_ms]
            if len(segment) < min_chunk_duration:
                logger.debug(translations["skip_file"].format(i=i, chunk=len(segment)))
                continue
            chunk_path = os.path.join(output_path, f"chunk{i}.wav")
            if os.path.exists(chunk_path):
                os.remove(chunk_path)
            segment.export(chunk_path, format="wav")
            cut_files.append(chunk_path)
            time_stamps.append((start_ms, end_ms))
        logger.info(f"{translations['split_total']}: {len(cut_files)}")
        return cut_files, time_stamps
    except Exception as e:
        raise RuntimeError(f"{translations['process_audio_error']}: {e}")
def merge_audio(files_list, time_stamps, original_file_path, output_path, format):
    """Stitch converted segments back together in timeline order, padding the
    gaps (and the tail) with silence so the result matches the original length.

    Returns ``output_path``. Raises RuntimeError on any failure.
    """
    try:
        def _segment_index(filename):
            # Segment files are named "..._<n>.<ext>"; sort on that number.
            m = re.search(r'_(\d+)', filename)
            return int(m.group(1)) if m else 0

        ordered_files = sorted(files_list, key=_segment_index)
        total_duration = len(AudioSegment.from_file(original_file_path))
        combined = AudioSegment.empty()
        cursor = 0
        for file, (start_ms, end_ms) in zip(ordered_files, time_stamps):
            gap = start_ms - cursor
            if gap > 0:
                combined += AudioSegment.silent(duration=gap)
            combined += AudioSegment.from_file(file)
            cursor = end_ms
        if cursor < total_duration:
            combined += AudioSegment.silent(duration=total_duration - cursor)
        combined.export(output_path, format=format)
        return output_path
    except Exception as e:
        raise RuntimeError(f"{translations['merge_error']}: {e}")
def run_batch_convert(params):
    """Convert one pre-cut audio chunk described by ``params`` (a dict built by
    run_convert_script) and return the path of the converted segment.

    Deletes the input chunk afterwards; exits the process when the converted
    segment is missing.
    """
    converter = VoiceConverter()
    path = params["path"]
    cut_files = params["cut_files"]
    export_format = params["export_format"]
    # Segment index mirrors the chunk's position in the original cut list.
    segment_output_path = os.path.join(params["audio_temp"], f"output_{cut_files.index(path)}.{export_format}")
    if os.path.exists(segment_output_path):
        os.remove(segment_output_path)
    converter.convert_audio(
        pitch=params["pitch"],
        filter_radius=params["filter_radius"],
        index_rate=params["index_rate"],
        volume_envelope=params["volume_envelope"],
        protect=params["protect"],
        hop_length=params["hop_length"],
        f0_method=params["f0_method"],
        audio_input_path=path,
        audio_output_path=segment_output_path,
        model_path=params["pth_path"],
        index_path=params["index_path"],
        f0_autotune=params["f0_autotune"],
        f0_autotune_strength=params["f0_autotune_strength"],
        clean_audio=params["clean_audio"],
        clean_strength=params["clean_strength"],
        export_format=export_format,
        upscale_audio=params["upscale_audio"],
        embedder_model=params["embedder_model"],
        resample_sr=params["resample_sr"],
    )
    os.remove(path)  # the raw chunk is no longer needed
    if os.path.exists(segment_output_path):
        return segment_output_path
    logger.warning(f"{translations['not_found_convert_file']}: {segment_output_path}")
    sys.exit(1)
def run_convert_script(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, f0_autotune_strength, clean_audio, clean_strength, export_format, upscale_audio, embedder_model, resample_sr, batch_process, batch_size, split_audio):
    """Drive a full conversion run.

    ``input_path`` may be a single audio file or a directory of audio files.
    With ``split_audio`` the input is cut on silence, every chunk is converted
    (in a process pool when ``batch_process`` is set) and the results are
    merged back onto the original timeline.

    Fixes over the original version:
    - the sequential (non-batch) split path now collects the converted segment
      paths, so merge_audio no longer receives an empty list;
    - in directory mode the collected segments are reset per input file instead
      of accumulating across all files;
    - the duplicated split/convert/merge logic is factored into helpers.
    """
    cvt = VoiceConverter()
    start_time = time.time()
    # Refuse to run without a valid .pth model and .index file.
    if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith(".pth"):
        logger.warning(translations["provide_file"].format(filename=translations["model"]))
        sys.exit(1)
    if not index_path or not os.path.exists(index_path) or os.path.isdir(index_path) or not index_path.endswith(".index"):
        logger.warning(translations["provide_file"].format(filename=translations["index"]))
        sys.exit(1)
    # Make sure the output directory exists.
    output_dir = os.path.dirname(output_path)
    output_dir = output_path if not output_dir else output_dir
    if output_dir is None: output_dir = "audios"
    if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
    audio_temp = os.path.join("audios_temp")  # scratch directory for split chunks
    if not os.path.exists(audio_temp) and split_audio: os.makedirs(audio_temp, exist_ok=True)

    def _chunk_params(cut_files):
        # One kwargs dict per chunk, consumed by run_batch_convert.
        return [{"path": path, "audio_temp": audio_temp, "export_format": export_format, "cut_files": cut_files, "pitch": pitch, "filter_radius": filter_radius, "index_rate": index_rate, "volume_envelope": volume_envelope, "protect": protect, "hop_length": hop_length, "f0_method": f0_method, "pth_path": pth_path, "index_path": index_path, "f0_autotune": f0_autotune, "f0_autotune_strength": f0_autotune_strength, "clean_audio": clean_audio, "clean_strength": clean_strength, "upscale_audio": upscale_audio, "embedder_model": embedder_model, "resample_sr": resample_sr} for path in cut_files]

    def _convert_split(source_path, merged_output_path):
        # Split on silence, convert each chunk, merge back; always cleans audio_temp.
        try:
            cut_files, time_stamps = process_audio(source_path, audio_temp)
            params_list = _chunk_params(cut_files)
            processed_segments = []  # reset per call — the original leaked segments across files
            if batch_process:
                with mp.Pool(processes=min(batch_size, len(cut_files))) as pool:
                    with tqdm(total=len(params_list), desc=translations["convert_audio"]) as pbar:
                        for result in pool.imap_unordered(run_batch_convert, params_list):
                            processed_segments.append(result)
                            pbar.update(1)
            else:
                for params in tqdm(params_list, desc=translations["convert_audio"]):
                    # FIX: the original discarded the return value here, so the
                    # sequential path always merged an empty segment list.
                    processed_segments.append(run_batch_convert(params))
            merge_audio(processed_segments, time_stamps, source_path, merged_output_path, export_format)
        except Exception as e:
            logger.error(translations["error_convert_batch"].format(e=e))
        finally:
            if os.path.exists(audio_temp): shutil.rmtree(audio_temp, ignore_errors=True)

    def _convert_whole(source_path, target_path, announce=False):
        # Convert one file in a single pass; `announce` also logs and clears the target first.
        try:
            if announce:
                logger.info(f"{translations['convert_audio']} '{source_path}'...")
                if os.path.exists(target_path): os.remove(target_path)
            with tqdm(total=1, desc=translations["convert_audio"]) as pbar:
                cvt.convert_audio(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, volume_envelope=volume_envelope, protect=protect, hop_length=hop_length, f0_method=f0_method, audio_input_path=source_path, audio_output_path=target_path, model_path=pth_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, upscale_audio=upscale_audio, embedder_model=embedder_model, resample_sr=resample_sr)
                pbar.update(1)
        except Exception as e:
            logger.error(translations["error_convert"].format(e=e))

    if os.path.isdir(input_path):
        # Directory mode: convert every audio file found directly in input_path.
        try:
            logger.info(translations["convert_batch"])
            audio_files = [f for f in os.listdir(input_path) if f.endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]
            if not audio_files:
                logger.warning(translations["not_found_audio"])
                sys.exit(1)
            logger.info(translations["found_audio"].format(audio_files=len(audio_files)))
            for audio in audio_files:
                audio_path = os.path.join(input_path, audio)
                output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}")
                if split_audio: _convert_split(audio_path, output_audio)
                else: _convert_whole(audio_path, output_audio, announce=True)
            elapsed_time = time.time() - start_time
            logger.info(translations["convert_batch_success"].format(elapsed_time=f"{elapsed_time:.2f}", output_path=output_path.replace('.wav', f'.{export_format}')))
        except Exception as e:
            logger.error(translations["error_convert_batch_2"].format(e=e))
    else:
        # Single-file mode.
        logger.info(f"{translations['convert_audio']} '{input_path}'...")
        if not os.path.exists(input_path):
            logger.warning(translations["not_found_audio"])
            sys.exit(1)
        if os.path.isdir(output_path): output_path = os.path.join(output_path, f"output.{export_format}")
        if os.path.exists(output_path): os.remove(output_path)
        if split_audio: _convert_split(input_path, output_path.replace(".wav", f".{export_format}"))
        else: _convert_whole(input_path, output_path)
        elapsed_time = time.time() - start_time
        logger.info(translations["convert_audio_success"].format(input_path=input_path, elapsed_time=f"{elapsed_time:.2f}", output_path=output_path.replace('.wav', f'.{export_format}')))
def change_rms(source_audio: np.ndarray, source_rate: int, target_audio: np.ndarray, target_rate: int, rate: float) -> np.ndarray:
    """Blend the target audio's loudness envelope toward the source's.

    rate=1 leaves the target untouched; lower values pull the frame-wise RMS of
    the target toward the source signal's envelope. Returns the re-weighted
    target samples as a numpy array.
    """
    # Frame-wise RMS envelopes (half-second frames, half-second hop).
    source_rms = librosa.feature.rms(y=source_audio, frame_length=source_rate // 2 * 2, hop_length=source_rate // 2)
    target_rms = librosa.feature.rms(y=target_audio, frame_length=target_rate // 2 * 2, hop_length=target_rate // 2)

    def _per_sample(envelope):
        # Stretch an RMS envelope to one value per target sample.
        return F.interpolate(torch.from_numpy(envelope).float().unsqueeze(0), size=target_audio.shape[0], mode="linear").squeeze()

    source_env = _per_sample(source_rms)
    target_env = _per_sample(target_rms)
    # Floor the target envelope to avoid division by zero in the gain below.
    target_env = torch.maximum(target_env, torch.zeros_like(target_env) + 1e-6)
    gain = (torch.pow(source_env, 1 - rate) * torch.pow(target_env, rate - 1)).numpy()
    return target_audio * gain
class Autotune:
    """Snap an f0 contour toward a fixed table of reference note frequencies."""

    def __init__(self, ref_freqs):
        # ref_freqs: iterable of reference frequencies in Hz.
        self.ref_freqs = ref_freqs
        self.note_dict = self.ref_freqs  # alias kept for backward compatibility

    def autotune_f0(self, f0, f0_autotune_strength):
        """Pull each f0 value toward its nearest reference note.

        f0_autotune_strength: 0 leaves f0 unchanged, 1 snaps fully to the note.
        Returns an array with the same shape and dtype as ``f0``.

        Vectorized with numpy broadcasting (the original was an
        O(len(f0) * len(notes)) Python loop); ties break toward the first note
        in the table, matching ``min``'s behavior in the original.
        """
        notes = np.asarray(self.note_dict, dtype=np.float64)
        values = np.asarray(f0)
        # For each f0 value, index of the nearest reference note.
        nearest = notes[np.argmin(np.abs(notes[None, :] - values[:, None]), axis=1)]
        # Write through zeros_like to preserve the input dtype exactly,
        # as the original element-wise assignment did.
        autotuned_f0 = np.zeros_like(f0)
        autotuned_f0[:] = values + (nearest - values) * f0_autotune_strength
        return autotuned_f0
class VC:
    """RVC inference pipeline: f0 extraction, FAISS feature retrieval, synthesis.

    Operates at a fixed 16 kHz analysis rate with a 160-sample (10 ms) hop;
    long inputs are chunked at low-energy points and the converted pieces are
    concatenated (see pipeline()).
    """
    def __init__(self, tgt_sr, config):
        # Chunking / padding lengths (in seconds) come from the shared Config.
        self.x_pad = config.x_pad
        self.x_query = config.x_query
        self.x_center = config.x_center
        self.x_max = config.x_max
        self.is_half = config.is_half
        self.sample_rate = 16000  # analysis sample rate expected by the embedder / f0 models
        self.window = 160  # hop size in samples (10 ms at 16 kHz)
        self.t_pad = self.sample_rate * self.x_pad  # reflect padding, analysis-rate samples
        self.t_pad_tgt = tgt_sr * self.x_pad  # the same padding measured in output-rate samples
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sample_rate * self.x_query  # search span around each cut point
        self.t_center = self.sample_rate * self.x_center  # nominal spacing of cut points
        self.t_max = self.sample_rate * self.x_max  # inputs longer than this are split into chunks
        self.time_step = self.window / self.sample_rate * 1000  # hop expressed in milliseconds
        self.f0_min = 50
        self.f0_max = 1100
        # Mel-scale bounds used by get_f0 to quantize f0 into 255 coarse bins.
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
        self.device = config.device
        # Reference note frequencies (equal-tempered, roughly G1..C6) for autotune.
        self.ref_freqs = [
            49.00, 51.91, 55.00, 58.27, 61.74, 65.41, 69.30, 73.42, 77.78, 82.41,
            87.31, 92.50, 98.00, 103.83, 110.00, 116.54, 123.47, 130.81, 138.59,
            146.83, 155.56, 164.81, 174.61, 185.00, 196.00, 207.65, 220.00, 233.08,
            246.94, 261.63, 277.18, 293.66, 311.13, 329.63, 349.23, 369.99, 392.00,
            415.30, 440.00, 466.16, 493.88, 523.25, 554.37, 587.33, 622.25, 659.25,
            698.46, 739.99, 783.99, 830.61, 880.00, 932.33, 987.77, 1046.50
        ]
        self.autotune = Autotune(self.ref_freqs)
        self.note_dict = self.autotune.note_dict
    def get_f0_crepe(self, x, f0_min, f0_max, p_len, hop_length, model="full"):
        """Estimate f0 with torchcrepe and resample the contour to p_len frames.

        model: torchcrepe capacity, "full" or "tiny".
        Returns a float array of length p_len with unvoiced frames set to 0.
        """
        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)  # normalize by the 99.9th percentile to tame outliers
        audio = torch.from_numpy(x).to(self.device, copy=True)
        audio = torch.unsqueeze(audio, dim=0)
        # Downmix in the unlikely case a multi-channel tensor slips through.
        if audio.ndim == 2 and audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True).detach()
        audio = audio.detach()
        pitch: Tensor = torchcrepe.predict(audio, self.sample_rate, hop_length, f0_min, f0_max, model, batch_size=hop_length * 2, device=self.device, pad=True)
        p_len = p_len or x.shape[0] // hop_length
        source = np.array(pitch.squeeze(0).cpu().float().numpy())
        source[source < 0.001] = np.nan  # mark near-zero (unvoiced) frames
        # Linearly resample the contour onto exactly p_len frames.
        target = np.interp(
            np.arange(0, len(source) * p_len, len(source)) / p_len,
            np.arange(0, len(source)),
            source,
        )
        f0 = np.nan_to_num(target)  # unvoiced frames back to 0
        return f0
    def get_f0_hybrid(self, methods_str, x, f0_min, f0_max, p_len, hop_length, filter_radius):
        """Run every f0 estimator named in "hybrid[a+b+...]" and return the
        element-wise median of their resampled contours."""
        methods_str = re.search("hybrid\[(.+)\]", methods_str)
        # NOTE(review): if the pattern does not match, `methods` below stays
        # unbound and the next line raises NameError — confirm callers always
        # pass a well-formed hybrid spec.
        if methods_str: methods = [method.strip() for method in methods_str.group(1).split("+")]
        f0_computation_stack = []
        logger.debug(translations["hybrid_methods"].format(methods=methods))
        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)  # percentile normalization, same as get_f0_crepe
        for method in methods:
            f0 = None
            if method == "pm":
                # Praat autocorrelation pitch; padded to p_len frames below.
                f0 = (parselmouth.Sound(x, self.sample_rate).to_pitch_ac(time_step=self.window / self.sample_rate * 1000 / 1000, voicing_threshold=0.6, pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array["frequency"])
                pad_size = (p_len - len(f0) + 1) // 2
                if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
            elif method == 'dio':
                f0, t = pyworld.dio(x.astype(np.double), fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10)
                f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sample_rate)  # refine raw DIO estimates
                f0 = signal.medfilt(f0, 3)
            elif method == "crepe-tiny":
                f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny")
            elif method == "crepe":
                f0 = self.get_f0_crepe(x, f0_min, f0_max, p_len, int(hop_length))
            elif method == "fcpe":
                self.model_fcpe = FCPE(os.path.join("assets", "model", "predictors", "fcpe.pt"), hop_length=int(hop_length), f0_min=int(f0_min), f0_max=int(f0_max), dtype=torch.float32, device=self.device, sample_rate=self.sample_rate, threshold=0.03)
                f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
                # Free the FCPE model immediately; it is re-created per call.
                del self.model_fcpe
                gc.collect()
            elif method == "rmvpe":
                f0 = RMVPE(os.path.join("assets", "model", "predictors", "rmvpe.pt"), is_half=self.is_half, device=self.device).infer_from_audio(x, thred=0.03)
                f0 = f0[1:]  # drops the first frame — presumably to align with the other estimators; TODO confirm
            elif method == "harvest":
                f0, t = pyworld.harvest(x.astype(np.double), fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10)
                f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sample_rate)
                if filter_radius > 2: f0 = signal.medfilt(f0, 3)
            else: raise ValueError(translations["method_not_valid"])
            f0_computation_stack.append(f0)
        # Resample every contour to exactly p_len frames, then take the median.
        resampled_stack = []
        for f0 in f0_computation_stack:
            resampled_f0 = np.interp(np.linspace(0, len(f0), p_len), np.arange(len(f0)), f0)
            resampled_stack.append(resampled_f0)
        f0_median_hybrid = resampled_stack[0] if len(resampled_stack) == 1 else np.nanmedian(np.vstack(resampled_stack), axis=0)
        return f0_median_hybrid
    def get_f0(self, input_audio_path, x, p_len, pitch, f0_method, filter_radius, hop_length, f0_autotune, f0_autotune_strength):
        """Extract f0 with the chosen method, apply transpose/autotune, quantize.

        pitch: semitone transpose applied as f0 *= 2 ** (pitch / 12).
        Returns (f0_coarse, f0) where f0_coarse holds 1..255 mel-spaced bins for
        the synthesizer's pitch embedding and f0 is the continuous contour in Hz.
        """
        global input_audio_path2wav
        if f0_method == "pm":
            f0 = (parselmouth.Sound(x, self.sample_rate).to_pitch_ac(time_step=self.window / self.sample_rate * 1000 / 1000, voicing_threshold=0.6, pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array["frequency"])
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        elif f0_method == "dio":
            f0, t = pyworld.dio(x.astype(np.double), fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10)
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sample_rate)
            f0 = signal.medfilt(f0, 3)
        elif f0_method == "crepe-tiny":
            f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny")
        elif f0_method == "crepe":
            f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length))
        elif f0_method == "fcpe":
            self.model_fcpe = FCPE(os.path.join("assets", "model", "predictors", "fcpe.pt"), hop_length=int(hop_length), f0_min=int(self.f0_min), f0_max=int(self.f0_max), dtype=torch.float32, device=self.device, sample_rate=self.sample_rate, threshold=0.03)
            f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
            # Free the FCPE model immediately; it is re-created per call.
            del self.model_fcpe
            gc.collect()
        elif f0_method == "rmvpe":
            f0 = RMVPE(os.path.join("assets", "model", "predictors", "rmvpe.pt"), is_half=self.is_half, device=self.device).infer_from_audio(x, thred=0.03)
        elif f0_method == "harvest":
            f0, t = pyworld.harvest(x.astype(np.double), fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10)
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sample_rate)
            if filter_radius > 2: f0 = signal.medfilt(f0, 3)
        elif "hybrid" in f0_method:
            input_audio_path2wav[input_audio_path] = x.astype(np.double)  # module-level cache, written but not read here
            f0 = self.get_f0_hybrid(f0_method, x, self.f0_min, self.f0_max, p_len, hop_length, filter_radius)
        else: raise ValueError(translations["method_not_valid"])
        # NOTE(review): unbound-method call passing a VC instance as `self` —
        # it only works because VC also defines self.note_dict; consider
        # self.autotune.autotune_f0(...) instead.
        if f0_autotune: f0 = Autotune.autotune_f0(self, f0, f0_autotune_strength)
        f0 *= pow(2, pitch / 12)  # semitone transpose
        f0bak = f0.copy()
        # Quantize to 255 coarse mel-spaced bins (1 = unvoiced/minimum).
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (self.f0_mel_max - self.f0_mel_min) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(np.int32)
        return f0_coarse, f0bak
    def voice_conversion(self, model, net_g, sid, audio0, pitch, pitchf, index, big_npy, index_rate, version, protect):
        """Convert one audio chunk: embed with HuBERT, optionally blend with
        FAISS-retrieved training features, and synthesize with net_g.

        pitch/pitchf may be None (no pitch guidance). protect < 0.5 preserves
        the original embedder features on unvoiced frames.
        Returns the synthesized waveform as a float numpy array.
        """
        pitch_guidance = pitch != None and pitchf != None
        feats = (torch.from_numpy(audio0).half() if self.is_half else torch.from_numpy(audio0).float())
        if feats.dim() == 2: feats = feats.mean(-1)  # downmix stereo input
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            # v1 models use layer-9 features + a projection; v2 use layer 12 directly.
            "output_layer": 9 if version == "v1" else 12,
        }
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
        # Keep a copy of the raw features for the `protect` blend below.
        if protect < 0.5 and pitch_guidance: feats0 = feats.clone()
        if (not isinstance(index, type(None)) and not isinstance(big_npy, type(None)) and index_rate != 0):
            npy = feats[0].cpu().numpy()
            if self.is_half: npy = npy.astype("float32")  # FAISS needs float32
            # k-NN retrieval: replace each frame with a distance-weighted mix of
            # its 8 nearest training features, then blend by index_rate.
            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
            if self.is_half: npy = npy.astype("float16")
            feats = (torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats)
        # Upsample features 2x in time to match the synthesizer's frame rate.
        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if protect < 0.5 and pitch_guidance: feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            # Clamp frame count (and pitch tensors) to the features we actually have.
            p_len = feats.shape[1]
            if pitch_guidance:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]
        if protect < 0.5 and pitch_guidance:
            # On unvoiced frames (pitchf == 0) keep `protect` worth of the raw
            # features; voiced frames use the index-blended features fully.
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            audio1 = ((net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]).data.cpu().float().numpy()) if pitch_guidance else ((net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy())
        del feats, p_len, padding_mask
        if torch.cuda.is_available(): torch.cuda.empty_cache()
        return audio1
    def pipeline(self, model, net_g, sid, audio, input_audio_path, pitch, f0_method, file_index, index_rate, pitch_guidance, filter_radius, tgt_sr, resample_sr, volume_envelope, version, protect, hop_length, f0_autotune, f0_autotune_strength):
        """Full conversion of one waveform: filter, chunk, extract f0, convert
        each chunk via voice_conversion(), then post-process.

        Returns int16 samples at tgt_sr (or resample_sr when resampling applies).
        """
        # Load the FAISS retrieval index once per call, if configured.
        if file_index != "" and os.path.exists(file_index) and index_rate != 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception as e:
                logger.error(translations["read_faiss_index_error"].format(e=e))
                index = big_npy = None
        else: index = big_npy = None
        audio = signal.filtfilt(bh, ah, audio)  # high-pass (48 Hz) to remove DC/rumble
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            # Long input: pick cut points near t_center multiples where the
            # windowed energy sum is smallest, to split at quiet moments.
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query : t + self.t_query]) == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min())[0][0])
        s = 0
        audio_opt = []
        t = None
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        if pitch_guidance:
            pitch, pitchf = self.get_f0(input_audio_path, audio_pad, p_len, pitch, f0_method, filter_radius, hop_length, f0_autotune, f0_autotune_strength)
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps": pitchf = pitchf.astype(np.float32)  # MPS backend lacks float64 support
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        # Convert chunk by chunk; t_pad_tgt trims the reflect padding from each output.
        for t in opt_ts:
            t = t // self.window * self.window  # align cut to the hop grid
            if pitch_guidance: audio_opt.append(self.voice_conversion(model, net_g, sid, audio_pad[s : t + self.t_pad2 + self.window], pitch[:, s // self.window : (t + self.t_pad2) // self.window], pitchf[:, s // self.window : (t + self.t_pad2) // self.window], index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt])
            else: audio_opt.append(self.voice_conversion(model, net_g, sid, audio_pad[s : t + self.t_pad2 + self.window], None, None, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt])
            s = t
        # Final (or only) chunk: t is None when the input was short enough to skip chunking.
        if pitch_guidance: audio_opt.append(self.voice_conversion(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window :] if t is not None else pitch, pitchf[:, t // self.window :] if t is not None else pitchf, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt])
        else: audio_opt.append(self.voice_conversion(model, net_g, sid, audio_pad[t:], None, None, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt])
        audio_opt = np.concatenate(audio_opt)
        if volume_envelope != 1: audio_opt = change_rms(audio, self.sample_rate, audio_opt, tgt_sr, volume_envelope)
        if resample_sr >= self.sample_rate and tgt_sr != resample_sr: audio_opt = librosa.resample(audio_opt, orig_sr=tgt_sr, target_sr=resample_sr)
        # Scale to int16, attenuating only if the signal would clip.
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1: max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        if pitch_guidance: del pitch, pitchf
        del sid
        if torch.cuda.is_available(): torch.cuda.empty_cache()
        return audio_opt
class VoiceConverter:
    """High-level wrapper around RVC-style voice conversion.

    Owns the lazily-loaded HuBERT embedder, the Synthesizer generator restored
    from a ``.pth`` checkpoint, and a ``VC`` pipeline instance, and exposes
    ``convert_audio`` as the single entry point for converting one audio file.
    Model state is cached between calls; reloading happens only when the
    requested weight file changes (see ``get_vc``).
    """

    def __init__(self):
        # Project-wide runtime configuration (device, fp16 flag, translations).
        self.config = Config()
        # fairseq HuBERT embedder; loaded on first use by load_hubert().
        self.hubert_model = (None)
        # Target sample rate taken from the checkpoint config (last entry).
        self.tgt_sr = None
        # Synthesizer generator network restored from the checkpoint.
        self.net_g = None
        # VC pipeline object driving the actual conversion.
        self.vc = None
        # Raw checkpoint dict as returned by torch.load().
        self.cpt = None
        # Model version string from the checkpoint ("v1" or "v2").
        self.version = None
        # Number of speakers in the checkpoint (config[-3]).
        self.n_spk = None
        # Whether the model was trained with pitch (f0) guidance.
        self.use_f0 = None
        # Path of the weight file currently loaded, to avoid redundant reloads.
        self.loaded_model = None

    def load_hubert(self, embedder_model):
        """Load the HuBERT embedder ``assets/model/embedders/<embedder_model>.pt``.

        Moves the model to the configured device, matches the synthesizer's
        precision (fp16 vs fp32), and puts it in eval mode.

        Raises:
            ImportError: with a translated message if fairseq fails to load
                the checkpoint for any reason.
        """
        try:
            models, _, _ = checkpoint_utils.load_model_ensemble_and_task([os.path.join(now_dir, "assets", "model", "embedders", embedder_model + '.pt')], suffix="")
        except Exception as e:
            raise ImportError(translations["read_model_error"].format(e=e))
        self.hubert_model = models[0].to(self.config.device)
        # Keep embedder precision consistent with the generator network.
        self.hubert_model = (self.hubert_model.half() if self.config.is_half else self.hubert_model.float())
        self.hubert_model.eval()

    @staticmethod
    def remove_audio_noise(input_audio_path, reduction_strength=0.7):
        """Denoise a WAV file with noisereduce.

        Returns the denoised sample array, or None (after logging) if reading
        or denoising fails — callers treat None as "keep the original audio".
        """
        try:
            rate, data = wavfile.read(input_audio_path)
            reduced_noise = nr.reduce_noise(y=data, sr=rate, prop_decrease=reduction_strength)
            return reduced_noise
        except Exception as e:
            logger.error(translations["denoise_error"].format(e=e))
            return None

    @staticmethod
    def convert_audio_format(input_path, output_path, output_format):
        """Convert ``input_path`` to ``output_format``, writing ``output_path``.

        When the target format is not "wav", the audio is first resampled to
        the nearest common sample rate (soundfile can reject unusual rates for
        some encoders). Returns the output path.

        Raises:
            RuntimeError: with a translated message on any read/resample/write
                failure.
        """
        try:
            if output_format != "wav":
                logger.debug(translations["change_format"].format(output_format=output_format))
                audio, sample_rate = sf.read(input_path)
                common_sample_rates = [
                    8000,
                    11025,
                    12000,
                    16000,
                    22050,
                    24000,
                    32000,
                    44100,
                    48000
                ]
                # Snap to the closest standard rate before encoding.
                target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sr)
                sf.write(output_path, audio, target_sr, format=output_format)
            return output_path
        except Exception as e:
            raise RuntimeError(translations["change_format_error"].format(e=e))

    def convert_audio(self, audio_input_path, audio_output_path, model_path, index_path, embedder_model, pitch, f0_method, index_rate, volume_envelope, protect, hop_length, f0_autotune, f0_autotune_strength, filter_radius, clean_audio, clean_strength, export_format, upscale_audio, resample_sr = 0, sid = 0):
        """Convert one audio file through the loaded voice model.

        Loads/reuses the model for ``model_path``, optionally upscales the
        input, runs the VC pipeline, then optionally denoises and converts the
        result to ``export_format``. Errors are logged (with traceback), not
        re-raised; on failure nothing may be written to ``audio_output_path``.
        """
        self.get_vc(model_path, sid)
        try:
            # NOTE(review): upscales in place, overwriting the input file.
            if upscale_audio: upscale(audio_input_path, audio_input_path)
            # The pipeline's feature extractor works at 16 kHz.
            audio = load_audio_infer(audio_input_path, 16000)
            # Normalize peaks above 0.95 to avoid clipping downstream.
            audio_max = np.abs(audio).max() / 0.95
            if audio_max > 1: audio /= audio_max
            if not self.hubert_model:
                if not os.path.exists(os.path.join(now_dir, "assets", "model", "embedders", embedder_model + '.pt')): raise FileNotFoundError(f"Không tìm thấy mô hình: {embedder_model}")
                self.load_hubert(embedder_model)
            # Chained comparison: equivalent to
            # (self.tgt_sr != resample_sr) and (resample_sr >= 16000),
            # i.e. only adopt resample_sr as the output rate when it is a
            # sensible rate that differs from the checkpoint's rate.
            if self.tgt_sr != resample_sr >= 16000: self.tgt_sr = resample_sr
            # Users often pass the "trained" index; the usable one is "added".
            file_index = (index_path.strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added"))
            audio_opt = self.vc.pipeline(model=self.hubert_model, net_g=self.net_g, sid=sid, audio=audio, input_audio_path=audio_input_path, pitch=pitch, f0_method=f0_method, file_index=file_index, index_rate=index_rate, pitch_guidance=self.use_f0, filter_radius=filter_radius, tgt_sr=self.tgt_sr, resample_sr=resample_sr, volume_envelope=volume_envelope, version=self.version, protect=protect, hop_length=hop_length, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength)
            if audio_output_path: sf.write(audio_output_path, audio_opt, self.tgt_sr, format="wav")
            if clean_audio:
                # Denoise the written WAV; keep the original on denoise failure.
                cleaned_audio = self.remove_audio_noise(audio_output_path, clean_strength)
                if cleaned_audio is not None: sf.write(audio_output_path, cleaned_audio, self.tgt_sr, format="wav")
            output_path_format = audio_output_path.replace(".wav", f".{export_format}")
            audio_output_path = self.convert_audio_format(audio_output_path, output_path_format, export_format)
        except Exception as e:
            logger.error(translations["error_convert"].format(e=e))
            logger.error(traceback.format_exc())

    def get_vc(self, weight_root, sid):
        """Ensure the model at ``weight_root`` is loaded and the VC pipeline is ready.

        An empty ``sid`` ("" or []) is treated as a request to unload the
        current model. Reloads only when ``weight_root`` differs from the
        currently loaded path.
        """
        if sid == "" or sid == []:
            self.cleanup_model()
            if torch.cuda.is_available(): torch.cuda.empty_cache()
        if not self.loaded_model or self.loaded_model != weight_root:
            self.load_model(weight_root)
            if self.cpt is not None:
                self.setup_network()
                self.setup_vc_instance()
            self.loaded_model = weight_root

    def cleanup_model(self):
        """Release model objects and free CUDA memory.

        The del/None ordering is deliberate: drop references first so the
        tensors become collectable, then empty the CUDA cache. ``net_g``/``cpt``
        attributes always exist (set in __init__), so the second ``del`` is
        safe even when the first branch was skipped.
        """
        if self.hubert_model is not None:
            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
            if torch.cuda.is_available(): torch.cuda.empty_cache()
        del self.net_g, self.cpt
        if torch.cuda.is_available(): torch.cuda.empty_cache()
        self.cpt = None

    def load_model(self, weight_root):
        """Load the checkpoint dict from ``weight_root`` (None if file missing)."""
        self.cpt = (torch.load(weight_root, map_location="cpu") if os.path.isfile(weight_root) else None)

    def setup_network(self):
        """Build the Synthesizer generator from the loaded checkpoint.

        Reads sample rate / speaker count / f0 flag / version from the
        checkpoint, instantiates the generator, drops the training-only
        posterior encoder (enc_q), loads weights non-strictly, and moves the
        model to the configured device/precision.
        """
        if self.cpt is not None:
            # config[-1] is the model's native sample rate.
            self.tgt_sr = self.cpt["config"][-1]
            # Patch speaker count from the actual embedding table shape.
            self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
            self.use_f0 = self.cpt.get("f0", 1)
            self.version = self.cpt.get("version", "v1")
            # v2 models use 768-dim HuBERT features, v1 uses 256.
            self.text_enc_hidden_dim = 768 if self.version == "v2" else 256
            self.net_g = Synthesizer(*self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=self.text_enc_hidden_dim, is_half=self.config.is_half)
            # enc_q is only needed during training; remove it for inference.
            del self.net_g.enc_q
            self.net_g.load_state_dict(self.cpt["weight"], strict=False)
            self.net_g.eval().to(self.config.device)
            self.net_g = (self.net_g.half() if self.config.is_half else self.net_g.float())

    def setup_vc_instance(self):
        """Create the VC pipeline for the loaded checkpoint and cache n_spk."""
        if self.cpt is not None:
            self.vc = VC(self.tgt_sr, self.config)
            self.n_spk = self.cpt["config"][-3]
if __name__ == "__main__":
    # "spawn" avoids CUDA state being inherited by forked workers, which
    # breaks torch multiprocessing on Linux; force=True overrides any method
    # set earlier in the process.
    mp.set_start_method("spawn", force=True)
    # main() is defined elsewhere in this module (outside this chunk).
    main()