import os
import sys
from dotenv import load_dotenv
import requests
import wave
import zipfile
import tempfile

now_dir = os.getcwd()
# Project-local packages imported below are resolved from the working directory.
sys.path.append(now_dir)
load_dotenv()

# NOTE(review): the module path of this import was missing in the reviewed
# source ("from import VC"); the path below is an assumption -- confirm it
# against the repository layout.
from infer.modules.vc.modules import VC
from infer.modules.uvr5.modules import UVRHANDLER
from infer.lib.train.process_ckpt import (
    change_info,
    extract_small_model,
    merge,
    show_info,
)
from i18n.i18n import I18nAuto
from configs.config import Config
from sklearn.cluster import MiniBatchKMeans
import torch
import numpy as np
import gradio as gr
import faiss
import fairseq
import librosa
import librosa.display
import pathlib
import json
from pydub import AudioSegment
from time import sleep
from subprocess import Popen
from random import shuffle
import warnings
import traceback
import threading
import shutil
import logging
import matplotlib.pyplot as plt
import soundfile as sf
from tools import pretrain_helper
import edge_tts
import asyncio

# NOTE(review): the module path of this import was also missing in the reviewed
# source ("from import tts_order_voice"); the path below is an assumption --
# confirm it against the repository layout.
from infer.modules.vc.ilariatts import tts_order_voice

# Mapping of TTS voice label -> edge-tts voice id; the labels feed the UI dropdown.
language_dict = tts_order_voice
ilariavoices = list(language_dict.keys())

logging.getLogger("numba").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Start every run with an empty scratch directory and expose it through TEMP.
tmp = os.path.join(now_dir, "TEMP")
shutil.rmtree(tmp, ignore_errors=True)
shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % now_dir, ignore_errors=True)
os.makedirs(tmp, exist_ok=True)
os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
os.makedirs(os.path.join(now_dir, "models/pth"), exist_ok=True)
os.environ["TEMP"] = tmp
warnings.filterwarnings("ignore")
torch.manual_seed(114514)

config = Config()
vc = VC(config)

weight_root = os.getenv("weight_root")
weight_uvr5_root = os.getenv("weight_uvr5_root")
index_root = os.getenv("index_root")


def _list_voice_models():
    """Return the ``.pth`` file names found directly inside ``weight_root``."""
    return [name for name in os.listdir(weight_root) if name.endswith(".pth")]


def _list_index_files():
    """Return every ``.index`` path under ``index_root`` whose name lacks "trained"."""
    return [
        "%s/%s" % (root, name)
        for root, _dirs, files in os.walk(index_root, topdown=False)
        for name in files
        if name.endswith(".index") and "trained" not in name
    ]


# The original computed these two lists three times at import time; the final
# values (used by the UI dropdowns) are the ones produced here.
names = _list_voice_models()
index_paths = _list_index_files()

uvr5_names = [
    '5_HP-Karaoke-UVR.pth',
    'Kim_Vocal_2.onnx',
    'MDX23C-8KFFT-InstVoc_HQ_2.ckpt',
    'UVR-DeEcho-DeReverb.pth',
    'UVR-Denoise',
]

if config.dml:

    def forward_dml(ctx, x, scale):
        """Replacement ``GradMultiply.forward`` used when ``config.dml`` is set."""
        ctx.scale = scale
        res = x.clone().detach()
        return res

    fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml

i18n = I18nAuto()

# Detect CUDA devices whose name contains one of the accepted model markers.
ngpu = torch.cuda.device_count()
gpu_infos = []
mem = []
if_gpu_ok = False
if torch.cuda.is_available() or ngpu != 0:
    for i in range(ngpu):
        gpu_name = torch.cuda.get_device_name(i)
        if any(
            value in gpu_name.upper()
            for value in [
                "10", "16", "20", "30", "40", "A2", "A3", "A4", "P4",
                "A50", "500", "A60", "70", "80", "90", "M4", "T4", "TITAN",
            ]
        ):
            if_gpu_ok = True
            gpu_infos.append("%s\t%s" % (i, gpu_name))
            # Total device memory rounded to whole GiB.
            mem.append(
                int(
                    torch.cuda.get_device_properties(i).total_memory
                    / 1024 / 1024 / 1024
                    + 0.4
                )
            )
if if_gpu_ok and len(gpu_infos) > 0:
    gpu_info = "\n".join(gpu_infos)
    default_batch_size = ((min(mem) // 2 + 1) // 2) * 2
else:
    gpu_info = i18n("Your GPU doesn't work for training")
    default_batch_size = 1
# Each entry is "<index>\t<name>"; take the whole index, not just its first
# character, so device ids >= 10 are not truncated.
gpus = "-".join(info.split("\t", 1)[0] for info in gpu_infos)


class ToolButton(gr.Button, gr.components.FormComponent):
    """Button that is always created with ``variant="tool"``."""

    def __init__(self, **kwargs):
        super().__init__(variant="tool", **kwargs)

    def get_block_name(self):
        """Report the block name "button"."""
        return "button"


audio_root = "audios"
sup_audioext = {'wav', 'mp3', 'flac', 'ogg', 'opus', 'm4a', 'mp4', 'aac', 'alac', 'wma', 'aiff', 'webm', 'ac3'}

indexes_list = [
    os.path.join(root, name)
    for root, _, files in os.walk(index_root, topdown=False)
    for name in files
    if name.endswith(".index") and "trained" not in name
]
audio_paths = [
    os.path.join(root, name)
    for root, _, files in os.walk(audio_root, topdown=False)
    for name in files
    if name.endswith(tuple(sup_audioext))
]


def get_pretrained_files(directory, keyword, filter_str):
    """Map file name -> path for ``.pth`` files in ``directory``.

    Only files whose name contains both ``keyword`` and ``filter_str`` are kept.
    """
    return {
        filename: os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith(".pth") and keyword in filename and filter_str in filename
    }


pretrained_directory = "assets/pretrained_v2"
pretrained_path = {
    filename: os.path.join(pretrained_directory, filename)
    for filename in os.listdir(pretrained_directory)
}
pretrained_G_files = get_pretrained_files(pretrained_directory, "G", "f0")
pretrained_D_files = get_pretrained_files(pretrained_directory, "D", "f0")


def get_pretrained_models(path_str, f0_str, sr2):
    """Return the ``(generator_path, discriminator_path)`` pair for ``sr2``.

    The generator file name comes from ``pretrain_helper.get_pretrained_models``;
    the discriminator name is derived from it by replacing "G" with "D".
    When no entry exists for ``sr2`` a warning is logged and ``("", "")`` is
    returned. The original returned the pretrained directory itself in that
    case, which is not an empty string and therefore did not read as
    "no pretrained model" to callers.

    ``path_str`` is accepted for interface compatibility and is not used.
    """
    sr_mapping = pretrain_helper.get_pretrained_models(f0_str)
    pretrained_G_filename = sr_mapping.get(sr2, "")
    if not pretrained_G_filename:
        logging.warning(f"Pretrained models not found for sample rate {sr2}, will not use pretrained models")
        return "", ""
    pretrained_D_filename = pretrained_G_filename.replace("G", "D")
    return (
        os.path.join(pretrained_directory, pretrained_G_filename),
        os.path.join(pretrained_directory, pretrained_D_filename),
    )


def generate_spectrogram_and_get_info(audio_file):
    """Render a mel spectrogram of ``audio_file`` and summarise its properties.

    Returns:
        A ``(markdown_table, image_path)`` tuple; the image is always written
        to ``spectrogram.png`` in the working directory.
    """
    y, sr = librosa.load(audio_file, sr=None)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=256)
    log_S = librosa.amplitude_to_db(S, ref=np.max, top_db=256)

    fig = plt.figure(figsize=(12, 5.5))
    try:
        librosa.display.specshow(log_S, sr=sr, x_axis='time')
        plt.colorbar(format='%+2.0f dB', pad=0.01)
        plt.tight_layout(pad=0.5)
        plt.savefig('spectrogram.png', dpi=500)
    finally:
        # Without this every call leaves another open figure behind.
        plt.close(fig)

    # NOTE(review): the call producing ``audio_info`` was missing in the
    # reviewed source; ``sf.info`` is inferred from the attributes read below
    # (subtype, duration, samplerate, channels) -- confirm.
    audio_info = sf.info(audio_file)
    # Subtypes other than the two listed are reported with a bit depth of 0.
    bit_depth = {'PCM_16': 16, 'FLOAT': 32}.get(audio_info.subtype, 0)

    minutes, seconds = divmod(audio_info.duration, 60)
    seconds, milliseconds = divmod(seconds, 1)
    milliseconds *= 1000

    speed_in_kbps = audio_info.samplerate * bit_depth / 1000
    filename_without_extension, _ = os.path.splitext(os.path.basename(audio_file))

    # NOTE(review): the line breaks of the original table literal were lost in
    # the reviewed source; one row per line is assumed here.
    info_table = "\n".join(
        [
            "",
            "| Information | Value |",
            "| :---: | :---: |",
            f"| File Name | {filename_without_extension} |",
            f"| Duration | {int(minutes)} minutes - {int(seconds)} seconds - {int(milliseconds)} milliseconds |",
            f"| Bitrate | {speed_in_kbps} kbp/s |",
            f"| Audio Channels | {audio_info.channels} |",
            f"| Samples per second | {audio_info.samplerate} Hz |",
            f"| Bit per second | {audio_info.samplerate * audio_info.channels * bit_depth} bit/s |",
            "",
        ]
    )
    return info_table, "spectrogram.png"


def change_choices():
    """Rescan models, indexes and audio files and return three dropdown updates."""
    audio_dir = os.path.join(now_dir, "audios")
    # A missing audio folder yields an empty list instead of an exception.
    audio_files = os.listdir(audio_dir) if os.path.isdir(audio_dir) else []
    audios = [os.path.join(audio_root, file) for file in audio_files]
    return (
        {"choices": sorted(_list_voice_models()), "__type__": "update"},
        {"choices": sorted(_list_index_files()), "__type__": "update"},
        {"choices": sorted(audios), "__type__": "update"},
    )


def tts_and_convert(ttsvoice, text, spk_item, vc_transform, f0_file, f0method, file_index1, file_index2, index_rate, filter_radius, resample_sr, rms_mix_rate, protect):
    """Synthesise ``text`` with edge-tts, then run voice conversion on the result.

    The speech is saved to ``./TEMP/temp_ilariatts.mp3`` and that path is passed
    to ``vc.vc_single`` together with the remaining inference settings; the
    return value of ``vc.vc_single`` is returned unchanged.
    """
    vo = language_dict[ttsvoice]
    # NOTE(review): the start of this statement was missing in the reviewed
    # source (only ", vo).save(...))" survived); the call is reconstructed from
    # the `edge_tts` / `asyncio` imports -- confirm.
    asyncio.run(edge_tts.Communicate(text, vo).save("./TEMP/temp_ilariatts.mp3"))
    aud_path = './TEMP/temp_ilariatts.mp3'

    # NOTE(review): the return value of update() is discarded here, so this
    # call looks like a no-op -- confirm whether it can be removed.
    vc_output1.update("Text converted successfully!")

    # Same call as regular inference, which is why every inference setting is
    # part of this function's signature.
    return vc.vc_single(spk_item, None, aud_path, vc_transform, f0_file, f0method, file_index1, file_index2, index_rate, filter_radius, resample_sr, rms_mix_rate, protect)


def _move_if_absent(source_path, destination_dir):
    """Move ``source_path`` into ``destination_dir`` unless the name is taken."""
    os.makedirs(destination_dir, exist_ok=True)
    destination = destination_dir + os.path.basename(source_path)
    if os.path.exists(destination):
        print(f"File {destination} already exists. Skipping.")
        return
    # shutil.move also works when source and destination are on different
    # filesystems, where os.rename raises OSError.
    shutil.move(source_path, destination)


def import_files(file):
    """Install an uploaded ``.zip``, ``.pth`` or ``.index`` file.

    ``.pth`` files go to ``./models/pth/`` and ``.index`` files to
    ``./models/index/``; files whose name already exists there are skipped.

    Args:
        file: uploaded file object exposing its path as ``.name``, or ``None``.
            NOTE(review): the ``.name`` accesses were missing in the reviewed
            source and are reconstructed -- confirm.

    Returns:
        A status message for the UI.
    """
    if file is None:
        return "No file has been uploaded."

    upload_path = file.name
    if upload_path.endswith('.zip'):
        # Extract into a private scratch directory. The original extracted into
        # ./TEMP and then deleted it, removing the directory that the rest of
        # the app (TEMP env var, TTS output) relies on.
        with tempfile.TemporaryDirectory() as extract_dir:
            with zipfile.ZipFile(upload_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
            for root, _dirs, files in os.walk(extract_dir):
                for member in files:
                    if member.endswith('.pth'):
                        _move_if_absent(os.path.join(root, member), './models/pth/')
                    elif member.endswith('.index'):
                        _move_if_absent(os.path.join(root, member), './models/index/')
        return "Zip file has been successfully extracted."
    if upload_path.endswith('.pth'):
        _move_if_absent(upload_path, './models/pth/')
        return "PTH file has been successfully imported."
    if upload_path.endswith('.index'):
        _move_if_absent(upload_path, './models/index/')
        return "Index file has been successfully imported."
    return "Unsupported file type."
def import_button_click(file):
    """Click handler: forward the uploaded file to ``import_files``."""
    return import_files(file)


def calculate_remaining_time(epochs, seconds_per_epoch):
    """Format ``epochs * seconds_per_epoch`` as a rough hours/minutes string.

    Seconds are dropped; durations under one hour are reported in minutes only.
    """
    total_seconds = epochs * seconds_per_epoch
    hours, remainder = divmod(total_seconds, 3600)
    minutes = remainder // 60

    if hours == 0:
        return f"{int(minutes)} minutes"
    if hours == 1:
        return f"{int(hours)} hour and {int(minutes)} minutes"
    return f"{int(hours)} hours and {int(minutes)} minutes"


def get_audio_duration(audio_file_path):
    """Return the duration of ``audio_file_path`` in minutes."""
    # NOTE(review): the call producing ``audio_info`` was missing in the
    # reviewed source (the statement read ``audio_info`` before assigning it);
    # ``sf.info`` is inferred from the ``.duration`` attribute and the file's
    # ``soundfile`` import -- confirm.
    audio_info = sf.info(audio_file_path)
    return audio_info.duration / 60


def clean():
    """Return an update payload that resets a component's value to ""."""
    return {"value": "", "__type__": "update"}


# Sample rate in Hz for every "Sample Rate & Pretrain" choice.
sr_dict = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
    "OV2-32k": 32000,
    "OV2-40k": 40000,
    "RIN-40k": 40000,
    "Snowie-40k": 40000,
    "Snowie-48k": 48000,
    "SnowieV3.1-40k": 40000,
    "SnowieV3.1-32k": 32000,
    "SnowieV3.1-48k": 48000,
    "SnowieV3.1-RinE3-40K": 40000,
    "Italia-32k": 32000,
}


def durations(sample_rate, model_options, qualities, duration):
    """Pick an option from ``duration`` and ``sample_rate``.

    Returns ``qualities['short']`` when ``duration <= 350``; otherwise the
    ``model_options`` entry for 32k/40k/48k, or ``qualities['other']`` for any
    other sample rate.
    """
    if duration <= 350:
        return qualities['short']

    option_key = {32000: '32k', 40000: '40k', 48000: '48k'}.get(sample_rate)
    if option_key is None:
        return qualities['other']
    return model_options[option_key]


def get_training_info(audio_file):
    """Recommend a pretrain and epoch count for ``audio_file``.

    The recommendation is looked up from the file's duration in minutes; files
    of 60 minutes or more fall outside the table.

    Returns:
        A message for the UI (recommendation, or an explanation of why none
        could be made).
    """
    if audio_file is None:
        return 'Please provide an audio file!'

    duration = get_audio_duration(audio_file)
    # NOTE(review): the opener was missing in the reviewed source (only
    # ", 'rb').getframerate()" survived); ``wave.open`` is inferred from that
    # method and the file's ``wave`` import -- confirm. The context manager
    # closes the handle, which the one-line form never did.
    with wave.open(audio_file, 'rb') as wav_file:
        sample_rate = wav_file.getframerate()

    # (min_minutes, max_minutes) -> (epochs, pretrain); upper bound exclusive.
    training_info = {
        (0, 2): (150, 'OV2'),
        (2, 3): (200, 'OV2'),
        (3, 5): (250, 'OV2'),
        (5, 10): (300, 'Normal'),
        (10, 25): (500, 'Normal'),
        (25, 45): (700, 'Normal'),
        (45, 60): (1000, 'Normal'),
    }
    for (min_duration, max_duration), (epochs, pretrain) in training_info.items():
        if min_duration <= duration < max_duration:
            return f'You should use the **{pretrain}** pretrain with **{epochs}** epochs at **{sample_rate/1000}khz** sample rate.'
    return 'Duration is not within the specified range!'
def if_done(done, p):
    """Poll process ``p`` every 0.5 s until it exits, then set ``done[0] = True``."""
    while p.poll() is None:
        sleep(0.5)
    done[0] = True


def on_button_click(audio_file_path):
    """Click handler: return the training recommendation for the given file."""
    return get_training_info(audio_file_path)


def download_from_url(url, model):
    """Download a model archive and install its ``.pth`` / ``.index`` files.

    The archive is saved under ``./zips`` and unpacked into ``./unzips``.
    Every ``.index`` file is copied to ``./models/index/<model>.index`` and
    every ``.pth`` file whose name contains neither "G_" nor "D_" to
    ``./models/pth/<model>.pth``. Both scratch directories are removed
    afterwards, on success and on failure.

    Args:
        url: download link; surrounding whitespace is ignored.
        model: name to store the model under.

    Returns:
        A status message for the UI; failures are reported as "ERROR - ..."
        strings rather than raised.
    """
    # Only `Popen` is imported at file level, so the module is imported here;
    # the original referenced `subprocess` without it being in scope.
    import subprocess

    # Strip before validating so a whitespace-only value counts as empty.
    url = (url or "").strip()
    model = (model or "").strip()
    if url == '':
        return "URL cannot be left empty."
    if model == '':
        return "You need to name your model. For example: Ilaria"

    zip_dirs = ["zips", "unzips"]
    for directory in zip_dirs:
        shutil.rmtree(directory, ignore_errors=True)
        os.makedirs(directory, exist_ok=True)

    # Named `zip_name` so the `zipfile` module is not shadowed.
    zip_name = model + '.zip'
    zipfile_path = './zips/' + zip_name

    try:
        # NOTE(review): both host-name literals and the `subprocess.run(` call
        # were missing in the reviewed source (`if "" in url:`); the values
        # below are reconstructed from the gdown / Mega branches -- confirm.
        if "drive.google.com" in url:
            # check=True makes a failed gdown run raise CalledProcessError,
            # which the dedicated handler below reports.
            subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path], check=True)
        elif "mega.nz" in url:
            # NOTE(review): `Mega` is not imported in the visible part of the
            # file, and this call does not name the output file, so
            # `zipfile_path` may not exist afterwards -- confirm.
            m = Mega()
            m.download_url(url, './zips')
        else:
            response = requests.get(url)
            response.raise_for_status()  # Raise an exception for HTTP errors
            with open(zipfile_path, 'wb') as file:
                file.write(response.content)

        shutil.unpack_archive(zipfile_path, "./unzips", 'zip')

        for root, _dirs, files in os.walk('./unzips'):
            for file in files:
                file_path = os.path.join(root, file)
                if file.endswith(".index"):
                    os.makedirs('./models/index', exist_ok=True)
                    shutil.copy2(file_path, f'./models/index/{model}.index')
                elif "G_" not in file and "D_" not in file and file.endswith(".pth"):
                    os.makedirs('./models/pth', exist_ok=True)
                    shutil.copy(file_path, f'./models/pth/{model}.pth')

        return "Model downloaded, you can go back to the inference page!"
    except subprocess.CalledProcessError as e:
        return f"ERROR - Download failed (gdown): {str(e)}"
    except requests.exceptions.RequestException as e:
        return f"ERROR - Download failed (requests): {str(e)}"
    except Exception as e:
        return f"ERROR - The test failed: {str(e)}"
    finally:
        # Clean up on every exit path; the original only did so on success.
        for directory in zip_dirs:
            shutil.rmtree(directory, ignore_errors=True)


def transfer_files(filething, dataset_dir='dataset/'):
    """Copy uploaded files into ``dataset_dir``, keeping their base names.

    Args:
        filething: iterable of uploaded file objects exposing their path as
            ``.name``, or ``None`` / empty when nothing was uploaded.
            NOTE(review): the ``f.name`` access was missing in the reviewed
            source and is reconstructed -- confirm.
        dataset_dir: destination directory; created if it does not exist.

    Returns:
        A status message for the UI.
    """
    if not filething:
        return "No files were uploaded."

    os.makedirs(dataset_dir, exist_ok=True)
    for source_path in (f.name for f in filething):
        destination = os.path.join(dataset_dir, os.path.basename(source_path))
        shutil.copyfile(source_path, destination)
    return "Transferred files to dataset directory!"
def if_done_multi(done, ps): while 1: flag = 1 for p in ps: if p.poll() is None: flag = 0 sleep(0.5) break if flag == 1: break done[0] = True def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): sr = sr_dict[sr] os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True) f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w") f.close() per = 3.0 if config.is_half else 3.7 cmd = '"%s" infer/modules/train/ "%s" %s %s "%s/logs/%s" %s %.1f' % ( config.python_cmd, trainset_dir, sr, n_p, now_dir, exp_dir, config.noparallel, per, ) p = Popen(cmd, shell=True) done = [False] threading.Thread( target=if_done, args=( done, p, ), ).start() while 1: with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f: yield sleep(1) if done[0]: break with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f: log = yield log def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvpe): gpus = gpus.split("-") os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True) f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w") f.close() if if_f0: if f0method != "rmvpe_gpu": cmd = ( '"%s" infer/modules/train/extract/ "%s/logs/%s" %s %s' % ( config.python_cmd, now_dir, exp_dir, n_p, f0method, ) ) p = Popen( cmd, shell=True, cwd=now_dir ) done = [False] threading.Thread( target=if_done, args=( done, p, ), ).start() else: if gpus_rmvpe != "-": gpus_rmvpe = gpus_rmvpe.split("-") leng = len(gpus_rmvpe) ps = [] for idx, n_g in enumerate(gpus_rmvpe): cmd = ( '"%s" infer/modules/train/extract/ %s %s %s "%s/logs/%s" %s ' % ( config.python_cmd, leng, idx, n_g, now_dir, exp_dir, config.is_half, ) ) p = Popen( cmd, shell=True, cwd=now_dir ) ps.append(p) done = [False] threading.Thread( target=if_done_multi, # args=( done, ps, ), ).start() else: cmd = ( config.python_cmd + ' infer/modules/train/extract/ "%s/logs/%s" ' % ( now_dir, exp_dir, ) ) p = Popen( cmd, shell=True, cwd=now_dir ) p.wait() done = [True] while 1: with open( 
"%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r" ) as f: yield sleep(1) if done[0]: break with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f: log = yield log leng = len(gpus) ps = [] for idx, n_g in enumerate(gpus): cmd = ( '"%s" infer/modules/train/ %s %s %s %s "%s/logs/%s" %s' % ( config.python_cmd, config.device, leng, idx, n_g, now_dir, exp_dir, version19, ) ) p = Popen( cmd, shell=True, cwd=now_dir ) ps.append(p) done = [False] threading.Thread( target=if_done_multi, args=( done, ps, ), ).start() while 1: with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f: yield sleep(1) if done[0]: break with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f: log = yield log def change_sr2(sr2, if_f0_3, version19): path_str = "" if version19 == "v1" else "_v2" f0_str = "f0" if if_f0_3 else "" return get_pretrained_models(path_str, f0_str, sr2) def change_version19(sr2, if_f0_3, version19): path_str = "" if version19 == "v1" else "_v2" if sr2 == "32k" and version19 == "v1": sr2 = "40k" to_return_sr2 = ( {"choices": ["32k","40k", "48k"], "__type__": "update", "value": sr2} if version19 == "v1" else {"choices": ["32k", "40k", "48k", "OV2-32k", "OV2-40k", "RIN-40k","Snowie-40k","Snowie-48k","Italia-32k"], "__type__": "update", "value": sr2} ) f0_str = "f0" if if_f0_3 else "" return ( *get_pretrained_models(path_str, f0_str, sr2), to_return_sr2, ) def change_f0(if_f0_3, sr2, version19): path_str = "" if version19 == "v1" else "_v2" return ( {"visible": if_f0_3, "__type__": "update"}, {"visible": if_f0_3, "__type__": "update"}, *get_pretrained_models(path_str, "f0" if if_f0_3 is True else "", sr2), ) def click_train( exp_dir1, sr2, if_f0_3, spk_id5, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17, if_save_every_weights18, version19, ): global f0_dir, f0nsf_dir exp_dir = "%s/logs/%s" % (now_dir, exp_dir1) os.makedirs(exp_dir, 
exist_ok=True) gt_wavs_dir = "%s/0_gt_wavs" % exp_dir feature_dir = ( "%s/3_feature256" % exp_dir if version19 == "v1" else "%s/3_feature768" % exp_dir ) if if_f0_3: f0_dir = "%s/2a_f0" % exp_dir f0nsf_dir = "%s/2b-f0nsf" % exp_dir names = ( set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set([name.split(".")[0] for name in os.listdir(feature_dir)]) & set([name.split(".")[0] for name in os.listdir(f0_dir)]) & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)]) ) else: names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set( [name.split(".")[0] for name in os.listdir(feature_dir)] ) opt = [] for name in names: if if_f0_3: opt.append( "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" % ( gt_wavs_dir.replace("\\", "\\\\"), name, feature_dir.replace("\\", "\\\\"), name, f0_dir.replace("\\", "\\\\"), name, f0nsf_dir.replace("\\", "\\\\"), name, spk_id5, ) ) else: opt.append( "%s/%s.wav|%s/%s.npy|%s" % ( gt_wavs_dir.replace("\\", "\\\\"), name, feature_dir.replace("\\", "\\\\"), name, spk_id5, ) ) fea_dim = 256 if version19 == "v1" else 768 if if_f0_3: for _ in range(2): opt.append( "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy" "|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s" % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5) ) else: for _ in range(2): opt.append( "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s" % (now_dir, sr2, now_dir, fea_dim, spk_id5) ) shuffle(opt) with open("%s/filelist.txt" % exp_dir, "w") as f: f.write("\n".join(opt)) logger.debug("Write filelist done")"Use gpus: %s", str(gpus16)) if pretrained_G14 == "":"No pretrained Generator") if pretrained_D15 == "":"No pretrained Discriminator") if version19 == "v1" or sr2 == "40k": config_path = "v1/%s.json" % sr2 else: config_path = "v2/%s.json" % sr2 config_save_path = os.path.join(exp_dir, "config.json") if not pathlib.Path(config_save_path).exists(): with 
open(config_save_path, "w", encoding="utf-8") as f: json.dump( config.json_config[config_path], f, ensure_ascii=False, indent=4, sort_keys=True, ) f.write("\n") if gpus16: cmd = ( '"%s" infer/modules/train/ -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s ' "-sw %s -v %s" % ( config.python_cmd, exp_dir1, sr2, 1 if if_f0_3 else 0, batch_size12, gpus16, total_epoch11, save_epoch10, "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", 1 if if_save_latest13 == i18n("是") else 0, 1 if if_cache_gpu17 == i18n("是") else 0, 1 if if_save_every_weights18 == i18n("是") else 0, version19, ) ) else: cmd = ( '"%s" infer/modules/train/ -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw ' "%s -v %s" % ( config.python_cmd, exp_dir1, sr2, 1 if if_f0_3 else 0, batch_size12, total_epoch11, save_epoch10, "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", 1 if if_save_latest13 == i18n("是") else 0, 1 if if_cache_gpu17 == i18n("是") else 0, 1 if if_save_every_weights18 == i18n("是") else 0, version19, ) ) p = Popen(cmd, shell=True, cwd=now_dir) p.wait() return "You can view console or train.log" def train_index(exp_dir1, version19): exp_dir = "logs/%s" % exp_dir1 os.makedirs(exp_dir, exist_ok=True) feature_dir = ( "%s/3_feature256" % exp_dir if version19 == "v1" else "%s/3_feature768" % exp_dir ) if not os.path.exists(feature_dir): return "Please perform Feature Extraction First!" listdir_res = list(os.listdir(feature_dir)) if len(listdir_res) == 0: return "Please perform Feature Extraction First!" 
infos = [] npys = [] for name in sorted(listdir_res): phone = np.load("%s/%s" % (feature_dir, name)) npys.append(phone) big_npy = np.concatenate(npys, 0) big_npy_idx = np.arange(big_npy.shape[0]) np.random.shuffle(big_npy_idx) big_npy = big_npy[big_npy_idx] if big_npy.shape[0] > 2e5: infos.append("Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]) yield "\n".join(infos) try: big_npy = ( MiniBatchKMeans( n_clusters=10000, verbose=True, batch_size=256 * config.n_cpu, compute_labels=False, init="random", ) .fit(big_npy) .cluster_centers_ ) except: info = traceback.format_exc() infos.append(info) yield "\n".join(infos)"%s/total_fea.npy" % exp_dir, big_npy) n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) infos.append("%s,%s" % (big_npy.shape, n_ivf)) yield "\n".join(infos) index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf) infos.append("training") yield "\n".join(infos) index_ivf = faiss.extract_index_ivf(index) # index_ivf.nprobe = 1 index.train(big_npy) faiss.write_index( index, "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19), ) infos.append("adding") yield "\n".join(infos) batch_size_add = 8192 for i in range(0, big_npy.shape[0], batch_size_add): index.add(big_npy[i: i + batch_size_add]) faiss.write_index( index, "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19), ) infos.append( "Success,added_IVF%s_Flat_nprobe_%s_%s_%s.index" % (n_ivf, index_ivf.nprobe, exp_dir1, version19) ) yield "\n".join(infos) F0GPUVisible = config.dml is False def change_f0_method(f0method8): if f0method8 == "rmvpe_gpu": visible = F0GPUVisible else: visible = False return {"visible": visible, "__type__": "update"} vc_output1 = gr.Textbox(label=i18n("Console")) vc_output2 = gr.Audio(label=i18n("Audio output")) with gr.Blocks(title="Simple Ilaria RVC 💖") as app: gr.Markdown("

Simple Ilaria RVC 💖

") gr.Markdown(value=i18n("Made with 💖 by Ilaria | Support her on [Ko-Fi](")) gr.Markdown(i18n("For voice models and support join [AI Hub](")) with gr.Tabs(): with gr.TabItem(i18n("Inference")): with gr.Row(): sid0= gr.Dropdown(label=i18n("Voice"), choices=sorted(names)) sid1= sid0 refresh_button = gr.Button(i18n("Refresh"), variant="primary") clean_button = gr.Button(i18n("Unload Voice from VRAM"), variant="primary") with gr.Row(): spk_item = gr.Slider( minimum=0, maximum=2333, step=1, label=i18n("Speaker ID (Auto-Detected)"), value=0, visible=True, interactive=False, ) vc_transform0 = gr.Slider( label=i18n( "Pitch: -24 is lower (2 octaves) and 24 is higher (2 octaves)"), minimum=-24, maximum=24, default=0, step=1, ) fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean" ) with gr.Row(): but0 = gr.Button(i18n("Convert"), variant="primary") with gr.TabItem(i18n("Inference")): with gr.Group(): with gr.Row(): with gr.Column(): input_audio1 = gr.Audio( label=i18n("Upload Audio file"), type="filepath", ) record_button = gr.Audio(source="microphone", label="Use your microphone", type="filepath") input_audio0 = gr.Dropdown( label=i18n("Select a file from the audio folder"), choices=sorted(audio_paths), value='', interactive=True, ) record_button.change( fn=lambda x: x, inputs=[record_button], outputs=[input_audio0], ) file_index1 = gr.Textbox( label=i18n("Path of index"), placeholder=".\models\index", interactive=True, visible=False, ) file_index2 = gr.Textbox( label=i18n("Auto-detect index path"), choices=sorted(index_paths), interactive=True, visible=False, ) with gr.Column(): with gr.Accordion('Advanced Settings', open=False, visible=False): with gr.Column(): f0method0 = gr.Radio( label=i18n("Pitch Extraction, rmvpe is best"), choices=["harvest", "crepe", "rmvpe"] if config.dml is False else ["harvest", "rmvpe"], value="rmvpe", interactive=True, ) with gr.Row(): resample_sr0 = gr.Slider( minimum=0, maximum=48000, label=i18n("Resampling, 0=none"), value=0, step=1, 
interactive=True, ) with gr.Row(): rms_mix_rate0 = gr.Slider( minimum=0, maximum=1, label=i18n("0=Input source volume, 1=Normalized Output"), value=0.25, interactive=True, ) protect0 = gr.Slider( minimum=0, maximum=0.5, label=i18n( "Protect clear consonants and breathing sounds, preventing electro-acoustic tearing and other artifacts, 0.5 does not open"), value=0.33, step=0.01, interactive=True, ) filter_radius0 = gr.Slider( minimum=0, maximum=7, label=i18n(">=3 apply median filter to the harvested pitch results"), value=3, step=1, interactive=True, ) with gr.Row(): index_rate1 = gr.Slider( minimum=0, maximum=1, label=i18n("Index Ratio"), value=0.40, interactive=True, ) f0_file = gr.File( label=i18n("F0 curve file [optional]"), visible=False, ) fn=change_choices, inputs=[], outputs=[sid0, file_index2, input_audio1], api_name="infer_refresh", ) file_index1 = gr.Textbox( label=i18n("Path of index"), placeholder="%userprofile%\\Desktop\\models\\model_example.index", interactive=True, ) file_index2 = gr.Dropdown( label=i18n("Auto-detect index path"), choices=sorted(index_paths), interactive=True, ) with gr.Accordion('IlariaTTS', open=True): with gr.Column(): ilariaid=gr.Dropdown(label="Voice:", choices=ilariavoices, interactive=True, value="English-Jenny (Female)") with gr.Row(): ilariatext = gr.Textbox(label="Input your Text", interactive=True, value="This is a test.") ilariatts_button = gr.Button(value="Speak and Convert"), [ilariaid, ilariatext, spk_item, vc_transform0, f0_file, f0method0, file_index1, file_index2, index_rate1, filter_radius0, resample_sr0, rms_mix_rate0, protect0] , [vc_output1, vc_output2]) #Otherwise everything break, to be optimized with gr.Accordion('Advanced Settings', open=False, visible=True): with gr.Column(): f0method0 = gr.Radio( label=i18n("Pitch Extraction, rmvpe is best"), choices=["harvest", "crepe", "rmvpe"] if config.dml is False else ["harvest", "rmvpe"], value="rmvpe", interactive=True, ) resample_sr0 = gr.Slider( minimum=0, 
maximum=48000, label=i18n("Resampling, 0=none"), value=0, step=1, interactive=True, ) rms_mix_rate0 = gr.Slider( minimum=0, maximum=1, label=i18n("0=Input source volume, 1=Normalized Output"), value=0.25, interactive=True, ) protect0 = gr.Slider( minimum=0, maximum=0.5, label=i18n( "Protect clear consonants and breathing sounds, preventing electro-acoustic tearing and other artifacts, 0.5 does not open"), value=0.33, step=0.01, interactive=True, ) filter_radius0 = gr.Slider( minimum=0, maximum=7, label=i18n(">=3 apply median filter to the harvested pitch results"), value=3, step=1, interactive=True, ) index_rate1 = gr.Slider( minimum=0, maximum=1, label=i18n("Index Ratio"), value=0.40, interactive=True, ) f0_file = gr.File( label=i18n("F0 curve file [optional]"), visible=False, ) fn=change_choices, inputs=[], outputs=[sid0, file_index2], api_name="infer_refresh", ) file_index1 = gr.Textbox( label=i18n("Path of index"), placeholder="%userprofile%\\Desktop\\models\\model_example.index", interactive=True, ) file_index2 = gr.Dropdown( label=i18n("Auto-detect index path"), choices=sorted(index_paths), interactive=True, ) with gr.Group(): with gr.Column(): vc_output1.render() with gr.Column(): vc_output2.render() vc.vc_single, [ spk_item, input_audio0, input_audio1, vc_transform0, f0_file, f0method0, file_index1, file_index2, # file_big_npy1, index_rate1, filter_radius0, resample_sr0, rms_mix_rate0, protect0, ], [vc_output1, vc_output2], api_name="infer_convert", ) with gr.TabItem("Download Voice Models"): gr.Markdown(i18n("# For models found in [AI Hub](")) with gr.Row(): url = gr.Textbox(label="Huggingface Link:") model = gr.Textbox(label="Name of the model (without spaces):") download_button = gr.Button("Download") with gr.Row(): status_bar = gr.Textbox(label="Download Status"), inputs=[url, model], outputs=[status_bar]) with gr.TabItem("Import Models"): gr.Markdown(i18n("For models found on [Weights](")) file_upload = gr.File(label="Upload a .zip file containing a 
.pth and .index file") import_button = gr.Button("Import") import_status = gr.Textbox(label="Import Status"), inputs=file_upload, outputs=import_status) with gr.TabItem(i18n("Batch Inference")): gr.Markdown( value=i18n("Batch Conversion") ) with gr.Row(): with gr.Column(): vc_transform1 = gr.Number( label=i18n("Pitch: 0 from man to man (or woman to woman); 12 from man to woman and -12 from woman to man."), value=0 ) opt_input = gr.Textbox(label=i18n("Output"), value="InferOutput") file_index3 = gr.Textbox( label=i18n("Path to index"), value="", interactive=True, ) file_index4 = gr.Dropdown( label=i18n("Auto-detect index path"), choices=sorted(index_paths), interactive=True, ) f0method1 = gr.Radio( label=i18n("Pitch Extraction, rmvpe is best"), choices=["harvest", "crepe", "rmvpe"] if config.dml is False else ["harvest", "rmvpe"], value="rmvpe", interactive=True, ) format1 = gr.Radio( label=i18n("Export Format"), choices=["flac", "wav", "mp3", "m4a"], value="flac", interactive=True, ) fn=lambda: change_choices()[1], inputs=[], outputs=file_index4, api_name="infer_refresh_batch", ) with gr.Column(): resample_sr1 = gr.Slider( minimum=0, maximum=48000, label=i18n("Resampling, 0=none"), value=0, step=1, interactive=True, ) rms_mix_rate1 = gr.Slider( minimum=0, maximum=1, label=i18n("0=Input source volume, 1=Normalized Output"), value=0.25, interactive=True, ) protect1 = gr.Slider( minimum=0, maximum=0.5, label=i18n( "Protect clear consonants and breathing sounds, preventing electro-acoustic tearing and other artifacts, 0.5 does not open"), value=0.33, step=0.01, interactive=True, ) filter_radius1 = gr.Slider( minimum=0, maximum=7, label=i18n(">=3 apply median filter to the harvested pitch results"), value=3, step=1, interactive=True, ) index_rate2 = gr.Slider( minimum=0, maximum=1, label=i18n("Index Ratio"), value=0.40, interactive=True, ) with gr.Row(): dir_input = gr.Textbox( label=i18n("Enter the path to the audio folder to be processed"), 
placeholder="%userprofile%\\Desktop\\covers", ) inputs = gr.File( file_count="multiple", label=i18n("Audio files can also be imported in batch") ) with gr.Row(): but1 = gr.Button(i18n("Convert"), variant="primary") vc_output3 = gr.Textbox(label=i18n("Console")) vc.vc_multi, [ spk_item, dir_input, opt_input, inputs, vc_transform1, f0method1, file_index3, file_index4, # file_big_npy2, index_rate2, filter_radius1, resample_sr1, rms_mix_rate1, protect1, format1, ], [vc_output3], api_name="infer_convert_batch", ) with gr.TabItem(i18n("Train")): gr.Markdown(value=i18n("")) with gr.Row(): exp_dir1 = gr.Textbox(label=i18n("Model Name"), value="test-model") sr2 = gr.Dropdown( label=i18n("Sample Rate & Pretrain"), choices=["32k", "40k", "48k", "OV2-32k", "OV2-40k", "RIN-40k", "Snowie-40k", "Snowie-48k", "SnowieV3.1-40k","SnowieV3.1-32k","SnowieV3.1-48k","SnowieV3.1-RinE3-40K","Italia-32k"], value="32k", interactive=True, ) version19 = gr.Radio( label=i18n("Version 2 only here"), choices=["v2"], value="v2", interactive=False, visible=False, ) np7 = gr.Slider( minimum=0, maximum=config.n_cpu, step=1, label=i18n("CPU Threads"), value=int(np.ceil(config.n_cpu / 2.5)), interactive=True, ) with gr.Group(): gr.Markdown(value=i18n("")) with gr.Row(): trainset_dir4 = gr.Textbox( label=i18n("Path to Dataset"), value="dataset" ) with gr.Row(): with gr.Accordion('Upload Dataset (alternative)', open=False, visible=True): file_thin = gr.Files(label='Dataset') # transfers files to the dataset dir, lol # much coding -ila show = gr.Textbox(label='Status') transfer_button = gr.Button('Upload Dataset to the folder', variant="primary") fn=transfer_files, inputs=[file_thin], outputs=show, ) with gr.Group(): gr.Markdown(value=i18n("")) with gr.Row(): save_epoch10 = gr.Slider( minimum=1, maximum=250, step=1, label=i18n("Save frequency"), value=50, interactive=True, ) total_epoch11 = gr.Slider( minimum=2, maximum=10000, step=1, label=i18n("Total Epochs"), value=300, interactive=True, ) batch_size12 
= gr.Slider( minimum=1, maximum=16, step=1, label=i18n("Batch Size"), value=default_batch_size, interactive=True, ) if_save_every_weights18 = gr.Radio( label=i18n("Create model with save frequency"), choices=[i18n("是"), i18n("否")], value=i18n("是"), interactive=True, ) with gr.Accordion('Advanced Settings', open=False, visible=True): with gr.Row(): with gr.Group(): spk_id5 = gr.Slider( minimum=0, maximum=4, step=1, label=i18n("Speaker ID"), value=0, interactive=True, ) if_f0_3 = gr.Radio( label=i18n("Pitch Guidance"), choices=[True, False], value=True, interactive=True, ) gpus6 = gr.Textbox( label=i18n("GPU ID (Leave 0 if you have only one GPU, use 0-1 for multiple GPus)"), value=gpus, interactive=True, visible=F0GPUVisible, ) gpu_info9 = gr.Textbox( label=i18n("GPU Model"), value=gpu_info, visible=F0GPUVisible, ) gpus16 = gr.Textbox( label=i18n("Enter cards to be used (Leave 0 if you have only one GPU, use 0-1 for multiple GPus)"), value=gpus if gpus != "" else "0", interactive=True, ) with gr.Group(): if_save_latest13 = gr.Radio( label=i18n("Save last ckpt as final Model"), choices=[i18n("是"), i18n("否")], value=i18n("是"), interactive=True, ) if_cache_gpu17 = gr.Radio( label=i18n("Cache data to GPU (Only for datasets under 8 minutes)"), choices=[i18n("是"), i18n("否")], value=i18n("否"), interactive=True, ) f0method8 = gr.Radio( label=i18n("Feature Extraction Method"), choices=["rmvpe", "rmvpe_gpu"], value="rmvpe_gpu", interactive=True, ) gpus_rmvpe = gr.Textbox( label=i18n( "rmvpe_gpu will use your GPU instead of the CPU for the feature extraction" ), value="%s-%s" % (gpus, gpus), interactive=True, visible=F0GPUVisible, ) f0method8.change( fn=change_f0_method, inputs=[f0method8], outputs=[gpus_rmvpe], ) with gr.Row(): pretrained_G14 = gr.Textbox( label="Pretrained G", choices=list(pretrained_G_files.values()), value=pretrained_G_files.get('f0G32.pth', ''), visible=False, interactive=True, ) pretrained_D15 = gr.Textbox( label="Pretrained D", 
choices=list(pretrained_D_files.values()), value=pretrained_D_files.get('f0D32.pth', ''), visible=False, interactive=True, ) sr2.change( change_sr2, [sr2, if_f0_3, version19], [pretrained_G14, pretrained_D15], ) version19.change( change_version19, [sr2, if_f0_3, version19], [pretrained_G14, pretrained_D15, sr2], ) if_f0_3.change( change_f0, [if_f0_3, sr2, version19], [f0method8, gpus_rmvpe, pretrained_G14, pretrained_D15], ) with gr.Group(): with gr.Row(): but1 = gr.Button(i18n("1. Process Data"), variant="primary") but2 = gr.Button(i18n("2. Feature Extraction"), variant="primary") but4 = gr.Button(i18n("3. Train Index"), variant="primary") but3 = gr.Button(i18n("4. Train Model"), variant="primary") with gr.Row(): info = gr.Textbox(label=i18n("Output"), value="", max_lines=5, lines=5) preprocess_dataset, [trainset_dir4, exp_dir1, sr2, np7], [info], api_name="train_preprocess", ) extract_f0_feature, [ gpus6, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe, ], [info], api_name="train_extract_f0_feature", ), [exp_dir1, version19], info) click_train, [ exp_dir1, sr2, if_f0_3, spk_id5, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17, if_save_every_weights18, version19, ], info, api_name="train_start", ), [exp_dir1, version19], info) with gr.TabItem(i18n("UVR5")): with gr.Group(): gr.Markdown( value=i18n( """ - **Kim Vocal 2**: Effortlessly separates vocals and instrumentals, a perfect tool for music enthusiasts. - **Karaoke 5 HP**: Expertly isolates two overlapping voices, making it a valuable asset for duet performances. - **DeEcho DeReverb**: Skillfully eliminates reverb from vocal tracks, enhancing the clarity of your sound. - **MDX23C InstVoc**: Excellent at removing sound effects or other annoying noises, ensuring a smooth listening experience. - **DeNoise**: Exceptional at detecting and removing nearly imperceptible noises that can compromise the quality of a cover or a model. 
""" ) ) with gr.Group(): uvr_handler = UVRHANDLER() audios = gr.File() with gr.Row(): output_dir = gr.Textbox('opt/', label='Output Directory') with gr.Row(): model_name = gr.Dropdown(choices=uvr5_names, label='Models') model_status = gr.Textbox(placeholder='Waiting...', interactive=False, label='Model Information') with gr.Row(): LOADMODELBUTTON = gr.Button('Load Model', variant="primary") fn=uvr_handler.loadmodel, inputs=[model_name, output_dir], outputs=[model_status] ) CLEARMODELBUTTON = gr.Button('Unload Model', variant="primary") fn=uvr_handler.deloadmodel, outputs=[model_status] ) with gr.Group(): with gr.Column(): with gr.Row(): inst = gr.Audio(show_download_button=True, interactive=False, label='Instrumental') vocal = gr.Audio(show_download_button=True, interactive=False, label='Vocals') UVRBUTTON = gr.Button('Extract', variant="primary") fn=uvr_handler.uvr, inputs=[audios], outputs=[inst, vocal] ) with gr.TabItem(i18n("Extra")): with gr.Accordion('Model Info', open=False): with gr.Column(): sid1 = gr.Dropdown(label=i18n("Voice Model"), choices=sorted(names)) refresh_button = gr.Button(i18n("Refresh"), variant="primary") fn=change_choices, inputs=[], outputs=[sid1, file_index2], api_name="infer_refresh", ) modelload_out = gr.Textbox(label="Model Metadata", interactive=False, lines=4) get_model_info_button = gr.Button(i18n("Get Model Info")) fn=vc.get_vc, inputs=[sid1, protect0, protect1], outputs=[spk_item, protect0, protect1, file_index2, file_index4, modelload_out] ) with gr.Accordion('Audio Analyser', open=False): with gr.Column(): audio_input = gr.Audio(type="filepath") get_info_button = gr.Button( value=i18n("Get information about the audio"), variant="primary" ) with gr.Column(): with gr.Row(): with gr.Column(): gr.Markdown( value=i18n("Information about the audio file"), visible=True, ) output_markdown = gr.Markdown( value=i18n("Waiting for information..."), visible=True ) image_output = gr.Image(type="filepath", interactive=False) 
fn=generate_spectrogram_and_get_info, inputs=[audio_input], outputs=[output_markdown, image_output], ) with gr.Accordion('Training Helper', open=False): with gr.Column(): audio_input = gr.Audio(type="filepath", label="Upload your audio file") gr.Text("Please note that these results are approximate and intended to provide a general idea for beginners.", label='Notice:') training_info_output = gr.Markdown(label="Training Information:") get_info_button = gr.Button("Get Training Info") fn=on_button_click, inputs=[audio_input], outputs=[training_info_output] ) with gr.Accordion('Training Time Calculator', open=False): with gr.Column(): epochs_input = gr.Number(label="Number of Epochs") seconds_input = gr.Number(label="Seconds per Epoch") calculate_button = gr.Button("Calculate Time Remaining") remaining_time_output = gr.Textbox(label="Remaining Time", interactive=False) fn=calculate_remaining_time, inputs=[epochs_input, seconds_input], outputs=[remaining_time_output] ) with gr.Accordion(i18n("Model Fusion"), open=False): with gr.Group(): gr.Markdown(value=i18n("Strongly suggested to use only very clean models.")) with gr.Row(): ckpt_a = gr.Textbox( label=i18n("Path of the first .pth"), value="", interactive=True ) ckpt_b = gr.Textbox( label=i18n("Path of the second .pth"), value="", interactive=True ) alpha_a = gr.Slider( minimum=0, maximum=1, label=i18n("Weight of the first model over the second"), value=0.5, interactive=True, ) with gr.Group(): with gr.Row(): sr_ = gr.Radio( label=i18n("Sample rate of both models"), choices=["32k","40k", "48k"], value="32k", interactive=True, ) if_f0_ = gr.Radio( label=i18n("Pitch Guidance"), choices=[i18n("是"), i18n("否")], value=i18n("是"), interactive=True, ) info__ = gr.Textbox( label=i18n("Add informations to the model"), value="", max_lines=8, interactive=True, visible=False ) name_to_save0 = gr.Textbox( label=i18n("Final Model name"), value="", max_lines=1, interactive=True, ) version_2 = gr.Radio( label=i18n("Versions of the 
models"), choices=["v1", "v2"], value="v2", interactive=True, ) with gr.Group(): with gr.Row(): but6 = gr.Button(i18n("Fuse the two models"), variant="primary") info4 = gr.Textbox(label=i18n("Output"), value="", max_lines=8) merge, [ ckpt_a, ckpt_b, alpha_a, sr_, if_f0_, info__, name_to_save0, version_2, ], info4, api_name="ckpt_merge", ) with gr.Accordion('Credits', open=False): gr.Markdown(''' ## All the amazing people who worked on this! ### Developers - **Ilaria**: Founder, Lead Developer - **Yui**: Training feature - **GDR-**: Inference feature - **Poopmaster**: Model downloader, Model importer - **kitlemonfoot**: Ilaria TTS implementation - **eddycrack864**: UVR5 implementation - **Mikus**: Ilaria Updater & Downloader - **Diablo**: Pretrain Automation, UI features, Various fixes ### Beta Tester - **Charlotte**: Beta Tester, Advisor - **mrm0dz**: Beta Tester, Advisor - **RME**: Beta Tester - **Delik**: Beta Tester - **inductivegrub**: Beta Tester - **l3af**: Beta Tester, Helper ### Pretrains Makers - **simplcup**: Ov2Super - **mustar22**: RIN_E3 & Snowie ### Colab Port - **Angetyde** - **l3af** - **Poopmaster** - **Hina** ### HuggingFace Port - **Nick088** ### Other - **RVC Project**: Original Developers - **yumereborn**: Ilaria RVC image ### **In loving memory of JLabDX** 🕊️ ''') sid0.change( fn=vc.get_vc, inputs=[sid0, protect0, protect1], outputs=[spk_item, protect0, protect1, file_index2, file_index4, modelload_out], api_name="infer_change_voice", ) with gr.TabItem(i18n("")): gr.Markdown(''' ![ilaria]( ''') if config.iscolab: app.queue(concurrency_count=511, max_size=1022).launch(share=True) else: app.queue(concurrency_count=511, max_size=1022).launch( server_name="", inbrowser=not config.noautoopen, server_port=config.listen_port, quiet=True, )