diff --git a/main/app/app.py b/main/app/app.py deleted file mode 100644 index 938159fedbaeadbc05d583a6b604e28611730a93..0000000000000000000000000000000000000000 --- a/main/app/app.py +++ /dev/null @@ -1,2900 +0,0 @@ -import os -import re -import ssl -import sys -import json -import onnx -import torch -import codecs -import shutil -import yt_dlp -import logging -import platform -import requests -import warnings -import threading -import gradio.strings -import logging.handlers - -import gradio as gr -import pandas as pd - -from time import sleep -from subprocess import Popen -from bs4 import BeautifulSoup -from datetime import datetime -from multiprocessing import cpu_count - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -from main.library.utils import pydub_convert, pydub_load -from main.tools import gdown, meganz, mediafire, pixeldrain, huggingface, edge_tts, google_tts - -ssl._create_default_https_context = ssl._create_unverified_context -logger = logging.getLogger(__name__) -logger.propagate = False - -if logger.hasHandlers(): logger.handlers.clear() -else: - console_handler = logging.StreamHandler() - console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - console_handler.setFormatter(console_formatter) - console_handler.setLevel(logging.INFO) - file_handler = logging.handlers.RotatingFileHandler(os.path.join("assets", "logs", "app.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8') - file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - file_handler.setFormatter(file_formatter) - file_handler.setLevel(logging.DEBUG) - logger.addHandler(console_handler) - logger.addHandler(file_handler) - logger.setLevel(logging.DEBUG) - -warnings.filterwarnings("ignore") -for l in ["httpx", "gradio", "uvicorn", "httpcore", "urllib3"]: - logging.getLogger(l).setLevel(logging.ERROR) - -config = Config() -python = sys.executable - -translations = config.translations -configs_json = os.path.join("main", "configs", "config.json") -configs = json.load(open(configs_json, "r")) - -models, model_options = {}, {} -method_f0 = ["pm", "diow", "dio", "mangio-crepe-tiny", "mangio-crepe-small", "mangio-crepe-medium", "mangio-crepe-large", "mangio-crepe-full", "crepe-tiny", "crepe-small", "crepe-medium", "crepe-large", "crepe-full", "fcpe", "fcpe-legacy", "rmvpe", "rmvpe-legacy", "harvestw", "harvest", "yin", "pyin", "swipe"] -embedders_model = ["contentvec_base", "hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base", "portuguese_hubert_base", "custom"] - -paths_for_files = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk("audios") for f in files if os.path.splitext(f)[1].lower() in (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")]) -model_name, index_path, delete_index = sorted(list(model for model in os.listdir(os.path.join("assets", "weights")) if model.endswith((".pth", ".onnx")) and not model.startswith("G_") and not model.startswith("D_"))), sorted([os.path.join(root, name) for root, _, files in os.walk(os.path.join("assets", "logs"), topdown=False) for name in files if name.endswith(".index")]), sorted([os.path.join("assets", "logs", f) for f in os.listdir(os.path.join("assets", "logs")) if "mute" not in f and os.path.isdir(os.path.join("assets", "logs", f))]) -pretrainedD, pretrainedG, 
Allpretrained = ([model for model in os.listdir(os.path.join("assets", "models", "pretrained_custom")) if model.endswith(".pth") and "D" in model], [model for model in os.listdir(os.path.join("assets", "models", "pretrained_custom")) if model.endswith(".pth") and "G" in model], [os.path.join("assets", "models", path, model) for path in ["pretrained_v1", "pretrained_v2", "pretrained_custom"] for model in os.listdir(os.path.join("assets", "models", path)) if model.endswith(".pth") and ("D" in model or "G" in model)]) - -separate_model = sorted([os.path.join("assets", "models", "uvr5", models) for models in os.listdir(os.path.join("assets", "models", "uvr5")) if models.endswith((".th", ".yaml", ".onnx"))]) -presets_file = sorted(list(f for f in os.listdir(os.path.join("assets", "presets")) if f.endswith(".json"))) -f0_file = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(os.path.join("assets", "f0")) for f in files if f.endswith(".txt")]) - -language, theme, edgetts, google_tts_voice, mdx_model, uvr_model = configs.get("language", "vi-VN"), configs.get("theme", "NoCrypt/miku"), configs.get("edge_tts", ["vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural"]), configs.get("google_tts_voice", ["vi", "en"]), configs.get("mdx_model", "MDXNET_Main"), (configs.get("demucs_model", "HD_MMI") + configs.get("mdx_model", "MDXNET_Main")) - -miku_image = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/zvxh.cat", "rot13") -csv_path = os.path.join("assets", "spreadsheet.csv") - -logger.info(config.device) - -app_mode = "--app" in sys.argv - -if "--allow_all_disk" in sys.argv: - import win32api - - allow_disk = win32api.GetLogicalDriveStrings().split('\x00')[:-1] -else: allow_disk = [] - -if language == "vi-VN": gradio.strings.en = {"RUNNING_LOCALLY": "* Chạy trên liên kết nội bộ: {}://{}:{}", "RUNNING_LOCALLY_SSR": "* Chạy trên liên kết nội bộ: {}://{}:{}, với SSR ⚡ (thử nghiệm, để tắt hãy dùng `ssr=False` trong `launch()`)", "SHARE_LINK_DISPLAY": "* Chạy trên liên kết công khai: {}", "COULD_NOT_GET_SHARE_LINK": "\nKhông thể tạo liên kết công khai. Vui lòng kiểm tra kết nối mạng của bạn hoặc trang trạng thái của chúng tôi: https://status.gradio.app.", "COULD_NOT_GET_SHARE_LINK_MISSING_FILE": "\nKhông thể tạo liên kết công khai. Thiếu tập tin: {}. \n\nVui lòng kiểm tra kết nối internet của bạn. Điều này có thể xảy ra nếu phần mềm chống vi-rút của bạn chặn việc tải xuống tệp này. Bạn có thể cài đặt thủ công bằng cách làm theo các bước sau: \n\n1. Tải xuống tệp này: {}\n2. Đổi tên tệp đã tải xuống thành: {}\n3. Di chuyển tệp đến vị trí này: {}", "COLAB_NO_LOCAL": "Không thể hiển thị giao diện nội bộ trên google colab, liên kết công khai đã được tạo.", "PUBLIC_SHARE_TRUE": "\nĐể tạo một liên kết công khai, hãy đặt `share=True` trong `launch()`.", "MODEL_PUBLICLY_AVAILABLE_URL": "Mô hình được cung cấp công khai tại: {} (có thể mất tới một phút để sử dụng được liên kết)", "GENERATING_PUBLIC_LINK": "Đang tạo liên kết công khai (có thể mất vài giây...):", "BETA_INVITE": "\nCảm ơn bạn đã là người dùng Gradio! Nếu bạn có thắc mắc hoặc phản hồi, vui lòng tham gia máy chủ Discord của chúng tôi và trò chuyện với chúng tôi: https://discord.gg/feTf9x3ZSB", "COLAB_DEBUG_TRUE": "Đã phát hiện thấy sổ tay Colab. Ô này sẽ chạy vô thời hạn để bạn có thể xem lỗi và nhật ký. " "Để tắt, hãy đặt debug=False trong launch().", "COLAB_DEBUG_FALSE": "Đã phát hiện thấy sổ tay Colab. 
Để hiển thị lỗi trong sổ ghi chép colab, hãy đặt debug=True trong launch()", "COLAB_WARNING": "Lưu ý: việc mở Chrome Inspector có thể làm hỏng bản demo trong sổ tay Colab.", "SHARE_LINK_MESSAGE": "\nLiên kết công khai sẽ hết hạn sau 72 giờ. Để nâng cấp GPU và lưu trữ vĩnh viễn miễn phí, hãy chạy `gradio deploy` từ terminal trong thư mục làm việc để triển khai lên huggingface (https://huggingface.co/spaces)", "INLINE_DISPLAY_BELOW": "Đang tải giao diện bên dưới...", "COULD_NOT_GET_SHARE_LINK_CHECKSUM": "\nKhông thể tạo liên kết công khai. Tổng kiểm tra không khớp cho tập tin: {}."} -if not os.path.exists(os.path.join("assets", "miku.png")): huggingface.HF_download_file(miku_image, os.path.join("assets", "miku.png")) - -if os.path.exists(csv_path): cached_data = pd.read_csv(csv_path) -else: - cached_data = pd.read_csv(codecs.decode("uggcf://qbpf.tbbtyr.pbz/fcernqfurrgf/q/1gNHnDeRULtEfz1Yieaw14USUQjWJy0Oq9k0DrCrjApb/rkcbeg?sbezng=pfi&tvq=1977693859", "rot13")) - cached_data.to_csv(csv_path, index=False) - -for _, row in cached_data.iterrows(): - filename = row['Filename'] - url = None - - for value in row.values: - if isinstance(value, str) and "huggingface" in value: - url = value - break - - if url: models[filename] = url - -def gr_info(message): - gr.Info(message, duration=2) - logger.info(message) - -def gr_warning(message): - gr.Warning(message, duration=2) - logger.warning(message) - -def gr_error(message): - gr.Error(message=message, duration=6) - logger.error(message) - -def get_gpu_info(): - ngpu = torch.cuda.device_count() - gpu_infos = [f"{i}: {torch.cuda.get_device_name(i)} ({int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)} GB)" for i in range(ngpu) if torch.cuda.is_available() or ngpu != 0] - - return "\n".join(gpu_infos) if len(gpu_infos) > 0 else translations["no_support_gpu"] - -def change_f0_choices(): - f0_file = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(os.path.join("assets", "f0")) for f in files if f.endswith(".txt")]) - return {"value": f0_file[0] if len(f0_file) >= 1 else "", "choices": f0_file, "__type__": "update"} - -def change_audios_choices(): - audios = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk("audios") for f in files if os.path.splitext(f)[1].lower() in (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")]) - return {"value": audios[0] if len(audios) >= 1 else "", "choices": audios, "__type__": "update"} - -def change_separate_choices(): - return [{"choices": sorted([os.path.join("assets", "models", "uvr5", models) for models in os.listdir(os.path.join("assets", "models", "uvr5")) if model.endswith((".th", ".yaml", ".onnx"))]), "__type__": "update"}] - -def change_models_choices(): - model, index = sorted(list(model for model in os.listdir(os.path.join("assets", "weights")) if model.endswith((".pth", ".onnx")) and not model.startswith("G_") and not model.startswith("D_"))), sorted([os.path.join(root, name) for root, _, files in os.walk(os.path.join("assets", "logs"), topdown=False) for name in files if name.endswith(".index")]) - return [{"value": model[0] if len(model) >= 1 else "", "choices": model, "__type__": "update"}, {"value": index[0] if len(index) >= 1 else "", "choices": index, "__type__": "update"}] - -def change_allpretrained_choices(): - return [{"choices": sorted([os.path.join("assets", "models", path, model) for path in ["pretrained_v1", "pretrained_v2", "pretrained_custom"] for model in 
os.listdir(os.path.join("assets", "models", path)) if model.endswith(".pth") and ("D" in model or "G" in model)]), "__type__": "update"}] - -def change_pretrained_choices(): - return [{"choices": sorted([model for model in os.listdir(os.path.join("assets", "models", "pretrained_custom")) if model.endswith(".pth") and "D" in model]), "__type__": "update"}, {"choices": sorted([model for model in os.listdir(os.path.join("assets", "models", "pretrained_custom")) if model.endswith(".pth") and "G" in model]), "__type__": "update"}] - -def change_choices_del(): - return [{"choices": sorted(list(model for model in os.listdir(os.path.join("assets", "weights")) if model.endswith(".pth") and not model.startswith("G_") and not model.startswith("D_"))), "__type__": "update"}, {"choices": sorted([os.path.join("assets", "logs", f) for f in os.listdir(os.path.join("assets", "logs")) if "mute" not in f and os.path.isdir(os.path.join("assets", "logs", f))]), "__type__": "update"}] - -def change_preset_choices(): - return {"value": "", "choices": sorted(list(f for f in os.listdir(os.path.join("assets", "presets")) if f.endswith(".json"))), "__type__": "update"} - -def change_tts_voice_choices(google): - return {"choices": google_tts_voice if google else edgetts, "value": google_tts_voice[0] if google else edgetts[0], "__type__": "update"} - -def change_backing_choices(backing, merge): - if backing or merge: return {"value": False, "interactive": False, "__type__": "update"} - elif not backing or not merge: return {"interactive": True, "__type__": "update"} - else: gr_warning(translations["option_not_valid"]) - -def change_download_choices(select): - selects = [False]*10 - - if select == translations["download_url"]: selects[0] = selects[1] = selects[2] = True - elif select == translations["download_from_csv"]: selects[3] = selects[4] = True - elif select == translations["search_models"]: selects[5] = selects[6] = True - elif select == translations["upload"]: selects[9] = True - else: gr_warning(translations["option_not_valid"]) - - return [{"visible": selects[i], "__type__": "update"} for i in range(len(selects))] - -def change_download_pretrained_choices(select): - selects = [False]*8 - - if select == translations["download_url"]: selects[0] = selects[1] = selects[2] = True - elif select == translations["list_model"]: selects[3] = selects[4] = selects[5] = True - elif select == translations["upload"]: selects[6] = selects[7] = True - else: gr_warning(translations["option_not_valid"]) - - return [{"visible": selects[i], "__type__": "update"} for i in range(len(selects))] - -def get_index(model): - model = os.path.basename(model).split("_")[0] - return {"value": next((f for f in [os.path.join(root, name) for root, _, files in os.walk(os.path.join("assets", "logs"), topdown=False) for name in files if name.endswith(".index") and "trained" not in name] if model.split(".")[0] in f), ""), "__type__": "update"} if model else None - -def index_strength_show(index): - return {"visible": index and os.path.exists(index), "value": 0.5, "__type__": "update"} - -def hoplength_show(method, hybrid_method=None): - show_hop_length_method = ["mangio-crepe-tiny", "mangio-crepe-small", "mangio-crepe-medium", "mangio-crepe-large", "mangio-crepe-full", "fcpe", "fcpe-legacy", "yin", "pyin"] - - if method in show_hop_length_method: visible = True - elif method == "hybrid": - methods_str = re.search("hybrid\[(.+)\]", hybrid_method) - if methods_str: methods = [method.strip() for method in methods_str.group(1).split("+")] - - for i 
in methods: - visible = i in show_hop_length_method - if visible: break - else: visible = False - - return {"visible": visible, "__type__": "update"} - -def visible(value): - return {"visible": value, "__type__": "update"} - -def valueFalse_interactive(inp): - return {"value": False, "interactive": inp, "__type__": "update"} - -def valueEmpty_visible1(inp1): - return {"value": "", "visible": inp1, "__type__": "update"} - -def process_input(file_path): - with open(file_path, "r", encoding="utf-8") as file: - file_contents = file.read() - - gr_info(translations["upload_success"].format(name=translations["text"])) - return file_contents - -def fetch_pretrained_data(): - response = requests.get(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/wfba/phfgbz_cergenvarq.wfba", "rot13")) - response.raise_for_status() - return response.json() - -def update_sample_rate_dropdown(model): - data = fetch_pretrained_data() - if model != translations["success"]: return {"choices": list(data[model].keys()), "value": list(data[model].keys())[0], "__type__": "update"} - -def if_done(done, p): - while 1: - if p.poll() is None: sleep(0.5) - else: break - - done[0] = True - -def restart_app(): - global app - - gr_info(translations["15s"]) - os.system("cls" if platform.system() == "Windows" else "clear") - - app.close() - os.system(f"{python} {os.path.join('main', 'app', 'app.py')} {sys.argv}") - -def change_language(lang): - with open(configs_json, "r") as f: - configs = json.load(f) - - configs["language"] = lang - with open(configs_json, "w") as f: - json.dump(configs, f, indent=4) - - restart_app() - -def change_theme(theme): - with open(configs_json, "r") as f: - configs = json.load(f) - - configs["theme"] = theme - with open(configs_json, "w") as f: - json.dump(configs, f, indent=4) - - restart_app() - -def zip_file(name, pth, index): - pth_path = os.path.join("assets", "weights", pth) - if not pth or not os.path.exists(pth_path) or not pth.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"])) - - zip_file_path = os.path.join("assets", "logs", pth.replace(".pth", ""), name + ".zip") - gr_info(translations["start"].format(start=translations["zip"])) - - import zipfile - with zipfile.ZipFile(zip_file_path, 'w') as zipf: - zipf.write(pth_path, os.path.basename(pth_path)) - if index: zipf.write(index, os.path.basename(index)) - - gr_info(translations["success"]) - return {"visible": True, "value": zip_file_path, "__type__": "update"} - -def fetch_models_data(search): - all_table_data = [] - page = 1 - - while 1: - try: - response = requests.post(url=codecs.decode("uggcf://ibvpr-zbqryf.pbz/srgpu_qngn.cuc", "rot13"), data={"page": page, "search": search}) - - if response.status_code == 200: - table_data = response.json().get("table", "") - if not table_data.strip(): break - all_table_data.append(table_data) - page += 1 - else: - logger.debug(f"{translations['code_error']} {response.status_code}") - break - except json.JSONDecodeError: - logger.debug(translations["json_error"]) - break - except requests.RequestException as e: - logger.debug(translations["requests_error"].format(e=e)) - break - return all_table_data - -def search_models(name): - gr_info(translations["start"].format(start=translations["search"])) - tables = fetch_models_data(name) - - if len(tables) == 0: - gr_info(translations["not_found"].format(name=name)) - return [None]*2 - else: - model_options.clear() - - for table in tables: - for row in 
BeautifulSoup(table, "html.parser").select("tr"): - name_tag, url_tag = row.find("a", {"class": "fs-5"}), row.find("a", {"class": "btn btn-sm fw-bold btn-light ms-0 p-1 ps-2 pe-2"}) - if name_tag and url_tag: model_options[name_tag.text.replace(".onnx", "").replace(".pth", "").replace(".index", "").replace(".zip", "").replace(" ", "_").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").replace("'", "").replace("|", "").strip()] = url_tag["href"].replace("https://easyaivoice.com/run?url=", "") - - gr_info(translations["found"].format(results=len(model_options))) - return [{"value": "", "choices": model_options, "interactive": True, "visible": True, "__type__": "update"}, {"value": translations["downloads"], "visible": True, "__type__": "update"}] - -def move_files_from_directory(src_dir, dest_weights, dest_logs, model_name): - for root, _, files in os.walk(src_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".index"): - model_log_dir = os.path.join(dest_logs, model_name) - os.makedirs(model_log_dir, exist_ok=True) - - filepath = os.path.join(model_log_dir, file.replace(' ', '_').replace('(', '').replace(')', '').replace('[', '').replace(']', '').replace(",", "").replace('"', "").replace("'", "").replace("|", "").strip()) - if os.path.exists(filepath): os.remove(filepath) - - shutil.move(file_path, filepath) - elif file.endswith(".pth") and not file.startswith("D_") and not file.startswith("G_"): - pth_path = os.path.join(dest_weights, model_name + ".pth") - if os.path.exists(pth_path): os.remove(pth_path) - - shutil.move(file_path, pth_path) - elif file.endswith(".onnx") and not file.startswith("D_") and not file.startswith("G_"): - pth_path = os.path.join(dest_weights, model_name + ".onnx") - if os.path.exists(pth_path): os.remove(pth_path) - - shutil.move(file_path, pth_path) - -def download_url(url): - if not url: return gr_warning(translations["provide_url"]) - if not os.path.exists("audios"): os.makedirs("audios", exist_ok=True) - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore") - ydl_opts = {"format": "bestaudio/best", "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav", "preferredquality": "192"}], "quiet": True, "no_warnings": True, "noplaylist": True, "verbose": False} - - gr_info(translations["start"].format(start=translations["download_music"])) - - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - audio_output = os.path.join("audios", re.sub(r'\s+', '-', re.sub(r'[^\w\s\u4e00-\u9fff\uac00-\ud7af\u0400-\u04FF\u1100-\u11FF]', '', ydl.extract_info(url, download=False).get('title', 'video')).strip())) - if os.path.exists(audio_output): shutil.rmtree(audio_output, ignore_errors=True) - - ydl_opts['outtmpl'] = audio_output - - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - audio_output = audio_output + ".wav" - if os.path.exists(audio_output): os.remove(audio_output) - - ydl.download([url]) - - gr_info(translations["success"]) - return [audio_output, audio_output, translations["success"]] - -def download_model(url=None, model=None): - if not url: return gr_warning(translations["provide_url"]) - if not model: return gr_warning(translations["provide_name_is_save"]) - - model = model.replace(".onnx", "").replace(".pth", "").replace(".index", "").replace(".zip", "").replace(" ", "_").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").replace("'", "").replace("|", "").strip() - url = url.replace("/blob/", 
"/resolve/").replace("?download=true", "").strip() - - download_dir = os.path.join("download_model") - weights_dir = os.path.join("assets", "weights") - logs_dir = os.path.join("assets", "logs") - - if not os.path.exists(download_dir): os.makedirs(download_dir, exist_ok=True) - if not os.path.exists(weights_dir): os.makedirs(weights_dir, exist_ok=True) - if not os.path.exists(logs_dir): os.makedirs(logs_dir, exist_ok=True) - - try: - gr_info(translations["start"].format(start=translations["download"])) - - if url.endswith(".pth"): huggingface.HF_download_file(url, os.path.join(weights_dir, f"{model}.pth")) - elif url.endswith(".onnx"): huggingface.HF_download_file(url, os.path.join(weights_dir, f"{model}.onnx")) - elif url.endswith(".index"): - model_log_dir = os.path.join(logs_dir, model) - os.makedirs(model_log_dir, exist_ok=True) - - huggingface.HF_download_file(url, os.path.join(model_log_dir, f"{model}.index")) - elif url.endswith(".zip"): - output_path = huggingface.HF_download_file(url, os.path.join(download_dir, model + ".zip")) - shutil.unpack_archive(output_path, download_dir) - - move_files_from_directory(download_dir, weights_dir, logs_dir, model) - else: - if "drive.google.com" in url or "drive.usercontent.google.com" in url: - file_id = None - - if "/file/d/" in url: file_id = url.split("/d/")[1].split("/")[0] - elif "open?id=" in url: file_id = url.split("open?id=")[1].split("/")[0] - elif "/download?id=" in url: file_id = url.split("/download?id=")[1].split("&")[0] - - if file_id: - file = gdown.gdown_download(id=file_id, output=download_dir) - if file.endswith(".zip"): shutil.unpack_archive(file, download_dir) - - move_files_from_directory(download_dir, weights_dir, logs_dir, model) - elif "mega.nz" in url: - meganz.mega_download_url(url, download_dir) - - file_download = next((f for f in os.listdir(download_dir)), None) - if file_download.endswith(".zip"): shutil.unpack_archive(os.path.join(download_dir, file_download), download_dir) - - move_files_from_directory(download_dir, weights_dir, logs_dir, model) - elif "mediafire.com" in url: - file = mediafire.Mediafire_Download(url, download_dir) - if file.endswith(".zip"): shutil.unpack_archive(file, download_dir) - - move_files_from_directory(download_dir, weights_dir, logs_dir, model) - elif "pixeldrain.com" in url: - file = pixeldrain.pixeldrain(url, download_dir) - if file.endswith(".zip"): shutil.unpack_archive(file, download_dir) - - move_files_from_directory(download_dir, weights_dir, logs_dir, model) - else: - gr_warning(translations["not_support_url"]) - return translations["not_support_url"] - - gr_info(translations["success"]) - return translations["success"] - except Exception as e: - gr_error(message=translations["error_occurred"].format(e=e)) - logger.debug(e) - return translations["error_occurred"].format(e=e) - finally: - shutil.rmtree(download_dir, ignore_errors=True) - -def save_drop_model(dropbox): - weight_folder = os.path.join("assets", "weights") - logs_folder = os.path.join("assets", "logs") - save_model_temp = os.path.join("save_model_temp") - - if not os.path.exists(weight_folder): os.makedirs(weight_folder, exist_ok=True) - if not os.path.exists(logs_folder): os.makedirs(logs_folder, exist_ok=True) - if not os.path.exists(save_model_temp): os.makedirs(save_model_temp, exist_ok=True) - - shutil.move(dropbox, save_model_temp) - - try: - file_name = os.path.basename(dropbox) - - if file_name.endswith(".pth") and file_name.endswith(".onnx") and file_name.endswith(".index"): 
gr_warning(translations["not_model"]) - else: - if file_name.endswith(".zip"): - shutil.unpack_archive(os.path.join(save_model_temp, file_name), save_model_temp) - move_files_from_directory(save_model_temp, weight_folder, logs_folder, file_name.replace(".zip", "")) - elif file_name.endswith((".pth", ".onnx")): - output_file = os.path.join(weight_folder, file_name) - if os.path.exists(output_file): os.remove(output_file) - - shutil.move(os.path.join(save_model_temp, file_name), output_file) - elif file_name.endswith(".index"): - def extract_name_model(filename): - match = re.search(r"([A-Za-z]+)(?=_v|\.|$)", filename) - return match.group(1) if match else None - - model_logs = os.path.join(logs_folder, extract_name_model(file_name)) - if not os.path.exists(model_logs): os.makedirs(model_logs, exist_ok=True) - shutil.move(os.path.join(save_model_temp, file_name), model_logs) - else: - gr_warning(translations["unable_analyze_model"]) - return None - - gr_info(translations["upload_success"].format(name=translations["model"])) - return None - except Exception as e: - gr_error(message=translations["error_occurred"].format(e=e)) - logger.debug(e) - return None - finally: - shutil.rmtree(save_model_temp, ignore_errors=True) - -def download_pretrained_model(choices, model, sample_rate): - pretraineds_custom_path = os.path.join("assets", "models", "pretrained_custom") - if choices == translations["list_model"]: - paths = fetch_pretrained_data()[model][sample_rate] - - if not os.path.exists(pretraineds_custom_path): os.makedirs(pretraineds_custom_path, exist_ok=True) - url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_phfgbz/", "rot13") + paths - - gr_info(translations["download_pretrain"]) - file = huggingface.HF_download_file(url.replace("/blob/", "/resolve/").replace("?download=true", "").strip(), os.path.join(pretraineds_custom_path, paths)) - - if file.endswith(".zip"): - shutil.unpack_archive(file, pretraineds_custom_path) - os.remove(file) - - gr_info(translations["success"]) - return translations["success"] - elif choices == translations["download_url"]: - if not model: return gr_warning(translations["provide_pretrain"].format(dg="D")) - if not sample_rate: return gr_warning(translations["provide_pretrain"].format(dg="G")) - - gr_info(translations["download_pretrain"]) - - huggingface.HF_download_file(model.replace("/blob/", "/resolve/").replace("?download=true", "").strip(), pretraineds_custom_path) - huggingface.HF_download_file(sample_rate.replace("/blob/", "/resolve/").replace("?download=true", "").strip(), pretraineds_custom_path) - - gr_info(translations["success"]) - return translations["success"] - -def hubert_download(hubert): - if not hubert: - gr_warning(translations["provide_hubert"]) - return translations["provide_hubert"] - - huggingface.HF_download_file(hubert.replace("/blob/", "/resolve/").replace("?download=true", "").strip(), os.path.join("assets", "models", "embedders")) - - gr_info(translations["success"]) - return translations["success"] - -def fushion_model_pth(name, pth_1, pth_2, ratio): - if not name.endswith(".pth"): name = name + ".pth" - - if not pth_1 or not os.path.exists(pth_1) or not pth_1.endswith(".pth"): - gr_warning(translations["provide_file"].format(filename=translations["model"] + " 1")) - return [translations["provide_file"].format(filename=translations["model"] + " 1"), None] - - if not pth_2 or not os.path.exists(pth_2) or not pth_2.endswith(".pth"): - 
gr_warning(translations["provide_file"].format(filename=translations["model"] + " 2")) - return [translations["provide_file"].format(filename=translations["model"] + " 2"), None] - - from collections import OrderedDict - - def extract(ckpt): - a = ckpt["model"] - opt = OrderedDict() - opt["weight"] = {} - - for key in a.keys(): - if "enc_q" in key: continue - - opt["weight"][key] = a[key] - - return opt - - try: - ckpt1 = torch.load(pth_1, map_location="cpu") - ckpt2 = torch.load(pth_2, map_location="cpu") - - if ckpt1["sr"] != ckpt2["sr"]: - gr_warning(translations["sr_not_same"]) - return [translations["sr_not_same"], None] - - cfg = ckpt1["config"] - cfg_f0 = ckpt1["f0"] - cfg_version = ckpt1["version"] - cfg_sr = ckpt1["sr"] - - vocoder = ckpt1.get("vocoder", "Default") - - ckpt1 = extract(ckpt1) if "model" in ckpt1 else ckpt1["weight"] - ckpt2 = extract(ckpt2) if "model" in ckpt2 else ckpt2["weight"] - - if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): - gr_warning(translations["architectures_not_same"]) - return [translations["architectures_not_same"], None] - - gr_info(translations["start"].format(start=translations["fushion_model"])) - - opt = OrderedDict() - opt["weight"] = {} - - for key in ckpt1.keys(): - if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: - min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) - opt["weight"][key] = (ratio * (ckpt1[key][:min_shape0].float()) + (1 - ratio) * (ckpt2[key][:min_shape0].float())).half() - else: opt["weight"][key] = (ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())).half() - - opt["config"] = cfg - opt["sr"] = cfg_sr - opt["f0"] = cfg_f0 - opt["version"] = cfg_version - opt["infos"] = translations["model_fushion_info"].format(name=name, pth_1=pth_1, pth_2=pth_2, ratio=ratio) - opt["vocoder"] = vocoder - - output_model = os.path.join("assets", "weights") - if not os.path.exists(output_model): os.makedirs(output_model, exist_ok=True) - - torch.save(opt, os.path.join(output_model, name)) - - gr_info(translations["success"]) - return [translations["success"], os.path.join(output_model, name)] - except Exception as e: - gr_error(message=translations["error_occurred"].format(e=e)) - logger.debug(e) - return [e, None] - -def extract_metadata(model): - return {prop.key: prop.value for prop in model.metadata_props} - -def fushion_model_onnx(name, onnx_path1, onnx_path2, ratio=0.5): - if not name.endswith(".onnx"): name = name + ".onnx" - - if not onnx_path1 or not os.path.exists(onnx_path1) or not onnx_path1.endswith(".onnx"): - gr_warning(translations["provide_file"].format(filename=translations["model"] + " 1")) - return [translations["provide_file"].format(filename=translations["model"] + " 1"), None] - - if not onnx_path2 or not os.path.exists(onnx_path2) or not onnx_path2.endswith(".onnx"): - gr_warning(translations["provide_file"].format(filename=translations["model"] + " 2")) - return [translations["provide_file"].format(filename=translations["model"] + " 2"), None] - - try: - model1 = onnx.load(onnx_path1) - model2 = onnx.load(onnx_path2) - - metadata1 = extract_metadata(model1) - metadata2 = extract_metadata(model2) - - if metadata1.get("sr") != metadata2.get("sr"): - gr_warning(translations["sr_not_same"]) - return [translations["sr_not_same"], None] - - gr_info(translations["start"].format(start=translations["fushion_model"])) - - for init1, init2 in zip(model1.graph.initializer, model2.graph.initializer): - tensor1 = onnx.numpy_helper.to_array(init1) - tensor2 = 
onnx.numpy_helper.to_array(init2) - - if tensor1.shape != tensor2.shape: - gr_warning(translations["architectures_not_same"]) - return [translations["architectures_not_same"], None] - - fused_tensor = ratio * tensor1 + (1 - ratio) * tensor2 - init1.CopyFrom(onnx.numpy_helper.from_array(fused_tensor, name=init1.name)) - - new_metadata = metadata1.copy() - new_metadata["fusion_ratio"] = str(ratio) - new_metadata["creation_date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - del model1.metadata_props[:] - - for key, value in new_metadata.items(): - entry = model1.metadata_props.add() - entry.key = key - entry.value = value - - output_model = os.path.join("assets", "weights") - if not os.path.exists(output_model): os.makedirs(output_model, exist_ok=True) - - onnx.save(model1, os.path.join(output_model, name)) - - gr_info(translations["success"]) - return [translations["success"], os.path.join(output_model, name)] - except Exception as e: - gr_error(message=translations["error_occurred"].format(e=e)) - logger.debug(e) - return [e, None] - -def fushion_model(name, path_1, path_2, ratio): - if not name: - gr_warning(translations["provide_name_is_save"]) - return [translations["provide_name_is_save"], None] - - if path_1.endswith(".onnx") and path_2.endswith(".onnx"): return fushion_model_onnx(name.replace(".pth", ".onnx"), path_1, path_2, ratio) - elif path_1.endswith(".pth") and path_2.endswith(".pth"): return fushion_model_pth(name.replace(".onnx", ".pth"), path_1, path_2, ratio) - else: - gr_warning(translations["format_not_valid"]) - return [None, None] - -def onnx_export(model_path): - from main.library.algorithm.onnx_export import onnx_exporter - - if not model_path.endswith(".pth"): model_path + ".pth" - if not model_path or not os.path.exists(model_path) or not model_path.endswith(".pth"): - gr_warning(translations["provide_file"].format(filename=translations["model"])) - return [None, translations["provide_file"].format(filename=translations["model"])] - - try: - gr_info(translations["start_onnx_export"]) - output = onnx_exporter(model_path, model_path.replace(".pth", ".onnx")) - - gr_info(translations["success"]) - return [output, translations["success"]] - except Exception as e: - return [None, e] - -def model_info(path): - if not path or not os.path.exists(path) or os.path.isdir(path) or not path.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"])) - - def prettify_date(date_str): - if date_str == translations["not_found_create_time"]: return None - - try: - return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f").strftime("%Y-%m-%d %H:%M:%S") - except ValueError as e: - logger.debug(e) - return translations["format_not_valid"] - - if path.endswith(".pth"): model_data = torch.load(path, map_location=torch.device("cpu")) - else: - model = onnx.load(path) - model_data = None - - for prop in model.metadata_props: - if prop.key == "model_info": - model_data = json.loads(prop.value) - break - - gr_info(translations["read_info"]) - - epochs = model_data.get("epoch", None) - if epochs is None: - epochs = model_data.get("info", None) - try: - epoch = epochs.replace("epoch", "").replace("e", "").isdigit() - if epoch and epochs is None: epochs = translations["not_found"].format(name=translations["epoch"]) - except: - pass - - steps = model_data.get("step", translations["not_found"].format(name=translations["step"])) - sr = model_data.get("sr", translations["not_found"].format(name=translations["sr"])) - f0 = 
model_data.get("f0", translations["not_found"].format(name=translations["f0"])) - version = model_data.get("version", translations["not_found"].format(name=translations["version"])) - creation_date = model_data.get("creation_date", translations["not_found_create_time"]) - model_hash = model_data.get("model_hash", translations["not_found"].format(name="model_hash")) - pitch_guidance = translations["trained_f0"] if f0 else translations["not_f0"] - creation_date_str = prettify_date(creation_date) if creation_date else translations["not_found_create_time"] - model_name = model_data.get("model_name", translations["unregistered"]) - model_author = model_data.get("author", translations["not_author"]) - vocoder = model_data.get("vocoder", "Default") - - gr_info(translations["success"]) - return translations["model_info"].format(model_name=model_name, model_author=model_author, epochs=epochs, steps=steps, version=version, sr=sr, pitch_guidance=pitch_guidance, model_hash=model_hash, creation_date_str=creation_date_str, vocoder=vocoder) - -def audio_effects(input_path, output_path, resample, resample_sr, chorus_depth, chorus_rate, chorus_mix, chorus_delay, chorus_feedback, distortion_drive, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift, delay_seconds, delay_feedback, delay_mix, compressor_threshold, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold, limiter_release, gain_db, bitcrush_bit_depth, clipping_threshold, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost_db, bass_boost_frequency, treble_boost_db, treble_boost_frequency, fade_in_duration, fade_out_duration, export_format, chorus, distortion, reverb, delay, compressor, limiter, gain, bitcrush, clipping, phaser, treble_bass_boost, fade_in_out, audio_combination, audio_combination_input): - if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path): - gr_warning(translations["input_not_valid"]) - return None - - if not output_path: - gr_warning(translations["output_not_valid"]) - return None - - if os.path.isdir(output_path): output_path = os.path.join(output_path, f"audio_effects.{export_format}") - output_dir = os.path.dirname(output_path) or output_path - - if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) - if os.path.exists(output_path): os.remove(output_path) - - gr_info(translations["start"].format(start=translations["apply_effect"])) - os.system(f'{python} main/inference/audio_effects.py --input_path "{input_path}" --output_path "{output_path}" --resample {resample} --resample_sr {resample_sr} --chorus_depth {chorus_depth} --chorus_rate {chorus_rate} --chorus_mix {chorus_mix} --chorus_delay {chorus_delay} --chorus_feedback {chorus_feedback} --drive_db {distortion_drive} --reverb_room_size {reverb_room_size} --reverb_damping {reverb_damping} --reverb_wet_level {reverb_wet_level} --reverb_dry_level {reverb_dry_level} --reverb_width {reverb_width} --reverb_freeze_mode {reverb_freeze_mode} --pitch_shift {pitch_shift} --delay_seconds {delay_seconds} --delay_feedback {delay_feedback} --delay_mix {delay_mix} --compressor_threshold {compressor_threshold} --compressor_ratio {compressor_ratio} --compressor_attack_ms {compressor_attack_ms} --compressor_release_ms {compressor_release_ms} --limiter_threshold {limiter_threshold} --limiter_release {limiter_release} --gain_db {gain_db} --bitcrush_bit_depth {bitcrush_bit_depth} --clipping_threshold 
{clipping_threshold} --phaser_rate_hz {phaser_rate_hz} --phaser_depth {phaser_depth} --phaser_centre_frequency_hz {phaser_centre_frequency_hz} --phaser_feedback {phaser_feedback} --phaser_mix {phaser_mix} --bass_boost_db {bass_boost_db} --bass_boost_frequency {bass_boost_frequency} --treble_boost_db {treble_boost_db} --treble_boost_frequency {treble_boost_frequency} --fade_in_duration {fade_in_duration} --fade_out_duration {fade_out_duration} --export_format {export_format} --chorus {chorus} --distortion {distortion} --reverb {reverb} --pitchshift {pitch_shift != 0} --delay {delay} --compressor {compressor} --limiter {limiter} --gain {gain} --bitcrush {bitcrush} --clipping {clipping} --phaser {phaser} --treble_bass_boost {treble_bass_boost} --fade_in_out {fade_in_out} --audio_combination {audio_combination} --audio_combination_input "{audio_combination_input}"') - - gr_info(translations["success"]) - return output_path - -async def TTS(prompt, voice, speed, output, pitch, google): - if not prompt: - gr_warning(translations["enter_the_text"]) - return None - - if not voice: - gr_warning(translations["choose_voice"]) - return None - - if not output: - gr_warning(translations["output_not_valid"]) - return None - - if os.path.isdir(output): output = os.path.join(output, f"tts.wav") - gr_info(translations["convert"].format(name=translations["text"])) - - output_dir = os.path.dirname(output) or output - if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) - - if not google: await edge_tts.Communicate(text=prompt, voice=voice, rate=f"+{speed}%" if speed >= 0 else f"{speed}%", pitch=f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz").save(output) - else: google_tts.google_tts(text=prompt, lang=voice, speed=speed, pitch=pitch, output_file=output) - - gr_info(translations["success"]) - return output - -def separator_music(input, output_audio, format, shifts, segments_size, overlap, clean_audio, clean_strength, denoise, separator_model, kara_model, backing, reverb, backing_reverb, hop_length, batch_size, sample_rate): - output = os.path.dirname(output_audio) or output_audio - - if not input or not os.path.exists(input) or os.path.isdir(input): - gr_warning(translations["input_not_valid"]) - return [None]*4 - - if not os.path.exists(output): - gr_warning(translations["output_not_valid"]) - return [None]*4 - - if not os.path.exists(output): os.makedirs(output) - gr_info(translations["start"].format(start=translations["separator_music"])) - - os.system(f'{python} main/inference/separator_music.py --input_path "{input}" --output_path "{output}" --format {format} --shifts {shifts} --segments_size {segments_size} --overlap {overlap} --mdx_hop_length {hop_length} --mdx_batch_size {batch_size} --clean_audio {clean_audio} --clean_strength {clean_strength} --kara_model {kara_model} --backing {backing} --mdx_denoise {denoise} --reverb {reverb} --backing_reverb {backing_reverb} --model_name "{separator_model}" --sample_rate {sample_rate}') - gr_info(translations["success"]) - - return [os.path.join(output, f"Original_Vocals_No_Reverb.{format}") if reverb else os.path.join(output, f"Original_Vocals.{format}"), os.path.join(output, f"Instruments.{format}"), (os.path.join(output, f"Main_Vocals_No_Reverb.{format}") if reverb else os.path.join(output, f"Main_Vocals.{format}") if backing else None), (os.path.join(output, f"Backing_Vocals_No_Reverb.{format}") if backing_reverb else os.path.join(output, f"Backing_Vocals.{format}") if backing else None)] if os.path.isfile(input) else [None]*4 - -def 
convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_onnx, formant_shifting, formant_qfrency, formant_timbre, f0_file): - os.system(f'{python} main/inference/convert.py --pitch {pitch} --filter_radius {filter_radius} --index_rate {index_rate} --volume_envelope {volume_envelope} --protect {protect} --hop_length {hop_length} --f0_method {f0_method} --input_path "{input_path}" --output_path "{output_path}" --pth_path "{pth_path}" --index_path "{index_path}" --f0_autotune {f0_autotune} --clean_audio {clean_audio} --clean_strength {clean_strength} --export_format {export_format} --embedder_model {embedder_model} --resample_sr {resample_sr} --split_audio {split_audio} --f0_autotune_strength {f0_autotune_strength} --checkpointing {checkpointing} --f0_onnx {onnx_f0_mode} --embedders_onnx {embedders_onnx} --formant_shifting {formant_shifting} --formant_qfrency {formant_qfrency} --formant_timbre {formant_timbre} --f0_file "{f0_file}"') - -def convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, input_audio_name, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_onnx): - model_path = os.path.join("assets", "weights", model) - - return_none = [None]*6 - return_none[5] = {"visible": True, "__type__": "update"} - - if not use_audio: - if merge_instrument or not_merge_backing or convert_backing or use_original: - gr_warning(translations["turn_on_use_audio"]) - return return_none - - if use_original: - if convert_backing: - gr_warning(translations["turn_off_convert_backup"]) - return return_none - elif not_merge_backing: - gr_warning(translations["turn_off_merge_backup"]) - return return_none - - if not model or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")): - gr_warning(translations["provide_file"].format(filename=translations["model"])) - return return_none - - f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders) - - if use_audio: - output_audio = os.path.join("audios", input_audio_name) - - def get_audio_file(label): - matching_files = [f for f in os.listdir(output_audio) if label in f] - - if not matching_files: return translations["notfound"] - return os.path.join(output_audio, matching_files[0]) - - output_path = os.path.join(output_audio, f"Convert_Vocals.{format}") - output_backing = os.path.join(output_audio, f"Convert_Backing.{format}") - output_merge_backup = os.path.join(output_audio, f"Vocals+Backing.{format}") - output_merge_instrument = os.path.join(output_audio, f"Vocals+Instruments.{format}") - - if os.path.exists(output_audio): os.makedirs(output_audio, exist_ok=True) - if os.path.exists(output_path): os.remove(output_path) - - if use_original: - original_vocal = get_audio_file('Original_Vocals_No_Reverb.') - - if original_vocal == translations["notfound"]: original_vocal = get_audio_file('Original_Vocals.') - - if original_vocal == translations["notfound"]: - 
gr_warning(translations["not_found_original_vocal"]) - return return_none - - input_path = original_vocal - else: - main_vocal = get_audio_file('Main_Vocals_No_Reverb.') - backing_vocal = get_audio_file('Backing_Vocals_No_Reverb.') - - if main_vocal == translations["notfound"]: main_vocal = get_audio_file('Main_Vocals.') - if not not_merge_backing and backing_vocal == translations["notfound"]: backing_vocal = get_audio_file('Backing_Vocals.') - - if main_vocal == translations["notfound"]: - gr_warning(translations["not_found_main_vocal"]) - return return_none - - if not not_merge_backing and backing_vocal == translations["notfound"]: - gr_warning(translations["not_found_backing_vocal"]) - return return_none - - input_path = main_vocal - backing_path = backing_vocal - - gr_info(translations["convert_vocal"]) - - convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, input_path, output_path, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_onnx, formant_shifting, formant_qfrency, formant_timbre, f0_file) - - gr_info(translations["convert_success"]) - - if convert_backing: - if os.path.exists(output_backing): os.remove(output_backing) - - gr_info(translations["convert_backup"]) - - convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, backing_path, output_backing, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_onnx, formant_shifting, formant_qfrency, formant_timbre, f0_file) - - gr_info(translations["convert_backup_success"]) - - try: - if not not_merge_backing and not use_original: - backing_source = output_backing if convert_backing else backing_vocal - - if os.path.exists(output_merge_backup): os.remove(output_merge_backup) - - gr_info(translations["merge_backup"]) - - pydub_convert(pydub_load(output_path)).overlay(pydub_convert(pydub_load(backing_source))).export(output_merge_backup, format=format) - - gr_info(translations["merge_success"]) - - if merge_instrument: - vocals = output_merge_backup if not not_merge_backing and not use_original else output_path - - if os.path.exists(output_merge_instrument): os.remove(output_merge_instrument) - - gr_info(translations["merge_instruments_process"]) - - instruments = get_audio_file('Instruments.') - - if instruments == translations["notfound"]: - gr_warning(translations["not_found_instruments"]) - output_merge_instrument = None - else: pydub_convert(pydub_load(instruments)).overlay(pydub_convert(pydub_load(vocals))).export(output_merge_instrument, format=format) - - gr_info(translations["merge_success"]) - except: - return return_none - - return [(None if use_original else output_path), output_backing, (None if not_merge_backing and use_original else output_merge_backup), (output_path if use_original else None), (output_merge_instrument if merge_instrument else None), {"visible": True, "__type__": "update"}] - else: - if not input or not os.path.exists(input): - gr_warning(translations["input_not_valid"]) - return return_none - - if not output: - gr_warning(translations["output_not_valid"]) - return return_none - - if os.path.isdir(input): - gr_info(translations["is_folder"]) - - if not [f for f in os.listdir(input) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]: - 
gr_warning(translations["not_found_in_folder"]) - return return_none - - gr_info(translations["batch_convert"]) - - output_dir = os.path.dirname(output) or output - convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, input, output_dir, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_onnx, formant_shifting, formant_qfrency, formant_timbre, f0_file) - - gr_info(translations["batch_convert_success"]) - - return return_none - else: - output_dir = os.path.dirname(output) or output - - if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) - if os.path.exists(output): os.remove(output) - - gr_info(translations["convert_vocal"]) - - convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_onnx, formant_shifting, formant_qfrency, formant_timbre, f0_file) - - gr_info(translations["convert_success"]) - - return_none[0] = output - return return_none - -def convert_selection(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_onnx): - if use_audio: - gr_info(translations["search_separate"]) - - choice = [f for f in os.listdir("audios") if os.path.isdir(os.path.join("audios", f))] - - gr_info(translations["found_choice"].format(choice=len(choice))) - - if len(choice) == 0: - gr_warning(translations["separator==0"]) - - return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, None, None, None, None, None, {"visible": True, "__type__": "update"}] - elif len(choice) == 1: - convert_output = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, None, None, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, choice[0], checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_onnx) - - return [{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, convert_output[0], convert_output[1], convert_output[2], convert_output[3], convert_output[4], {"visible": True, "__type__": "update"}] - else: return [{"choices": choice, "value": "", "interactive": True, "visible": True, "__type__": "update"}, None, None, None, None, None, {"visible": False, "__type__": "update"}] - else: - main_convert = convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, None, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_onnx) - - return 
[{"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}, main_convert[0], None, None, None, None, {"visible": True, "__type__": "update"}] - -def convert_tts(clean, autotune, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_onnx): - model_path = os.path.join("assets", "weights", model) - - if not model_path or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")): - gr_warning(translations["provide_file"].format(filename=translations["model"])) - return None - - if not input or not os.path.exists(input): - gr_warning(translations["input_not_valid"]) - return None - - if os.path.isdir(input): - input_audio = [f for f in os.listdir(input) if "tts" in f and f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))] - - if not input_audio: - gr_warning(translations["not_found_in_folder"]) - return None - - input = os.path.join(input, input_audio[0]) - - if not output: - gr_warning(translations["output_not_valid"]) - return None - - if os.path.isdir(output): output = os.path.join(output, f"tts.{format}") - - output_dir = os.path.dirname(output) - if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) - - if os.path.exists(output): os.remove(output) - - f0method = method if method != "hybrid" else hybrid_method - embedder_model = embedders if embedders != "custom" else custom_embedders - - gr_info(translations["convert_vocal"]) - - convert(pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_onnx, formant_shifting, formant_qfrency, formant_timbre, f0_file) - - gr_info(translations["convert_success"]) - return output - -def log_read(log_file, done): - f = open(log_file, "w", encoding="utf-8") - f.close() - - while 1: - with open(log_file, "r", encoding="utf-8") as f: - yield "".join(line for line in f.readlines() if "DEBUG" not in line and line.strip() != "") - - sleep(1) - if done[0]: break - - with open(log_file, "r", encoding="utf-8") as f: - log = "".join(line for line in f.readlines() if "DEBUG" not in line and line.strip() != "") - - yield log - -def create_dataset(input_audio, output_dataset, clean_dataset, clean_strength, separator_reverb, kim_vocals_version, overlap, segments_size, denoise_mdx, skip, skip_start, skip_end, hop_length, batch_size, sample_rate): - version = 1 if kim_vocals_version == "Version-1" else 2 - - gr_info(translations["start"].format(start=translations["create"])) - - p = Popen(f'{python} main/inference/create_dataset.py --input_audio "{input_audio}" --output_dataset "{output_dataset}" --clean_dataset {clean_dataset} --clean_strength {clean_strength} --separator_reverb {separator_reverb} --kim_vocal_version {version} --overlap {overlap} --segments_size {segments_size} --mdx_hop_length {hop_length} --mdx_batch_size {batch_size} --denoise_mdx {denoise_mdx} --skip {skip} --skip_start_audios "{skip_start}" --skip_end_audios "{skip_end}" --sample_rate {sample_rate}', shell=True) - done = [False] - - threading.Thread(target=if_done, 
args=(done, p)).start() - - for log in log_read(os.path.join("assets", "logs", "create_dataset.log"), done): - yield log - -def preprocess(model_name, sample_rate, cpu_core, cut_preprocess, process_effects, path, clean_dataset, clean_strength): - dataset = os.path.join(path) - sr = int(float(sample_rate.rstrip("k")) * 1000) - - if not model_name: return gr_warning(translations["provide_name"]) - if not any(f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3")) for f in os.listdir(dataset) if os.path.isfile(os.path.join(dataset, f))): return gr_warning(translations["not_found_data"]) - - model_dir = os.path.join("assets", "logs", model_name) - if os.path.exists(model_dir): shutil.rmtree(model_dir, ignore_errors=True) - - p = Popen(f'{python} main/inference/preprocess.py --model_name "{model_name}" --dataset_path "{dataset}" --sample_rate {sr} --cpu_cores {cpu_core} --cut_preprocess {cut_preprocess} --process_effects {process_effects} --clean_dataset {clean_dataset} --clean_strength {clean_strength}', shell=True) - done = [False] - - threading.Thread(target=if_done, args=(done, p)).start() - os.makedirs(model_dir, exist_ok=True) - - for log in log_read(os.path.join(model_dir, "preprocess.log"), done): - yield log - -def extract(model_name, version, method, pitch_guidance, hop_length, cpu_cores, gpu, sample_rate, embedders, custom_embedders, onnx_f0_mode): - embedder_model = embedders if embedders != "custom" else custom_embedders - sr = int(float(sample_rate.rstrip("k")) * 1000) - - if not model_name: return gr_warning(translations["provide_name"]) - - model_dir = os.path.join("assets", "logs", model_name) - if not any(os.path.isfile(os.path.join(model_dir, "sliced_audios", f)) for f in os.listdir(os.path.join(model_dir, "sliced_audios"))) or not any(os.path.isfile(os.path.join(model_dir, "sliced_audios_16k", f)) for f in os.listdir(os.path.join(model_dir, "sliced_audios_16k"))): return gr_warning(translations["not_found_data_preprocess"]) - - p = Popen(f'{python} main/inference/extract.py --model_name "{model_name}" --rvc_version {version} --f0_method {method} --pitch_guidance {pitch_guidance} --hop_length {hop_length} --cpu_cores {cpu_cores} --gpu {gpu} --sample_rate {sr} --embedder_model {embedder_model} --f0_onnx {onnx_f0_mode}', shell=True) - done = [False] - - threading.Thread(target=if_done, args=(done, p)).start() - os.makedirs(model_dir, exist_ok=True) - - for log in log_read(os.path.join(model_dir, "extract.log"), done): - yield log - -def create_index(model_name, rvc_version, index_algorithm): - if not model_name: return gr_warning(translations["provide_name"]) - model_dir = os.path.join("assets", "logs", model_name) - - if not any(os.path.isfile(os.path.join(model_dir, f"{rvc_version}_extracted", f)) for f in os.listdir(os.path.join(model_dir, f"{rvc_version}_extracted"))): return gr_warning(translations["not_found_data_extract"]) - - p = Popen(f'{python} main/inference/create_index.py --model_name "{model_name}" --rvc_version {rvc_version} --index_algorithm {index_algorithm}', shell=True) - done = [False] - - threading.Thread(target=if_done, args=(done, p)).start() - os.makedirs(model_dir, exist_ok=True) - - for log in log_read(os.path.join(model_dir, "create_index.log"), done): - yield log - -def training(model_name, rvc_version, save_every_epoch, save_only_latest, save_every_weights, total_epoch, sample_rate, batch_size, gpu, pitch_guidance, not_pretrain, custom_pretrained, pretrain_g, pretrain_d, detector, 
threshold, clean_up, cache, model_author, vocoder, checkpointing): - sr = int(float(sample_rate.rstrip("k")) * 1000) - if not model_name: return gr_warning(translations["provide_name"]) - - model_dir = os.path.join("assets", "logs", model_name) - if not any(os.path.isfile(os.path.join(model_dir, f"{rvc_version}_extracted", f)) for f in os.listdir(os.path.join(model_dir, f"{rvc_version}_extracted"))): return gr_warning(translations["not_found_data_extract"]) - - if not not_pretrain: - if not custom_pretrained: - pretrained_selector = {True: {32000: ("f0G32k.pth", "f0D32k.pth"), 40000: ("f0G40k.pth", "f0D40k.pth"), 44100: ("f0G44k.pth", "f0D44k.pth"), 48000: ("f0G48k.pth", "f0D48k.pth")}, False: {32000: ("G32k.pth", "D32k.pth"), 40000: ("G40k.pth", "D40k.pth"), 44100: ("G44k.pth", "D44k.pth"), 48000: ("G48k.pth", "D48k.pth")}} - - pg, pd = pretrained_selector[pitch_guidance][sr] - else: - if not pretrain_g: return gr_warning(translations["provide_pretrained"].format(dg="G")) - if not pretrain_d: return gr_warning(translations["provide_pretrained"].format(dg="D")) - - pg, pd = pretrain_g, pretrain_d - - pretrained_G, pretrained_D = (os.path.join("assets", "models", f"pretrained_{rvc_version}", f"{vocoder if vocoder != 'Default' else ''}{pg}"), os.path.join("assets", "models", f"pretrained_{rvc_version}", f"{vocoder if vocoder != 'Default' else ''}{pd}")) if not custom_pretrained else (os.path.join("assets", "models", f"pretrained_custom", pg), os.path.join("assets", "models", f"pretrained_custom", pd)) - download_version = codecs.decode(f"uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_i{'2' if rvc_version == 'v2' else '1'}/", "rot13") - - if not custom_pretrained: - try: - if not os.path.exists(pretrained_G): - gr_info(translations["download_pretrained"].format(dg="G", rvc_version=rvc_version)) - huggingface.HF_download_file(f"{download_version}{pg}", os.path.join("assets", "models", f"pretrained_{rvc_version}", f"{vocoder if vocoder != 'Default' else ''}{pg}")) - - if not os.path.exists(pretrained_D): - gr_info(translations["download_pretrained"].format(dg="D", rvc_version=rvc_version)) - huggingface.HF_download_file(f"{download_version}{pd}", os.path.join("assets", "models", f"pretrained_{rvc_version}", f"{vocoder if vocoder != 'Default' else ''}{pd}")) - except: - gr_warning(translations["not_use_pretrain_error_download"]) - pretrained_G, pretrained_D = None, None - else: - if not os.path.exists(pretrained_G): return gr_warning(translations["not_found_pretrain"].format(dg="G")) - if not os.path.exists(pretrained_D): return gr_warning(translations["not_found_pretrain"].format(dg="D")) - else: gr_warning(translations["not_use_pretrain"]) - - gr_info(translations["start"].format(start=translations["training"])) - - p = Popen(f'{python} main/inference/train.py --model_name "{model_name}" --rvc_version {rvc_version} --save_every_epoch {save_every_epoch} --save_only_latest {save_only_latest} --save_every_weights {save_every_weights} --total_epoch {total_epoch} --sample_rate {sr} --batch_size {batch_size} --gpu {gpu} --pitch_guidance {pitch_guidance} --overtraining_detector {detector} --overtraining_threshold {threshold} --cleanup {clean_up} --cache_data_in_gpu {cache} --g_pretrained_path "{pretrained_G}" --d_pretrained_path "{pretrained_D}" --model_author "{model_author}" --vocoder "{vocoder}" --checkpointing {checkpointing}', shell=True) - done = [False] - - threading.Thread(target=if_done, args=(done, p)).start() - if not os.path.exists(model_dir): 
os.makedirs(model_dir, exist_ok=True)
-
-    for log in log_read(os.path.join(model_dir, "train.log"), done):
-        if len(log.split("\n")) > 100: log = "\n".join(log.split("\n")[-100:])  # keep the last 100 lines; slicing the string kept the last 100 characters instead
-        yield log
-
-def stop_pid(pid_file, model_name=None):
-    try:
-        pid_file_path = os.path.join("assets", f"{pid_file}.txt") if model_name is None else os.path.join("assets", "logs", model_name, f"{pid_file}.txt")
-
-        if not os.path.exists(pid_file_path): return gr_warning(translations["not_found_pid"])
-        else:
-            with open(pid_file_path, "r") as f:  # renamed from "pid_file", which shadowed the parameter
-                pids = [int(pid) for pid in f.readlines()]
-
-            for pid in pids:
-                os.kill(pid, 9)
-
-            gr_info(translations["end_pid"])
-            if os.path.exists(pid_file_path): os.remove(pid_file_path)
-    except Exception:
-        pass
-
-def stop_train(model_name):
-    try:
-        pid_file_path = os.path.join("assets", "logs", model_name, "config.json")
-
-        if not os.path.exists(pid_file_path): return gr_warning(translations["not_found_pid"])
-        else:
-            with open(pid_file_path, "r") as f:
-                pid_data = json.load(f)
-                pids = pid_data.get("process_pids", [])
-
-            with open(pid_file_path, "w") as f:
-                pid_data.pop("process_pids", None)
-
-                json.dump(pid_data, f, indent=4)
-
-            for pid in pids:
-                os.kill(pid, 9)
-
-            gr_info(translations["end_pid"])
-    except Exception:
-        pass
-
-def delete_audios(files):
-    if not os.path.exists(files) or os.path.isdir(files): return gr_warning(translations["input_not_valid"])
-    else:
-        gr_info(translations["clean_audios"])
-        os.remove(files)
-
-        for item in os.listdir("audios"):
-            item_path = os.path.join("audios", item)
-
-            if os.path.isdir(item_path) and not os.listdir(item_path): shutil.rmtree(item_path, ignore_errors=True)  # drop now-empty subfolders
-
-        gr_info(translations["clean_audios_success"])
-        return change_audios_choices()
-
-def delete_separated(files):
-    if not os.path.exists(files) or os.path.isdir(files): return gr_warning(translations["input_not_valid"])
-    else:
-        gr_info(translations["clean_separate"])
-        os.remove(files)
-
-        gr_info(translations["clean_separate_success"])
-        return change_separate_choices()
-
-def delete_model(model, index):
-    files = os.path.join("assets", "weights", model)
-
-    if model:
-        if not os.path.exists(files) or not model.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"]))
-        else:
-            gr_info(translations["clean_model"])
-            os.remove(files)
-            gr_info(translations["clean_model_success"])
-
-    if index:
-        if not os.path.exists(index): return gr_warning(translations["provide_file"].format(filename=translations["index"]))
-        else:
-            gr_info(translations["clean_index"])
-            shutil.rmtree(index, ignore_errors=True)
-            gr_info(translations["clean_index_success"])
-
-    return change_choices_del()
-
-def delete_pretrained(pretrain):
-    if not os.path.exists(pretrain) or os.path.isdir(pretrain): return gr_warning(translations["input_not_valid"])
-    else:
-        gr_info(translations["clean_pretrain"])
-        os.remove(pretrain)
-        gr_info(translations["clean_pretrain_success"])
-
-    return change_allpretrained_choices()
-
-def delete_presets(json_file):
-    files = os.path.join("assets", "presets", json_file)
-
-    if not os.path.exists(files) or not json_file.endswith(".json"): return gr_warning(translations["provide_file_settings"])
-    else:
-        gr_info(translations["clean_presets_2"])
-        os.remove(files)
-        gr_info(translations["clean_presets_success"])
-
-    return change_preset_choices()
-
-def delete_all_audios():
-    dir = "audios"
-
-    if len(os.listdir(dir)) < 1: return gr_warning(translations["not_found_in_folder"])
-    else:
-        gr_info(translations["clean_all_audios"])
-
-        shutil.rmtree(dir, ignore_errors=True)
-        os.makedirs(dir, exist_ok=True)
-
-        gr_info(translations["clean_all_audios_success"])
-        return {"choices": [], "value": "", "__type__": "update"}
-
-def delete_all_separated():
-    dir = os.path.join("assets", "models", "uvr5")
-
-    if len(os.listdir(dir)) < 1: return gr_warning(translations["not_found_separate_model"])
-    else:
-        gr_info(translations["clean_all_separate_model"])
-
-        shutil.rmtree(dir, ignore_errors=True)
-        os.makedirs(dir, exist_ok=True)
-
-        gr_info(translations["clean_all_separate_model_success"])
-        return {"choices": [], "value": "", "__type__": "update"}
-
-def delete_all_model():
-    model = os.listdir(os.path.join("assets", "weights"))
-    index = [f for f in os.listdir(os.path.join("assets", "logs")) if os.path.isdir(os.path.join("assets", "logs", f)) and f != "mute"]
-
-    if len(model) < 1: return gr_warning(translations["not_found"].format(name=translations["model"]))
-    if len(index) < 1: return gr_warning(translations["not_found"].format(name=translations["index"]))
-
-    gr_info(translations["start_clean_model"])
-
-    for f in model:
-        file = os.path.join("assets", "weights", f)
-        if os.path.exists(file) and f.endswith((".pth", ".onnx")): os.remove(file)
-
-    for f in index:
-        file = os.path.join("assets", "logs", f)
-        if os.path.exists(file): shutil.rmtree(file, ignore_errors=True)
-
-    gr_info(translations["clean_all_models_success"])
-    return [{"choices": [], "value": "", "__type__": "update"}]*2
-
-def delete_all_pretrained():
-    Allpretrained = [os.path.join("assets", "models", path, model) for path in ["pretrained_v1", "pretrained_v2", "pretrained_custom"] for model in os.listdir(os.path.join("assets", "models", path)) if model.endswith(".pth") and ("D" in model or "G" in model)]
-
-    if len(Allpretrained) < 1: return gr_warning(translations["not_found_pretrained"])
-    else:
-        gr_info(translations["clean_all_pretrained"])
-        for f in Allpretrained:
-            if os.path.exists(f): os.remove(f)
-
-        gr_info(translations["clean_all_pretrained_success"])
-        return {"choices": [], "value": "", "__type__": "update"}
-
-def delete_all_presets():
-    dir = os.path.join("assets", "presets")
-
-    if len(os.listdir(dir)) < 1: return gr_warning(translations["not_found_presets"])
-    else:
-        gr_info(translations["clean_all_presets"])
-
-        shutil.rmtree(dir, ignore_errors=True)
-        os.makedirs(dir, exist_ok=True)
-
-        gr_info(translations["clean_all_presets_success"])
-        return {"choices": [], "value": "", "__type__": "update"}
-
-def delete_all_log():
-    log_path = [os.path.join(root, f) for root, _, files in os.walk(os.path.join("assets", "logs"), topdown=False) for f in files if f.endswith(".log")]
-
-    if len(log_path) < 1: return gr_warning(translations["not_found_log"])
-    else:
-        gr_info(translations["clean_all_log"])
-
-        for f in log_path:
-            if os.path.exists(f): os.remove(f)
-
-        open(os.path.join("assets", "logs", "app.log"), "w", encoding="utf-8").close()  # recreate an empty app.log and close the handle instead of leaking it
-        gr_info(translations["clean_all_log_success"])
-
-def delete_all_predictors():
-    dir = os.path.join("assets", "models", "predictors")
-
-    if len(os.listdir(dir)) < 1: return gr_warning(translations["not_found_predictors"])
-    else:
-        gr_info(translations["clean_all_predictors"])
-
-        shutil.rmtree(dir, ignore_errors=True)
-        os.makedirs(dir, exist_ok=True)
-
-        gr_info(translations["clean_all_predictors_success"])
-        return {"choices": [], "value": "", "__type__": "update"}
-
-def delete_all_embedders():
-    dir = os.path.join("assets", "models", "embedders")
-
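-    # Same wipe-and-recreate pattern as the other delete_all_* helpers above: empty the
-    # folder, recreate it, then return a Gradio update dict that clears the linked dropdown.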
if len(os.listdir(dir)) < 1: return gr_warning(translations["not_found_embedders"]) - else: - gr_info(translations["clean_all_embedders"]) - - shutil.rmtree(dir, ignore_errors=True) - os.makedirs(dir, exist_ok=True) - - gr_info(translations["clean_all_embedders_success"]) - return {"choices": [], "value": "", "__type__": "update"} - -def delete_dataset(name): - if not name or not os.path.exists(name) or not os.path.isdir(name): return gr_warning(translations["provide_folder"]) - else: - if len(os.listdir(name)) < 1: gr_warning(translations["empty_folder"]) - else: - gr_info(translations["clean_dataset"]) - - shutil.rmtree(name, ignore_errors=True) - os.makedirs(name, exist_ok=True) - - gr_info(translations["clean_dataset_success"]) - -def clean_f0_files(): - path = os.path.join("assets", "f0") - - if len(os.listdir(path)) < 1: gr_warning(translations["empty_folder"]) - else: - gr_info(translations["start_clean_f0"]) - - shutil.rmtree(path, ignore_errors=True) - os.makedirs(path, exist_ok=True) - - gr_info(translations["clean_f0_done"]) - -def load_presets(presets, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, formant_shifting, formant_qfrency, formant_timbre): - if not presets: return gr_warning(translations["provide_file_settings"]) - - with open(os.path.join("assets", "presets", presets)) as f: - file = json.load(f) - - gr_info(translations["load_presets"].format(presets=presets)) - return file.get("cleaner", cleaner), file.get("autotune", autotune), file.get("pitch", pitch), file.get("clean_strength", clean_strength), file.get("index_strength", index_strength), file.get("resample_sr", resample_sr), file.get("filter_radius", filter_radius), file.get("volume_envelope", volume_envelope), file.get("protect", protect), file.get("split_audio", split_audio), file.get("f0_autotune_strength", f0_autotune_strength), file.get("formant_shifting", formant_shifting), file.get("formant_qfrency", formant_qfrency), file.get("formant_timbre", formant_timbre) - -def save_presets(name, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, volume_envelope, protect, split_audio, f0_autotune_strength, cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, volume_envelope_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox, formant_shifting, formant_qfrency, formant_timbre): - if not name: return gr_warning(translations["provide_filename_settings"]) - if not any([cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, volume_envelope_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox]): return gr_warning(translations["choose1"]) - - settings = {} - - for checkbox, data in [(cleaner_chbox, {"cleaner": cleaner, "clean_strength": clean_strength}), (autotune_chbox, {"autotune": autotune, "f0_autotune_strength": f0_autotune_strength}), (pitch_chbox, {"pitch": pitch}), (index_strength_chbox, {"index_strength": index_strength}), (resample_sr_chbox, {"resample_sr": resample_sr}), (filter_radius_chbox, {"filter_radius": filter_radius}), (volume_envelope_chbox, {"volume_envelope": volume_envelope}), (protect_chbox, {"protect": protect}), (split_audio_chbox, {"split_audio": split_audio}), (formant_shifting_chbox, {"formant_shifting": formant_shifting, "formant_qfrency": formant_qfrency, "formant_timbre": formant_timbre})]: - if checkbox: settings.update(data) - - 
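-    # Only the groups whose checkbox was ticked end up in assets/presets/<name>.json;
-    # load_presets later reads these keys back with file.get(...), so any missing key
-    # simply falls back to whatever is currently set in the UI.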
with open(os.path.join("assets", "presets", name + ".json"), "w") as f:
-        json.dump(settings, f, indent=4)
-
-    gr_info(translations["export_settings"])
-    return change_preset_choices()
-
-def report_bug(error_info, provide):
-    report_path = os.path.join("assets", "logs", "report_bugs.log")
-    if os.path.exists(report_path): os.remove(report_path)
-
-    report_url = codecs.decode(requests.get(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/jroubbx.gkg", "rot13")).text, "rot13")
-    if not error_info: error_info = "None provided"
-
-    gr_info(translations["thank"])
-
-    if provide:
-        try:
-            for log in [os.path.join(root, name) for root, _, files in os.walk(os.path.join("assets", "logs"), topdown=False) for name in files if name.endswith(".log")]:
-                with open(log, "r", encoding="utf-8") as r:
-                    with open(report_path, "a", encoding="utf-8") as w:
-                        w.write(str(r.read()))
-                        w.write("\n")
-        except Exception as e:
-            gr_error(translations["error_read_log"])
-            logger.debug(e)
-
-        try:
-            with open(report_path, "r", encoding="utf-8") as f:
-                content = f.read()
-
-            requests.post(report_url, json={"embeds": [{"title": "Bug Report", "description": f"Error description: {error_info}", "color": 15158332, "author": {"name": "Vietnamese_RVC", "icon_url": miku_image, "url": codecs.decode("uggcf://tvguho.pbz/CunzUhlauNau16/Ivrganzrfr-EIP/gerr/znva","rot13")}, "thumbnail": {"url": codecs.decode("uggcf://p.grabe.pbz/7dADJbv-36fNNNNq/grabe.tvs", "rot13")}, "fields": [{"name": "Debug Count", "value": content.count("DEBUG")}, {"name": "Info Count", "value": content.count("INFO")}, {"name": "Warning Count", "value": content.count("WARNING")}, {"name": "Error Count", "value": content.count("ERROR")}], "footer": {"text": f"Machine Name: {platform.uname().node} - OS: {platform.system()}-{platform.version()}\nBug Report Time: {datetime.now()}."}}]})
-
-            with open(report_path, "rb") as f:
-                requests.post(report_url, files={"file": f})
-        except Exception as e:
-            gr_error(translations["error_send"])
-            logger.debug(e)
-        finally:
-            if os.path.exists(report_path): os.remove(report_path)
-    else: requests.post(report_url, json={"embeds": [{"title": "Bug Report", "description": error_info}]})
-
-def f0_extract(audio, f0_method, f0_onnx):
-    if not audio or not os.path.exists(audio) or os.path.isdir(audio):
-        gr_warning(translations["input_not_valid"])
-        return [None]*2
-
-    import librosa
-
-    from matplotlib import pyplot as plt
-    from main.inference.extract import FeatureInput
-
-    filename, _ = os.path.splitext(os.path.basename(audio))
-
-    f0_path = os.path.join("assets", "f0", filename)
-    image_path = os.path.join(f0_path, "f0.png")
-    txt_path = os.path.join(f0_path, "f0.txt")
-
-    gr_info(translations["start_extract"])
-
-    os.makedirs(f0_path, exist_ok=True)  # exist_ok already covers the previous explicit exists-check
-
-    y, sr = librosa.load(audio, sr=None)
-    f0 = FeatureInput(sample_rate=sr, device=config.device).compute_f0(y.flatten(), f0_method, 160, f0_onnx)
-
-    plt.figure(figsize=(10, 4))
-    plt.plot(f0)
-    plt.title(f0_method)
-    plt.xlabel(translations["time_frames"])
-    plt.ylabel(translations["Frequency"])
-    plt.savefig(image_path)
-    plt.close()
-
-    with open(txt_path, "w") as f:
-        for i, f0_value in enumerate(f0):
-            f.write(f"{i * sr / 160},{f0_value}\n")
-
-    gr_info(translations["extract_done"])
-
-    return [txt_path, image_path]
-
-
-
-with gr.Blocks(title="📱 Vietnamese-RVC GUI BY ANH", theme=theme) as app:
-    gr.HTML(translations["display_title"])
-    with gr.Tabs():
-        with 
gr.TabItem(translations["separator_tab"], visible=configs.get("separator_tab", True)): - gr.Markdown(f"## {translations['separator_tab']}") - with gr.Row(): - gr.Markdown(translations["4_part"]) - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - cleaner = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True, min_width=140) - backing = gr.Checkbox(label=translations["separator_backing"], value=False, interactive=True, min_width=140) - reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=True, min_width=140) - backing_reverb = gr.Checkbox(label=translations["dereveb_backing"], value=False, interactive=False, min_width=140) - denoise = gr.Checkbox(label=translations["denoise_mdx"], value=False, interactive=False, min_width=140) - with gr.Row(): - separator_model = gr.Dropdown(label=translations["separator_model"], value=uvr_model[0], choices=uvr_model, interactive=True) - separator_backing_model = gr.Dropdown(label=translations["separator_backing_model"], value="Version-1", choices=["Version-1", "Version-2"], interactive=True, visible=backing.value) - with gr.Row(): - with gr.Column(): - separator_button = gr.Button(translations["separator_tab"], variant="primary") - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - shifts = gr.Slider(label=translations["shift"], info=translations["shift_info"], minimum=1, maximum=20, value=2, step=1, interactive=True) - segment_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True) - with gr.Row(): - mdx_batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True, visible=backing.value or reverb.value or separator_model.value in mdx_model) - with gr.Column(): - with gr.Group(): - with gr.Row(): - overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True) - with gr.Row(): - mdx_hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=8192, value=1024, step=1, interactive=True, visible=backing.value or reverb.value or separator_model.value in mdx_model) - with gr.Row(): - with gr.Column(): - input = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"]) - with gr.Accordion(translations["use_url"], open=False): - url = gr.Textbox(label=translations["url_audio"], value="", placeholder="https://www.youtube.com/...", scale=6) - download_button = gr.Button(translations["downloads"]) - with gr.Column(): - with gr.Row(): - clean_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner.value) - sample_rate1 = gr.Slider(minimum=0, maximum=96000, step=1, value=44100, label=translations["sr"], info=translations["sr_info"], interactive=True) - with gr.Accordion(translations["input_output"], open=False): - format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True) - input_audio = gr.Dropdown(label=translations["audio_path"], value="", 
choices=paths_for_files, allow_custom_value=True, interactive=True) - refesh_separator = gr.Button(translations["refesh"]) - output_separator = gr.Textbox(label=translations["output_folder"], value="audios", placeholder="audios", info=translations["output_folder_info"], interactive=True) - audio_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - with gr.Row(): - gr.Markdown(translations["output_separator"]) - with gr.Row(): - instruments_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["instruments"]) - original_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["original_vocal"]) - main_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["main_vocal"], visible=backing.value) - backing_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["backing_vocal"], visible=backing.value) - with gr.Row(): - separator_model.change(fn=lambda a, b, c: [visible(a or b or c in mdx_model), visible(a or b or c in mdx_model), valueFalse_interactive(a or b or c in mdx_model), visible(c not in mdx_model)], inputs=[backing, reverb, separator_model], outputs=[mdx_batch_size, mdx_hop_length, denoise, shifts]) - backing.change(fn=lambda a, b, c: [visible(a or b or c in mdx_model), visible(a or b or c in mdx_model), valueFalse_interactive(a or b or c in mdx_model), visible(a), visible(a), visible(a), valueFalse_interactive(a and b)], inputs=[backing, reverb, separator_model], outputs=[mdx_batch_size, mdx_hop_length, denoise, separator_backing_model, main_vocals, backing_vocals, backing_reverb]) - reverb.change(fn=lambda a, b, c: [visible(a or b or c in mdx_model), visible(a or b or c in mdx_model), valueFalse_interactive(a or b or c in mdx_model), valueFalse_interactive(a and b)], inputs=[backing, reverb, separator_model], outputs=[mdx_batch_size, mdx_hop_length, denoise, backing_reverb]) - with gr.Row(): - input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[audio_input]) - cleaner.change(fn=visible, inputs=[cleaner], outputs=[clean_strength]) - with gr.Row(): - input.upload(fn=lambda audio_in: shutil.move(audio_in.name, os.path.join("audios")), inputs=[input], outputs=[input_audio]) - refesh_separator.click(fn=change_audios_choices, inputs=[], outputs=[input_audio]) - with gr.Row(): - download_button.click( - fn=download_url, - inputs=[url], - outputs=[input_audio, audio_input, url], - api_name='download_url' - ) - separator_button.click( - fn=separator_music, - inputs=[ - input_audio, - output_separator, - format, - shifts, - segment_size, - overlap, - cleaner, - clean_strength, - denoise, - separator_model, - separator_backing_model, - backing, - reverb, - backing_reverb, - mdx_hop_length, - mdx_batch_size, - sample_rate1 - ], - outputs=[original_vocals, instruments_audio, main_vocals, backing_vocals], - api_name='separator_music' - ) - - with gr.TabItem(translations["convert_audio"], visible=configs.get("convert_tab", True)): - gr.Markdown(f"## {translations['convert_audio']}") - with gr.Row(): - gr.Markdown(translations["convert_info"]) - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - cleaner0 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True) - autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True) - use_audio = gr.Checkbox(label=translations["use_audio"], value=False, interactive=True) - 
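-                            # use_audio switches this tab from converting a manually supplied file to reusing
-                            # previously separated tracks; the use_audio.change wiring further down swaps the
-                            # visible inputs accordingly.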
checkpointing = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True) - with gr.Row(): - use_original = gr.Checkbox(label=translations["convert_original"], value=False, interactive=True, visible=use_audio.value) - convert_backing = gr.Checkbox(label=translations["convert_backing"], value=False, interactive=True, visible=use_audio.value) - not_merge_backing = gr.Checkbox(label=translations["not_merge_backing"], value=False, interactive=True, visible=use_audio.value) - merge_instrument = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True, visible=use_audio.value) - with gr.Row(): - pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True) - clean_strength0 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner0.value) - with gr.Row(): - with gr.Column(): - audio_select = gr.Dropdown(label=translations["select_separate"], choices=[], value="", interactive=True, allow_custom_value=True, visible=False) - convert_button_2 = gr.Button(translations["convert_audio"], visible=False) - with gr.Row(): - with gr.Column(): - convert_button = gr.Button(translations["convert_audio"], variant="primary") - with gr.Row(): - with gr.Column(): - input0 = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"]) - play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - with gr.Column(): - with gr.Accordion(translations["model_accordion"], open=True): - with gr.Row(): - model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True) - model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True) - with gr.Row(): - refesh = gr.Button(translations["refesh"]) - with gr.Row(): - index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "") - with gr.Accordion(translations["input_output"], open=False): - with gr.Column(): - export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True) - input_audio0 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True) - output_audio = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True) - with gr.Column(): - refesh0 = gr.Button(translations["refesh"]) - with gr.Accordion(translations["setting"], open=False): - with gr.Accordion(translations["f0_method"], open=False): - with gr.Group(): - onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) - method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], 
choices=method_f0+["hybrid"], value="rmvpe", interactive=True) - hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method.value == "hybrid") - hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False) - with gr.Accordion(translations["f0_file"], open=False): - upload_f0_file = gr.File(label=translations["upload_f0"], file_types=[".txt"]) - f0_file_dropdown = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True) - refesh_f0_file = gr.Button(translations["refesh"]) - with gr.Accordion(translations["hubert_model"], open=False): - onnx_embed_mode = gr.Checkbox(label=translations["embed_onnx"], info=translations["embed_onnx_info"], value=False, interactive=True) - embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="contentvec_base", interactive=True) - custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom") - with gr.Accordion(translations["use_presets"], open=False): - with gr.Row(): - presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True) - with gr.Row(): - load_click = gr.Button(translations["load_file"], variant="primary") - refesh_click = gr.Button(translations["refesh"]) - with gr.Accordion(translations["export_file"], open=False): - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True) - autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True) - pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True) - index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True) - resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True) - filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True) - volume_envelope_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True) - protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, interactive=True) - split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True) - formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True) - with gr.Row(): - with gr.Column(): - name_to_save_file = 
gr.Textbox(label=translations["filename_to_save"])
-                                    save_file_button = gr.Button(translations["export_file"])
-                            with gr.Row():
-                                upload_presets = gr.File(label=translations["upload_presets"], file_types=[".json"])
-                with gr.Column():
-                    with gr.Row():
-                        split_audio = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
-                        formant_shifting = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
-                    f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
-                    resample_sr = gr.Slider(minimum=0, maximum=96000, label=translations["resample"], info=translations["resample_info"], value=0, step=1, interactive=True)
-                    filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
-                    volume_envelope = gr.Slider(minimum=0, maximum=1, label=translations["volume_envelope"], info=translations["volume_envelope_info"], value=1, step=0.1, interactive=True)
-                    protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.33, step=0.01, interactive=True)
-                    with gr.Row():
-                        formant_qfrency = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-                        formant_timbre = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
-            with gr.Row():
-                gr.Markdown(translations["output_convert"])
-            with gr.Row():
-                main_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["main_convert"])
-                backing_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_backing"], visible=convert_backing.value)
-                main_backing = gr.Audio(show_download_button=True, interactive=False, label=translations["main_or_backing"], visible=convert_backing.value)
-            with gr.Row():
-                original_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_original"], visible=use_original.value)
-                vocal_instrument = gr.Audio(show_download_button=True, interactive=False, label=translations["voice_or_instruments"], visible=merge_instrument.value)
-            with gr.Row():
-                upload_f0_file.upload(fn=lambda inp: shutil.move(inp.name, os.path.join("assets", "f0")), inputs=[upload_f0_file], outputs=[f0_file_dropdown])
-                refesh_f0_file.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown])
-            with gr.Row():
-                load_click.click(
-                    fn=load_presets,
-                    inputs=[
-                        presets_name,
-                        cleaner0,
-                        autotune,
-                        pitch,
-                        clean_strength0,
-                        index_strength,
-                        resample_sr,
-                        filter_radius,
-                        volume_envelope,
-                        protect,
-                        split_audio,
-                        f0_autotune_strength,
-                        formant_shifting,  # was missing, which shifted every argument after f0_autotune_strength by one position
-                        formant_qfrency,
-                        formant_timbre
-                    ],
-                    outputs=[
-                        cleaner0,
-                        autotune,
-                        pitch,
-                        clean_strength0,
-                        index_strength,
-                        resample_sr,
-                        filter_radius,
-                        volume_envelope,
-                        protect,
-                        split_audio,
-                        f0_autotune_strength,
-                        formant_shifting,
-                        formant_qfrency,
-                        formant_timbre
-                    ]
-                )
-                refesh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name])
-                save_file_button.click(
-                    fn=save_presets,
-                    inputs=[
-                        name_to_save_file,
-                        cleaner0,
-                        autotune,
-                        pitch,
-                        clean_strength0,
-                        index_strength,
-                        resample_sr,
-                        filter_radius,
-                        volume_envelope,
-                        protect,
-                        split_audio, 
- f0_autotune_strength, - cleaner_chbox, - autotune_chbox, - pitch_chbox, - index_strength_chbox, - resample_sr_chbox, - filter_radius_chbox, - volume_envelope_chbox, - protect_chbox, - split_audio_chbox, - formant_shifting_chbox, - formant_shifting, - formant_qfrency, - formant_timbre - ], - outputs=[presets_name] - ) - with gr.Row(): - upload_presets.upload(fn=lambda audio_in: shutil.move(audio_in.name, os.path.join("assets", "presets")), inputs=[upload_presets], outputs=[presets_name]) - autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength]) - use_audio.change(fn=lambda a: [visible(a), visible(a), visible(a), visible(a), visible(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), visible(not a), visible(not a), visible(not a), visible(not a)], inputs=[use_audio], outputs=[main_backing, use_original, convert_backing, not_merge_backing, merge_instrument, use_original, convert_backing, not_merge_backing, merge_instrument, input_audio0, output_audio, input0, play_audio]) - with gr.Row(): - convert_backing.change(fn=lambda a,b: [change_backing_choices(a, b), visible(a)], inputs=[convert_backing, not_merge_backing], outputs=[use_original, backing_convert]) - use_original.change(fn=lambda audio, original: [visible(original), visible(not original), visible(audio and not original), valueFalse_interactive(not original), valueFalse_interactive(not original)], inputs=[use_audio, use_original], outputs=[original_convert, main_convert, main_backing, convert_backing, not_merge_backing]) - cleaner0.change(fn=visible, inputs=[cleaner0], outputs=[clean_strength0]) - with gr.Row(): - merge_instrument.change(fn=visible, inputs=[merge_instrument], outputs=[vocal_instrument]) - not_merge_backing.change(fn=lambda audio, merge, cvb: [visible(audio and not merge), change_backing_choices(cvb, merge)], inputs=[use_audio, not_merge_backing, convert_backing], outputs=[main_backing, use_original]) - method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method, hybrid_method], outputs=[hybrid_method, hop_length]) - with gr.Row(): - hybrid_method.change(fn=hoplength_show, inputs=[method, hybrid_method], outputs=[hop_length]) - refesh.click(fn=change_models_choices, inputs=[], outputs=[model_pth, model_index]) - model_pth.change(fn=get_index, inputs=[model_pth], outputs=[model_index]) - with gr.Row(): - input0.upload(fn=lambda audio_in: shutil.move(audio_in.name, os.path.join("audios")), inputs=[input0], outputs=[input_audio0]) - input_audio0.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio0], outputs=[play_audio]) - formant_shifting.change(fn=lambda a: [visible(a)]*2, inputs=[formant_shifting], outputs=[formant_qfrency, formant_timbre]) - with gr.Row(): - embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[custom_embedders]) - refesh0.click(fn=change_audios_choices, inputs=[], outputs=[input_audio0]) - model_index.change(fn=index_strength_show, inputs=[model_index], outputs=[index_strength]) - with gr.Row(): - audio_select.change(fn=lambda: visible(True), inputs=[], outputs=[convert_button_2]) - convert_button.click(fn=lambda: visible(False), inputs=[], outputs=[convert_button]) - convert_button_2.click(fn=lambda: [visible(False), visible(False)], inputs=[], outputs=[audio_select, convert_button_2]) - with gr.Row(): - convert_button.click( - fn=convert_selection, - inputs=[ - cleaner0, - autotune, 
- use_audio, - use_original, - convert_backing, - not_merge_backing, - merge_instrument, - pitch, - clean_strength0, - model_pth, - model_index, - index_strength, - input_audio0, - output_audio, - export_format, - method, - hybrid_method, - hop_length, - embedders, - custom_embedders, - resample_sr, - filter_radius, - volume_envelope, - protect, - split_audio, - f0_autotune_strength, - checkpointing, - onnx_f0_mode, - formant_shifting, - formant_qfrency, - formant_timbre, - f0_file_dropdown, - onnx_embed_mode - ], - outputs=[audio_select, main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button], - api_name="convert_selection" - ) - convert_button_2.click( - fn=convert_audio, - inputs=[ - cleaner0, - autotune, - use_audio, - use_original, - convert_backing, - not_merge_backing, - merge_instrument, - pitch, - clean_strength0, - model_pth, - model_index, - index_strength, - input_audio0, - output_audio, - export_format, - method, - hybrid_method, - hop_length, - embedders, - custom_embedders, - resample_sr, - filter_radius, - volume_envelope, - protect, - split_audio, - f0_autotune_strength, - audio_select, - checkpointing, - onnx_f0_mode, - formant_shifting, - formant_qfrency, - formant_timbre, - f0_file_dropdown, - onnx_embed_mode - ], - outputs=[main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button], - api_name="convert_audio" - ) - - with gr.TabItem(translations["convert_text"], visible=configs.get("tts_tab", True)): - gr.Markdown(translations["convert_text_markdown"]) - with gr.Row(): - gr.Markdown(translations["convert_text_markdown_2"]) - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - use_txt = gr.Checkbox(label=translations["input_txt"], value=False, interactive=True) - google_tts_check_box = gr.Checkbox(label=translations["googletts"], value=False, interactive=True) - prompt = gr.Textbox(label=translations["text_to_speech"], value="", placeholder="Hello Words", lines=3) - with gr.Column(): - speed = gr.Slider(label=translations["voice_speed"], info=translations["voice_speed_info"], minimum=-100, maximum=100, value=0, step=1) - pitch0 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True) - with gr.Row(): - tts_button = gr.Button(translations["tts_1"], variant="primary", scale=2) - convert_button0 = gr.Button(translations["tts_2"], variant="secondary", scale=2) - with gr.Row(): - with gr.Column(): - txt_input = gr.File(label=translations["drop_text"], file_types=[".txt"], visible=use_txt.value) - tts_voice = gr.Dropdown(label=translations["voice"], choices=edgetts, interactive=True, value="vi-VN-NamMinhNeural") - tts_pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info_2"], label=translations["pitch"], value=0, interactive=True) - with gr.Column(): - with gr.Accordion(translations["model_accordion"], open=True): - with gr.Row(): - model_pth0 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True) - model_index0 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True) - with gr.Row(): - refesh1 = gr.Button(translations["refesh"]) - with gr.Row(): - index_strength0 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], 
minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index0.value != "") - with gr.Accordion(translations["output_path"], open=False): - export_format0 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True) - output_audio0 = gr.Textbox(label=translations["output_tts"], value="audios/tts.wav", placeholder="audios/tts.wav", info=translations["tts_output"], interactive=True) - output_audio1 = gr.Textbox(label=translations["output_tts_convert"], value="audios/tts-convert.wav", placeholder="audios/tts-convert.wav", info=translations["tts_output"], interactive=True) - with gr.Accordion(translations["setting"], open=False): - with gr.Accordion(translations["f0_method"], open=False): - with gr.Group(): - onnx_f0_mode1 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) - method0 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0+["hybrid"], value="rmvpe", interactive=True) - hybrid_method0 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method0.value == "hybrid") - hop_length0 = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False) - with gr.Accordion(translations["f0_file"], open=False): - upload_f0_file0 = gr.File(label=translations["upload_f0"], file_types=[".txt"]) - f0_file_dropdown0 = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True) - refesh_f0_file0 = gr.Button(translations["refesh"]) - with gr.Accordion(translations["hubert_model"], open=False): - onnx_embed_mode1 = gr.Checkbox(label=translations["embed_onnx"], info=translations["embed_onnx_info"], value=False, interactive=True) - embedders0 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="contentvec_base", interactive=True) - custom_embedders0 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders0.value == "custom") - with gr.Group(): - with gr.Row(): - formant_shifting1 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True) - split_audio0 = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True) - cleaner1 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True) - autotune3 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True) - checkpointing0 = 
gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True) - with gr.Column(): - f0_autotune_strength0 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune3.value) - clean_strength1 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner1.value) - resample_sr0 = gr.Slider(minimum=0, maximum=96000, label=translations["resample"], info=translations["resample_info"], value=0, step=1, interactive=True) - filter_radius0 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True) - volume_envelope0 = gr.Slider(minimum=0, maximum=1, label=translations["volume_envelope"], info=translations["volume_envelope_info"], value=1, step=0.1, interactive=True) - protect0 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.33, step=0.01, interactive=True) - with gr.Row(): - formant_qfrency1 = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False) - formant_timbre1 = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False) - with gr.Row(): - gr.Markdown(translations["output_tts_markdown"]) - with gr.Row(): - tts_voice_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["output_text_to_speech"]) - tts_voice_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"]) - with gr.Row(): - upload_f0_file0.upload(fn=lambda inp: shutil.move(inp.name, os.path.join("assets", "f0")), inputs=[upload_f0_file0], outputs=[f0_file_dropdown0]) - refesh_f0_file0.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown0]) - with gr.Row(): - autotune3.change(fn=visible, inputs=[autotune3], outputs=[f0_autotune_strength0]) - model_pth0.change(fn=get_index, inputs=[model_pth0], outputs=[model_index0]) - with gr.Row(): - cleaner1.change(fn=visible, inputs=[cleaner1], outputs=[clean_strength1]) - method0.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method0, hybrid_method0], outputs=[hybrid_method0, hop_length0]) - hybrid_method0.change(fn=hoplength_show, inputs=[method0, hybrid_method0], outputs=[hop_length0]) - with gr.Row(): - refesh1.click(fn=change_models_choices, inputs=[], outputs=[model_pth0, model_index0]) - embedders0.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders0], outputs=[custom_embedders0]) - formant_shifting1.change(fn=lambda a: [visible(a)]*2, inputs=[formant_shifting1], outputs=[formant_qfrency1, formant_timbre1]) - with gr.Row(): - model_index0.change(fn=index_strength_show, inputs=[model_index0], outputs=[index_strength0]) - txt_input.upload(fn=process_input, inputs=[txt_input], outputs=[prompt]) - use_txt.change(fn=visible, inputs=[use_txt], outputs=[txt_input]) - with gr.Row(): - google_tts_check_box.change(fn=change_tts_voice_choices, inputs=[google_tts_check_box], outputs=[tts_voice]) - tts_button.click( - fn=TTS, - inputs=[ - prompt, - tts_voice, - speed, - output_audio0, - tts_pitch, - google_tts_check_box - ], - 
outputs=[tts_voice_audio], - api_name="text-to-speech" - ) - convert_button0.click( - fn=convert_tts, - inputs=[ - cleaner1, - autotune3, - pitch0, - clean_strength1, - model_pth0, - model_index0, - index_strength0, - output_audio0, - output_audio1, - export_format0, - method0, - hybrid_method0, - hop_length0, - embedders0, - custom_embedders0, - resample_sr0, - filter_radius0, - volume_envelope0, - protect0, - split_audio0, - f0_autotune_strength0, - checkpointing0, - onnx_f0_mode1, - formant_shifting1, - formant_qfrency1, - formant_timbre1, - f0_file_dropdown0, - onnx_embed_mode1 - ], - outputs=[tts_voice_convert], - api_name="convert_tts" - ) - - with gr.TabItem(translations["audio_effects"], visible=configs.get("effects_tab", True)): - gr.Markdown(translations["apply_audio_effects"]) - with gr.Row(): - gr.Markdown(translations["audio_effects_edit"]) - with gr.Row(): - with gr.Column(): - with gr.Row(): - reverb_check_box = gr.Checkbox(label=translations["reverb"], value=False, interactive=True) - chorus_check_box = gr.Checkbox(label=translations["chorus"], value=False, interactive=True) - delay_check_box = gr.Checkbox(label=translations["delay"], value=False, interactive=True) - phaser_check_box = gr.Checkbox(label=translations["phaser"], value=False, interactive=True) - compressor_check_box = gr.Checkbox(label=translations["compressor"], value=False, interactive=True) - more_options = gr.Checkbox(label=translations["more_option"], value=False, interactive=True) - with gr.Row(): - with gr.Accordion(translations["input_output"], open=False): - with gr.Row(): - upload_audio = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"]) - with gr.Row(): - audio_in_path = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True) - audio_out_path = gr.Textbox(label=translations["output_audio"], value="audios/audio_effects.wav", placeholder="audios/audio_effects.wav", info=translations["provide_output"], interactive=True) - with gr.Row(): - with gr.Column(): - audio_combination = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True) - audio_combination_input = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True, visible=audio_combination.value) - with gr.Row(): - audio_effects_refesh = gr.Button(translations["refesh"]) - with gr.Row(): - audio_output_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True) - with gr.Row(): - apply_effects_button = gr.Button(translations["apply"], variant="primary", scale=2) - with gr.Row(): - with gr.Column(): - with gr.Row(): - with gr.Accordion(translations["reverb"], open=False, visible=reverb_check_box.value) as reverb_accordion: - reverb_freeze_mode = gr.Checkbox(label=translations["reverb_freeze"], info=translations["reverb_freeze_info"], value=False, interactive=True) - reverb_room_size = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["room_size"], info=translations["room_size_info"], interactive=True) - reverb_damping = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["damping"], 
info=translations["damping_info"], interactive=True) - reverb_wet_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.3, label=translations["wet_level"], info=translations["wet_level_info"], interactive=True) - reverb_dry_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.7, label=translations["dry_level"], info=translations["dry_level_info"], interactive=True) - reverb_width = gr.Slider(minimum=0, maximum=1, step=0.01, value=1, label=translations["width"], info=translations["width_info"], interactive=True) - with gr.Row(): - with gr.Accordion(translations["chorus"], open=False, visible=chorus_check_box.value) as chorus_accordion: - chorus_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_depth"], info=translations["chorus_depth_info"], interactive=True) - chorus_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.5, label=translations["chorus_rate_hz"], info=translations["chorus_rate_hz_info"], interactive=True) - chorus_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_mix"], info=translations["chorus_mix_info"], interactive=True) - chorus_centre_delay_ms = gr.Slider(minimum=0, maximum=50, step=1, value=10, label=translations["chorus_centre_delay_ms"], info=translations["chorus_centre_delay_ms_info"], interactive=True) - chorus_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["chorus_feedback"], info=translations["chorus_feedback_info"], interactive=True) - with gr.Row(): - with gr.Accordion(translations["delay"], open=False, visible=delay_check_box.value) as delay_accordion: - delay_second = gr.Slider(minimum=0, maximum=5, step=0.01, value=0.5, label=translations["delay_seconds"], info=translations["delay_seconds_info"], interactive=True) - delay_feedback = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_feedback"], info=translations["delay_feedback_info"], interactive=True) - delay_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_mix"], info=translations["delay_mix_info"], interactive=True) - with gr.Column(): - with gr.Row(): - with gr.Accordion(translations["more_option"], open=False, visible=more_options.value) as more_accordion: - with gr.Row(): - fade = gr.Checkbox(label=translations["fade"], value=False, interactive=True) - bass_or_treble = gr.Checkbox(label=translations["bass_or_treble"], value=False, interactive=True) - limiter = gr.Checkbox(label=translations["limiter"], value=False, interactive=True) - resample_checkbox = gr.Checkbox(label=translations["resample"], value=False, interactive=True) - with gr.Row(): - distortion_checkbox = gr.Checkbox(label=translations["distortion"], value=False, interactive=True) - gain_checkbox = gr.Checkbox(label=translations["gain"], value=False, interactive=True) - bitcrush_checkbox = gr.Checkbox(label=translations["bitcrush"], value=False, interactive=True) - clipping_checkbox = gr.Checkbox(label=translations["clipping"], value=False, interactive=True) - with gr.Accordion(translations["fade"], open=True, visible=fade.value) as fade_accordion: - with gr.Row(): - fade_in = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_in"], info=translations["fade_in_info"], interactive=True) - fade_out = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_out"], info=translations["fade_out_info"], interactive=True) - with gr.Accordion(translations["bass_or_treble"], open=True, 
visible=bass_or_treble.value) as bass_treble_accordion: - with gr.Row(): - bass_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["bass_boost"], info=translations["bass_boost_info"], interactive=True) - bass_frequency = gr.Slider(minimum=20, maximum=200, step=10, value=100, label=translations["bass_frequency"], info=translations["bass_frequency_info"], interactive=True) - with gr.Row(): - treble_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["treble_boost"], info=translations["treble_boost_info"], interactive=True) - treble_frequency = gr.Slider(minimum=1000, maximum=10000, step=500, value=3000, label=translations["treble_frequency"], info=translations["treble_frequency_info"], interactive=True) - with gr.Accordion(translations["limiter"], open=True, visible=limiter.value) as limiter_accordion: - with gr.Row(): - limiter_threashold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["limiter_threashold_db"], info=translations["limiter_threashold_db_info"], interactive=True) - limiter_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["limiter_release_ms"], info=translations["limiter_release_ms_info"], interactive=True) - with gr.Column(): - pitch_shift_semitones = gr.Slider(minimum=-20, maximum=20, step=1, value=0, label=translations["pitch"], info=translations["pitch_info"], interactive=True) - audio_effect_resample_sr = gr.Slider(minimum=0, maximum=96000, step=1, value=0, label=translations["resample"], info=translations["resample_info"], interactive=True, visible=resample_checkbox.value) - distortion_drive_db = gr.Slider(minimum=0, maximum=50, step=1, value=20, label=translations["distortion"], info=translations["distortion_info"], interactive=True, visible=distortion_checkbox.value) - gain_db = gr.Slider(minimum=-60, maximum=60, step=1, value=0, label=translations["gain"], info=translations["gain_info"], interactive=True, visible=gain_checkbox.value) - clipping_threashold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["clipping_threashold_db"], info=translations["clipping_threashold_db_info"], interactive=True, visible=clipping_checkbox.value) - bitcrush_bit_depth = gr.Slider(minimum=1, maximum=24, step=1, value=16, label=translations["bitcrush_bit_depth"], info=translations["bitcrush_bit_depth_info"], interactive=True, visible=bitcrush_checkbox.value) - with gr.Row(): - with gr.Accordion(translations["phaser"], open=False, visible=phaser_check_box.value) as phaser_accordion: - phaser_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_depth"], info=translations["phaser_depth_info"], interactive=True) - phaser_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1, label=translations["phaser_rate_hz"], info=translations["phaser_rate_hz_info"], interactive=True) - phaser_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_mix"], info=translations["phaser_mix_info"], interactive=True) - phaser_centre_frequency_hz = gr.Slider(minimum=50, maximum=5000, step=10, value=1000, label=translations["phaser_centre_frequency_hz"], info=translations["phaser_centre_frequency_hz_info"], interactive=True) - phaser_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["phaser_feedback"], info=translations["phaser_feedback_info"], interactive=True) - with gr.Row(): - with gr.Accordion(translations["compressor"], open=False, 
visible=compressor_check_box.value) as compressor_accordion: - compressor_threashold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-20, label=translations["compressor_threashold_db"], info=translations["compressor_threashold_db_info"], interactive=True) - compressor_ratio = gr.Slider(minimum=1, maximum=20, step=0.1, value=1, label=translations["compressor_ratio"], info=translations["compressor_ratio_info"], interactive=True) - compressor_attack_ms = gr.Slider(minimum=0.1, maximum=100, step=0.1, value=10, label=translations["compressor_attack_ms"], info=translations["compressor_attack_ms_info"], interactive=True) - compressor_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["compressor_release_ms"], info=translations["compressor_release_ms_info"], interactive=True) - with gr.Row(): - gr.Markdown(translations["output_audio"]) - with gr.Row(): - audio_play_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - audio_play_output = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"]) - with gr.Row(): - reverb_check_box.change(fn=visible, inputs=[reverb_check_box], outputs=[reverb_accordion]) - chorus_check_box.change(fn=visible, inputs=[chorus_check_box], outputs=[chorus_accordion]) - delay_check_box.change(fn=visible, inputs=[delay_check_box], outputs=[delay_accordion]) - with gr.Row(): - compressor_check_box.change(fn=visible, inputs=[compressor_check_box], outputs=[compressor_accordion]) - phaser_check_box.change(fn=visible, inputs=[phaser_check_box], outputs=[phaser_accordion]) - more_options.change(fn=visible, inputs=[more_options], outputs=[more_accordion]) - with gr.Row(): - fade.change(fn=visible, inputs=[fade], outputs=[fade_accordion]) - bass_or_treble.change(fn=visible, inputs=[bass_or_treble], outputs=[bass_treble_accordion]) - limiter.change(fn=visible, inputs=[limiter], outputs=[limiter_accordion]) - resample_checkbox.change(fn=visible, inputs=[resample_checkbox], outputs=[audio_effect_resample_sr]) - with gr.Row(): - distortion_checkbox.change(fn=visible, inputs=[distortion_checkbox], outputs=[distortion_drive_db]) - gain_checkbox.change(fn=visible, inputs=[gain_checkbox], outputs=[gain_db]) - clipping_checkbox.change(fn=visible, inputs=[clipping_checkbox], outputs=[clipping_threashold_db]) - bitcrush_checkbox.change(fn=visible, inputs=[bitcrush_checkbox], outputs=[bitcrush_bit_depth]) - with gr.Row(): - upload_audio.upload(fn=lambda audio_in: shutil.move(audio_in.name, os.path.join("audios")), inputs=[upload_audio], outputs=[audio_in_path]) - audio_in_path.change(fn=lambda audio: audio if audio else None, inputs=[audio_in_path], outputs=[audio_play_input]) - audio_effects_refesh.click(fn=lambda: [change_audios_choices()]*2, inputs=[], outputs=[audio_in_path, audio_combination_input]) - with gr.Row(): - more_options.change(fn=lambda: [False]*8, inputs=[], outputs=[fade, bass_or_treble, limiter, resample_checkbox, distortion_checkbox, gain_checkbox, clipping_checkbox, bitcrush_checkbox]) - audio_combination.change(fn=visible, inputs=[audio_combination], outputs=[audio_combination_input]) - with gr.Row(): - apply_effects_button.click( - fn=audio_effects, - inputs=[ - audio_in_path, - audio_out_path, - resample_checkbox, - audio_effect_resample_sr, - chorus_depth, - chorus_rate_hz, - chorus_mix, - chorus_centre_delay_ms, - chorus_feedback, - distortion_drive_db, - reverb_room_size, - reverb_damping, - reverb_wet_level, - reverb_dry_level, - reverb_width, - 
reverb_freeze_mode, - pitch_shift_semitones, - delay_second, - delay_feedback, - delay_mix, - compressor_threashold_db, - compressor_ratio, - compressor_attack_ms, - compressor_release_ms, - limiter_threashold_db, - limiter_release_ms, - gain_db, - bitcrush_bit_depth, - clipping_threashold_db, - phaser_rate_hz, - phaser_depth, - phaser_centre_frequency_hz, - phaser_feedback, - phaser_mix, - bass_boost, - bass_frequency, - treble_boost, - treble_frequency, - fade_in, - fade_out, - audio_output_format, - chorus_check_box, - distortion_checkbox, - reverb_check_box, - delay_check_box, - compressor_check_box, - limiter, - gain_checkbox, - bitcrush_checkbox, - clipping_checkbox, - phaser_check_box, - bass_or_treble, - fade, - audio_combination, - audio_combination_input - ], - outputs=[audio_play_output], - api_name="audio_effects" - ) - - with gr.TabItem(translations["createdataset"], visible=configs.get("create_dataset_tab", True)): - gr.Markdown(translations["create_dataset_markdown"]) - with gr.Row(): - gr.Markdown(translations["create_dataset_markdown_2"]) - with gr.Row(): - dataset_url = gr.Textbox(label=translations["url_audio"], info=translations["create_dataset_url"], value="", placeholder="https://www.youtube.com/...", interactive=True) - output_dataset = gr.Textbox(label=translations["output_data"], info=translations["output_data_info"], value="dataset", placeholder="dataset", interactive=True) - with gr.Row(): - with gr.Column(): - with gr.Group(): - with gr.Row(): - separator_reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=True) - denoise_mdx = gr.Checkbox(label=translations["denoise"], value=False, interactive=True) - with gr.Row(): - kim_vocal_version = gr.Radio(label=translations["model_ver"], info=translations["model_ver_info"], choices=["Version-1", "Version-2"], value="Version-2", interactive=True) - kim_vocal_overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True) - with gr.Row(): - kim_vocal_hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=8192, value=1024, step=1, interactive=True) - kim_vocal_batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True) - with gr.Row(): - kim_vocal_segments_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True) - with gr.Row(): - sample_rate0 = gr.Slider(minimum=0, maximum=96000, step=1, value=44100, label=translations["sr"], info=translations["sr_info"], interactive=True) - with gr.Column(): - create_button = gr.Button(translations["createdataset"], variant="primary", scale=2, min_width=4000) - with gr.Group(): - with gr.Row(): - clean_audio = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True) - skip = gr.Checkbox(label=translations["skip"], value=False, interactive=True) - with gr.Row(): - dataset_clean_strength = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.5, label=translations["clean_strength"], info=translations["clean_strength_info"], interactive=True, visible=clean_audio.value) - with gr.Row(): - skip_start = gr.Textbox(label=translations["skip_start"], info=translations["skip_start_info"], value="", placeholder="0,...", interactive=True, visible=skip.value) - skip_end = gr.Textbox(label=translations["skip_end"], 
info=translations["skip_end_info"], value="", placeholder="0,...", interactive=True, visible=skip.value) - create_dataset_info = gr.Textbox(label=translations["create_dataset_info"], value="", interactive=False) - with gr.Row(): - clean_audio.change(fn=visible, inputs=[clean_audio], outputs=[dataset_clean_strength]) - skip.change(fn=lambda a: [valueEmpty_visible1(a)]*2, inputs=[skip], outputs=[skip_start, skip_end]) - with gr.Row(): - create_button.click( - fn=create_dataset, - inputs=[ - dataset_url, - output_dataset, - clean_audio, - dataset_clean_strength, - separator_reverb, - kim_vocal_version, - kim_vocal_overlap, - kim_vocal_segments_size, - denoise_mdx, - skip, - skip_start, - skip_end, - kim_vocal_hop_length, - kim_vocal_batch_size, - sample_rate0 - ], - outputs=[create_dataset_info], - api_name="create_dataset" - ) - - with gr.TabItem(translations["training_model"], visible=configs.get("training_tab", True)): - gr.Markdown(f"## {translations['training_model']}") - with gr.Row(): - gr.Markdown(translations["training_markdown"]) - with gr.Row(): - with gr.Column(): - with gr.Row(): - with gr.Column(): - training_name = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True) - training_sr = gr.Radio(label=translations["sample_rate"], info=translations["sample_rate_info"], choices=["32k", "40k", "44.1k", "48k"], value="48k", interactive=True) - training_ver = gr.Radio(label=translations["training_version"], info=translations["training_version_info"], choices=["v1", "v2"], value="v2", interactive=True) - with gr.Row(): - clean_dataset = gr.Checkbox(label=translations["clear_dataset"], value=False, interactive=True) - preprocess_cut = gr.Checkbox(label=translations["split_audio"], value=True, interactive=True) - process_effects = gr.Checkbox(label=translations["preprocess_effect"], value=False, interactive=True) - checkpointing1 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True) - training_f0 = gr.Checkbox(label=translations["training_pitch"], value=True, interactive=True) - upload = gr.Checkbox(label=translations["upload_dataset"], value=False, interactive=True) - with gr.Row(): - clean_dataset_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.7, step=0.1, interactive=True, visible=clean_dataset.value) - with gr.Column(): - preprocess_button = gr.Button(translations["preprocess_button"], scale=2) - upload_dataset = gr.Files(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"], visible=upload.value) - preprocess_info = gr.Textbox(label=translations["preprocess_info"], value="", interactive=False) - with gr.Column(): - with gr.Row(): - with gr.Column(): - with gr.Accordion(label=translations["f0_method"], open=False): - with gr.Group(): - onnx_f0_mode2 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) - extract_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True) - extract_hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False) - with gr.Accordion(label=translations["hubert_model"], open=False): - with gr.Group(): - 
onnx_embed_mode2 = gr.Checkbox(label=translations["embed_onnx"], info=translations["embed_onnx_info"], value=False, interactive=True) - extract_embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="contentvec_base", interactive=True) - with gr.Row(): - extract_embedders_custom = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=extract_embedders.value == "custom") - with gr.Column(): - extract_button = gr.Button(translations["extract_button"], scale=2) - extract_info = gr.Textbox(label=translations["extract_info"], value="", interactive=False) - with gr.Column(): - with gr.Row(): - with gr.Column(): - total_epochs = gr.Slider(label=translations["total_epoch"], info=translations["total_epoch_info"], minimum=1, maximum=10000, value=300, step=1, interactive=True) - save_epochs = gr.Slider(label=translations["save_epoch"], info=translations["save_epoch_info"], minimum=1, maximum=10000, value=50, step=1, interactive=True) - with gr.Column(): - index_button = gr.Button(f"3. {translations['create_index']}", variant="primary", scale=2) - training_button = gr.Button(f"4. {translations['training_model']}", variant="primary", scale=2) - with gr.Row(): - with gr.Accordion(label=translations["setting"], open=False): - with gr.Row(): - index_algorithm = gr.Radio(label=translations["index_algorithm"], info=translations["index_algorithm_info"], choices=["Auto", "Faiss", "KMeans"], value="Auto", interactive=True) - with gr.Row(): - custom_dataset = gr.Checkbox(label=translations["custom_dataset"], info=translations["custom_dataset_info"], value=False, interactive=True) - overtraining_detector = gr.Checkbox(label=translations["overtraining_detector"], info=translations["overtraining_detector_info"], value=False, interactive=True) - clean_up = gr.Checkbox(label=translations["cleanup_training"], info=translations["cleanup_training_info"], value=False, interactive=True) - cache_in_gpu = gr.Checkbox(label=translations["cache_in_gpu"], info=translations["cache_in_gpu_info"], value=False, interactive=True) - with gr.Column(): - dataset_path = gr.Textbox(label=translations["dataset_folder"], value="dataset", interactive=True, visible=custom_dataset.value) - with gr.Column(): - threshold = gr.Slider(minimum=1, maximum=100, value=50, step=1, label=translations["threshold"], interactive=True, visible=overtraining_detector.value) - with gr.Accordion(translations["setting_cpu_gpu"], open=False): - with gr.Column(): - gpu_number = gr.Textbox(label=translations["gpu_number"], value=str("-".join(map(str, range(torch.cuda.device_count()))) if torch.cuda.is_available() else "-"), info=translations["gpu_number_info"], interactive=True) - gpu_info = gr.Textbox(label=translations["gpu_info"], value=get_gpu_info(), info=translations["gpu_info_2"], interactive=False) - cpu_core = gr.Slider(label=translations["cpu_core"], info=translations["cpu_core_info"], minimum=0, maximum=cpu_count(), value=cpu_count(), step=1, interactive=True) - train_batch_size = gr.Slider(label=translations["batch_size"], info=translations["batch_size_info"], minimum=1, maximum=64, value=8, step=1, interactive=True) - with gr.Row(): - save_only_latest = gr.Checkbox(label=translations["save_only_latest"], info=translations["save_only_latest_info"], value=True, interactive=True) - save_every_weights = gr.Checkbox(label=translations["save_every_weights"], 
info=translations["save_every_weights_info"], value=True, interactive=True) - not_use_pretrain = gr.Checkbox(label=translations["not_use_pretrain_2"], info=translations["not_use_pretrain_info"], value=False, interactive=True) - custom_pretrain = gr.Checkbox(label=translations["custom_pretrain"], info=translations["custom_pretrain_info"], value=False, interactive=True) - with gr.Row(): - vocoders = gr.Radio(label=translations["vocoder"], info=translations["vocoder_info"], choices=["Default", "MRF HiFi-GAN", "RefineGAN"], value="Default", interactive=True) - with gr.Row(): - model_author = gr.Textbox(label=translations["training_author"], info=translations["training_author_info"], value="", placeholder=translations["training_author"], interactive=True) - with gr.Row(): - with gr.Column(): - with gr.Accordion(translations["custom_pretrain_info"], open=False, visible=custom_pretrain.value and not not_use_pretrain.value) as pretrain_setting: - pretrained_D = gr.Dropdown(label=translations["pretrain_file"].format(dg="D"), choices=pretrainedD, value=pretrainedD[0] if len(pretrainedD) > 0 else '', interactive=True, allow_custom_value=True) - pretrained_G = gr.Dropdown(label=translations["pretrain_file"].format(dg="G"), choices=pretrainedG, value=pretrainedG[0] if len(pretrainedG) > 0 else '', interactive=True, allow_custom_value=True) - refesh_pretrain = gr.Button(translations["refesh"], scale=2) - with gr.Row(): - training_info = gr.Textbox(label=translations["train_info"], value="", interactive=False) - with gr.Row(): - with gr.Column(): - with gr.Accordion(translations["export_model"], open=False): - with gr.Row(): - model_file= gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True) - index_file = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True) - with gr.Row(): - refesh_file = gr.Button(f"1. 
{translations['refesh']}", scale=2) - zip_model = gr.Button(translations["zip_model"], variant="primary", scale=2) - with gr.Row(): - zip_output = gr.File(label=translations["output_zip"], file_types=[".zip"], interactive=False, visible=False) - with gr.Row(): - refesh_file.click(fn=change_models_choices, inputs=[], outputs=[model_file, index_file]) - zip_model.click(fn=zip_file, inputs=[training_name, model_file, index_file], outputs=[zip_output]) - dataset_path.change(fn=lambda folder: os.makedirs(folder, exist_ok=True), inputs=[dataset_path], outputs=[]) - with gr.Row(): - upload.change(fn=visible, inputs=[upload], outputs=[upload_dataset]) - overtraining_detector.change(fn=visible, inputs=[overtraining_detector], outputs=[threshold]) - clean_dataset.change(fn=visible, inputs=[clean_dataset], outputs=[clean_dataset_strength]) - with gr.Row(): - custom_dataset.change(fn=lambda custom_dataset: [visible(custom_dataset), "dataset"],inputs=[custom_dataset], outputs=[dataset_path, dataset_path]) - upload_dataset.upload( - fn=lambda files, folder: [shutil.move(f.name, os.path.join(folder, os.path.split(f.name)[1])) for f in files] if folder != "" else gr_warning(translations["dataset_folder1"]), - inputs=[upload_dataset, dataset_path], - outputs=[], - api_name="upload_dataset" - ) - with gr.Row(): - not_use_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting]) - custom_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting]) - refesh_pretrain.click(fn=change_pretrained_choices, inputs=[], outputs=[pretrained_D, pretrained_G]) - with gr.Row(): - preprocess_button.click( - fn=preprocess, - inputs=[ - training_name, - training_sr, - cpu_core, - preprocess_cut, - process_effects, - dataset_path, - clean_dataset, - clean_dataset_strength - ], - outputs=[preprocess_info], - api_name="preprocess" - ) - with gr.Row(): - extract_method.change(fn=hoplength_show, inputs=[extract_method], outputs=[extract_hop_length]) - extract_embedders.change(fn=lambda extract_embedders: visible(extract_embedders == "custom"), inputs=[extract_embedders], outputs=[extract_embedders_custom]) - with gr.Row(): - extract_button.click( - fn=extract, - inputs=[ - training_name, - training_ver, - extract_method, - training_f0, - extract_hop_length, - cpu_core, - gpu_number, - training_sr, - extract_embedders, - extract_embedders_custom, - onnx_f0_mode2, - onnx_embed_mode2 - ], - outputs=[extract_info], - api_name="extract" - ) - with gr.Row(): - index_button.click( - fn=create_index, - inputs=[ - training_name, - training_ver, - index_algorithm - ], - outputs=[training_info], - api_name="create_index" - ) - with gr.Row(): - training_button.click( - fn=training, - inputs=[ - training_name, - training_ver, - save_epochs, - save_only_latest, - save_every_weights, - total_epochs, - training_sr, - train_batch_size, - gpu_number, - training_f0, - not_use_pretrain, - custom_pretrain, - pretrained_G, - pretrained_D, - overtraining_detector, - threshold, - clean_up, - cache_in_gpu, - model_author, - vocoders, - checkpointing1 - ], - outputs=[training_info], - api_name="training_model" - ) - - with gr.TabItem(translations["fushion"], visible=configs.get("fushion_tab", True)): - gr.Markdown(translations["fushion_markdown"]) - with gr.Row(): - gr.Markdown(translations["fushion_markdown_2"]) - with gr.Row(): - name_to_save = gr.Textbox(label=translations["modelname"], placeholder="Model.pth", value="", 
max_lines=1, interactive=True) - with gr.Row(): - fushion_button = gr.Button(translations["fushion"], variant="primary", scale=4) - with gr.Column(): - with gr.Row(): - model_a = gr.File(label=f"{translations['model_name']} 1", file_types=[".pth", ".onnx"]) - model_b = gr.File(label=f"{translations['model_name']} 2", file_types=[".pth", ".onnx"]) - with gr.Row(): - model_path_a = gr.Textbox(label=f"{translations['model_path']} 1", value="", placeholder="assets/weights/Model_1.pth") - model_path_b = gr.Textbox(label=f"{translations['model_path']} 2", value="", placeholder="assets/weights/Model_2.pth") - with gr.Row(): - ratio = gr.Slider(minimum=0, maximum=1, label=translations["model_ratio"], info=translations["model_ratio_info"], value=0.5, interactive=True) - with gr.Row(): - output_model = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False) - with gr.Row(): - model_a.upload(fn=lambda model: shutil.move(model.name, os.path.join("assets", "weights")), inputs=[model_a], outputs=[model_path_a]) - model_b.upload(fn=lambda model: shutil.move(model.name, os.path.join("assets", "weights")), inputs=[model_b], outputs=[model_path_b]) - with gr.Row(): - fushion_button.click( - fn=fushion_model, - inputs=[ - name_to_save, - model_path_a, - model_path_b, - ratio - ], - outputs=[name_to_save, output_model], - api_name="fushion_model" - ) - fushion_button.click(fn=lambda: visible(True), inputs=[], outputs=[output_model]) - - with gr.TabItem(translations["read_model"], visible=configs.get("read_tab", True)): - gr.Markdown(translations["read_model_markdown"]) - with gr.Row(): - gr.Markdown(translations["read_model_markdown_2"]) - with gr.Row(): - model = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx"]) - with gr.Row(): - read_button = gr.Button(translations["readmodel"], variant="primary", scale=2) - with gr.Column(): - model_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True) - output_info = gr.Textbox(label=translations["modelinfo"], value="", interactive=False, scale=6) - with gr.Row(): - model.upload(fn=lambda model: shutil.move(model.name, os.path.join("assets", "weights")), inputs=[model], outputs=[model_path]) - read_button.click( - fn=model_info, - inputs=[model_path], - outputs=[output_info], - api_name="read_model" - ) - - with gr.TabItem(translations["convert_model"], visible=configs.get("onnx_tab", True)): - gr.Markdown(translations["pytorch2onnx"]) - with gr.Row(): - gr.Markdown(translations["pytorch2onnx_markdown"]) - with gr.Row(): - model_pth_upload = gr.File(label=translations["drop_model"], file_types=[".pth"]) - with gr.Row(): - convert_onnx = gr.Button(translations["convert_model"], variant="primary", scale=2) - with gr.Row(): - model_pth_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True) - with gr.Row(): - output_model2 = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False) - with gr.Row(): - model_pth_upload.upload(fn=lambda model_pth_upload: shutil.move(model_pth_upload.name, os.path.join("assets", "weights")), inputs=[model_pth_upload], outputs=[model_pth_path]) - convert_onnx.click( - fn=onnx_export, - inputs=[model_pth_path], - outputs=[output_model2, output_info], - api_name="model_onnx_export" - ) - 
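For context on the onnx_export callback wired just below: a PyTorch-to-ONNX export at its most basic looks like the following sketch. This is a minimal, hypothetical illustration (the toy module, tensor shape, and output path are invented for the example), not the repository's exporter, which has to trace the RVC synthesizer with its own inputs.

import torch

# Hypothetical toy module standing in for the trained synthesizer.
class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)

model = Toy().eval()
dummy = torch.randn(1, 16)  # example input used to trace the graph
torch.onnx.export(
    model, (dummy,), "toy.onnx",
    input_names=["x"], output_names=["y"],
    dynamic_axes={"x": {0: "batch"}},  # keep the batch dimension dynamic
    opset_version=17,
)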
convert_onnx.click(fn=lambda: visible(True), inputs=[], outputs=[output_model2]) - - with gr.TabItem(translations["downloads"], visible=configs.get("downloads_tab", True)): - gr.Markdown(translations["download_markdown"]) - with gr.Row(): - gr.Markdown(translations["download_markdown_2"]) - with gr.Row(): - with gr.Accordion(translations["model_download"], open=True): - with gr.Row(): - downloadmodel = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["download_from_csv"], translations["search_models"], translations["upload"]], interactive=True, value=translations["download_url"]) - with gr.Row(): - gr.Markdown("___") - with gr.Column(): - with gr.Row(): - url_input = gr.Textbox(label=translations["model_url"], value="", placeholder="https://...", scale=6) - download_model_name = gr.Textbox(label=translations["modelname"], value="", placeholder=translations["modelname"], scale=2) - url_download = gr.Button(value=translations["downloads"], scale=2) - with gr.Column(): - model_browser = gr.Dropdown(choices=models.keys(), label=translations["model_warehouse"], scale=8, allow_custom_value=True, visible=False) - download_from_browser = gr.Button(value=translations["get_model"], scale=2, variant="primary", visible=False) - with gr.Column(): - search_name = gr.Textbox(label=translations["name_to_search"], placeholder=translations["modelname"], interactive=True, scale=8, visible=False) - search = gr.Button(translations["search_2"], scale=2, visible=False) - search_dropdown = gr.Dropdown(label=translations["select_download_model"], value="", choices=[], allow_custom_value=True, interactive=False, visible=False) - download = gr.Button(translations["downloads"], variant="primary", visible=False) - with gr.Column(): - model_upload = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx", ".index", ".zip"], visible=False) - with gr.Row(): - with gr.Accordion(translations["download_pretrained_2"], open=False): - with gr.Row(): - pretrain_download_choices = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["list_model"], translations["upload"]], value=translations["download_url"], interactive=True) - with gr.Row(): - gr.Markdown("___") - with gr.Column(): - with gr.Row(): - pretrainD = gr.Textbox(label=translations["pretrained_url"].format(dg="D"), value="", info=translations["only_huggingface"], placeholder="https://...", interactive=True, scale=4) - pretrainG = gr.Textbox(label=translations["pretrained_url"].format(dg="G"), value="", info=translations["only_huggingface"], placeholder="https://...", interactive=True, scale=4) - download_pretrain_button = gr.Button(translations["downloads"], scale=2) - with gr.Column(): - with gr.Row(): - pretrain_choices = gr.Dropdown(label=translations["select_pretrain"], info=translations["select_pretrain_info"], choices=list(fetch_pretrained_data().keys()), value="Titan_Medium", allow_custom_value=True, interactive=True, scale=6, visible=False) - sample_rate_pretrain = gr.Dropdown(label=translations["pretrain_sr"], info=translations["pretrain_sr"], choices=["48k", "40k", "44.1k", "32k"], value="48k", interactive=True, visible=False) - download_pretrain_choices_button = gr.Button(translations["downloads"], scale=2, variant="primary", visible=False) - with gr.Row(): - pretrain_upload_g = gr.File(label=translations["drop_pretrain"].format(dg="G"), file_types=[".pth"], visible=False) - pretrain_upload_d = 
gr.File(label=translations["drop_pretrain"].format(dg="D"), file_types=[".pth"], visible=False) - with gr.Row(): - with gr.Accordion(translations["hubert_download"], open=False): - with gr.Column(): - hubert_url = gr.Textbox(label=translations["hubert_url"], value="", info=translations["only_huggingface"], placeholder="https://...", interactive=True, scale=8) - hubert_button = gr.Button(translations["downloads"], scale=2, variant="primary") - with gr.Row(): - hubert_input = gr.File(label=translations["drop_hubert"], file_types=[".pt"]) - with gr.Row(): - url_download.click( - fn=download_model, - inputs=[ - url_input, - download_model_name - ], - outputs=[url_input], - api_name="download_model" - ) - download_from_browser.click( - fn=lambda model: download_model(models[model], model), - inputs=[model_browser], - outputs=[model_browser], - api_name="download_browser" - ) - with gr.Row(): - downloadmodel.change(fn=change_download_choices, inputs=[downloadmodel], outputs=[url_input, download_model_name, url_download, model_browser, download_from_browser, search_name, search, search_dropdown, download, model_upload]) - search.click(fn=search_models, inputs=[search_name], outputs=[search_dropdown, download]) - model_upload.upload(fn=save_drop_model, inputs=[model_upload], outputs=[model_upload]) - download.click( - fn=lambda model: download_model(model_options[model], model), - inputs=[search_dropdown], - outputs=[search_dropdown], - api_name="search_models" - ) - with gr.Row(): - pretrain_download_choices.change(fn=change_download_pretrained_choices, inputs=[pretrain_download_choices], outputs=[pretrainD, pretrainG, download_pretrain_button, pretrain_choices, sample_rate_pretrain, download_pretrain_choices_button, pretrain_upload_d, pretrain_upload_g]) - pretrain_choices.change(fn=update_sample_rate_dropdown, inputs=[pretrain_choices], outputs=[sample_rate_pretrain]) - with gr.Row(): - download_pretrain_button.click( - fn=download_pretrained_model, - inputs=[ - pretrain_download_choices, - pretrainD, - pretrainG - ], - outputs=[pretrainD], - api_name="download_pretrain_link" - ) - download_pretrain_choices_button.click( - fn=download_pretrained_model, - inputs=[ - pretrain_download_choices, - pretrain_choices, - sample_rate_pretrain - ], - outputs=[pretrain_choices], - api_name="download_pretrain_choices" - ) - pretrain_upload_g.upload( - fn=lambda pretrain_upload_g: shutil.move(pretrain_upload_g.name, os.path.join("assets", "models", "pretrained_custom")), - inputs=[pretrain_upload_g], - outputs=[], - api_name="upload_pretrain_g" - ) - pretrain_upload_d.upload( - fn=lambda pretrain_upload_d: shutil.move(pretrain_upload_d.name, os.path.join("assets", "models", "pretrained_custom")), - inputs=[pretrain_upload_d], - outputs=[], - api_name="upload_pretrain_d" - ) - with gr.Row(): - hubert_button.click( - fn=hubert_download, - inputs=[hubert_url], - outputs=[hubert_url], - api_name="hubert_download" - ) - hubert_input.upload( - fn=lambda hubert: shutil.move(hubert.name, os.path.join("assets", "models", "embedders")), - inputs=[hubert_input], - outputs=[], - api_name="upload_embedder" - ) - - with gr.TabItem(translations["f0_extractor_tab"], visible=configs.get("f0_extractor_tab", True)): - gr.Markdown(translations["f0_extractor_markdown"]) - with gr.Row(): - gr.Markdown(translations["f0_extractor_markdown_2"]) - with gr.Row(): - extractor_button = gr.Button(translations["extract_button"].replace("2. 
", ""), variant="primary") - with gr.Row(): - with gr.Column(): - upload_audio_file = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"]) - audioplay = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"]) - with gr.Column(): - with gr.Accordion(translations["f0_method"], open=False): - with gr.Group(): - onnx_f0_mode3 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True) - f0_method_extract = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True) - with gr.Accordion(translations["input_output"], open=True): - input_audio_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True) - refesh_audio_button = gr.Button(translations["refesh"]) - with gr.Row(): - gr.Markdown("___") - with gr.Row(): - file_output = gr.File(label="", file_types=[".txt"], interactive=False) - image_output = gr.Image(label="", interactive=False, show_download_button=True) - with gr.Row(): - upload_audio_file.upload(fn=lambda audio_in: shutil.move(audio_in.name, os.path.join("audios")), inputs=[upload_audio_file], outputs=[input_audio_path]) - input_audio_path.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio_path], outputs=[audioplay]) - refesh_audio_button.click(fn=change_audios_choices, inputs=[], outputs=[input_audio_path]) - with gr.Row(): - extractor_button.click( - fn=f0_extract, - inputs=[ - input_audio_path, - f0_method_extract, - onnx_f0_mode3 - ], - outputs=[file_output, image_output], - api_name="f0_extract" - ) - - with gr.TabItem(translations["settings"], visible=configs.get("settings_tab", True)): - gr.Markdown(translations["settings_markdown"]) - with gr.Row(): - gr.Markdown(translations["settings_markdown_2"]) - with gr.Row(): - toggle_button = gr.Button(translations["change_light_dark"], variant=["secondary"], scale=2) - with gr.Row(): - with gr.Column(): - language_dropdown = gr.Dropdown(label=translations["lang"], interactive=True, info=translations["lang_restart"], choices=configs.get("support_language", "vi-VN"), value=language) - change_lang = gr.Button(translations["change_lang"], variant="primary", scale=2) - with gr.Column(): - theme_dropdown = gr.Dropdown(label=translations["theme"], interactive=True, info=translations["theme_restart"], choices=configs.get("themes", theme), value=theme, allow_custom_value=True) - changetheme = gr.Button(translations["theme_button"], variant="primary", scale=2) - with gr.Row(): - with gr.Column(): - with gr.Accordion(translations["stop"], open=False): - separate_stop = gr.Button(translations["stop_separate"]) - convert_stop = gr.Button(translations["stop_convert"]) - create_dataset_stop = gr.Button(translations["stop_create_dataset"]) - with gr.Accordion(translations["stop_training"], open=False): - model_name_stop = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True) - preprocess_stop = gr.Button(translations["stop_preprocess"]) - extract_stop = gr.Button(translations["stop_extract"]) - train_stop = gr.Button(translations["stop_training"]) - with gr.Column(): - with gr.Accordion(translations["cleaner"], open=False): - with gr.Accordion(translations["clean_audio"], 
open=False): - with gr.Row(): - audio_file_select = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True) - with gr.Column(): - refesh_audio_select = gr.Button(translations["refesh"]) - with gr.Row(): - delete_all_audio = gr.Button(translations["clean_all"]) - delete_audio = gr.Button(translations["clean_file"], variant="primary") - with gr.Accordion(translations["clean_models"], open=False): - with gr.Row(): - model_select = gr.Dropdown(label=translations["model_name"], choices=model_name, value="", interactive=True, allow_custom_value=True) - index_select = gr.Dropdown(label=translations["index_path"], choices=delete_index, value=delete_index[0] if len(delete_index) > 0 else '', interactive=True, allow_custom_value=True) - with gr.Row(): - refesh_model_select = gr.Button(translations["refesh"]) - with gr.Row(): - delete_all_model_button = gr.Button(translations["clean_all"]) - delete_model_button = gr.Button(translations["clean_file"], variant="primary") - with gr.Accordion(translations["clean_pretrained"], open=False): - with gr.Row(): - pretrain_select = gr.Dropdown(label=translations["pretrain_file"].format(dg=" "), choices=Allpretrained, value=Allpretrained[0] if len(Allpretrained) > 0 else '', interactive=True, allow_custom_value=True) - with gr.Column(): - refesh_pretrain_select = gr.Button(translations["refesh"]) - with gr.Row(): - delete_all_pretrain = gr.Button(translations["clean_all"]) - delete_pretrain = gr.Button(translations["clean_file"], variant="primary") - with gr.Accordion(translations["clean_separated"], open=False): - with gr.Row(): - separate_select = gr.Dropdown(label=translations["separator_model"], choices=separate_model, value=separate_model[0] if len(separate_model) > 0 else '', interactive=True, allow_custom_value=True) - with gr.Column(): - refesh_separate_select = gr.Button(translations["refesh"]) - with gr.Row(): - delete_all_separate = gr.Button(translations["clean_all"]) - delete_separate = gr.Button(translations["clean_file"], variant="primary") - with gr.Accordion(translations["clean_presets"], open=False): - with gr.Row(): - presets_select = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True) - with gr.Column(): - refesh_presets_select = gr.Button(translations["refesh"]) - with gr.Row(): - delete_all_presets_button = gr.Button(translations["clean_all"]) - delete_presets_button = gr.Button(translations["clean_file"], variant="primary") - with gr.Accordion(translations["clean_datasets"], open=False): - dataset_folder_name = gr.Textbox(label=translations["dataset_folder"], value="dataset", interactive=True) - delete_dataset_button = gr.Button(translations["clean_dataset_folder"], variant="primary") - with gr.Row(): - clean_log = gr.Button(translations["clean_log"], variant="primary") - clean_predictor = gr.Button(translations["clean_predictors"], variant="primary") - clean_embedders = gr.Button(translations["clean_embed"], variant="primary") - clean_f0_file = gr.Button(translations["clean_f0_file"], variant="primary") - with gr.Row(): - toggle_button.click(fn=None, js="() => {document.body.classList.toggle('dark')}") - with gr.Row(): - change_lang.click(fn=change_language, inputs=[language_dropdown], outputs=[]) - changetheme.click(fn=change_theme, inputs=[theme_dropdown], outputs=[]) - with gr.Row(): - change_lang.click(fn=None, 
js="setTimeout(function() {location.reload()}, 15000)", inputs=[], outputs=[]) - changetheme.click(fn=None, js="setTimeout(function() {location.reload()}, 15000)", inputs=[], outputs=[]) - with gr.Row(): - separate_stop.click(fn=lambda: stop_pid("separate_pid", None), inputs=[], outputs=[]) - convert_stop.click(fn=lambda: stop_pid("convert_pid", None), inputs=[], outputs=[]) - create_dataset_stop.click(fn=lambda: stop_pid("create_dataset_pid", None), inputs=[], outputs=[]) - with gr.Row(): - preprocess_stop.click(fn=lambda model_name_stop: stop_pid("preprocess_pid", model_name_stop), inputs=[model_name_stop], outputs=[]) - extract_stop.click(fn=lambda model_name_stop: stop_pid("extract_pid", model_name_stop), inputs=[model_name_stop], outputs=[]) - train_stop.click(fn=lambda model_name_stop: stop_train(model_name_stop), inputs=[model_name_stop], outputs=[]) - with gr.Row(): - refesh_audio_select.click(fn=change_audios_choices, inputs=[], outputs=[audio_file_select]) - delete_all_audio.click(fn=delete_all_audios, inputs=[], outputs=[audio_file_select]) - delete_audio.click(fn=delete_audios, inputs=[audio_file_select], outputs=[audio_file_select]) - with gr.Row(): - refesh_model_select.click(fn=change_choices_del, inputs=[], outputs=[model_select, index_select]) - delete_all_model_button.click(fn=delete_all_model, inputs=[], outputs=[model_select, index_select]) - delete_model_button.click(fn=delete_model, inputs=[model_select, index_select], outputs=[model_select, index_select]) - with gr.Row(): - refesh_pretrain_select.click(fn=change_allpretrained_choices, inputs=[], outputs=[pretrain_select]) - delete_all_pretrain.click(fn=delete_all_pretrained, inputs=[], outputs=[pretrain_select]) - delete_pretrain.click(fn=delete_pretrained, inputs=[pretrain_select], outputs=[pretrain_select]) - with gr.Row(): - refesh_separate_select.click(fn=change_separate_choices, inputs=[], outputs=[separate_select]) - delete_all_separate.click(fn=delete_all_separated, inputs=[], outputs=[separate_select]) - delete_separate.click(fn=delete_separated, inputs=[separate_select], outputs=[separate_select]) - with gr.Row(): - refesh_presets_select.click(fn=change_preset_choices, inputs=[], outputs=[presets_select]) - delete_all_presets_button.click(fn=delete_all_presets, inputs=[], outputs=[presets_select]) - delete_presets_button.click(fn=delete_presets, inputs=[presets_select], outputs=[presets_select]) - with gr.Row(): - delete_dataset_button.click(fn=delete_dataset, inputs=[dataset_folder_name], outputs=[]) - with gr.Row(): - clean_log.click(fn=delete_all_log, inputs=[], outputs=[]) - clean_predictor.click(fn=delete_all_predictors, inputs=[], outputs=[]) - clean_embedders.click(fn=delete_all_embedders, inputs=[], outputs=[]) - clean_f0_file.click(fn=clean_f0_files, inputs=[], outputs=[]) - - with gr.TabItem(translations["report_bugs"], visible=configs.get("report_bug_tab", True)): - gr.Markdown(translations["report_bugs"]) - with gr.Row(): - gr.Markdown(translations["report_bug_info"]) - with gr.Row(): - with gr.Column(): - with gr.Group(): - agree_log = gr.Checkbox(label=translations["agree_log"], value=True, interactive=True) - report_text = gr.Textbox(label=translations["error_info"], info=translations["error_info_2"], interactive=True) - report_button = gr.Button(translations["report_bugs"], variant="primary", scale=2) - with gr.Row(): - gr.Markdown(translations["report_info"].format(github=codecs.decode("uggcf://tvguho.pbz/CunzUhlauNau16/Ivrganzrfr-EIP/vffhrf", "rot13"))) - with gr.Row(): - 
report_button.click(fn=report_bug, inputs=[report_text, agree_log], outputs=[]) - - with gr.Row(): - gr.Markdown(translations["rick_roll"].format(rickroll=codecs.decode('uggcf://jjj.lbhghor.pbz/jngpu?i=qDj4j9JtKpD', 'rot13'))) - with gr.Row(): - gr.Markdown(translations["terms_of_use"]) - with gr.Row(): - gr.Markdown(translations["exemption"]) - - logger.info(translations["start_app"]) - logger.info(translations["set_lang"].format(lang=language)) - - port = configs.get("app_port", 7860) - - for i in range(configs.get("num_of_restart", 5)): - try: - app.queue().launch( - favicon_path=os.path.join("assets", "miku.png"), - server_name=configs.get("server_name", "0.0.0.0"), - server_port=port, - show_error=configs.get("app_show_error", False), - inbrowser="--open" in sys.argv and not app_mode, - share="--share" in sys.argv and not app_mode, - allowed_paths=allow_disk, - prevent_thread_lock=app_mode - ) - break - except OSError: - logger.debug(translations["port"].format(port=port)) - port -= 1 - except Exception as e: - logger.error(translations["error_occurred"].format(e=e)) - sys.exit(1) - -if app_mode: - import webview - - def on_closed(): - logger.info(translations["close"]) - sys.exit(0) - - window = webview.create_window("Vietnamese RVC", f"localhost:{port}", width=1600, height=900, min_size=(800, 600)) - window.events.closed += on_closed - - webview.start(icon=os.path.join("assets", "miku.png"), debug=False) \ No newline at end of file diff --git a/main/app/tensorboard.py b/main/app/tensorboard.py deleted file mode 100644 index 60e030fa36b737154f1b5416e69c91bb30a3eefb..0000000000000000000000000000000000000000 --- a/main/app/tensorboard.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -import sys -import json -import logging -import webbrowser - -from tensorboard import program - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -translations = Config().translations - -with open(os.path.join("main", "configs", "config.json"), "r") as f: - configs = json.load(f) - -def launch_tensorboard(): - for l in ["root", "tensorboard"]: - logging.getLogger(l).setLevel(logging.ERROR) - - tb = program.TensorBoard() - tb.configure(argv=[None, "--logdir", "assets/logs", f"--port={configs['tensorboard_port']}"]) - url = tb.launch() - - print(f"{translations['tensorboard_url']}: {url}") - if "--open" in sys.argv: webbrowser.open(url) - - return f"{translations['tensorboard_url']}: {url}" - -if __name__ == "__main__": launch_tensorboard() \ No newline at end of file diff --git a/main/configs/config.json b/main/configs/config.json deleted file mode 100644 index 00c1e441cecf989a0dba8b1b7925f27c08a350dc..0000000000000000000000000000000000000000 --- a/main/configs/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "language": "vi-VN", - "support_language": ["en-US", "vi-VN"], - "theme": "NoCrypt/miku", - "themes": ["NoCrypt/miku", "gstaff/xkcd", "JohnSmith9982/small_and_pretty", "ParityError/Interstellar", "earneleh/paris", "shivi/calm_seafoam", "Hev832/Applio", "YTheme/Minecraft", "gstaff/sketch", "SebastianBravo/simci_css", "allenai/gradio-theme", "Nymbo/Nymbo_Theme_5", "lone17/kotaemon", "Zarkel/IBM_Carbon_Theme", "SherlockRamos/Feliz", "freddyaboulton/dracula_revamped", "freddyaboulton/bad-theme-space", "gradio/dracula_revamped", "abidlabs/dracula_revamped", "gradio/dracula_test", "gradio/seafoam", "gradio/glass", "gradio/monochrome", "gradio/soft", "gradio/default", "gradio/base", "abidlabs/pakistan", "dawood/microsoft_windows", "ysharma/steampunk", "ysharma/huggingface", "abidlabs/Lime", 
"freddyaboulton/this-theme-does-not-exist-2", "aliabid94/new-theme", "aliabid94/test2", "aliabid94/test3", "aliabid94/test4", "abidlabs/banana", "freddyaboulton/test-blue", "gstaff/whiteboard", "ysharma/llamas", "abidlabs/font-test", "YenLai/Superhuman", "bethecloud/storj_theme", "sudeepshouche/minimalist", "knotdgaf/gradiotest", "ParityError/Anime", "Ajaxon6255/Emerald_Isle", "ParityError/LimeFace", "finlaymacklon/smooth_slate", "finlaymacklon/boxy_violet", "derekzen/stardust", "EveryPizza/Cartoony-Gradio-Theme", "Ifeanyi/Cyanister", "Tshackelton/IBMPlex-DenseReadable", "snehilsanyal/scikit-learn", "Himhimhim/xkcd", "nota-ai/theme", "rawrsor1/Everforest", "rottenlittlecreature/Moon_Goblin", "abidlabs/test-yellow", "abidlabs/test-yellow3", "idspicQstitho/dracula_revamped", "kfahn/AnimalPose", "HaleyCH/HaleyCH_Theme", "simulKitke/dracula_test", "braintacles/CrimsonNight", "wentaohe/whiteboardv2", "reilnuud/polite", "remilia/Ghostly", "Franklisi/darkmode", "coding-alt/soft", "xiaobaiyuan/theme_land", "step-3-profit/Midnight-Deep", "xiaobaiyuan/theme_demo", "Taithrah/Minimal", "Insuz/SimpleIndigo", "zkunn/Alipay_Gradio_theme", "Insuz/Mocha", "xiaobaiyuan/theme_brief", "Ama434/434-base-Barlow", "Ama434/def_barlow", "Ama434/neutral-barlow", "dawood/dracula_test", "nuttea/Softblue", "BlueDancer/Alien_Diffusion", "naughtondale/monochrome", "Dagfinn1962/standard", "default"], - - "mdx_model": ["Main_340", "Main_390", "Main_406", "Main_427", "Main_438", "Inst_full_292", "Inst_HQ_1", "Inst_HQ_2", "Inst_HQ_3", "Inst_HQ_4", "Inst_HQ_5", "Kim_Vocal_1", "Kim_Vocal_2", "Kim_Inst", "Inst_187_beta", "Inst_82_beta", "Inst_90_beta", "Voc_FT", "Crowd_HQ", "Inst_1", "Inst_2", "Inst_3", "MDXNET_1_9703", "MDXNET_2_9682", "MDXNET_3_9662", "Inst_Main", "MDXNET_Main", "MDXNET_9482"], - "demucs_model": ["HT-Normal", "HT-Tuned", "HD_MMI", "HT_6S"], - "edge_tts": ["af-ZA-AdriNeural", "af-ZA-WillemNeural", "sq-AL-AnilaNeural", "sq-AL-IlirNeural", "am-ET-AmehaNeural", "am-ET-MekdesNeural", "ar-DZ-AminaNeural", "ar-DZ-IsmaelNeural", "ar-BH-AliNeural", "ar-BH-LailaNeural", "ar-EG-SalmaNeural", "ar-EG-ShakirNeural", "ar-IQ-BasselNeural", "ar-IQ-RanaNeural", "ar-JO-SanaNeural", "ar-JO-TaimNeural", "ar-KW-FahedNeural", "ar-KW-NouraNeural", "ar-LB-LaylaNeural", "ar-LB-RamiNeural", "ar-LY-ImanNeural", "ar-LY-OmarNeural", "ar-MA-JamalNeural", "ar-MA-MounaNeural", "ar-OM-AbdullahNeural", "ar-OM-AyshaNeural", "ar-QA-AmalNeural", "ar-QA-MoazNeural", "ar-SA-HamedNeural", "ar-SA-ZariyahNeural", "ar-SY-AmanyNeural", "ar-SY-LaithNeural", "ar-TN-HediNeural", "ar-TN-ReemNeural", "ar-AE-FatimaNeural", "ar-AE-HamdanNeural", "ar-YE-MaryamNeural", "ar-YE-SalehNeural", "az-AZ-BabekNeural", "az-AZ-BanuNeural", "bn-BD-NabanitaNeural", "bn-BD-PradeepNeural", "bn-IN-BashkarNeural", "bn-IN-TanishaaNeural", "bs-BA-GoranNeural", "bs-BA-VesnaNeural", "bg-BG-BorislavNeural", "bg-BG-KalinaNeural", "my-MM-NilarNeural", "my-MM-ThihaNeural", "ca-ES-EnricNeural", "ca-ES-JoanaNeural", "zh-HK-HiuGaaiNeural", "zh-HK-HiuMaanNeural", "zh-HK-WanLungNeural", "zh-CN-XiaoxiaoNeural", "zh-CN-XiaoyiNeural", "zh-CN-YunjianNeural", "zh-CN-YunxiNeural", "zh-CN-YunxiaNeural", "zh-CN-YunyangNeural", "zh-CN-liaoning-XiaobeiNeural", "zh-TW-HsiaoChenNeural", "zh-TW-YunJheNeural", "zh-TW-HsiaoYuNeural", "zh-CN-shaanxi-XiaoniNeural", "hr-HR-GabrijelaNeural", "hr-HR-SreckoNeural", "cs-CZ-AntoninNeural", "cs-CZ-VlastaNeural", "da-DK-ChristelNeural", "da-DK-JeppeNeural", "nl-BE-ArnaudNeural", "nl-BE-DenaNeural", "nl-NL-ColetteNeural", "nl-NL-FennaNeural", "nl-NL-MaartenNeural", 
"en-AU-NatashaNeural", "en-AU-WilliamNeural", "en-CA-ClaraNeural", "en-CA-LiamNeural", "en-HK-SamNeural", "en-HK-YanNeural", "en-IN-NeerjaExpressiveNeural", "en-IN-NeerjaNeural", "en-IN-PrabhatNeural", "en-IE-ConnorNeural", "en-IE-EmilyNeural", "en-KE-AsiliaNeural", "en-KE-ChilembaNeural", "en-NZ-MitchellNeural", "en-NZ-MollyNeural", "en-NG-AbeoNeural", "en-NG-EzinneNeural", "en-PH-JamesNeural", "en-PH-RosaNeural", "en-SG-LunaNeural", "en-SG-WayneNeural", "en-ZA-LeahNeural", "en-ZA-LukeNeural", "en-TZ-ElimuNeural", "en-TZ-ImaniNeural", "en-GB-LibbyNeural", "en-GB-MaisieNeural", "en-GB-RyanNeural", "en-GB-SoniaNeural", "en-GB-ThomasNeural", "en-US-AvaMultilingualNeural", "en-US-AndrewMultilingualNeural", "en-US-EmmaMultilingualNeural", "en-US-BrianMultilingualNeural", "en-US-AvaNeural", "en-US-AndrewNeural", "en-US-EmmaNeural", "en-US-BrianNeural", "en-US-AnaNeural", "en-US-AriaNeural", "en-US-ChristopherNeural", "en-US-EricNeural", "en-US-GuyNeural", "en-US-JennyNeural", "en-US-MichelleNeural", "en-US-RogerNeural", "en-US-SteffanNeural", "et-EE-AnuNeural", "et-EE-KertNeural", "fil-PH-AngeloNeural", "fil-PH-BlessicaNeural", "fi-FI-HarriNeural", "fi-FI-NooraNeural", "fr-BE-CharlineNeural", "fr-BE-GerardNeural", "fr-CA-ThierryNeural", "fr-CA-AntoineNeural", "fr-CA-JeanNeural", "fr-CA-SylvieNeural", "fr-FR-VivienneMultilingualNeural", "fr-FR-RemyMultilingualNeural", "fr-FR-DeniseNeural", "fr-FR-EloiseNeural", "fr-FR-HenriNeural", "fr-CH-ArianeNeural", "fr-CH-FabriceNeural", "gl-ES-RoiNeural", "gl-ES-SabelaNeural", "ka-GE-EkaNeural", "ka-GE-GiorgiNeural", "de-AT-IngridNeural", "de-AT-JonasNeural", "de-DE-SeraphinaMultilingualNeural", "de-DE-FlorianMultilingualNeural", "de-DE-AmalaNeural", "de-DE-ConradNeural", "de-DE-KatjaNeural", "de-DE-KillianNeural", "de-CH-JanNeural", "de-CH-LeniNeural", "el-GR-AthinaNeural", "el-GR-NestorasNeural", "gu-IN-DhwaniNeural", "gu-IN-NiranjanNeural", "he-IL-AvriNeural", "he-IL-HilaNeural", "hi-IN-MadhurNeural", "hi-IN-SwaraNeural", "hu-HU-NoemiNeural", "hu-HU-TamasNeural", "is-IS-GudrunNeural", "is-IS-GunnarNeural", "id-ID-ArdiNeural", "id-ID-GadisNeural", "ga-IE-ColmNeural", "ga-IE-OrlaNeural", "it-IT-GiuseppeNeural", "it-IT-DiegoNeural", "it-IT-ElsaNeural", "it-IT-IsabellaNeural", "ja-JP-KeitaNeural", "ja-JP-NanamiNeural", "jv-ID-DimasNeural", "jv-ID-SitiNeural", "kn-IN-GaganNeural", "kn-IN-SapnaNeural", "kk-KZ-AigulNeural", "kk-KZ-DauletNeural", "km-KH-PisethNeural", "km-KH-SreymomNeural", "ko-KR-HyunsuNeural", "ko-KR-InJoonNeural", "ko-KR-SunHiNeural", "lo-LA-ChanthavongNeural", "lo-LA-KeomanyNeural", "lv-LV-EveritaNeural", "lv-LV-NilsNeural", "lt-LT-LeonasNeural", "lt-LT-OnaNeural", "mk-MK-AleksandarNeural", "mk-MK-MarijaNeural", "ms-MY-OsmanNeural", "ms-MY-YasminNeural", "ml-IN-MidhunNeural", "ml-IN-SobhanaNeural", "mt-MT-GraceNeural", "mt-MT-JosephNeural", "mr-IN-AarohiNeural", "mr-IN-ManoharNeural", "mn-MN-BataaNeural", "mn-MN-YesuiNeural", "ne-NP-HemkalaNeural", "ne-NP-SagarNeural", "nb-NO-FinnNeural", "nb-NO-PernilleNeural", "ps-AF-GulNawazNeural", "ps-AF-LatifaNeural", "fa-IR-DilaraNeural", "fa-IR-FaridNeural", "pl-PL-MarekNeural", "pl-PL-ZofiaNeural", "pt-BR-ThalitaNeural", "pt-BR-AntonioNeural", "pt-BR-FranciscaNeural", "pt-PT-DuarteNeural", "pt-PT-RaquelNeural", "ro-RO-AlinaNeural", "ro-RO-EmilNeural", "ru-RU-DmitryNeural", "ru-RU-SvetlanaNeural", "sr-RS-NicholasNeural", "sr-RS-SophieNeural", "si-LK-SameeraNeural", "si-LK-ThiliniNeural", "sk-SK-LukasNeural", "sk-SK-ViktoriaNeural", "sl-SI-PetraNeural", "sl-SI-RokNeural", "so-SO-MuuseNeural", 
"so-SO-UbaxNeural", "es-AR-ElenaNeural", "es-AR-TomasNeural", "es-BO-MarceloNeural", "es-BO-SofiaNeural", "es-CL-CatalinaNeural", "es-CL-LorenzoNeural", "es-ES-XimenaNeural", "es-CO-GonzaloNeural", "es-CO-SalomeNeural", "es-CR-JuanNeural", "es-CR-MariaNeural", "es-CU-BelkysNeural", "es-CU-ManuelNeural", "es-DO-EmilioNeural", "es-DO-RamonaNeural", "es-EC-AndreaNeural", "es-EC-LuisNeural", "es-SV-LorenaNeural", "es-SV-RodrigoNeural", "es-GQ-JavierNeural", "es-GQ-TeresaNeural", "es-GT-AndresNeural", "es-GT-MartaNeural", "es-HN-CarlosNeural", "es-HN-KarlaNeural", "es-MX-DaliaNeural", "es-MX-JorgeNeural", "es-NI-FedericoNeural", "es-NI-YolandaNeural", "es-PA-MargaritaNeural", "es-PA-RobertoNeural", "es-PY-MarioNeural", "es-PY-TaniaNeural", "es-PE-AlexNeural", "es-PE-CamilaNeural", "es-PR-KarinaNeural", "es-PR-VictorNeural", "es-ES-AlvaroNeural", "es-ES-ElviraNeural", "es-US-AlonsoNeural", "es-US-PalomaNeural", "es-UY-MateoNeural", "es-UY-ValentinaNeural", "es-VE-PaolaNeural", "es-VE-SebastianNeural", "su-ID-JajangNeural", "su-ID-TutiNeural", "sw-KE-RafikiNeural", "sw-KE-ZuriNeural", "sw-TZ-DaudiNeural", "sw-TZ-RehemaNeural", "sv-SE-MattiasNeural", "sv-SE-SofieNeural", "ta-IN-PallaviNeural", "ta-IN-ValluvarNeural", "ta-MY-KaniNeural", "ta-MY-SuryaNeural", "ta-SG-AnbuNeural", "ta-SG-VenbaNeural", "ta-LK-KumarNeural", "ta-LK-SaranyaNeural", "te-IN-MohanNeural", "te-IN-ShrutiNeural", "th-TH-NiwatNeural", "th-TH-PremwadeeNeural", "tr-TR-AhmetNeural", "tr-TR-EmelNeural", "uk-UA-OstapNeural", "uk-UA-PolinaNeural", "ur-IN-GulNeural", "ur-IN-SalmanNeural", "ur-PK-AsadNeural", "ur-PK-UzmaNeural", "uz-UZ-MadinaNeural", "uz-UZ-SardorNeural", "vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural", "cy-GB-AledNeural", "cy-GB-NiaNeural", "zu-ZA-ThandoNeural", "zu-ZA-ThembaNeural"], - "google_tts_voice": ["af", "am", "ar", "bg", "bn", "bs", "ca", "cs", "cy", "da", "de", "el", "en", "es", "et", "eu", "fi", "fr", "fr-CA", "gl", "gu", "ha", "hi", "hr", "hu", "id", "is", "it", "iw", "ja", "jw", "km", "kn", "ko", "la", "lt", "lv", "ml", "mr", "ms", "my", "ne", "nl", "no", "pa", "pl", "pt", "pt-PT", "ro", "ru", "si", "sk", "sq", "sr", "su", "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yue", "zh-CN", "zh-TW", "zh"], - - "separator_tab": true, - "convert_tab": true, - "tts_tab": true, - "effects_tab": true, - "create_dataset_tab": true, - "training_tab": true, - "fushion_tab": true, - "read_tab": true, - "onnx_tab": true, - "downloads_tab": true, - "f0_extractor_tab": true, - "settings_tab": true, - "report_bug_tab": true, - - "app_port": 7860, - "tensorboard_port": 6870, - "num_of_restart": 5, - "server_name": "0.0.0.0", - "app_show_error": true -} \ No newline at end of file diff --git a/main/configs/config.py b/main/configs/config.py deleted file mode 100644 index af5099c020b4c4de208b5b7c47fd30ac59a9c04c..0000000000000000000000000000000000000000 --- a/main/configs/config.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -import json -import torch - -version_config_paths = [os.path.join(version, size) for version in ["v1", "v2"] for size in ["32000.json", "40000.json", "44100.json", "48000.json"]] - -def singleton(cls): - instances = {} - def get_instance(*args, **kwargs): - if cls not in instances: instances[cls] = cls(*args, **kwargs) - return instances[cls] - return get_instance - -@singleton -class Config: - def __init__(self): - self.device = "cuda:0" if torch.cuda.is_available() else "cpu" - self.gpu_name = (torch.cuda.get_device_name(int(self.device.split(":")[-1])) if self.device.startswith("cuda") else 
None) - self.translations = self.multi_language() - self.json_config = self.load_config_json() - self.gpu_mem = None - self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() - - def multi_language(self): - translations = {} - try: - with open(os.path.join("main", "configs", "config.json"), "r") as f: - configs = json.load(f) - - lang = configs.get("language", "vi-VN") - if len([l for l in os.listdir(os.path.join("assets", "languages")) if l.endswith(".json")]) < 1: raise FileNotFoundError("Không tìm thấy bất cứ gói ngôn ngữ nào(No package languages found)") - - if not lang: lang = "vi-VN" - if lang not in configs["support_language"]: raise ValueError("Ngôn ngữ không được hỗ trợ(Language not supported)") - - lang_path = os.path.join("assets", "languages", f"{lang}.json") - if not os.path.exists(lang_path): lang_path = os.path.join("assets", "languages", "vi-VN.json") - - with open(lang_path, encoding="utf-8") as f: - translations = json.load(f) - except json.JSONDecodeError as e: - print(f"Empty or invalid JSON language file ({e})") - return translations - - def load_config_json(self): - configs = {} - for config_file in version_config_paths: - try: - with open(os.path.join("main", "configs", config_file), "r") as f: - configs[config_file] = json.load(f) - except json.JSONDecodeError: - print(self.translations["empty_json"].format(file=config_file)) - pass - return configs - - def device_config(self): - if self.device.startswith("cuda"): self.set_cuda_config() - elif self.has_mps(): self.device = "mps" - else: self.device = "cpu" - - if self.gpu_mem is not None and self.gpu_mem <= 4: return 1, 5, 30, 32 - return 1, 6, 38, 41 - - def set_cuda_config(self): - i_device = int(self.device.split(":")[-1]) - self.gpu_name = torch.cuda.get_device_name(i_device) - self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (1024**3) - - def has_mps(self): - return torch.backends.mps.is_available() \ No newline at end of file diff --git a/main/configs/decrypt.bin b/main/configs/decrypt.bin deleted file mode 100644 index 85da68557da0749d6532388eab083cdfea3de416..0000000000000000000000000000000000000000 --- a/main/configs/decrypt.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:330268cbf6b9317a76510b533e1640ef48ed074a07c013e5b1abc4d48cfd9dce -size 32 diff --git a/main/configs/v1/32000.json b/main/configs/v1/32000.json deleted file mode 100644 index 224c3757d9bff4d5dda025b6b33d6c9296b312b9..0000000000000000000000000000000000000000 --- a/main/configs/v1/32000.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "epochs": 20000, - "learning_rate": 0.0001, - "betas": [0.8, 0.99], - "eps": 1e-09, - "batch_size": 4, - "lr_decay": 0.999875, - "segment_size": 12800, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sample_rate": 32000, - "filter_length": 1024, - "hop_length": 320, - "win_length": 1024, - "n_mel_channels": 80, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "text_enc_hidden_dim": 256, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "upsample_rates": [10, 4, 2, 2, 2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [16, 16, 4, 4, 4], - "use_spectral_norm": false, - "gin_channels": 
256, - "spk_embed_dim": 109 - } -} \ No newline at end of file diff --git a/main/configs/v1/40000.json b/main/configs/v1/40000.json deleted file mode 100644 index 45ad70b94322c76b248ee9a5bd0885620623b5bb..0000000000000000000000000000000000000000 --- a/main/configs/v1/40000.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "epochs": 20000, - "learning_rate": 0.0001, - "betas": [0.8, 0.99], - "eps": 1e-09, - "batch_size": 4, - "lr_decay": 0.999875, - "segment_size": 12800, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sample_rate": 40000, - "filter_length": 2048, - "hop_length": 400, - "win_length": 2048, - "n_mel_channels": 125, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "text_enc_hidden_dim": 256, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "upsample_rates": [10, 10, 2, 2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [16, 16, 4, 4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} \ No newline at end of file diff --git a/main/configs/v1/44100.json b/main/configs/v1/44100.json deleted file mode 100644 index f4abb8a4b96269e0f37886148c8ac0466ac95cab..0000000000000000000000000000000000000000 --- a/main/configs/v1/44100.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "epochs": 20000, - "learning_rate": 0.0001, - "betas": [0.8, 0.99], - "eps": 1e-09, - "batch_size": 4, - "lr_decay": 0.999875, - "segment_size": 15876, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sample_rate": 44100, - "filter_length": 2048, - "hop_length": 441, - "win_length": 2048, - "n_mel_channels": 128, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "text_enc_hidden_dim": 256, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [ 1, 3, 5], [1, 3, 5]], - "upsample_rates": [10, 10, 2.205, 2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [20, 20, 4, 4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} \ No newline at end of file diff --git a/main/configs/v1/48000.json b/main/configs/v1/48000.json deleted file mode 100644 index 9c87fa8f9152310b850edf3c291f242dbcb6cddb..0000000000000000000000000000000000000000 --- a/main/configs/v1/48000.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "epochs": 20000, - "learning_rate": 0.0001, - "betas": [0.8, 0.99], - "eps": 1e-09, - "batch_size": 4, - "lr_decay": 0.999875, - "segment_size": 11520, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sample_rate": 48000, - "filter_length": 2048, - "hop_length": 480, - "win_length": 2048, - "n_mel_channels": 128, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "text_enc_hidden_dim": 256, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": 
"1", - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "upsample_rates": [10, 6, 2, 2, 2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [16, 16, 4, 4, 4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} \ No newline at end of file diff --git a/main/configs/v2/32000.json b/main/configs/v2/32000.json deleted file mode 100644 index 567fa71a6ca8465cc6f77df6d258c8497b9c5a41..0000000000000000000000000000000000000000 --- a/main/configs/v2/32000.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "learning_rate": 0.0001, - "betas": [0.8, 0.99], - "eps": 1e-09, - "lr_decay": 0.999875, - "segment_size": 12800, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sample_rate": 32000, - "filter_length": 1024, - "hop_length": 320, - "win_length": 1024, - "n_mel_channels": 80, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "text_enc_hidden_dim": 768, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "upsample_rates": [10, 8, 2, 2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [20, 16, 4, 4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} \ No newline at end of file diff --git a/main/configs/v2/40000.json b/main/configs/v2/40000.json deleted file mode 100644 index 344a1673c03faa45d499845f7a61664fe8176a96..0000000000000000000000000000000000000000 --- a/main/configs/v2/40000.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "learning_rate": 0.0001, - "betas": [0.8, 0.99], - "eps": 1e-09, - "lr_decay": 0.999875, - "segment_size": 12800, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sample_rate": 40000, - "filter_length": 2048, - "hop_length": 400, - "win_length": 2048, - "n_mel_channels": 125, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "text_enc_hidden_dim": 768, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "upsample_rates": [10, 10, 2, 2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [16, 16, 4, 4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} \ No newline at end of file diff --git a/main/configs/v2/44100.json b/main/configs/v2/44100.json deleted file mode 100644 index d8f791c373c9991ec1d62fdfbd6d2426279cbbc5..0000000000000000000000000000000000000000 --- a/main/configs/v2/44100.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "learning_rate": 0.0001, - "betas": [0.8, 0.99], - "eps": 1e-09, - "lr_decay": 0.999875, - "segment_size": 15876, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sample_rate": 44100, - "filter_length": 2048, - "hop_length": 441, - "win_length": 2048, - "n_mel_channels": 128, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "text_enc_hidden_dim": 768, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - 
"p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "upsample_rates": [10, 10, 2.205, 2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [20, 20, 4, 4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} \ No newline at end of file diff --git a/main/configs/v2/48000.json b/main/configs/v2/48000.json deleted file mode 100644 index 2ad00577a300123be7e4fd1254c07b21ab602c34..0000000000000000000000000000000000000000 --- a/main/configs/v2/48000.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "learning_rate": 0.0001, - "betas": [0.8, 0.99], - "eps": 1e-09, - "lr_decay": 0.999875, - "segment_size": 17280, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sample_rate": 48000, - "filter_length": 2048, - "hop_length": 480, - "win_length": 2048, - "n_mel_channels": 128, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "text_enc_hidden_dim": 768, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "upsample_rates": [12, 10, 2, 2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [24, 20, 4, 4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} \ No newline at end of file diff --git a/main/inference/audio_effects.py b/main/inference/audio_effects.py deleted file mode 100644 index 9226494105b76d81b69c13cf96c407f20f30f557..0000000000000000000000000000000000000000 --- a/main/inference/audio_effects.py +++ /dev/null @@ -1,180 +0,0 @@ -import os -import sys -import librosa -import argparse - -import numpy as np -import soundfile as sf - -from distutils.util import strtobool -from scipy.signal import butter, filtfilt -from pedalboard import Pedalboard, Chorus, Distortion, Reverb, PitchShift, Delay, Limiter, Gain, Bitcrush, Clipping, Compressor, Phaser, HighpassFilter - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -from main.library.utils import pydub_convert, pydub_load - -translations = Config().translations - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_path", type=str, required=True) - parser.add_argument("--output_path", type=str, default="./audios/apply_effects.wav") - parser.add_argument("--export_format", type=str, default="wav") - parser.add_argument("--resample", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--resample_sr", type=int, default=0) - parser.add_argument("--chorus", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--chorus_depth", type=float, default=0.5) - parser.add_argument("--chorus_rate", type=float, default=1.5) - parser.add_argument("--chorus_mix", type=float, default=0.5) - parser.add_argument("--chorus_delay", type=int, default=10) - parser.add_argument("--chorus_feedback", type=float, default=0) - parser.add_argument("--distortion", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--drive_db", type=int, default=20) - parser.add_argument("--reverb", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--reverb_room_size", type=float, default=0.5) - parser.add_argument("--reverb_damping", type=float, default=0.5) - 
parser.add_argument("--reverb_wet_level", type=float, default=0.33) - parser.add_argument("--reverb_dry_level", type=float, default=0.67) - parser.add_argument("--reverb_width", type=float, default=1) - parser.add_argument("--reverb_freeze_mode", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--pitchshift", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--pitch_shift", type=int, default=0) - parser.add_argument("--delay", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--delay_seconds", type=float, default=0.5) - parser.add_argument("--delay_feedback", type=float, default=0.5) - parser.add_argument("--delay_mix", type=float, default=0.5) - parser.add_argument("--compressor", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--compressor_threshold", type=int, default=-20) - parser.add_argument("--compressor_ratio", type=float, default=4) - parser.add_argument("--compressor_attack_ms", type=float, default=10) - parser.add_argument("--compressor_release_ms", type=int, default=200) - parser.add_argument("--limiter", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--limiter_threshold", type=int, default=0) - parser.add_argument("--limiter_release", type=int, default=100) - parser.add_argument("--gain", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--gain_db", type=int, default=0) - parser.add_argument("--bitcrush", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--bitcrush_bit_depth", type=int, default=16) - parser.add_argument("--clipping", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--clipping_threshold", type=int, default=-10) - parser.add_argument("--phaser", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--phaser_rate_hz", type=float, default=0.5) - parser.add_argument("--phaser_depth", type=float, default=0.5) - parser.add_argument("--phaser_centre_frequency_hz", type=int, default=1000) - parser.add_argument("--phaser_feedback", type=float, default=0) - parser.add_argument("--phaser_mix", type=float, default=0.5) - parser.add_argument("--treble_bass_boost", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--bass_boost_db", type=int, default=0) - parser.add_argument("--bass_boost_frequency", type=int, default=100) - parser.add_argument("--treble_boost_db", type=int, default=0) - parser.add_argument("--treble_boost_frequency", type=int, default=3000) - parser.add_argument("--fade_in_out", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--fade_in_duration", type=float, default=2000) - parser.add_argument("--fade_out_duration", type=float, default=2000) - parser.add_argument("--audio_combination", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--audio_combination_input", type=str) - - return parser.parse_args() - -def process_audio(input_path, output_path, resample, resample_sr, chorus_depth, chorus_rate, chorus_mix, chorus_delay, chorus_feedback, distortion_drive, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift, delay_seconds, delay_feedback, delay_mix, compressor_threshold, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold, limiter_release, gain_db, bitcrush_bit_depth, clipping_threshold, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost_db, 
bass_boost_frequency, treble_boost_db, treble_boost_frequency, fade_in_duration, fade_out_duration, export_format, chorus, distortion, reverb, pitchshift, delay, compressor, limiter, gain, bitcrush, clipping, phaser, treble_bass_boost, fade_in_out, audio_combination, audio_combination_input): - def bass_boost(audio, gain_db, frequency, sample_rate): - if gain_db >= 1: - b, a = butter(4, frequency / (0.5 * sample_rate), btype='low') - - return filtfilt(b, a, audio) * 10 ** (gain_db / 20) - else: return audio - - def treble_boost(audio, gain_db, frequency, sample_rate): - if gain_db >=1: - b, a = butter(4, frequency / (0.5 * sample_rate), btype='high') - - return filtfilt(b, a, audio) * 10 ** (gain_db / 20) - else: return audio - - def fade_out_effect(audio, sr, duration=3.0): - length = int(duration * sr) - end = audio.shape[0] - - if length > end: length = end - start = end - length - - audio[start:end] = audio[start:end] * np.linspace(1.0, 0.0, length) - return audio - - def fade_in_effect(audio, sr, duration=3.0): - length = int(duration * sr) - start = 0 - - if length > audio.shape[0]: length = audio.shape[0] - end = length - - audio[start:end] = audio[start:end] * np.linspace(0.0, 1.0, length) - return audio - - if not input_path or not os.path.exists(input_path): - print(translations["input_not_valid"]) - sys.exit(1) - - if not output_path: - print(translations["output_not_valid"]) - sys.exit(1) - - if os.path.exists(output_path): os.remove(output_path) - - try: - input_path = input_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - - try: - audio, sample_rate = sf.read(input_path) - except: - audio, sample_rate = librosa.load(input_path, sr=None) - except Exception as e: - raise RuntimeError(f"{translations['errors_loading_audio']}: {e}") - - audio = audio.flatten() - - - try: - board = Pedalboard([HighpassFilter()]) - - if chorus: board.append(Chorus(depth=chorus_depth, rate_hz=chorus_rate, mix=chorus_mix, centre_delay_ms=chorus_delay, feedback=chorus_feedback)) - if distortion: board.append(Distortion(drive_db=distortion_drive)) - if reverb: board.append(Reverb(room_size=reverb_room_size, damping=reverb_damping, wet_level=reverb_wet_level, dry_level=reverb_dry_level, width=reverb_width, freeze_mode=1 if reverb_freeze_mode else 0)) - if pitchshift: board.append(PitchShift(semitones=pitch_shift)) - if delay: board.append(Delay(delay_seconds=delay_seconds, feedback=delay_feedback, mix=delay_mix)) - if compressor: board.append(Compressor(threshold_db=compressor_threshold, ratio=compressor_ratio, attack_ms=compressor_attack_ms, release_ms=compressor_release_ms)) - if limiter: board.append(Limiter(threshold_db=limiter_threshold, release_ms=limiter_release)) - if gain: board.append(Gain(gain_db=gain_db)) - if bitcrush: board.append(Bitcrush(bit_depth=bitcrush_bit_depth)) - if clipping: board.append(Clipping(threshold_db=clipping_threshold)) - if phaser: board.append(Phaser(rate_hz=phaser_rate_hz, depth=phaser_depth, centre_frequency_hz=phaser_centre_frequency_hz, feedback=phaser_feedback, mix=phaser_mix)) - - processed_audio = board(audio, sample_rate) - - if treble_bass_boost: - processed_audio = bass_boost(processed_audio, bass_boost_db, bass_boost_frequency, sample_rate) - processed_audio = treble_boost(processed_audio, treble_boost_db, treble_boost_frequency, sample_rate) - - if fade_in_out: - processed_audio = fade_in_effect(processed_audio, sample_rate, fade_in_duration) - processed_audio = fade_out_effect(processed_audio, sample_rate, fade_out_duration) - - if 
resample_sr != sample_rate and resample_sr > 0 and resample: - target_sr = min([8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000, 96000], key=lambda x: abs(x - resample_sr)) - processed_audio = librosa.resample(processed_audio, orig_sr=sample_rate, target_sr=target_sr, res_type="soxr_vhq") - sample_rate = target_sr - - sf.write(output_path.replace("wav", export_format), processed_audio, sample_rate, format=export_format) - - if audio_combination: pydub_convert(pydub_load(audio_combination_input)).overlay(pydub_convert(pydub_load(output_path.replace("wav", export_format)))).export(output_path.replace("wav", export_format), format=export_format) - except Exception as e: - raise RuntimeError(translations["apply_error"].format(e=e)) - - return output_path - -if __name__ == "__main__": - args = parse_arguments() - process_audio(input_path=args.input_path, output_path=args.output_path, resample=args.resample, resample_sr=args.resample_sr, chorus_depth=args.chorus_depth, chorus_rate=args.chorus_rate, chorus_mix=args.chorus_mix, chorus_delay=args.chorus_delay, chorus_feedback=args.chorus_feedback, distortion_drive=args.drive_db, reverb_room_size=args.reverb_room_size, reverb_damping=args.reverb_damping, reverb_wet_level=args.reverb_wet_level, reverb_dry_level=args.reverb_dry_level, reverb_width=args.reverb_width, reverb_freeze_mode=args.reverb_freeze_mode, pitch_shift=args.pitch_shift, delay_seconds=args.delay_seconds, delay_feedback=args.delay_feedback, delay_mix=args.delay_mix, compressor_threshold=args.compressor_threshold, compressor_ratio=args.compressor_ratio, compressor_attack_ms=args.compressor_attack_ms, compressor_release_ms=args.compressor_release_ms, limiter_threshold=args.limiter_threshold, limiter_release=args.limiter_release, gain_db=args.gain_db, bitcrush_bit_depth=args.bitcrush_bit_depth, clipping_threshold=args.clipping_threshold, phaser_rate_hz=args.phaser_rate_hz, phaser_depth=args.phaser_depth, phaser_centre_frequency_hz=args.phaser_centre_frequency_hz, phaser_feedback=args.phaser_feedback, phaser_mix=args.phaser_mix, bass_boost_db=args.bass_boost_db, bass_boost_frequency=args.bass_boost_frequency, treble_boost_db=args.treble_boost_db, treble_boost_frequency=args.treble_boost_frequency, fade_in_duration=args.fade_in_duration, fade_out_duration=args.fade_out_duration, export_format=args.export_format, chorus=args.chorus, distortion=args.distortion, reverb=args.reverb, pitchshift=args.pitchshift, delay=args.delay, compressor=args.compressor, limiter=args.limiter, gain=args.gain, bitcrush=args.bitcrush, clipping=args.clipping, phaser=args.phaser, treble_bass_boost=args.treble_bass_boost, fade_in_out=args.fade_in_out, audio_combination=args.audio_combination, audio_combination_input=args.audio_combination_input) \ No newline at end of file diff --git a/main/inference/convert.py b/main/inference/convert.py deleted file mode 100644 index 8965a046daace4e6255edfa26acad5458dd96992..0000000000000000000000000000000000000000 --- a/main/inference/convert.py +++ /dev/null @@ -1,680 +0,0 @@ -import re -import os -import sys -import time -import faiss -import torch -import shutil -import librosa -import logging -import argparse -import warnings -import onnxruntime -import logging.handlers - -import numpy as np -import soundfile as sf -import torch.nn.functional as F - -from tqdm import tqdm -from scipy import signal -from distutils.util import strtobool -from fairseq import checkpoint_utils - -warnings.filterwarnings("ignore") -sys.path.append(os.getcwd()) - -from 
main.configs.config import Config -from main.library.algorithm.synthesizers import Synthesizer -from main.library.utils import check_predictors, check_embedders, load_audio, process_audio, merge_audio - -bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) -config = Config() -translations = config.translations -logger = logging.getLogger(__name__) -logger.propagate = False - -for l in ["torch", "faiss", "httpx", "fairseq", "httpcore", "faiss.loader", "numba.core", "urllib3"]: - logging.getLogger(l).setLevel(logging.ERROR) - -if logger.hasHandlers(): logger.handlers.clear() -else: - console_handler = logging.StreamHandler() - console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - console_handler.setFormatter(console_formatter) - console_handler.setLevel(logging.INFO) - file_handler = logging.handlers.RotatingFileHandler(os.path.join("assets", "logs", "convert.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8') - file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - file_handler.setFormatter(file_formatter) - file_handler.setLevel(logging.DEBUG) - logger.addHandler(console_handler) - logger.addHandler(file_handler) - logger.setLevel(logging.DEBUG) - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--pitch", type=int, default=0) - parser.add_argument("--filter_radius", type=int, default=3) - parser.add_argument("--index_rate", type=float, default=0.5) - parser.add_argument("--volume_envelope", type=float, default=1) - parser.add_argument("--protect", type=float, default=0.33) - parser.add_argument("--hop_length", type=int, default=64) - parser.add_argument("--f0_method", type=str, default="rmvpe") - parser.add_argument("--embedder_model", type=str, default="contentvec_base.pt") - parser.add_argument("--input_path", type=str, required=True) - parser.add_argument("--output_path", type=str, default="./audios/output.wav") - parser.add_argument("--export_format", type=str, default="wav") - parser.add_argument("--pth_path", type=str, required=True) - parser.add_argument("--index_path", type=str) - parser.add_argument("--f0_autotune", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--f0_autotune_strength", type=float, default=1) - parser.add_argument("--clean_audio", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--clean_strength", type=float, default=0.7) - parser.add_argument("--resample_sr", type=int, default=0) - parser.add_argument("--split_audio", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--checkpointing", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--f0_file", type=str, default=None) - parser.add_argument("--f0_onnx", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--embedders_onnx", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--formant_shifting", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--formant_qfrency", type=float, default=0.8) - parser.add_argument("--formant_timbre", type=float, default=0.8) - - return parser.parse_args() - -def main(): - args = parse_arguments() - pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, f0_autotune_strength, clean_audio, clean_strength, 
export_format, embedder_model, resample_sr, split_audio, checkpointing, f0_file, f0_onnx, embedders_onnx, formant_shifting, formant_qfrency, formant_timbre = args.pitch, args.filter_radius, args.index_rate, args.volume_envelope, args.protect, args.hop_length, args.f0_method, args.input_path, args.output_path, args.pth_path, args.index_path, args.f0_autotune, args.f0_autotune_strength, args.clean_audio, args.clean_strength, args.export_format, args.embedder_model, args.resample_sr, args.split_audio, args.checkpointing, args.f0_file, args.f0_onnx, args.embedders_onnx, args.formant_shifting, args.formant_qfrency, args.formant_timbre - - log_data = {translations['pitch']: pitch, translations['filter_radius']: filter_radius, translations['index_strength']: index_rate, translations['volume_envelope']: volume_envelope, translations['protect']: protect, "Hop length": hop_length, translations['f0_method']: f0_method, translations['audio_path']: input_path, translations['output_path']: output_path.replace('wav', export_format), translations['model_path']: pth_path, translations['indexpath']: index_path, translations['autotune']: f0_autotune, translations['clear_audio']: clean_audio, translations['export_format']: export_format, translations['hubert_model']: embedder_model, translations['split_audio']: split_audio, translations['memory_efficient_training']: checkpointing, translations["f0_onnx_mode"]: f0_onnx, translations["embed_onnx"]: embedders_onnx} - - if clean_audio: log_data[translations['clean_strength']] = clean_strength - if resample_sr != 0: log_data[translations['sample_rate']] = resample_sr - if f0_autotune: log_data[translations['autotune_rate_info']] = f0_autotune_strength - if f0_file and os.path.isfile(f0_file): log_data[translations['f0_file']] = f0_file - if formant_shifting: - log_data[translations['formant_qfrency']] = formant_qfrency - log_data[translations['formant_timbre']] = formant_timbre - - for key, value in log_data.items(): - logger.debug(f"{key}: {value}") - - run_convert_script(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, volume_envelope=volume_envelope, protect=protect, hop_length=hop_length, f0_method=f0_method, input_path=input_path, output_path=output_path, pth_path=pth_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, split_audio=split_audio, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, embedders_onnx=embedders_onnx, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre) - -def run_batch_convert(params): - path, audio_temp, export_format, cut_files, pitch, filter_radius, index_rate, volume_envelope, protect, hop_length, f0_method, pth_path, index_path, f0_autotune, f0_autotune_strength, clean_audio, clean_strength, embedder_model, resample_sr, checkpointing, f0_file, f0_onnx, formant_shifting, formant_qfrency, formant_timbre = params["path"], params["audio_temp"], params["export_format"], params["cut_files"], params["pitch"], params["filter_radius"], params["index_rate"], params["volume_envelope"], params["protect"], params["hop_length"], params["f0_method"], params["pth_path"], params["index_path"], params["f0_autotune"], params["f0_autotune_strength"], params["clean_audio"], params["clean_strength"], params["embedder_model"], params["resample_sr"], params["checkpointing"], params["f0_file"], params["f0_onnx"], 
params["formant_shifting"], params["formant_qfrency"], params["formant_timbre"] - - segment_output_path = os.path.join(audio_temp, f"output_{cut_files.index(path)}.{export_format}") - if os.path.exists(segment_output_path): os.remove(segment_output_path) - - VoiceConverter().convert_audio(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, volume_envelope=volume_envelope, protect=protect, hop_length=hop_length, f0_method=f0_method, audio_input_path=path, audio_output_path=segment_output_path, model_path=pth_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre) - os.remove(path) - - if os.path.exists(segment_output_path): return segment_output_path - else: - logger.warning(f"{translations['not_found_convert_file']}: {segment_output_path}") - sys.exit(1) - -def run_convert_script(pitch=0, filter_radius=3, index_rate=0.5, volume_envelope=1, protect=0.5, hop_length=64, f0_method="rmvpe", input_path=None, output_path="./output.wav", pth_path=None, index_path=None, f0_autotune=False, f0_autotune_strength=1, clean_audio=False, clean_strength=0.7, export_format="wav", embedder_model="contentvec_base.pt", resample_sr=0, split_audio=False, checkpointing=False, f0_file=None, f0_onnx=False, embedders_onnx=False, formant_shifting=False, formant_qfrency=0.8, formant_timbre=0.8): - check_predictors(f0_method, f0_onnx); check_embedders(embedder_model, embedders_onnx) - embedder_model += ".onnx" if embedders_onnx else ".pt" - - cvt = VoiceConverter() - start_time = time.time() - - pid_path = os.path.join("assets", "convert_pid.txt") - with open(pid_path, "w") as pid_file: - pid_file.write(str(os.getpid())) - - if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith((".pth", ".onnx")): - logger.warning(translations["provide_file"].format(filename=translations["model"])) - sys.exit(1) - - processed_segments = [] - audio_temp = os.path.join("audios_temp") - if not os.path.exists(audio_temp) and split_audio: os.makedirs(audio_temp, exist_ok=True) - - if os.path.isdir(input_path): - try: - logger.info(translations["convert_batch"]) - audio_files = [f for f in os.listdir(input_path) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))] - - if not audio_files: - logger.warning(translations["not_found_audio"]) - sys.exit(1) - - logger.info(translations["found_audio"].format(audio_files=len(audio_files))) - - for audio in audio_files: - audio_path = os.path.join(input_path, audio) - output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}") - - if split_audio: - try: - cut_files, time_stamps = process_audio(logger, audio_path, audio_temp) - params_list = [{"path": path, "audio_temp": audio_temp, "export_format": export_format, "cut_files": cut_files, "pitch": pitch, "filter_radius": filter_radius, "index_rate": index_rate, "volume_envelope": volume_envelope, "protect": protect, "hop_length": hop_length, "f0_method": f0_method, "pth_path": pth_path, "index_path": index_path, "f0_autotune": f0_autotune, "f0_autotune_strength": f0_autotune_strength, "clean_audio": clean_audio, "clean_strength": clean_strength, 
"embedder_model": embedder_model, "resample_sr": resample_sr, "checkpointing": checkpointing, "f0_file": f0_file, "f0_onnx": f0_onnx, "formant_shifting": formant_shifting, "formant_qfrency": formant_qfrency, "formant_timbre": formant_timbre} for path in cut_files] - - with tqdm(total=len(params_list), desc=translations["convert_audio"], ncols=100, unit="a") as pbar: - for params in params_list: - processed_segments.append(run_batch_convert(params)) - pbar.update(1) - logger.debug(pbar.format_meter(pbar.n, pbar.total, pbar.format_dict["elapsed"])) - - merge_audio(processed_segments, time_stamps, audio_path, output_audio, export_format) - except Exception as e: - logger.error(translations["error_convert_batch"].format(e=e)) - finally: - if os.path.exists(audio_temp): shutil.rmtree(audio_temp, ignore_errors=True) - else: - try: - logger.info(f"{translations['convert_audio']} '{audio_path}'...") - if os.path.exists(output_audio): os.remove(output_audio) - - with tqdm(total=1, desc=translations["convert_audio"], ncols=100, unit="a") as pbar: - cvt.convert_audio(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, volume_envelope=volume_envelope, protect=protect, hop_length=hop_length, f0_method=f0_method, audio_input_path=audio_path, audio_output_path=output_audio, model_path=pth_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre) - pbar.update(1) - logger.debug(pbar.format_meter(pbar.n, pbar.total, pbar.format_dict["elapsed"])) - except Exception as e: - logger.error(translations["error_convert"].format(e=e)) - - logger.info(translations["convert_batch_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}", output_path=output_path.replace('wav', export_format))) - except Exception as e: - logger.error(translations["error_convert_batch_2"].format(e=e)) - else: - logger.info(f"{translations['convert_audio']} '{input_path}'...") - if not os.path.exists(input_path): - logger.warning(translations["not_found_audio"]) - sys.exit(1) - - if os.path.exists(output_path): os.remove(output_path) - - if split_audio: - try: - cut_files, time_stamps = process_audio(logger, input_path, audio_temp) - params_list = [{"path": path, "audio_temp": audio_temp, "export_format": export_format, "cut_files": cut_files, "pitch": pitch, "filter_radius": filter_radius, "index_rate": index_rate, "volume_envelope": volume_envelope, "protect": protect, "hop_length": hop_length, "f0_method": f0_method, "pth_path": pth_path, "index_path": index_path, "f0_autotune": f0_autotune, "f0_autotune_strength": f0_autotune_strength, "clean_audio": clean_audio, "clean_strength": clean_strength, "embedder_model": embedder_model, "resample_sr": resample_sr, "checkpointing": checkpointing, "f0_file": f0_file, "f0_onnx": f0_onnx, "formant_shifting": formant_shifting, "formant_qfrency": formant_qfrency, "formant_timbre": formant_timbre} for path in cut_files] - - with tqdm(total=len(params_list), desc=translations["convert_audio"], ncols=100, unit="a") as pbar: - for params in params_list: - processed_segments.append(run_batch_convert(params)) - pbar.update(1) - logger.debug(pbar.format_meter(pbar.n, pbar.total, pbar.format_dict["elapsed"])) - - merge_audio(processed_segments, 
time_stamps, input_path, output_path.replace("wav", export_format), export_format) - except Exception as e: - logger.error(translations["error_convert_batch"].format(e=e)) - finally: - if os.path.exists(audio_temp): shutil.rmtree(audio_temp, ignore_errors=True) - else: - try: - with tqdm(total=1, desc=translations["convert_audio"], ncols=100, unit="a") as pbar: - cvt.convert_audio(pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, volume_envelope=volume_envelope, protect=protect, hop_length=hop_length, f0_method=f0_method, audio_input_path=input_path, audio_output_path=output_path, model_path=pth_path, index_path=index_path, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, embedder_model=embedder_model, resample_sr=resample_sr, checkpointing=checkpointing, f0_file=f0_file, f0_onnx=f0_onnx, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre) - pbar.update(1) - logger.debug(pbar.format_meter(pbar.n, pbar.total, pbar.format_dict["elapsed"])) - except Exception as e: - logger.error(translations["error_convert"].format(e=e)) - - if os.path.exists(pid_path): os.remove(pid_path) - logger.info(translations["convert_audio_success"].format(input_path=input_path, elapsed_time=f"{(time.time() - start_time):.2f}", output_path=output_path.replace('wav', export_format))) - -def change_rms(source_audio, source_rate, target_audio, target_rate, rate): - rms2 = F.interpolate(torch.from_numpy(librosa.feature.rms(y=target_audio, frame_length=target_rate // 2 * 2, hop_length=target_rate // 2)).float().unsqueeze(0), size=target_audio.shape[0], mode="linear").squeeze() - return (target_audio * (torch.pow(F.interpolate(torch.from_numpy(librosa.feature.rms(y=source_audio, frame_length=source_rate // 2 * 2, hop_length=source_rate // 2)).float().unsqueeze(0), size=target_audio.shape[0], mode="linear").squeeze(), 1 - rate) * torch.pow(torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6), rate - 1)).numpy()) - -def get_providers(): - ort_providers = onnxruntime.get_available_providers() - - if "CUDAExecutionProvider" in ort_providers: providers = ["CUDAExecutionProvider"] - elif "CoreMLExecutionProvider" in ort_providers: providers = ["CoreMLExecutionProvider"] - else: providers = ["CPUExecutionProvider"] - - return providers - -class Autotune: - def __init__(self, ref_freqs): - self.ref_freqs = ref_freqs - self.note_dict = self.ref_freqs - - def autotune_f0(self, f0, f0_autotune_strength): - autotuned_f0 = np.zeros_like(f0) - - for i, freq in enumerate(f0): - autotuned_f0[i] = freq + (min(self.note_dict, key=lambda x: abs(x - freq)) - freq) * f0_autotune_strength - - return autotuned_f0 - -class VC: - def __init__(self, tgt_sr, config): - self.x_pad = config.x_pad - self.x_query = config.x_query - self.x_center = config.x_center - self.x_max = config.x_max - self.sample_rate = 16000 - self.window = 160 - self.t_pad = self.sample_rate * self.x_pad - self.t_pad_tgt = tgt_sr * self.x_pad - self.t_pad2 = self.t_pad * 2 - self.t_query = self.sample_rate * self.x_query - self.t_center = self.sample_rate * self.x_center - self.t_max = self.sample_rate * self.x_max - self.time_step = self.window / self.sample_rate * 1000 - self.f0_min = 50 - self.f0_max = 1100 - self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) - self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - self.device = config.device - self.ref_freqs = [49.00, 51.91, 55.00, 58.27, 61.74, 65.41, 
69.30, 73.42, 77.78, 82.41, 87.31, 92.50, 98.00, 103.83, 110.00, 116.54, 123.47, 130.81, 138.59, 146.83, 155.56, 164.81, 174.61, 185.00, 196.00, 207.65, 220.00, 233.08, 246.94, 261.63, 277.18, 293.66, 311.13, 329.63, 349.23, 369.99, 392.00, 415.30, 440.00, 466.16, 493.88, 523.25, 554.37, 587.33, 622.25, 659.25, 698.46, 739.99, 783.99, 830.61, 880.00, 932.33, 987.77, 1046.50] - self.autotune = Autotune(self.ref_freqs) - self.note_dict = self.autotune.note_dict - - def get_f0_pm(self, x, p_len): - import parselmouth - - f0 = (parselmouth.Sound(x, self.sample_rate).to_pitch_ac(time_step=self.window / self.sample_rate * 1000 / 1000, voicing_threshold=0.6, pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array["frequency"]) - pad_size = (p_len - len(f0) + 1) // 2 - - if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") - return f0 - - def get_f0_mangio_crepe(self, x, p_len, hop_length, model="full", onnx=False): - from main.library.predictors.CREPE import predict - - x = x.astype(np.float32) - x /= np.quantile(np.abs(x), 0.999) - - audio = torch.unsqueeze(torch.from_numpy(x).to(self.device, copy=True), dim=0) - if audio.ndim == 2 and audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True).detach() - - p_len = p_len or x.shape[0] // hop_length - source = np.array(predict(audio.detach(), self.sample_rate, hop_length, self.f0_min, self.f0_max, model, batch_size=hop_length * 2, device=self.device, pad=True, providers=get_providers(), onnx=onnx).squeeze(0).cpu().float().numpy()) - source[source < 0.001] = np.nan - - return np.nan_to_num(np.interp(np.arange(0, len(source) * p_len, len(source)) / p_len, np.arange(0, len(source)), source)) - - def get_f0_crepe(self, x, model="full", onnx=False): - from main.library.predictors.CREPE import predict, mean, median - - f0, pd = predict(torch.tensor(np.copy(x))[None].float(), self.sample_rate, self.window, self.f0_min, self.f0_max, model, batch_size=512, device=self.device, return_periodicity=True, providers=get_providers(), onnx=onnx) - f0, pd = mean(f0, 3), median(pd, 3) - f0[pd < 0.1] = 0 - - return f0[0].cpu().numpy() - - def get_f0_fcpe(self, x, p_len, hop_length, onnx=False, legacy=False): - from main.library.predictors.FCPE import FCPE - - model_fcpe = FCPE(os.path.join("assets", "models", "predictors", ("fcpe_legacy" if legacy else"fcpe") + (".onnx" if onnx else ".pt")), hop_length=int(hop_length), f0_min=int(self.f0_min), f0_max=int(self.f0_max), dtype=torch.float32, device=self.device, sample_rate=self.sample_rate, threshold=0.03, providers=get_providers(), onnx=onnx, legacy=legacy) - f0 = model_fcpe.compute_f0(x, p_len=p_len) - - del model_fcpe - return f0 - - def get_f0_rmvpe(self, x, legacy=False, onnx=False): - from main.library.predictors.RMVPE import RMVPE - - rmvpe_model = RMVPE(os.path.join("assets", "models", "predictors", "rmvpe" + (".onnx" if onnx else ".pt")), device=self.device, onnx=onnx, providers=get_providers()) - f0 = rmvpe_model.infer_from_audio_with_pitch(x, thred=0.03, f0_min=self.f0_min, f0_max=self.f0_max) if legacy else rmvpe_model.infer_from_audio(x, thred=0.03) - - del rmvpe_model - return f0 - - def get_f0_pyworld_wrapper(self, x, filter_radius, model="harvest"): - from main.library.predictors.WORLD_WRAPPER import PYWORLD - - pw = PYWORLD() - x = x.astype(np.double) - - if model == "harvest": f0, t = pw.harvest(x, fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10) - elif model == "dio": f0, t = 
pw.dio(x, fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10) - else: raise ValueError(translations["method_not_valid"]) - - f0 = pw.stonemask(x, self.sample_rate, t, f0) - - if filter_radius > 2 or model == "dio": f0 = signal.medfilt(f0, 3) - return f0 - - def get_f0_pyworld(self, x, filter_radius, model="harvest"): - from main.library.predictors.pyworld import harvest, dio, stonemask - - x = x.astype(np.double) - - if model == "harvest": f0, t = harvest.harvest(x, fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10) - elif model == "dio": f0, t = dio.dio(x, fs=self.sample_rate, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=10) - else: raise ValueError(translations["method_not_valid"]) - - f0 = stonemask.stonemask(x, self.sample_rate, t, f0) - - if filter_radius > 2 or model == "dio": f0 = signal.medfilt(f0, 3) - return f0 - - def get_f0_swipe(self, x): - from main.library.predictors.SWIPE import swipe - - f0, _ = swipe(x.astype(np.double), self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, frame_period=10, device=self.device) - return f0 - - def get_f0_yin(self, x, hop_length, p_len): - source = np.array(librosa.yin(x.astype(np.float32), sr=self.sample_rate, fmin=self.f0_min, fmax=self.f0_max, hop_length=hop_length)) - source[source < 0.001] = np.nan - - return np.nan_to_num(np.interp(np.arange(0, len(source) * p_len, len(source)) / p_len, np.arange(0, len(source)), source)) - - def get_f0_pyin(self, x, hop_length, p_len): - f0, _, _ = librosa.pyin(x.astype(np.float32), fmin=self.f0_min, fmax=self.f0_max, sr=self.sample_rate, hop_length=hop_length) - source = np.array(f0) - source[source < 0.001] = np.nan - - return np.nan_to_num(np.interp(np.arange(0, len(source) * p_len, len(source)) / p_len, np.arange(0, len(source)), source)) - - def get_f0_hybrid(self, methods_str, x, p_len, hop_length, filter_radius, onnx_mode): - methods_str = re.search("hybrid\[(.+)\]", methods_str) - if methods_str: methods = [method.strip() for method in methods_str.group(1).split("+")] - - f0_computation_stack, resampled_stack = [], [] - logger.debug(translations["hybrid_methods"].format(methods=methods)) - - x = x.astype(np.float32) - x /= np.quantile(np.abs(x), 0.999) - - for method in methods: - f0 = None - f0_methods = {"pm": lambda: self.get_f0_pm(x, p_len), "diow": lambda: self.get_f0_pyworld_wrapper(x, filter_radius, "dio"), "dio": lambda: self.get_f0_pyworld(x, filter_radius, "dio"), "mangio-crepe-tiny": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "tiny", onnx=onnx_mode), "mangio-crepe-small": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "small", onnx=onnx_mode), "mangio-crepe-medium": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "medium", onnx=onnx_mode), "mangio-crepe-large": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "large", onnx=onnx_mode), "mangio-crepe-full": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "full", onnx=onnx_mode), "crepe-tiny": lambda: self.get_f0_crepe(x, "tiny", onnx=onnx_mode), "crepe-small": lambda: self.get_f0_crepe(x, "small", onnx=onnx_mode), "crepe-medium": lambda: self.get_f0_crepe(x, "medium", onnx=onnx_mode), "crepe-large": lambda: self.get_f0_crepe(x, "large", onnx=onnx_mode), "crepe-full": lambda: self.get_f0_crepe(x, "full", onnx=onnx_mode), "fcpe": lambda: self.get_f0_fcpe(x, p_len, int(hop_length), onnx=onnx_mode), "fcpe-legacy": lambda: self.get_f0_fcpe(x, p_len, int(hop_length), legacy=True, onnx=onnx_mode), 
"rmvpe": lambda: self.get_f0_rmvpe(x, onnx=onnx_mode), "rmvpe-legacy": lambda: self.get_f0_rmvpe(x, legacy=True, onnx=onnx_mode), "harvestw": lambda: self.get_f0_pyworld_wrapper(x, filter_radius, "harvest"), "harvest": lambda: self.get_f0_pyworld(x, filter_radius, "harvest"), "yin": lambda: self.get_f0_yin(x, int(hop_length), p_len), "pyin": lambda: self.get_f0_pyin(x, int(hop_length), p_len), "swipe": lambda: self.get_f0_swipe(x)} - f0 = f0_methods.get(method, lambda: ValueError(translations["method_not_valid"]))() - f0_computation_stack.append(f0) - - for f0 in f0_computation_stack: - resampled_stack.append(np.interp(np.linspace(0, len(f0), p_len), np.arange(len(f0)), f0)) - - return resampled_stack[0] if len(resampled_stack) == 1 else np.nanmedian(np.vstack(resampled_stack), axis=0) - - def get_f0(self, x, p_len, pitch, f0_method, filter_radius, hop_length, f0_autotune, f0_autotune_strength, inp_f0=None, onnx_mode=False): - f0_methods = {"pm": lambda: self.get_f0_pm(x, p_len), "diow": lambda: self.get_f0_pyworld_wrapper(x, filter_radius, "dio"), "dio": lambda: self.get_f0_pyworld(x, filter_radius, "dio"), "mangio-crepe-tiny": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "tiny", onnx=onnx_mode), "mangio-crepe-small": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "small", onnx=onnx_mode), "mangio-crepe-medium": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "medium", onnx=onnx_mode), "mangio-crepe-large": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "large", onnx=onnx_mode), "mangio-crepe-full": lambda: self.get_f0_mangio_crepe(x, p_len, int(hop_length), "full", onnx=onnx_mode), "crepe-tiny": lambda: self.get_f0_crepe(x, "tiny", onnx=onnx_mode), "crepe-small": lambda: self.get_f0_crepe(x, "small", onnx=onnx_mode), "crepe-medium": lambda: self.get_f0_crepe(x, "medium", onnx=onnx_mode), "crepe-large": lambda: self.get_f0_crepe(x, "large", onnx=onnx_mode), "crepe-full": lambda: self.get_f0_crepe(x, "full", onnx=onnx_mode), "fcpe": lambda: self.get_f0_fcpe(x, p_len, int(hop_length), onnx=onnx_mode), "fcpe-legacy": lambda: self.get_f0_fcpe(x, p_len, int(hop_length), legacy=True, onnx=onnx_mode), "rmvpe": lambda: self.get_f0_rmvpe(x, onnx=onnx_mode), "rmvpe-legacy": lambda: self.get_f0_rmvpe(x, legacy=True, onnx=onnx_mode), "harvestw": lambda: self.get_f0_pyworld_wrapper(x, filter_radius, "harvest"), "harvest": lambda: self.get_f0_pyworld(x, filter_radius, "harvest"), "yin": lambda: self.get_f0_yin(x, int(hop_length), p_len), "pyin": lambda: self.get_f0_pyin(x, int(hop_length), p_len), "swipe": lambda: self.get_f0_swipe(x)} - f0 = self.get_f0_hybrid(f0_method, x, p_len, hop_length, filter_radius, onnx_mode) if "hybrid" in f0_method else f0_methods.get(f0_method, lambda: ValueError(translations["method_not_valid"]))() - - if f0_autotune: f0 = Autotune.autotune_f0(self, f0, f0_autotune_strength) - if isinstance(f0, tuple): f0 = f0[0] - - f0 *= pow(2, pitch / 12) - tf0 = self.sample_rate // self.window - - if inp_f0 is not None: - replace_f0 = np.interp(list(range(np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1).astype(np.int16))), inp_f0[:, 0] * 100, inp_f0[:, 1]) - f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[:f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]] - - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (self.f0_mel_max - self.f0_mel_min) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - - return 
np.rint(f0_mel).astype(np.int32), f0.copy() - - def extract_features(self, model, feats, version): - return torch.as_tensor(model.run([model.get_outputs()[0].name, model.get_outputs()[1].name], {"feats": feats.detach().cpu().numpy()})[0 if version == "v1" else 1], dtype=torch.float32, device=feats.device) - - def voice_conversion(self, model, net_g, sid, audio0, pitch, pitchf, index, big_npy, index_rate, version, protect): - pitch_guidance = pitch != None and pitchf != None - feats = torch.from_numpy(audio0).float() - - if feats.dim() == 2: feats = feats.mean(-1) - assert feats.dim() == 1, feats.dim() - - feats = feats.view(1, -1) - if self.embed_suffix == ".pt": - padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - inputs = {"source": feats.to(self.device), "padding_mask": padding_mask, "output_layer": 9 if version == "v1" else 12} - - with torch.no_grad(): - if self.embed_suffix == ".pt": - logits = model.extract_features(**inputs) - feats = model.final_proj(logits[0]) if version == "v1" else logits[0] - else: feats = self.extract_features(model, feats, version).to(self.device) - - if protect < 0.5 and pitch_guidance: feats0 = feats.clone() - - if (not isinstance(index, type(None)) and not isinstance(big_npy, type(None)) and index_rate != 0): - npy = feats[0].cpu().numpy() - score, ix = index.search(npy, k=8) - - weight = np.square(1 / score) - weight /= weight.sum(axis=1, keepdims=True) - - npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - feats = (torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats) - - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) - if protect < 0.5 and pitch_guidance: feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) - - p_len = audio0.shape[0] // self.window - - if feats.shape[1] < p_len: - p_len = feats.shape[1] - - if pitch_guidance: - pitch = pitch[:, :p_len] - pitchf = pitchf[:, :p_len] - - if protect < 0.5 and pitch_guidance: - pitchff = pitchf.clone() - pitchff[pitchf > 0] = 1 - pitchff[pitchf < 1] = protect - pitchff = pitchff.unsqueeze(-1) - - feats = feats * pitchff + feats0 * (1 - pitchff) - feats = feats.to(feats0.dtype) - - p_len = torch.tensor([p_len], device=self.device).long() - audio1 = ((net_g.infer(feats.float(), p_len, pitch if pitch_guidance else None, pitchf.float() if pitch_guidance else None, sid)[0][0, 0]).data.cpu().float().numpy()) if self.suffix == ".pth" else (net_g.run([net_g.get_outputs()[0].name], ({net_g.get_inputs()[0].name: feats.cpu().numpy().astype(np.float32), net_g.get_inputs()[1].name: p_len.cpu().numpy(), net_g.get_inputs()[2].name: np.array([sid.cpu().item()], dtype=np.int64), net_g.get_inputs()[3].name: np.random.randn(1, 192, p_len).astype(np.float32), net_g.get_inputs()[4].name: pitch.cpu().numpy().astype(np.int64), net_g.get_inputs()[5].name: pitchf.cpu().numpy().astype(np.float32)} if pitch_guidance else {net_g.get_inputs()[0].name: feats.cpu().numpy().astype(np.float32), net_g.get_inputs()[1].name: p_len.cpu().numpy(), net_g.get_inputs()[2].name: np.array([sid.cpu().item()], dtype=np.int64), net_g.get_inputs()[3].name: np.random.randn(1, 192, p_len).astype(np.float32)}))[0][0, 0]) - - if self.embed_suffix == ".pt": del padding_mask - del feats, p_len, net_g - if torch.cuda.is_available(): torch.cuda.empty_cache() - elif torch.backends.mps.is_available(): torch.mps.empty_cache() - - return audio1 - - def pipeline(self, model, net_g, sid, audio, pitch, f0_method, file_index, 
index_rate, pitch_guidance, filter_radius, tgt_sr, resample_sr, volume_envelope, version, protect, hop_length, f0_autotune, f0_autotune_strength, suffix, embed_suffix, f0_file=None, f0_onnx=False): - self.suffix = suffix - self.embed_suffix = embed_suffix - - if file_index != "" and os.path.exists(file_index) and index_rate != 0: - try: - index = faiss.read_index(file_index) - big_npy = index.reconstruct_n(0, index.ntotal) - except Exception as e: - logger.error(translations["read_faiss_index_error"].format(e=e)) - index = big_npy = None - else: index = big_npy = None - - opt_ts, audio_opt = [], [] - audio = signal.filtfilt(bh, ah, audio) - audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") - - if audio_pad.shape[0] > self.t_max: - audio_sum = np.zeros_like(audio) - - for i in range(self.window): - audio_sum += audio_pad[i : i - self.window] - - for t in range(self.t_center, audio.shape[0], self.t_center): - opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query : t + self.t_query]) == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min())[0][0]) - - s = 0 - t, inp_f0 = None, None - - audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") - sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() - p_len = audio_pad.shape[0] // self.window - - if hasattr(f0_file, "name"): - try: - with open(f0_file.name, "r") as f: - raw_lines = f.read() - - if len(raw_lines) > 0: - inp_f0 = [] - - for line in raw_lines.strip("\n").split("\n"): - inp_f0.append([float(i) for i in line.split(",")]) - - inp_f0 = np.array(inp_f0, dtype=np.float32) - except: - logger.error(translations["error_readfile"]) - inp_f0 = None - - if pitch_guidance: - pitch, pitchf = self.get_f0(audio_pad, p_len, pitch, f0_method, filter_radius, hop_length, f0_autotune, f0_autotune_strength, inp_f0, onnx_mode=f0_onnx) - pitch, pitchf = pitch[:p_len], pitchf[:p_len] - - if self.device == "mps": pitchf = pitchf.astype(np.float32) - - pitch, pitchf = torch.tensor(pitch, device=self.device).unsqueeze(0).long(), torch.tensor(pitchf, device=self.device).unsqueeze(0).float() - - for t in opt_ts: - t = t // self.window * self.window - audio_opt.append(self.voice_conversion(model, net_g, sid, audio_pad[s : t + self.t_pad2 + self.window], pitch[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None, pitchf[:, s // self.window : (t + self.t_pad2) // self.window] if pitch_guidance else None, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt]) - s = t - - audio_opt.append(self.voice_conversion(model, net_g, sid, audio_pad[t:], (pitch[:, t // self.window :] if t is not None else pitch) if pitch_guidance else None, (pitchf[:, t // self.window :] if t is not None else pitchf) if pitch_guidance else None, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt]) - audio_opt = np.concatenate(audio_opt) - - if volume_envelope != 1: audio_opt = change_rms(audio, self.sample_rate, audio_opt, tgt_sr, volume_envelope) - if resample_sr >= self.sample_rate and tgt_sr != resample_sr: audio_opt = librosa.resample(audio_opt, orig_sr=tgt_sr, target_sr=resample_sr, res_type="soxr_vhq") - - audio_max = np.abs(audio_opt).max() / 0.99 - if audio_max > 1: audio_opt /= audio_max - - if pitch_guidance: del pitch, pitchf - del sid - - if torch.cuda.is_available(): torch.cuda.empty_cache() - elif torch.backends.mps.is_available(): torch.mps.empty_cache() - - return audio_opt - -class VoiceConverter: - def 
__init__(self): - self.config = config - self.hubert_model = None - self.tgt_sr = None - self.net_g = None - self.vc = None - self.cpt = None - self.version = None - self.n_spk = None - self.use_f0 = None - self.loaded_model = None - self.vocoder = "Default" - self.checkpointing = False - - def load_embedders(self, embedder_model): - embedder_model_path = os.path.join("assets", "models", "embedders", embedder_model) - if not os.path.exists(embedder_model_path) and not embedder_model.endswith((".pt", ".onnx")): raise FileNotFoundError(f"{translations['not_found'].format(name=translations['model'])}: {embedder_model}") - - try: - if embedder_model.endswith(".pt"): - models, _, _ = checkpoint_utils.load_model_ensemble_and_task([embedder_model_path], suffix="") - self.embed_suffix = ".pt" - self.hubert_model = models[0].to(self.config.device).float().eval() - else: - sess_options = onnxruntime.SessionOptions() - sess_options.log_severity_level = 3 - self.embed_suffix = ".onnx" - self.hubert_model = onnxruntime.InferenceSession(embedder_model_path, sess_options=sess_options, providers=get_providers()) - except Exception as e: - logger.error(translations["read_model_error"].format(e=e)) - - def convert_audio(self, audio_input_path, audio_output_path, model_path, index_path, embedder_model, pitch, f0_method, index_rate, volume_envelope, protect, hop_length, f0_autotune, f0_autotune_strength, filter_radius, clean_audio, clean_strength, export_format, resample_sr = 0, sid = 0, checkpointing = False, f0_file = None, f0_onnx = False, formant_shifting = False, formant_qfrency=0.8, formant_timbre=0.8): - try: - self.get_vc(model_path, sid) - audio = load_audio(logger, audio_input_path, 16000, formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre) - self.checkpointing = checkpointing - - audio_max = np.abs(audio).max() / 0.95 - if audio_max > 1: audio /= audio_max - - if not self.hubert_model: self.load_embedders(embedder_model) - if self.tgt_sr != resample_sr >= 16000: self.tgt_sr = resample_sr - - target_sr = min([8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000, 96000], key=lambda x: abs(x - self.tgt_sr)) - audio_output = self.vc.pipeline(model=self.hubert_model, net_g=self.net_g, sid=sid, audio=audio, pitch=pitch, f0_method=f0_method, file_index=(index_path.strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added")), index_rate=index_rate, pitch_guidance=self.use_f0, filter_radius=filter_radius, tgt_sr=self.tgt_sr, resample_sr=target_sr, volume_envelope=volume_envelope, version=self.version, protect=protect, hop_length=hop_length, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, suffix=self.suffix, embed_suffix=self.embed_suffix, f0_file=f0_file, f0_onnx=f0_onnx) - - if clean_audio: - from main.tools.noisereduce import reduce_noise - audio_output = reduce_noise(y=audio_output, sr=target_sr, prop_decrease=clean_strength, device=config.device) - - sf.write(audio_output_path, audio_output, target_sr, format=export_format) - except Exception as e: - logger.error(translations["error_convert"].format(e=e)) - import traceback - logger.debug(traceback.format_exc()) - - def get_vc(self, weight_root, sid): - if sid == "" or sid == []: - self.cleanup() - if torch.cuda.is_available(): torch.cuda.empty_cache() - elif torch.backends.mps.is_available(): torch.mps.empty_cache() - - if not self.loaded_model or self.loaded_model != weight_root: - self.loaded_model = weight_root - self.load_model() - if self.cpt is not None: 
self.setup()
-
-    def cleanup(self):
-        if self.hubert_model is not None:
-            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
-            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
-
-            if torch.cuda.is_available(): torch.cuda.empty_cache()
-            elif torch.backends.mps.is_available(): torch.mps.empty_cache()
-
-        self.net_g = self.cpt = None  # reassign instead of `del` so a second cleanup() call cannot raise AttributeError
-        if torch.cuda.is_available(): torch.cuda.empty_cache()
-        elif torch.backends.mps.is_available(): torch.mps.empty_cache()
-
-    def load_model(self):
-        if os.path.isfile(self.loaded_model):
-            if self.loaded_model.endswith(".pth"): self.cpt = torch.load(self.loaded_model, map_location="cpu")
-            else:
-                sess_options = onnxruntime.SessionOptions()
-                sess_options.log_severity_level = 3
-                self.cpt = onnxruntime.InferenceSession(self.loaded_model, sess_options=sess_options, providers=get_providers())
-        else: self.cpt = None
-
-    def setup(self):
-        if self.cpt is not None:
-            if self.loaded_model.endswith(".pth"):
-                self.tgt_sr = self.cpt["config"][-1]
-                self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
-                self.use_f0 = self.cpt.get("f0", 1)
-                self.version = self.cpt.get("version", "v1")
-                self.vocoder = self.cpt.get("vocoder", "Default")
-
-                self.net_g = Synthesizer(*self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=768 if self.version == "v2" else 256, vocoder=self.vocoder, checkpointing=self.checkpointing)
-                del self.net_g.enc_q
-
-                self.net_g.load_state_dict(self.cpt["weight"], strict=False)
-                self.net_g.eval().to(self.config.device).float()
-
-                self.n_spk = self.cpt["config"][-3]
-                self.suffix = ".pth"
-            else:
-                import json
-                import onnx
-
-                model = onnx.load(self.loaded_model)
-                metadata_dict = {}  # default to an empty dict so the .get() calls below are safe when no "model_info" prop exists
-
-                for prop in model.metadata_props:
-                    if prop.key == "model_info":
-                        metadata_dict = json.loads(prop.value)
-                        break
-
-                self.net_g = self.cpt
-                self.tgt_sr = metadata_dict.get("sr", 32000)
-                self.use_f0 = metadata_dict.get("f0", 1)
-                self.suffix = ".onnx"
-
-            self.vc = VC(self.tgt_sr, self.config)
-
-if __name__ == "__main__": main()
\ No newline at end of file
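A minimal sketch (not part of the original sources) of how the "model_info" metadata that VC.setup() reads above can be attached to an exported ONNX model; the "sr"/"f0" keys mirror the defaults used there, and the file name is hypothetical:

import json
import onnx

model = onnx.load("model.onnx")  # hypothetical exported synthesizer
prop = model.metadata_props.add()  # metadata_props is a repeated key/value field
prop.key, prop.value = "model_info", json.dumps({"sr": 32000, "f0": 1})
onnx.save(model, "model.onnx")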
datefmt="%Y-%m-%d %H:%M:%S") - file_handler.setFormatter(file_formatter) - file_handler.setLevel(logging.DEBUG) - logger.addHandler(console_handler) - logger.addHandler(file_handler) - logger.setLevel(logging.DEBUG) - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_audio", type=str, required=True) - parser.add_argument("--output_dataset", type=str, default="./dataset") - parser.add_argument("--sample_rate", type=int, default=44100) - parser.add_argument("--clean_dataset", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--clean_strength", type=float, default=0.7) - parser.add_argument("--separator_reverb", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--kim_vocal_version", type=int, default=2) - parser.add_argument("--overlap", type=float, default=0.25) - parser.add_argument("--segments_size", type=int, default=256) - parser.add_argument("--mdx_hop_length", type=int, default=1024) - parser.add_argument("--mdx_batch_size", type=int, default=1) - parser.add_argument("--denoise_mdx", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--skip", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--skip_start_audios", type=str, default="0") - parser.add_argument("--skip_end_audios", type=str, default="0") - - return parser.parse_args() - -def main(): - pid_path = os.path.join("assets", "create_dataset_pid.txt") - with open(pid_path, "w") as pid_file: - pid_file.write(str(os.getpid())) - - args = parse_arguments() - input_audio, output_dataset, sample_rate, clean_dataset, clean_strength, separator_reverb, kim_vocal_version, overlap, segments_size, hop_length, batch_size, denoise_mdx, skip, skip_start_audios, skip_end_audios = args.input_audio, args.output_dataset, args.sample_rate, args.clean_dataset, args.clean_strength, args.separator_reverb, args.kim_vocal_version, args.overlap, args.segments_size, args.mdx_hop_length, args.mdx_batch_size, args.denoise_mdx, args.skip, args.skip_start_audios, args.skip_end_audios - log_data = {translations['audio_path']: input_audio, translations['output_path']: output_dataset, translations['sr']: sample_rate, translations['clear_dataset']: clean_dataset, translations['dereveb_audio']: separator_reverb, translations['segments_size']: segments_size, translations['overlap']: overlap, "Hop length": hop_length, translations['batch_size']: batch_size, translations['denoise_mdx']: denoise_mdx, translations['skip']: skip} - - if clean_dataset: log_data[translations['clean_strength']] = clean_strength - if skip: - log_data[translations['skip_start']] = skip_start_audios - log_data[translations['skip_end']] = skip_end_audios - - for key, value in log_data.items(): - logger.debug(f"{key}: {value}") - - if kim_vocal_version not in [1, 2]: raise ValueError(translations["version_not_valid"]) - start_time = time.time() - - try: - paths = [] - - if not os.path.exists(dataset_temp): os.makedirs(dataset_temp, exist_ok=True) - urls = input_audio.replace(", ", ",").split(",") - - for url in urls: - path = downloader(url, urls.index(url)) - paths.append(path) - - if skip: - skip_start_audios = skip_start_audios.replace(", ", ",").split(",") - skip_end_audios = skip_end_audios.replace(", ", ",").split(",") - - if len(skip_start_audios) < len(paths) or len(skip_end_audios) < len(paths): - logger.warning(translations["skip len(paths) or len(skip_end_audios) > len(paths): - logger.warning(translations["skip>audio"]) - sys.exit(1) - else: - for 
-def downloader(url, name):
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-
-        ydl_opts = {"format": "bestaudio/best", "outtmpl": os.path.join(dataset_temp, f"{name}"), "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav", "preferredquality": "192"}], "no_warnings": True, "noplaylist": True, "verbose": False}  # duplicate "noplaylist" key removed
-        logger.info(f"{translations['starting_download']}: {url}...")
-
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            ydl.extract_info(url)
-            logger.info(f"{translations['download_success']}: {url}")
-
-        return os.path.join(dataset_temp, f"{name}" + ".wav")
-
-def skip_start(input_file, seconds):
-    data, sr = read(input_file)
-    total_duration = len(data) / sr
-
-    if seconds <= 0: logger.warning(translations["=<0"])
-    elif seconds >= total_duration: logger.warning(translations["skip_warning"].format(seconds=seconds, total_duration=f"{total_duration:.2f}"))
-    else:
-        logger.info(f"{translations['skip_start']}: {input_file}...")
-        write(input_file, data[int(seconds * sr):], sr)
-
-    logger.info(translations["skip_start_audio"].format(input_file=input_file))
-
-def skip_end(input_file, seconds):
-    data, sr = read(input_file)
-    total_duration = len(data) / sr
-
-    if seconds <= 0: logger.warning(translations["=<0"])
-    elif seconds > total_duration: logger.warning(translations["skip_warning"].format(seconds=seconds, total_duration=f"{total_duration:.2f}"))
-    else:
-        logger.info(f"{translations['skip_end']}: {input_file}...")
-        write(input_file, data[:-int(seconds * sr)], sr)
-
-    logger.info(translations["skip_end_audio"].format(input_file=input_file))
-
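A worked example of the slicing arithmetic used by skip_start/skip_end above (assumed: a 2-second mono clip at 16 kHz):

import numpy as np

sr = 16000
data = np.zeros(2 * sr, dtype=np.float32)  # 2.0 s of audio = 32000 samples

seconds = 0.5
print(len(data[int(seconds * sr):]))   # 24000 samples left after skip_start
print(len(data[:-int(seconds * sr)]))  # 24000 samples left after skip_end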
logger.warning(translations["output_not_valid"]) - return None - - model = f"Kim_Vocal_{version}.onnx" - output_separator = separator_main(audio_file=input, model_filename=model, output_format="wav", output_dir=output, mdx_segment_size=segments_size, mdx_overlap=overlap, mdx_batch_size=batch_size, mdx_hop_length=hop_length, mdx_enable_denoise=denoise, sample_rate=sample_rate) - - for f in output_separator: - path = os.path.join(output, f) - if not os.path.exists(path): logger.error(translations["not_found"].format(name=path)) - - if '_(Instrumental)_' in f: os.rename(path, os.path.splitext(path)[0].replace("(", "").replace(")", "") + ".wav") - elif '_(Vocals)_' in f: - rename_file = os.path.splitext(path)[0].replace("(", "").replace(")", "") + ".wav" - os.rename(path, rename_file) - - return rename_file - -def separator_reverb_audio(input, output, segments_size, overlap, denoise, hop_length, batch_size, sample_rate): - if not os.path.exists(input): - logger.warning(translations["input_not_valid"]) - return None - - if not os.path.exists(output): - logger.warning(translations["output_not_valid"]) - return None - - logger.info(f"{translations['dereverb']}: {input}...") - output_dereverb = separator_main(audio_file=input, model_filename="Reverb_HQ_By_FoxJoy.onnx", output_format="wav", output_dir=output, mdx_segment_size=segments_size, mdx_overlap=overlap, mdx_batch_size=hop_length, mdx_hop_length=batch_size, mdx_enable_denoise=denoise, sample_rate=sample_rate) - - for f in output_dereverb: - path = os.path.join(output, f) - if not os.path.exists(path): logger.error(translations["not_found"].format(name=path)) - - if '_(Reverb)_' in f: os.rename(path, os.path.splitext(path)[0].replace("(", "").replace(")", "") + ".wav") - elif '_(No Reverb)_' in f: - rename_file = os.path.splitext(path)[0].replace("(", "").replace(")", "") + ".wav" - os.rename(path, rename_file) - - logger.info(f"{translations['dereverb_success']}: {rename_file}") - return rename_file - -def separator_main(audio_file=None, model_filename="Kim_Vocal_1.onnx", output_format="wav", output_dir=".", mdx_segment_size=256, mdx_overlap=0.25, mdx_batch_size=1, mdx_hop_length=1024, mdx_enable_denoise=True, sample_rate=44100): - try: - separator = Separator(logger=logger, log_formatter=file_formatter, log_level=logging.INFO, output_dir=output_dir, output_format=output_format, output_bitrate=None, normalization_threshold=0.9, output_single_stem=None, invert_using_spec=False, sample_rate=sample_rate, mdx_params={"hop_length": mdx_hop_length, "segment_size": mdx_segment_size, "overlap": mdx_overlap, "batch_size": mdx_batch_size, "enable_denoise": mdx_enable_denoise}) - separator.load_model(model_filename=model_filename) - return separator.separate(audio_file) - except: - logger.debug(translations["default_setting"]) - separator = Separator(logger=logger, log_formatter=file_formatter, log_level=logging.INFO, output_dir=output_dir, output_format=output_format, output_bitrate=None, normalization_threshold=0.9, output_single_stem=None, invert_using_spec=False, sample_rate=44100, mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": mdx_enable_denoise}) - separator.load_model(model_filename=model_filename) - return separator.separate(audio_file) - -if __name__ == "__main__": main() \ No newline at end of file diff --git a/main/inference/create_index.py b/main/inference/create_index.py deleted file mode 100644 index 916e442780fddc651c9ef1fd07809c1237550aa4..0000000000000000000000000000000000000000 
--- a/main/inference/create_index.py +++ /dev/null @@ -1,100 +0,0 @@ -import os -import sys -import faiss -import logging -import argparse -import logging.handlers - -import numpy as np - -from multiprocessing import cpu_count -from sklearn.cluster import MiniBatchKMeans - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -translations = Config().translations - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, required=True) - parser.add_argument("--rvc_version", type=str, default="v2") - parser.add_argument("--index_algorithm", type=str, default="Auto") - - return parser.parse_args() - -def main(): - args = parse_arguments() - - exp_dir = os.path.join("assets", "logs", args.model_name) - version = args.rvc_version - index_algorithm = args.index_algorithm - logger = logging.getLogger(__name__) - - if logger.hasHandlers(): logger.handlers.clear() - else: - console_handler = logging.StreamHandler() - console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - console_handler.setFormatter(console_formatter) - console_handler.setLevel(logging.INFO) - file_handler = logging.handlers.RotatingFileHandler(os.path.join(exp_dir, "create_index.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8') - file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - file_handler.setFormatter(file_formatter) - file_handler.setLevel(logging.DEBUG) - logger.addHandler(console_handler) - logger.addHandler(file_handler) - logger.setLevel(logging.DEBUG) - - log_data = {translations['modelname']: args.model_name, translations['model_path']: exp_dir, translations['training_version']: version, translations['index_algorithm_info']: index_algorithm} - for key, value in log_data.items(): - logger.debug(f"{key}: {value}") - - try: - npys = [] - - feature_dir = os.path.join(exp_dir, f"{version}_extracted") - model_name = os.path.basename(exp_dir) - - for name in sorted(os.listdir(feature_dir)): - npys.append(np.load(os.path.join(feature_dir, name))) - - big_npy = np.concatenate(npys, axis=0) - big_npy_idx = np.arange(big_npy.shape[0]) - - np.random.shuffle(big_npy_idx) - big_npy = big_npy[big_npy_idx] - - if big_npy.shape[0] > 2e5 and (index_algorithm == "Auto" or index_algorithm == "KMeans"): big_npy = (MiniBatchKMeans(n_clusters=10000, verbose=True, batch_size=256 * cpu_count(), compute_labels=False, init="random").fit(big_npy).cluster_centers_) - np.save(os.path.join(exp_dir, "total_fea.npy"), big_npy) - - n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) - index_trained = faiss.index_factory(256 if version == "v1" else 768, f"IVF{n_ivf},Flat") - - index_ivf_trained = faiss.extract_index_ivf(index_trained) - index_ivf_trained.nprobe = 1 - - index_trained.train(big_npy) - faiss.write_index(index_trained, os.path.join(exp_dir, f"trained_IVF{n_ivf}_Flat_nprobe_{index_ivf_trained.nprobe}_{model_name}_{version}.index")) - - index_added = faiss.index_factory(256 if version == "v1" else 768, f"IVF{n_ivf},Flat") - index_ivf_added = faiss.extract_index_ivf(index_added) - index_ivf_added.nprobe = 1 - - index_added.train(big_npy) - batch_size_add = 8192 - - for i in range(0, big_npy.shape[0], batch_size_add): - index_added.add(big_npy[i : i + batch_size_add]) - - index_filepath_added = os.path.join(exp_dir, 
f"added_IVF{n_ivf}_Flat_nprobe_{index_ivf_added.nprobe}_{model_name}_{version}.index") - faiss.write_index(index_added, index_filepath_added) - - logger.info(f"{translations['save_index']} '{index_filepath_added}'") - except Exception as e: - logger.error(f"{translations['create_index_error']}: {e}") - - import traceback - logger.debug(traceback.format_exc()) - -if __name__ == "__main__": main() \ No newline at end of file diff --git a/main/inference/extract.py b/main/inference/extract.py deleted file mode 100644 index b50e0608c4fe80d9823041a777595bfc25a71bb1..0000000000000000000000000000000000000000 --- a/main/inference/extract.py +++ /dev/null @@ -1,400 +0,0 @@ -import os -import re -import sys -import time -import tqdm -import torch -import shutil -import logging -import argparse -import warnings -import onnxruntime -import logging.handlers - -import numpy as np -import soundfile as sf -import torch.nn.functional as F - -from random import shuffle -from distutils.util import strtobool -from fairseq import checkpoint_utils -from concurrent.futures import ThreadPoolExecutor, as_completed - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -from main.library.utils import check_predictors, check_embedders, load_audio - -logger = logging.getLogger(__name__) -translations = Config().translations -logger.propagate = False - -warnings.filterwarnings("ignore") -for l in ["torch", "faiss", "httpx", "fairseq", "httpcore", "faiss.loader", "numba.core", "urllib3"]: - logging.getLogger(l).setLevel(logging.ERROR) - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, required=True) - parser.add_argument("--rvc_version", type=str, default="v2") - parser.add_argument("--f0_method", type=str, default="rmvpe") - parser.add_argument("--pitch_guidance", type=lambda x: bool(strtobool(x)), default=True) - parser.add_argument("--hop_length", type=int, default=128) - parser.add_argument("--cpu_cores", type=int, default=2) - parser.add_argument("--gpu", type=str, default="-") - parser.add_argument("--sample_rate", type=int, required=True) - parser.add_argument("--embedder_model", type=str, default="contentvec_base.pt") - parser.add_argument("--f0_onnx", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--embedders_onnx", type=lambda x: bool(strtobool(x)), default=False) - - return parser.parse_args() - -def generate_config(rvc_version, sample_rate, model_path): - config_save_path = os.path.join(model_path, "config.json") - if not os.path.exists(config_save_path): shutil.copy(os.path.join("main", "configs", rvc_version, f"{sample_rate}.json"), config_save_path) - -def generate_filelist(pitch_guidance, model_path, rvc_version, sample_rate): - gt_wavs_dir, feature_dir = os.path.join(model_path, "sliced_audios"), os.path.join(model_path, f"{rvc_version}_extracted") - f0_dir, f0nsf_dir = None, None - - if pitch_guidance: f0_dir, f0nsf_dir = os.path.join(model_path, "f0"), os.path.join(model_path, "f0_voiced") - - gt_wavs_files, feature_files = set(name.split(".")[0] for name in os.listdir(gt_wavs_dir)), set(name.split(".")[0] for name in os.listdir(feature_dir)) - names = gt_wavs_files & feature_files & set(name.split(".")[0] for name in os.listdir(f0_dir)) & set(name.split(".")[0] for name in os.listdir(f0nsf_dir)) if pitch_guidance else gt_wavs_files & feature_files - - options = [] - mute_base_path = os.path.join("assets", "logs", "mute") - - for name in names: - 
options.append(f"{gt_wavs_dir}/{name}.wav|{feature_dir}/{name}.npy|{f0_dir}/{name}.wav.npy|{f0nsf_dir}/{name}.wav.npy|0" if pitch_guidance else f"{gt_wavs_dir}/{name}.wav|{feature_dir}/{name}.npy|0") - - mute_audio_path, mute_feature_path = os.path.join(mute_base_path, "sliced_audios", f"mute{sample_rate}.wav"), os.path.join(mute_base_path, f"{rvc_version}_extracted", "mute.npy") - - for _ in range(2): - options.append(f"{mute_audio_path}|{mute_feature_path}|{os.path.join(mute_base_path, 'f0', 'mute.wav.npy')}|{os.path.join(mute_base_path, 'f0_voiced', 'mute.wav.npy')}|0" if pitch_guidance else f"{mute_audio_path}|{mute_feature_path}|0") - - shuffle(options) - with open(os.path.join(model_path, "filelist.txt"), "w") as f: - f.write("\n".join(options)) - -def setup_paths(exp_dir, version = None): - wav_path = os.path.join(exp_dir, "sliced_audios_16k") - - if version: - out_path = os.path.join(exp_dir, f"{version}_extracted") - os.makedirs(out_path, exist_ok=True) - return wav_path, out_path - else: - output_root1, output_root2 = os.path.join(exp_dir, "f0"), os.path.join(exp_dir, "f0_voiced") - os.makedirs(output_root1, exist_ok=True); os.makedirs(output_root2, exist_ok=True) - return wav_path, output_root1, output_root2 - -def read_wave(wav_path, normalize = False): - wav, sr = sf.read(wav_path) - assert sr == 16000, translations["sr_not_16000"] - - feats = torch.from_numpy(wav).float() - - if feats.dim() == 2: feats = feats.mean(-1) - feats = feats.view(1, -1) - - if normalize: feats = F.layer_norm(feats, feats.shape) - return feats - -def get_device(gpu_index): - try: - index = int(gpu_index) - if index < torch.cuda.device_count(): return f"cuda:{index}" - else: logger.warning(translations["gpu_not_valid"]) - except ValueError: - logger.warning(translations["gpu_not_valid"]) - return "cpu" - -def get_providers(): - ort_providers = onnxruntime.get_available_providers() - - if "CUDAExecutionProvider" in ort_providers: providers = ["CUDAExecutionProvider"] - elif "CoreMLExecutionProvider" in ort_providers: providers = ["CoreMLExecutionProvider"] - else: providers = ["CPUExecutionProvider"] - - return providers - -class FeatureInput: - def __init__(self, sample_rate=16000, hop_size=160, device="cpu"): - self.fs = sample_rate - self.hop = hop_size - self.f0_bin = 256 - self.f0_max = 1100.0 - self.f0_min = 50.0 - self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) - self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - self.device = device - - def compute_f0_hybrid(self, methods_str, np_arr, hop_length, f0_onnx): - methods_str = re.search("hybrid\[(.+)\]", methods_str) - if methods_str: methods = [method.strip() for method in methods_str.group(1).split("+")] - - f0_computation_stack, resampled_stack = [], [] - logger.debug(translations["hybrid_methods"].format(methods=methods)) - - for method in methods: - f0 = None - f0_methods = {"pm": lambda: self.get_pm(np_arr), "diow": lambda: self.get_pyworld_wrapper(np_arr, "dio"), "dio": lambda: self.get_pyworld(np_arr, "dio"), "mangio-crepe-full": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "full", onnx=f0_onnx), "mangio-crepe-large": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "large", onnx=f0_onnx), "mangio-crepe-medium": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "medium", onnx=f0_onnx), "mangio-crepe-small": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "small", onnx=f0_onnx), "mangio-crepe-tiny": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "tiny", onnx=f0_onnx), "crepe-full": lambda: 
self.get_crepe(np_arr, "full", onnx=f0_onnx), "crepe-large": lambda: self.get_crepe(np_arr, "large", onnx=f0_onnx), "crepe-medium": lambda: self.get_crepe(np_arr, "medium", onnx=f0_onnx), "crepe-small": lambda: self.get_crepe(np_arr, "small", onnx=f0_onnx), "crepe-tiny": lambda: self.get_crepe(np_arr, "tiny", onnx=f0_onnx), "fcpe": lambda: self.get_fcpe(np_arr, int(hop_length), onnx=f0_onnx), "fcpe-legacy": lambda: self.get_fcpe(np_arr, int(hop_length), legacy=True, onnx=f0_onnx), "rmvpe": lambda: self.get_rmvpe(np_arr, onnx=f0_onnx), "rmvpe-legacy": lambda: self.get_rmvpe(np_arr, legacy=True, onnx=f0_onnx), "harvestw": lambda: self.get_pyworld_wrapper(np_arr, "harvest"), "harvest": lambda: self.get_pyworld(np_arr, "harvest"), "swipe": lambda: self.get_swipe(np_arr), "yin": lambda: self.get_yin(np_arr, int(hop_length), mode="yin"), "pyin": lambda: self.get_yin(np_arr, int(hop_length), mode="pyin")} - f0 = f0_methods.get(method, lambda: ValueError(translations["method_not_valid"]))() - f0_computation_stack.append(f0) - - for f0 in f0_computation_stack: - resampled_stack.append(np.interp(np.linspace(0, len(f0), (np_arr.size // self.hop)), np.arange(len(f0)), f0)) - - return resampled_stack[0] if len(resampled_stack) == 1 else np.nanmedian(np.vstack(resampled_stack), axis=0) - - def compute_f0(self, np_arr, f0_method, hop_length, f0_onnx=False): - f0_methods = {"pm": lambda: self.get_pm(np_arr), "diow": lambda: self.get_pyworld_wrapper(np_arr, "dio"), "dio": lambda: self.get_pyworld(np_arr, "dio"), "mangio-crepe-full": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "full", onnx=f0_onnx), "mangio-crepe-large": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "large", onnx=f0_onnx), "mangio-crepe-medium": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "medium", onnx=f0_onnx), "mangio-crepe-small": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "small", onnx=f0_onnx), "mangio-crepe-tiny": lambda: self.get_mangio_crepe(np_arr, int(hop_length), "tiny", onnx=f0_onnx), "crepe-full": lambda: self.get_crepe(np_arr, "full", onnx=f0_onnx), "crepe-large": lambda: self.get_crepe(np_arr, "large", onnx=f0_onnx), "crepe-medium": lambda: self.get_crepe(np_arr, "medium", onnx=f0_onnx), "crepe-small": lambda: self.get_crepe(np_arr, "small", onnx=f0_onnx), "crepe-tiny": lambda: self.get_crepe(np_arr, "tiny", onnx=f0_onnx), "fcpe": lambda: self.get_fcpe(np_arr, int(hop_length), onnx=f0_onnx), "fcpe-legacy": lambda: self.get_fcpe(np_arr, int(hop_length), legacy=True, onnx=f0_onnx), "rmvpe": lambda: self.get_rmvpe(np_arr, onnx=f0_onnx), "rmvpe-legacy": lambda: self.get_rmvpe(np_arr, legacy=True, onnx=f0_onnx), "harvestw": lambda: self.get_pyworld_wrapper(np_arr, "harvest"), "harvest": lambda: self.get_pyworld(np_arr, "harvest"), "swipe": lambda: self.get_swipe(np_arr), "yin": lambda: self.get_yin(np_arr, int(hop_length), mode="yin"), "pyin": lambda: self.get_yin(np_arr, int(hop_length), mode="pyin")} - return self.compute_f0_hybrid(f0_method, np_arr, int(hop_length), f0_onnx) if "hybrid" in f0_method else f0_methods.get(f0_method, lambda: ValueError(translations["method_not_valid"]))() - - def get_pm(self, x): - import parselmouth - - f0 = (parselmouth.Sound(x, self.fs).to_pitch_ac(time_step=(160 / 16000 * 1000) / 1000, voicing_threshold=0.6, pitch_floor=50, pitch_ceiling=1100).selected_array["frequency"]) - pad_size = ((x.size // self.hop) - len(f0) + 1) // 2 - - if pad_size > 0 or (x.size // self.hop) - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, (x.size // self.hop) - 
len(f0) - pad_size]], mode="constant") - return f0 - - def get_mangio_crepe(self, x, hop_length, model="full", onnx=False): - from main.library.predictors.CREPE import predict - - audio = torch.from_numpy(x.astype(np.float32)).to(self.device) - audio /= torch.quantile(torch.abs(audio), 0.999) - audio = audio.unsqueeze(0) - - source = predict(audio, self.fs, hop_length, self.f0_min, self.f0_max, model=model, batch_size=hop_length * 2, device=self.device, pad=True, providers=get_providers(), onnx=onnx).squeeze(0).cpu().float().numpy() - source[source < 0.001] = np.nan - - return np.nan_to_num(np.interp(np.arange(0, len(source) * (x.size // self.hop), len(source)) / (x.size // self.hop), np.arange(0, len(source)), source)) - - def get_crepe(self, x, model="full", onnx=False): - from main.library.predictors.CREPE import predict, mean, median - - f0, pd = predict(torch.tensor(np.copy(x))[None].float(), self.fs, 160, self.f0_min, self.f0_max, model, batch_size=512, device=self.device, return_periodicity=True, providers=get_providers(), onnx=onnx) - f0, pd = mean(f0, 3), median(pd, 3) - f0[pd < 0.1] = 0 - - return f0[0].cpu().numpy() - - def get_fcpe(self, x, hop_length, legacy=False, onnx=False): - from main.library.predictors.FCPE import FCPE - - model_fcpe = FCPE(os.path.join("assets", "models", "predictors", ("fcpe_legacy" if legacy else"fcpe") + (".onnx" if onnx else ".pt")), hop_length=int(hop_length), f0_min=int(self.f0_min), f0_max=int(self.f0_max), dtype=torch.float32, device=self.device, sample_rate=self.fs, threshold=0.03, providers=get_providers(), onnx=onnx, legacy=legacy) - f0 = model_fcpe.compute_f0(x, p_len=(x.size // self.hop)) - - del model_fcpe - return f0 - - def get_rmvpe(self, x, legacy=False, onnx=False): - from main.library.predictors.RMVPE import RMVPE - - rmvpe_model = RMVPE(os.path.join("assets", "models", "predictors", "rmvpe" + (".onnx" if onnx else ".pt")), device=self.device, onnx=onnx, providers=get_providers()) - f0 = rmvpe_model.infer_from_audio_with_pitch(x, thred=0.03, f0_min=self.f0_min, f0_max=self.f0_max) if legacy else rmvpe_model.infer_from_audio(x, thred=0.03) - - del rmvpe_model - return f0 - - def get_pyworld_wrapper(self, x, model="harvest"): - from main.library.predictors.WORLD_WRAPPER import PYWORLD - - pw = PYWORLD() - x = x.astype(np.double) - - if model == "harvest": f0, t = pw.harvest(x, fs=self.fs, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=1000 * self.hop / self.fs) - elif model == "dio": f0, t = pw.dio(x, fs=self.fs, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=1000 * self.hop / self.fs) - else: raise ValueError(translations["method_not_valid"]) - - return pw.stonemask(x, self.fs, t, f0) - - def get_pyworld(self, x, model="harvest"): - from main.library.predictors.pyworld import dio, harvest, stonemask - - x = x.astype(np.double) - - if model == "harvest": f0, t = harvest.harvest(x, fs=self.fs, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=1000 * self.hop / self.fs) - elif model == "dio": f0, t = dio.dio(x, fs=self.fs, f0_ceil=self.f0_max, f0_floor=self.f0_min, frame_period=1000 * self.hop / self.fs) - else: raise ValueError(translations["method_not_valid"]) - - return stonemask.stonemask(x, self.fs, t, f0) - - def get_swipe(self, x): - from main.library.predictors.SWIPE import swipe - - f0, _ = swipe(x.astype(np.double), self.fs, f0_floor=self.f0_min, f0_ceil=self.f0_max, frame_period=1000 * self.hop / self.fs, device=self.device) - return f0 - - def get_yin(self, x, hop_length, mode="yin"): - import librosa 
-
-        if mode == "yin":
-            source = np.array(librosa.yin(x.astype(np.float32), sr=self.fs, fmin=self.f0_min, fmax=self.f0_max, hop_length=hop_length))
-            source[source < 0.001] = np.nan
-        else:
-            f0, _, _ = librosa.pyin(x.astype(np.float32), fmin=self.f0_min, fmax=self.f0_max, sr=self.fs, hop_length=hop_length)
-
-            source = np.array(f0)
-            source[source < 0.001] = np.nan
-
-        return np.nan_to_num(np.interp(np.arange(0, len(source) * (x.size // self.hop), len(source)) / (x.size // self.hop), np.arange(0, len(source)), source))
-
-    def coarse_f0(self, f0):
-        return np.rint(np.clip(((1127 * np.log(1 + f0 / 700)) - self.f0_mel_min) * (self.f0_bin - 2) / (self.f0_mel_max - self.f0_mel_min) + 1, 1, self.f0_bin - 1)).astype(int)
-
-    def process_file(self, file_info, f0_method, hop_length, f0_onnx):
-        inp_path, opt_path1, opt_path2, np_arr = file_info
-        if os.path.exists(opt_path1 + ".npy") and os.path.exists(opt_path2 + ".npy"): return
-
-        try:
-            feature_pit = self.compute_f0(np_arr, f0_method, hop_length, f0_onnx)
-            if isinstance(feature_pit, tuple): feature_pit = feature_pit[0]
-
-            np.save(opt_path2, feature_pit, allow_pickle=False)
-            np.save(opt_path1, self.coarse_f0(feature_pit), allow_pickle=False)
-        except Exception as e:
-            raise RuntimeError(f"{translations['extract_file_error']} {inp_path}: {e}")
-
-    def process_files(self, files, f0_method, hop_length, f0_onnx, device, pbar):
-        self.device = device
-        for file_info in files:
-            self.process_file(file_info, f0_method, hop_length, f0_onnx)
-            pbar.update()
-
-def run_pitch_extraction(exp_dir, f0_method, hop_length, num_processes, gpus, f0_onnx):
-    input_root, *output_roots = setup_paths(exp_dir)
-    output_root1, output_root2 = output_roots if len(output_roots) == 2 else (output_roots[0], None)
-
-    paths = [(os.path.join(input_root, name), os.path.join(output_root1, name) if output_root1 else None, os.path.join(output_root2, name) if output_root2 else None, load_audio(logger, os.path.join(input_root, name), 16000)) for name in sorted(os.listdir(input_root)) if "spec" not in name]
-    logger.info(translations["extract_f0_method"].format(num_processes=num_processes, f0_method=f0_method))
-
-    start_time = time.time()
-    gpus = gpus.split("-")
-    process_partials = []
-
-    pbar = tqdm.tqdm(total=len(paths), ncols=100, unit="p")
-    for idx, gpu in enumerate(gpus):
-        device = get_device(gpu) if gpu != "" else "cpu"  # resolve one device per GPU entry; the original referenced `gpu` before the loop
-        feature_input = FeatureInput(device=device)
-        process_partials.append((feature_input, paths[idx::len(gpus)]))
-
-    with ThreadPoolExecutor(max_workers=num_processes) as executor:
-        for future in as_completed([executor.submit(FeatureInput.process_files, feature_input, part_paths, f0_method, hop_length, f0_onnx, feature_input.device, pbar) for feature_input, part_paths in process_partials]):
-            logger.debug(pbar.format_meter(pbar.n, pbar.total, pbar.format_dict["elapsed"]))  # process_files() already advances the bar, so no extra pbar.update(1) here
-            future.result()
-
-    pbar.close()
-    logger.info(translations["extract_f0_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}"))
-
-def extract_features(model, feats, version):
-    return torch.as_tensor(model.run([model.get_outputs()[0].name, model.get_outputs()[1].name], {"feats": feats.detach().cpu().numpy()})[0 if version == "v1" else 1], dtype=torch.float32, device=feats.device)
-
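A sketch of what extract_features() above assumes about the ONNX embedder (assumptions: the graph takes a "feats" input and exposes two outputs, the 256-dim v1 projection first and the 768-dim v2 features second; the file name is hypothetical):

import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("contentvec_base.onnx", providers=["CPUExecutionProvider"])
feats = np.zeros((1, 16000), dtype=np.float32)  # one second of silence at 16 kHz
v1_out, v2_out = session.run([o.name for o in session.get_outputs()], {"feats": feats})
print(v1_out.shape, v2_out.shape)  # e.g. (1, T, 256) and (1, T, 768)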
-def process_file_embedding(file, wav_path, out_path, model, device, version, saved_cfg, embed_suffix):
-    out_file_path = os.path.join(out_path, os.path.splitext(file)[0] + ".npy")  # splitext instead of file.replace("wav", "npy"), which also rewrote "wav" inside the basename
-    if os.path.exists(out_file_path): return
-
-    feats = read_wave(os.path.join(wav_path, file), normalize=saved_cfg.task.normalize if saved_cfg else False).to(device).float()
-    if embed_suffix == ".pt": inputs = {"source": feats, "padding_mask": torch.BoolTensor(feats.shape).fill_(False).to(device), "output_layer": 9 if version == "v1" else 12}
-
-    with torch.no_grad():
-        if embed_suffix == ".pt":
-            model = model.to(device).float().eval()
-            logits = model.extract_features(**inputs)
-            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
-        else: feats = extract_features(model, feats, version).to(device)
-
-    feats = feats.squeeze(0).float().cpu().numpy()
-
-    if not np.isnan(feats).any(): np.save(out_file_path, feats, allow_pickle=False)
-    else: logger.warning(f"{file} {translations['NaN']}")
-
-def run_embedding_extraction(exp_dir, version, gpus, embedder_model):
-    wav_path, out_path = setup_paths(exp_dir, version)
-    logger.info(translations["start_extract_hubert"])
-
-    start_time = time.time()
-    embedder_model_path = os.path.join("assets", "models", "embedders", embedder_model)
-    if not os.path.exists(embedder_model_path) and not embedder_model.endswith((".pt", ".onnx")): raise FileNotFoundError(f"{translations['not_found'].format(name=translations['model'])}: {embedder_model}")
-
-    try:
-        if embedder_model.endswith(".pt"):
-            models, saved_cfg, _ = checkpoint_utils.load_model_ensemble_and_task([embedder_model_path], suffix="")
-
-            models = models[0]
-            embed_suffix = ".pt"
-        else:
-            sess_options = onnxruntime.SessionOptions()
-            sess_options.log_severity_level = 3
-
-            models = onnxruntime.InferenceSession(embedder_model_path, sess_options=sess_options, providers=get_providers())
-            saved_cfg, embed_suffix = None, ".onnx"
-    except Exception as e:
-        raise ImportError(translations["read_model_error"].format(e=e))
-
-    devices = [get_device(gpu) for gpu in gpus.split("-")] if gpus != "-" else ["cpu"]  # build a concrete device list; the original wrapped a generator (or a bare string) in a one-element list
-    paths = sorted([file for file in os.listdir(wav_path) if file.endswith(".wav")])
-
-    if not paths:
-        logger.warning(translations["not_found_audio_file"])
-        sys.exit(1)
-
-    pbar = tqdm.tqdm(total=len(paths) * len(devices), ncols=100, unit="p")
-    for task in [(file, wav_path, out_path, models, device, version, saved_cfg, embed_suffix) for file in paths for device in devices]:
-        try:
-            process_file_embedding(*task)
-        except Exception as e:
-            raise RuntimeError(f"{translations['process_error']} {task[0]}: {e}")
-
-        pbar.update(1)
-        logger.debug(pbar.format_meter(pbar.n, pbar.total, pbar.format_dict["elapsed"]))
-
-    pbar.close()
-    logger.info(translations["extract_hubert_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}"))
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    exp_dir = os.path.join("assets", "logs", args.model_name)
-    f0_method, hop_length, num_processes, gpus, version, pitch_guidance, sample_rate, embedder_model, f0_onnx, embedders_onnx = args.f0_method, args.hop_length, args.cpu_cores, args.gpu, args.rvc_version, args.pitch_guidance, args.sample_rate, args.embedder_model, args.f0_onnx, args.embedders_onnx
-    check_predictors(f0_method, f0_onnx); check_embedders(embedder_model, embedders_onnx)
-    embedder_model += ".onnx" if embedders_onnx else ".pt"
-
-    if logger.hasHandlers(): logger.handlers.clear()
-    else:
-        console_handler = logging.StreamHandler()
-        console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-        console_handler.setFormatter(console_formatter)
-        console_handler.setLevel(logging.INFO)
-        file_handler = 
logging.handlers.RotatingFileHandler(os.path.join(exp_dir, "extract.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8') - file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - file_handler.setFormatter(file_formatter) - file_handler.setLevel(logging.DEBUG) - logger.addHandler(console_handler) - logger.addHandler(file_handler) - logger.setLevel(logging.DEBUG) - - log_data = {translations['modelname']: args.model_name, translations['export_process']: exp_dir, translations['f0_method']: f0_method, translations['pretrain_sr']: sample_rate, translations['cpu_core']: num_processes, "Gpu": gpus, "Hop length": hop_length, translations['training_version']: version, translations['extract_f0']: pitch_guidance, translations['hubert_model']: embedder_model, translations["f0_onnx_mode"]: f0_onnx, translations["embed_onnx"]: embedders_onnx} - for key, value in log_data.items(): - logger.debug(f"{key}: {value}") - - pid_path = os.path.join(exp_dir, "extract_pid.txt") - with open(pid_path, "w") as pid_file: - pid_file.write(str(os.getpid())) - - try: - run_pitch_extraction(exp_dir, f0_method, hop_length, num_processes, gpus, f0_onnx) - run_embedding_extraction(exp_dir, version, gpus, embedder_model) - generate_config(version, sample_rate, exp_dir) - generate_filelist(pitch_guidance, exp_dir, version, sample_rate) - except Exception as e: - logger.error(f"{translations['extract_error']}: {e}") - import traceback - logger.debug(traceback.format_exc()) - - if os.path.exists(pid_path): os.remove(pid_path) - logger.info(f"{translations['extract_success']} {args.model_name}.") \ No newline at end of file diff --git a/main/inference/preprocess.py b/main/inference/preprocess.py deleted file mode 100644 index 274554cabbe85020593568de0264e80faf2003eb..0000000000000000000000000000000000000000 --- a/main/inference/preprocess.py +++ /dev/null @@ -1,270 +0,0 @@ -import os -import sys -import time -import logging -import librosa -import argparse -import logging.handlers - -import numpy as np - -from tqdm import tqdm -from scipy import signal -from scipy.io import wavfile -from distutils.util import strtobool -from concurrent.futures import ProcessPoolExecutor, as_completed - -sys.path.append(os.getcwd()) - -from main.library.utils import load_audio -from main.configs.config import Config - -logger = logging.getLogger(__name__) -for l in ["numba.core.byteflow", "numba.core.ssa", "numba.core.interpreter"]: - logging.getLogger(l).setLevel(logging.ERROR) - -OVERLAP, MAX_AMPLITUDE, ALPHA, HIGH_PASS_CUTOFF, SAMPLE_RATE_16K = 0.3, 0.9, 0.75, 48, 16000 -config = Config() -translations = config.translations - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, required=True) - parser.add_argument("--dataset_path", type=str, default="./dataset") - parser.add_argument("--sample_rate", type=int, required=True) - parser.add_argument("--cpu_cores", type=int, default=2) - parser.add_argument("--cut_preprocess", type=lambda x: bool(strtobool(x)), default=True) - parser.add_argument("--process_effects", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--clean_dataset", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--clean_strength", type=float, default=0.7) - - return parser.parse_args() - -class Slicer: - def __init__(self, sr, threshold = -40.0, min_length = 5000, min_interval = 300, hop_size = 20, max_sil_kept = 5000): - if not 
min_length >= min_interval >= hop_size: raise ValueError(translations["min_length>=min_interval>=hop_size"]) - if not max_sil_kept >= hop_size: raise ValueError(translations["max_sil_kept>=hop_size"]) - - min_interval = sr * min_interval / 1000 - self.threshold = 10 ** (threshold / 20.0) - self.hop_size = round(sr * hop_size / 1000) - self.win_size = min(round(min_interval), 4 * self.hop_size) - self.min_length = round(sr * min_length / 1000 / self.hop_size) - self.min_interval = round(min_interval / self.hop_size) - self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) - - def _apply_slice(self, waveform, begin, end): - start_idx = begin * self.hop_size - - if len(waveform.shape) > 1: return waveform[:, start_idx:min(waveform.shape[1], end * self.hop_size)] - else: return waveform[start_idx:min(waveform.shape[0], end * self.hop_size)] - - def slice(self, waveform): - samples = waveform.mean(axis=0) if len(waveform.shape) > 1 else waveform - if samples.shape[0] <= self.min_length: return [waveform] - rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0) - sil_tags = [] - silence_start, clip_start = None, 0 - - for i, rms in enumerate(rms_list): - if rms < self.threshold: - if silence_start is None: silence_start = i - continue - - if silence_start is None: continue - - is_leading_silence = silence_start == 0 and i > self.max_sil_kept - need_slice_middle = (i - silence_start >= self.min_interval and i - clip_start >= self.min_length) - - if not is_leading_silence and not need_slice_middle: - silence_start = None - continue - - if i - silence_start <= self.max_sil_kept: - pos = rms_list[silence_start : i + 1].argmin() + silence_start - sil_tags.append((0, pos) if silence_start == 0 else (pos, pos)) - clip_start = pos - elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin() - pos += i - self.max_sil_kept - pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept) - - if silence_start == 0: - sil_tags.append((0, pos_r)) - clip_start = pos_r - else: - sil_tags.append((min((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos), max(pos_r, pos))) - clip_start = max(pos_r, pos) - else: - pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept) - sil_tags.append((0, pos_r) if silence_start == 0 else ((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos_r)) - clip_start = pos_r - - silence_start = None - total_frames = rms_list.shape[0] - if (silence_start is not None and total_frames - silence_start >= self.min_interval): sil_tags.append((rms_list[silence_start : min(total_frames, silence_start + self.max_sil_kept) + 1].argmin() + silence_start, total_frames + 1)) - - if not sil_tags: return [waveform] - else: - chunks = [] - if sil_tags[0][0] > 0: chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) - - for i in range(len(sil_tags) - 1): - chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])) - - if sil_tags[-1][1] < total_frames: chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames)) - return chunks - -def get_rms(y, frame_length=2048, hop_length=512, pad_mode="constant"): - y = np.pad(y, (int(frame_length // 2), int(frame_length // 2)), mode=pad_mode) - axis = -1 - x_shape_trimmed = list(y.shape) - x_shape_trimmed[axis] -= frame_length - 1 - xw = 
np.moveaxis(np.lib.stride_tricks.as_strided(y, shape=tuple(x_shape_trimmed) + tuple([frame_length]), strides=y.strides + tuple([y.strides[axis]])), -1, axis - 1 if axis < 0 else axis + 1) - slices = [slice(None)] * xw.ndim - slices[axis] = slice(0, None, hop_length) - return np.sqrt(np.mean(np.abs(xw[tuple(slices)]) ** 2, axis=-2, keepdims=True)) - -class PreProcess: - def __init__(self, sr, exp_dir, per): - self.slicer = Slicer(sr=sr, threshold=-42, min_length=1500, min_interval=400, hop_size=15, max_sil_kept=500) - self.sr = sr - self.b_high, self.a_high = signal.butter(N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr) - self.per = per - self.exp_dir = exp_dir - self.device = "cpu" - self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios") - self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k") - os.makedirs(self.gt_wavs_dir, exist_ok=True) - os.makedirs(self.wavs16k_dir, exist_ok=True) - - def _normalize_audio(self, audio): - tmp_max = np.abs(audio).max() - if tmp_max > 2.5: return None - return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio - - def process_audio_segment(self, normalized_audio, sid, idx0, idx1): - if normalized_audio is None: - logger.debug(f"{sid}-{idx0}-{idx1}-filtered") - return - - wavfile.write(os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{idx1}.wav"), self.sr, normalized_audio.astype(np.float32)) - wavfile.write(os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{idx1}.wav"), SAMPLE_RATE_16K, librosa.resample(normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type="soxr_vhq").astype(np.float32)) - - def process_audio(self, path, idx0, sid, cut_preprocess, process_effects, clean_dataset, clean_strength): - try: - audio = load_audio(logger, path, self.sr) - - if process_effects: - audio = signal.lfilter(self.b_high, self.a_high, audio) - audio = self._normalize_audio(audio) - - if clean_dataset: - from main.tools.noisereduce import reduce_noise - audio = reduce_noise(y=audio, sr=self.sr, prop_decrease=clean_strength, device=config.device) - - idx1 = 0 - if cut_preprocess: - for audio_segment in self.slicer.slice(audio): - i = 0 - - while 1: - start = int(self.sr * (self.per - OVERLAP) * i) - i += 1 - - if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr: - self.process_audio_segment(audio_segment[start : start + int(self.per * self.sr)], sid, idx0, idx1) - idx1 += 1 - else: - self.process_audio_segment(audio_segment[start:], sid, idx0, idx1) - idx1 += 1 - break - else: self.process_audio_segment(audio, sid, idx0, idx1) - except Exception as e: - raise RuntimeError(f"{translations['process_audio_error']}: {e}") - -def process_file(args): - pp, file, cut_preprocess, process_effects, clean_dataset, clean_strength = (args) - file_path, idx0, sid = file - pp.process_audio(file_path, idx0, sid, cut_preprocess, process_effects, clean_dataset, clean_strength) - -def preprocess_training_set(input_root, sr, num_processes, exp_dir, per, cut_preprocess, process_effects, clean_dataset, clean_strength): - start_time = time.time() - - pp = PreProcess(sr, exp_dir, per) - logger.info(translations["start_preprocess"].format(num_processes=num_processes)) - files = [] - idx = 0 - - for root, _, filenames in os.walk(input_root): - try: - sid = 0 if root == input_root else int(os.path.basename(root)) - - for f in filenames: - if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3")): - files.append((os.path.join(root, f), idx, sid)) - idx += 1 - except ValueError: - raise 
ValueError(f"{translations['not_integer']} '{os.path.basename(root)}'.") - - with tqdm(total=len(files), ncols=100, unit="f") as pbar: - with ProcessPoolExecutor(max_workers=num_processes) as executor: - futures = [executor.submit(process_file, (pp, file, cut_preprocess, process_effects, clean_dataset, clean_strength)) for file in files] - for future in as_completed(futures): - try: - future.result() - except Exception as e: - raise RuntimeError(f"{translations['process_error']}: {e}") - pbar.update(1) - logger.debug(pbar.format_meter(pbar.n, pbar.total, pbar.format_dict["elapsed"])) - - elapsed_time = time.time() - start_time - logger.info(translations["preprocess_success"].format(elapsed_time=f"{elapsed_time:.2f}")) - -if __name__ == "__main__": - args = parse_arguments() - experiment_directory = os.path.join("assets", "logs", args.model_name) - num_processes = args.cpu_cores - num_processes = 2 if num_processes is None else int(num_processes) - dataset = args.dataset_path - sample_rate = args.sample_rate - cut_preprocess = args.cut_preprocess - preprocess_effects = args.process_effects - clean_dataset = args.clean_dataset - clean_strength = args.clean_strength - - os.makedirs(experiment_directory, exist_ok=True) - - if logger.hasHandlers(): logger.handlers.clear() - else: - console_handler = logging.StreamHandler() - console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - console_handler.setFormatter(console_formatter) - console_handler.setLevel(logging.INFO) - file_handler = logging.handlers.RotatingFileHandler(os.path.join(experiment_directory, "preprocess.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8') - file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - file_handler.setFormatter(file_formatter) - file_handler.setLevel(logging.DEBUG) - logger.addHandler(console_handler) - logger.addHandler(file_handler) - logger.setLevel(logging.DEBUG) - - log_data = {translations['modelname']: args.model_name, translations['export_process']: experiment_directory, translations['dataset_folder']: dataset, translations['pretrain_sr']: sample_rate, translations['cpu_core']: num_processes, translations['split_audio']: cut_preprocess, translations['preprocess_effect']: preprocess_effects, translations['clear_audio']: clean_dataset} - if clean_dataset: log_data[translations['clean_strength']] = clean_strength - - for key, value in log_data.items(): - logger.debug(f"{key}: {value}") - - pid_path = os.path.join(experiment_directory, "preprocess_pid.txt") - with open(pid_path, "w") as pid_file: - pid_file.write(str(os.getpid())) - - try: - preprocess_training_set(dataset, sample_rate, num_processes, experiment_directory, 3.7, cut_preprocess, preprocess_effects, clean_dataset, clean_strength) - except Exception as e: - logger.error(f"{translations['process_audio_error']} {e}") - import traceback - logger.debug(traceback.format_exc()) - - if os.path.exists(pid_path): os.remove(pid_path) - logger.info(f"{translations['preprocess_model_success']} {args.model_name}") \ No newline at end of file diff --git a/main/inference/separator_music.py b/main/inference/separator_music.py deleted file mode 100644 index ce1e773c189d8df1de06e5d5efb4b98ff3e9d186..0000000000000000000000000000000000000000 --- a/main/inference/separator_music.py +++ /dev/null @@ -1,310 +0,0 @@ -import os -import sys -import time -import logging -import 
argparse -import logging.handlers - -from distutils.util import strtobool - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -from main.library.algorithm.separator import Separator -from main.library.utils import pydub_convert, pydub_load - -config = Config() -translations = config.translations -logger = logging.getLogger(__name__) - -if logger.hasHandlers(): logger.handlers.clear() -else: - console_handler = logging.StreamHandler() - console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - console_handler.setFormatter(console_formatter) - console_handler.setLevel(logging.INFO) - file_handler = logging.handlers.RotatingFileHandler(os.path.join("assets", "logs", "separator.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8') - file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - file_handler.setFormatter(file_formatter) - file_handler.setLevel(logging.DEBUG) - logger.addHandler(console_handler) - logger.addHandler(file_handler) - logger.setLevel(logging.DEBUG) - -demucs_models = {"HT-Tuned": "htdemucs_ft.yaml", "HT-Normal": "htdemucs.yaml", "HD_MMI": "hdemucs_mmi.yaml", "HT_6S": "htdemucs_6s.yaml"} -mdx_models = {"Main_340": "UVR-MDX-NET_Main_340.onnx", "Main_390": "UVR-MDX-NET_Main_390.onnx", "Main_406": "UVR-MDX-NET_Main_406.onnx", "Main_427": "UVR-MDX-NET_Main_427.onnx", "Main_438": "UVR-MDX-NET_Main_438.onnx", "Inst_full_292": "UVR-MDX-NET-Inst_full_292.onnx", "Inst_HQ_1": "UVR-MDX-NET_Inst_HQ_1.onnx", "Inst_HQ_2": "UVR-MDX-NET_Inst_HQ_2.onnx", "Inst_HQ_3": "UVR-MDX-NET_Inst_HQ_3.onnx", "Inst_HQ_4": "UVR-MDX-NET-Inst_HQ_4.onnx", "Inst_HQ_5": "UVR-MDX-NET-Inst_HQ_5.onnx", "Kim_Vocal_1": "Kim_Vocal_1.onnx", "Kim_Vocal_2": "Kim_Vocal_2.onnx", "Kim_Inst": "Kim_Inst.onnx", "Inst_187_beta": "UVR-MDX-NET_Inst_187_beta.onnx", "Inst_82_beta": "UVR-MDX-NET_Inst_82_beta.onnx", "Inst_90_beta": "UVR-MDX-NET_Inst_90_beta.onnx", "Voc_FT": "UVR-MDX-NET-Voc_FT.onnx", "Crowd_HQ": "UVR-MDX-NET_Crowd_HQ_1.onnx", "MDXNET_9482": "UVR_MDXNET_9482.onnx", "Inst_1": "UVR-MDX-NET-Inst_1.onnx", "Inst_2": "UVR-MDX-NET-Inst_2.onnx", "Inst_3": "UVR-MDX-NET-Inst_3.onnx", "MDXNET_1_9703": "UVR_MDXNET_1_9703.onnx", "MDXNET_2_9682": "UVR_MDXNET_2_9682.onnx", "MDXNET_3_9662": "UVR_MDXNET_3_9662.onnx", "Inst_Main": "UVR-MDX-NET-Inst_Main.onnx", "MDXNET_Main": "UVR_MDXNET_Main.onnx"} -kara_models = {"Version-1": "UVR_MDXNET_KARA.onnx", "Version-2": "UVR_MDXNET_KARA_2.onnx"} - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_path", type=str, required=True) - parser.add_argument("--output_path", type=str, default="./audios") - parser.add_argument("--format", type=str, default="wav") - parser.add_argument("--shifts", type=int, default=2) - parser.add_argument("--segments_size", type=int, default=256) - parser.add_argument("--overlap", type=float, default=0.25) - parser.add_argument("--mdx_hop_length", type=int, default=1024) - parser.add_argument("--mdx_batch_size", type=int, default=1) - parser.add_argument("--clean_audio", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--clean_strength", type=float, default=0.7) - parser.add_argument("--model_name", type=str, default="HT-Normal") - parser.add_argument("--kara_model", type=str, default="Version-1") - parser.add_argument("--backing", type=lambda x: bool(strtobool(x)), default=False) - 
parser.add_argument("--mdx_denoise", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--reverb", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--backing_reverb", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--sample_rate", type=int, default=44100) - - return parser.parse_args() - -def main(): - start_time = time.time() - pid_path = os.path.join("assets", "separate_pid.txt") - - with open(pid_path, "w") as pid_file: - pid_file.write(str(os.getpid())) - - try: - args = parse_arguments() - input_path, output_path, export_format, shifts, segments_size, overlap, hop_length, batch_size, clean_audio, clean_strength, model_name, kara_model, backing, mdx_denoise, reverb, backing_reverb, sample_rate = args.input_path, args.output_path, args.format, args.shifts, args.segments_size, args.overlap, args.mdx_hop_length, args.mdx_batch_size, args.clean_audio, args.clean_strength, args.model_name, args.kara_model, args.backing, args.mdx_denoise, args.reverb, args.backing_reverb, args.sample_rate - - if backing_reverb and not reverb: - logger.warning(translations["turn_on_dereverb"]) - sys.exit(1) - - if backing_reverb and not backing: - logger.warning(translations["turn_on_separator_backing"]) - sys.exit(1) - - input_path = input_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - output_path = os.path.dirname(output_path) or output_path - - log_data = {translations['audio_path']: input_path, translations['output_path']: output_path, translations['export_format']: export_format, translations['shift']: shifts, translations['segments_size']: segments_size, translations['overlap']: overlap, translations['modelname']: model_name, translations['denoise_mdx']: mdx_denoise, "Hop length": hop_length, translations['batch_size']: batch_size, translations['sr']: sample_rate} - - if clean_audio: - log_data[translations['clear_audio']] = clean_audio - log_data[translations['clean_strength']] = clean_strength - - if backing: - log_data[translations['backing_model_ver']] = kara_model - log_data[translations['separator_backing']] = backing - - if reverb: - log_data[translations['dereveb_audio']] = reverb - log_data[translations['dereveb_backing']] = backing_reverb - - for key, value in log_data.items(): - logger.debug(f"{key}: {value}") - - if os.path.isdir(input_path): - for f in input_path: - separation(f, output_path, export_format, shifts, overlap, segments_size, model_name, sample_rate, mdx_denoise, hop_length, batch_size, backing, reverb, kara_model, backing_reverb, clean_audio, clean_strength) - else: separation(input_path, output_path, export_format, shifts, overlap, segments_size, model_name, sample_rate, mdx_denoise, hop_length, batch_size, backing, reverb, kara_model, backing_reverb, clean_audio, clean_strength) - - except Exception as e: - logger.error(f"{translations['separator_error']}: {e}") - - import traceback - logger.debug(traceback.format_exc()) - - if os.path.exists(pid_path): os.remove(pid_path) - - elapsed_time = time.time() - start_time - logger.info(translations["separator_success"].format(elapsed_time=f"{elapsed_time:.2f}")) - -def separation(input_path, output_path, export_format, shifts, overlap, segments_size, model_name, sample_rate, mdx_denoise, hop_length, batch_size, backing, reverb, kara_model, backing_reverb, clean_audio, clean_strength): - filename, _ = os.path.splitext(os.path.basename(input_path)) - output_path = os.path.join(output_path, filename) - os.makedirs(output_path, exist_ok=True) - - if 
model_name in ["HT-Tuned", "HT-Normal", "HD_MMI", "HT_6S"]: vocals, _ = separator_music_demucs(input_path, output_path, export_format, shifts, overlap, segments_size, model_name, sample_rate) - else: vocals, _ = separator_music_mdx(input_path, output_path, export_format, segments_size, overlap, mdx_denoise, model_name, hop_length, batch_size, sample_rate) - - if backing: main_vocals, backing_vocals = separator_backing(vocals, output_path, export_format, segments_size, overlap, mdx_denoise, kara_model, hop_length, batch_size, sample_rate) - if reverb: vocals_no_reverb, main_vocals_no_reverb, backing_vocals_no_reverb = separator_reverb(output_path, export_format, segments_size, overlap, mdx_denoise, reverb, backing_reverb, hop_length, batch_size, sample_rate) - - original_output = os.path.join(output_path, f"Original_Vocals_No_Reverb.{export_format}") if reverb else os.path.join(output_path, f"Original_Vocals.{export_format}") - main_output = os.path.join(output_path, f"Main_Vocals_No_Reverb.{export_format}") if reverb and backing_reverb else os.path.join(output_path, f"Main_Vocals.{export_format}") - backing_output = os.path.join(output_path, f"Backing_Vocals_No_Reverb.{export_format}") if reverb and backing_reverb else os.path.join(output_path, f"Backing_Vocals.{export_format}") - - if clean_audio: - import soundfile as sf - - logger.info(f"{translations['clear_audio']}...") - - vocal_data, vocal_sr = sf.read(vocals_no_reverb if reverb else vocals) - main_data, main_sr = sf.read(main_vocals_no_reverb if reverb and backing else main_vocals) - backing_data, backing_sr = sf.read(backing_vocals_no_reverb if reverb and backing_reverb else backing_vocals) - - from main.tools.noisereduce import reduce_noise - sf.write(original_output, reduce_noise(y=vocal_data, prop_decrease=clean_strength), vocal_sr, format=export_format, device=config.device) - - if backing: - sf.write(main_output, reduce_noise(y=main_data, sr=main_sr, prop_decrease=clean_strength), main_sr, format=export_format, device=config.device) - sf.write(backing_output, reduce_noise(y=backing_data, sr=backing_sr, prop_decrease=clean_strength), backing_sr, format=export_format, device=config.device) - - logger.info(translations["clean_audio_success"]) - -def separator_music_demucs(input, output, format, shifts, overlap, segments_size, demucs_model, sample_rate): - if not os.path.exists(input): - logger.warning(translations["input_not_valid"]) - sys.exit(1) - - if not os.path.exists(output): - logger.warning(translations["output_not_valid"]) - sys.exit(1) - - for i in [f"Original_Vocals.{format}", f"Instruments.{format}"]: - if os.path.exists(os.path.join(output, i)): os.remove(os.path.join(output, i)) - - logger.info(f"{translations['separator_process_2']}...") - demucs_output = separator_main(audio_file=input, model_filename=demucs_models.get(demucs_model), output_format=format, output_dir=output, demucs_segment_size=(segments_size / 2), demucs_shifts=shifts, demucs_overlap=overlap, sample_rate=sample_rate) - - for f in demucs_output: - path = os.path.join(output, f) - if not os.path.exists(path): logger.error(translations["not_found"].format(name=path)) - - if '_(Drums)_' in f: drums = path - elif '_(Bass)_' in f: bass = path - elif '_(Other)_' in f: other = path - elif '_(Vocals)_' in f: os.rename(path, os.path.join(output, f"Original_Vocals.{format}")) - - pydub_convert(pydub_load(drums)).overlay(pydub_convert(pydub_load(bass))).overlay(pydub_convert(pydub_load(other))).export(os.path.join(output, f"Instruments.{format}"), 
-    for f in [drums, bass, other]:
-        if os.path.exists(f): os.remove(f)
-
-    logger.info(translations["separator_success_2"])
-    return os.path.join(output, f"Original_Vocals.{format}"), os.path.join(output, f"Instruments.{format}")
-
-def separator_backing(input, output, format, segments_size, overlap, denoise, kara_model, hop_length, batch_size, sample_rate):
-    if not os.path.exists(input):
-        logger.warning(translations["input_not_valid"])
-        sys.exit(1)
-
-    if not os.path.exists(output):
-        logger.warning(translations["output_not_valid"])
-        sys.exit(1)
-
-    for f in [f"Main_Vocals.{format}", f"Backing_Vocals.{format}"]:
-        if os.path.exists(os.path.join(output, f)): os.remove(os.path.join(output, f))
-
-    model_2 = kara_models.get(kara_model)
-    logger.info(f"{translations['separator_process_backing']}...")
-
-    backing_outputs = separator_main(audio_file=input, model_filename=model_2, output_format=format, output_dir=output, mdx_segment_size=segments_size, mdx_overlap=overlap, mdx_batch_size=batch_size, mdx_hop_length=hop_length, mdx_enable_denoise=denoise, sample_rate=sample_rate)
-    main_output = os.path.join(output, f"Main_Vocals.{format}")
-    backing_output = os.path.join(output, f"Backing_Vocals.{format}")
-
-    for f in backing_outputs:
-        path = os.path.join(output, f)
-        if not os.path.exists(path): logger.error(translations["not_found"].format(name=path))
-
-        if '_(Instrumental)_' in f: os.rename(path, backing_output)
-        elif '_(Vocals)_' in f: os.rename(path, main_output)
-
-    logger.info(translations["separator_process_backing_success"])
-    return main_output, backing_output
-
-def separator_music_mdx(input, output, format, segments_size, overlap, denoise, mdx_model, hop_length, batch_size, sample_rate):
-    if not os.path.exists(input):
-        logger.warning(translations["input_not_valid"])
-        sys.exit(1)
-
-    if not os.path.exists(output):
-        logger.warning(translations["output_not_valid"])
-        sys.exit(1)
-
-    for i in [f"Original_Vocals.{format}", f"Instruments.{format}"]:
-        if os.path.exists(os.path.join(output, i)): os.remove(os.path.join(output, i))
-
-    model_3 = mdx_models.get(mdx_model)
-    logger.info(f"{translations['separator_process_2']}...")
-
-    output_music = separator_main(audio_file=input, model_filename=model_3, output_format=format, output_dir=output, mdx_segment_size=segments_size, mdx_overlap=overlap, mdx_batch_size=batch_size, mdx_hop_length=hop_length, mdx_enable_denoise=denoise, sample_rate=sample_rate)
-    original_output, instruments_output = os.path.join(output, f"Original_Vocals.{format}"), os.path.join(output, f"Instruments.{format}")
-
-    for f in output_music:
-        path = os.path.join(output, f)
-        if not os.path.exists(path): logger.error(translations["not_found"].format(name=path))
-
-        if '_(Instrumental)_' in f: os.rename(path, instruments_output)
-        elif '_(Vocals)_' in f: os.rename(path, original_output)
-
-    logger.info(translations["separator_success_2"])
-    return original_output, instruments_output
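Taken together, these helpers form the pipeline that separation() drives: music split, optional backing split, optional de-reverb. A minimal sketch of a direct invocation; the model keys ("Main_340", "Version-1") and flag values here are illustrative assumptions, not taken from this hunk:

    # Hypothetical driver: MDX music split, then backing split, then de-reverb.
    vocals, instruments = separator_music_mdx("song.wav", "out", "wav", 256, 0.25, False, "Main_340", 1024, 1, 44100)
    main_v, backing_v = separator_backing(vocals, "out", "wav", 256, 0.25, False, "Version-1", 1024, 1, 44100)
    no_reverb_paths = separator_reverb("out", "wav", 256, 0.25, False, True, True, 1024, 1, 44100)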
-def separator_reverb(output, format, segments_size, overlap, denoise, original, backing_reverb, hop_length, batch_size, sample_rate):
-    if not os.path.exists(output):
-        logger.warning(translations["output_not_valid"])
-        sys.exit(1)
-
-    for i in [f"Original_Vocals_Reverb.{format}", f"Main_Vocals_Reverb.{format}", f"Original_Vocals_No_Reverb.{format}", f"Main_Vocals_No_Reverb.{format}"]:
-        if os.path.exists(os.path.join(output, i)): os.remove(os.path.join(output, i))
-
-    dereveb_path = []
-
-    if original:
-        try:
-            dereveb_path.append(os.path.join(output, [f for f in os.listdir(output) if 'Original_Vocals' in f][0]))
-        except IndexError:
-            logger.warning(translations["not_found_original_vocal"])
-            sys.exit(1)
-
-    if backing_reverb:
-        try:
-            dereveb_path.append(os.path.join(output, [f for f in os.listdir(output) if 'Main_Vocals' in f][0]))
-        except IndexError:
-            logger.warning(translations["not_found_main_vocal"])
-            sys.exit(1)
-
-        try:
-            dereveb_path.append(os.path.join(output, [f for f in os.listdir(output) if 'Backing_Vocals' in f][0]))
-        except IndexError:
-            logger.warning(translations["not_found_backing_vocal"])
-            sys.exit(1)
-
-    for path in dereveb_path:
-        if not os.path.exists(path):
-            logger.warning(translations["not_found"].format(name=path))
-            sys.exit(1)
-
-        if "Original_Vocals" in path:
-            reverb_path, no_reverb_path = os.path.join(output, f"Original_Vocals_Reverb.{format}"), os.path.join(output, f"Original_Vocals_No_Reverb.{format}")
-            start_title, end_title = translations["process_original"], translations["process_original_success"]
-        elif "Main_Vocals" in path:
-            reverb_path, no_reverb_path = os.path.join(output, f"Main_Vocals_Reverb.{format}"), os.path.join(output, f"Main_Vocals_No_Reverb.{format}")
-            start_title, end_title = translations["process_main"], translations["process_main_success"]
-        elif "Backing_Vocals" in path:
-            reverb_path, no_reverb_path = os.path.join(output, f"Backing_Vocals_Reverb.{format}"), os.path.join(output, f"Backing_Vocals_No_Reverb.{format}")
-            start_title, end_title = translations["process_backing"], translations["process_backing_success"]
-
-        logger.info(start_title)
-        output_dereveb = separator_main(audio_file=path, model_filename="Reverb_HQ_By_FoxJoy.onnx", output_format=format, output_dir=output, mdx_segment_size=segments_size, mdx_overlap=overlap, mdx_batch_size=batch_size, mdx_hop_length=hop_length, mdx_enable_denoise=denoise, sample_rate=sample_rate)
-
-        for f in output_dereveb:
-            out_file = os.path.join(output, f)
-            if not os.path.exists(out_file): logger.error(translations["not_found"].format(name=out_file))
-
-            if '_(Reverb)_' in f: os.rename(out_file, reverb_path)
-            elif '_(No Reverb)_' in f: os.rename(out_file, no_reverb_path)
-
-        logger.info(end_title)
-
-    return (os.path.join(output, f"Original_Vocals_No_Reverb.{format}") if original else None), (os.path.join(output, f"Main_Vocals_No_Reverb.{format}") if backing_reverb else None), (os.path.join(output, f"Backing_Vocals_No_Reverb.{format}") if backing_reverb else None)
-
-def separator_main(audio_file=None, model_filename="UVR-MDX-NET_Main_340.onnx", output_format="wav", output_dir=".", mdx_segment_size=256, mdx_overlap=0.25, mdx_batch_size=1, mdx_hop_length=1024, mdx_enable_denoise=True, demucs_segment_size=256, demucs_shifts=2, demucs_overlap=0.25, sample_rate=44100):
-    try:
-        separator = Separator(logger=logger, log_formatter=file_formatter, log_level=logging.INFO, output_dir=output_dir, output_format=output_format, output_bitrate=None, normalization_threshold=0.9, output_single_stem=None, invert_using_spec=False, sample_rate=sample_rate, mdx_params={"hop_length": mdx_hop_length, "segment_size": mdx_segment_size, "overlap": mdx_overlap, "batch_size": mdx_batch_size, "enable_denoise": mdx_enable_denoise}, demucs_params={"segment_size": demucs_segment_size, "shifts": demucs_shifts, "overlap": demucs_overlap, "segments_enabled": True})
-        separator.load_model(model_filename=model_filename)
-
-        return separator.separate(audio_file)
-    except Exception:
-        logger.debug(translations["default_setting"])
-        separator = Separator(logger=logger,
log_formatter=file_formatter, log_level=logging.INFO, output_dir=output_dir, output_format=output_format, output_bitrate=None, normalization_threshold=0.9, output_single_stem=None, invert_using_spec=False, sample_rate=44100, mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": mdx_enable_denoise}, demucs_params={"segment_size": 128, "shifts": 2, "overlap": 0.25, "segments_enabled": True}) - separator.load_model(model_filename=model_filename) - - return separator.separate(audio_file) - -if __name__ == "__main__": main() \ No newline at end of file diff --git a/main/inference/train.py b/main/inference/train.py deleted file mode 100644 index 5573c49d9b8fd06e38fcab042b8ebd9a55f6c09c..0000000000000000000000000000000000000000 --- a/main/inference/train.py +++ /dev/null @@ -1,980 +0,0 @@ -import os -import sys -import glob -import json -import torch -import hashlib -import logging -import argparse -import datetime -import warnings -import logging.handlers - -import numpy as np -import soundfile as sf -import matplotlib.pyplot as plt -import torch.distributed as dist -import torch.utils.data as tdata -import torch.multiprocessing as mp - -from tqdm import tqdm -from collections import OrderedDict -from random import randint, shuffle -from torch.utils.checkpoint import checkpoint -from torch.cuda.amp import GradScaler, autocast -from torch.utils.tensorboard import SummaryWriter - -from time import time as ttime -from torch.nn import functional as F -from distutils.util import strtobool -from librosa.filters import mel as librosa_mel_fn -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.nn.utils.parametrizations import spectral_norm, weight_norm - -sys.path.append(os.getcwd()) -from main.configs.config import Config -from main.library.algorithm.residuals import LRELU_SLOPE -from main.library.algorithm.synthesizers import Synthesizer -from main.library.algorithm.commons import get_padding, slice_segments, clip_grad_value - -MATPLOTLIB_FLAG = False -translations = Config().translations -warnings.filterwarnings("ignore") -logging.getLogger("torch").setLevel(logging.ERROR) - -class HParams: - def __init__(self, **kwargs): - for k, v in kwargs.items(): - self[k] = HParams(**v) if isinstance(v, dict) else v - - def keys(self): - return self.__dict__.keys() - - def items(self): - return self.__dict__.items() - - def values(self): - return self.__dict__.values() - - def __len__(self): - return len(self.__dict__) - - def __getitem__(self, key): - return self.__dict__[key] - - def __setitem__(self, key, value): - self.__dict__[key] = value - - def __contains__(self, key): - return key in self.__dict__ - - def __repr__(self): - return repr(self.__dict__) - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, required=True) - parser.add_argument("--rvc_version", type=str, default="v2") - parser.add_argument("--save_every_epoch", type=int, required=True) - parser.add_argument("--save_only_latest", type=lambda x: bool(strtobool(x)), default=True) - parser.add_argument("--save_every_weights", type=lambda x: bool(strtobool(x)), default=True) - parser.add_argument("--total_epoch", type=int, default=300) - parser.add_argument("--sample_rate", type=int, required=True) - parser.add_argument("--batch_size", type=int, default=8) - parser.add_argument("--gpu", type=str, default="0") - parser.add_argument("--pitch_guidance", type=lambda x: bool(strtobool(x)), default=True) - 
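# Boolean flags arrive as strings from the caller, so they are routed
-    # through distutils.util.strtobool before the bool() cast.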
parser.add_argument("--g_pretrained_path", type=str, default="") - parser.add_argument("--d_pretrained_path", type=str, default="") - parser.add_argument("--overtraining_detector", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--overtraining_threshold", type=int, default=50) - parser.add_argument("--cleanup", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--cache_data_in_gpu", type=lambda x: bool(strtobool(x)), default=False) - parser.add_argument("--model_author", type=str) - parser.add_argument("--vocoder", type=str, default="Default") - parser.add_argument("--checkpointing", type=lambda x: bool(strtobool(x)), default=False) - - return parser.parse_args() - -args = parse_arguments() -model_name, save_every_epoch, total_epoch, pretrainG, pretrainD, version, gpus, batch_size, sample_rate, pitch_guidance, save_only_latest, save_every_weights, cache_data_in_gpu, overtraining_detector, overtraining_threshold, cleanup, model_author, vocoder, checkpointing = args.model_name, args.save_every_epoch, args.total_epoch, args.g_pretrained_path, args.d_pretrained_path, args.rvc_version, args.gpu, args.batch_size, args.sample_rate, args.pitch_guidance, args.save_only_latest, args.save_every_weights, args.cache_data_in_gpu, args.overtraining_detector, args.overtraining_threshold, args.cleanup, args.model_author, args.vocoder, args.checkpointing - -experiment_dir = os.path.join("assets", "logs", model_name) -training_file_path = os.path.join(experiment_dir, "training_data.json") -config_save_path = os.path.join(experiment_dir, "config.json") - -os.environ["CUDA_VISIBLE_DEVICES"] = gpus.replace("-", ",") -n_gpus = len(gpus.split("-")) - -torch.backends.cudnn.deterministic = False -torch.backends.cudnn.benchmark = False - -lowest_value = {"step": 0, "value": float("inf"), "epoch": 0} -global_step, last_loss_gen_all, overtrain_save_epoch = 0, 0, 0 -loss_gen_history, smoothed_loss_gen_history, loss_disc_history, smoothed_loss_disc_history = [], [], [], [] - -with open(config_save_path, "r") as f: - config = json.load(f) - -config = HParams(**config) -config.data.training_files = os.path.join(experiment_dir, "filelist.txt") -logger = logging.getLogger(__name__) - -if logger.hasHandlers(): logger.handlers.clear() -else: - console_handler = logging.StreamHandler() - console_handler.setFormatter(logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")) - console_handler.setLevel(logging.INFO) - file_handler = logging.handlers.RotatingFileHandler(os.path.join(experiment_dir, "train.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8') - file_handler.setFormatter(logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")) - file_handler.setLevel(logging.DEBUG) - logger.addHandler(console_handler) - logger.addHandler(file_handler) - logger.setLevel(logging.DEBUG) - -log_data = {translations['modelname']: model_name, translations["save_every_epoch"]: save_every_epoch, translations["total_e"]: total_epoch, translations["dorg"].format(pretrainG=pretrainG, pretrainD=pretrainD): "", translations['training_version']: version, "Gpu": gpus, translations['batch_size']: batch_size, translations['pretrain_sr']: sample_rate, translations['training_f0']: pitch_guidance, translations['save_only_latest']: save_only_latest, translations['save_every_weights']: save_every_weights, translations['cache_in_gpu']: cache_data_in_gpu, 
-for key, value in log_data.items():
-    logger.debug(f"{key}: {value}" if value != "" else f"{key} {value}")
-
-def main():
-    global training_file_path, last_loss_gen_all, smoothed_loss_gen_history, loss_gen_history, loss_disc_history, smoothed_loss_disc_history, overtrain_save_epoch, model_author, vocoder, checkpointing
-
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(randint(20000, 55555))
-
-    if torch.cuda.is_available(): device, n_gpus = torch.device("cuda"), torch.cuda.device_count()
-    elif torch.backends.mps.is_available(): device, n_gpus = torch.device("mps"), 1
-    else: device, n_gpus = torch.device("cpu"), 1
-
-    def start():
-        children = []
-        pid_data = {"process_pids": []}
-
-        with open(config_save_path, "r") as pid_file:
-            try:
-                pid_data.update(json.load(pid_file))
-            except json.JSONDecodeError:
-                pass
-
-        with open(config_save_path, "w") as pid_file:
-            for i in range(n_gpus):
-                subproc = mp.Process(target=run, args=(i, n_gpus, experiment_dir, pretrainG, pretrainD, pitch_guidance, total_epoch, save_every_weights, config, device, model_author, vocoder, checkpointing))
-                children.append(subproc)
-                subproc.start()
-                pid_data["process_pids"].append(subproc.pid)
-
-            json.dump(pid_data, pid_file, indent=4)
-
-        for i in range(n_gpus):
-            children[i].join()
-
-    def load_from_json(file_path):
-        if os.path.exists(file_path):
-            with open(file_path, "r") as f:
-                data = json.load(f)
-            return (data.get("loss_disc_history", []), data.get("smoothed_loss_disc_history", []), data.get("loss_gen_history", []), data.get("smoothed_loss_gen_history", []))
-        return [], [], [], []
-
-    def continue_overtrain_detector(training_file_path):
-        # global, otherwise the restored histories only bind to locals and are discarded
-        global loss_disc_history, smoothed_loss_disc_history, loss_gen_history, smoothed_loss_gen_history
-        if overtraining_detector and os.path.exists(training_file_path): (loss_disc_history, smoothed_loss_disc_history, loss_gen_history, smoothed_loss_gen_history) = load_from_json(training_file_path)
-
-    n_gpus = torch.cuda.device_count()
-
-    if not torch.cuda.is_available() and torch.backends.mps.is_available(): n_gpus = 1
-    if n_gpus < 1:
-        logger.warning(translations["not_gpu"])
-        n_gpus = 1
-
-    if cleanup:
-        for root, dirs, files in os.walk(experiment_dir, topdown=False):
-            for name in files:
-                file_path = os.path.join(root, name)
-                _, file_extension = os.path.splitext(name)
-                if (file_extension == ".0" or (name.startswith("D_") and file_extension == ".pth") or (name.startswith("G_") and file_extension == ".pth") or (file_extension == ".index")): os.remove(file_path)
-
-            for name in dirs:
-                if name == "eval":
-                    folder_path = os.path.join(root, name)
-                    for item in os.listdir(folder_path):
-                        item_path = os.path.join(folder_path, item)
-                        if os.path.isfile(item_path): os.remove(item_path)
-                    os.rmdir(folder_path)
-
-    continue_overtrain_detector(training_file_path)
-    start()
-
-def plot_spectrogram_to_numpy(spectrogram):
-    global MATPLOTLIB_FLAG
-
-    if not MATPLOTLIB_FLAG:
-        plt.switch_backend("Agg")
-        MATPLOTLIB_FLAG = True
-
-    fig, ax = plt.subplots(figsize=(10, 2))
-
-    plt.colorbar(ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none"), ax=ax)
-    plt.xlabel("Frames")
-    plt.ylabel("Channels")
-    plt.tight_layout()
-    fig.canvas.draw()
-    plt.close(fig)
-
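-    # Copy the rendered Agg canvas out as an HxWx3 uint8 array; summarize()
-    # logs it to TensorBoard with dataformats="HWC".
-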
return np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8).reshape(fig.canvas.get_width_height()[::-1] + (3,)) - -def verify_checkpoint_shapes(checkpoint_path, model): - checkpoint = torch.load(checkpoint_path, map_location="cpu") - checkpoint_state_dict = checkpoint["model"] - try: - model_state_dict = model.module.load_state_dict(checkpoint_state_dict) if hasattr(model, "module") else model.load_state_dict(checkpoint_state_dict) - except RuntimeError: - logger.warning(translations["checkpointing_err"]) - sys.exit(1) - else: del checkpoint, checkpoint_state_dict, model_state_dict - -def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sample_rate=22050): - for k, v in scalars.items(): - writer.add_scalar(k, v, global_step) - - for k, v in histograms.items(): - writer.add_histogram(k, v, global_step) - - for k, v in images.items(): - writer.add_image(k, v, global_step, dataformats="HWC") - - for k, v in audios.items(): - writer.add_audio(k, v, global_step, audio_sample_rate) - -def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): - assert os.path.isfile(checkpoint_path), translations["not_found_checkpoint"].format(checkpoint_path=checkpoint_path) - checkpoint_dict = replace_keys_in_dict(replace_keys_in_dict(torch.load(checkpoint_path, map_location="cpu"), ".weight_v", ".parametrizations.weight.original1"), ".weight_g", ".parametrizations.weight.original0") - new_state_dict = {k: checkpoint_dict["model"].get(k, v) for k, v in (model.module.state_dict() if hasattr(model, "module") else model.state_dict()).items()} - - if hasattr(model, "module"): model.module.load_state_dict(new_state_dict, strict=False) - else: model.load_state_dict(new_state_dict, strict=False) - - if optimizer and load_opt == 1: optimizer.load_state_dict(checkpoint_dict.get("optimizer", {})) - logger.debug(translations["save_checkpoint"].format(checkpoint_path=checkpoint_path, checkpoint_dict=checkpoint_dict['iteration'])) - return (model, optimizer, checkpoint_dict.get("learning_rate", 0), checkpoint_dict["iteration"]) - -def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - state_dict = (model.module.state_dict() if hasattr(model, "module") else model.state_dict()) - torch.save(replace_keys_in_dict(replace_keys_in_dict({"model": state_dict, "iteration": iteration, "optimizer": optimizer.state_dict(), "learning_rate": learning_rate}, ".parametrizations.weight.original1", ".weight_v"), ".parametrizations.weight.original0", ".weight_g"), checkpoint_path) - logger.info(translations["save_model"].format(checkpoint_path=checkpoint_path, iteration=iteration)) - -def latest_checkpoint_path(dir_path, regex="G_*.pth"): - checkpoints = sorted(glob.glob(os.path.join(dir_path, regex)), key=lambda f: int("".join(filter(str.isdigit, f)))) - return checkpoints[-1] if checkpoints else None - -def load_wav_to_torch(full_path): - data, sample_rate = sf.read(full_path, dtype='float32') - return torch.FloatTensor(data.astype(np.float32)), sample_rate - -def load_filepaths_and_text(filename, split="|"): - with open(filename, encoding="utf-8") as f: - return [line.strip().split(split) for line in f] - -def feature_loss(fmap_r, fmap_g): - loss = 0 - for dr, dg in zip(fmap_r, fmap_g): - for rl, gl in zip(dr, dg): - loss += torch.mean(torch.abs(rl.float().detach() - gl.float())) - return loss * 2 - -def discriminator_loss(disc_real_outputs, disc_generated_outputs): - loss = 0 - r_losses, g_losses = [], [] - - for dr, dg in zip(disc_real_outputs, 
disc_generated_outputs): - dr = dr.float() - dg = dg.float() - r_loss = torch.mean((1 - dr) ** 2) - g_loss = torch.mean(dg**2) - loss += r_loss + g_loss - r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) - return loss, r_losses, g_losses - -def generator_loss(disc_outputs): - loss = 0 - gen_losses = [] - - for dg in disc_outputs: - l = torch.mean((1 - dg.float()) ** 2) - gen_losses.append(l) - loss += l - return loss, gen_losses - -def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): - z_p = z_p.float() - logs_q = logs_q.float() - m_p = m_p.float() - logs_p = logs_p.float() - z_mask = z_mask.float() - kl = logs_p - logs_q - 0.5 - kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) - return torch.sum(kl * z_mask) / torch.sum(z_mask) - -class TextAudioLoaderMultiNSFsid(tdata.Dataset): - def __init__(self, hparams): - self.audiopaths_and_text = load_filepaths_and_text(hparams.training_files) - self.max_wav_value = hparams.max_wav_value - self.sample_rate = hparams.sample_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sample_rate = hparams.sample_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) - self._filter() - - def _filter(self): - audiopaths_and_text_new, lengths = [], [] - for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) - lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) - - self.audiopaths_and_text = audiopaths_and_text_new - self.lengths = lengths - - def get_sid(self, sid): - try: - sid = torch.LongTensor([int(sid)]) - except ValueError as e: - logger.error(translations["sid_error"].format(sid=sid, e=e)) - sid = torch.LongTensor([0]) - return sid - - def get_audio_text_pair(self, audiopath_and_text): - phone, pitch, pitchf = self.get_labels(audiopath_and_text[1], audiopath_and_text[2], audiopath_and_text[3]) - spec, wav = self.get_audio(audiopath_and_text[0]) - dv = self.get_sid(audiopath_and_text[4]) - len_phone = phone.size()[0] - len_spec = spec.size()[-1] - - if len_phone != len_spec: - len_min = min(len_phone, len_spec) - len_wav = len_min * self.hop_length - spec, wav, phone = spec[:, :len_min], wav[:, :len_wav], phone[:len_min, :] - pitch, pitchf = pitch[:len_min], pitchf[:len_min] - return (spec, wav, phone, pitch, pitchf, dv) - - def get_labels(self, phone, pitch, pitchf): - phone = np.repeat(np.load(phone), 2, axis=0) - n_num = min(phone.shape[0], 900) - return torch.FloatTensor(phone[:n_num, :]), torch.LongTensor(np.load(pitch)[:n_num]), torch.FloatTensor(np.load(pitchf)[:n_num]) - - def get_audio(self, filename): - audio, sample_rate = load_wav_to_torch(filename) - if sample_rate != self.sample_rate: raise ValueError(translations["sr_does_not_match"].format(sample_rate=sample_rate, sample_rate2=self.sample_rate)) - audio_norm = audio.unsqueeze(0) - spec_filename = filename.replace(".wav", ".spec.pt") - - if os.path.exists(spec_filename): - try: - spec = torch.load(spec_filename) - except Exception as e: - logger.error(translations["spec_error"].format(spec_filename=spec_filename, e=e)) - spec = torch.squeeze(spectrogram_torch(audio_norm, self.filter_length, self.hop_length, self.win_length, center=False), 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - else: - spec = 
torch.squeeze(spectrogram_torch(audio_norm, self.filter_length, self.hop_length, self.win_length, center=False), 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - return spec, audio_norm - - def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - -class TextAudioCollateMultiNSFsid: - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True) - spec_lengths, wave_lengths = torch.LongTensor(len(batch)), torch.LongTensor(len(batch)) - spec_padded, wave_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max([x[0].size(1) for x in batch])), torch.FloatTensor(len(batch), 1, max([x[1].size(1) for x in batch])) - spec_padded.zero_() - wave_padded.zero_() - max_phone_len = max([x[2].size(0) for x in batch]) - phone_lengths, phone_padded = torch.LongTensor(len(batch)), torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1]) - pitch_padded, pitchf_padded = torch.LongTensor(len(batch), max_phone_len), torch.FloatTensor(len(batch), max_phone_len) - phone_padded.zero_() - pitch_padded.zero_() - pitchf_padded.zero_() - sid = torch.LongTensor(len(batch)) - - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - spec = row[0] - spec_padded[i, :, : spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - wave = row[1] - wave_padded[i, :, : wave.size(1)] = wave - wave_lengths[i] = wave.size(1) - phone = row[2] - phone_padded[i, : phone.size(0), :] = phone - phone_lengths[i] = phone.size(0) - pitch = row[3] - pitch_padded[i, : pitch.size(0)] = pitch - pitchf = row[4] - pitchf_padded[i, : pitchf.size(0)] = pitchf - sid[i] = row[5] - return (phone_padded, phone_lengths, pitch_padded, pitchf_padded, spec_padded, spec_lengths, wave_padded, wave_lengths, sid) - -class TextAudioLoader(tdata.Dataset): - def __init__(self, hparams): - self.audiopaths_and_text = load_filepaths_and_text(hparams.training_files) - self.max_wav_value = hparams.max_wav_value - self.sample_rate = hparams.sample_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sample_rate = hparams.sample_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) - self._filter() - - def _filter(self): - audiopaths_and_text_new, lengths = [], [] - for entry in self.audiopaths_and_text: - if len(entry) >= 3: - audiopath, text, dv = entry[:3] - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text, dv]) - lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) - - self.audiopaths_and_text = audiopaths_and_text_new - self.lengths = lengths - - def get_sid(self, sid): - try: - sid = torch.LongTensor([int(sid)]) - except ValueError as e: - logger.error(translations["sid_error"].format(sid=sid, e=e)) - sid = torch.LongTensor([0]) - return sid - - def get_audio_text_pair(self, audiopath_and_text): - phone = self.get_labels(audiopath_and_text[1]) - spec, wav = self.get_audio(audiopath_and_text[0]) - dv = self.get_sid(audiopath_and_text[2]) - len_phone = phone.size()[0] - len_spec = spec.size()[-1] - - if len_phone != len_spec: - len_min = min(len_phone, len_spec) - len_wav = len_min * self.hop_length - spec = 
spec[:, :len_min] - wav = wav[:, :len_wav] - phone = phone[:len_min, :] - return (spec, wav, phone, dv) - - def get_labels(self, phone): - phone = np.repeat(np.load(phone), 2, axis=0) - return torch.FloatTensor(phone[:min(phone.shape[0], 900), :]) - - def get_audio(self, filename): - audio, sample_rate = load_wav_to_torch(filename) - if sample_rate != self.sample_rate: raise ValueError(translations["sr_does_not_match"].format(sample_rate=sample_rate, sample_rate2=self.sample_rate)) - audio_norm = audio.unsqueeze(0) - spec_filename = filename.replace(".wav", ".spec.pt") - - if os.path.exists(spec_filename): - try: - spec = torch.load(spec_filename) - except Exception as e: - logger.error(translations["spec_error"].format(spec_filename=spec_filename, e=e)) - spec = torch.squeeze(spectrogram_torch(audio_norm, self.filter_length, self.hop_length, self.win_length, center=False), 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - else: - spec = torch.squeeze(spectrogram_torch(audio_norm, self.filter_length, self.hop_length, self.win_length, center=False), 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - return spec, audio_norm - - def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - -class TextAudioCollate: - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - _, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True) - spec_lengths, wave_lengths = torch.LongTensor(len(batch)), torch.LongTensor(len(batch)) - spec_padded, wave_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max([x[0].size(1) for x in batch])), torch.FloatTensor(len(batch), 1, max([x[1].size(1) for x in batch])) - spec_padded.zero_() - wave_padded.zero_() - max_phone_len = max([x[2].size(0) for x in batch]) - phone_lengths, phone_padded = torch.LongTensor(len(batch)), torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1]) - phone_padded.zero_() - sid = torch.LongTensor(len(batch)) - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - spec = row[0] - spec_padded[i, :, : spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - wave = row[1] - wave_padded[i, :, : wave.size(1)] = wave - wave_lengths[i] = wave.size(1) - phone = row[2] - phone_padded[i, : phone.size(0), :] = phone - phone_lengths[i] = phone.size(0) - sid[i] = row[3] - return (phone_padded, phone_lengths, spec_padded, spec_lengths, wave_padded, wave_lengths, sid) - -class DistributedBucketSampler(tdata.distributed.DistributedSampler): - def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True): - super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) - self.lengths = dataset.lengths - self.batch_size = batch_size - self.boundaries = boundaries - self.buckets, self.num_samples_per_bucket = self._create_buckets() - self.total_size = sum(self.num_samples_per_bucket) - self.num_samples = self.total_size // self.num_replicas - - def _create_buckets(self): - buckets = [[] for _ in range(len(self.boundaries) - 1)] - for i in range(len(self.lengths)): - idx_bucket = self._bisect(self.lengths[i]) - if idx_bucket != -1: buckets[idx_bucket].append(i) - - for i in range(len(buckets) - 1, -1, -1): - if len(buckets[i]) == 0: - buckets.pop(i) - self.boundaries.pop(i + 1) - - num_samples_per_bucket = [] - for 
i in range(len(buckets)): - len_bucket = len(buckets[i]) - total_batch_size = self.num_replicas * self.batch_size - num_samples_per_bucket.append(len_bucket + ((total_batch_size - (len_bucket % total_batch_size)) % total_batch_size)) - return buckets, num_samples_per_bucket - - def __iter__(self): - g = torch.Generator() - g.manual_seed(self.epoch) - indices, batches = [], [] - if self.shuffle: - for bucket in self.buckets: - indices.append(torch.randperm(len(bucket), generator=g).tolist()) - else: - for bucket in self.buckets: - indices.append(list(range(len(bucket)))) - - for i in range(len(self.buckets)): - bucket = self.buckets[i] - len_bucket = len(bucket) - ids_bucket = indices[i] - rem = self.num_samples_per_bucket[i] - len_bucket - ids_bucket = (ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[: (rem % len_bucket)])[self.rank :: self.num_replicas] - - for j in range(len(ids_bucket) // self.batch_size): - batches.append([bucket[idx] for idx in ids_bucket[j * self.batch_size : (j + 1) * self.batch_size]]) - - if self.shuffle: batches = [batches[i] for i in torch.randperm(len(batches), generator=g).tolist()] - self.batches = batches - assert len(self.batches) * self.batch_size == self.num_samples - return iter(self.batches) - - def _bisect(self, x, lo=0, hi=None): - if hi is None: hi = len(self.boundaries) - 1 - - if hi > lo: - mid = (hi + lo) // 2 - if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: return mid - elif x <= self.boundaries[mid]: return self._bisect(x, lo, mid) - else: return self._bisect(x, mid + 1, hi) - else: return -1 - - def __len__(self): - return self.num_samples // self.batch_size - -class MultiPeriodDiscriminator(torch.nn.Module): - def __init__(self, version, use_spectral_norm=False, checkpointing=False): - super(MultiPeriodDiscriminator, self).__init__() - self.checkpointing = checkpointing - periods = ([2, 3, 5, 7, 11, 17] if version == "v1" else [2, 3, 5, 7, 11, 17, 23, 37]) - self.discriminators = torch.nn.ModuleList([DiscriminatorS(use_spectral_norm=use_spectral_norm, checkpointing=checkpointing)] + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm, checkpointing=checkpointing) for p in periods]) - - def forward(self, y, y_hat): - y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], [] - for d in self.discriminators: - if self.training and self.checkpointing: - def forward_discriminator(d, y, y_hat): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - return y_d_r, fmap_r, y_d_g, fmap_g - y_d_r, fmap_r, y_d_g, fmap_g = checkpoint(forward_discriminator, d, y, y_hat, use_reentrant=False) - else: - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - - y_d_rs.append(y_d_r); fmap_rs.append(fmap_r) - y_d_gs.append(y_d_g); fmap_gs.append(fmap_g) - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - -class DiscriminatorS(torch.nn.Module): - def __init__(self, use_spectral_norm=False, checkpointing=False): - super(DiscriminatorS, self).__init__() - self.checkpointing = checkpointing - norm_f = spectral_norm if use_spectral_norm else weight_norm - self.convs = torch.nn.ModuleList([norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)), norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)), norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)), norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)), norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2))]) - self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1)) - self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) - - 
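# During training with checkpointing enabled, forward() recomputes each
-    # conv + LeakyReLU pair in the backward pass via torch.utils.checkpoint,
-    # trading extra compute for lower peak activation memory.
-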
def forward(self, x): - fmap = [] - for conv in self.convs: - x = checkpoint(self.lrelu, checkpoint(conv, x, use_reentrant = False), use_reentrant = False) if self.training and self.checkpointing else self.lrelu(conv(x)) - fmap.append(x) - - x = self.conv_post(x) - fmap.append(x) - return torch.flatten(x, 1, -1), fmap - -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False, checkpointing=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.checkpointing = checkpointing - norm_f = spectral_norm if use_spectral_norm else weight_norm - self.convs = torch.nn.ModuleList([norm_f(torch.nn.Conv2d(in_ch, out_ch, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))) for in_ch, out_ch in zip([1, 32, 128, 512, 1024], [32, 128, 512, 1024, 1024])]) - self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) - - def forward(self, x): - fmap = [] - b, c, t = x.shape - if t % self.period != 0: x = torch.nn.functional.pad(x, (0, (self.period - (t % self.period))), "reflect") - x = x.view(b, c, -1, self.period) - for conv in self.convs: - x = checkpoint(self.lrelu, checkpoint(conv, x, use_reentrant = False), use_reentrant = False) if self.training and self.checkpointing else self.lrelu(conv(x)) - fmap.append(x) - - x = self.conv_post(x) - fmap.append(x) - return torch.flatten(x, 1, -1), fmap - -class EpochRecorder: - def __init__(self): - self.last_time = ttime() - - def record(self): - now_time = ttime() - elapsed_time = now_time - self.last_time - self.last_time = now_time - return translations["time_or_speed_training"].format(current_time=datetime.datetime.now().strftime("%H:%M:%S"), elapsed_time_str=str(datetime.timedelta(seconds=int(round(elapsed_time, 1))))) - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - -def dynamic_range_decompression_torch(x, C=1): - return torch.exp(x) / C - -def spectral_normalize_torch(magnitudes): - return dynamic_range_compression_torch(magnitudes) - -def spectral_de_normalize_torch(magnitudes): - return dynamic_range_decompression_torch(magnitudes) - -mel_basis, hann_window = {}, {} - -def spectrogram_torch(y, n_fft, hop_size, win_size, center=False): - global hann_window - - wnsize_dtype_device = str(win_size) + "_" + str(y.dtype) + "_" + str(y.device) - if wnsize_dtype_device not in hann_window: hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - spec = torch.stft(torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect").squeeze(1), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True) - return torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) - -def spec_to_mel_torch(spec, n_fft, num_mels, sample_rate, fmin, fmax): - global mel_basis - - fmax_dtype_device = str(fmax) + "_" + str(spec.dtype) + "_" + str(spec.device) - if fmax_dtype_device not in mel_basis: mel_basis[fmax_dtype_device] = torch.from_numpy(librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)).to(dtype=spec.dtype, device=spec.device) - return spectral_normalize_torch(torch.matmul(mel_basis[fmax_dtype_device], spec)) - -def mel_spectrogram_torch(y, n_fft, num_mels, sample_rate, hop_size, win_size, fmin, fmax, 
center=False): - return spec_to_mel_torch(spectrogram_torch(y, n_fft, hop_size, win_size, center), n_fft, num_mels, sample_rate, fmin, fmax) - -def replace_keys_in_dict(d, old_key_part, new_key_part): - updated_dict = OrderedDict() if isinstance(d, OrderedDict) else {} - for key, value in d.items(): - updated_dict[(key.replace(old_key_part, new_key_part) if isinstance(key, str) else key)] = (replace_keys_in_dict(value, old_key_part, new_key_part) if isinstance(value, dict) else value) - return updated_dict - -def extract_model(ckpt, sr, pitch_guidance, name, model_path, epoch, step, version, hps, model_author, vocoder): - try: - logger.info(translations["savemodel"].format(model_dir=model_path, epoch=epoch, step=step)) - os.makedirs(os.path.dirname(model_path), exist_ok=True) - - opt = OrderedDict(weight={key: value.half() for key, value in ckpt.items() if "enc_q" not in key}) - opt["config"] = [hps.data.filter_length // 2 + 1, 32, hps.model.inter_channels, hps.model.hidden_channels, hps.model.filter_channels, hps.model.n_heads, hps.model.n_layers, hps.model.kernel_size, hps.model.p_dropout, hps.model.resblock, hps.model.resblock_kernel_sizes, hps.model.resblock_dilation_sizes, hps.model.upsample_rates, hps.model.upsample_initial_channel, hps.model.upsample_kernel_sizes, hps.model.spk_embed_dim, hps.model.gin_channels, hps.data.sample_rate] - opt["epoch"] = f"{epoch}epoch" - opt["step"] = step - opt["sr"] = sr - opt["f0"] = int(pitch_guidance) - opt["version"] = version - opt["creation_date"] = datetime.datetime.now().isoformat() - opt["model_hash"] = hashlib.sha256(f"{str(ckpt)} {epoch} {step} {datetime.datetime.now().isoformat()}".encode()).hexdigest() - opt["model_name"] = name - opt["author"] = model_author - opt["vocoder"] = vocoder - - torch.save(replace_keys_in_dict(replace_keys_in_dict(opt, ".parametrizations.weight.original1", ".weight_v"), ".parametrizations.weight.original0", ".weight_g"), model_path) - except Exception as e: - logger.error(f"{translations['extract_model_error']}: {e}") - -def run(rank, n_gpus, experiment_dir, pretrainG, pretrainD, pitch_guidance, custom_total_epoch, custom_save_every_weights, config, device, model_author, vocoder, checkpointing): - global global_step - - if rank == 0: writer_eval = SummaryWriter(log_dir=os.path.join(experiment_dir, "eval")) - else: writer_eval = None - - dist.init_process_group(backend="gloo", init_method="env://", world_size=n_gpus, rank=rank) - torch.manual_seed(config.train.seed) - if torch.cuda.is_available(): torch.cuda.set_device(rank) - - train_dataset = TextAudioLoaderMultiNSFsid(config.data) - train_loader = tdata.DataLoader(train_dataset, num_workers=4, shuffle=False, pin_memory=True, collate_fn=TextAudioCollateMultiNSFsid(), batch_sampler=DistributedBucketSampler(train_dataset, batch_size * n_gpus, [100, 200, 300, 400, 500, 600, 700, 800, 900], num_replicas=n_gpus, rank=rank, shuffle=True), persistent_workers=True, prefetch_factor=8) - - net_g, net_d = Synthesizer(config.data.filter_length // 2 + 1, config.train.segment_size // config.data.hop_length, **config.model, use_f0=pitch_guidance, sr=sample_rate, vocoder=vocoder, checkpointing=checkpointing), MultiPeriodDiscriminator(version, config.model.use_spectral_norm, checkpointing=checkpointing) - net_g, net_d = (net_g.cuda(rank), net_d.cuda(rank)) if torch.cuda.is_available() else (net_g.to(device), net_d.to(device)) - optim_g, optim_d = torch.optim.AdamW(net_g.parameters(), config.train.learning_rate, betas=config.train.betas, eps=config.train.eps), 
torch.optim.AdamW(net_d.parameters(), config.train.learning_rate, betas=config.train.betas, eps=config.train.eps) - net_g, net_d = (DDP(net_g, device_ids=[rank]), DDP(net_d, device_ids=[rank])) if torch.cuda.is_available() else (DDP(net_g), DDP(net_d)) - - try: - logger.info(translations["start_training"]) - _, _, _, epoch_str = load_checkpoint((os.path.join(experiment_dir, "D_latest.pth") if save_only_latest else latest_checkpoint_path(experiment_dir, "D_*.pth")), net_d, optim_d) - _, _, _, epoch_str = load_checkpoint((os.path.join(experiment_dir, "G_latest.pth") if save_only_latest else latest_checkpoint_path(experiment_dir, "G_*.pth")), net_g, optim_g) - epoch_str += 1 - global_step = (epoch_str - 1) * len(train_loader) - except: - epoch_str, global_step = 1, 0 - - if pretrainG != "" and pretrainG != "None": - if rank == 0: - verify_checkpoint_shapes(pretrainG, net_g) - logger.info(translations["import_pretrain"].format(dg="G", pretrain=pretrainG)) - - if hasattr(net_g, "module"): net_g.module.load_state_dict(torch.load(pretrainG, map_location="cpu")["model"]) - else: net_g.load_state_dict(torch.load(pretrainG, map_location="cpu")["model"]) - else: logger.warning(translations["not_using_pretrain"].format(dg="G")) - - if pretrainD != "" and pretrainD != "None": - if rank == 0: - verify_checkpoint_shapes(pretrainD, net_d) - logger.info(translations["import_pretrain"].format(dg="D", pretrain=pretrainD)) - - if hasattr(net_d, "module"): net_d.module.load_state_dict(torch.load(pretrainD, map_location="cpu")["model"]) - else: net_d.load_state_dict(torch.load(pretrainD, map_location="cpu")["model"]) - else: logger.warning(translations["not_using_pretrain"].format(dg="D")) - - scheduler_g, scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=config.train.lr_decay, last_epoch=epoch_str - 2), torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=config.train.lr_decay, last_epoch=epoch_str - 2) - optim_d.step(); optim_g.step() - - scaler = GradScaler(enabled=False) - cache = [] - - for info in train_loader: - phone, phone_lengths, pitch, pitchf, _, _, _, _, sid = info - reference = (phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True), (pitch.cuda(rank, non_blocking=True) if pitch_guidance else None), (pitchf.cuda(rank, non_blocking=True) if pitch_guidance else None), sid.cuda(rank, non_blocking=True)) if device.type == "cuda" else (phone.to(device), phone_lengths.to(device), (pitch.to(device) if pitch_guidance else None), (pitchf.to(device) if pitch_guidance else None), sid.to(device)) - break - - for epoch in range(epoch_str, total_epoch + 1): - train_and_evaluate(rank, epoch, config, [net_g, net_d], [optim_g, optim_d], scaler, train_loader, writer_eval, cache, custom_save_every_weights, custom_total_epoch, device, reference, model_author, vocoder) - scheduler_g.step(); scheduler_d.step() - -def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, train_loader, writer, cache, custom_save_every_weights, custom_total_epoch, device, reference, model_author, vocoder): - global global_step, lowest_value, loss_disc, consecutive_increases_gen, consecutive_increases_disc - - if epoch == 1: - lowest_value = {"step": 0, "value": float("inf"), "epoch": 0} - last_loss_gen_all, consecutive_increases_gen, consecutive_increases_disc = 0.0, 0, 0 - - net_g, net_d = nets - optim_g, optim_d = optims - train_loader.batch_sampler.set_epoch(epoch) - - net_g.train(); net_d.train() - - if device.type == "cuda" and cache_data_in_gpu: - data_iterator = cache - if cache == 
[]: - for batch_idx, info in enumerate(train_loader): - cache.append((batch_idx, [tensor.cuda(rank, non_blocking=True) for tensor in info])) - else: shuffle(cache) - else: data_iterator = enumerate(train_loader) - - epoch_recorder = EpochRecorder() - - with tqdm(total=len(train_loader), leave=False) as pbar: - for batch_idx, info in data_iterator: - if device.type == "cuda" and not cache_data_in_gpu: info = [tensor.cuda(rank, non_blocking=True) for tensor in info] - elif device.type != "cuda": info = [tensor.to(device) for tensor in info] - - phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, _, sid = info - pitch = pitch if pitch_guidance else None - pitchf = pitchf if pitch_guidance else None - - with autocast(enabled=False): - y_hat, ids_slice, _, z_mask, (_, z_p, m_p, logs_p, _, logs_q) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid) - mel = spec_to_mel_torch(spec, config.data.filter_length, config.data.n_mel_channels, config.data.sample_rate, config.data.mel_fmin, config.data.mel_fmax) - y_mel = slice_segments(mel, ids_slice, config.train.segment_size // config.data.hop_length, dim=3) - - with autocast(enabled=False): - y_hat_mel = mel_spectrogram_torch(y_hat.float().squeeze(1), config.data.filter_length, config.data.n_mel_channels, config.data.sample_rate, config.data.hop_length, config.data.win_length, config.data.mel_fmin, config.data.mel_fmax) - - wave = slice_segments(wave, ids_slice * config.data.hop_length, config.train.segment_size, dim=3) - y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) - - with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) - - optim_d.zero_grad() - scaler.scale(loss_disc).backward() - scaler.unscale_(optim_d) - grad_norm_d = clip_grad_value(net_d.parameters(), None) - scaler.step(optim_d) - - with autocast(enabled=False): - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) - with autocast(enabled=False): - loss_mel = F.l1_loss(y_mel, y_hat_mel) * config.train.c_mel - loss_kl = (kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl) - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl - if loss_gen_all < lowest_value["value"]: - lowest_value["value"] = loss_gen_all - lowest_value["step"] = global_step - lowest_value["epoch"] = epoch - if epoch > lowest_value["epoch"]: logger.warning(translations["training_warning"]) - - optim_g.zero_grad() - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = clip_grad_value(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - - if rank == 0 and global_step % config.train.log_interval == 0: - if loss_mel > 75: loss_mel = 75 - if loss_kl > 9: loss_kl = 9 - - scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc, "learning_rate": optim_g.param_groups[0]["lr"], "grad/norm_d": grad_norm_d, "grad/norm_g": grad_norm_g, "loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl} - scalar_dict.update({f"loss/g/{i}": v for i, v in enumerate(losses_gen)}) - scalar_dict.update({f"loss/d_r/{i}": v for i, v in enumerate(losses_disc_r)}) - scalar_dict.update({f"loss/d_g/{i}": v for i, v in enumerate(losses_disc_g)}) - - with torch.no_grad(): - o, *_ = net_g.module.infer(*reference) if hasattr(net_g, "module") else net_g.infer(*reference) - - summarize(writer=writer, global_step=global_step, images={"slice/mel_org": 
plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy())}, scalars=scalar_dict, audios={f"gen/audio_{global_step:07d}": o[0, :, :]}, audio_sample_rate=config.data.sample_rate) - - global_step += 1 - pbar.update(1) - - def check_overtraining(smoothed_loss_history, threshold, epsilon=0.004): - if len(smoothed_loss_history) < threshold + 1: return False - for i in range(-threshold, -1): - if smoothed_loss_history[i + 1] > smoothed_loss_history[i]: return True - if abs(smoothed_loss_history[i + 1] - smoothed_loss_history[i]) >= epsilon: return False - return True - - def update_exponential_moving_average(smoothed_loss_history, new_value, smoothing=0.987): - smoothed_value = new_value if not smoothed_loss_history else (smoothing * smoothed_loss_history[-1] + (1 - smoothing) * new_value) - smoothed_loss_history.append(smoothed_value) - return smoothed_value - - def save_to_json(file_path, loss_disc_history, smoothed_loss_disc_history, loss_gen_history, smoothed_loss_gen_history): - with open(file_path, "w") as f: - json.dump({"loss_disc_history": loss_disc_history, "smoothed_loss_disc_history": smoothed_loss_disc_history, "loss_gen_history": loss_gen_history, "smoothed_loss_gen_history": smoothed_loss_gen_history}, f) - - model_add, model_del = [], [] - done = False - - if rank == 0: - if epoch % save_every_epoch == False: - checkpoint_suffix = f"{'latest' if save_only_latest else global_step}.pth" - save_checkpoint(net_g, optim_g, config.train.learning_rate, epoch, os.path.join(experiment_dir, "G_" + checkpoint_suffix)) - save_checkpoint(net_d, optim_d, config.train.learning_rate, epoch, os.path.join(experiment_dir, "D_" + checkpoint_suffix)) - if custom_save_every_weights: model_add.append(os.path.join("assets", "weights", f"{model_name}_{epoch}e_{global_step}s.pth")) - - if overtraining_detector and epoch > 1: - current_loss_disc = float(loss_disc) - loss_disc_history.append(current_loss_disc) - smoothed_value_disc = update_exponential_moving_average(smoothed_loss_disc_history, current_loss_disc) - is_overtraining_disc = check_overtraining(smoothed_loss_disc_history, overtraining_threshold * 2) - - if is_overtraining_disc: consecutive_increases_disc += 1 - else: consecutive_increases_disc = 0 - - current_loss_gen = float(lowest_value["value"]) - loss_gen_history.append(current_loss_gen) - smoothed_value_gen = update_exponential_moving_average(smoothed_loss_gen_history, current_loss_gen) - is_overtraining_gen = check_overtraining(smoothed_loss_gen_history, overtraining_threshold, 0.01) - - if is_overtraining_gen: consecutive_increases_gen += 1 - else: consecutive_increases_gen = 0 - - if epoch % save_every_epoch == 0: save_to_json(training_file_path, loss_disc_history, smoothed_loss_disc_history, loss_gen_history, smoothed_loss_gen_history) - - if (is_overtraining_gen and consecutive_increases_gen == overtraining_threshold or is_overtraining_disc and consecutive_increases_disc == (overtraining_threshold * 2)): - logger.info(translations["overtraining_find"].format(epoch=epoch, smoothed_value_gen=f"{smoothed_value_gen:.3f}", smoothed_value_disc=f"{smoothed_value_disc:.3f}")) - done = True - else: - logger.info(translations["best_epoch"].format(epoch=epoch, smoothed_value_gen=f"{smoothed_value_gen:.3f}", smoothed_value_disc=f"{smoothed_value_disc:.3f}")) - for file in glob.glob(os.path.join("assets", "weights", f"{model_name}_*e_*s_best_epoch.pth")): - 
model_del.append(file) - - model_add.append(os.path.join("assets", "weights", f"{model_name}_{epoch}e_{global_step}s_best_epoch.pth")) - - if epoch >= custom_total_epoch: - logger.info(translations["success_training"].format(epoch=epoch, global_step=global_step, loss_gen_all=round(loss_gen_all.item(), 3))) - logger.info(translations["training_info"].format(lowest_value_rounded=round(float(lowest_value["value"]), 3), lowest_value_epoch=lowest_value['epoch'], lowest_value_step=lowest_value['step'])) - - pid_file_path = os.path.join(experiment_dir, "config.json") - with open(pid_file_path, "r") as pid_file: - pid_data = json.load(pid_file) - - with open(pid_file_path, "w") as pid_file: - pid_data.pop("process_pids", None) - json.dump(pid_data, pid_file, indent=4) - - model_add.append(os.path.join("assets", "weights", f"{model_name}_{epoch}e_{global_step}s.pth")) - done = True - - for m in model_del: - os.remove(m) - - if model_add: - ckpt = (net_g.module.state_dict() if hasattr(net_g, "module") else net_g.state_dict()) - for m in model_add: - extract_model(ckpt=ckpt, sr=sample_rate, pitch_guidance=pitch_guidance == True, name=model_name, model_path=m, epoch=epoch, step=global_step, version=version, hps=hps, model_author=model_author, vocoder=vocoder) - - lowest_value_rounded = round(float(lowest_value["value"]), 3) - - if epoch > 1 and overtraining_detector: logger.info(translations["model_training_info"].format(model_name=model_name, epoch=epoch, global_step=global_step, epoch_recorder=epoch_recorder.record(), lowest_value_rounded=lowest_value_rounded, lowest_value_epoch=lowest_value['epoch'], lowest_value_step=lowest_value['step'], remaining_epochs_gen=(overtraining_threshold - consecutive_increases_gen), remaining_epochs_disc=((overtraining_threshold * 2) - consecutive_increases_disc), smoothed_value_gen=f"{smoothed_value_gen:.3f}", smoothed_value_disc=f"{smoothed_value_disc:.3f}")) - elif epoch > 1 and overtraining_detector == False: logger.info(translations["model_training_info_2"].format(model_name=model_name, epoch=epoch, global_step=global_step, epoch_recorder=epoch_recorder.record(), lowest_value_rounded=lowest_value_rounded, lowest_value_epoch=lowest_value['epoch'], lowest_value_step=lowest_value['step'])) - else: logger.info(translations["model_training_info_3"].format(model_name=model_name, epoch=epoch, global_step=global_step, epoch_recorder=epoch_recorder.record())) - - last_loss_gen_all = loss_gen_all - if done: os._exit(0) - -if __name__ == "__main__": - torch.multiprocessing.set_start_method("spawn") - try: - main() - except Exception as e: - logger.error(f"{translations['training_error']} {e}") - import traceback - logger.debug(traceback.format_exc()) \ No newline at end of file diff --git a/main/library/algorithm/commons.py b/main/library/algorithm/commons.py deleted file mode 100644 index 805a1d592aaaa905bc3731144759090cdfb1fd3e..0000000000000000000000000000000000000000 --- a/main/library/algorithm/commons.py +++ /dev/null @@ -1,60 +0,0 @@ -import torch - - - -def init_weights(m, mean=0.0, std=0.01): - if m.__class__.__name__.find("Conv") != -1: m.weight.data.normal_(mean, std) - -def get_padding(kernel_size, dilation=1): - return int((kernel_size * dilation - dilation) / 2) - -def convert_pad_shape(pad_shape): - return [item for sublist in pad_shape[::-1] for item in sublist] - -def slice_segments(x, ids_str, segment_size = 4, dim = 2): - if dim == 2: ret = torch.zeros_like(x[:, :segment_size]) - elif dim == 3: ret = torch.zeros_like(x[:, :, :segment_size]) - - for i in 
range(x.size(0)): - idx_str = ids_str[i].item() - idx_end = idx_str + segment_size - - if dim == 2: ret[i] = x[i, idx_str:idx_end] - else: ret[i] = x[i, :, idx_str:idx_end] - - return ret - -def rand_slice_segments(x, x_lengths=None, segment_size=4): - b, _, t = x.size() - if x_lengths is None: x_lengths = t - - ids_str = (torch.rand([b]).to(device=x.device) * (x_lengths - segment_size + 1)).to(dtype=torch.long) - - return slice_segments(x, ids_str, segment_size, dim=3), ids_str - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - - in_act = input_a + input_b - - return torch.tanh(in_act[:, :n_channels_int, :]) * torch.sigmoid(in_act[:, n_channels_int:, :]) - -def sequence_mask(length, max_length = None): - if max_length is None: max_length = length.max() - - return torch.arange(max_length, dtype=length.dtype, device=length.device).unsqueeze(0) < length.unsqueeze(1) - -def clip_grad_value(parameters, clip_value, norm_type=2): - if isinstance(parameters, torch.Tensor): parameters = [parameters] - norm_type = float(norm_type) - - if clip_value is not None: clip_value = float(clip_value) - total_norm = 0 - - for p in list(filter(lambda p: p.grad is not None, parameters)): - total_norm += (p.grad.data.norm(norm_type)).item() ** norm_type - - if clip_value is not None: p.grad.data.clamp_(min=-clip_value, max=clip_value) - - return total_norm ** (1.0 / norm_type) \ No newline at end of file diff --git a/main/library/algorithm/modules.py b/main/library/algorithm/modules.py deleted file mode 100644 index 59772910a9e99c67feb913e7d59a85dc02809d49..0000000000000000000000000000000000000000 --- a/main/library/algorithm/modules.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -import sys -import torch - -sys.path.append(os.getcwd()) - -from .commons import fused_add_tanh_sigmoid_multiply - - -class WaveNet(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WaveNet, self).__init__() - assert kernel_size % 2 == 1 - self.hidden_channels = hidden_channels - self.kernel_size = (kernel_size,) - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - self.p_dropout = p_dropout - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = torch.nn.Dropout(p_dropout) - - if gin_channels != 0: self.cond_layer = torch.nn.utils.parametrizations.weight_norm(torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1), name="weight") - - dilations = [dilation_rate**i for i in range(n_layers)] - paddings = [(kernel_size * d - d) // 2 for d in dilations] - - for i in range(n_layers): - in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilations[i], padding=paddings[i]) - in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight") - self.in_layers.append(in_layer) - - res_skip_channels = (hidden_channels if i == n_layers - 1 else 2 * hidden_channels) - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - - res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight") - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask, g=None, **kwargs): - output = torch.zeros_like(x) - n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - if g is not None: g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - - if g is not None: 
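# cond_layer projected g to 2 * hidden_channels * n_layers channels in one
# shot; each WaveNet layer slices out its own 2 * hidden_channels block
# (filter + gate halves for the fused tanh/sigmoid activation) below.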
- cond_offset = i * 2 * self.hidden_channels - g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - else: g_l = torch.zeros_like(x_in) - - res_skip_acts = self.res_skip_layers[i](self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor))) - - if i < self.n_layers - 1: - x = (x + (res_skip_acts[:, : self.hidden_channels, :])) * x_mask - output = output + res_skip_acts[:, self.hidden_channels :, :] - else: output = output + res_skip_acts - - return output * x_mask - - def remove_weight_norm(self): - if self.gin_channels != 0: torch.nn.utils.remove_weight_norm(self.cond_layer) - - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) \ No newline at end of file diff --git a/main/library/algorithm/mrf_hifigan.py b/main/library/algorithm/mrf_hifigan.py deleted file mode 100644 index 0217e44d03e2277b38d7d51acd1c462d5a272a8d..0000000000000000000000000000000000000000 --- a/main/library/algorithm/mrf_hifigan.py +++ /dev/null @@ -1,170 +0,0 @@ -import torch - -import numpy as np -import torch.nn.functional as F - -from torch.nn.utils import remove_weight_norm -from torch.utils.checkpoint import checkpoint -from torch.nn.utils.parametrizations import weight_norm - -LRELU_SLOPE = 0.1 - -class MRFLayer(torch.nn.Module): - def __init__(self, channels, kernel_size, dilation): - super().__init__() - self.conv1 = weight_norm(torch.nn.Conv1d(channels, channels, kernel_size, padding=(kernel_size * dilation - dilation) // 2, dilation=dilation)) - self.conv2 = weight_norm(torch.nn.Conv1d(channels, channels, kernel_size, padding=kernel_size // 2, dilation=1)) - - def forward(self, x): - return x + self.conv2(F.leaky_relu(self.conv1(F.leaky_relu(x, LRELU_SLOPE)), LRELU_SLOPE)) - - def remove_weight_norm(self): - remove_weight_norm(self.conv1) - remove_weight_norm(self.conv2) - -class MRFBlock(torch.nn.Module): - def __init__(self, channels, kernel_size, dilations): - super().__init__() - self.layers = torch.nn.ModuleList() - - for dilation in dilations: - self.layers.append(MRFLayer(channels, kernel_size, dilation)) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - - return x - - def remove_weight_norm(self): - for layer in self.layers: - layer.remove_weight_norm() - -class SineGenerator(torch.nn.Module): - def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0): - super(SineGenerator, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - return torch.ones_like(f0) * (f0 > self.voiced_threshold) - - def _f02sine(self, f0_values): - rad_values = (f0_values / self.sampling_rate) % 1 - rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device) - - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - - tmp_over_one = torch.cumsum(rad_values, 1) % 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - - return torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) - - def forward(self, f0): - with torch.no_grad(): - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) - f0_buf[:, :, 0] = f0[:, :, 0] - - for idx in np.arange(self.harmonic_num): - 
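# Harmonic stacking: channel 0 already holds the fundamental, so channel
# idx + 1 carries (idx + 2) * f0, giving the fundamental plus
# harmonic_num integer overtones.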
f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) - - sine_waves = self._f02sine(f0_buf) * self.sine_amp - uv = self._f02uv(f0) - - sine_waves = sine_waves * uv + ((uv * self.noise_std + (1 - uv) * self.sine_amp / 3) * torch.randn_like(sine_waves)) - - return sine_waves - -class SourceModuleHnNSF(torch.nn.Module): - def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshold=0): - super(SourceModuleHnNSF, self).__init__() - self.sine_amp = sine_amp - self.noise_std = add_noise_std - - self.l_sin_gen = SineGenerator(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold) - self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) - self.l_tanh = torch.nn.Tanh() - - def forward(self, x): - return self.l_tanh(self.l_linear(self.l_sin_gen(x).to(dtype=self.l_linear.weight.dtype))) - -class HiFiGANMRFGenerator(torch.nn.Module): - def __init__(self, in_channel, upsample_initial_channel, upsample_rates, upsample_kernel_sizes, resblock_kernel_sizes, resblock_dilations, gin_channels, sample_rate, harmonic_num, checkpointing=False): - super().__init__() - self.num_kernels = len(resblock_kernel_sizes) - - self.upp = int(np.prod(upsample_rates)) - self.f0_upsample = torch.nn.Upsample(scale_factor=self.upp) - self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num) - - self.conv_pre = weight_norm(torch.nn.Conv1d(in_channel, upsample_initial_channel, kernel_size=7, stride=1, padding=3)) - self.checkpointing = checkpointing - - self.upsamples = torch.nn.ModuleList() - self.upsampler = torch.nn.ModuleList() - self.noise_convs = torch.nn.ModuleList() - - stride_f0s = [upsample_rates[1] * upsample_rates[2] * upsample_rates[3], upsample_rates[2] * upsample_rates[3], upsample_rates[3], 1] - - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - if self.upp == 441: - self.upsampler.append(torch.nn.Upsample(scale_factor=u, mode="linear")) - self.upsamples.append(weight_norm(torch.nn.Conv1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), kernel_size=1))) - self.noise_convs.append(torch.nn.Conv1d(in_channels=1, out_channels=upsample_initial_channel // (2 ** (i + 1)), kernel_size = 1)) - else: - self.upsampler.append(torch.nn.Identity()) - self.upsamples.append(weight_norm(torch.nn.ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), kernel_size=k, stride=u, padding=(k - u) // 2))) - self.noise_convs.append(torch.nn.Conv1d(1, upsample_initial_channel // (2 ** (i + 1)), kernel_size=stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1, stride=stride_f0s[i], padding=stride_f0s[i] // 2)) - - self.mrfs = torch.nn.ModuleList() - for i in range(len(self.upsamples)): - channel = upsample_initial_channel // (2 ** (i + 1)) - self.mrfs.append(torch.nn.ModuleList([MRFBlock(channel, kernel_size=k, dilations=d) for k, d in zip(resblock_kernel_sizes, resblock_dilations)])) - - self.conv_post = weight_norm(torch.nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3)) - if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, f0, g = None): - har_source = self.m_source(self.f0_upsample(f0[:, None, :]).transpose(-1, -2)).transpose(-1, -2) - x = self.conv_pre(x) - if g is not None: x += self.cond(g) - - for ups, upr, mrf, noise_conv in zip(self.upsamples, self.upsampler, self.mrfs, self.noise_convs): - x = F.leaky_relu(x, LRELU_SLOPE) - - if self.training and self.checkpointing: - if self.upp == 441: x = upr(x) - x = 
checkpoint(ups, x, use_reentrant=False) - else: - if self.upp == 441: x = upr(x) - x = ups(x) - - h = noise_conv(har_source) - if self.upp == 441: h = torch.nn.functional.interpolate(h, size=x.shape[-1], mode="linear") - x += h - - def mrf_sum(x, layers): - return sum(layer(x) for layer in layers) / self.num_kernels - - x = checkpoint(mrf_sum, x, mrf, use_reentrant=False) if self.training and self.checkpointing else mrf_sum(x, mrf) - - return torch.tanh(self.conv_post(F.leaky_relu(x))) - - def remove_weight_norm(self): - remove_weight_norm(self.conv_pre) - - for up in self.upsamples: - remove_weight_norm(up) - - for mrf in self.mrfs: - mrf.remove_weight_norm() - - remove_weight_norm(self.conv_post) \ No newline at end of file diff --git a/main/library/algorithm/onnx_export.py b/main/library/algorithm/onnx_export.py deleted file mode 100644 index 9c6c3fd5ed1b811ffd0c5aff5d32a26c4d5d8554..0000000000000000000000000000000000000000 --- a/main/library/algorithm/onnx_export.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -import sys -import onnx -import json -import torch -import onnxsim -import warnings - -sys.path.append(os.getcwd()) - -from main.library.algorithm.synthesizers import SynthesizerONNX - -warnings.filterwarnings("ignore") - -def onnx_exporter(input_path, output_path, device="cpu"): - cpt = torch.load(input_path, map_location="cpu") - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] - - model_name, model_author, epochs, steps, version, f0, model_hash, vocoder, creation_date = cpt.get("model_name", None), cpt.get("author", None), cpt.get("epoch", None), cpt.get("step", None), cpt.get("version", "v1"), cpt.get("f0", 1), cpt.get("model_hash", None), cpt.get("vocoder", "Default"), cpt.get("creation_date", None) - text_enc_hidden_dim = 768 if version == "v2" else 256 - tgt_sr = cpt["config"][-1] - - net_g = SynthesizerONNX(*cpt["config"], use_f0=f0, text_enc_hidden_dim=text_enc_hidden_dim, vocoder=vocoder, checkpointing=False) - net_g.load_state_dict(cpt["weight"], strict=False) - - if f0: - args = (torch.rand(1, 200, text_enc_hidden_dim).to(device), torch.tensor([200]).long().to(device), torch.LongTensor([0]).to(device), torch.rand(1, 192, 200).to(device), torch.randint(size=(1, 200), low=5, high=255).to(device), torch.rand(1, 200).to(device)) - input_names = ["phone", "phone_lengths", "ds", "rnd", "pitch", "pitchf"] - dynamic_axes = {"phone": [1], "rnd": [2], "pitch": [1], "pitchf": [1]} - else: - args = (torch.rand(1, 200, text_enc_hidden_dim).to(device), torch.tensor([200]).long().to(device), torch.LongTensor([0]).to(device), torch.rand(1, 192, 200).to(device)) - input_names = ["phone", "phone_lengths", "ds", "rnd"] - dynamic_axes = {"phone": [1], "rnd": [2]} - - torch.onnx.export(net_g, args, output_path, do_constant_folding=False, opset_version=17, verbose=False, input_names=input_names, output_names=["audio"], dynamic_axes=dynamic_axes) - model, _ = onnxsim.simplify(output_path) - model.metadata_props.append(onnx.StringStringEntryProto(key="model_info", value=json.dumps({"model_name": model_name, "author": model_author, "epoch": epochs, "step": steps, "version": version, "sr": tgt_sr, "f0": f0, "model_hash": model_hash, "creation_date": creation_date, "vocoder": vocoder, "text_enc_hidden_dim": text_enc_hidden_dim}))) - - onnx.save(model, output_path) - return output_path \ No newline at end of file diff --git a/main/library/algorithm/refinegan.py b/main/library/algorithm/refinegan.py deleted file mode 100644 index 
32ecc2cf98f9966c1342b07f06cf861a27e96c66..0000000000000000000000000000000000000000 --- a/main/library/algorithm/refinegan.py +++ /dev/null @@ -1,190 +0,0 @@ -import os -import sys -import torch - -import numpy as np -import torch.nn as nn -import torch.nn.functional as F - -from torch.utils.checkpoint import checkpoint -from torch.nn.utils.parametrizations import weight_norm -from torch.nn.utils.parametrize import remove_parametrizations - -sys.path.append(os.getcwd()) - -from .commons import get_padding - - -class ResBlock(nn.Module): - def __init__(self, *, in_channels, out_channels, kernel_size = 7, dilation = (1, 3, 5), leaky_relu_slope = 0.2): - super(ResBlock, self).__init__() - self.leaky_relu_slope = leaky_relu_slope - self.in_channels = in_channels - self.out_channels = out_channels - self.convs1 = nn.ModuleList([weight_norm(nn.Conv1d(in_channels=in_channels if idx == 0 else out_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1, dilation=d, padding=get_padding(kernel_size, d))) for idx, d in enumerate(dilation)]) - self.convs1.apply(self.init_weights) - self.convs2 = nn.ModuleList([weight_norm(nn.Conv1d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1, dilation=d, padding=get_padding(kernel_size, d))) for _, d in enumerate(dilation)]) - self.convs2.apply(self.init_weights) - - def forward(self, x): - for idx, (c1, c2) in enumerate(zip(self.convs1, self.convs2)): - xt = c2(F.leaky_relu_(c1(F.leaky_relu(x, self.leaky_relu_slope)), self.leaky_relu_slope)) - x = (xt + x) if idx != 0 or self.in_channels == self.out_channels else xt - - return x - - def remove_parametrizations(self): - for c1, c2 in zip(self.convs1, self.convs2): - remove_parametrizations(c1, "weight") - remove_parametrizations(c2, "weight") - - def init_weights(self, m): - if type(m) == nn.Conv1d: - m.weight.data.normal_(0, 0.01) - m.bias.data.fill_(0.0) - -class AdaIN(nn.Module): - def __init__(self, *, channels, leaky_relu_slope = 0.2): - super().__init__() - self.weight = nn.Parameter(torch.ones(channels)) - self.activation = nn.LeakyReLU(leaky_relu_slope, inplace=True) - - def forward(self, x): - return self.activation(x + (torch.randn_like(x) * self.weight[None, :, None])) - -class ParallelResBlock(nn.Module): - def __init__(self, *, in_channels, out_channels, kernel_sizes = (3, 7, 11), dilation = (1, 3, 5), leaky_relu_slope = 0.2): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.input_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=7, stride=1, padding=3) - self.blocks = nn.ModuleList([nn.Sequential(AdaIN(channels=out_channels), ResBlock(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, dilation=dilation, leaky_relu_slope=leaky_relu_slope), AdaIN(channels=out_channels)) for kernel_size in kernel_sizes]) - - def forward(self, x): - x = self.input_conv(x) - return torch.mean(torch.stack([block(x) for block in self.blocks]), dim=0) - - def remove_parametrizations(self): - for block in self.blocks: - block[1].remove_parametrizations() - -class SineGenerator(nn.Module): - def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0): - super(SineGenerator, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - self.merge = nn.Sequential(nn.Linear(self.dim, 1, bias=False), 
nn.Tanh()) - - def _f02uv(self, f0): - return torch.ones_like(f0) * (f0 > self.voiced_threshold) - - def _f02sine(self, f0_values): - rad_values = (f0_values / self.sampling_rate) % 1 - rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device) - - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - - tmp_over_one = torch.cumsum(rad_values, 1) % 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - - return torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) - - def forward(self, f0): - with torch.no_grad(): - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) - f0_buf[:, :, 0] = f0[:, :, 0] - - for idx in np.arange(self.harmonic_num): - f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) - - sine_waves = self._f02sine(f0_buf) * self.sine_amp - uv = self._f02uv(f0) - sine_waves = sine_waves * uv + ((uv * self.noise_std + (1 - uv) * self.sine_amp / 3) * torch.randn_like(sine_waves)) - - return self.merge(sine_waves - sine_waves.mean(dim=1, keepdim=True)) - -class RefineGANGenerator(nn.Module): - def __init__(self, *, sample_rate = 44100, upsample_rates = (8, 8, 2, 2), leaky_relu_slope = 0.2, num_mels = 128, gin_channels = 256, checkpointing = False, upsample_initial_channel = 512): - super().__init__() - self.upsample_rates = upsample_rates - self.checkpointing = checkpointing - self.leaky_relu_slope = leaky_relu_slope - self.upp = int(np.prod(upsample_rates)) - assert self.upp == sample_rate // 100 - self.m_source = SineGenerator(sample_rate) - self.pre_conv = weight_norm(nn.Conv1d(in_channels=1, out_channels=upsample_initial_channel // 2, kernel_size=7, stride=1, padding=3, bias=False)) - channels = upsample_initial_channel - self.downsample_blocks = nn.ModuleList([]) - - stride_f0s = [upsample_rates[1] * upsample_rates[2] * upsample_rates[3], upsample_rates[2] * upsample_rates[3], upsample_rates[3], 1] - - for i, _ in enumerate(upsample_rates): - if self.upp == 441: self.downsample_blocks.append(nn.Conv1d(in_channels=1, out_channels=channels // 2 ** (i + 2), kernel_size = 1)) - else: self.downsample_blocks.append(nn.Conv1d(in_channels=1, out_channels=channels // 2 ** (i + 2), kernel_size=stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1, stride=stride_f0s[i], padding=stride_f0s[i] // 2)) - - self.mel_conv = weight_norm(nn.Conv1d(in_channels=num_mels, out_channels=channels // 2, kernel_size=7, stride=1, padding=3)) - if gin_channels != 0: self.cond = nn.Conv1d(256, channels // 2, 1) - - self.upsample_blocks = nn.ModuleList([]) - self.upsample_conv_blocks = nn.ModuleList([]) - self.filters = nn.ModuleList([]) - - for rate in upsample_rates: - new_channels = channels // 2 - self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear")) - - low_pass = nn.Conv1d(channels, channels, kernel_size=15, padding=7, groups=channels, bias=False) - low_pass.weight.data.fill_(1.0 / 15) - self.filters.append(low_pass) - - self.upsample_conv_blocks.append(ParallelResBlock(in_channels=channels + channels // 4, out_channels=new_channels, kernel_sizes=(3, 7, 11), dilation=(1, 3, 5), leaky_relu_slope=leaky_relu_slope)) - channels = new_channels - - self.conv_post = weight_norm(nn.Conv1d(in_channels=channels, out_channels=1, kernel_size=7, stride=1, padding=3)) - - def forward(self, mel, f0, g = None): - f0 = F.interpolate(f0.unsqueeze(1), size=mel.shape[-1] * self.upp, mode="linear") - 
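# f0 has just been stretched from frame rate to waveform rate (factor
# self.upp); the sine generator below turns it into a single-channel
# harmonic excitation that is re-injected at every upsampling stage.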
har_source = self.m_source(f0.transpose(1, 2)).transpose(1, 2) - x = F.interpolate(self.pre_conv(har_source), size=mel.shape[-1], mode="linear") - - mel = self.mel_conv(mel) - if g is not None: mel += self.cond(g) - - x = torch.cat([mel, x], dim=1) - - for ups, res, down, flt in zip(self.upsample_blocks, self.upsample_conv_blocks, self.downsample_blocks, self.filters): - x = F.leaky_relu_(x, self.leaky_relu_slope) - - if self.training and self.checkpointing: - x = checkpoint(flt, checkpoint(ups, x, use_reentrant=False), use_reentrant=False) - h = down(har_source) - - if self.upp == 441: h = F.interpolate(h, size=x.shape[-1], mode="linear") - x = checkpoint(res, torch.cat([x, h], dim=1), use_reentrant=False) - else: - x = flt(ups(x)) - h = down(har_source) - - if self.upp == 441: h = F.interpolate(h, size=x.shape[-1], mode="linear") - x = res(torch.cat([x, h], dim=1)) - - return torch.tanh_(self.conv_post(F.leaky_relu_(x, self.leaky_relu_slope))) - - def remove_parametrizations(self): - remove_parametrizations(self.pre_conv, "weight") - remove_parametrizations(self.mel_conv, "weight") - remove_parametrizations(self.conv_post, "weight") - - # downsample_blocks hold plain Conv1d layers without weight_norm, so there is nothing to remove there - for block in self.upsample_conv_blocks: - block.remove_parametrizations() \ No newline at end of file diff --git a/main/library/algorithm/residuals.py b/main/library/algorithm/residuals.py deleted file mode 100644 index 2188fc7e39aeffbd1624fe305e6d6edee968651a..0000000000000000000000000000000000000000 --- a/main/library/algorithm/residuals.py +++ /dev/null @@ -1,140 +0,0 @@ -import os -import sys -import torch - -from torch.nn.utils import remove_weight_norm -from torch.nn.utils.parametrizations import weight_norm - -sys.path.append(os.getcwd()) - -from .modules import WaveNet -from .commons import get_padding, init_weights - - -LRELU_SLOPE = 0.1 - -def create_conv1d_layer(channels, kernel_size, dilation): - return weight_norm(torch.nn.Conv1d(channels, channels, kernel_size, 1, dilation=dilation, padding=get_padding(kernel_size, dilation))) - -def apply_mask(tensor, mask): - return tensor * mask if mask is not None else tensor - -class ResBlockBase(torch.nn.Module): - def __init__(self, channels, kernel_size, dilations): - super(ResBlockBase, self).__init__() - - self.convs1 = torch.nn.ModuleList([create_conv1d_layer(channels, kernel_size, d) for d in dilations]) - self.convs1.apply(init_weights) - - self.convs2 = torch.nn.ModuleList([create_conv1d_layer(channels, kernel_size, 1) for _ in dilations]) - self.convs2.apply(init_weights) - - def forward(self, x, x_mask=None): - for c1, c2 in zip(self.convs1, self.convs2): - x = c2(apply_mask(torch.nn.functional.leaky_relu(c1(apply_mask(torch.nn.functional.leaky_relu(x, LRELU_SLOPE), x_mask)), LRELU_SLOPE), x_mask)) + x - - return apply_mask(x, x_mask) - - def remove_weight_norm(self): - for conv in self.convs1 + self.convs2: - remove_weight_norm(conv) - -class ResBlock(ResBlockBase): - def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock, self).__init__(channels, kernel_size, dilation) - -class Log(torch.nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask - return y, torch.sum(-y, [1, 2]) - else: return torch.exp(x) * x_mask - -class Flip(torch.nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - - if not reverse: return x, torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - else: return x - 
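# --- Illustrative sketch (not in the original file): Log and Flip above are
# invertible by construction, which is what lets the flow run forward during
# training and backward at inference. A minimal round-trip check, assuming a
# dummy all-ones mask:
#
#   import torch
#   log, flip = Log(), Flip()
#   x = torch.rand(2, 4, 8) + 0.1      # strictly positive, so log() is safe
#   mask = torch.ones(2, 1, 8)         # dummy x_mask
#   y, _ = log(x, mask)                # forward: y = log(clamp(x)) * mask
#   assert torch.allclose(log(y, mask, reverse=True), x, atol=1e-5)
#   z, _ = flip(x, mask)               # forward: reverse channel order
#   assert torch.equal(flip(z, mask, reverse=True), x)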
-class ElementwiseAffine(torch.nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = torch.nn.Parameter(torch.zeros(channels, 1)) - self.logs = torch.nn.Parameter(torch.zeros(channels, 1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: return ((self.m + torch.exp(self.logs) * x) * x_mask), torch.sum(self.logs * x_mask, [1, 2]) - else: return (x - self.m) * torch.exp(-self.logs) * x_mask - -class ResidualCouplingBlock(torch.nn.Module): - def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0): - super(ResidualCouplingBlock, self).__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - self.flows = torch.nn.ModuleList() - - for _ in range(n_flows): - self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) - self.flows.append(Flip()) - - def forward(self, x, x_mask, g = None, reverse = False): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for flow in reversed(self.flows): - x = flow.forward(x, x_mask, g=g, reverse=reverse) - - return x - - def remove_weight_norm(self): - for i in range(self.n_flows): - self.flows[i * 2].remove_weight_norm() - - def __prepare_scriptable__(self): - for i in range(self.n_flows): - for hook in self.flows[i * 2]._forward_pre_hooks.values(): - if (hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" and hook.__class__.__name__ == "WeightNorm"): torch.nn.utils.remove_weight_norm(self.flows[i * 2]) - - return self - -class ResidualCouplingLayer(torch.nn.Module): - def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False): - assert channels % 2 == 0, "Channels/2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WaveNet(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) - self.post = torch.nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - stats = self.post(self.enc((self.pre(x0) * x_mask), x_mask, g=g)) * x_mask - - if not self.mean_only: m, logs = torch.split(stats, [self.half_channels] * 2, 1) - else: - m = stats - logs = torch.zeros_like(m) - - if not reverse: return torch.cat([x0, (m + x1 * torch.exp(logs) * x_mask)], 1), torch.sum(logs, [1, 2]) - else: return torch.cat([x0, ((x1 - m) * torch.exp(-logs) * x_mask)], 1) - - def remove_weight_norm(self): - self.enc.remove_weight_norm() \ No newline at end of file diff --git a/main/library/algorithm/separator.py b/main/library/algorithm/separator.py deleted file mode 100644 index cd2300525420db96931707bc0c33561841c53d12..0000000000000000000000000000000000000000 --- a/main/library/algorithm/separator.py +++ /dev/null @@ -1,330 +0,0 @@ -import os -import sys 
-import time -import yaml -import torch -import codecs -import hashlib -import logging -import platform -import warnings -import requests -import onnxruntime - -from importlib import metadata, import_module - -now_dir = os.getcwd() -sys.path.append(now_dir) - -from main.configs.config import Config -translations = Config().translations - -class Separator: - def __init__(self, logger=logging.getLogger(__name__), log_level=logging.INFO, log_formatter=None, model_file_dir="assets/models/uvr5", output_dir=None, output_format="wav", output_bitrate=None, normalization_threshold=0.9, output_single_stem=None, invert_using_spec=False, sample_rate=44100, mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False}, demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True}): - self.logger = logger - self.log_level = log_level - self.log_formatter = log_formatter - self.log_handler = logging.StreamHandler() - - if self.log_formatter is None: self.log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(module)s - %(message)s") - self.log_handler.setFormatter(self.log_formatter) - - if not self.logger.hasHandlers(): self.logger.addHandler(self.log_handler) - if log_level > logging.DEBUG: warnings.filterwarnings("ignore") - - self.logger.info(translations["separator_info"].format(output_dir=output_dir, output_format=output_format)) - self.model_file_dir = model_file_dir - - if output_dir is None: - output_dir = now_dir - self.logger.info(translations["output_dir_is_none"]) - - self.output_dir = output_dir - - os.makedirs(self.model_file_dir, exist_ok=True) - os.makedirs(self.output_dir, exist_ok=True) - - self.output_format = output_format - self.output_bitrate = output_bitrate - - if self.output_format is None: self.output_format = "wav" - self.normalization_threshold = normalization_threshold - if normalization_threshold <= 0 or normalization_threshold > 1: raise ValueError(translations[">0or=1"]) - - self.output_single_stem = output_single_stem - if output_single_stem is not None: self.logger.debug(translations["output_single"].format(output_single_stem=output_single_stem)) - - self.invert_using_spec = invert_using_spec - if self.invert_using_spec: self.logger.debug(translations["step2"]) - - self.sample_rate = int(sample_rate) - self.arch_specific_params = {"MDX": mdx_params, "Demucs": demucs_params} - self.torch_device = None - self.torch_device_cpu = None - self.torch_device_mps = None - self.onnx_execution_provider = None - self.model_instance = None - self.model_is_uvr_vip = False - self.model_friendly_name = None - self.setup_accelerated_inferencing_device() - - def setup_accelerated_inferencing_device(self): - system_info = self.get_system_info() - self.log_onnxruntime_packages() - self.setup_torch_device(system_info) - - def get_system_info(self): - os_name = platform.system() - os_version = platform.version() - self.logger.info(f"{translations['os']}: {os_name} {os_version}") - system_info = platform.uname() - self.logger.info(translations["platform_info"].format(system_info=system_info, node=system_info.node, release=system_info.release, machine=system_info.machine, processor=system_info.processor)) - python_version = platform.python_version() - self.logger.info(f"{translations['name_ver'].format(name='python')}: {python_version}") - pytorch_version = torch.__version__ - self.logger.info(f"{translations['name_ver'].format(name='pytorch')}: {pytorch_version}") - - return system_info - - def 
log_onnxruntime_packages(self): - onnxruntime_gpu_package = self.get_package_distribution("onnxruntime-gpu") - onnxruntime_cpu_package = self.get_package_distribution("onnxruntime") - - if onnxruntime_gpu_package is not None: self.logger.info(f"{translations['install_onnx'].format(pu='GPU')}: {onnxruntime_gpu_package.version}") - if onnxruntime_cpu_package is not None: self.logger.info(f"{translations['install_onnx'].format(pu='CPU')}: {onnxruntime_cpu_package.version}") - - def setup_torch_device(self, system_info): - hardware_acceleration_enabled = False - ort_providers = onnxruntime.get_available_providers() - self.torch_device_cpu = torch.device("cpu") - - if torch.cuda.is_available(): - self.configure_cuda(ort_providers) - hardware_acceleration_enabled = True - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and system_info.processor == "arm": - self.configure_mps(ort_providers) - hardware_acceleration_enabled = True - - if not hardware_acceleration_enabled: - self.logger.info(translations["running_in_cpu"]) - self.torch_device = self.torch_device_cpu - self.onnx_execution_provider = ["CPUExecutionProvider"] - - def configure_cuda(self, ort_providers): - self.logger.info(translations["running_in_cuda"]) - self.torch_device = torch.device("cuda") - - if "CUDAExecutionProvider" in ort_providers: - self.logger.info(translations["onnx_have"].format(have='CUDAExecutionProvider')) - self.onnx_execution_provider = ["CUDAExecutionProvider"] - else: self.logger.warning(translations["onnx_not_have"].format(have='CUDAExecutionProvider')) - - def configure_mps(self, ort_providers): - self.logger.info(translations["set_torch_mps"]) - self.torch_device_mps = torch.device("mps") - self.torch_device = self.torch_device_mps - - if "CoreMLExecutionProvider" in ort_providers: - self.logger.info(translations["onnx_have"].format(have='CoreMLExecutionProvider')) - self.onnx_execution_provider = ["CoreMLExecutionProvider"] - else: self.logger.warning(translations["onnx_not_have"].format(have='CoreMLExecutionProvider')) - - def get_package_distribution(self, package_name): - try: - return metadata.distribution(package_name) - except metadata.PackageNotFoundError: - self.logger.debug(translations["python_not_install"].format(package_name=package_name)) - return None - - def get_model_hash(self, model_path): - self.logger.debug(translations["hash"].format(model_path=model_path)) - - try: - with open(model_path, "rb") as f: - f.seek(-10000 * 1024, 2) - return hashlib.md5(f.read()).hexdigest() - except IOError as e: - self.logger.error(translations["ioerror"].format(e=e)) - return hashlib.md5(open(model_path, "rb").read()).hexdigest() - - def download_file_if_not_exists(self, url, output_path): - if os.path.isfile(output_path): - self.logger.debug(translations["cancel_download"].format(output_path=output_path)) - return - - self.logger.debug(translations["download_model"].format(url=url, output_path=output_path)) - response = requests.get(url, stream=True, timeout=300) - - if response.status_code == 200: - from tqdm import tqdm - - progress_bar = tqdm(total=int(response.headers.get("content-length", 0)), ncols=100, unit="byte") - - with open(output_path, "wb") as f: - for chunk in response.iter_content(chunk_size=8192): - progress_bar.update(len(chunk)) - f.write(chunk) - - progress_bar.close() - else: raise RuntimeError(translations["download_error"].format(url=url, status_code=response.status_code)) - - def print_uvr_vip_message(self): - if self.model_is_uvr_vip: - 
self.logger.warning(translations["vip_model"].format(model_friendly_name=self.model_friendly_name)) - self.logger.warning(translations["vip_print"]) - - def list_supported_model_files(self): - response = requests.get(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/enj/znva/wfba/hie_zbqryf.wfba", "rot13")) - response.raise_for_status() - model_downloads_list = response.json() - self.logger.debug(translations["load_download_json"]) - - return {"MDX": {**model_downloads_list["mdx_download_list"], **model_downloads_list["mdx_download_vip_list"]}, "Demucs": {key: value for key, value in model_downloads_list["demucs_download_list"].items() if key.startswith("Demucs v4")}} - - def download_model_files(self, model_filename): - model_path = os.path.join(self.model_file_dir, model_filename) - supported_model_files_grouped = self.list_supported_model_files() - - yaml_config_filename = None - self.logger.debug(translations["search_model"].format(model_filename=model_filename)) - - for model_type, model_list in supported_model_files_grouped.items(): - for model_friendly_name, model_download_list in model_list.items(): - self.model_is_uvr_vip = "VIP" in model_friendly_name - model_repo_url_prefix = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/hie5_zbqryf", "rot13") - - if isinstance(model_download_list, str) and model_download_list == model_filename: - self.logger.debug(translations["single_model"].format(model_friendly_name=model_friendly_name)) - self.model_friendly_name = model_friendly_name - - try: - self.download_file_if_not_exists(f"{model_repo_url_prefix}/MDX/{model_filename}", model_path) - except RuntimeError: - self.logger.warning(translations["not_found_model"]) - self.download_file_if_not_exists(f"{model_repo_url_prefix}/Demucs/{model_filename}", model_path) - - self.print_uvr_vip_message() - self.logger.debug(translations["single_model_path"].format(model_path=model_path)) - - return model_filename, model_type, model_friendly_name, model_path, yaml_config_filename - elif isinstance(model_download_list, dict): - this_model_matches_input_filename = False - - for file_name, file_url in model_download_list.items(): - if file_name == model_filename or file_url == model_filename: - self.logger.debug(translations["find_model"].format(model_filename=model_filename, model_friendly_name=model_friendly_name)) - this_model_matches_input_filename = True - - if this_model_matches_input_filename: - self.logger.debug(translations["find_models"].format(model_friendly_name=model_friendly_name)) - self.model_friendly_name = model_friendly_name - self.print_uvr_vip_message() - - for config_key, config_value in model_download_list.items(): - self.logger.debug(f"{translations['find_path']}: {config_key} -> {config_value}") - - if config_value.startswith("http"): self.download_file_if_not_exists(config_value, os.path.join(self.model_file_dir, config_key)) - elif config_key.endswith(".ckpt"): - try: - self.download_file_if_not_exists(f"{model_repo_url_prefix}/Demucs/{config_key}", os.path.join(self.model_file_dir, config_key)) - except RuntimeError: - self.logger.warning(translations["not_found_model_warehouse"]) - - if model_filename.endswith(".yaml"): - self.logger.warning(translations["yaml_warning"].format(model_filename=model_filename)) - self.logger.warning(translations["yaml_warning_2"].format(config_key=config_key)) - self.logger.warning(translations["yaml_warning_3"]) - - model_filename = config_key - model_path = os.path.join(self.model_file_dir, 
f"{model_filename}") - - yaml_config_filename = config_value - yaml_config_filepath = os.path.join(self.model_file_dir, yaml_config_filename) - - try: - self.download_file_if_not_exists(f"{model_repo_url_prefix}/mdx_c_configs/{yaml_config_filename}", yaml_config_filepath) - except RuntimeError: - self.logger.debug(translations["yaml_debug"]) - else: self.download_file_if_not_exists(f"{model_repo_url_prefix}/Demucs/{config_value}", os.path.join(self.model_file_dir, config_value)) - - self.logger.debug(translations["download_model_friendly"].format(model_friendly_name=model_friendly_name, model_path=model_path)) - return model_filename, model_type, model_friendly_name, model_path, yaml_config_filename - - raise ValueError(translations["not_found_model_2"].format(model_filename=model_filename)) - - def load_model_data_from_yaml(self, yaml_config_filename): - model_data_yaml_filepath = os.path.join(self.model_file_dir, yaml_config_filename) if not os.path.exists(yaml_config_filename) else yaml_config_filename - self.logger.debug(translations["load_yaml"].format(model_data_yaml_filepath=model_data_yaml_filepath)) - - model_data = yaml.load(open(model_data_yaml_filepath, encoding="utf-8"), Loader=yaml.FullLoader) - self.logger.debug(translations["load_yaml_2"].format(model_data=model_data)) - - if "roformer" in model_data_yaml_filepath: model_data["is_roformer"] = True - return model_data - - def load_model_data_using_hash(self, model_path): - self.logger.debug(translations["hash_md5"]) - model_hash = self.get_model_hash(model_path) - - self.logger.debug(translations["model_hash"].format(model_path=model_path, model_hash=model_hash)) - mdx_model_data_path = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/enj/znva/wfba/zbqry_qngn.wfba", "rot13") - self.logger.debug(translations["mdx_data"].format(mdx_model_data_path=mdx_model_data_path)) - - response = requests.get(mdx_model_data_path) - response.raise_for_status() - - mdx_model_data_object = response.json() - self.logger.debug(translations["load_mdx"]) - - if model_hash in mdx_model_data_object: model_data = mdx_model_data_object[model_hash] - else: raise ValueError(translations["model_not_support"].format(model_hash=model_hash)) - - self.logger.debug(translations["uvr_json"].format(model_hash=model_hash, model_data=model_data)) - return model_data - - def load_model(self, model_filename): - self.logger.info(translations["loading_model"].format(model_filename=model_filename)) - load_model_start_time = time.perf_counter() - model_filename, model_type, model_friendly_name, model_path, yaml_config_filename = self.download_model_files(model_filename) - self.logger.debug(translations["download_model_friendly_2"].format(model_friendly_name=model_friendly_name, model_path=model_path)) - - if model_path.lower().endswith(".yaml"): yaml_config_filename = model_path - - common_params = {"logger": self.logger, "log_level": self.log_level, "torch_device": self.torch_device, "torch_device_cpu": self.torch_device_cpu, "torch_device_mps": self.torch_device_mps, "onnx_execution_provider": self.onnx_execution_provider, "model_name": model_filename.split(".")[0], "model_path": model_path, "model_data": self.load_model_data_from_yaml(yaml_config_filename) if yaml_config_filename is not None else self.load_model_data_using_hash(model_path), "output_format": self.output_format, "output_bitrate": self.output_bitrate, "output_dir": self.output_dir, "normalization_threshold": self.normalization_threshold, "output_single_stem": self.output_single_stem, 
"invert_using_spec": self.invert_using_spec, "sample_rate": self.sample_rate} - separator_classes = {"MDX": "mdx_separator.MDXSeparator", "Demucs": "demucs_separator.DemucsSeparator"} - - if model_type not in self.arch_specific_params or model_type not in separator_classes: raise ValueError(translations["model_type_not_support"].format(model_type=model_type)) - if model_type == "Demucs" and sys.version_info < (3, 10): raise Exception(translations["demucs_not_support_python<3.10"]) - - self.logger.debug(f"{translations['import_module']} {model_type}: {separator_classes[model_type]}") - module_name, class_name = separator_classes[model_type].split(".") - separator_class = getattr(import_module(f"main.library.architectures.{module_name}"), class_name) - - self.logger.debug(f"{translations['initialization']} {model_type}: {separator_class}") - self.model_instance = separator_class(common_config=common_params, arch_config=self.arch_specific_params[model_type]) - - self.logger.debug(translations["loading_model_success"]) - self.logger.info(f"{translations['loading_model_duration']}: {time.strftime('%H:%M:%S', time.gmtime(int(time.perf_counter() - load_model_start_time)))}") - - def separate(self, audio_file_path): - self.logger.info(f"{translations['starting_separator']}: {audio_file_path}") - separate_start_time = time.perf_counter() - - self.logger.debug(translations["normalization"].format(normalization_threshold=self.normalization_threshold)) - output_files = self.model_instance.separate(audio_file_path) - - self.model_instance.clear_gpu_cache() - self.model_instance.clear_file_specific_paths() - - self.print_uvr_vip_message() - - self.logger.debug(translations["separator_success_3"]) - self.logger.info(f"{translations['separator_duration']}: {time.strftime('%H:%M:%S', time.gmtime(int(time.perf_counter() - separate_start_time)))}") - return output_files - - def download_model_and_data(self, model_filename): - self.logger.info(translations["loading_separator_model"].format(model_filename=model_filename)) - model_filename, model_type, model_friendly_name, model_path, yaml_config_filename = self.download_model_files(model_filename) - - if model_path.lower().endswith(".yaml"): yaml_config_filename = model_path - self.logger.info(translations["downloading_model"].format(model_type=model_type, model_friendly_name=model_friendly_name, model_path=model_path, model_data_dict_size=len(self.load_model_data_from_yaml(yaml_config_filename) if yaml_config_filename is not None else self.load_model_data_using_hash(model_path)))) \ No newline at end of file diff --git a/main/library/algorithm/stftpitchshift.py b/main/library/algorithm/stftpitchshift.py deleted file mode 100644 index 5d809d4432c0ff9d37b9ece00660cf0033f58b95..0000000000000000000000000000000000000000 --- a/main/library/algorithm/stftpitchshift.py +++ /dev/null @@ -1,250 +0,0 @@ -import numpy as np - -from numpy.lib.stride_tricks import sliding_window_view - -def istft(frames, framesize, hopsize): - frames = np.atleast_2d(frames) - assert frames.ndim == 2 - - analysis_window_size = np.ravel(framesize)[0] - synthesis_window_size = np.ravel(framesize)[-1] - - assert analysis_window_size >= synthesis_window_size - - A = asymmetric_analysis_window(analysis_window_size, synthesis_window_size) if analysis_window_size != synthesis_window_size else symmetric_window(analysis_window_size) - S = asymmetric_synthesis_window(analysis_window_size, synthesis_window_size) if analysis_window_size != synthesis_window_size else 
symmetric_window(synthesis_window_size) - - W = S * hopsize / np.sum(A * S) - N = frames.shape[0] * hopsize + analysis_window_size - - y = np.zeros((N), float) - - frames[:, 0] = 0 - frames[:, -1] = 0 - frames0 = sliding_window_view(y, analysis_window_size, writeable=True)[::hopsize] - frames1 = np.fft.irfft(frames, axis=-1, norm='forward') * W - - for i in range(min(len(frames0), len(frames1))): - frames0[i] += frames1[i] - - return y - -def asymmetric_synthesis_window(analysis_window_size, synthesis_window_size): - n = analysis_window_size - m = synthesis_window_size // 2 - - right = symmetric_window(2 * m) - window = np.zeros(n) - - window[n-m-m:n-m] = np.square(right[:m]) / symmetric_window(2 * n - 2 * m)[n-m-m:n-m] - window[-m:] = right[-m:] - - return window - -def asymmetric_analysis_window(analysis_window_size, synthesis_window_size): - n = analysis_window_size - m = synthesis_window_size // 2 - - window = np.zeros(n) - window[:n-m] = symmetric_window(2 * n - 2 * m)[:n-m] - window[-m:] = symmetric_window(2 * m)[-m:] - - return window - -def symmetric_window(symmetric_window_size): - n = symmetric_window_size - window = 0.5 - 0.5 * np.cos(2 * np.pi * np.arange(n) / n) - - return window - -def stft(x, framesize, hopsize): - x = np.atleast_1d(x) - assert x.ndim == 1 - - analysis_window_size = np.ravel(framesize)[0] - synthesis_window_size = np.ravel(framesize)[-1] - - assert analysis_window_size >= synthesis_window_size - - W = asymmetric_analysis_window(analysis_window_size, synthesis_window_size) if analysis_window_size != synthesis_window_size else symmetric_window(analysis_window_size) - - frames0 = sliding_window_view(x, analysis_window_size, writeable=False)[::hopsize] - frames1 = np.fft.rfft(frames0 * W, axis=-1, norm='forward') - - return frames1 - -def normalize(frames, frames0): - for i in range(len(frames)): - a = np.real(frames0[i]) - b = np.real(frames[i]) - a = np.dot(a, a) - b = np.dot(b, b) - - if b == 0: continue - frames[i] = np.real(frames[i]) * np.sqrt(a / b) + 1j * np.imag(frames[i]) - - return frames - -def lowpass(cepstrum, quefrency): - cepstrum[1:quefrency] *= 2 - cepstrum[quefrency+1:] = 0 - - return cepstrum - -def lifter(frames, quefrency): - envelopes = np.zeros(frames.shape) - - for i, frame in enumerate(frames): - with np.errstate(divide='ignore', invalid='ignore'): - spectrum = np.log10(np.real(frame)) - - envelopes[i] = np.power(10, np.real(np.fft.rfft(lowpass(np.fft.irfft(spectrum, norm='forward'), quefrency), norm='forward'))) - - return envelopes - -def resample(x, factor): - if factor == 1: return x.copy() - y = np.zeros(x.shape, dtype=x.dtype) - - n = len(x) - m = int(n * factor) - - i = np.arange(min(n, m)) - k = i * (n / m) - - j = np.trunc(k).astype(int) - k = k - j - - ok = (0 <= j) & (j < n - 1) - y[i[ok]] = k[ok] * x[j[ok] + 1] + (1 - k[ok]) * x[j[ok]] - - return y - -def shiftpitch(frames, factors, samplerate): - for i in range(len(frames)): - magnitudes = np.vstack([resample(np.real(frames[i]), factor) for factor in factors]) - frequencies = np.vstack([resample(np.imag(frames[i]), factor) * factor for factor in factors]) - - magnitudes[(frequencies <= 0) | (frequencies >= samplerate / 2)] = 0 - mask = np.argmax(magnitudes, axis=0) - - magnitudes = np.take_along_axis(magnitudes, mask[None,:], axis=0) - frequencies = np.take_along_axis(frequencies, mask[None,:], axis=0) - - frames[i] = magnitudes + 1j * frequencies - - return frames - -def wrap(x): - return (x + np.pi) % (2 * np.pi) - np.pi - -def encode(frames, framesize, hopsize, 
samplerate): - M, N = frames.shape - analysis_framesize = np.ravel(framesize)[0] - - freqinc = samplerate / analysis_framesize - phaseinc = 2 * np.pi * hopsize / analysis_framesize - - buffer = np.zeros(N) - data = np.zeros((M, N), complex) - - for m, frame in enumerate(frames): - arg = np.angle(frame) - delta = arg - buffer - buffer = arg - - i = np.arange(N) - freq = (i + (wrap(delta - i * phaseinc) / phaseinc)) * freqinc - - data[m] = np.abs(frame) + 1j * freq - - return data - -def decode(frames, framesize, hopsize, samplerate): - M, N = frames.shape - analysis_framesize = np.ravel(framesize)[0] - synthesis_framesize = np.ravel(framesize)[-1] - - freqinc = samplerate / analysis_framesize - phaseinc = 2 * np.pi * hopsize / analysis_framesize - timeshift = 2 * np.pi * synthesis_framesize * np.arange(N) / N if synthesis_framesize != analysis_framesize else 0 - - buffer = np.zeros(N) - data = np.zeros((M, N), complex) - - for m, frame in enumerate(frames): - i = np.arange(N) - delta = (i + ((np.imag(frame) - i * freqinc) / freqinc)) * phaseinc - buffer += delta - arg = buffer.copy() - arg -= timeshift - data[m] = np.real(frame) * np.exp(1j * arg) - - return data - -class StftPitchShift: - def __init__(self, framesize, hopsize, samplerate): - self.framesize = framesize - self.hopsize = hopsize - self.samplerate = samplerate - - def shiftpitch(self, input, factors = 1, quefrency = 0, distortion = 1, normalization = False): - input = np.atleast_1d(input) - dtype = input.dtype - shape = input.shape - - input = np.squeeze(input) - if input.ndim != 1: raise ValueError('input.ndim != 1') - - if np.issubdtype(dtype, np.integer): - a, b = np.iinfo(dtype).min, np.iinfo(dtype).max - input = ((input.astype(float) - a) / (b - a)) * 2 - 1 - elif not np.issubdtype(dtype, np.floating): raise TypeError('not np.issubdtype(dtype, np.floating)') - - def isnotnormal(x): - return (np.isinf(x)) | (np.isnan(x)) | (abs(x) < np.finfo(x.dtype).tiny) - - framesize = self.framesize - hopsize = self.hopsize - samplerate = self.samplerate - - factors = np.asarray(factors).flatten() - quefrency = int(quefrency * samplerate) - - frames = encode(stft(input, framesize, hopsize), framesize, hopsize, samplerate) - - if normalization: frames0 = frames.copy() - - if quefrency: - envelopes = lifter(frames, quefrency) - mask = isnotnormal(envelopes) - - frames.real /= envelopes - frames.real[mask] = 0 - - if distortion != 1: - envelopes[mask] = 0 - - for i in range(len(envelopes)): - envelopes[i] = resample(envelopes[i], distortion) - - mask = isnotnormal(envelopes) - - frames = shiftpitch(frames, factors, samplerate) - frames.real *= envelopes - frames.real[mask] = 0 - else: frames = shiftpitch(frames, factors, samplerate) - - if normalization: frames = normalize(frames, frames0) - - output = istft(decode(frames, framesize, hopsize, samplerate), framesize, hopsize) - output.resize(shape, refcheck=False) - - if np.issubdtype(dtype, np.integer): - a, b = np.iinfo(dtype).min, np.iinfo(dtype).max - output = (((output + 1) / 2) * (b - a) + a).clip(a, b).astype(dtype) - elif output.dtype != dtype: output = output.astype(dtype) - - assert output.dtype == dtype - assert output.shape == shape - - return output \ No newline at end of file diff --git a/main/library/algorithm/synthesizers.py b/main/library/algorithm/synthesizers.py deleted file mode 100644 index a080b66dade4bf942a23715404718417b1ebfd6e..0000000000000000000000000000000000000000 --- a/main/library/algorithm/synthesizers.py +++ /dev/null @@ -1,500 +0,0 @@ -import os -import sys -import math 
-import torch -import numpy as np -import torch.nn.functional as F - -from torch.nn.utils import remove_weight_norm -from torch.utils.checkpoint import checkpoint -from torch.nn.utils.parametrizations import weight_norm - -sys.path.append(os.getcwd()) - -from .modules import WaveNet -from .refinegan import RefineGANGenerator -from .mrf_hifigan import HiFiGANMRFGenerator -from .residuals import ResidualCouplingBlock, ResBlock, LRELU_SLOPE -from .commons import init_weights, slice_segments, rand_slice_segments, sequence_mask, convert_pad_shape - -class Generator(torch.nn.Module): - def __init__(self, initial_channel, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): - super(Generator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.conv_pre = torch.nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - self.ups_and_resblocks = torch.nn.ModuleList() - - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups_and_resblocks.append(weight_norm(torch.nn.ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, u, padding=(k - u) // 2))) - ch = upsample_initial_channel // (2 ** (i + 1)) - for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): - self.ups_and_resblocks.append(ResBlock(ch, k, d)) - - self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups_and_resblocks.apply(init_weights) - if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, g = None): - x = self.conv_pre(x) - if g is not None: x = x + self.cond(g) - - resblock_idx = 0 - - for _ in range(self.num_upsamples): - x = self.ups_and_resblocks[resblock_idx](F.leaky_relu(x, LRELU_SLOPE)) - resblock_idx += 1 - xs = 0 - - for _ in range(self.num_kernels): - xs += self.ups_and_resblocks[resblock_idx](x) - resblock_idx += 1 - - x = xs / self.num_kernels - - return torch.tanh(self.conv_post(F.leaky_relu(x))) - - def __prepare_scriptable__(self): - for l in self.ups_and_resblocks: - for hook in l._forward_pre_hooks.values(): - if (hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" and hook.__class__.__name__ == "WeightNorm"): torch.nn.utils.remove_weight_norm(l) - - return self - - def remove_weight_norm(self): - for l in self.ups_and_resblocks: - remove_weight_norm(l) - -class SineGen(torch.nn.Module): - def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0, flag_for_pulse=False): - super(SineGen, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - return torch.ones_like(f0) * (f0 > self.voiced_threshold) - - def _f02sine(self, f0, upp): - rad = f0 / self.sampling_rate * torch.arange(1, upp + 1, dtype=f0.dtype, device=f0.device) - rad += F.pad((torch.fmod(rad[:, :-1, -1:].float() + 0.5, 1.0) - 0.5).cumsum(dim=1).fmod(1.0).to(f0), (0, 0, 1, 0), mode='constant') - rad = rad.reshape(f0.shape[0], -1, 1) - rad *= torch.arange(1, self.dim + 1, dtype=f0.dtype, device=f0.device).reshape(1, 1, -1) - rand_ini = torch.rand(1, 1, self.dim, device=f0.device) - rand_ini[..., 0] = 0 - rad += rand_ini - - return torch.sin(2 * np.pi * rad) - - def forward(self, f0, upp): - 
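# f0 arrives at frame rate; _f02sine expands it by `upp` (samples per
# frame) and accumulates phase with the cumsum/fmod trick above so long
# signals do not lose float precision. The sine source needs no
# gradients, hence the no_grad block below.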
with torch.no_grad(): - f0 = f0.unsqueeze(-1) - sine_waves = self._f02sine(f0, upp) * self.sine_amp - uv = F.interpolate(self._f02uv(f0).transpose(2, 1), scale_factor=float(upp), mode="nearest").transpose(2, 1) - sine_waves = sine_waves * uv + ((uv * self.noise_std + (1 - uv) * self.sine_amp / 3) * torch.randn_like(sine_waves)) - - return sine_waves - -class SourceModuleHnNSF(torch.nn.Module): - def __init__(self, sample_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0): - super(SourceModuleHnNSF, self).__init__() - self.sine_amp = sine_amp - self.noise_std = add_noise_std - self.l_sin_gen = SineGen(sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod) - self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) - self.l_tanh = torch.nn.Tanh() - - def forward(self, x, upsample_factor = 1): - return self.l_tanh(self.l_linear(self.l_sin_gen(x, upsample_factor).to(dtype=self.l_linear.weight.dtype))) - -class GeneratorNSF(torch.nn.Module): - def __init__(self, initial_channel, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, sr, checkpointing = False): - super(GeneratorNSF, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - - self.upp = int(np.prod(upsample_rates)) - self.f0_upsamp = torch.nn.Upsample(scale_factor=self.upp) - self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0) - - self.conv_pre = torch.nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - self.checkpointing = checkpointing - - self.ups = torch.nn.ModuleList() - self.upsampler = torch.nn.ModuleList() - self.noise_convs = torch.nn.ModuleList() - - channels = [upsample_initial_channel // (2 ** (i + 1)) for i in range(len(upsample_rates))] - stride_f0s = [upsample_rates[1] * upsample_rates[2] * upsample_rates[3], upsample_rates[2] * upsample_rates[3], upsample_rates[3], 1] - - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - if self.upp == 441: - self.upsampler.append(torch.nn.Upsample(scale_factor=u, mode="linear")) - self.ups.append(weight_norm(torch.nn.Conv1d(upsample_initial_channel // (2**i), channels[i], kernel_size=1))) - self.noise_convs.append(torch.nn.Conv1d(in_channels=1, out_channels=channels[i], kernel_size = 1)) - else: - self.upsampler.append(torch.nn.Identity()) - self.ups.append(weight_norm(torch.nn.ConvTranspose1d(upsample_initial_channel // (2**i), channels[i], kernel_size=k, stride=u, padding=(k - u) // 2))) - self.noise_convs.append(torch.nn.Conv1d(1, channels[i], kernel_size=stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1, stride=stride_f0s[i], padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0))) - - self.resblocks = torch.nn.ModuleList([ResBlock(channels[i], k, d) for i in range(len(self.ups)) for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)]) - self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False) - - self.ups.apply(init_weights) - if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, f0, g = None): - har_source = self.m_source(f0, self.upp).transpose(1, 2) - x = self.conv_pre(x) - if g is not None: x += self.cond(g) - - for i, (ups, upr, noise_convs) in enumerate(zip(self.ups, self.upsampler, self.noise_convs)): - x = F.leaky_relu(x, LRELU_SLOPE) - - if self.training and self.checkpointing: - if self.upp == 441: x = upr(x) - x = checkpoint(ups, x, use_reentrant=False) - else: - if self.upp == 441: x = upr(x) - x = 
ups(x) - - h = noise_convs(har_source) - if self.upp == 441: h = torch.nn.functional.interpolate(h, size=x.shape[-1], mode="linear") - x += h - - def resblock_forward(x, blocks): - return sum(block(x) for block in blocks) / len(blocks) - - blocks = self.resblocks[i * self.num_kernels:(i + 1) * self.num_kernels] - x = checkpoint(resblock_forward, x, blocks, use_reentrant=False) if self.training and self.checkpointing else resblock_forward(x, blocks) - - return torch.tanh(self.conv_post(F.leaky_relu(x))) - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - - for l in self.resblocks: - l.remove_weight_norm() - -class LayerNorm(torch.nn.Module): - def __init__(self, channels, eps=1e-5, onnx=False): - super().__init__() - self.channels = channels - self.eps = eps - self.onnx = onnx - self.gamma = torch.nn.Parameter(torch.ones(channels)) - self.beta = torch.nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - return (F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) if self.onnx else F.layer_norm(x, (x.size(-1),), self.gamma, self.beta, self.eps)).transpose(1, -1) - -class MultiHeadAttention(torch.nn.Module): - def __init__(self, channels, out_channels, n_heads, p_dropout=0.0, window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False, onnx=False): - super().__init__() - assert channels % n_heads == 0 - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.p_dropout = p_dropout - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - self.onnx = onnx - self.attn = None - self.k_channels = channels // n_heads - self.conv_q = torch.nn.Conv1d(channels, channels, 1) - self.conv_k = torch.nn.Conv1d(channels, channels, 1) - self.conv_v = torch.nn.Conv1d(channels, channels, 1) - self.conv_o = torch.nn.Conv1d(channels, out_channels, 1) - self.drop = torch.nn.Dropout(p_dropout) - - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - - self.emb_rel_k = torch.nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.emb_rel_v = torch.nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - - torch.nn.init.xavier_uniform_(self.conv_q.weight) - torch.nn.init.xavier_uniform_(self.conv_k.weight) - torch.nn.init.xavier_uniform_(self.conv_v.weight) - - if proximal_init: - with torch.no_grad(): - self.conv_k.weight.copy_(self.conv_q.weight) - self.conv_k.bias.copy_(self.conv_q.bias) - - def forward(self, x, c, attn_mask=None): - q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c) - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - return self.conv_o(x) - - def attention(self, query, key, value, mask=None): - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) - - if self.window_size is not None: - assert (t_s == t_t), "(t_s == t_t)" - scores = scores + self._relative_position_to_absolute_position(self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), self._get_relative_embeddings(self.emb_rel_k, t_s, onnx=self.onnx)), onnx=self.onnx) - - if self.proximal_bias: - 
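# the proximal bias -log1p(|i - j|) added below is only defined for self-attention, hence source and target lengths must match -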
assert t_s == t_t, "t_s == t_t" - scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) - - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - assert (t_s == t_t), "(t_s == t_t)" - scores = scores.masked_fill((torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)) == 0, -1e4) - - p_attn = self.drop(F.softmax(scores, dim=-1)) - output = torch.matmul(p_attn, value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)) - - if self.window_size is not None: output = output + self._matmul_with_relative_values(self._absolute_position_to_relative_position(p_attn, onnx=self.onnx), self._get_relative_embeddings(self.emb_rel_v, t_s, onnx=self.onnx)) - return (output.transpose(2, 3).contiguous().view(b, d, t_t)), p_attn - - def _matmul_with_relative_values(self, x, y): - return torch.matmul(x, y.unsqueeze(0)) - - def _matmul_with_relative_keys(self, x, y): - return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - - def _get_relative_embeddings(self, relative_embeddings, length, onnx=False): - if onnx: - pad_length = torch.clamp(length - (self.window_size + 1), min=0) - slice_start_position = torch.clamp((self.window_size + 1) - length, min=0) - - return (F.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0]) if pad_length > 0 else relative_embeddings)[:, slice_start_position:(slice_start_position + 2 * length - 1)] - else: - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - - return (F.pad(relative_embeddings, convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) if pad_length > 0 else relative_embeddings)[:, slice_start_position:(slice_start_position + 2 * length - 1)] - - def _relative_position_to_absolute_position(self, x, onnx=False): - batch, heads, length, _ = x.size() - - return (F.pad(F.pad(x, [0, 1, 0, 0, 0, 0, 0, 0]).view([batch, heads, length * 2 * length]), [0, length - 1, 0, 0, 0, 0]).view([batch, heads, length + 1, 2 * length - 1]) if onnx else F.pad(F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])).view([batch, heads, length * 2 * length]), convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])).view([batch, heads, length + 1, 2 * length - 1]))[:, :, :length, length - 1 :] - - def _absolute_position_to_relative_position(self, x, onnx=False): - batch, heads, length, _ = x.size() - - return (F.pad(F.pad(x, [0, length - 1, 0, 0, 0, 0, 0, 0]).view([batch, heads, length*length + length * (length - 1)]), [length, 0, 0, 0, 0, 0]).view([batch, heads, length, 2 * length]) if onnx else F.pad(F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])).view([batch, heads, length**2 + length * (length - 1)]), convert_pad_shape([[0, 0], [0, 0], [length, 0]])).view([batch, heads, length, 2 * length]))[:, :, :, 1:] - - def _attention_bias_proximal(self, length): - r = torch.arange(length, dtype=torch.float32) - - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs((torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)))), 0), 0) - -class FFN(torch.nn.Module): - def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0, activation=None, causal=False, onnx=False): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - self.causal = causal - self.onnx = onnx - self.padding 
= self._causal_padding if causal else self._same_padding - self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size) - self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size) - self.drop = torch.nn.Dropout(p_dropout) - - def forward(self, x, x_mask): - x = self.conv_1(self.padding(x * x_mask)) - - return self.conv_2(self.padding(self.drop(((x * torch.sigmoid(1.702 * x)) if self.activation == "gelu" else torch.relu(x))) * x_mask)) * x_mask - - def _causal_padding(self, x): - if self.kernel_size == 1: return x - - return F.pad(x, [self.kernel_size - 1, 0, 0, 0, 0, 0]) if self.onnx else F.pad(x, convert_pad_shape([[0, 0], [0, 0], [(self.kernel_size - 1), 0]])) - - def _same_padding(self, x): - if self.kernel_size == 1: return x - - return F.pad(x, [(self.kernel_size - 1) // 2, self.kernel_size // 2, 0, 0, 0, 0]) if self.onnx else F.pad(x, convert_pad_shape([[0, 0], [0, 0], [((self.kernel_size - 1) // 2), (self.kernel_size // 2)]])) - -class Encoder(torch.nn.Module): - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.0, window_size=10, onnx=False, **kwargs): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - self.drop = torch.nn.Dropout(p_dropout) - self.attn_layers = torch.nn.ModuleList() - self.norm_layers_1 = torch.nn.ModuleList() - self.ffn_layers = torch.nn.ModuleList() - self.norm_layers_2 = torch.nn.ModuleList() - - for _ in range(self.n_layers): - self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size, onnx=onnx)) - self.norm_layers_1.append(LayerNorm(hidden_channels, onnx=onnx)) - - self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, onnx=onnx)) - self.norm_layers_2.append(LayerNorm(hidden_channels, onnx=onnx)) - - def forward(self, x, x_mask): - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - - for i in range(self.n_layers): - x = self.norm_layers_1[i](x + self.drop(self.attn_layers[i](x, x, attn_mask))) - x = self.norm_layers_2[i](x + self.drop(self.ffn_layers[i](x, x_mask))) - - return x * x_mask - -class TextEncoder(torch.nn.Module): - def __init__(self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, embedding_dim, f0=True, onnx=False): - super(TextEncoder, self).__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) - self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels) - self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True) - if f0: self.emb_pitch = torch.nn.Embedding(256, hidden_channels) - self.encoder = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout), onnx=onnx) - self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone, pitch, lengths): - x = torch.transpose(self.lrelu(((self.emb_phone(phone) if pitch is None else (self.emb_phone(phone) + self.emb_pitch(pitch))) * math.sqrt(self.hidden_channels))), 1, -1) - x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype) - m, logs = 
torch.split((self.proj(self.encoder(x * x_mask, x_mask)) * x_mask), self.out_channels, dim=1) - - return m, logs, x_mask - -class PosteriorEncoder(torch.nn.Module): - def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0): - super(PosteriorEncoder, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = WaveNet(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) - self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, x, x_lengths, g = None): - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) - m, logs = torch.split((self.proj(self.enc((self.pre(x) * x_mask), x_mask, g=g)) * x_mask), self.out_channels, dim=1) - - return ((m + torch.randn_like(m) * torch.exp(logs)) * x_mask), m, logs, x_mask - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - -class Synthesizer(torch.nn.Module): - def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr, use_f0, text_enc_hidden_dim=768, vocoder="Default", checkpointing=False, onnx=False, **kwargs): - super(Synthesizer, self).__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - self.spk_embed_dim = spk_embed_dim - self.use_f0 = use_f0 - self.enc_p = TextEncoder(inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, float(p_dropout), text_enc_hidden_dim, f0=use_f0, onnx=onnx) - - if use_f0: - if vocoder == "RefineGAN": self.dec = RefineGANGenerator(sample_rate=sr, upsample_rates=upsample_rates, num_mels=inter_channels, checkpointing=checkpointing) - elif vocoder == "MRF HiFi-GAN": self.dec = HiFiGANMRFGenerator(in_channel=inter_channels, upsample_initial_channel=upsample_initial_channel, upsample_rates=upsample_rates, upsample_kernel_sizes=upsample_kernel_sizes, resblock_kernel_sizes=resblock_kernel_sizes, resblock_dilations=resblock_dilation_sizes, gin_channels=gin_channels, sample_rate=sr, harmonic_num=8, checkpointing=checkpointing) - else: self.dec = GeneratorNSF(inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels, sr=sr, checkpointing=checkpointing) - else: self.dec = Generator(inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) - - self.enc_q = PosteriorEncoder(spec_channels, inter_channels, 
hidden_channels, 5, 1, 16, gin_channels=gin_channels) - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels) - self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - @torch.jit.ignore - def forward(self, phone, phone_lengths, pitch = None, pitchf = None, y = None, y_lengths = None, ds = None): - g = self.emb_g(ds).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - - if y is not None: - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) - - return (self.dec(z_slice, slice_segments(pitchf, ids_slice, self.segment_size, 2), g=g) if self.use_f0 else self.dec(z_slice, g=g)), ids_slice, x_mask, y_mask, (z, self.flow(z, y_mask, g=g), m_p, logs_p, m_q, logs_q) - else: return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) - - @torch.jit.export - def infer(self, phone, phone_lengths, pitch = None, nsff0 = None, sid = None, rate = None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - - if rate is not None: - assert isinstance(rate, torch.Tensor) - head = int(z_p.shape[2] * (1.0 - rate.item())) - z_p = z_p[:, :, head:] - x_mask = x_mask[:, :, head:] - if self.use_f0: nsff0 = nsff0[:, head:] - - if self.use_f0: - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, nsff0, g=g) - else: - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, g=g) - - return o, x_mask, (z, z_p, m_p, logs_p) - -class SynthesizerONNX(Synthesizer): - def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr, use_f0, text_enc_hidden_dim=768, vocoder="Default", checkpointing=False, **kwargs): - super().__init__(spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, sr, use_f0, text_enc_hidden_dim, vocoder, checkpointing, True) - self.speaker_map = None - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def construct_spkmixmap(self, n_speaker): - self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) - for i in range(n_speaker): - self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) - self.speaker_map = self.speaker_map.unsqueeze(0) - - def forward(self, phone, phone_lengths, g=None, rnd=None, pitch=None, nsff0=None, max_len=None): - g = self.emb_g(g).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask - - return self.dec((self.flow(z_p, x_mask, g=g, reverse=True) * x_mask)[:, :, :max_len], nsff0, g=g) if self.use_f0 else self.dec((self.flow(z_p, x_mask, g=g, reverse=True) * x_mask)[:, :, :max_len], g=g) \ No newline at end of file diff --git a/main/library/architectures/demucs_separator.py 
b/main/library/architectures/demucs_separator.py deleted file mode 100644 index 6002d3aa0de4c0c979178382d82bdc5ece563fa7..0000000000000000000000000000000000000000 --- a/main/library/architectures/demucs_separator.py +++ /dev/null @@ -1,160 +0,0 @@ -import os -import sys -import yaml -import torch - -import numpy as np - -from pathlib import Path -from hashlib import sha256 - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -from main.library.uvr5_separator import spec_utils, common_separator -from main.library.uvr5_separator.demucs import hdemucs, states, apply - -translations = Config().translations -sys.path.insert(0, os.path.join(os.getcwd(), "main", "library", "uvr5_separator")) -DEMUCS_4_SOURCE_MAPPER = {common_separator.CommonSeparator.BASS_STEM: 0, common_separator.CommonSeparator.DRUM_STEM: 1, common_separator.CommonSeparator.OTHER_STEM: 2, common_separator.CommonSeparator.VOCAL_STEM: 3} - -class DemucsSeparator(common_separator.CommonSeparator): - def __init__(self, common_config, arch_config): - super().__init__(config=common_config) - self.segment_size = arch_config.get("segment_size", "Default") - self.shifts = arch_config.get("shifts", 2) - self.overlap = arch_config.get("overlap", 0.25) - self.segments_enabled = arch_config.get("segments_enabled", True) - self.logger.debug(translations["demucs_info"].format(segment_size=self.segment_size, segments_enabled=self.segments_enabled)) - self.logger.debug(translations["demucs_info_2"].format(shifts=self.shifts, overlap=self.overlap)) - self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER - self.audio_file_path = None - self.audio_file_base = None - self.demucs_model_instance = None - self.logger.info(translations["start_demucs"]) - - def separate(self, audio_file_path): - self.logger.debug(translations["start_separator"]) - source = None - inst_source = {} - self.audio_file_path = audio_file_path - self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] - self.logger.debug(translations["prepare_mix"]) - mix = self.prepare_mix(self.audio_file_path) - self.logger.debug(translations["demix"].format(shape=mix.shape)) - self.logger.debug(translations["cancel_mix"]) - self.demucs_model_instance = hdemucs.HDemucs(sources=["drums", "bass", "other", "vocals"]) - self.demucs_model_instance = get_demucs_model(name=os.path.splitext(os.path.basename(self.model_path))[0], repo=Path(os.path.dirname(self.model_path))) - self.demucs_model_instance = apply.demucs_segments(self.segment_size, self.demucs_model_instance) - self.demucs_model_instance.to(self.torch_device) - self.demucs_model_instance.eval() - self.logger.debug(translations["model_review"]) - source = self.demix_demucs(mix) - del self.demucs_model_instance - self.clear_gpu_cache() - self.logger.debug(translations["del_gpu_cache_after_demix"]) - output_files = [] - self.logger.debug(translations["process_output_file"]) - - if isinstance(inst_source, np.ndarray): - self.logger.debug(translations["process_ver"]) - inst_source[self.demucs_source_map[common_separator.CommonSeparator.VOCAL_STEM]] = spec_utils.reshape_sources(inst_source[self.demucs_source_map[common_separator.CommonSeparator.VOCAL_STEM]], source[self.demucs_source_map[common_separator.CommonSeparator.VOCAL_STEM]]) - source = inst_source - - if isinstance(source, np.ndarray): - source_length = len(source) - self.logger.debug(translations["source_length"].format(source_length=source_length)) - self.logger.debug(translations["set_map"].format(part=source_length)) - match source_length: - 
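# pick the stem-index map from the number of sources the loaded model actually returned: 2 = instrumental/vocal split, 6 = six-stem model, anything else falls back to the standard four-stem map -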
case 2: self.demucs_source_map = {common_separator.CommonSeparator.INST_STEM: 0, common_separator.CommonSeparator.VOCAL_STEM: 1} - case 6: self.demucs_source_map = {common_separator.CommonSeparator.BASS_STEM: 0, common_separator.CommonSeparator.DRUM_STEM: 1, common_separator.CommonSeparator.OTHER_STEM: 2, common_separator.CommonSeparator.VOCAL_STEM: 3, common_separator.CommonSeparator.GUITAR_STEM: 4, common_separator.CommonSeparator.PIANO_STEM: 5} - case _: self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER - - self.logger.debug(translations["process_all_part"]) - for stem_name, stem_value in self.demucs_source_map.items(): - if self.output_single_stem is not None: - if stem_name.lower() != self.output_single_stem.lower(): - self.logger.debug(translations["skip_part"].format(stem_name=stem_name, output_single_stem=self.output_single_stem)) - continue - stem_path = os.path.join(f"{self.audio_file_base}_({stem_name})_{self.model_name}.{self.output_format.lower()}") - self.final_process(stem_path, source[stem_value].T, stem_name) - output_files.append(stem_path) - return output_files - - def demix_demucs(self, mix): - self.logger.debug(translations["starting_demix_demucs"]) - processed = {} - mix = torch.tensor(mix, dtype=torch.float32) - ref = mix.mean(0) - mix = (mix - ref.mean()) / ref.std() - mix_infer = mix - with torch.no_grad(): - self.logger.debug(translations["model_infer"]) - sources = apply.apply_model(model=self.demucs_model_instance, mix=mix_infer[None], shifts=self.shifts, split=self.segments_enabled, overlap=self.overlap, static_shifts=1 if self.shifts == 0 else self.shifts, set_progress_bar=None, device=self.torch_device, progress=True)[0] - sources = (sources * ref.std() + ref.mean()).cpu().numpy() - sources[[0, 1]] = sources[[1, 0]] - processed[mix] = sources[:, :, 0:None].copy() - return np.concatenate([s[:, :, 0:None] for s in list(processed.values())], axis=-1) - -class LocalRepo: - def __init__(self, root): - self.root = root - self.scan() - - def scan(self): - self._models, self._checksums = {}, {} - for file in self.root.iterdir(): - if file.suffix == ".th": - if "-" in file.stem: - xp_sig, checksum = file.stem.split("-") - self._checksums[xp_sig] = checksum - else: xp_sig = file.stem - - if xp_sig in self._models: raise RuntimeError(translations["del_all_but_one"].format(xp_sig=xp_sig)) - self._models[xp_sig] = file - - def has_model(self, sig): - return sig in self._models - - def get_model(self, sig): - try: - file = self._models[sig] - except KeyError: - raise RuntimeError(translations["not_found_model_signature"].format(sig=sig)) - - if sig in self._checksums: check_checksum(file, self._checksums[sig]) - return states.load_model(file) - -class BagOnlyRepo: - def __init__(self, root, model_repo): - self.root = root - self.model_repo = model_repo - self.scan() - - def scan(self): - self._bags = {} - for file in self.root.iterdir(): - if file.suffix == ".yaml": self._bags[file.stem] = file - - def get_model(self, name): - try: - yaml_file = self._bags[name] - except KeyError: - raise RuntimeError(translations["name_not_pretrained"].format(name=name)) - bag = yaml.safe_load(open(yaml_file)) - return apply.BagOfModels([self.model_repo.get_model(sig) for sig in bag["models"]], bag.get("weights"), bag.get("segment")) - -def check_checksum(path, checksum): - sha = sha256() - with open(path, "rb") as file: - while 1: - buf = file.read(2**20) - if not buf: break - sha.update(buf) - - actual_checksum = sha.hexdigest()[: len(checksum)] - if actual_checksum != checksum: raise 
RuntimeError(translations["invalid_checksum"].format(path=path, checksum=checksum, actual_checksum=actual_checksum)) - -def get_demucs_model(name, repo = None): - model_repo = LocalRepo(repo) - return (model_repo.get_model(name) if model_repo.has_model(name) else BagOnlyRepo(repo, model_repo).get_model(name)).eval() \ No newline at end of file diff --git a/main/library/architectures/mdx_separator.py b/main/library/architectures/mdx_separator.py deleted file mode 100644 index ea37bb49da1ba5942b4ee5b1a18834a535e3cb0d..0000000000000000000000000000000000000000 --- a/main/library/architectures/mdx_separator.py +++ /dev/null @@ -1,320 +0,0 @@ -import os -import sys -import onnx -import torch -import platform -import onnx2torch - -import numpy as np -import onnxruntime as ort - -from tqdm import tqdm - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -from main.library.uvr5_separator import spec_utils -from main.library.uvr5_separator.common_separator import CommonSeparator - -translations = Config().translations - -class MDXSeparator(CommonSeparator): - def __init__(self, common_config, arch_config): - super().__init__(config=common_config) - self.segment_size = arch_config.get("segment_size") - self.overlap = arch_config.get("overlap") - self.batch_size = arch_config.get("batch_size", 1) - self.hop_length = arch_config.get("hop_length") - self.enable_denoise = arch_config.get("enable_denoise") - self.logger.debug(translations["mdx_info"].format(batch_size=self.batch_size, segment_size=self.segment_size)) - self.logger.debug(translations["mdx_info_2"].format(overlap=self.overlap, hop_length=self.hop_length, enable_denoise=self.enable_denoise)) - self.compensate = self.model_data["compensate"] - self.dim_f = self.model_data["mdx_dim_f_set"] - self.dim_t = 2 ** self.model_data["mdx_dim_t_set"] - self.n_fft = self.model_data["mdx_n_fft_scale_set"] - self.config_yaml = self.model_data.get("config_yaml", None) - self.logger.debug(f"{translations['mdx_info_3']}: compensate = {self.compensate}, dim_f = {self.dim_f}, dim_t = {self.dim_t}, n_fft = {self.n_fft}") - self.logger.debug(f"{translations['mdx_info_3']}: config_yaml = {self.config_yaml}") - self.load_model() - self.n_bins = 0 - self.trim = 0 - self.chunk_size = 0 - self.gen_size = 0 - self.stft = None - self.primary_source = None - self.secondary_source = None - self.audio_file_path = None - self.audio_file_base = None - - def load_model(self): - self.logger.debug(translations["load_model_onnx"]) - - if self.segment_size == self.dim_t: - ort_session_options = ort.SessionOptions() - ort_session_options.log_severity_level = 3 if self.log_level > 10 else 0 - ort_inference_session = ort.InferenceSession(self.model_path, providers=self.onnx_execution_provider, sess_options=ort_session_options) - self.model_run = lambda spek: ort_inference_session.run(None, {"input": spek.cpu().numpy()})[0] - self.logger.debug(translations["load_model_onnx_success"]) - else: - self.model_run = onnx2torch.convert(onnx.load(self.model_path)) if platform.system() == 'Windows' else onnx2torch.convert(self.model_path) - self.model_run.to(self.torch_device).eval() - self.logger.debug(translations["onnx_to_pytorch"]) - - def separate(self, audio_file_path): - self.audio_file_path = audio_file_path - self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0] - self.logger.debug(translations["mix"].format(audio_file_path=self.audio_file_path)) - mix = self.prepare_mix(self.audio_file_path) - 
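# peak-normalize the mix before demixing so the model always sees a consistent input level -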
self.logger.debug(translations["normalization_demix"]) - mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold) - source = self.demix(mix) - self.logger.debug(translations["mix_success"]) - output_files = [] - self.logger.debug(translations["process_output_file"]) - - if not isinstance(self.primary_source, np.ndarray): - self.logger.debug(translations["primary_source"]) - self.primary_source = spec_utils.normalize(wave=source, max_peak=self.normalization_threshold).T - - if not isinstance(self.secondary_source, np.ndarray): - self.logger.debug(translations["secondary_source"]) - raw_mix = self.demix(mix, is_match_mix=True) - - if self.invert_using_spec: - self.logger.debug(translations["invert_using_spec"]) - self.secondary_source = spec_utils.invert_stem(raw_mix, source) - else: - self.logger.debug(translations["invert_using_spec_2"]) - self.secondary_source = mix.T - source.T - - if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower(): - self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}") - self.logger.info(translations["save_secondary_stem_output_path"].format(stem_name=self.secondary_stem_name, stem_output_path=self.secondary_stem_output_path)) - self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name) - output_files.append(self.secondary_stem_output_path) - - if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower(): - self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}") - if not isinstance(self.primary_source, np.ndarray): self.primary_source = source.T - - self.logger.info(translations["save_secondary_stem_output_path"].format(stem_name=self.primary_stem_name, stem_output_path=self.primary_stem_output_path)) - self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name) - output_files.append(self.primary_stem_output_path) - - return output_files - - def initialize_model_settings(self): - self.logger.debug(translations["starting_model"]) - - self.n_bins = self.n_fft // 2 + 1 - self.trim = self.n_fft // 2 - - self.chunk_size = self.hop_length * (self.segment_size - 1) - self.gen_size = self.chunk_size - 2 * self.trim - - self.stft = STFT(self.logger, self.n_fft, self.hop_length, self.dim_f, self.torch_device) - - self.logger.debug(f"{translations['input_info']}: n_fft = {self.n_fft} hop_length = {self.hop_length} dim_f = {self.dim_f}") - self.logger.debug(f"{translations['model_settings']}: n_bins = {self.n_bins}, Trim = {self.trim}, chunk_size = {self.chunk_size}, gen_size = {self.gen_size}") - - def initialize_mix(self, mix, is_ckpt=False): - self.logger.debug(translations["initialize_mix"].format(is_ckpt=is_ckpt, shape=mix.shape)) - - if mix.shape[0] != 2: - error_message = translations["!=2"].format(shape=mix.shape[0]) - self.logger.error(error_message) - raise ValueError(error_message) - - if is_ckpt: - self.logger.debug(translations["process_check"]) - pad = self.gen_size + self.trim - (mix.shape[-1] % self.gen_size) - self.logger.debug(f"{translations['cache']}: {pad}") - - mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, pad), dtype="float32")), 1) - - num_chunks = mixture.shape[-1] // self.gen_size - self.logger.debug(translations["shape"].format(shape=mixture.shape, 
num_chunks=num_chunks)) - - mix_waves = [mixture[:, i * self.gen_size : i * self.gen_size + self.chunk_size] for i in range(num_chunks)] - else: - self.logger.debug(translations["process_no_check"]) - mix_waves = [] - n_sample = mix.shape[1] - - pad = self.gen_size - n_sample % self.gen_size - self.logger.debug(translations["n_sample_or_pad"].format(n_sample=n_sample, pad=pad)) - - mix_p = np.concatenate((np.zeros((2, self.trim)), mix, np.zeros((2, pad)), np.zeros((2, self.trim))), 1) - self.logger.debug(f"{translations['shape_2']}: {mix_p.shape}") - - i = 0 - while i < n_sample + pad: - mix_waves.append(np.array(mix_p[:, i : i + self.chunk_size])) - - self.logger.debug(translations["process_part"].format(mix_waves=len(mix_waves), i=i, ii=i + self.chunk_size)) - i += self.gen_size - - mix_waves_tensor = torch.tensor(mix_waves, dtype=torch.float32).to(self.torch_device) - self.logger.debug(translations["mix_waves_to_tensor"].format(shape=mix_waves_tensor.shape)) - - return mix_waves_tensor, pad - - def demix(self, mix, is_match_mix=False): - self.logger.debug(f"{translations['demix_is_match_mix']}: {is_match_mix}...") - self.initialize_model_settings() - self.logger.debug(f"{translations['mix_shape']}: {mix.shape}") - tar_waves_ = [] - - if is_match_mix: - chunk_size = self.hop_length * (self.segment_size - 1) - overlap = 0.02 - self.logger.debug(translations["chunk_size_or_overlap"].format(chunk_size=chunk_size, overlap=overlap)) - else: - chunk_size = self.chunk_size - overlap = self.overlap - self.logger.debug(translations["chunk_size_or_overlap_standard"].format(chunk_size=chunk_size, overlap=overlap)) - - gen_size = chunk_size - 2 * self.trim - self.logger.debug(f"{translations['calc_size']}: {gen_size}") - - mixture = np.concatenate((np.zeros((2, self.trim), dtype="float32"), mix, np.zeros((2, gen_size + self.trim - ((mix.shape[-1]) % gen_size)), dtype="float32")), 1) - self.logger.debug(f"{translations['mix_cache']}: {mixture.shape}") - - step = int((1 - overlap) * chunk_size) - self.logger.debug(translations["step_or_overlap"].format(step=step, overlap=overlap)) - - result = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) - divider = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32) - - total = 0 - total_chunks = (mixture.shape[-1] + step - 1) // step - self.logger.debug(f"{translations['all_process_part']}: {total_chunks}") - - for i in tqdm(range(0, mixture.shape[-1], step), ncols=100, unit="f"): - total += 1 - start = i - end = min(i + chunk_size, mixture.shape[-1]) - self.logger.debug(translations["process_part_2"].format(total=total, total_chunks=total_chunks, start=start, end=end)) - - chunk_size_actual = end - start - window = None - - if overlap != 0: - window = np.hanning(chunk_size_actual) - window = np.tile(window[None, None, :], (1, 2, 1)) - self.logger.debug(translations["window"]) - - mix_part_ = mixture[:, start:end] - - if end != i + chunk_size: - pad_size = (i + chunk_size) - end - mix_part_ = np.concatenate((mix_part_, np.zeros((2, pad_size), dtype="float32")), axis=-1) - - mix_waves = torch.tensor([mix_part_], dtype=torch.float32).to(self.torch_device).split(self.batch_size) - - total_batches = len(mix_waves) - self.logger.debug(f"{translations['mix_or_batch']}: {total_batches}") - - with torch.no_grad(): - batches_processed = 0 - - for mix_wave in mix_waves: - batches_processed += 1 - self.logger.debug(f"{translations['mix_wave']} {batches_processed}/{total_batches}") - - tar_waves = self.run_model(mix_wave, is_match_mix=is_match_mix) - - if window is 
not None: - tar_waves[..., :chunk_size_actual] *= window - divider[..., start:end] += window - else: divider[..., start:end] += 1 - - result[..., start:end] += tar_waves[..., : end - start] - - - self.logger.debug(translations["normalization_2"]) - tar_waves = result / divider - tar_waves_.append(tar_waves) - - tar_waves = np.concatenate(np.vstack(tar_waves_)[:, :, self.trim : -self.trim], axis=-1)[:, : mix.shape[-1]] - - source = tar_waves[:, 0:None] - self.logger.debug(f"{translations['tar_waves']}: {tar_waves.shape}") - - if not is_match_mix: - source *= self.compensate - self.logger.debug(translations["mix_match"]) - - self.logger.debug(translations["mix_success"]) - return source - - def run_model(self, mix, is_match_mix=False): - spek = self.stft(mix.to(self.torch_device)) - self.logger.debug(translations["stft_2"].format(shape=spek.shape)) - - spek[:, :, :3, :] *= 0 - - if is_match_mix: - spec_pred = spek.cpu().numpy() - self.logger.debug(translations["is_match_mix"]) - else: - if self.enable_denoise: - spec_pred_neg = self.model_run(-spek) - spec_pred_pos = self.model_run(spek) - spec_pred = (spec_pred_neg * -0.5) + (spec_pred_pos * 0.5) - self.logger.debug(translations["enable_denoise"]) - else: - spec_pred = self.model_run(spek) - self.logger.debug(translations["no_denoise"]) - - result = self.stft.inverse(torch.tensor(spec_pred).to(self.torch_device)).cpu().detach().numpy() - self.logger.debug(f"{translations['stft']}: {result.shape}") - - return result - -class STFT: - def __init__(self, logger, n_fft, hop_length, dim_f, device): - self.logger = logger - self.n_fft = n_fft - self.hop_length = hop_length - self.dim_f = dim_f - self.device = device - self.hann_window = torch.hann_window(window_length=self.n_fft, periodic=True) - - def __call__(self, input_tensor): - is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"] - - if is_non_standard_device: input_tensor = input_tensor.cpu() - - batch_dimensions = input_tensor.shape[:-2] - channel_dim, time_dim = input_tensor.shape[-2:] - - permuted_stft_output = torch.stft(input_tensor.reshape([-1, time_dim]), n_fft=self.n_fft, hop_length=self.hop_length, window=self.hann_window.to(input_tensor.device), center=True, return_complex=False).permute([0, 3, 1, 2]) - final_output = permuted_stft_output.reshape([*batch_dimensions, channel_dim, 2, -1, permuted_stft_output.shape[-1]]).reshape([*batch_dimensions, channel_dim * 2, -1, permuted_stft_output.shape[-1]]) - - if is_non_standard_device: final_output = final_output.to(self.device) - return final_output[..., : self.dim_f, :] - - def pad_frequency_dimension(self, input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins): - return torch.cat([input_tensor, torch.zeros([*batch_dimensions, channel_dim, num_freq_bins - freq_dim, time_dim]).to(input_tensor.device)], -2) - - def calculate_inverse_dimensions(self, input_tensor): - channel_dim, freq_dim, time_dim = input_tensor.shape[-3:] - - return input_tensor.shape[:-3], channel_dim, freq_dim, time_dim, self.n_fft // 2 + 1 - - def prepare_for_istft(self, padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim): - permuted_tensor = padded_tensor.reshape([*batch_dimensions, channel_dim // 2, 2, num_freq_bins, time_dim]).reshape([-1, 2, num_freq_bins, time_dim]).permute([0, 2, 3, 1]) - - return permuted_tensor[..., 0] + permuted_tensor[..., 1] * 1.0j - - def inverse(self, input_tensor): - is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"] - if is_non_standard_device: 
input_tensor = input_tensor.cpu() - - batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins = self.calculate_inverse_dimensions(input_tensor) - final_output = torch.istft(self.prepare_for_istft(self.pad_frequency_dimension(input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins), batch_dimensions, channel_dim, num_freq_bins, time_dim), n_fft=self.n_fft, hop_length=self.hop_length, window=self.hann_window.to(input_tensor.device), center=True).reshape([*batch_dimensions, 2, -1]) - - if is_non_standard_device: final_output = final_output.to(self.device) - - return final_output \ No newline at end of file diff --git a/main/library/predictors/CREPE.py b/main/library/predictors/CREPE.py deleted file mode 100644 index 68353641469f60dd7f28326e96735f373ed2f5e4..0000000000000000000000000000000000000000 --- a/main/library/predictors/CREPE.py +++ /dev/null @@ -1,210 +0,0 @@ -import os -import torch -import librosa -import functools -import scipy.stats - -import numpy as np - -CENTS_PER_BIN, MAX_FMAX, PITCH_BINS, SAMPLE_RATE, WINDOW_SIZE = 20, 2006, 360, 16000, 1024 - -class Crepe(torch.nn.Module): - def __init__(self, model='full'): - super().__init__() - if model == 'full': - in_channels = [1, 1024, 128, 128, 128, 256] - out_channels = [1024, 128, 128, 128, 256, 512] - self.in_features = 2048 - elif model == 'large': - in_channels = [1, 768, 96, 96, 96, 192] - out_channels = [768, 96, 96, 96, 192, 384] - self.in_features = 1536 - elif model == 'medium': - in_channels = [1, 512, 64, 64, 64, 128] - out_channels = [512, 64, 64, 64, 128, 256] - self.in_features = 1024 - elif model == 'small': - in_channels = [1, 256, 32, 32, 32, 64] - out_channels = [256, 32, 32, 32, 64, 128] - self.in_features = 512 - elif model == 'tiny': - in_channels = [1, 128, 16, 16, 16, 32] - out_channels = [128, 16, 16, 16, 32, 64] - self.in_features = 256 - - kernel_sizes = [(512, 1)] + 5 * [(64, 1)] - strides = [(4, 1)] + 5 * [(1, 1)] - - batch_norm_fn = functools.partial(torch.nn.BatchNorm2d, eps=0.0010000000474974513, momentum=0.0) - - self.conv1 = torch.nn.Conv2d(in_channels=in_channels[0], out_channels=out_channels[0], kernel_size=kernel_sizes[0], stride=strides[0]) - self.conv1_BN = batch_norm_fn(num_features=out_channels[0]) - self.conv2 = torch.nn.Conv2d(in_channels=in_channels[1], out_channels=out_channels[1], kernel_size=kernel_sizes[1], stride=strides[1]) - self.conv2_BN = batch_norm_fn(num_features=out_channels[1]) - - self.conv3 = torch.nn.Conv2d(in_channels=in_channels[2], out_channels=out_channels[2], kernel_size=kernel_sizes[2], stride=strides[2]) - self.conv3_BN = batch_norm_fn(num_features=out_channels[2]) - self.conv4 = torch.nn.Conv2d(in_channels=in_channels[3], out_channels=out_channels[3], kernel_size=kernel_sizes[3], stride=strides[3]) - self.conv4_BN = batch_norm_fn(num_features=out_channels[3]) - - self.conv5 = torch.nn.Conv2d(in_channels=in_channels[4], out_channels=out_channels[4], kernel_size=kernel_sizes[4], stride=strides[4]) - self.conv5_BN = batch_norm_fn(num_features=out_channels[4]) - self.conv6 = torch.nn.Conv2d(in_channels=in_channels[5], out_channels=out_channels[5], kernel_size=kernel_sizes[5], stride=strides[5]) - self.conv6_BN = batch_norm_fn(num_features=out_channels[5]) - - self.classifier = torch.nn.Linear(in_features=self.in_features, out_features=PITCH_BINS) - - def forward(self, x, embed=False): - x = self.embed(x) - if embed: return x - - return torch.sigmoid(self.classifier(self.layer(x, self.conv6, self.conv6_BN).permute(0, 2, 1, 3).reshape(-1, 
self.in_features))) - - def embed(self, x): - x = x[:, None, :, None] - - return self.layer(self.layer(self.layer(self.layer(self.layer(x, self.conv1, self.conv1_BN, (0, 0, 254, 254)), self.conv2, self.conv2_BN), self.conv3, self.conv3_BN), self.conv4, self.conv4_BN), self.conv5, self.conv5_BN) - - def layer(self, x, conv, batch_norm, padding=(0, 0, 31, 32)): - return torch.nn.functional.max_pool2d(batch_norm(torch.nn.functional.relu(conv(torch.nn.functional.pad(x, padding)))), (2, 1), (2, 1)) - -def viterbi(logits): - if not hasattr(viterbi, 'transition'): - xx, yy = np.meshgrid(range(360), range(360)) - transition = np.maximum(12 - abs(xx - yy), 0) - viterbi.transition = transition / transition.sum(axis=1, keepdims=True) - - with torch.no_grad(): - probs = torch.nn.functional.softmax(logits, dim=1) - - bins = torch.tensor(np.array([librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64) for sequence in probs.cpu().numpy()]), device=probs.device) - return bins, bins_to_frequency(bins) - -def predict(audio, sample_rate, hop_length=None, fmin=50, fmax=MAX_FMAX, model='full', return_periodicity=False, batch_size=None, device='cpu', pad=True, providers=None, onnx=False): - results = [] - - if onnx: - import onnxruntime as ort - - sess_options = ort.SessionOptions() - sess_options.log_severity_level = 3 - - session = ort.InferenceSession(os.path.join("assets", "models", "predictors", f"crepe_{model}.onnx"), sess_options=sess_options, providers=providers) - - for frames in preprocess(audio, sample_rate, hop_length, batch_size, device, pad): - result = postprocess(torch.tensor(session.run([session.get_outputs()[0].name], {session.get_inputs()[0].name: frames.cpu().numpy()})[0].transpose(1, 0)[None]), fmin, fmax, return_periodicity) - results.append((result[0], result[1]) if isinstance(result, tuple) else result) - - del session - - if return_periodicity: - pitch, periodicity = zip(*results) - return torch.cat(pitch, 1), torch.cat(periodicity, 1) - - return torch.cat(results, 1) - else: - with torch.no_grad(): - for frames in preprocess(audio, sample_rate, hop_length, batch_size, device, pad): - result = postprocess(infer(frames, model, device, embed=False).reshape(audio.size(0), -1, PITCH_BINS).transpose(1, 2), fmin, fmax, return_periodicity) - results.append((result[0].to(audio.device), result[1].to(audio.device)) if isinstance(result, tuple) else result.to(audio.device)) - - if return_periodicity: - pitch, periodicity = zip(*results) - return torch.cat(pitch, 1), torch.cat(periodicity, 1) - - return torch.cat(results, 1) - -def bins_to_frequency(bins): - cents = CENTS_PER_BIN * bins + 1997.3794084376191 - return 10 * 2 ** ((cents + cents.new_tensor(scipy.stats.triang.rvs(c=0.5, loc=-CENTS_PER_BIN, scale=2 * CENTS_PER_BIN, size=cents.size()))) / 1200) - -def frequency_to_bins(frequency, quantize_fn=torch.floor): - return quantize_fn(((1200 * torch.log2(frequency / 10)) - 1997.3794084376191) / CENTS_PER_BIN).int() - -def infer(frames, model='full', device='cpu', embed=False): - if not hasattr(infer, 'model') or not hasattr(infer, 'capacity') or (hasattr(infer, 'capacity') and infer.capacity != model): load_model(device, model) - infer.model = infer.model.to(device) - - return infer.model(frames, embed=embed) - -def load_model(device, capacity='full'): - infer.capacity = capacity - infer.model = Crepe(capacity) - infer.model.load_state_dict(torch.load(os.path.join("assets", "models", "predictors", f"crepe_{capacity}.pth"), map_location=device)) - infer.model = 
infer.model.to(torch.device(device)) - infer.model.eval() - -def postprocess(probabilities, fmin=0, fmax=MAX_FMAX, return_periodicity=False): - probabilities = probabilities.detach() - - probabilities[:, :frequency_to_bins(torch.tensor(fmin))] = -float('inf') - probabilities[:, frequency_to_bins(torch.tensor(fmax), torch.ceil):] = -float('inf') - - bins, pitch = viterbi(probabilities) - - if not return_periodicity: return pitch - return pitch, periodicity(probabilities, bins) - -def preprocess(audio, sample_rate, hop_length=None, batch_size=None, device='cpu', pad=True): - hop_length = sample_rate // 100 if hop_length is None else hop_length - - if sample_rate != SAMPLE_RATE: - audio = torch.tensor(librosa.resample(audio.detach().cpu().numpy().squeeze(0), orig_sr=sample_rate, target_sr=SAMPLE_RATE, res_type="soxr_vhq"), device=audio.device).unsqueeze(0) - hop_length = int(hop_length * SAMPLE_RATE / sample_rate) - - if pad: - total_frames = 1 + int(audio.size(1) // hop_length) - audio = torch.nn.functional.pad(audio, (WINDOW_SIZE // 2, WINDOW_SIZE // 2)) - else: total_frames = 1 + int((audio.size(1) - WINDOW_SIZE) // hop_length) - - batch_size = total_frames if batch_size is None else batch_size - - for i in range(0, total_frames, batch_size): - frames = torch.nn.functional.unfold(audio[:, None, None, max(0, i * hop_length):min(audio.size(1), (i + batch_size - 1) * hop_length + WINDOW_SIZE)], kernel_size=(1, WINDOW_SIZE), stride=(1, hop_length)) - frames = frames.transpose(1, 2).reshape(-1, WINDOW_SIZE).to(device) - frames -= frames.mean(dim=1, keepdim=True) - frames /= torch.max(torch.tensor(1e-10, device=frames.device), frames.std(dim=1, keepdim=True)) - - yield frames - -def periodicity(probabilities, bins): - probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS) - periodicity = probs_stacked.gather(1, bins.reshape(-1, 1).to(torch.int64)) - - return periodicity.reshape(probabilities.size(0), probabilities.size(2)) - -def mean(signals, win_length=9): - assert signals.dim() == 2 - - signals = signals.unsqueeze(1) - mask = ~torch.isnan(signals) - padding = win_length // 2 - - ones_kernel = torch.ones(signals.size(1), 1, win_length, device=signals.device) - avg_pooled = torch.nn.functional.conv1d(torch.where(mask, signals, torch.zeros_like(signals)), ones_kernel, stride=1, padding=padding) / torch.nn.functional.conv1d(mask.float(), ones_kernel, stride=1, padding=padding).clamp(min=1) - avg_pooled[avg_pooled == 0] = float("nan") - - return avg_pooled.squeeze(1) - -def median(signals, win_length): - assert signals.dim() == 2 - - signals = signals.unsqueeze(1) - mask = ~torch.isnan(signals) - padding = win_length // 2 - - x = torch.nn.functional.pad(torch.where(mask, signals, torch.zeros_like(signals)), (padding, padding), mode="reflect") - mask = torch.nn.functional.pad(mask.float(), (padding, padding), mode="constant", value=0) - - x = x.unfold(2, win_length, 1) - mask = mask.unfold(2, win_length, 1) - - x = x.contiguous().view(x.size()[:3] + (-1,)) - mask = mask.contiguous().view(mask.size()[:3] + (-1,)) - - x_sorted, _ = torch.sort(torch.where(mask.bool(), x.float(), float("inf")).to(x), dim=-1) - - median_pooled = x_sorted.gather(-1, ((mask.sum(dim=-1) - 1) // 2).clamp(min=0).unsqueeze(-1).long()).squeeze(-1) - median_pooled[torch.isinf(median_pooled)] = float("nan") - - return median_pooled.squeeze(1) \ No newline at end of file diff --git a/main/library/predictors/FCPE.py b/main/library/predictors/FCPE.py deleted file mode 100644 index 
22965924537183ca0e809ecfb54b8c713eca472f..0000000000000000000000000000000000000000 --- a/main/library/predictors/FCPE.py +++ /dev/null @@ -1,1102 +0,0 @@ -import os -import io -import math -import torch -import librosa - -import numpy as np -import soundfile as sf -import onnxruntime as ort -import torch.nn.functional as F - -from torch import nn, einsum -from functools import partial -from Crypto.Cipher import AES -from Crypto.Util.Padding import unpad -from torchaudio.transforms import Resample -from einops import rearrange, repeat, pack, unpack -from torch.nn.utils.parametrizations import weight_norm - -from librosa.filters import mel as librosa_mel_fn - -os.environ["LRU_CACHE_CAPACITY"] = "3" - -def exists(val): - return val is not None - -def default(value, d): - return value if exists(value) else d - -def max_neg_value(tensor): - return -torch.finfo(tensor.dtype).max - -def empty(tensor): - return tensor.numel() == 0 - -def cast_tuple(val): - return (val,) if not isinstance(val, tuple) else val - -def l2norm(tensor): - return F.normalize(tensor, dim = -1).type(tensor.dtype) - -def decrypt_model(input_path): - with open(input_path, "rb") as f: - data = f.read() - - with open(os.path.join("main", "configs", "decrypt.bin"), "rb") as f: - key = f.read() - - return io.BytesIO(unpad(AES.new(key, AES.MODE_CBC, data[:16]).decrypt(data[16:]), AES.block_size)).read() - -def l2_regularization(model, l2_alpha): - l2_loss = [] - - for module in model.modules(): - if type(module) is nn.Conv2d: l2_loss.append((module.weight**2).sum() / 2.0) - - return l2_alpha * sum(l2_loss) - -def pad_to_multiple(tensor, multiple, dim=-1, value=0): - seqlen = tensor.shape[dim] - m = seqlen / multiple - - if m.is_integer(): return False, tensor - return True, F.pad(tensor, (*((0,) * (-1 - dim) * 2), 0, (math.ceil(m) * multiple - seqlen)), value = value) - -def look_around(x, backward = 1, forward = 0, pad_value = -1, dim = 2): - t = x.shape[1] - dims = (len(x.shape) - dim) * (0, 0) - - padded_x = F.pad(x, (*dims, backward, forward), value = pad_value) - return torch.cat([padded_x[:, ind:(ind + t), ...] for ind in range(forward + backward + 1)], dim = dim) - -def rotate_half(x): - x1, x2 = rearrange(x, 'b ... (r d) -> b ... 
r d', r = 2).unbind(dim = -2) - return torch.cat((-x2, x1), dim = -1) - -def apply_rotary_pos_emb(q, k, freqs, scale = 1): - q_len = q.shape[-2] - q_freqs = freqs[..., -q_len:, :] - inv_scale = scale ** -1 - - if scale.ndim == 2: scale = scale[-q_len:, :] - - q = (q * q_freqs.cos() * scale) + (rotate_half(q) * q_freqs.sin() * scale) - k = (k * freqs.cos() * inv_scale) + (rotate_half(k) * freqs.sin() * inv_scale) - - return q, k - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - -def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None): - unstructured_block = torch.randn((cols, cols), device=device) - - q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced") - q, r = map(lambda t: t.to(device), (q, r)) - - if qr_uniform_q: - d = torch.diag(r, 0) - q *= d.sign() - - return q.t() - -def linear_attention(q, k, v): - return torch.einsum("...ed,...nd->...ne", k, q) if v is None else torch.einsum("...de,...nd,...n->...ne", torch.einsum("...nd,...ne->...de", k, v), q, 1.0 / (torch.einsum("...nd,...d->...n", q, k.sum(dim=-2).type_as(q)) + 1e-8)) - -def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None): - nb_full_blocks = int(nb_rows / nb_columns) - block_list = [] - - for _ in range(nb_full_blocks): - block_list.append(orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device)) - - remaining_rows = nb_rows - nb_full_blocks * nb_columns - if remaining_rows > 0: block_list.append(orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device)[:remaining_rows]) - - if scaling == 0: multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1) - elif scaling == 1: multiplier = math.sqrt((float(nb_columns))) * torch.ones((nb_rows,), device=device) - else: raise ValueError(f"{scaling} != 0, 1") - - return torch.diag(multiplier) @ torch.cat(block_list) - -def calc_same_padding(kernel_size): - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - -def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None): - b, h, *_ = data.shape - - data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0 - ratio = projection_matrix.shape[0] ** -0.5 - - data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), repeat(projection_matrix, "j d -> b h j d", b=b, h=h).type_as(data)) - diag_data = ((torch.sum(data**2, dim=-1) / 2.0) * (data_normalizer**2)).unsqueeze(dim=-1) - - return (ratio * (torch.exp(data_dash - diag_data - torch.max(data_dash, dim=-1, keepdim=True).values) + eps) if is_query else ratio * (torch.exp(data_dash - diag_data + eps))).type_as(data) - -def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): - sample_rate = None  # defined up front so the fallback return below works even when sf.read raises - try: - data, sample_rate = sf.read(full_path, always_2d=True) - except Exception as e: - print(f"{full_path}: {e}") - - if return_empty_on_exception: return [], sample_rate or target_sr or 48000 - else: raise - - data = data[:, 0] if len(data.shape) > 1 else data - assert len(data) > 2 - - max_mag = (-np.iinfo(data.dtype).min if np.issubdtype(data.dtype, np.integer) else max(np.amax(data), -np.amin(data))) - data = torch.FloatTensor(data.astype(np.float32)) / ((2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)) - - if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception: return [], sample_rate or target_sr or 48000 - - if target_sr is not None and sample_rate != 
target_sr: - data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sample_rate, target_sr=target_sr)) - sample_rate = target_sr - - return data, sample_rate - -def torch_interp(x, xp, fp): - sort_idx = torch.argsort(xp) - - xp = xp[sort_idx] - fp = fp[sort_idx] - - right_idxs = torch.searchsorted(xp, x).clamp(max=len(xp) - 1) - left_idxs = (right_idxs - 1).clamp(min=0) - - x_left = xp[left_idxs] - y_left = fp[left_idxs] - - interp_vals = y_left + ((x - x_left) * (fp[right_idxs] - y_left) / (xp[right_idxs] - x_left)) - interp_vals[x < xp[0]] = fp[0] - interp_vals[x > xp[-1]] = fp[-1] - - return interp_vals - -def batch_interp_with_replacement_detach(uv, f0): - result = f0.clone() - - for i in range(uv.shape[0]): - interp_vals = torch_interp(torch.where(uv[i])[-1], torch.where(~uv[i])[-1], f0[i][~uv[i]]).detach() - result[i][uv[i]] = interp_vals - - return result - -def spawn_model(args): - return CFNaiveMelPE(input_channels=catch_none_args_must(args.mel.num_mels, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.mel.num_mels is None"), out_dims=catch_none_args_must(args.model.out_dims, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.out_dims is None"), hidden_dims=catch_none_args_must(args.model.hidden_dims, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.hidden_dims is None"), n_layers=catch_none_args_must(args.model.n_layers, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.n_layers is None"), n_heads=catch_none_args_must(args.model.n_heads, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.n_heads is None"), f0_max=catch_none_args_must(args.model.f0_max, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.f0_max is None"), f0_min=catch_none_args_must(args.model.f0_min, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.f0_min is None"), use_fa_norm=catch_none_args_must(args.model.use_fa_norm, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.use_fa_norm is None"), conv_only=catch_none_args_opti(args.model.conv_only, default=False, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.conv_only is None"), conv_dropout=catch_none_args_opti(args.model.conv_dropout, default=0.0, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.conv_dropout is None"), atten_dropout=catch_none_args_opti(args.model.atten_dropout, default=0.0, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.atten_dropout is None"), use_harmonic_emb=catch_none_args_opti(args.model.use_harmonic_emb, default=False, func_name="torchfcpe.tools.spawn_cf_naive_mel_pe", warning_str="args.model.use_harmonic_emb is None")) - -def catch_none_args_must(x, func_name, warning_str): - level = "ERROR" - - if x is None: - print(f' [{level}] {warning_str}') - print(f' [{level}] > {func_name}') - raise ValueError(f' [{level}] {warning_str}') - else: return x - -def catch_none_args_opti(x, default, func_name, warning_str=None, level='WARN'): - return default if x is None else x - -def spawn_wav2mel(args, device = None): - _type = args.mel.type - - if (str(_type).lower() == 'none') or (str(_type).lower() == 'default'): _type = 'default' - elif str(_type).lower() == 'stft': _type = 'stft' - - wav2mel = Wav2MelModule(sr=catch_none_args_opti(args.mel.sr, default=16000, func_name='torchfcpe.tools.spawn_wav2mel', warning_str='args.mel.sr is None'), 
n_mels=catch_none_args_opti(args.mel.num_mels, default=128, func_name='torchfcpe.tools.spawn_wav2mel', warning_str='args.mel.num_mels is None'), n_fft=catch_none_args_opti(args.mel.n_fft, default=1024, func_name='torchfcpe.tools.spawn_wav2mel', warning_str='args.mel.n_fft is None'), win_size=catch_none_args_opti(args.mel.win_size, default=1024, func_name='torchfcpe.tools.spawn_wav2mel', warning_str='args.mel.win_size is None'), hop_length=catch_none_args_opti(args.mel.hop_size, default=160, func_name='torchfcpe.tools.spawn_wav2mel', warning_str='args.mel.hop_size is None'), fmin=catch_none_args_opti(args.mel.fmin, default=0, func_name='torchfcpe.tools.spawn_wav2mel', warning_str='args.mel.fmin is None'), fmax=catch_none_args_opti(args.mel.fmax, default=8000, func_name='torchfcpe.tools.spawn_wav2mel', warning_str='args.mel.fmax is None'), clip_val=1e-05, mel_type=_type) - device = catch_none_args_opti(device, default='cpu', func_name='torchfcpe.tools.spawn_wav2mel', warning_str='.device is None') - - return wav2mel.to(torch.device(device)) - -def ensemble_f0(f0s, key_shift_list, tta_uv_penalty): - device = f0s.device - f0s = f0s / (torch.pow(2, torch.tensor(key_shift_list, device=device).to(device).unsqueeze(0).unsqueeze(0) / 12)) - - notes = torch.log2(f0s / 440) * 12 + 69 - notes[notes < 0] = 0 - - uv_penalty = tta_uv_penalty**2 - dp = torch.zeros_like(notes, device=device) - - backtrack = torch.zeros_like(notes, device=device).long() - dp[:, 0, :] = (notes[:, 0, :] <= 0) * uv_penalty - - for t in range(1, notes.size(1)): - penalty = torch.zeros([notes.size(0), notes.size(2), notes.size(2)], device=device) - t_uv = notes[:, t, :] <= 0 - penalty += uv_penalty * t_uv.unsqueeze(1) - - t1_uv = notes[:, t - 1, :] <= 0 - l2 = torch.pow((notes[:, t - 1, :].unsqueeze(-1) - notes[:, t, :].unsqueeze(1)) * (~t1_uv).unsqueeze(-1) * (~t_uv).unsqueeze(1), 2) - 0.5 - l2 = l2 * (l2 > 0) - - penalty += l2 - penalty += t1_uv.unsqueeze(-1) * (~t_uv).unsqueeze(1) * uv_penalty * 2 - - min_value, min_indices = torch.min(dp[:, t - 1, :].unsqueeze(-1) + penalty, dim=1) - dp[:, t, :] = min_value - backtrack[:, t, :] = min_indices - - t = f0s.size(1) - 1 - f0_result = torch.zeros_like(f0s[:, :, 0], device=device) - min_indices = torch.argmin(dp[:, t, :], dim=-1) - - for i in range(0, t + 1): - f0_result[:, t - i] = f0s[:, t - i, min_indices] - min_indices = backtrack[:, t - i, min_indices] - - return f0_result.unsqueeze(-1) - -class LocalAttention(nn.Module): - def __init__(self, window_size, causal = False, look_backward = 1, look_forward = None, dropout = 0., shared_qk = False, rel_pos_emb_config = None, dim = None, autopad = False, exact_windowsize = False, scale = None, use_rotary_pos_emb = True, use_xpos = False, xpos_scale_base = None): - super().__init__() - look_forward = default(look_forward, 0 if causal else 1) - assert not (causal and look_forward > 0) - self.scale = scale - self.window_size = window_size - self.autopad = autopad - self.exact_windowsize = exact_windowsize - self.causal = causal - self.look_backward = look_backward - self.look_forward = look_forward - self.dropout = nn.Dropout(dropout) - self.shared_qk = shared_qk - self.rel_pos = None - self.use_xpos = use_xpos - - if use_rotary_pos_emb and (exists(rel_pos_emb_config) or exists(dim)): - if exists(rel_pos_emb_config): dim = rel_pos_emb_config[0] - self.rel_pos = SinusoidalEmbeddings(dim, use_xpos = use_xpos, scale_base = default(xpos_scale_base, window_size // 2)) - - def forward(self, q, k, v, mask = None, input_mask = None, attn_bias = 
None, window_size = None): - mask = default(mask, input_mask) - assert not (exists(window_size) and not self.use_xpos) - - _, autopad, pad_value, window_size, causal, look_backward, look_forward, shared_qk = q.shape, self.autopad, -1, default(window_size, self.window_size), self.causal, self.look_backward, self.look_forward, self.shared_qk - (q, packed_shape), (k, _), (v, _) = map(lambda t: pack([t], '* n d'), (q, k, v)) - - if autopad: - orig_seq_len = q.shape[1] - (_, q), (_, k), (_, v) = map(lambda t: pad_to_multiple(t, self.window_size, dim = -2), (q, k, v)) - - b, n, dim_head, device, dtype = *q.shape, q.device, q.dtype - scale = default(self.scale, dim_head ** -0.5) - - assert (n % window_size) == 0 - windows = n // window_size - - if shared_qk: k = l2norm(k) - - seq = torch.arange(n, device = device) - b_t = rearrange(seq, '(w n) -> 1 w n', w = windows, n = window_size) - bq, bk, bv = map(lambda t: rearrange(t, 'b (w n) d -> b w n d', w = windows), (q, k, v)) - - bq = bq * scale - look_around_kwargs = dict(backward = look_backward, forward = look_forward, pad_value = pad_value) - - bk = look_around(bk, **look_around_kwargs) - bv = look_around(bv, **look_around_kwargs) - - if exists(self.rel_pos): - pos_emb, xpos_scale = self.rel_pos(bk) - bq, bk = apply_rotary_pos_emb(bq, bk, pos_emb, scale = xpos_scale) - - bq_t = b_t - bq_k = look_around(b_t, **look_around_kwargs) - - bq_t = rearrange(bq_t, '... i -> ... i 1') - bq_k = rearrange(bq_k, '... j -> ... 1 j') - - pad_mask = bq_k == pad_value - sim = einsum('b h i e, b h j e -> b h i j', bq, bk) - - if exists(attn_bias): - heads = attn_bias.shape[0] - assert (b % heads) == 0 - - attn_bias = repeat(attn_bias, 'h i j -> (b h) 1 i j', b = b // heads) - sim = sim + attn_bias - - mask_value = max_neg_value(sim) - - if shared_qk: - self_mask = bq_t == bq_k - sim = sim.masked_fill(self_mask, -5e4) - del self_mask - - if causal: - causal_mask = bq_t < bq_k - if self.exact_windowsize: causal_mask = causal_mask | (bq_t > (bq_k + (self.window_size * self.look_backward))) - sim = sim.masked_fill(causal_mask, mask_value) - del causal_mask - - sim = sim.masked_fill(((bq_k - (self.window_size * self.look_forward)) > bq_t) | (bq_t > (bq_k + (self.window_size * self.look_backward))) | pad_mask, mask_value) if not causal and self.exact_windowsize else sim.masked_fill(pad_mask, mask_value) - - if exists(mask): - batch = mask.shape[0] - assert (b % batch) == 0 - - h = b // mask.shape[0] - if autopad: _, mask = pad_to_multiple(mask, window_size, dim = -1, value = False) - - mask = repeat(rearrange(look_around(rearrange(mask, '... (w n) -> (...) w n', w = windows, n = window_size), **{**look_around_kwargs, 'pad_value': False}), '... j -> ... 1 j'), 'b ... -> (b h) ...', h = h) - sim = sim.masked_fill(~mask, mask_value) - - del mask - - out = rearrange(einsum('b h i j, b h j e -> b h i e', self.dropout(sim.softmax(dim = -1)), bv), 'b w n d -> b (w n) d') - if autopad: out = out[:, :orig_seq_len, :] - - out, *_ = unpack(out, packed_shape, '* n d') - return out - -class SinusoidalEmbeddings(nn.Module): - def __init__(self, dim, scale_base = None, use_xpos = False, theta = 10000): - super().__init__() - inv_freq = 1. 
/ (theta ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer('inv_freq', inv_freq) - self.use_xpos = use_xpos - self.scale_base = scale_base - assert not (use_xpos and not exists(scale_base)) - scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim) - self.register_buffer('scale', scale, persistent = False) - - def forward(self, x): - seq_len, device = x.shape[-2], x.device - t = torch.arange(seq_len, device = x.device).type_as(self.inv_freq) - - freqs = torch.einsum('i , j -> i j', t, self.inv_freq) - freqs = torch.cat((freqs, freqs), dim = -1) - - if not self.use_xpos: return freqs, torch.ones(1, device = device) - - power = (t - (seq_len // 2)) / self.scale_base - scale = self.scale ** rearrange(power, 'n -> n 1') - - return freqs, torch.cat((scale, scale), dim = -1) - -class STFT: - def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): - self.target_sr = sr - self.n_mels = n_mels - self.n_fft = n_fft - self.win_size = win_size - self.hop_length = hop_length - self.fmin = fmin - self.fmax = fmax - self.clip_val = clip_val - self.mel_basis = {} - self.hann_window = {} - - def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): - n_fft = self.n_fft - win_size = self.win_size - hop_length = self.hop_length - fmax = self.fmax - factor = 2 ** (keyshift / 12) - win_size_new = int(np.round(win_size * factor)) - hop_length_new = int(np.round(hop_length * speed)) - mel_basis = self.mel_basis if not train else {} - hann_window = self.hann_window if not train else {} - mel_basis_key = str(fmax) + "_" + str(y.device) - - if mel_basis_key not in mel_basis: mel_basis[mel_basis_key] = torch.from_numpy(librosa_mel_fn(sr=self.target_sr, n_fft=n_fft, n_mels=self.n_mels, fmin=self.fmin, fmax=fmax)).float().to(y.device) - keyshift_key = str(keyshift) + "_" + str(y.device) - if keyshift_key not in hann_window: hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) - - pad_left = (win_size_new - hop_length_new) // 2 - pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left) - - spec = torch.stft(torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode="reflect" if pad_right < y.size(-1) else "constant").squeeze(1), int(np.round(n_fft * factor)), hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True) - spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) - - if keyshift != 0: - size = n_fft // 2 + 1 - resize = spec.size(1) - spec = (F.pad(spec, (0, 0, 0, size - resize)) if resize < size else spec[:, :size, :]) * win_size / win_size_new - - return dynamic_range_compression_torch(torch.matmul(mel_basis[mel_basis_key], spec), clip_val=self.clip_val) - - def __call__(self, audiopath): - audio, _ = load_wav_to_torch(audiopath, target_sr=self.target_sr) - return self.get_mel(audio.unsqueeze(0)).squeeze(0) - -class PCmer(nn.Module): - def __init__(self, num_layers, num_heads, dim_model, dim_keys, dim_values, residual_dropout, attention_dropout): - super().__init__() - self.num_layers = num_layers - self.num_heads = num_heads - self.dim_model = dim_model - self.dim_values = dim_values - self.dim_keys = dim_keys - self.residual_dropout = residual_dropout - self.attention_dropout = attention_dropout - self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)]) - - def forward(self, phone, mask=None): - for 
layer in self._layers: - phone = layer(phone, mask) - - return phone - -class _EncoderLayer(nn.Module): - def __init__(self, parent): - super().__init__() - self.conformer = ConformerConvModule_LEGACY(parent.dim_model) - self.norm = nn.LayerNorm(parent.dim_model) - self.dropout = nn.Dropout(parent.residual_dropout) - self.attn = SelfAttention(dim=parent.dim_model, heads=parent.num_heads, causal=False) - - def forward(self, phone, mask=None): - phone = phone + (self.attn(self.norm(phone), mask=mask)) - return phone + (self.conformer(phone)) - -class ConformerNaiveEncoder(nn.Module): - def __init__(self, num_layers, num_heads, dim_model, use_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0): - super().__init__() - self.num_layers = num_layers - self.num_heads = num_heads - self.dim_model = dim_model - self.use_norm = use_norm - self.residual_dropout = 0.1 - self.attention_dropout = 0.1 - self.encoder_layers = nn.ModuleList([CFNEncoderLayer(dim_model, num_heads, use_norm, conv_only, conv_dropout, atten_dropout) for _ in range(num_layers)]) - - def forward(self, x, mask=None): - for (_, layer) in enumerate(self.encoder_layers): - x = layer(x, mask) - - return x - -class CFNaiveMelPE(nn.Module): - def __init__(self, input_channels, out_dims, hidden_dims = 512, n_layers = 6, n_heads = 8, f0_max = 1975.5, f0_min = 32.70, use_fa_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0, use_harmonic_emb = False): - super().__init__() - self.input_channels = input_channels - self.out_dims = out_dims - self.hidden_dims = hidden_dims - self.n_layers = n_layers - self.n_heads = n_heads - self.f0_max = f0_max - self.f0_min = f0_min - self.use_fa_norm = use_fa_norm - self.residual_dropout = 0.1 - self.attention_dropout = 0.1 - self.harmonic_emb = nn.Embedding(9, hidden_dims) if use_harmonic_emb else None - self.input_stack = nn.Sequential(nn.Conv1d(input_channels, hidden_dims, 3, 1, 1), nn.GroupNorm(4, hidden_dims), nn.LeakyReLU(), nn.Conv1d(hidden_dims, hidden_dims, 3, 1, 1)) - self.net = ConformerNaiveEncoder(num_layers=n_layers, num_heads=n_heads, dim_model=hidden_dims, use_norm=use_fa_norm, conv_only=conv_only, conv_dropout=conv_dropout, atten_dropout=atten_dropout) - self.norm = nn.LayerNorm(hidden_dims) - self.output_proj = weight_norm(nn.Linear(hidden_dims, out_dims)) - self.cent_table_b = torch.linspace(self.f0_to_cent(torch.Tensor([f0_min]))[0], self.f0_to_cent(torch.Tensor([f0_max]))[0], out_dims).detach() - self.register_buffer("cent_table", self.cent_table_b) - self.gaussian_blurred_cent_mask_b = (1200 * torch.log2(torch.Tensor([self.f0_max / 10.])))[0].detach() - self.register_buffer("gaussian_blurred_cent_mask", self.gaussian_blurred_cent_mask_b) - - def forward(self, x, _h_emb=None): - x = self.input_stack(x.transpose(-1, -2)).transpose(-1, -2) - if self.harmonic_emb is not None: x = x + self.harmonic_emb(torch.LongTensor([0]).to(x.device)) if _h_emb is None else x + self.harmonic_emb(torch.LongTensor([int(_h_emb)]).to(x.device)) - - return torch.sigmoid(self.output_proj(self.norm(self.net(x)))) - - @torch.no_grad() - def latent2cents_decoder(self, y, threshold = 0.05, mask = True): - B, N, _ = y.size() - ci = self.cent_table[None, None, :].expand(B, N, -1) - rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(y, dim=-1, keepdim=True) - - if mask: - confident = torch.max(y, dim=-1, keepdim=True)[0] - confident_mask = torch.ones_like(confident) - confident_mask[confident <= threshold] = float("-INF") - rtn = rtn * confident_mask - - return rtn - - 
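-    # For intuition: the soft-argmax decoders above and below average cent_table weighted by the
-    # network's per-bin salience, and cents/Hz conversion follows f0 = 10 * 2 ** (cent / 1200) and
-    # cent = 1200 * log2(f0 / 10), e.g. 440 Hz -> ~6551.3 cents and back.
-    # Minimal usage sketch (hypothetical shapes, `model` being a CFNaiveMelPE instance):
-    #   y = torch.softmax(torch.randn(1, 100, 360), dim=-1)   # (B, frames, out_dims) salience
-    #   f0 = model.cent_to_f0(model.latent2cents_decoder(y))  # (1, 100, 1) Hz; 0 Hz where confidence <= threshold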
-    @torch.no_grad()
-    def latent2cents_local_decoder(self, y, threshold = 0.05, mask = True):
-        B, N, _ = y.size()
-        ci = self.cent_table[None, None, :].expand(B, N, -1)
-        confident, max_index = torch.max(y, dim=-1, keepdim=True)
-
-        local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
-        local_argmax_index[local_argmax_index < 0] = 0
-        local_argmax_index[local_argmax_index >= self.out_dims] = self.out_dims - 1
-
-        y_l = torch.gather(y, -1, local_argmax_index)
-        rtn = torch.sum(torch.gather(ci, -1, local_argmax_index) * y_l, dim=-1, keepdim=True) / torch.sum(y_l, dim=-1, keepdim=True)
-
-        if mask:
-            confident_mask = torch.ones_like(confident)
-            confident_mask[confident <= threshold] = float("-INF")
-
-            rtn = rtn * confident_mask
-
-        return rtn
-
-    @torch.no_grad()
-    def infer(self, mel, decoder = "local_argmax", threshold = 0.05):
-        latent = self.forward(mel)
-
-        if decoder == "argmax": cents = self.latent2cents_decoder
-        elif decoder == "local_argmax": cents = self.latent2cents_local_decoder
-
-        return self.cent_to_f0(cents(latent, threshold=threshold))
-
-    @torch.no_grad()
-    def cent_to_f0(self, cent: torch.Tensor) -> torch.Tensor:
-        return 10 * 2 ** (cent / 1200)
-
-    @torch.no_grad()
-    def f0_to_cent(self, f0):
-        return 1200 * torch.log2(f0 / 10)
-
-class CFNEncoderLayer(nn.Module):
-    def __init__(self, dim_model, num_heads = 8, use_norm = False, conv_only = False, conv_dropout = 0, atten_dropout = 0):
-        super().__init__()
-
-        self.conformer = nn.Sequential(ConformerConvModule(dim_model), nn.Dropout(conv_dropout)) if conv_dropout > 0 else ConformerConvModule(dim_model)
-        self.norm = nn.LayerNorm(dim_model)
-
-        self.dropout = nn.Dropout(0.1)
-        self.attn = SelfAttention(dim=dim_model, heads=num_heads, causal=False, use_norm=use_norm, dropout=atten_dropout) if not conv_only else None
-
-    def forward(self, x, mask=None):
-        if self.attn is not None: x = x + (self.attn(self.norm(x), mask=mask))
-        return x + (self.conformer(x))
-
-class Swish(nn.Module):
-    def forward(self, x):
-        return x * x.sigmoid()
-
-class Transpose(nn.Module):
-    def __init__(self, dims):
-        super().__init__()
-        assert len(dims) == 2, "dims == 2"
-
-        self.dims = dims
-
-    def forward(self, x):
-        return x.transpose(*self.dims)
-
-class GLU(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-
-    def forward(self, x):
-        out, gate = x.chunk(2, dim=self.dim)
-        return out * gate.sigmoid()
-
-class DepthWiseConv1d_LEGACY(nn.Module):
-    def __init__(self, chan_in, chan_out, kernel_size, padding):
-        super().__init__()
-        self.padding = padding
-        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)
-
-    def forward(self, x):
-        return self.conv(F.pad(x, self.padding))
-
-class DepthWiseConv1d(nn.Module):
-    def __init__(self, chan_in, chan_out, kernel_size, padding, groups):
-        super().__init__()
-        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size=kernel_size, padding=padding, groups=groups)
-
-    def forward(self, x):
-        return self.conv(x)
-
-class ConformerConvModule_LEGACY(nn.Module):
-    def __init__(self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0):
-        super().__init__()
-        inner_dim = dim * expansion_factor
-        self.net = nn.Sequential(nn.LayerNorm(dim), Transpose((1, 2)), nn.Conv1d(dim, inner_dim * 2, 1), GLU(dim=1), DepthWiseConv1d_LEGACY(inner_dim, inner_dim, kernel_size=kernel_size, padding=(calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0))), Swish(), nn.Conv1d(inner_dim, dim, 1), Transpose((1, 2)), nn.Dropout(dropout))
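-        # For intuition: calc_same_padding(k) = (k // 2, k // 2 - (k + 1) % 2), i.e. (15, 15) for the
-        # default kernel_size=31, so the non-causal depthwise convolution preserves sequence length;
-        # the causal branch instead pads (k - 1, 0) entirely on the left so no frame sees the future.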
-
-    def forward(self, x):
-        return self.net(x)
-
-class ConformerConvModule(nn.Module):
-    def __init__(self, dim, expansion_factor=2, kernel_size=31, dropout=0):
-        super().__init__()
-        inner_dim = dim * expansion_factor
-
-        self.net = nn.Sequential(nn.LayerNorm(dim), Transpose((1, 2)), nn.Conv1d(dim, inner_dim * 2, 1), nn.GLU(dim=1), DepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=calc_same_padding(kernel_size)[0], groups=inner_dim), nn.SiLU(), nn.Conv1d(inner_dim, dim, 1), Transpose((1, 2)), nn.Dropout(dropout))
-
-    def forward(self, x):
-        return self.net(x)
-
-class FastAttention(nn.Module):
-    def __init__(self, dim_heads, nb_features=None, ortho_scaling=0, causal=False, generalized_attention=False, kernel_fn=nn.ReLU(), qr_uniform_q=False, no_projection=False):
-        super().__init__()
-        nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
-        self.dim_heads = dim_heads
-        self.nb_features = nb_features
-        self.ortho_scaling = ortho_scaling
-        self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows=self.nb_features, nb_columns=dim_heads, scaling=ortho_scaling, qr_uniform_q=qr_uniform_q)
-        projection_matrix = self.create_projection()
-        self.register_buffer("projection_matrix", projection_matrix)
-        self.generalized_attention = generalized_attention
-        self.kernel_fn = kernel_fn
-        self.no_projection = no_projection
-        self.causal = causal
-
-    @torch.no_grad()
-    def redraw_projection_matrix(self):
-        projections = self.create_projection()
-        self.projection_matrix.copy_(projections)
-
-        del projections
-
-    def forward(self, q, k, v):
-        if self.no_projection: q, k = q.softmax(dim=-1), (torch.exp(k) if self.causal else k.softmax(dim=-2))
-        else:
-            create_kernel = partial(softmax_kernel, projection_matrix=self.projection_matrix, device=q.device)
-            q, k = create_kernel(q, is_query=True), create_kernel(k, is_query=False)
-
-        attn_fn = linear_attention if not self.causal else self.causal_linear_fn
-        return attn_fn(q, k, None) if v is None else attn_fn(q, k, v)
-
-class SelfAttention(nn.Module):
-    def __init__(self, dim, causal=False, heads=8, dim_head=64, local_heads=0, local_window_size=256, nb_features=None, feature_redraw_interval=1000, generalized_attention=False, kernel_fn=nn.ReLU(), qr_uniform_q=False, dropout=0.0, no_projection=False, use_norm=False):
-        super().__init__()
-        assert dim % heads == 0
-        dim_head = default(dim_head, dim // heads)
-        inner_dim = dim_head * heads
-        self.fast_attention = FastAttention(dim_head, nb_features, causal=causal, generalized_attention=generalized_attention, kernel_fn=kernel_fn, qr_uniform_q=qr_uniform_q, no_projection=no_projection)
-        self.heads = heads
-        self.global_heads = heads - local_heads
-        self.use_norm = use_norm  # accepted (and stored) so CFNEncoderLayer's use_norm keyword does not raise a TypeError
-        self.local_attn = (LocalAttention(window_size=local_window_size, causal=causal, autopad=True, dropout=dropout, look_forward=int(not causal), rel_pos_emb_config=(dim_head, local_heads)) if local_heads > 0 else None)
-        self.to_q = nn.Linear(dim, inner_dim)
-        self.to_k = nn.Linear(dim, inner_dim)
-        self.to_v = nn.Linear(dim, inner_dim)
-        self.to_out = nn.Linear(inner_dim, dim)
-        self.dropout = nn.Dropout(dropout)
-
-    @torch.no_grad()
-    def redraw_projection_matrix(self):
-        self.fast_attention.redraw_projection_matrix()
-
-    def forward(self, x, context=None, mask=None, context_mask=None, name=None, inference=False, **kwargs):
-        _, _, _, h, gh = *x.shape, self.heads, self.global_heads
-        cross_attend = exists(context)
-
-        context = default(context, x)
-        context_mask = default(context_mask, mask) if not cross_attend else context_mask
-
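-        # What follows (continuing on the next line) projects x into per-head q, k, v; the first
-        # `global_heads` go through FastAttention -- the Performer/FAVOR+ path, where softmax_kernel
-        # maps q and k to positive random features so linear_attention can run in O(n) rather than
-        # O(n^2) -- and any remaining heads use windowed LocalAttention.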
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (self.to_q(x), self.to_k(context), self.to_v(context))) - (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v)) - - attn_outs = [] - - if not empty(q): - if exists(context_mask): v.masked_fill_(~context_mask[:, None, :, None], 0.0) - - if cross_attend: pass - else: out = self.fast_attention(q, k, v) - - attn_outs.append(out) - - if not empty(lq): - assert (not cross_attend), "not cross_attend" - - out = self.local_attn(lq, lk, lv, input_mask=mask) - attn_outs.append(out) - - return self.dropout(self.to_out(rearrange(torch.cat(attn_outs, dim=1), "b h n d -> b n (h d)"))) - -class HannWindow(torch.nn.Module): - def __init__(self, win_size): - super().__init__() - self.register_buffer('window', torch.hann_window(win_size), persistent=False) - - def forward(self): - return self.window - -class FCPE_LEGACY(nn.Module): - def __init__(self, input_channel=128, out_dims=360, n_layers=12, n_chans=512, use_siren=False, use_full=False, loss_mse_scale=10, loss_l2_regularization=False, loss_l2_regularization_scale=1, loss_grad1_mse=False, loss_grad1_mse_scale=1, f0_max=1975.5, f0_min=32.70, confidence=False, threshold=0.05, use_input_conv=True): - super().__init__() - if use_siren: raise ValueError("Siren not support") - if use_full: raise ValueError("Model full not support") - - self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10 - self.loss_l2_regularization = (loss_l2_regularization if (loss_l2_regularization is not None) else False) - self.loss_l2_regularization_scale = (loss_l2_regularization_scale if (loss_l2_regularization_scale is not None) else 1) - self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False - self.loss_grad1_mse_scale = (loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1) - self.f0_max = f0_max if (f0_max is not None) else 1975.5 - self.f0_min = f0_min if (f0_min is not None) else 32.70 - self.confidence = confidence if (confidence is not None) else False - self.threshold = threshold if (threshold is not None) else 0.05 - self.use_input_conv = use_input_conv if (use_input_conv is not None) else True - self.cent_table_b = torch.Tensor(np.linspace(self.f0_to_cent(torch.Tensor([f0_min]))[0], self.f0_to_cent(torch.Tensor([f0_max]))[0], out_dims)) - self.register_buffer("cent_table", self.cent_table_b) - self.stack = nn.Sequential(nn.Conv1d(input_channel, n_chans, 3, 1, 1), nn.GroupNorm(4, n_chans), nn.LeakyReLU(), nn.Conv1d(n_chans, n_chans, 3, 1, 1)) - self.decoder = PCmer(num_layers=n_layers, num_heads=8, dim_model=n_chans, dim_keys=n_chans, dim_values=n_chans, residual_dropout=0.1, attention_dropout=0.1) - self.norm = nn.LayerNorm(n_chans) - self.n_out = out_dims - self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out)) - - def forward(self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"): - if cdecoder == "argmax": self.cdecoder = self.cents_decoder - elif cdecoder == "local_argmax": self.cdecoder = self.cents_local_decoder - - x = torch.sigmoid(self.dense_out(self.norm(self.decoder((self.stack(mel.transpose(1, 2)).transpose(1, 2) if self.use_input_conv else mel))))) - - if not infer: - loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, self.gaussian_blurred_cent(self.f0_to_cent(gt_f0))) - if self.loss_l2_regularization: loss_all = loss_all + l2_regularization(model=self, l2_alpha=self.loss_l2_regularization_scale) - x = loss_all - - if infer: - x = self.cent_to_f0(self.cdecoder(x)) - x = (1 
+ x / 700).log() if not return_hz_f0 else x
-
-        return x
-
-    def cents_decoder(self, y, mask=True):
-        B, N, _ = y.size()
-        rtn = torch.sum(self.cent_table[None, None, :].expand(B, N, -1) * y, dim=-1, keepdim=True) / torch.sum(y, dim=-1, keepdim=True)
-
-        if mask:
-            confident = torch.max(y, dim=-1, keepdim=True)[0]
-            confident_mask = torch.ones_like(confident)
-
-            confident_mask[confident <= self.threshold] = float("-INF")
-            rtn = rtn * confident_mask
-
-        return (rtn, confident) if self.confidence else rtn
-
-    def cents_local_decoder(self, y, mask=True):
-        B, N, _ = y.size()
-
-        confident, max_index = torch.max(y, dim=-1, keepdim=True)
-        local_argmax_index = torch.clamp(torch.arange(0, 9).to(max_index.device) + (max_index - 4), 0, self.n_out - 1)
-
-        y_l = torch.gather(y, -1, local_argmax_index)
-        rtn = torch.sum(torch.gather(self.cent_table[None, None, :].expand(B, N, -1), -1, local_argmax_index) * y_l, dim=-1, keepdim=True) / torch.sum(y_l, dim=-1, keepdim=True)
-
-        if mask:
-            confident_mask = torch.ones_like(confident)
-            confident_mask[confident <= self.threshold] = float("-INF")
-
-            rtn = rtn * confident_mask
-
-        return (rtn, confident) if self.confidence else rtn
-
-    def cent_to_f0(self, cent):
-        return 10.0 * 2 ** (cent / 1200.0)
-
-    def f0_to_cent(self, f0):
-        return 1200.0 * torch.log2(f0 / 10.0)
-
-    def gaussian_blurred_cent(self, cents):
-        B, N, _ = cents.size()
-        return torch.exp(-torch.square(self.cent_table[None, None, :].expand(B, N, -1) - cents) / 1250) * ((cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))).float()
-
-class InferCFNaiveMelPE(torch.nn.Module):
-    def __init__(self, args, state_dict):
-        super().__init__()
-        self.wav2mel = spawn_wav2mel(args, device="cpu")
-        self.model = spawn_model(args)
-        self.model.load_state_dict(state_dict)
-        self.model.eval()
-        self.args_dict = dict(args)
-        self.register_buffer("tensor_device_marker", torch.tensor(1.0).float(), persistent=False)
-
-    def forward(self, wav, sr, decoder_mode = "local_argmax", threshold = 0.006, key_shifts = [0]):
-        with torch.no_grad():
-            mels = rearrange(torch.stack([self.wav2mel(wav.to(self.tensor_device_marker.device), sr, keyshift=keyshift) for keyshift in key_shifts], -1), "B T C K -> (B K) T C")
-            f0s = rearrange(self.model.infer(mels, decoder=decoder_mode, threshold=threshold), "(B K) T 1 -> B T (K 1)", K=len(key_shifts))
-
-        return f0s
-
-    def infer(self, wav, sr, decoder_mode = "local_argmax", threshold = 0.006, f0_min = None, f0_max = None, interp_uv = False, output_interp_target_length = None, return_uv = False, test_time_augmentation = False, tta_uv_penalty = 12.0, tta_key_shifts = [0, -12, 12], tta_use_origin_uv=False):
-        if test_time_augmentation:
-            assert len(tta_key_shifts) > 0
-            flag = 0
-
-            if tta_use_origin_uv:
-                if 0 not in tta_key_shifts:
-                    flag = 1
-                    tta_key_shifts.append(0)
-
-            tta_key_shifts.sort(key=lambda x: (x if x >= 0 else -x / 2))
-            f0s = self.__call__(wav, sr, decoder_mode, threshold, tta_key_shifts)
-            f0 = ensemble_f0(f0s[:, :, flag:], tta_key_shifts[flag:], tta_uv_penalty)
-
-            f0_for_uv = f0s[:, :, [0]] if tta_use_origin_uv else f0
-        else:
-            f0 = self.__call__(wav, sr, decoder_mode, threshold)
-            f0_for_uv = f0
-
-        if f0_min is None: f0_min = self.args_dict["model"]["f0_min"]
-
-        uv = (f0_for_uv < f0_min).type(f0_for_uv.dtype)
-        f0 = f0 * (1 - uv)
-
-        if interp_uv: f0 = batch_interp_with_replacement_detach(uv.squeeze(-1).bool(), f0.squeeze(-1)).unsqueeze(-1)
-        if f0_max is not None: f0[f0 > f0_max] = f0_max
-        if output_interp_target_length is not None: f0 =
torch.nn.functional.interpolate(f0.transpose(1, 2), size=int(output_interp_target_length), mode="nearest").transpose(1, 2) - - if return_uv: return f0, torch.nn.functional.interpolate(uv.transpose(1, 2), size=int(output_interp_target_length), mode="nearest").transpose(1, 2) - else: return f0 - -class FCPEInfer_LEGACY: - def __init__(self, model_path, device=None, dtype=torch.float32, providers=None, onnx=False): - if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" - self.wav2mel = Wav2Mel(device=device, dtype=dtype) - self.device = device - self.dtype = dtype - self.onnx = onnx - - if self.onnx: - sess_options = ort.SessionOptions() - sess_options.log_severity_level = 3 - - self.model = ort.InferenceSession(decrypt_model(model_path), sess_options=sess_options, providers=providers) - else: - ckpt = torch.load(model_path, map_location=torch.device(self.device)) - self.args = DotDict(ckpt["config"]) - - model = FCPE_LEGACY(input_channel=self.args.model.input_channel, out_dims=self.args.model.out_dims, n_layers=self.args.model.n_layers, n_chans=self.args.model.n_chans, use_siren=self.args.model.use_siren, use_full=self.args.model.use_full, loss_mse_scale=self.args.loss.loss_mse_scale, loss_l2_regularization=self.args.loss.loss_l2_regularization, loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, loss_grad1_mse=self.args.loss.loss_grad1_mse, loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, f0_max=self.args.model.f0_max, f0_min=self.args.model.f0_min, confidence=self.args.model.confidence) - model.to(self.device).to(self.dtype) - model.load_state_dict(ckpt["model"]) - - model.eval() - self.model = model - - @torch.no_grad() - def __call__(self, audio, sr, threshold=0.05): - if not self.onnx: self.model.threshold = threshold - else: self.wav2mel = Wav2Mel(device=self.device, dtype=self.dtype) - return (torch.as_tensor(self.model.run([self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: self.wav2mel(audio=audio[None, :], sample_rate=sr).to(self.dtype).detach().cpu().numpy(), self.model.get_inputs()[1].name: np.array(threshold, dtype=np.float32)})[0], dtype=self.dtype, device=self.device) if self.onnx else self.model(mel=self.wav2mel(audio=audio[None, :], sample_rate=sr).to(self.dtype), infer=True, return_hz_f0=True)) - -class FCPEInfer: - def __init__(self, model_path, device=None, dtype=torch.float32, providers=None, onnx=False): - if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" - self.device = device - self.dtype = dtype - self.onnx = onnx - - if self.onnx: - sess_options = ort.SessionOptions() - sess_options.log_severity_level = 3 - - self.model = ort.InferenceSession(decrypt_model(model_path), sess_options=sess_options, providers=providers) - else: - ckpt = torch.load(model_path, map_location=torch.device(device)) - ckpt["config_dict"]["model"]["conv_dropout"] = ckpt["config_dict"]["model"]["atten_dropout"] = 0.0 - self.args = DotDict(ckpt["config_dict"]) - - model = InferCFNaiveMelPE(self.args, ckpt["model"]) - model = model.to(device) - - model.eval() - self.model = model - - @torch.no_grad() - def __call__(self, audio, sr, threshold=0.05, f0_min=50, f0_max=1100, p_len=None): - if self.onnx: self.wav2mel = Wav2Mel(device=self.device, dtype=self.dtype) - return (torch.as_tensor(self.model.run([self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: self.wav2mel(audio=audio[None, :], sample_rate=sr).to(self.dtype).detach().cpu().numpy(), self.model.get_inputs()[1].name: 
np.array(threshold, dtype=np.float32)})[0], dtype=self.dtype, device=self.device) if self.onnx else self.model.infer(audio[None, :], sr, threshold=threshold, f0_min=f0_min, f0_max=f0_max, output_interp_target_length=p_len)) - -class MelModule(torch.nn.Module): - def __init__(self, sr, n_mels, n_fft, win_size, hop_length, fmin = None, fmax = None, clip_val = 1e-5, out_stft = False): - super().__init__() - if fmin is None: fmin = 0 - if fmax is None: fmax = sr / 2 - - self.target_sr = sr - self.n_mels = n_mels - self.n_fft = n_fft - self.win_size = win_size - self.hop_length = hop_length - self.fmin = fmin - self.fmax = fmax - self.clip_val = clip_val - - self.register_buffer('mel_basis', torch.tensor(librosa_mel_fn(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)).float(), persistent=False) - self.hann_window = torch.nn.ModuleDict() - self.out_stft = out_stft - - @torch.no_grad() - def __call__(self, y, key_shift = 0, speed = 1, center = False, no_cache_window = False): - n_fft = self.n_fft - win_size = self.win_size - hop_length = self.hop_length - clip_val = self.clip_val - - factor = 2 ** (key_shift / 12) - n_fft_new = int(np.round(n_fft * factor)) - win_size_new = int(np.round(win_size * factor)) - hop_length_new = int(np.round(hop_length * speed)) - - y = y.squeeze(-1) - - if torch.min(y) < -1: print('[error with torchfcpe.mel_extractor.MelModule] min ', torch.min(y)) - if torch.max(y) > 1: print('[error with torchfcpe.mel_extractor.MelModule] max ', torch.max(y)) - - key_shift_key = str(key_shift) - if not no_cache_window: - if key_shift_key in self.hann_window: hann_window = self.hann_window[key_shift_key] - else: - hann_window = HannWindow(win_size_new).to(self.mel_basis.device) - self.hann_window[key_shift_key] = hann_window - - hann_window_tensor = hann_window() - else: hann_window_tensor = torch.hann_window(win_size_new).to(self.mel_basis.device) - - pad_left = (win_size_new - hop_length_new) // 2 - pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left) - - mode = 'reflect' if pad_right < y.size(-1) else 'constant' - - spec = torch.stft(torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode).squeeze(1), n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=hann_window_tensor, center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) - spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-9) - - if key_shift != 0: - size = n_fft // 2 + 1 - resize = spec.size(1) - - if resize < size: spec = F.pad(spec, (0, 0, 0, size - resize)) - spec = spec[:, :size, :] * win_size / win_size_new - - spec = spec[:, :512, :] if self.out_stft else torch.matmul(self.mel_basis, spec) - - return dynamic_range_compression_torch(spec, clip_val=clip_val).transpose(-1, -2) - -class Wav2MelModule(torch.nn.Module): - def __init__(self, sr, n_mels, n_fft, win_size, hop_length, fmin = None, fmax = None, clip_val = 1e-5, mel_type="default"): - super().__init__() - if fmin is None: fmin = 0 - if fmax is None: fmax = sr / 2 - - self.sampling_rate = sr - self.n_mels = n_mels - self.n_fft = n_fft - self.win_size = win_size - self.hop_size = hop_length - self.fmin = fmin - self.fmax = fmax - self.clip_val = clip_val - - self.register_buffer('tensor_device_marker', torch.tensor(1.0).float(), persistent=False) - self.resample_kernel = torch.nn.ModuleDict() - - if mel_type == "default": self.mel_extractor = MelModule(sr, n_mels, n_fft, win_size, hop_length, fmin, fmax, clip_val, out_stft=False) - elif 
mel_type == "stft": self.mel_extractor = MelModule(sr, n_mels, n_fft, win_size, hop_length, fmin, fmax, clip_val, out_stft=True) - - self.mel_type = mel_type - - @torch.no_grad() - def __call__(self, audio, sample_rate, keyshift = 0, no_cache_window = False): - - if sample_rate == self.sampling_rate: audio_res = audio - else: - key_str = str(sample_rate) - - if key_str not in self.resample_kernel: - if len(self.resample_kernel) > 8: self.resample_kernel.clear() - self.resample_kernel[key_str] = Resample(sample_rate, self.sampling_rate, lowpass_filter_width=128).to(self.tensor_device_marker.device) - - audio_res = self.resample_kernel[key_str](audio.squeeze(-1)).unsqueeze(-1) - - mel = self.mel_extractor(audio_res, keyshift, no_cache_window=no_cache_window) - n_frames = int(audio.shape[1] // self.hop_size) + 1 - - if n_frames > int(mel.shape[1]): mel = torch.cat((mel, mel[:, -1:, :]), 1) - if n_frames < int(mel.shape[1]): mel = mel[:, :n_frames, :] - - return mel - -class Wav2Mel: - def __init__(self, device=None, dtype=torch.float32): - self.sample_rate = 16000 - self.hop_size = 160 - if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" - self.device = device - self.dtype = dtype - self.stft = STFT(16000, 128, 1024, 1024, 160, 0, 8000) - self.resample_kernel = {} - - def extract_nvstft(self, audio, keyshift=0, train=False): - return self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) - - def extract_mel(self, audio, sample_rate, keyshift=0, train=False): - audio = audio.to(self.dtype).to(self.device) - - if sample_rate == self.sample_rate: audio_res = audio - else: - key_str = str(sample_rate) - - if key_str not in self.resample_kernel: self.resample_kernel[key_str] = Resample(sample_rate, self.sample_rate, lowpass_filter_width=128) - - self.resample_kernel[key_str] = (self.resample_kernel[key_str].to(self.dtype).to(self.device)) - audio_res = self.resample_kernel[key_str](audio) - - mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train) - n_frames = int(audio.shape[1] // self.hop_size) + 1 - - mel = (torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel) - return mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel - - def __call__(self, audio, sample_rate, keyshift=0, train=False): - return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) - -class DotDict(dict): - def __getattr__(*args): - val = dict.get(*args) - return DotDict(val) if type(val) is dict else val - - __setattr__ = dict.__setitem__ - __delattr__ = dict.__delitem__ - -class FCPE: - def __init__(self, model_path, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sample_rate=44100, threshold=0.05, providers=None, onnx=False, legacy=False): - self.fcpe = FCPEInfer_LEGACY(model_path, device=device, dtype=dtype, providers=providers, onnx=onnx) if legacy else FCPEInfer(model_path, device=device, dtype=dtype, providers=providers, onnx=onnx) - self.hop_length = hop_length - self.f0_min = f0_min - self.f0_max = f0_max - self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") - self.threshold = threshold - self.sample_rate = sample_rate - self.dtype = dtype - self.legacy = legacy - self.name = "fcpe" - - def repeat_expand(self, content, target_len, mode = "nearest"): - ndim = content.ndim - content = (content[None, None] if ndim == 1 else content[None] if ndim == 2 else content) - - assert content.ndim == 3 - is_np = isinstance(content, np.ndarray) - - results = 
torch.nn.functional.interpolate(torch.from_numpy(content) if is_np else content, size=target_len, mode=mode) - results = results.numpy() if is_np else results - return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results - - def post_process(self, x, sample_rate, f0, pad_to): - f0 = (torch.from_numpy(f0).float().to(x.device) if isinstance(f0, np.ndarray) else f0) - f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0 - - vuv_vector = torch.zeros_like(f0) - vuv_vector[f0 > 0.0] = 1.0 - vuv_vector[f0 <= 0.0] = 0.0 - - nzindex = torch.nonzero(f0).squeeze() - f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() - vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] - - if f0.shape[0] <= 0: return np.zeros(pad_to), vuv_vector.cpu().numpy() - if f0.shape[0] == 1: return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy() - - return np.interp(np.arange(pad_to) * self.hop_length / sample_rate, self.hop_length / sample_rate * nzindex.cpu().numpy(), f0, left=f0[0], right=f0[-1]), vuv_vector.cpu().numpy() - - def compute_f0(self, wav, p_len=None): - x = torch.FloatTensor(wav).to(self.dtype).to(self.device) - p_len = x.shape[0] // self.hop_length if p_len is None else p_len - - f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold) if self.legacy else (self.fcpe(x, sr=self.sample_rate, threshold=self.threshold, f0_min=self.f0_min, f0_max=self.f0_max, p_len=p_len)) - f0 = f0[:] if f0.dim() == 1 else f0[0, :, 0] - - if torch.all(f0 == 0): return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (f0.cpu().numpy() if p_len is None else np.zeros(p_len)) - return self.post_process(x, self.sample_rate, f0, p_len)[0] \ No newline at end of file diff --git a/main/library/predictors/RMVPE.py b/main/library/predictors/RMVPE.py deleted file mode 100644 index 2cb320e6b1fbce73e812ed45e6e0a48be8ea0ff3..0000000000000000000000000000000000000000 --- a/main/library/predictors/RMVPE.py +++ /dev/null @@ -1,260 +0,0 @@ -import torch - -import numpy as np -import torch.nn as nn -import torch.nn.functional as F - -from librosa.filters import mel - -N_MELS, N_CLASS = 128, 360 - -class ConvBlockRes(nn.Module): - def __init__(self, in_channels, out_channels, momentum=0.01): - super(ConvBlockRes, self).__init__() - self.conv = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False), nn.BatchNorm2d(out_channels, momentum=momentum), nn.ReLU(), nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False), nn.BatchNorm2d(out_channels, momentum=momentum), nn.ReLU()) - - if in_channels != out_channels: - self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) - self.is_shortcut = True - else: self.is_shortcut = False - - def forward(self, x): - return self.conv(x) + self.shortcut(x) if self.is_shortcut else self.conv(x) + x - -class ResEncoderBlock(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01): - super(ResEncoderBlock, self).__init__() - self.n_blocks = n_blocks - self.conv = nn.ModuleList() - self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) - - for _ in range(n_blocks - 1): - self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) - - self.kernel_size = kernel_size - if self.kernel_size is not None: self.pool = nn.AvgPool2d(kernel_size=kernel_size) - - def forward(self, x): - for i in range(self.n_blocks): - x = 
self.conv[i](x) - - if self.kernel_size is not None: return x, self.pool(x) - else: return x - -class Encoder(nn.Module): - def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01): - super(Encoder, self).__init__() - self.n_encoders = n_encoders - self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) - - self.layers = nn.ModuleList() - self.latent_channels = [] - - for _ in range(self.n_encoders): - self.layers.append(ResEncoderBlock(in_channels, out_channels, kernel_size, n_blocks, momentum=momentum)) - self.latent_channels.append([out_channels, in_size]) - in_channels = out_channels - out_channels *= 2 - in_size //= 2 - - self.out_size = in_size - self.out_channel = out_channels - - def forward(self, x): - concat_tensors = [] - x = self.bn(x) - - for i in range(self.n_encoders): - t, x = self.layers[i](x) - concat_tensors.append(t) - - return x, concat_tensors - -class Intermediate(nn.Module): - def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): - super(Intermediate, self).__init__() - self.n_inters = n_inters - self.layers = nn.ModuleList() - self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)) - - for _ in range(self.n_inters - 1): - self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)) - - def forward(self, x): - for i in range(self.n_inters): - x = self.layers[i](x) - - return x - -class ResDecoderBlock(nn.Module): - def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): - super(ResDecoderBlock, self).__init__() - out_padding = (0, 1) if stride == (1, 2) else (1, 1) - self.n_blocks = n_blocks - - self.conv1 = nn.Sequential(nn.ConvTranspose2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(3, 3), stride=stride, padding=(1, 1), output_padding=out_padding, bias=False), nn.BatchNorm2d(out_channels, momentum=momentum), nn.ReLU()) - self.conv2 = nn.ModuleList() - self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) - - for _ in range(n_blocks - 1): - self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) - - def forward(self, x, concat_tensor): - x = torch.cat((self.conv1(x), concat_tensor), dim=1) - - for i in range(self.n_blocks): - x = self.conv2[i](x) - - return x - -class Decoder(nn.Module): - def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): - super(Decoder, self).__init__() - self.layers = nn.ModuleList() - self.n_decoders = n_decoders - - for _ in range(self.n_decoders): - out_channels = in_channels // 2 - self.layers.append(ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)) - in_channels = out_channels - - def forward(self, x, concat_tensors): - for i in range(self.n_decoders): - x = self.layers[i](x, concat_tensors[-1 - i]) - - return x - -class DeepUnet(nn.Module): - def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16): - super(DeepUnet, self).__init__() - self.encoder = Encoder(in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels) - self.intermediate = Intermediate(self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks) - self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks) - - def forward(self, x): - x, concat_tensors = self.encoder(x) - return self.decoder(self.intermediate(x), concat_tensors) - -class E2E(nn.Module): - def __init__(self, n_blocks, n_gru, 
kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16): - super(E2E, self).__init__() - self.unet = DeepUnet(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels) - self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) - self.fc = nn.Sequential(BiGRU(3 * 128, 256, n_gru), nn.Linear(512, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()) if n_gru else nn.Sequential(nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()) - - def forward(self, mel): - return self.fc(self.cnn(self.unet(mel.transpose(-1, -2).unsqueeze(1))).transpose(1, 2).flatten(-2)) - -class MelSpectrogram(torch.nn.Module): - def __init__(self, n_mel_channels, sample_rate, win_length, hop_length, n_fft=None, mel_fmin=0, mel_fmax=None, clamp=1e-5): - super().__init__() - n_fft = win_length if n_fft is None else n_fft - self.hann_window = {} - mel_basis = mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True) - mel_basis = torch.from_numpy(mel_basis).float() - self.register_buffer("mel_basis", mel_basis) - self.n_fft = win_length if n_fft is None else n_fft - self.hop_length = hop_length - self.win_length = win_length - self.sample_rate = sample_rate - self.n_mel_channels = n_mel_channels - self.clamp = clamp - - def forward(self, audio, keyshift=0, speed=1, center=True): - factor = 2 ** (keyshift / 12) - win_length_new = int(np.round(self.win_length * factor)) - keyshift_key = str(keyshift) + "_" + str(audio.device) - if keyshift_key not in self.hann_window: self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device) - - fft = torch.stft(audio, n_fft=int(np.round(self.n_fft * factor)), hop_length=int(np.round(self.hop_length * speed)), win_length=win_length_new, window=self.hann_window[keyshift_key], center=center, return_complex=True) - magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) - - if keyshift != 0: - size = self.n_fft // 2 + 1 - resize = magnitude.size(1) - if resize < size: magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) - magnitude = magnitude[:, :size, :] * self.win_length / win_length_new - - mel_output = torch.matmul(self.mel_basis, magnitude) - return torch.log(torch.clamp(mel_output, min=self.clamp)) - -class RMVPE: - def __init__(self, model_path, device=None, providers=None, onnx=False): - self.resample_kernel = {} - self.onnx = onnx - - if self.onnx: - import onnxruntime as ort - - sess_options = ort.SessionOptions() - sess_options.log_severity_level = 3 - - self.model = ort.InferenceSession(model_path, sess_options=sess_options, providers=providers) - else: - model = E2E(4, 1, (2, 2)) - ckpt = torch.load(model_path, map_location="cpu") - model.load_state_dict(ckpt) - model.eval() - self.model = model.to(device) - - self.resample_kernel = {} - self.device = device - self.mel_extractor = MelSpectrogram(N_MELS, 16000, 1024, 160, None, 30, 8000).to(device) - cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191 - self.cents_mapping = np.pad(cents_mapping, (4, 4)) - - def mel2hidden(self, mel): - with torch.no_grad(): - n_frames = mel.shape[-1] - mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect") - - hidden = self.model.run([self.model.get_outputs()[0].name], input_feed={self.model.get_inputs()[0].name: mel.cpu().numpy()})[0] if self.onnx else self.model(mel.float()) - return hidden[:, :n_frames] - - def decode(self, hidden, thred=0.03): - f0 = 10 * (2 ** (self.to_local_average_cents(hidden, thred=thred) / 1200)) - f0[f0 == 10] = 0 - 
- return f0 - - def infer_from_audio(self, audio, thred=0.03): - hidden = self.mel2hidden(self.mel_extractor(torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True)) - - return self.decode(hidden.squeeze(0).cpu().numpy() if not self.onnx else hidden[0], thred=thred) - - def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100): - hidden = self.mel2hidden(self.mel_extractor(torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True)) - - f0 = self.decode(hidden.squeeze(0).cpu().numpy() if not self.onnx else hidden[0], thred=thred) - f0[(f0 < f0_min) | (f0 > f0_max)] = 0 - - return f0 - - def to_local_average_cents(self, salience, thred=0.05): - center = np.argmax(salience, axis=1) - salience = np.pad(salience, ((0, 0), (4, 4))) - - center += 4 - todo_salience, todo_cents_mapping = [], [] - - starts = center - 4 - ends = center + 5 - - for idx in range(salience.shape[0]): - todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) - todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) - - todo_salience = np.array(todo_salience) - - devided = np.sum(todo_salience * np.array(todo_cents_mapping), 1) / np.sum(todo_salience, 1) - devided[np.max(salience, axis=1) <= thred] = 0 - - return devided - -class BiGRU(nn.Module): - def __init__(self, input_features, hidden_features, num_layers): - super(BiGRU, self).__init__() - self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True) - - def forward(self, x): - return self.gru(x)[0] \ No newline at end of file diff --git a/main/library/predictors/SWIPE.py b/main/library/predictors/SWIPE.py deleted file mode 100644 index 716e4becfe29de24318cb64711140164acd7d83f..0000000000000000000000000000000000000000 --- a/main/library/predictors/SWIPE.py +++ /dev/null @@ -1,150 +0,0 @@ -import math -import logging - -import numpy as np - -from matplotlib import mlab -from scipy import interpolate -from decimal import Decimal, ROUND_HALF_UP - -logging.getLogger("matplotlib").setLevel(logging.ERROR) - - -def swipe(x, fs, f0_floor=50, f0_ceil=1100, frame_period=10, sTHR=0.3): - plim = np.array([f0_floor, f0_ceil]) - t = np.arange(0, int(1000 * len(x) / fs / (frame_period) + 1)) * (frame_period / 1000) - - log2pc = np.arange(np.log2(plim[0]) * 96, np.log2(plim[-1]) * 96) - log2pc *= (1 / 96) - - pc = 2 ** log2pc - S = np.zeros((len(pc), len(t))) - - logWs = [round_matlab(elm) for elm in np.log2(4 * 2 * fs / plim)] - - ws = 2 ** np.arange(logWs[0], logWs[1] - 1, -1) - p0 = 4 * 2 * fs / ws - - d = 1 + log2pc - np.log2(4 * 2 * fs / ws[0]) - fERBs = erbs2hz(np.arange(hz2erbs(pc[0] / 4), hz2erbs(fs / 2), 0.1)) - - for i in range(len(ws)): - dn = round_matlab(4 * fs / p0[i]) - X, f, ti = mlab.specgram(x=np.r_[np.zeros(int(ws[i] / 2)), np.r_[x, np.zeros(int(dn + ws[i] / 2))]], NFFT=ws[i], Fs=fs, window=np.hanning(ws[i] + 2)[1:-1], noverlap=max(0, np.round(ws[i] - dn)), mode='complex') - - ti = np.r_[0, ti[:-1]] - M = np.maximum(0, interpolate.interp1d(f, np.abs(X.T), kind='cubic')(fERBs)).T - - if i == len(ws) - 1: - j = np.where(d - (i + 1) > -1)[0] - k = np.where(d[j] - (i + 1) < 0)[0] - elif i == 0: - j = np.where(d - (i + 1) < 1)[0] - k = np.where(d[j] - (i + 1) > 0)[0] - else: - j = np.where(np.abs(d - (i + 1)) < 1)[0] - k = np.arange(len(j)) - - Si = pitchStrengthAllCandidates(fERBs, np.sqrt(M), pc[j]) - Si = interpolate.interp1d(ti, Si, bounds_error=False, fill_value='nan')(t) if Si.shape[1] > 1 else np.full((len(Si), len(t)), np.nan) - - 
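-        # The weighting just below appears to implement SWIPE's multi-resolution blend: each pitch
-        # candidate is scored by the one or two analysis window sizes closest to its optimum, with
-        # mu = 1 - |d - (i + 1)| cross-fading linearly between adjacent window sizes.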
-        mu = np.ones(j.shape)
-        mu[k] = 1 - np.abs(d[j[k]] - i - 1)
-
-        S[j, :] = S[j, :] + np.tile(mu.reshape(-1, 1), (1, Si.shape[1])) * Si
-
-
-    p = np.full((S.shape[1], 1), np.nan)
-    s = np.full((S.shape[1], 1), np.nan)
-
-    for j in range(S.shape[1]):
-        s[j] = np.max(S[:, j])
-        i = np.argmax(S[:, j])
-
-        if s[j] < sTHR: continue
-
-        if i == 0: p[j] = pc[0]
-        elif i == len(pc) - 1: p[j] = pc[-1]
-        else:
-            I = np.arange(i-1, i+2)
-            tc = 1 / pc[I]
-
-            ntc = (tc / tc[1] - 1) * 2 * np.pi
-            idx = np.isfinite(S[I, j])
-
-            c = np.zeros(len(ntc))
-            c += np.nan
-
-            I_ = I[idx]
-
-            if len(I_) < 2: c[idx] = (S[I, j])[0] / ntc[0]
-            else: c[idx] = np.polyfit(ntc[idx], (S[I_, j]), 2)
-
-            pval = np.polyval(c, ((1 / (2 ** np.arange(np.log2(pc[I[0]]), np.log2(pc[I[2]]) + 1 / 12 / 64, 1 / 12 / 64))) / tc[1] - 1) * 2 * np.pi)
-
-            s[j] = np.max(pval)
-            p[j] = 2 ** (np.log2(pc[I[0]]) + (np.argmax(pval)) / 12 / 64)
-
-    p = p.flatten()
-    p[np.isnan(p)] = 0
-
-    return np.array(p, dtype=np.float32), np.array(t, dtype=np.float32)
-
-def round_matlab(n):
-    return int(Decimal(n).quantize(0, ROUND_HALF_UP))
-
-def pitchStrengthAllCandidates(f, L, pc):
-    den = np.sqrt(np.sum(L * L, axis=0))
-    den = np.where(den == 0, 2.220446049250313e-16, den)
-
-    L = L / den
-
-    S = np.zeros((len(pc), L.shape[1]))
-
-    for j in range(len(pc)):
-        S[j,:] = pitchStrengthOneCandidate(f, L, pc[j])
-
-    return S
-
-def pitchStrengthOneCandidate(f, L, pc):
-    k = np.zeros(len(f))
-    q = f / pc
-
-    for i in ([1] + sieve(int(np.fix(f[-1] / pc - 0.75)))):
-        a = np.abs(q - i)
-        p = a < 0.25
-
-        k[p] = np.cos(2 * np.pi * q[p])
-
-        v = np.logical_and((0.25 < a), (a < 0.75))
-        k[v] = k[v] + np.cos(2 * np.pi * q[v]) / 2
-
-    k *= np.sqrt(1 / f)
-    k /= np.linalg.norm(k[k>0])
-
-    return k @ L
-
-def hz2erbs(hz):
-    return 21.4 * np.log10(1 + hz / 229)
-
-def erbs2hz(erbs):
-    return (10 ** (erbs / 21.4) - 1) * 229
-
-def sieve(n):
-    primes = list(range(2, n+1))
-    num = 2
-
-    while num < math.sqrt(n):
-        i = num
-
-        while i <= n:
-            i += num
-
-            if i in primes: primes.remove(i)
-
-        for j in primes:
-            if j > num:
-                num = j
-                break
-
-    return primes
\ No newline at end of file
diff --git a/main/library/predictors/WORLD_WRAPPER.py b/main/library/predictors/WORLD_WRAPPER.py
deleted file mode 100644
index 840150816a72f414b3a89678cc8598438364f586..0000000000000000000000000000000000000000
--- a/main/library/predictors/WORLD_WRAPPER.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import torch
-import ctypes
-import platform
-
-import numpy as np
-
-
-
-class DioOption(ctypes.Structure):
-    _fields_ = [("F0Floor", ctypes.c_double), ("F0Ceil", ctypes.c_double), ("ChannelsInOctave", ctypes.c_double), ("FramePeriod", ctypes.c_double), ("Speed", ctypes.c_int), ("AllowedRange", ctypes.c_double)]
-
-class HarvestOption(ctypes.Structure):
-    _fields_ = [("F0Floor", ctypes.c_double), ("F0Ceil", ctypes.c_double), ("FramePeriod", ctypes.c_double)]
-
-class PYWORLD:
-    def __init__(self):
-        self.world_path = os.path.join("assets", "models", "predictors", "world")
-        os.makedirs(self.world_path, exist_ok=True)
-
-        model_type, suffix = (("world_64" if platform.architecture()[0] == "64bit" else "world_86"), ".dll") if platform.system() == "Windows" else ("world_linux", ".so")
-        self.world_file_path = os.path.join(self.world_path, f"{model_type}{suffix}")
-
-        if not os.path.exists(self.world_file_path):
-            model = torch.load(os.path.join("assets", "models", "predictors", "world.pth"), map_location="cpu")
-
-            with open(self.world_file_path, "wb") as w:
-                w.write(model[model_type])
-
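-        # world.pth appears to be a torch-serialized dict of raw shared-library bytes keyed by
-        # platform ("world_64" / "world_86" / "world_linux"), unpacked once above and loaded below
-        # with ctypes.CDLL. Hypothetical usage sketch (audio: 1-D float array at sample rate fs):
-        #   pw = PYWORLD()
-        #   f0, t = pw.harvest(audio, fs=16000, f0_floor=50, f0_ceil=1100)
-        #   f0 = pw.stonemask(audio, 16000, t, f0)   # StoneMask refinement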
self.world_dll = ctypes.CDLL(self.world_file_path) - - def harvest(self, x, fs, f0_floor=50, f0_ceil=1100, frame_period=10): - self.world_dll.Harvest.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.c_int, ctypes.POINTER(HarvestOption), ctypes.POINTER(ctypes.c_double), ctypes.POINTER(ctypes.c_double)] - self.world_dll.Harvest.restype = None - - self.world_dll.InitializeHarvestOption.argtypes = [ctypes.POINTER(HarvestOption)] - self.world_dll.InitializeHarvestOption.restype = None - - self.world_dll.GetSamplesForHarvest.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_double] - self.world_dll.GetSamplesForHarvest.restype = ctypes.c_int - - option = HarvestOption() - self.world_dll.InitializeHarvestOption(ctypes.byref(option)) - - option.F0Floor = f0_floor - option.F0Ceil = f0_ceil - option.FramePeriod = frame_period - - f0_length = self.world_dll.GetSamplesForHarvest(fs, len(x), option.FramePeriod) - f0 = (ctypes.c_double * f0_length)() - tpos = (ctypes.c_double * f0_length)() - - self.world_dll.Harvest((ctypes.c_double * len(x))(*x), len(x), fs, ctypes.byref(option), tpos, f0) - return np.array(f0, dtype=np.float32), np.array(tpos, dtype=np.float32) - - def dio(self, x, fs, f0_floor=50, f0_ceil=1100, channels_in_octave=2, frame_period=10, speed=1, allowed_range=0.1): - self.world_dll.Dio.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.c_int, ctypes.POINTER(DioOption), ctypes.POINTER(ctypes.c_double), ctypes.POINTER(ctypes.c_double)] - self.world_dll.Dio.restype = None - - self.world_dll.InitializeDioOption.argtypes = [ctypes.POINTER(DioOption)] - self.world_dll.InitializeDioOption.restype = None - - self.world_dll.GetSamplesForDIO.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_double] - self.world_dll.GetSamplesForDIO.restype = ctypes.c_int - - option = DioOption() - self.world_dll.InitializeDioOption(ctypes.byref(option)) - - option.F0Floor = f0_floor - option.F0Ceil = f0_ceil - option.ChannelsInOctave = channels_in_octave - option.FramePeriod = frame_period - option.Speed = speed - option.AllowedRange = allowed_range - - f0_length = self.world_dll.GetSamplesForDIO(fs, len(x), option.FramePeriod) - f0 = (ctypes.c_double * f0_length)() - tpos = (ctypes.c_double * f0_length)() - - self.world_dll.Dio((ctypes.c_double * len(x))(*x), len(x), fs, ctypes.byref(option), tpos, f0) - return np.array(f0, dtype=np.float32), np.array(tpos, dtype=np.float32) - - def stonemask(self, x, fs, tpos, f0): - self.world_dll.StoneMask.argtypes = [ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_double), ctypes.POINTER(ctypes.c_double), ctypes.c_int, ctypes.POINTER(ctypes.c_double)] - self.world_dll.StoneMask.restype = None - - out_f0 = (ctypes.c_double * len(f0))() - self.world_dll.StoneMask((ctypes.c_double * len(x))(*x), len(x), fs, (ctypes.c_double * len(tpos))(*tpos), (ctypes.c_double * len(f0))(*f0), len(f0), out_f0) - - return np.array(out_f0, dtype=np.float32) \ No newline at end of file diff --git a/main/library/predictors/pyworld/dio.py b/main/library/predictors/pyworld/dio.py deleted file mode 100644 index 696209de526aec7cd37f6600d42a27e568d857a5..0000000000000000000000000000000000000000 --- a/main/library/predictors/pyworld/dio.py +++ /dev/null @@ -1,330 +0,0 @@ -import math - -import numba as nb -import numpy as np - -from scipy import signal -from scipy.interpolate import interp1d - -def dio(x, fs, f0_floor=50, f0_ceil=1100, channels_in_octave=2, target_fs=4000, frame_period=10, allowed_range=0.1): - temporal_positions = 
np.arange(0, int(1000 * len(x) / fs / frame_period + 1)) * frame_period / 1000 - boundary_f0_list = f0_floor * (2.0 ** ((np.arange(math.ceil(np.log2(f0_ceil / f0_floor) * channels_in_octave)) + 1) / channels_in_octave)) - - y = decimate(x, int(fs / target_fs)) - y_spectrum = get_spectrum(y, target_fs, f0_floor) - raw_f0_candidate, raw_stability = get_candidate_and_stability(np.size(temporal_positions), boundary_f0_list, np.size(y), temporal_positions, target_fs, y_spectrum, f0_floor, f0_ceil) - - return np.array(fix_f0_contour(sort_candidates(raw_f0_candidate, raw_stability), frame_period, f0_floor, allowed_range), dtype=np.float32), np.array(temporal_positions, dtype=np.float32) - -def get_downsampled_signal(x, fs, target_fs): - decimation_ratio = int(fs / target_fs + 0.5) - - if fs < target_fs: - y = np.empty_like(x) - y[:] = x - actual_fs = fs - else: - y = decimate_matlab(x, decimation_ratio, n = 3) - actual_fs = fs / decimation_ratio - - y -= np.mean(y) - return y, actual_fs - -def get_spectrum(x, fs, lowest_f0): - fft_size = 2 ** math.ceil(math.log(np.size(x) + int(fs / lowest_f0 / 2 + 0.5) * 4,2)) - cutoff_in_sample = int(fs / 50 + 0.5) - - low_cut_filter = signal.windows.hann(2 * cutoff_in_sample + 3)[1:-1] - low_cut_filter = -low_cut_filter / np.sum(low_cut_filter) - low_cut_filter[cutoff_in_sample] = low_cut_filter[cutoff_in_sample] + 1 - low_cut_filter = np.r_[low_cut_filter, np.zeros(fft_size - len(low_cut_filter))] - low_cut_filter = np.r_[low_cut_filter[cutoff_in_sample:], low_cut_filter[:cutoff_in_sample]] - - return np.fft.fft(x, fft_size) * np.fft.fft(low_cut_filter, fft_size) - -def get_candidate_and_stability(number_of_frames, boundary_f0_list, y_length, temporal_positions, actual_fs, y_spectrum, f0_floor, f0_ceil): - raw_f0_candidate = np.zeros((np.size(boundary_f0_list), number_of_frames)) - raw_f0_stability = np.zeros((np.size(boundary_f0_list), number_of_frames)) - - for i in range(np.size(boundary_f0_list)): - interpolated_f0, f0_deviations = get_raw_event(boundary_f0_list[i], actual_fs, y_spectrum, y_length, temporal_positions, f0_floor, f0_ceil) - raw_f0_stability[i, :] = np.exp(-(f0_deviations / np.maximum(interpolated_f0, 0.0000001))) - raw_f0_candidate[i, :] = interpolated_f0 - - return raw_f0_candidate, raw_f0_stability - -def sort_candidates(f0_candidate_map, stability_map): - number_of_candidates, number_of_frames = f0_candidate_map.shape - sorted_index = np.argsort(-stability_map, axis=0, kind='quicksort') - f0_candidates = np.zeros((number_of_candidates, number_of_frames)) - - for i in range(number_of_frames): - f0_candidates[:, i] = f0_candidate_map[sorted_index[:number_of_candidates,i], i] - - return f0_candidates - -def get_raw_event(boundary_f0, fs, y_spectrum, y_length, temporal_positions, f0_floor, f0_ceil): - low_pass_filter = nuttall(int(fs / boundary_f0 / 2 + 0.5) * 4) - - filtered_signal = np.real(np.fft.ifft(np.fft.fft(low_pass_filter, len(y_spectrum)) * y_spectrum)) - filtered_signal = filtered_signal[low_pass_filter.argmax() + np.arange(1, y_length + 1)] - - neg_loc, neg_f0 = ZeroCrossingEngine(filtered_signal, fs) - pos_loc, pos_f0 = ZeroCrossingEngine(-filtered_signal, fs) - peak_loc, peak_f0 = ZeroCrossingEngine(np.diff(filtered_signal), fs) - dip_loc, dip_f0 = ZeroCrossingEngine(-np.diff(filtered_signal), fs) - - f0_candidate, f0_deviations = get_f0_candidates(neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0, temporal_positions) - - f0_candidate[f0_candidate > boundary_f0] = 0 - f0_candidate[f0_candidate < (boundary_f0 
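[Aside: sort_candidates above reorders each frame's candidates by stability; the same per-column reordering can be written without the loop, shown here on toy data:

import numpy as np

cand = np.array([[100., 200.], [110., 210.]])   # candidates x frames
stab = np.array([[0.2, 0.9], [0.8, 0.1]])       # matching stabilities
order = np.argsort(-stab, axis=0)               # descending stability per frame
print(np.take_along_axis(cand, order, axis=0))  # [[110. 200.] [100. 210.]]
]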
/ 2)] = 0 - f0_candidate[f0_candidate > f0_ceil] = 0 - f0_candidate[f0_candidate < f0_floor] = 0 - f0_deviations[f0_candidate == 0] = 100000 - - return f0_candidate, f0_deviations - -def get_f0_candidates(neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0, temporal_positions): - usable_channel = max(0, np.size(neg_loc) - 2) * max(0, np.size(pos_loc) - 2) * max(0, np.size(peak_loc) - 2) * max(0, np.size(dip_f0) - 2) - interpolated_f0_list = np.zeros((4, np.size(temporal_positions))) - - if usable_channel > 0: - interpolated_f0_list[0, :] = interp1d(neg_loc, neg_f0, fill_value='extrapolate')(temporal_positions) - interpolated_f0_list[1, :] = interp1d(pos_loc, pos_f0, fill_value='extrapolate')(temporal_positions) - interpolated_f0_list[2, :] = interp1d(peak_loc, peak_f0, fill_value='extrapolate')(temporal_positions) - interpolated_f0_list[3, :] = interp1d(dip_loc, dip_f0, fill_value='extrapolate')(temporal_positions) - interpolated_f0 = np.mean(interpolated_f0_list, axis=0) - f0_deviations = np.std(interpolated_f0_list, axis=0, ddof=1) - else: - interpolated_f0 = temporal_positions * 0 - f0_deviations = temporal_positions * 0 + 1000 - - return interpolated_f0, f0_deviations - -@nb.jit((nb.float64[:], nb.float64), nopython=True, cache=True) -def ZeroCrossingEngine(x, fs): - y = np.empty_like(x) - y[:-1] = x[1:] - y[-1] = x[-1] - - negative_going_points = np.arange(1, len(x) + 1) * ((y * x < 0) * (y < x)) - edge_list = negative_going_points[negative_going_points > 0] - fine_edge_list = (edge_list) - x[edge_list - 1] / (x[edge_list] - x[edge_list - 1]) - - return (fine_edge_list[:len(fine_edge_list) - 1] + fine_edge_list[1:]) / 2 / fs, fs / np.diff(fine_edge_list) - -def nuttall(N): - return np.squeeze(np.asarray(np.array([0.355768, -0.487396, 0.144232, -0.012604]) @ np.cos(np.matrix([0,1,2,3]).T @ np.asmatrix(np.arange(N) * 2 * math.pi / (N-1))))) - -def fix_f0_contour(f0_candidates, frame_period, f0_floor, allowed_range): - voice_range_minimum =int(1 / (frame_period / 1000) / f0_floor + 0.5) * 2 + 1 - f0_step2 = fix_step2(fix_step1(f0_candidates, voice_range_minimum, allowed_range), voice_range_minimum) - section_list = count_voiced_sections(f0_step2) - f0_step4 = fix_step4(fix_step3(f0_step2, f0_candidates, section_list, allowed_range), f0_candidates, section_list, allowed_range) - - return np.copy(f0_step4) - -def fix_step1(f0_candidates, voice_range_minimum, allowed_range): - f0_base = f0_candidates[0] - f0_base[ : voice_range_minimum] = 0 - f0_base[-voice_range_minimum : ] = 0 - - f0_step1 = np.copy(f0_base) - rounding_f0_base = np.array([float("{0:.6f}".format(elm)) for elm in f0_base]) - for i in np.arange(voice_range_minimum - 1, len(f0_base)): - if abs((rounding_f0_base[i] - rounding_f0_base[i-1]) / (0.000001 + rounding_f0_base[i])) > allowed_range: f0_step1[i] = 0 - - return f0_step1 - -def fix_step2(f0_step1, voice_range_minimum): - f0_step2 = np.copy(f0_step1) - for i in np.arange((voice_range_minimum - 1) / 2 , len(f0_step1) - (voice_range_minimum - 1) / 2).astype(int): - for j in np.arange( -(voice_range_minimum - 1) / 2 , (voice_range_minimum - 1) / 2 + 1).astype(int): - if f0_step1[i + j] == 0: - f0_step2[i] = 0 - break - - return f0_step2 - -def fix_step3(f0_step2, f0_candidates, section_list, allowed_range): - f0_step3 = np.empty_like(f0_step2) - f0_step3[:] = f0_step2 - - for i in np.arange(section_list.shape[0]): - limit = len(f0_step3) - 1 if i == section_list.shape[0] - 1 else section_list[i + 1, 0] + 1 - - for j in np.arange(section_list[i, 1], 
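[Aside: ZeroCrossingEngine above locates falling zero crossings with sub-sample interpolation and converts the crossing intervals to F0. A plain-NumPy port with numba stripped, checked on a synthetic tone:

import numpy as np

def falling_crossings_f0(x, fs):
    y = np.r_[x[1:], x[-1]]
    idx = np.where((y * x < 0) & (y < x))[0] + 1     # 1-based edge indices
    fine = idx - x[idx - 1] / (x[idx] - x[idx - 1])  # sub-sample positions
    return (fine[:-1] + fine[1:]) / 2 / fs, fs / np.diff(fine)

fs = 8000
x = np.sin(2 * np.pi * 100.0 * np.arange(fs) / fs)
_, f0 = falling_crossings_f0(x, fs)
print(np.unique(f0.round(2)))                        # ~[100.]: crossings 80 samples apart
]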
limit).astype(int): - f0_step3[j + 1] = select_best_f0(f0_step3[j], f0_step3[j - 1], f0_candidates[:, j + 1], allowed_range) - if f0_step3[j + 1] == 0: break - - return f0_step3 - -def fix_step4(f0_step3, f0_candidates, section_list, allowed_range): - f0_step4 = np.copy(f0_step3) - - for i in range(section_list.shape[0] - 1, -1 , -1): - limit = 1 if i == 0 else section_list[i - 1, 1] - - for j in np.arange(section_list[i, 0], limit - 1, -1).astype(int): - f0_step4[j - 1] = select_best_f0(f0_step4[j], f0_step4[j + 1], f0_candidates[:, j - 1], allowed_range) - if f0_step4[j - 1] == 0: break - - return f0_step4 - -def select_best_f0(current_f0, past_f0, candidates, allowed_range): - from sys import float_info - - reference_f0 = (current_f0 * 3 - past_f0) / 2 - minimum_error = abs(reference_f0 - candidates[0]) - best_f0 = candidates[0] - - for i in range(1, len(candidates)): - current_error = abs(reference_f0 - candidates[i]) - if current_error < minimum_error: - minimum_error = current_error - best_f0 = candidates[i] - - if abs(1 - best_f0 / (reference_f0 + float_info.epsilon)) > allowed_range: best_f0 = 0 - return best_f0 - -def count_voiced_sections(f0): - vuv = np.copy(f0) - vuv[vuv != 0] = 1 - diff_vuv = np.diff(vuv) - boundary_list = np.append(np.append([0], np.where(diff_vuv != 0)[0]), [len(vuv) - 2]) - - first_section = np.ceil(-0.5 * diff_vuv[boundary_list[1]]) - number_of_voiced_sections = np.floor((len(boundary_list) - (1 - first_section)) / 2).astype(int) - - voiced_section_list = np.zeros((number_of_voiced_sections, 2)) - for i in range(number_of_voiced_sections): - voiced_section_list[i, :] = np.array([1 + boundary_list[int((i - 1) * 2 + 1 + (1 - first_section)) + 1], boundary_list[int((i * 2) + (1 - first_section)) + 1]]) - - return voiced_section_list - -def decimate_matlab(x, q, n=None, axis=-1): - if not isinstance(q, int): raise TypeError - if n is not None and not isinstance(n, int): raise TypeError - - system = signal.dlti(*signal.cheby1(n, 0.05, 0.8 / q)) - y = signal.filtfilt(system.num, system.den, x, axis=axis, padlen=3 * (max(len(system.den), len(system.num)) - 1)) - - nd = len(y) - return y[int(q - (q * np.ceil(nd / q) - nd)) - 1::q] - -def FilterForDecimate(x,r): - a, b = np.zeros(3), np.zeros(2) - - if r==11: - a[0] = 2.450743295230728 - a[1] = -2.06794904601978 - a[2] = 0.59574774438332101 - b[0] = 0.0026822508007163792 - b[1] = 0.0080467524021491377 - elif r==12: - a[0] = 2.4981398605924205 - a[1] = -2.1368928194784025 - a[2] = 0.62187513816221485 - b[0] = 0.0021097275904709001 - b[1] = 0.0063291827714127002 - elif r==10: - a[0] = 2.3936475118069387 - a[1] = -1.9873904075111861 - a[2] = 0.5658879979027055 - b[0] = 0.0034818622251927556 - b[1] = 0.010445586675578267 - elif r==9: - a[0] = 2.3236003491759578 - a[1] = -1.8921545617463598 - a[2] = 0.53148928133729068 - b[0] = 0.0046331164041389372 - b[1] = 0.013899349212416812 - elif r==8: - a[0] = 2.2357462340187593 - a[1] = -1.7780899984041358 - a[2] = 0.49152555365968692 - b[0] = 0.0063522763407111993 - b[1] = 0.019056829022133598 - elif r==7: - a[0] = 2.1225239019534703 - a[1] = -1.6395144861046302 - a[2] = 0.44469707800587366 - b[0] = 0.0090366882681608418 - b[1] = 0.027110064804482525 - elif r==6: - a[0] = 1.9715352749512141 - a[1] = -1.4686795689225347 - a[2] = 0.3893908434965701 - b[0] = 0.013469181309343825 - b[1] = 0.040407543928031475 - elif r==5: - a[0] = 1.7610939654280557 - a[1] = -1.2554914843859768 - a[2] = 0.3237186507788215 - b[0] = 0.021334858522387423 - b[1] = 0.06400457556716227 - elif r==4: - 
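[Aside: a worked instance of select_best_f0 above. With current = 200 Hz and past = 190 Hz the extrapolated reference is (3 * 200 - 190) / 2 = 205 Hz; the closest candidate, 210 Hz, deviates by |1 - 210/205|, about 2.4%, so with allowed_range = 0.1 it is accepted rather than zeroed:

ref = (3 * 200.0 - 190.0) / 2                              # 205.0
best = min([0.0, 210.0, 400.0], key=lambda c: abs(ref - c))
assert best == 210.0 and abs(1 - best / ref) < 0.1         # kept, not zeroed
]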
a[0] = 1.4499664446880227 - a[1] = -0.98943497080950582 - a[2] = 0.24578252340690215 - b[0] = 0.036710750339322612 - b[1] = 0.11013225101796784 - elif r==3: - a[0] = 0.95039378983237421 - a[1] = -0.67429146741526791 - a[2] = 0.15412211621346475 - b[0] = 0.071221945171178636 - b[1] = 0.21366583551353591 - elif r==2: - a[0] = 0.041156734567757189 - a[1] = -0.42599112459189636 - a[2] = 0.041037215479961225 - b[0] = 0.16797464681802227 - b[1] = 0.50392394045406674 - else: a[0] = a[1] = a[2] = b[0] = b[1] = 0.0 - - w = np.zeros(3) - y_prime = np.zeros_like(x) - - for i in range(len(x)): - wt = x[i] + a[0] * w[0] + a[1] * w[1] + a[2] * w[2] - y_prime[i] = b[0] * wt + b[1] * w[0] + b[1] * w[1] + b[0] * w[2] - w[2] = w[1] - w[1] = w[0] - w[0] = wt - - return y_prime - -def decimate(x,r): - y = [] - kNFact = 9 - x_length = len(x) - - tmp1 = np.zeros(x_length + kNFact * 2) - tmp2 = np.zeros(x_length + kNFact * 2) - - for i in range(kNFact): - tmp1[i] = 2 * x[0] - x[kNFact - i] - - for i in range(kNFact, kNFact + x_length): - tmp1[i] = x[i - kNFact] - - for i in range(kNFact + x_length, 2 * kNFact + x_length): - tmp1[i] = 2 * x[-1] - x[x_length - 2 - (i - (kNFact + x_length))] - - tmp2 = FilterForDecimate(tmp1, r) - for i in range(2 * kNFact + x_length): - tmp1[i] = tmp2[2 * kNFact + x_length - i - 1] - - tmp2 = FilterForDecimate(tmp1, r) - for i in range(2 * kNFact + x_length): - tmp1[i] = tmp2[2 * kNFact + x_length - i - 1] - - nbeg = int(r - r * np.ceil(x_length / r + 1) + x_length) - - count = 0 - for i in range(nbeg, x_length + kNFact, r): - y.append(tmp1[i + kNFact - 1]) - count += 1 - - return np.array(y) \ No newline at end of file diff --git a/main/library/predictors/pyworld/harvest.py b/main/library/predictors/pyworld/harvest.py deleted file mode 100644 index b60ae68cc4adfbef75ad391d9787e6b7d53efe03..0000000000000000000000000000000000000000 --- a/main/library/predictors/pyworld/harvest.py +++ /dev/null @@ -1,420 +0,0 @@ -import copy -import math - -import numba as nb -import numpy as np -import multiprocessing as mp - -from scipy import signal -from scipy.fftpack import fft -from scipy.signal import lfilter -from scipy.interpolate import interp1d -from decimal import Decimal, ROUND_HALF_UP - - -mp.set_start_method("spawn", force=True) - -EPS = 0.00000000000000022204460492503131 - - -def harvest(x, fs, f0_floor=50, f0_ceil=1100, frame_period=10): - basic_temporal_positions = np.arange(0, int(1000 * len(x) / fs / 1 + 1)) * 1 / 1000 - channels_in_octave = 40 - f0_floor_adjusted = f0_floor * 0.9 - - y, actual_fs = CalculateDownsampledSignal(x, fs, 8000) - - f0_candidates, number_of_candidates = DetectCandidates(CalculateCandidates(len(basic_temporal_positions), np.array([f0_floor_adjusted * pow(2.0, (i + 1) / channels_in_octave) for i in range(int(np.ceil(np.log2((f0_ceil * 1.1) / f0_floor_adjusted) * channels_in_octave) + 1))]), len(y), basic_temporal_positions, actual_fs, np.fft.fft(y, int(2 ** np.ceil(np.log2(len(y) + int(fs / f0_floor_adjusted * 4 + 0.5) + 1)))), f0_floor, f0_ceil)) - f0_candidates = OverlapF0Candidates(f0_candidates, number_of_candidates) - - f0_candidates, f0_candidates_score = RefineCandidates(y, actual_fs, basic_temporal_positions, f0_candidates, f0_floor, f0_ceil) - f0_candidates, f0_candidates_score = RemoveUnreliableCandidates(f0_candidates, f0_candidates_score) - - smoothed_f0 = SmoothF0(FixF0Contour(f0_candidates, f0_candidates_score)) - temporal_positions = np.arange(0, int(1000 * len(x) / fs / frame_period + 1)) * frame_period / 1000 - - return 
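[Aside: decimate above runs FilterForDecimate forward, reverses, filters again and reverses back, i.e. zero-phase (forward-backward) IIR filtering with 9-sample edge reflection. scipy's filtfilt packages the same idea; a sketch mirroring decimate_matlab's Chebyshev design with q = 4 (the coefficients here are a stand-in, not WORLD's hard-coded tables):

import numpy as np
from scipy import signal

b, a = signal.cheby1(3, 0.05, 0.8 / 4)  # as in decimate_matlab with n=3, q=4
x = np.random.randn(1000)
y = signal.filtfilt(b, a, x)            # forward pass + reversed pass
# the two passes cancel the filter's phase delay, so y stays aligned with x
]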
np.array(smoothed_f0[np.array(np.minimum(len(smoothed_f0) - 1, round_matlab(temporal_positions * 1000)), dtype=int)], dtype=np.float32), np.array(temporal_positions, dtype=np.float32) - -def CalculateDownsampledSignal(x, fs, target_fs): - decimation_ratio = int(fs / target_fs + 0.5) - - if fs <= target_fs: - y = copy.deepcopy(x) - actual_fs = fs - else: - offset = int(np.ceil(140 / decimation_ratio) * decimation_ratio) - actual_fs = fs / decimation_ratio - y = decimate_matlab(np.append(np.append(np.ones(offset) * x[0], x), np.ones(offset) * x[-1]), decimation_ratio, n = 3)[int(offset / decimation_ratio) : int(-offset / decimation_ratio)] - - y -= np.mean(y) - return y, actual_fs - -def CalculateCandidates(number_of_frames, boundary_f0_list, y_length, temporal_positions, actual_fs, y_spectrum, f0_floor, f0_ceil): - raw_f0_candidates = np.zeros((len(boundary_f0_list), number_of_frames)) - - for i in range(len(boundary_f0_list)): - raw_f0_candidates[i, :] = CalculateRawEvent(boundary_f0_list[i], actual_fs, y_spectrum, y_length, temporal_positions, f0_floor, f0_ceil) - - return raw_f0_candidates - -def DetectCandidates(raw_f0_candidates): - number_of_channels, number_of_frames = raw_f0_candidates.shape - f0_candidates = np.zeros((int(number_of_channels / 10 + 0.5), number_of_frames)) - - number_of_candidates = 0 - threshold = 10 - - for i in np.arange(number_of_frames): - tmp = np.array(raw_f0_candidates[:, i]) - tmp[tmp > 0] = 1 - tmp[0] = 0 - tmp[-1] = 0 - tmp = np.diff(tmp) - - st = np.where(tmp == 1)[0] - ed = np.where(tmp == -1)[0] - - count = 0 - - for j in np.arange(len(st)): - dif = ed[j] - st[j] - - if dif >= threshold: - f0_candidates[count, i] = np.mean(raw_f0_candidates[st[j] + 1: ed[j] + 1, i]) - count += 1 - - number_of_candidates = max(number_of_candidates, count) - - return f0_candidates, number_of_candidates - -def OverlapF0Candidates(f0_candidates, max_candidates): - n = 3 - number_of_candidates = n * 2 + 1 - - new_f0_candidates = np.zeros((number_of_candidates * max_candidates, f0_candidates.shape[1])) - new_f0_candidates[0, :] = f0_candidates[number_of_candidates - 1, :] - - for i in np.arange(number_of_candidates): - st1 = max(-(i - n) + 1, 1) - ed1 = min(-(i - n), 0) - new_f0_candidates[np.arange(max_candidates) + i * max_candidates, st1 - 1 : new_f0_candidates.shape[1] + ed1] = f0_candidates[np.arange(max_candidates), -ed1 : new_f0_candidates.shape[1] - (st1 - 1)] - - return new_f0_candidates - -def RefineCandidates(x, fs, temporal_positions, f0_candidates, f0_floor, f0_ceil): - N, f = f0_candidates.shape - - with mp.Pool(mp.cpu_count()) as pool: - results = np.array(pool.starmap(GetRefinedF0, [(x, fs, temporal_positions[i], f0_candidates[j, i], f0_floor, f0_ceil) for j in np.arange(N) for i in np.arange(f)])) - - return np.reshape(results[:, 0], [N, f]), np.reshape(results[:, 1], [N, f]) - -@nb.jit((nb.float64[:],), nopython=True, cache=True) -def round_matlab(x): - y = x.copy() - y[x > 0] += 0.5 - y[x <= 0] -= 0.5 - - return y - -def GetRefinedF0(x, fs, current_time, current_f0, f0_floor, f0_ceil): - if current_f0 == 0: return 0, 0 - - half_window_length = np.ceil(3 * fs / current_f0 / 2) - fft_size = int(2 ** np.ceil(np.log2((half_window_length * 2 + 1)) + 1)) - index_raw = round_matlab((current_time + (np.arange(-half_window_length, half_window_length + 1) / fs)) * fs + 0.001) - common = math.pi * ((index_raw - 1) / fs - current_time) / ((2 * half_window_length + 1) / fs) - main_window = 0.42 + 0.5 * np.cos(2 * common) + 0.08 * np.cos(4 * common) - - diff_window = 
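[Aside: round_matlab above reproduces MATLAB's round-half-away-from-zero by shifting plus or minus 0.5 before the integer cast; NumPy's np.round rounds ties to even instead, which is exactly where the two disagree:

import numpy as np

x = np.array([0.5, 1.5, 2.5, -0.5])
shifted = x.copy()
shifted[x > 0] += 0.5
shifted[x <= 0] -= 0.5
print(shifted.astype(int))      # [ 1  2  3 -1]  half away from zero
print(np.round(x).astype(int))  # [ 0  2  2  0]  ties to even
]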
np.empty_like(main_window) - diff_window[0] = - main_window[1] / 2 - diff_window[-1] = main_window[-2] / 2 - diff = np.diff(main_window) - diff_window[1:-1] = - (diff[1:] + diff[:-1]) / 2 - - index = (np.maximum(1, np.minimum(len(x), index_raw)) - 1).astype(int) - spectrum = fft(x[index] * main_window, fft_size) - diff_spectrum = fft(x[index] * diff_window, fft_size) - - power_spectrum = np.abs(spectrum) ** 2 - number_of_harmonics = min(np.floor(fs / 2 / current_f0), 6) - harmonic_index = np.arange(1, number_of_harmonics + 1) - - index = round_matlab(current_f0 * fft_size / fs * harmonic_index).astype(int) - instantaneous_frequency_list = ((np.arange(fft_size) / fft_size + (spectrum.real * diff_spectrum.imag - spectrum.imag * diff_spectrum.real) / power_spectrum / 2 / math.pi) * fs)[index] - amplitude_list = np.sqrt(power_spectrum[index]) - - refined_f0 = np.sum(amplitude_list * instantaneous_frequency_list) / np.sum(amplitude_list * harmonic_index) - refined_score = 1 / (0.000000000001 + np.mean(np.abs(((instantaneous_frequency_list / harmonic_index) - current_f0) / current_f0))) - - if refined_f0 < f0_floor or refined_f0 > f0_ceil or refined_score < 2.5: refined_f0 = refined_score = 0 - - return refined_f0, refined_score - -def RemoveUnreliableCandidates(f0_candidates, f0_candidates_score): - new_f0_candidates = np.array(f0_candidates) - new_f0_candidates_score = np.array(f0_candidates_score) - - for i in np.arange(1, f0_candidates.shape[1] - 1): - for j in np.arange(0, f0_candidates.shape[0]): - reference_f0 = f0_candidates[j, i] - if reference_f0 == 0: continue - - _, min_error1 = SelectBestF0(reference_f0, f0_candidates[:, i + 1], 1) - _, min_error2 = SelectBestF0(reference_f0, f0_candidates[:, i - 1], 1) - - min_error = min([min_error1, min_error2]) - if min_error > 0.05: new_f0_candidates[j, i] = new_f0_candidates_score[j, i] = 0 - - return new_f0_candidates, new_f0_candidates_score - -@nb.jit((nb.float64, nb.float64[:], nb.float64), nopython=True, cache=True) -def SelectBestF0(reference_f0, f0_candidates, allowed_range): - best_f0 = 0 - best_error = allowed_range - - for i in np.arange(len(f0_candidates)): - tmp = np.abs(reference_f0 - f0_candidates[i]) / reference_f0 - if tmp > best_error: continue - - best_f0 = f0_candidates[i] - best_error = tmp - - return best_f0, best_error - -def CalculateRawEvent(boundary_f0, fs, y_spectrum, y_length, temporal_positions, f0_floor, f0_ceil): - filter_length_half = int(Decimal(fs / boundary_f0 * 2).quantize(0, ROUND_HALF_UP)) - - filtered_signal = np.real(np.fft.ifft(np.fft.fft(nuttall(filter_length_half * 2 + 1) * np.cos(2 * math.pi * boundary_f0 * np.arange(-filter_length_half, filter_length_half + 1) / fs), len(y_spectrum)) * y_spectrum)) - filtered_signal = filtered_signal[(filter_length_half + 1) + np.arange(y_length)] - - neg_loc, neg_f0 = ZeroCrossingEngine(filtered_signal, fs) - pos_loc, pos_f0 = ZeroCrossingEngine(-filtered_signal, fs) - - peak_loc, peak_f0 = ZeroCrossingEngine(np.diff(filtered_signal), fs) - dip_loc, dip_f0 = ZeroCrossingEngine(-np.diff(filtered_signal), fs) - - f0_candidates = GetF0Candidates(neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0, temporal_positions) - f0_candidates[f0_candidates > boundary_f0 * 1.1] = f0_candidates[f0_candidates < boundary_f0 * 0.9] = f0_candidates[f0_candidates > f0_ceil] = f0_candidates[f0_candidates < f0_floor] = 0 - - return f0_candidates - -@nb.jit((nb.float64[:], nb.float64), nopython=True, cache=True) -def ZeroCrossingEngine(x, fs): - y = np.empty_like(x) - 
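[Aside: the reliability filter above in miniature. A nonzero candidate survives only if some candidate in an adjacent frame lies within 5% of it; this simplified version skips the zero candidates the real SelectBestF0 also scans:

import numpy as np

def best_rel_error(ref, col):
    col = col[col > 0]
    return np.min(np.abs(ref - col) / ref) if col.size else np.inf

ref = 200.0                                     # candidate under test
prev_col, next_col = np.array([0., 198.]), np.array([0., 205.])
keep = min(best_rel_error(ref, prev_col), best_rel_error(ref, next_col)) <= 0.05
print(keep)                                     # True: 198 Hz is within 1%
]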
y[:-1] = x[1:] - y[-1] = x[-1] - - negative_going_points = np.arange(1, len(x) + 1) * ((y * x < 0) * (y < x)) - edge_list = negative_going_points[negative_going_points > 0] - fine_edge_list = (edge_list) - x[edge_list - 1] / (x[edge_list] - x[edge_list - 1]) - - return (fine_edge_list[:len(fine_edge_list) - 1] + fine_edge_list[1:]) / 2 / fs, fs / np.diff(fine_edge_list) - -def FixF0Contour(f0_candidates, f0_candidates_score): - return FixStep4(FixStep3(FixStep2(FixStep1(SearchF0Base(f0_candidates, f0_candidates_score), 0.008), 6), f0_candidates, 0.18, f0_candidates_score), 9) - -def SearchF0Base(f0_candidates, f0_candidates_score): - f0_base = np.zeros((f0_candidates.shape[1])) - - for i in range(len(f0_base)): - f0_base[i] = f0_candidates[np.argmax(f0_candidates_score[:, i]), i] - - return f0_base - -@nb.jit((nb.float64[:], nb.float64), nopython=True, cache=True) -def FixStep1(f0_base, allowed_range): - f0_step1 = np.empty_like(f0_base) - f0_step1[:] = f0_base - f0_step1[0] = f0_step1[1] = 0 - - for i in np.arange(2, len(f0_base)): - if f0_base[i] == 0: continue - - reference_f0 = f0_base[i - 1] * 2 - f0_base[i - 2] - if np.abs((f0_base[i] - reference_f0) / (reference_f0 + EPS)) > allowed_range and np.abs((f0_base[i] - f0_base[i - 1]) / (f0_base[i - 1] + EPS)) > allowed_range: f0_step1[i] = 0 - - return f0_step1 - -def FixStep2(f0_step1, voice_range_minimum): - f0_step2 = np.empty_like(f0_step1) - f0_step2[:] = f0_step1 - - boundary_list = GetBoundaryList(f0_step1) - - for i in np.arange(1, len(boundary_list) // 2 + 1): - if boundary_list[2 * i - 1] - boundary_list[(2 * i) - 2] < voice_range_minimum: f0_step2[boundary_list[(2 * i) - 2] : boundary_list[2 * i - 1] + 1] = 0 - - return f0_step2 - -def FixStep3(f0_step2, f0_candidates, allowed_range, f0_candidates_score): - f0_step3 = np.array(f0_step2) - boundary_list = GetBoundaryList(f0_step2) - - multi_channel_f0 = GetMultiChannelF0(f0_step2, boundary_list) - range = np.zeros((len(boundary_list) // 2, 2)) - - count = -1 - for i in np.arange(1, len(boundary_list) // 2 + 1): - tmp_range = np.zeros(2) - - extended_f0, tmp_range[1] = ExtendF0(multi_channel_f0[i - 1, :], boundary_list[i * 2 - 1], min(len(f0_step2) - 2, boundary_list[i * 2 - 1] + 100), 1, f0_candidates, allowed_range) - tmp_f0_sequence, tmp_range[0] = ExtendF0(extended_f0, boundary_list[(i * 2) - 2], max(1, boundary_list[(i * 2) - 2] - 100), -1, f0_candidates, allowed_range) - - if 2200 / np.mean(tmp_f0_sequence[int(tmp_range[0]) : int(tmp_range[1]) + 1]) < tmp_range[1] - tmp_range[0]: - count += 1 - multi_channel_f0[count, :] = tmp_f0_sequence - range[count, :] = tmp_range - - if count > -1: f0_step3 = MergeF0(multi_channel_f0[0 : count + 1, :], range[0 : count + 1, :], f0_candidates, f0_candidates_score) - return f0_step3 - -def FixStep4(f0_step3, threshold): - f0_step4 = np.empty_like(f0_step3) - f0_step4[:] = f0_step3 - - boundary_list = GetBoundaryList(f0_step3) - - for i in np.arange(1, len(boundary_list) // 2 ): - distance = boundary_list[2 * i] - boundary_list[2 * i - 1] - 1 - if distance >= threshold: continue - - tmp0 = f0_step3[boundary_list[2 * i - 1]] + 1 - c = ((f0_step3[boundary_list[2 * i]] - 1) - tmp0) / (distance + 1) - count = 1 - - for j in np.arange(boundary_list[2 * i - 1] + 1, boundary_list[2 * i]): - f0_step4[j] = tmp0 + c * count - count += 1 - - return f0_step4 - -def ExtendF0(f0, origin, last_point, shift, f0_candidates, allowed_range): - extended_f0 = np.array(f0) - tmp_f0 = extended_f0[origin] - shifted_origin = origin - - count = 0 - - if shift == 
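[Aside: FixStep4 above bridges unvoiced gaps shorter than the threshold (9 frames by default) with a linear ramp between the flanking voiced values; longer gaps stay unvoiced. A stripped-down version of that idea (the original also applies small plus/minus 1 Hz offsets at the endpoints):

import numpy as np

f0 = np.array([200., 202., 0., 0., 0., 210., 212.])
left, right = 1, 5                              # voiced frames around the gap
f0[left + 1:right] = np.linspace(f0[left], f0[right], right - left + 1)[1:-1]
print(f0)                                       # gap now ramps 204 -> 208
]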
1: last_point += 1 - elif shift == -1: last_point -= 1 - - for i in np.arange(origin, last_point, shift): - extended_f0[i + shift], _ = SelectBestF0(tmp_f0, f0_candidates[:, i + shift], allowed_range) - - if extended_f0[i + shift] != 0: - tmp_f0 = extended_f0[i + shift] - count = 0 - shifted_origin = i + shift - else: count += + 1 - - if count == 4: break - - return extended_f0, shifted_origin - -def GetMultiChannelF0(f0, boundary_list): - multi_channel_f0 = np.zeros((len(boundary_list) // 2, len(f0))) - - for i in np.arange(1, len(boundary_list) // 2 + 1): - multi_channel_f0[i - 1, boundary_list[(i * 2) - 2] : boundary_list[i * 2 - 1] + 1] = f0[boundary_list[(i * 2) - 2] : boundary_list[(i * 2) - 1] + 1] - - return multi_channel_f0 - -def MergeF0(multi_channel_f0, range_, f0_candidates, f0_candidates_score): - sorted_order = np.argsort(range_[:, 0], axis=0, kind='quicksort') - f0 = multi_channel_f0[sorted_order[0], :] - range_ = range_.astype(int) - - for i in np.arange(1, multi_channel_f0.shape[0]): - if range_[sorted_order[i], 0] - range_[sorted_order[0], 1] > 0: - f0[range_[sorted_order[i], 0] : range_[sorted_order[i], 1] + 1] = multi_channel_f0[sorted_order[i], range_[sorted_order[i], 0] : range_[sorted_order[i], 1] + 1] - range_[sorted_order[0], 0] = range_[sorted_order[i], 0] - range_[sorted_order[0], 1] = range_[sorted_order[i], 1] - else: f0, range_[sorted_order[0], 1] = MergeF0Sub(f0, range_[sorted_order[0], 0], range_[sorted_order[0], 1], multi_channel_f0[sorted_order[i], :], range_[sorted_order[i], 0], range_[sorted_order[i], 1], f0_candidates, f0_candidates_score) - - return f0 - -def MergeF0Sub(f0_1, st1, ed1, f0_2, st2, ed2, f0_candidates, f0_candidates_score): - merged_f0 = copy.deepcopy(f0_1) - st1, st2, ed1, ed2 = int(st1), int(st2), int(ed1), int(ed2) - - if st1 <= st2 and ed1 >= ed2: - new_ed = ed1 - return merged_f0, new_ed - - new_ed = ed2 - score1, score2 = 0, 0 - - for i in np.arange(st2, ed1 + 1): - score1 = score1 + SerachScore(f0_1[i], f0_candidates[:, i], f0_candidates_score[:, i]) - score2 = score2 + SerachScore(f0_2[i], f0_candidates[:, i], f0_candidates_score[:, i]) - - if score1 > score2: merged_f0[ed1 : ed2 + 1] = f0_2[ed1 : ed2 + 1] - else: merged_f0[st2 : ed2 + 1] = f0_2[st2 : ed2 + 1] - - return merged_f0, new_ed - -def SerachScore(f0, f0_candidates, f0_candidates_score): - score = 0 - - for i in range(f0_candidates.shape[0]): - if f0 == f0_candidates[i] and score < f0_candidates_score[i]: score = f0_candidates_score[i] - - return score - -def GetF0Candidates(neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0, temporal_positions): - interpolated_f0_list = np.zeros((4, np.size(temporal_positions))) - - if max(0, np.size(neg_loc) - 2) * max(0, np.size(pos_loc) - 2) * max(0, np.size(peak_loc) - 2) * max(0, np.size(dip_f0) - 2) > 0: - interpolated_f0_list[0, :] = interp1d(neg_loc, neg_f0, fill_value='extrapolate')(temporal_positions) - interpolated_f0_list[1, :] = interp1d(pos_loc, pos_f0, fill_value='extrapolate')(temporal_positions) - - interpolated_f0_list[2, :] = interp1d(peak_loc, peak_f0, fill_value='extrapolate')(temporal_positions) - interpolated_f0_list[3, :] = interp1d(dip_loc, dip_f0, fill_value='extrapolate')(temporal_positions) - - interpolated_f0 = np.mean(interpolated_f0_list, axis=0) - else: interpolated_f0 = temporal_positions * 0 - - return interpolated_f0 - -def SmoothF0(f0): - smoothed_f0 = np.append(np.append(np.zeros(300), f0), np.zeros(300)) - boundary_list = GetBoundaryList(smoothed_f0) - - for i in np.arange(1, 
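[Aside: GetF0Candidates above averages four event-rate contours (negative and positive crossings, peaks, dips) after interpolating each onto the frame grid. The interpolation step in isolation, using the same extrapolating interp1d call:

import numpy as np
from scipy.interpolate import interp1d

frames = np.arange(0.0, 0.05, 0.01)           # frame grid in seconds
loc = np.array([0.005, 0.015, 0.025, 0.035])  # event times from one detector
f0 = np.array([99.0, 101.0, 100.0, 100.0])    # per-interval estimates
print(interp1d(loc, f0, fill_value='extrapolate')(frames))  # defined on every frame
]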
len(boundary_list) // 2 + 1): - tmp_f0_contour = FilterF0(GetMultiChannelF0(smoothed_f0, boundary_list)[i - 1, :], boundary_list[i * 2 - 2], boundary_list[i * 2 - 1], np.array([0.0078202080334971724, 0.015640416066994345, 0.007822412033497172]), np.array([1.0, -1.7347257688092754, 0.76600660094326412])) - smoothed_f0[boundary_list[i * 2 - 2] : boundary_list[i * 2 - 1] + 1] = tmp_f0_contour[boundary_list[i * 2 - 2] : boundary_list[i * 2 - 1] + 1] - - return smoothed_f0[300 : len(smoothed_f0) - 300] - -def FilterF0(f0_contour, st, ed, b, a): - smoothed_f0 = copy.deepcopy(f0_contour) - smoothed_f0[0 : st] = smoothed_f0[st] - smoothed_f0[ed + 1: ] = smoothed_f0[ed] - smoothed_f0 = lfilter(b, a, lfilter(b, a, smoothed_f0, axis=0)[-1 : : -1], axis=0)[-1 : : -1] - smoothed_f0[0 : st] = smoothed_f0[ed + 1: ] = 0 - - return smoothed_f0 - -def nuttall(N): - return np.squeeze(np.asarray(np.array([0.355768, -0.487396, 0.144232, -0.012604]) @ np.cos(np.matrix([0,1,2,3]).T @ np.asmatrix(np.arange(N) * 2 * math.pi / (N-1))))) - -def GetBoundaryList(f0): - vuv = np.array(f0) - vuv[vuv != 0] = 1 - vuv[0] = vuv[-1] = 0 - - boundary_list = np.where(np.diff(vuv) != 0)[0] - boundary_list[0:: 2] += 1 - - return boundary_list - -def decimate_matlab(x, q, n=None, axis=-1): - if not isinstance(q, int): raise TypeError - if n is not None and not isinstance(n, int): raise TypeError - - system = signal.dlti(*signal.cheby1(n, 0.05, 0.8 / q)) - y = signal.filtfilt(system.num, system.den, x, axis=axis, padlen=3 * (max(len(system.den), len(system.num)) - 1)) - nd = len(y) - - return y[int(q - (q * np.ceil(nd / q) - nd)) - 1::q] \ No newline at end of file diff --git a/main/library/predictors/pyworld/stonemask.py b/main/library/predictors/pyworld/stonemask.py deleted file mode 100644 index a37a634c0a77a1aa4f0bfc411978284b62417ea5..0000000000000000000000000000000000000000 --- a/main/library/predictors/pyworld/stonemask.py +++ /dev/null @@ -1,60 +0,0 @@ -import math - -import numba as nb -import numpy as np - -def stonemask(x, fs, temporal_positions, f0): - refined_f0 = np.copy(f0) - - for i in range(len(temporal_positions)): - if f0[i] != 0: - refined_f0[i] = get_refined_f0(x, fs, temporal_positions[i], f0[i]) - if abs(refined_f0[i] - f0[i]) / f0[i] > 0.2: refined_f0[i] = f0[i] - - return np.array(refined_f0, dtype=np.float32) - -def get_refined_f0(x, fs, current_time, current_f0): - f0_initial = current_f0 - half_window_length = np.ceil(3 * fs / f0_initial / 2) - window_length_in_time = (2 * half_window_length + 1) / fs - - base_time = np.arange(-half_window_length, half_window_length + 1) / fs - fft_size = 2 ** math.ceil(math.log((half_window_length * 2 + 1), 2) + 1) - - base_time = np.array([float("{0:.4f}".format(elm)) for elm in base_time]) - index_raw = round_matlab((current_time + base_time) * fs) - - window_time = ((index_raw - 1) / fs) - current_time - main_window = 0.42 + 0.5 * np.cos(2 * math.pi * window_time / window_length_in_time) + 0.08 * np.cos(4 * math.pi * window_time / window_length_in_time) - - index = np.array(np.maximum(1, np.minimum(len(x), index_raw)), dtype=int) - spectrum = np.fft.fft(x[index - 1] * main_window, fft_size) - - diff_spectrum = np.fft.fft(x[index - 1] * (-(np.diff(np.r_[0, main_window]) + np.diff(np.r_[main_window, 0])) / 2), fft_size) - power_spectrum = np.abs(spectrum) ** 2 - - from sys import float_info - - power_spectrum[power_spectrum == 0] = float_info.epsilon - instantaneous_frequency = (np.arange(fft_size) / fft_size * fs) + (np.real(spectrum) * np.imag(diff_spectrum) - 
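[Aside: GetBoundaryList above reduces an F0 track to (start, end) index pairs of voiced runs via a diff over the voiced/unvoiced mask; traced on a small track:

import numpy as np

f0 = np.array([0., 0., 150., 151., 152., 0., 160., 0.])
vuv = (f0 != 0).astype(float)
vuv[0] = vuv[-1] = 0
bounds = np.where(np.diff(vuv) != 0)[0]
bounds[0::2] += 1            # diff flags the sample *before* each onset
print(bounds)                # [2 4 6 6]: voiced runs are [2..4] and [6..6]
]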
np.imag(spectrum) * np.real(diff_spectrum)) / power_spectrum * fs / 2 / math.pi - - trim_index = np.array([1, 2]) - index_list_trim = np.array(round_matlab(f0_initial * fft_size / fs * trim_index) + 1, int) - - amp_list = np.sqrt(power_spectrum[index_list_trim - 1]) - f0_initial = np.sum(amp_list * instantaneous_frequency[index_list_trim - 1]) / np.sum(amp_list * trim_index) - - if f0_initial < 0: return 0 - - trim_index = np.array([1, 2, 3, 4, 5, 6]) - index_list_trim = np.array(round_matlab(f0_initial * fft_size / fs * trim_index) + 1, int) - amp_list = np.sqrt(power_spectrum[index_list_trim - 1]) - - return np.sum(amp_list * instantaneous_frequency[index_list_trim - 1]) / np.sum(amp_list * trim_index) - -@nb.jit((nb.float64[:],), nopython=True, cache=True) -def round_matlab(x: np.ndarray) -> np.ndarray: - y = x.copy() - y[x > 0] += 0.5 - y[x <= 0] -= 0.5 - return y \ No newline at end of file diff --git a/main/library/utils.py b/main/library/utils.py deleted file mode 100644 index 79d0ea311ed354dc16a44d4f1c12200f26a17a34..0000000000000000000000000000000000000000 --- a/main/library/utils.py +++ /dev/null @@ -1,130 +0,0 @@ -import os -import re -import sys -import codecs -import librosa -import logging - -import numpy as np -import soundfile as sf - -from pydub import AudioSegment, silence - -sys.path.append(os.getcwd()) - -from main.tools import huggingface -from main.configs.config import Config - -for l in ["httpx", "httpcore"]: - logging.getLogger(l).setLevel(logging.ERROR) - -translations = Config().translations - - -def check_predictors(method, f0_onnx=False): - if f0_onnx and method not in ["harvestw", "diow"]: method += "-onnx" - - def download(predictors): - if not os.path.exists(os.path.join("assets", "models", "predictors", predictors)): huggingface.HF_download_file(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cerqvpgbef/", "rot13") + predictors, os.path.join("assets", "models", "predictors", predictors)) - - model_dict = {**dict.fromkeys(["rmvpe", "rmvpe-legacy"], "rmvpe.pt"), **dict.fromkeys(["rmvpe-onnx", "rmvpe-legacy-onnx"], "rmvpe.onnx"), **dict.fromkeys(["fcpe"], "fcpe.pt"), **dict.fromkeys(["fcpe-legacy"], "fcpe_legacy.pt"), **dict.fromkeys(["fcpe-onnx"], "fcpe.onnx"), **dict.fromkeys(["fcpe-legacy-onnx"], "fcpe_legacy.onnx"), **dict.fromkeys(["crepe-full", "mangio-crepe-full"], "crepe_full.pth"), **dict.fromkeys(["crepe-full-onnx", "mangio-crepe-full-onnx"], "crepe_full.onnx"), **dict.fromkeys(["crepe-large", "mangio-crepe-large"], "crepe_large.pth"), **dict.fromkeys(["crepe-large-onnx", "mangio-crepe-large-onnx"], "crepe_large.onnx"), **dict.fromkeys(["crepe-medium", "mangio-crepe-medium"], "crepe_medium.pth"), **dict.fromkeys(["crepe-medium-onnx", "mangio-crepe-medium-onnx"], "crepe_medium.onnx"), **dict.fromkeys(["crepe-small", "mangio-crepe-small"], "crepe_small.pth"), **dict.fromkeys(["crepe-small-onnx", "mangio-crepe-small-onnx"], "crepe_small.onnx"), **dict.fromkeys(["crepe-tiny", "mangio-crepe-tiny"], "crepe_tiny.pth"), **dict.fromkeys(["crepe-tiny-onnx", "mangio-crepe-tiny-onnx"], "crepe_tiny.onnx"), **dict.fromkeys(["harvestw", "diow"], "world.pth")} - - if "hybrid" in method: - methods_str = re.search("hybrid\[(.+)\]", method) - if methods_str: methods = [method.strip() for method in methods_str.group(1).split("+")] - - for method in methods: - if method in model_dict: download(model_dict[method]) - elif method in model_dict: download(model_dict[method]) - -def check_embedders(hubert, embedders_onnx=False): - if hubert in 
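[Aside: the download helpers above keep their Hugging Face base URLs ROT13-encoded and decode them at call time; the decode is a one-liner:

import codecs

print(codecs.decode("uggcf://uhttvatsnpr.pb/", "rot13"))  # https://huggingface.co/
]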
["contentvec_base", "hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base", "portuguese_hubert_base"]: - hubert += ".onnx" if embedders_onnx else ".pt" - - model_path = os.path.join("assets", "models", "embedders", hubert) - if not os.path.exists(model_path): huggingface.HF_download_file(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/rzorqqref/", "rot13") + ("onnx/" if embedders_onnx else "fairseq/") + hubert, model_path) - -def load_audio(logger, file, sample_rate=16000, formant_shifting=False, formant_qfrency=0.8, formant_timbre=0.8): - try: - file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - if not os.path.isfile(file): raise FileNotFoundError(translations["not_found"].format(name=file)) - - try: - logger.debug(translations['read_sf']) - audio, sr = sf.read(file) - except: - logger.debug(translations['read_librosa']) - audio, sr = librosa.load(file, sr=None) - - if len(audio.shape) > 1: audio = librosa.to_mono(audio.T) - if sr != sample_rate: audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate, res_type="soxr_vhq") - - if formant_shifting: - from main.library.algorithm.stftpitchshift import StftPitchShift - - pitchshifter = StftPitchShift(1024, 32, sample_rate) - audio = pitchshifter.shiftpitch(audio, factors=1, quefrency=formant_qfrency * 1e-3, distortion=formant_timbre) - except Exception as e: - raise RuntimeError(f"{translations['errors_loading_audio']}: {e}") - - return audio.flatten() - -def process_audio(logger, file_path, output_path): - try: - song = pydub_convert(pydub_load(file_path)) - cut_files, time_stamps = [], [] - - for i, (start_i, end_i) in enumerate(silence.detect_nonsilent(song, min_silence_len=250, silence_thresh=-60)): - chunk = song[start_i:end_i] - - chunk_file_path = os.path.join(output_path, f"chunk{i}.wav") - logger.debug(f"{chunk_file_path}: {len(chunk)}") - - if os.path.exists(chunk_file_path): os.remove(chunk_file_path) - chunk.export(chunk_file_path, format="wav") - - cut_files.append(chunk_file_path) - time_stamps.append((start_i, end_i)) - - logger.info(f"{translations['split_total']}: {len(cut_files)}") - return cut_files, time_stamps - except Exception as e: - raise RuntimeError(f"{translations['process_audio_error']}: {e}") - -def merge_audio(files_list, time_stamps, original_file_path, output_path, format): - try: - def extract_number(filename): - match = re.search(r'_(\d+)', filename) - return int(match.group(1)) if match else 0 - - total_duration = len(pydub_load(original_file_path)) - - combined = AudioSegment.empty() - current_position = 0 - - for file, (start_i, end_i) in zip(sorted(files_list, key=extract_number), time_stamps): - if start_i > current_position: combined += AudioSegment.silent(duration=start_i - current_position) - - combined += pydub_load(file) - current_position = end_i - - if current_position < total_duration: combined += AudioSegment.silent(duration=total_duration - current_position) - combined.export(output_path, format=format) - - return output_path - except Exception as e: - raise RuntimeError(f"{translations['merge_error']}: {e}") - -def pydub_convert(audio): - samples = np.frombuffer(audio.raw_data, dtype=np.int16) - if samples.dtype != np.int16: samples = (samples * 32767).astype(np.int16) - - return AudioSegment(samples.tobytes(), frame_rate=audio.frame_rate, sample_width=samples.dtype.itemsize, channels=audio.channels) - -def pydub_load(input_path): - if input_path.endswith(".wav"): audio = AudioSegment.from_wav(input_path) 
- elif input_path.endswith(".mp3"): audio = AudioSegment.from_mp3(input_path) - elif input_path.endswith(".ogg"): audio = AudioSegment.from_ogg(input_path) - else: audio = AudioSegment.from_file(input_path) - - return audio \ No newline at end of file diff --git a/main/library/uvr5_separator/common_separator.py b/main/library/uvr5_separator/common_separator.py deleted file mode 100644 index 109d76ebc30b41b6a504777b5d31113dea6388fc..0000000000000000000000000000000000000000 --- a/main/library/uvr5_separator/common_separator.py +++ /dev/null @@ -1,250 +0,0 @@ -import os -import gc -import sys -import torch -import librosa - -import numpy as np -import soundfile as sf - -from pydub import AudioSegment - -sys.path.append(os.getcwd()) - -from .spec_utils import normalize -from main.configs.config import Config - -translations = Config().translations - -class CommonSeparator: - ALL_STEMS = "All Stems" - VOCAL_STEM = "Vocals" - INST_STEM = "Instrumental" - OTHER_STEM = "Other" - BASS_STEM = "Bass" - DRUM_STEM = "Drums" - GUITAR_STEM = "Guitar" - PIANO_STEM = "Piano" - SYNTH_STEM = "Synthesizer" - STRINGS_STEM = "Strings" - WOODWINDS_STEM = "Woodwinds" - BRASS_STEM = "Brass" - WIND_INST_STEM = "Wind Inst" - NO_OTHER_STEM = "No Other" - NO_BASS_STEM = "No Bass" - NO_DRUM_STEM = "No Drums" - NO_GUITAR_STEM = "No Guitar" - NO_PIANO_STEM = "No Piano" - NO_SYNTH_STEM = "No Synthesizer" - NO_STRINGS_STEM = "No Strings" - NO_WOODWINDS_STEM = "No Woodwinds" - NO_WIND_INST_STEM = "No Wind Inst" - NO_BRASS_STEM = "No Brass" - PRIMARY_STEM = "Primary Stem" - SECONDARY_STEM = "Secondary Stem" - LEAD_VOCAL_STEM = "lead_only" - BV_VOCAL_STEM = "backing_only" - LEAD_VOCAL_STEM_I = "with_lead_vocals" - BV_VOCAL_STEM_I = "with_backing_vocals" - LEAD_VOCAL_STEM_LABEL = "Lead Vocals" - BV_VOCAL_STEM_LABEL = "Backing Vocals" - NO_STEM = "No " - STEM_PAIR_MAPPER = {VOCAL_STEM: INST_STEM, INST_STEM: VOCAL_STEM, LEAD_VOCAL_STEM: BV_VOCAL_STEM, BV_VOCAL_STEM: LEAD_VOCAL_STEM, PRIMARY_STEM: SECONDARY_STEM} - NON_ACCOM_STEMS = (VOCAL_STEM, OTHER_STEM, BASS_STEM, DRUM_STEM, GUITAR_STEM, PIANO_STEM, SYNTH_STEM, STRINGS_STEM, WOODWINDS_STEM, BRASS_STEM, WIND_INST_STEM) - - def __init__(self, config): - self.logger = config.get("logger") - self.log_level = config.get("log_level") - self.torch_device = config.get("torch_device") - self.torch_device_cpu = config.get("torch_device_cpu") - self.torch_device_mps = config.get("torch_device_mps") - self.onnx_execution_provider = config.get("onnx_execution_provider") - self.model_name = config.get("model_name") - self.model_path = config.get("model_path") - self.model_data = config.get("model_data") - self.output_dir = config.get("output_dir") - self.output_format = config.get("output_format") - self.output_bitrate = config.get("output_bitrate") - self.normalization_threshold = config.get("normalization_threshold") - self.enable_denoise = config.get("enable_denoise") - self.output_single_stem = config.get("output_single_stem") - self.invert_using_spec = config.get("invert_using_spec") - self.sample_rate = config.get("sample_rate") - self.primary_stem_name = None - self.secondary_stem_name = None - - if "training" in self.model_data and "instruments" in self.model_data["training"]: - instruments = self.model_data["training"]["instruments"] - if instruments: - self.primary_stem_name = instruments[0] - self.secondary_stem_name = instruments[1] if len(instruments) > 1 else self.secondary_stem(self.primary_stem_name) - - if self.primary_stem_name is None: - self.primary_stem_name = 
self.model_data.get("primary_stem", "Vocals") - self.secondary_stem_name = self.secondary_stem(self.primary_stem_name) - - self.is_karaoke = self.model_data.get("is_karaoke", False) - self.is_bv_model = self.model_data.get("is_bv_model", False) - self.bv_model_rebalance = self.model_data.get("is_bv_model_rebalanced", 0) - self.logger.debug(translations["info"].format(model_name=self.model_name, model_path=self.model_path)) - self.logger.debug(translations["info_2"].format(output_dir=self.output_dir, output_format=self.output_format)) - self.logger.debug(translations["info_3"].format(normalization_threshold=self.normalization_threshold)) - self.logger.debug(translations["info_4"].format(enable_denoise=self.enable_denoise, output_single_stem=self.output_single_stem)) - self.logger.debug(translations["info_5"].format(invert_using_spec=self.invert_using_spec, sample_rate=self.sample_rate)) - self.logger.debug(translations["info_6"].format(primary_stem_name=self.primary_stem_name, secondary_stem_name=self.secondary_stem_name)) - self.logger.debug(translations["info_7"].format(is_karaoke=self.is_karaoke, is_bv_model=self.is_bv_model, bv_model_rebalance=self.bv_model_rebalance)) - self.audio_file_path = None - self.audio_file_base = None - self.primary_source = None - self.secondary_source = None - self.primary_stem_output_path = None - self.secondary_stem_output_path = None - self.cached_sources_map = {} - - def secondary_stem(self, primary_stem): - primary_stem = primary_stem if primary_stem else self.NO_STEM - return self.STEM_PAIR_MAPPER[primary_stem] if primary_stem in self.STEM_PAIR_MAPPER else primary_stem.replace(self.NO_STEM, "") if self.NO_STEM in primary_stem else f"{self.NO_STEM}{primary_stem}" - - def separate(self, audio_file_path): - pass - - def final_process(self, stem_path, source, stem_name): - self.logger.debug(translations["success_process"].format(stem_name=stem_name)) - self.write_audio(stem_path, source) - return {stem_name: source} - - def cached_sources_clear(self): - self.cached_sources_map = {} - - def cached_source_callback(self, model_architecture, model_name=None): - model, sources = None, None - mapper = self.cached_sources_map[model_architecture] - for key, value in mapper.items(): - if model_name in key: - model = key - sources = value - - return model, sources - - def cached_model_source_holder(self, model_architecture, sources, model_name=None): - self.cached_sources_map[model_architecture] = {**self.cached_sources_map.get(model_architecture, {}), **{model_name: sources}} - - def prepare_mix(self, mix): - audio_path = mix - if not isinstance(mix, np.ndarray): - self.logger.debug(f"{translations['load_audio']}: {mix}") - mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate) - self.logger.debug(translations["load_audio_success"].format(sr=sr, shape=mix.shape)) - else: - self.logger.debug(translations["convert_mix"]) - mix = mix.T - self.logger.debug(translations["convert_shape"].format(shape=mix.shape)) - - if isinstance(audio_path, str): - if not np.any(mix): - error_msg = translations["audio_not_valid"].format(audio_path=audio_path) - self.logger.error(error_msg) - raise ValueError(error_msg) - else: self.logger.debug(translations["audio_valid"]) - - if mix.ndim == 1: - self.logger.debug(translations["mix_single"]) - mix = np.asfortranarray([mix, mix]) - self.logger.debug(translations["convert_mix_audio"]) - - self.logger.debug(translations["mix_success_2"]) - return mix - - def write_audio(self, stem_path, stem_source): - duration_seconds = 
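[Aside: the stem-pairing rule in secondary_stem above, in brief: known pairs come from STEM_PAIR_MAPPER, and any other stem just toggles a "No " prefix. A condensed equivalent with a trimmed-down mapper:

def secondary(primary, mapper={"Vocals": "Instrumental", "Instrumental": "Vocals"}):
    if primary in mapper: return mapper[primary]
    return primary[3:] if primary.startswith("No ") else f"No {primary}"

assert secondary("Vocals") == "Instrumental"
assert secondary("Drums") == "No Drums"
assert secondary("No Bass") == "Bass"
]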
librosa.get_duration(filename=self.audio_file_path) - duration_hours = duration_seconds / 3600 - self.logger.info(translations["duration"].format(duration_hours=f"{duration_hours:.2f}", duration_seconds=f"{duration_seconds:.2f}")) - - if duration_hours >= 1: - self.logger.debug(translations["write"].format(name="soundfile")) - self.write_audio_soundfile(stem_path, stem_source) - else: - self.logger.info(translations["write"].format(name="pydub")) - self.write_audio_pydub(stem_path, stem_source) - - def write_audio_pydub(self, stem_path, stem_source): - self.logger.debug(f"{translations['write_audio'].format(name='write_audio_pydub')} {stem_path}") - stem_source = normalize(wave=stem_source, max_peak=self.normalization_threshold) - - if np.max(np.abs(stem_source)) < 1e-6: - self.logger.warning(translations["original_not_valid"]) - return - - if self.output_dir: - os.makedirs(self.output_dir, exist_ok=True) - stem_path = os.path.join(self.output_dir, stem_path) - - self.logger.debug(f"{translations['shape_audio']}: {stem_source.shape}") - self.logger.debug(f"{translations['convert_data']}: {stem_source.dtype}") - - if stem_source.dtype != np.int16: - stem_source = (stem_source * 32767).astype(np.int16) - self.logger.debug(translations["original_source_to_int16"]) - - stem_source_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16) - stem_source_interleaved[0::2] = stem_source[:, 0] - stem_source_interleaved[1::2] = stem_source[:, 1] - self.logger.debug(f"{translations['shape_audio_2']}: {stem_source_interleaved.shape}") - - try: - audio_segment = AudioSegment(stem_source_interleaved.tobytes(), frame_rate=self.sample_rate, sample_width=stem_source.dtype.itemsize, channels=2) - self.logger.debug(translations["create_audiosegment"]) - except (IOError, ValueError) as e: - self.logger.error(f"{translations['create_audiosegment_error']}: {e}") - return - - file_format = stem_path.lower().split(".")[-1] - - if file_format == "m4a": file_format = "mp4" - elif file_format == "mka": file_format = "matroska" - - try: - audio_segment.export(stem_path, format=file_format, bitrate="320k" if file_format == "mp3" and self.output_bitrate is None else self.output_bitrate) - self.logger.debug(f"{translations['export_success']} {stem_path}") - except (IOError, ValueError) as e: - self.logger.error(f"{translations['export_error']}: {e}") - - def write_audio_soundfile(self, stem_path, stem_source): - self.logger.debug(f"{translations['write_audio'].format(name='write_audio_soundfile')}: {stem_path}") - - if stem_source.shape[1] == 2: - if stem_source.flags["F_CONTIGUOUS"]: stem_source = np.ascontiguousarray(stem_source) - else: - stereo_interleaved = np.empty((2 * stem_source.shape[0],), dtype=np.int16) - stereo_interleaved[0::2] = stem_source[:, 0] - stereo_interleaved[1::2] = stem_source[:, 1] - stem_source = stereo_interleaved - - self.logger.debug(f"{translations['shape_audio_2']}: {stem_source.shape}") - - try: - sf.write(stem_path, stem_source, self.sample_rate) - self.logger.debug(f"{translations['export_success']} {stem_path}") - except Exception as e: - self.logger.error(f"{translations['export_error']}: {e}") - - def clear_gpu_cache(self): - self.logger.debug(translations["clean"]) - gc.collect() - - if self.torch_device == torch.device("mps"): - self.logger.debug(translations["clean_cache"].format(name="MPS")) - torch.mps.empty_cache() - - if self.torch_device == torch.device("cuda"): - self.logger.debug(translations["clean_cache"].format(name="CUDA")) - torch.cuda.empty_cache() - - def 
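[Aside: write_audio_pydub above flattens (N, 2) int16 audio into the interleaved L R L R layout AudioSegment expects; the slicing trick in isolation:

import numpy as np

stereo = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.int16)
inter = np.empty(2 * stereo.shape[0], dtype=np.int16)
inter[0::2] = stereo[:, 0]   # left samples on even slots
inter[1::2] = stereo[:, 1]   # right samples on odd slots
print(inter)                 # [1 2 3 4 5 6]
]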
clear_file_specific_paths(self): - self.logger.info(translations["del_path"]) - self.audio_file_path = None - self.audio_file_base = None - self.primary_source = None - self.secondary_source = None - self.primary_stem_output_path = None - self.secondary_stem_output_path = None \ No newline at end of file diff --git a/main/library/uvr5_separator/demucs/apply.py b/main/library/uvr5_separator/demucs/apply.py deleted file mode 100644 index 66578cc4855a5508bac3e75fdd3982c463b28aa1..0000000000000000000000000000000000000000 --- a/main/library/uvr5_separator/demucs/apply.py +++ /dev/null @@ -1,250 +0,0 @@ -import tqdm -import torch -import random - -from torch import nn -from torch.nn import functional as F -from concurrent.futures import ThreadPoolExecutor - -from .utils import center_trim - -class DummyPoolExecutor: - class DummyResult: - def __init__(self, func, *args, **kwargs): - self.func = func - self.args = args - self.kwargs = kwargs - - def result(self): - return self.func(*self.args, **self.kwargs) - - def __init__(self, workers=0): - pass - - def submit(self, func, *args, **kwargs): - return DummyPoolExecutor.DummyResult(func, *args, **kwargs) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, exc_tb): - return - -class BagOfModels(nn.Module): - def __init__(self, models, weights = None, segment = None): - super().__init__() - assert len(models) > 0 - first = models[0] - - for other in models: - assert other.sources == first.sources - assert other.samplerate == first.samplerate - assert other.audio_channels == first.audio_channels - - if segment is not None: other.segment = segment - - self.audio_channels = first.audio_channels - self.samplerate = first.samplerate - self.sources = first.sources - self.models = nn.ModuleList(models) - - if weights is None: weights = [[1.0 for _ in first.sources] for _ in models] - else: - assert len(weights) == len(models) - - for weight in weights: - assert len(weight) == len(first.sources) - - self.weights = weights - - def forward(self, x): - pass - -class TensorChunk: - def __init__(self, tensor, offset=0, length=None): - total_length = tensor.shape[-1] - assert offset >= 0 - assert offset < total_length - - length = total_length - offset if length is None else min(total_length - offset, length) - - if isinstance(tensor, TensorChunk): - self.tensor = tensor.tensor - self.offset = offset + tensor.offset - else: - self.tensor = tensor - self.offset = offset - - self.length = length - self.device = tensor.device - - @property - def shape(self): - shape = list(self.tensor.shape) - shape[-1] = self.length - return shape - - def padded(self, target_length): - delta = target_length - self.length - total_length = self.tensor.shape[-1] - assert delta >= 0 - - start = self.offset - delta // 2 - end = start + target_length - - correct_start = max(0, start) - correct_end = min(total_length, end) - - pad_left = correct_start - start - pad_right = end - correct_end - - out = F.pad(self.tensor[..., correct_start:correct_end], (pad_left, pad_right)) - - assert out.shape[-1] == target_length - return out - -def tensor_chunk(tensor_or_chunk): - if isinstance(tensor_or_chunk, TensorChunk): return tensor_or_chunk - else: - assert isinstance(tensor_or_chunk, torch.Tensor) - return TensorChunk(tensor_or_chunk) - -def apply_model(model, mix, shifts=1, split=True, overlap=0.25, transition_power=1.0, static_shifts=1, set_progress_bar=None, device=None, progress=False, num_workers=0, pool=None): - global fut_length, bag_num, prog_bar - - 
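[Aside: TensorChunk.padded above centers a target-length window on the chunk and zero-pads whatever falls outside the backing tensor; the same arithmetic traced on a tiny tensor:

import torch
import torch.nn.functional as F

x = torch.arange(1.0, 7.0)               # backing tensor [1..6]
offset, length, target = 0, 2, 6         # chunk is x[0:2], want 6 samples
start = offset - (target - length) // 2  # -2: window centered on the chunk
end = start + target                     # 4
out = F.pad(x[max(0, start):min(6, end)], (max(0, -start), max(0, end - 6)))
print(out)                               # tensor([0., 0., 1., 2., 3., 4.])
]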
device = mix.device if device is None else torch.device(device) - if pool is None: pool = ThreadPoolExecutor(num_workers) if num_workers > 0 and device.type == "cpu" else DummyPoolExecutor() - - kwargs = { - "shifts": shifts, - "split": split, - "overlap": overlap, - "transition_power": transition_power, - "progress": progress, - "device": device, - "pool": pool, - "set_progress_bar": set_progress_bar, - "static_shifts": static_shifts, - } - - if isinstance(model, BagOfModels): - estimates, fut_length, prog_bar, current_model = 0, 0, 0, 0 - totals = [0] * len(model.sources) - bag_num = len(model.models) - - for sub_model, weight in zip(model.models, model.weights): - original_model_device = next(iter(sub_model.parameters())).device - sub_model.to(device) - fut_length += fut_length - current_model += 1 - out = apply_model(sub_model, mix, **kwargs) - sub_model.to(original_model_device) - - for k, inst_weight in enumerate(weight): - out[:, k, :, :] *= inst_weight - totals[k] += inst_weight - - estimates += out - del out - - for k in range(estimates.shape[1]): - estimates[:, k, :, :] /= totals[k] - - return estimates - - model.to(device) - model.eval() - assert transition_power >= 1 - batch, channels, length = mix.shape - - if shifts: - kwargs["shifts"] = 0 - max_shift = int(0.5 * model.samplerate) - mix = tensor_chunk(mix) - padded_mix = mix.padded(length + 2 * max_shift) - out = 0 - - for _ in range(shifts): - offset = random.randint(0, max_shift) - shifted = TensorChunk(padded_mix, offset, length + max_shift - offset) - shifted_out = apply_model(model, shifted, **kwargs) - out += shifted_out[..., max_shift - offset :] - - out /= shifts - return out - elif split: - kwargs["split"] = False - out = torch.zeros(batch, len(model.sources), channels, length, device=mix.device) - sum_weight = torch.zeros(length, device=mix.device) - segment = int(model.samplerate * model.segment) - stride = int((1 - overlap) * segment) - offsets = range(0, length, stride) - weight = torch.cat([torch.arange(1, segment // 2 + 1, device=device), torch.arange(segment - segment // 2, 0, -1, device=device)]) - assert len(weight) == segment - weight = (weight / weight.max()) ** transition_power - futures = [] - - for offset in offsets: - chunk = TensorChunk(mix, offset, segment) - future = pool.submit(apply_model, model, chunk, **kwargs) - futures.append((future, offset)) - offset += segment - - if progress: futures = tqdm.tqdm(futures) - - for future, offset in futures: - if set_progress_bar: - fut_length = len(futures) * bag_num * static_shifts - prog_bar += 1 - set_progress_bar(0.1, (0.8 / fut_length * prog_bar)) - - chunk_out = future.result() - chunk_length = chunk_out.shape[-1] - - out[..., offset : offset + segment] += (weight[:chunk_length] * chunk_out).to(mix.device) - sum_weight[offset : offset + segment] += weight[:chunk_length].to(mix.device) - - assert sum_weight.min() > 0 - - out /= sum_weight - return out - else: - valid_length = model.valid_length(length) if hasattr(model, "valid_length") else length - mix = tensor_chunk(mix) - padded_mix = mix.padded(valid_length).to(device) - - with torch.no_grad(): - out = model(padded_mix) - - return center_trim(out, length) - -def demucs_segments(demucs_segment, demucs_model): - if demucs_segment == "Default": - segment = None - - if isinstance(demucs_model, BagOfModels): - if segment is not None: - for sub in demucs_model.models: - sub.segment = segment - else: - if segment is not None: sub.segment = segment - else: - try: - segment = int(demucs_segment) - if 
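[Aside: the split path above cross-fades overlapping segments with a triangular weight window, optionally sharpened by transition_power; its shape for a segment of 8:

import torch

segment, transition_power = 8, 1.0
weight = torch.cat([torch.arange(1, segment // 2 + 1), torch.arange(segment - segment // 2, 0, -1)])
print((weight / weight.max()) ** transition_power)
# tensor([0.25, 0.50, 0.75, 1.00, 1.00, 0.75, 0.50, 0.25])
]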
isinstance(demucs_model, BagOfModels): - if segment is not None: - for sub in demucs_model.models: - sub.segment = segment - else: - if segment is not None: sub.segment = segment - except: - segment = None - - if isinstance(demucs_model, BagOfModels): - if segment is not None: - for sub in demucs_model.models: - sub.segment = segment - else: - if segment is not None: sub.segment = segment - - return demucs_model \ No newline at end of file diff --git a/main/library/uvr5_separator/demucs/demucs.py b/main/library/uvr5_separator/demucs/demucs.py deleted file mode 100644 index 5b27d3fe41a2453ae5530421c71ea09a6ab7e65a..0000000000000000000000000000000000000000 --- a/main/library/uvr5_separator/demucs/demucs.py +++ /dev/null @@ -1,370 +0,0 @@ -import math -import torch -import inspect - -from torch import nn - -from torch.nn import functional as F - -from .utils import center_trim -from .states import capture_init - - - -def unfold(a, kernel_size, stride): - *shape, length = a.shape - n_frames = math.ceil(length / stride) - tgt_length = (n_frames - 1) * stride + kernel_size - a = F.pad(a, (0, tgt_length - length)) - strides = list(a.stride()) - assert strides[-1] == 1 - strides = strides[:-1] + [stride, 1] - return a.as_strided([*shape, n_frames, kernel_size], strides) - -def rescale_conv(conv, reference): - scale = (conv.weight.std().detach() / reference) ** 0.5 - conv.weight.data /= scale - if conv.bias is not None: conv.bias.data /= scale - -def rescale_module(module, reference): - for sub in module.modules(): - if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d)): rescale_conv(sub, reference) - -class BLSTM(nn.Module): - def __init__(self, dim, layers=1, max_steps=None, skip=False): - super().__init__() - assert max_steps is None or max_steps % 4 == 0 - self.max_steps = max_steps - self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim) - self.linear = nn.Linear(2 * dim, dim) - self.skip = skip - - def forward(self, x): - B, C, T = x.shape - y = x - framed = False - - if self.max_steps is not None and T > self.max_steps: - width = self.max_steps - stride = width // 2 - frames = unfold(x, width, stride) - nframes = frames.shape[2] - framed = True - x = frames.permute(0, 2, 1, 3).reshape(-1, C, width) - - x = x.permute(2, 0, 1) - x = self.lstm(x)[0] - x = self.linear(x) - x = x.permute(1, 2, 0) - - if framed: - out = [] - frames = x.reshape(B, -1, C, width) - limit = stride // 2 - - for k in range(nframes): - if k == 0: out.append(frames[:, k, :, :-limit]) - elif k == nframes - 1: out.append(frames[:, k, :, limit:]) - else: out.append(frames[:, k, :, limit:-limit]) - - out = torch.cat(out, -1) - out = out[..., :T] - x = out - - if self.skip: x = x + y - return x - -class LayerScale(nn.Module): - def __init__(self, channels, init = 0): - super().__init__() - self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True)) - self.scale.data[:] = init - - def forward(self, x): - return self.scale[:, None] * x - -class DConv(nn.Module): - def __init__(self, channels, compress = 4, depth = 2, init = 1e-4, norm=True, attn=False, heads=4, ndecay=4, lstm=False, gelu=True, kernel=3, dilate=True): - super().__init__() - assert kernel % 2 == 1 - self.channels = channels - self.compress = compress - self.depth = abs(depth) - dilate = depth > 0 - norm_fn = lambda d: nn.Identity() - if norm: norm_fn = lambda d: nn.GroupNorm(1, d) - hidden = int(channels / compress) - act = nn.GELU if gelu else nn.ReLU - self.layers = nn.ModuleList([]) - - 
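[Aside: the unfold helper above is the padded analogue of the built-in Tensor.unfold: a strided view exposing a kernel_size window every stride samples, except that the F.pad call keeps the tail frame the built-in drops:

import torch

a = torch.arange(10.0)
frames = a.unfold(0, 4, 2)   # windows of 4, hop 2, tail dropped
print(frames.shape)          # torch.Size([4, 4])
print(frames[0], frames[1])  # tensor([0., 1., 2., 3.]) tensor([2., 3., 4., 5.])
]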
for d in range(self.depth): - dilation = 2**d if dilate else 1 - padding = dilation * (kernel // 2) - - mods = [nn.Conv1d(channels, hidden, kernel, dilation=dilation, padding=padding), norm_fn(hidden), act(), nn.Conv1d(hidden, 2 * channels, 1), norm_fn(2 * channels), nn.GLU(1), LayerScale(channels, init)] - - if attn: mods.insert(3, LocalState(hidden, heads=heads, ndecay=ndecay)) - if lstm: mods.insert(3, BLSTM(hidden, layers=2, max_steps=200, skip=True)) - layer = nn.Sequential(*mods) - self.layers.append(layer) - - def forward(self, x): - for layer in self.layers: - x = x + layer(x) - - return x - -class LocalState(nn.Module): - def __init__(self, channels, heads = 4, nfreqs = 0, ndecay = 4): - super().__init__() - assert channels % heads == 0, (channels, heads) - self.heads = heads - self.nfreqs = nfreqs - self.ndecay = ndecay - self.content = nn.Conv1d(channels, channels, 1) - self.query = nn.Conv1d(channels, channels, 1) - self.key = nn.Conv1d(channels, channels, 1) - - if nfreqs: self.query_freqs = nn.Conv1d(channels, heads * nfreqs, 1) - - if ndecay: - self.query_decay = nn.Conv1d(channels, heads * ndecay, 1) - self.query_decay.weight.data *= 0.01 - assert self.query_decay.bias is not None - self.query_decay.bias.data[:] = -2 - - self.proj = nn.Conv1d(channels + heads * nfreqs, channels, 1) - - def forward(self, x): - B, C, T = x.shape - heads = self.heads - indexes = torch.arange(T, device=x.device, dtype=x.dtype) - delta = indexes[:, None] - indexes[None, :] - queries = self.query(x).view(B, heads, -1, T) - keys = self.key(x).view(B, heads, -1, T) - dots = torch.einsum("bhct,bhcs->bhts", keys, queries) - dots /= keys.shape[2] ** 0.5 - - if self.nfreqs: - periods = torch.arange(1, self.nfreqs + 1, device=x.device, dtype=x.dtype) - freq_kernel = torch.cos(2 * math.pi * delta / periods.view(-1, 1, 1)) - freq_q = self.query_freqs(x).view(B, heads, -1, T) / self.nfreqs**0.5 - dots += torch.einsum("fts,bhfs->bhts", freq_kernel, freq_q) - - if self.ndecay: - decays = torch.arange(1, self.ndecay + 1, device=x.device, dtype=x.dtype) - decay_q = self.query_decay(x).view(B, heads, -1, T) - decay_q = torch.sigmoid(decay_q) / 2 - decay_kernel = -decays.view(-1, 1, 1) * delta.abs() / self.ndecay**0.5 - dots += torch.einsum("fts,bhfs->bhts", decay_kernel, decay_q) - - dots.masked_fill_(torch.eye(T, device=dots.device, dtype=torch.bool), -100) - weights = torch.softmax(dots, dim=2) - content = self.content(x).view(B, heads, -1, T) - result = torch.einsum("bhts,bhct->bhcs", weights, content) - - if self.nfreqs: - time_sig = torch.einsum("bhts,fts->bhfs", weights, freq_kernel) - result = torch.cat([result, time_sig], 2) - - result = result.reshape(B, -1, T) - return x + self.proj(result) - -class Demucs(nn.Module): - @capture_init - def __init__(self, sources, audio_channels=2, channels=64, growth=2.0, depth=6, rewrite=True, lstm_layers=0, kernel_size=8, stride=4, context=1, gelu=True, glu=True, norm_starts=4, norm_groups=4, dconv_mode=1, dconv_depth=2, dconv_comp=4, dconv_attn=4, dconv_lstm=4, dconv_init=1e-4, normalize=True, resample=True, rescale=0.1, samplerate=44100, segment=4 * 10): - super().__init__() - self.audio_channels = audio_channels - self.sources = sources - self.kernel_size = kernel_size - self.context = context - self.stride = stride - self.depth = depth - self.resample = resample - self.channels = channels - self.normalize = normalize - self.samplerate = samplerate - self.segment = segment - self.encoder = nn.ModuleList() - self.decoder = nn.ModuleList() - self.skip_scales = 
nn.ModuleList() - - if glu: - activation = nn.GLU(dim=1) - ch_scale = 2 - else: - activation = nn.ReLU() - ch_scale = 1 - - act2 = nn.GELU if gelu else nn.ReLU - - in_channels = audio_channels - padding = 0 - - for index in range(depth): - norm_fn = lambda d: nn.Identity() - if index >= norm_starts: norm_fn = lambda d: nn.GroupNorm(norm_groups, d) - - encode = [] - encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), norm_fn(channels), act2()] - attn = index >= dconv_attn - lstm = index >= dconv_lstm - - if dconv_mode & 1: encode += [DConv(channels, depth=dconv_depth, init=dconv_init, compress=dconv_comp, attn=attn, lstm=lstm)] - if rewrite: encode += [nn.Conv1d(channels, ch_scale * channels, 1), norm_fn(ch_scale * channels), activation] - self.encoder.append(nn.Sequential(*encode)) - - decode = [] - out_channels = in_channels if index > 0 else len(self.sources) * audio_channels - - if rewrite: decode += [nn.Conv1d(channels, ch_scale * channels, 2 * context + 1, padding=context), norm_fn(ch_scale * channels), activation] - if dconv_mode & 2: decode += [DConv(channels, depth=dconv_depth, init=dconv_init, compress=dconv_comp, attn=attn, lstm=lstm)] - decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride, padding=padding)] - - if index > 0: decode += [norm_fn(out_channels), act2()] - self.decoder.insert(0, nn.Sequential(*decode)) - in_channels = channels - channels = int(growth * channels) - - channels = in_channels - self.lstm = BLSTM(channels, lstm_layers) if lstm_layers else None - if rescale: rescale_module(self, reference=rescale) - - def valid_length(self, length): - if self.resample: length *= 2 - - for _ in range(self.depth): - length = math.ceil((length - self.kernel_size) / self.stride) + 1 - length = max(1, length) - - for _ in range(self.depth): - length = (length - 1) * self.stride + self.kernel_size - - if self.resample: length = math.ceil(length / 2) - return int(length) - - def forward(self, mix): - x = mix - length = x.shape[-1] - - if self.normalize: - mono = mix.mean(dim=1, keepdim=True) - mean = mono.mean(dim=-1, keepdim=True) - std = mono.std(dim=-1, keepdim=True) - x = (x - mean) / (1e-5 + std) - else: - mean = 0 - std = 1 - - delta = self.valid_length(length) - length - x = F.pad(x, (delta // 2, delta - delta // 2)) - - if self.resample: x = resample_frac(x, 1, 2) - saved = [] - - for encode in self.encoder: - x = encode(x) - saved.append(x) - - if self.lstm: x = self.lstm(x) - - for decode in self.decoder: - skip = saved.pop(-1) - skip = center_trim(skip, x) - x = decode(x + skip) - - if self.resample: x = resample_frac(x, 2, 1) - - x = x * std + mean - x = center_trim(x, length) - x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1)) - return x - - def load_state_dict(self, state, strict=True): - for idx in range(self.depth): - for a in ["encoder", "decoder"]: - for b in ["bias", "weight"]: - new = f"{a}.{idx}.3.{b}" - old = f"{a}.{idx}.2.{b}" - - if old in state and new not in state: state[new] = state.pop(old) - super().load_state_dict(state, strict=strict) - -class ResampleFrac(torch.nn.Module): - def __init__(self, old_sr, new_sr, zeros = 24, rolloff = 0.945): - super().__init__() - gcd = math.gcd(old_sr, new_sr) - self.old_sr = old_sr // gcd - self.new_sr = new_sr // gcd - self.zeros = zeros - self.rolloff = rolloff - self._init_kernels() - - def _init_kernels(self): - if self.old_sr == self.new_sr: return - - kernels = [] - sr = min(self.new_sr, self.old_sr) - sr *= self.rolloff - - self._width = 
math.ceil(self.zeros * self.old_sr / sr) - idx = torch.arange(-self._width, self._width + self.old_sr).float() - - for i in range(self.new_sr): - t = ((-i/self.new_sr + idx/self.old_sr) * sr).clamp_(-self.zeros, self.zeros) - t *= math.pi - - kernel = sinc(t) * (torch.cos(t/self.zeros/2)**2) - kernel.div_(kernel.sum()) - kernels.append(kernel) - - self.register_buffer("kernel", torch.stack(kernels).view(self.new_sr, 1, -1)) - - def forward(self, x, output_length = None, full = False): - if self.old_sr == self.new_sr: return x - shape = x.shape - length = x.shape[-1] - - x = x.reshape(-1, length) - y = F.conv1d(F.pad(x[:, None], (self._width, self._width + self.old_sr), mode='replicate'), self.kernel, stride=self.old_sr).transpose(1, 2).reshape(list(shape[:-1]) + [-1]) - - float_output_length = torch.as_tensor(self.new_sr * length / self.old_sr) - max_output_length = torch.ceil(float_output_length).long() - default_output_length = torch.floor(float_output_length).long() - - if output_length is None: applied_output_length = max_output_length if full else default_output_length - elif output_length < 0 or output_length > max_output_length: raise ValueError("output_length < 0 or output_length > max_output_length") - else: - applied_output_length = torch.tensor(output_length) - if full: raise ValueError("full=True") - - return y[..., :applied_output_length] - - def __repr__(self): - return simple_repr(self) - -def sinc(x): - return torch.where(x == 0, torch.tensor(1., device=x.device, dtype=x.dtype), torch.sin(x) / x) - -def simple_repr(obj, attrs = None, overrides = {}): - params = inspect.signature(obj.__class__).parameters - attrs_repr = [] - - if attrs is None: attrs = list(params.keys()) - for attr in attrs: - display = False - - if attr in overrides: value = overrides[attr] - elif hasattr(obj, attr): value = getattr(obj, attr) - else: continue - - if attr in params: - param = params[attr] - if param.default is inspect._empty or value != param.default: display = True - else: display = True - - if display: attrs_repr.append(f"{attr}={value}") - return f"{obj.__class__.__name__}({','.join(attrs_repr)})" - -def resample_frac(x, old_sr, new_sr, zeros = 24, rolloff = 0.945, output_length = None, full = False): - return ResampleFrac(old_sr, new_sr, zeros, rolloff).to(x)(x, output_length, full) \ No newline at end of file diff --git a/main/library/uvr5_separator/demucs/hdemucs.py b/main/library/uvr5_separator/demucs/hdemucs.py deleted file mode 100644 index c37fd0c024695108491fb57e0f075ffe348a18f1..0000000000000000000000000000000000000000 --- a/main/library/uvr5_separator/demucs/hdemucs.py +++ /dev/null @@ -1,760 +0,0 @@ -import math -import torch - -from torch import nn -from copy import deepcopy - -from torch.nn import functional as F - -from .states import capture_init -from .demucs import DConv, rescale_module - - -def spectro(x, n_fft=512, hop_length=None, pad=0): - *other, length = x.shape - x = x.reshape(-1, length) - device_type = x.device.type - is_other_gpu = not device_type in ["cuda", "cpu"] - if is_other_gpu: x = x.cpu() - z = torch.stft(x, n_fft * (1 + pad), hop_length or n_fft // 4, window=torch.hann_window(n_fft).to(x), win_length=n_fft, normalized=True, center=True, return_complex=True, pad_mode="reflect") - _, freqs, frame = z.shape - return z.view(*other, freqs, frame) - -def ispectro(z, hop_length=None, length=None, pad=0): - *other, freqs, frames = z.shape - n_fft = 2 * freqs - 2 - z = z.view(-1, freqs, frames) - win_length = n_fft // (1 + pad) - device_type = z.device.type - 
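- # Annotation: the code assumes torch.istft may be unavailable on
- # accelerators other than CUDA (e.g. MPS), so such tensors are moved
- # to the CPU before the inverse transform below.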
is_other_gpu = not device_type in ["cuda", "cpu"] - if is_other_gpu: z = z.cpu() - x = torch.istft(z, n_fft, hop_length, window=torch.hann_window(win_length).to(z.real), win_length=win_length, normalized=True, length=length, center=True) - _, length = x.shape - return x.view(*other, length) - -def atan2(y, x): - pi = 2 * torch.asin(torch.tensor(1.0)) - x += ((x == 0) & (y == 0)) * 1.0 - out = torch.atan(y / x) - out += ((y >= 0) & (x < 0)) * pi - out -= ((y < 0) & (x < 0)) * pi - out *= 1 - ((y > 0) & (x == 0)) * 1.0 - out += ((y > 0) & (x == 0)) * (pi / 2) - out *= 1 - ((y < 0) & (x == 0)) * 1.0 - out += ((y < 0) & (x == 0)) * (-pi / 2) - return out - -def _norm(x): - return torch.abs(x[..., 0]) ** 2 + torch.abs(x[..., 1]) ** 2 - -def _mul_add(a, b, out = None): - target_shape = torch.Size([max(sa, sb) for (sa, sb) in zip(a.shape, b.shape)]) - if out is None or out.shape != target_shape: out = torch.zeros(target_shape, dtype=a.dtype, device=a.device) - - if out is a: - real_a = a[..., 0] - out[..., 0] = out[..., 0] + (real_a * b[..., 0] - a[..., 1] * b[..., 1]) - out[..., 1] = out[..., 1] + (real_a * b[..., 1] + a[..., 1] * b[..., 0]) - else: - out[..., 0] = out[..., 0] + (a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1]) - out[..., 1] = out[..., 1] + (a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0]) - - return out - -def _mul(a, b, out = None): - target_shape = torch.Size([max(sa, sb) for (sa, sb) in zip(a.shape, b.shape)]) - if out is None or out.shape != target_shape: out = torch.zeros(target_shape, dtype=a.dtype, device=a.device) - - if out is a: - real_a = a[..., 0] - out[..., 0] = real_a * b[..., 0] - a[..., 1] * b[..., 1] - out[..., 1] = real_a * b[..., 1] + a[..., 1] * b[..., 0] - else: - out[..., 0] = a[..., 0] * b[..., 0] - a[..., 1] * b[..., 1] - out[..., 1] = a[..., 0] * b[..., 1] + a[..., 1] * b[..., 0] - - return out - -def _inv(z, out = None): - ez = _norm(z) - if out is None or out.shape != z.shape: out = torch.zeros_like(z) - - out[..., 0] = z[..., 0] / ez - out[..., 1] = -z[..., 1] / ez - - return out - -def _conj(z, out = None): - if out is None or out.shape != z.shape: out = torch.zeros_like(z) - - out[..., 0] = z[..., 0] - out[..., 1] = -z[..., 1] - - return out - -def _invert(M, out = None): - nb_channels = M.shape[-2] - if out is None or out.shape != M.shape: out = torch.empty_like(M) - - if nb_channels == 1: out = _inv(M, out) - elif nb_channels == 2: - det = _mul(M[..., 0, 0, :], M[..., 1, 1, :]) - det = det - _mul(M[..., 0, 1, :], M[..., 1, 0, :]) - invDet = _inv(det) - out[..., 0, 0, :] = _mul(invDet, M[..., 1, 1, :], out[..., 0, 0, :]) - out[..., 1, 0, :] = _mul(-invDet, M[..., 1, 0, :], out[..., 1, 0, :]) - out[..., 0, 1, :] = _mul(-invDet, M[..., 0, 1, :], out[..., 0, 1, :]) - out[..., 1, 1, :] = _mul(invDet, M[..., 0, 0, :], out[..., 1, 1, :]) - else: raise Exception("Torch == 2 Channels") - return out - -def expectation_maximization(y, x, iterations = 2, eps = 1e-10, batch_size = 200): - (nb_frames, nb_bins, nb_channels) = x.shape[:-1] - nb_sources = y.shape[-1] - regularization = torch.cat((torch.eye(nb_channels, dtype=x.dtype, device=x.device)[..., None], torch.zeros((nb_channels, nb_channels, 1), dtype=x.dtype, device=x.device)), dim=2) - regularization = torch.sqrt(torch.as_tensor(eps)) * (regularization[None, None, ...].expand((-1, nb_bins, -1, -1, -1))) - R = [torch.zeros((nb_bins, nb_channels, nb_channels, 2), dtype=x.dtype, device=x.device) for j in range(nb_sources)] - weight = torch.zeros((nb_bins,), dtype=x.dtype, device=x.device) - v = 
torch.zeros((nb_frames, nb_bins, nb_sources), dtype=x.dtype, device=x.device) - - for _ in range(iterations): - v = torch.mean(torch.abs(y[..., 0, :]) ** 2 + torch.abs(y[..., 1, :]) ** 2, dim=-2) - for j in range(nb_sources): - R[j] = torch.tensor(0.0, device=x.device) - - weight = torch.tensor(eps, device=x.device) - pos = 0 - batch_size = batch_size if batch_size else nb_frames - - while pos < nb_frames: - t = torch.arange(pos, min(nb_frames, pos + batch_size)) - pos = int(t[-1]) + 1 - - R[j] = R[j] + torch.sum(_covariance(y[t, ..., j]), dim=0) - weight = weight + torch.sum(v[t, ..., j], dim=0) - - R[j] = R[j] / weight[..., None, None, None] - weight = torch.zeros_like(weight) - - if y.requires_grad: y = y.clone() - - pos = 0 - - while pos < nb_frames: - t = torch.arange(pos, min(nb_frames, pos + batch_size)) - pos = int(t[-1]) + 1 - - y[t, ...] = torch.tensor(0.0, device=x.device, dtype=x.dtype) - - Cxx = regularization - - for j in range(nb_sources): - Cxx = Cxx + (v[t, ..., j, None, None, None] * R[j][None, ...].clone()) - - inv_Cxx = _invert(Cxx) - - for j in range(nb_sources): - gain = torch.zeros_like(inv_Cxx) - indices = torch.cartesian_prod(torch.arange(nb_channels), torch.arange(nb_channels), torch.arange(nb_channels)) - - for index in indices: - gain[:, :, index[0], index[1], :] = _mul_add(R[j][None, :, index[0], index[2], :].clone(), inv_Cxx[:, :, index[2], index[1], :], gain[:, :, index[0], index[1], :]) - - gain = gain * v[t, ..., None, None, None, j] - - for i in range(nb_channels): - y[t, ..., j] = _mul_add(gain[..., i, :], x[t, ..., i, None, :], y[t, ..., j]) - - return y, v, R - -def wiener(targets_spectrograms, mix_stft, iterations = 1, softmask = False, residual = False, scale_factor = 10.0, eps = 1e-10): - if softmask: y = mix_stft[..., None] * (targets_spectrograms / (eps + torch.sum(targets_spectrograms, dim=-1, keepdim=True).to(mix_stft.dtype)))[..., None, :] - else: - angle = atan2(mix_stft[..., 1], mix_stft[..., 0])[..., None] - nb_sources = targets_spectrograms.shape[-1] - y = torch.zeros(mix_stft.shape + (nb_sources,), dtype=mix_stft.dtype, device=mix_stft.device) - y[..., 0, :] = targets_spectrograms * torch.cos(angle) - y[..., 1, :] = targets_spectrograms * torch.sin(angle) - - if residual: y = torch.cat([y, mix_stft[..., None] - y.sum(dim=-1, keepdim=True)], dim=-1) - if iterations == 0: return y - - max_abs = torch.max(torch.as_tensor(1.0, dtype=mix_stft.dtype, device=mix_stft.device), torch.sqrt(_norm(mix_stft)).max() / scale_factor) - mix_stft = mix_stft / max_abs - y = y / max_abs - y = expectation_maximization(y, mix_stft, iterations, eps=eps)[0] - y = y * max_abs - - return y - -def _covariance(y_j): - (nb_frames, nb_bins, nb_channels) = y_j.shape[:-1] - - Cj = torch.zeros((nb_frames, nb_bins, nb_channels, nb_channels, 2), dtype=y_j.dtype, device=y_j.device) - indices = torch.cartesian_prod(torch.arange(nb_channels), torch.arange(nb_channels)) - - for index in indices: - Cj[:, :, index[0], index[1], :] = _mul_add(y_j[:, :, index[0], :], _conj(y_j[:, :, index[1], :]), Cj[:, :, index[0], index[1], :]) - - return Cj - -def pad1d(x, paddings, mode = "constant", value = 0.0): - x0 = x - length = x.shape[-1] - padding_left, padding_right = paddings - - if mode == "reflect": - max_pad = max(padding_left, padding_right) - - if length <= max_pad: - extra_pad = max_pad - length + 1 - extra_pad_right = min(padding_right, extra_pad) - extra_pad_left = extra_pad - extra_pad_right - paddings = (padding_left - extra_pad_left, padding_right - extra_pad_right) - x = 
F.pad(x, (extra_pad_left, extra_pad_right)) - - out = F.pad(x, paddings, mode, value) - - assert out.shape[-1] == length + padding_left + padding_right - assert (out[..., padding_left : padding_left + length] == x0).all() - return out - -class ScaledEmbedding(nn.Module): - def __init__(self, num_embeddings, embedding_dim, scale = 10.0, smooth=False): - super().__init__() - self.embedding = nn.Embedding(num_embeddings, embedding_dim) - - if smooth: - weight = torch.cumsum(self.embedding.weight.data, dim=0) - weight = weight / torch.arange(1, num_embeddings + 1).to(weight).sqrt()[:, None] - self.embedding.weight.data[:] = weight - - self.embedding.weight.data /= scale - self.scale = scale - - @property - def weight(self): - return self.embedding.weight * self.scale - - def forward(self, x): - return self.embedding(x) * self.scale - -class HEncLayer(nn.Module): - def __init__(self, chin, chout, kernel_size=8, stride=4, norm_groups=1, empty=False, freq=True, dconv=True, norm=True, context=0, dconv_kw={}, pad=True, rewrite=True): - super().__init__() - norm_fn = lambda d: nn.Identity() - if norm: norm_fn = lambda d: nn.GroupNorm(norm_groups, d) - pad = kernel_size // 4 if pad else 0 - - klass = nn.Conv1d - self.freq = freq - self.kernel_size = kernel_size - self.stride = stride - self.empty = empty - self.norm = norm - self.pad = pad - - if freq: - kernel_size = [kernel_size, 1] - stride = [stride, 1] - pad = [pad, 0] - klass = nn.Conv2d - - self.conv = klass(chin, chout, kernel_size, stride, pad) - if self.empty: return - - self.norm1 = norm_fn(chout) - self.rewrite = None - - if rewrite: - self.rewrite = klass(chout, 2 * chout, 1 + 2 * context, 1, context) - self.norm2 = norm_fn(2 * chout) - - self.dconv = None - if dconv: self.dconv = DConv(chout, **dconv_kw) - - def forward(self, x, inject=None): - if not self.freq and x.dim() == 4: - B, C, Fr, T = x.shape - x = x.view(B, -1, T) - - if not self.freq: - le = x.shape[-1] - if not le % self.stride == 0: x = F.pad(x, (0, self.stride - (le % self.stride))) - - y = self.conv(x) - if self.empty: return y - - if inject is not None: - assert inject.shape[-1] == y.shape[-1], (inject.shape, y.shape) - - if inject.dim() == 3 and y.dim() == 4: inject = inject[:, :, None] - y = y + inject - - y = F.gelu(self.norm1(y)) - - if self.dconv: - if self.freq: - B, C, Fr, T = y.shape - y = y.permute(0, 2, 1, 3).reshape(-1, C, T) - - y = self.dconv(y) - if self.freq: y = y.view(B, Fr, C, T).permute(0, 2, 1, 3) - - if self.rewrite: - z = self.norm2(self.rewrite(y)) - z = F.glu(z, dim=1) - else: z = y - - return z - -class MultiWrap(nn.Module): - def __init__(self, layer, split_ratios): - super().__init__() - self.split_ratios = split_ratios - self.layers = nn.ModuleList() - self.conv = isinstance(layer, HEncLayer) - assert not layer.norm - assert layer.freq - assert layer.pad - - if not self.conv: assert not layer.context_freq - - for _ in range(len(split_ratios) + 1): - lay = deepcopy(layer) - - if self.conv: lay.conv.padding = (0, 0) - else: lay.pad = False - - for m in lay.modules(): - if hasattr(m, "reset_parameters"): m.reset_parameters() - - self.layers.append(lay) - - def forward(self, x, skip=None, length=None): - B, C, Fr, T = x.shape - ratios = list(self.split_ratios) + [1] - start = 0 - outs = [] - - for ratio, layer in zip(ratios, self.layers): - if self.conv: - pad = layer.kernel_size // 4 - - if ratio == 1: - limit = Fr - frames = -1 - else: - limit = int(round(Fr * ratio)) - le = limit - start - - if start == 0: le += pad - - frames = round((le - 
layer.kernel_size) / layer.stride + 1) - limit = start + (frames - 1) * layer.stride + layer.kernel_size - - if start == 0: limit -= pad - - assert limit - start > 0, (limit, start) - assert limit <= Fr, (limit, Fr) - - y = x[:, :, start:limit, :] - - if start == 0: y = F.pad(y, (0, 0, pad, 0)) - if ratio == 1: y = F.pad(y, (0, 0, 0, pad)) - - outs.append(layer(y)) - start = limit - layer.kernel_size + layer.stride - else: - limit = Fr if ratio == 1 else int(round(Fr * ratio)) - - last = layer.last - layer.last = True - - y = x[:, :, start:limit] - s = skip[:, :, start:limit] - out, _ = layer(y, s, None) - - if outs: - outs[-1][:, :, -layer.stride :] += out[:, :, : layer.stride] - layer.conv_tr.bias.view(1, -1, 1, 1) - out = out[:, :, layer.stride :] - - if ratio == 1: out = out[:, :, : -layer.stride // 2, :] - if start == 0: out = out[:, :, layer.stride // 2 :, :] - - outs.append(out) - layer.last = last - start = limit - - out = torch.cat(outs, dim=2) - if not self.conv and not last: out = F.gelu(out) - - if self.conv: return out - else: return out, None - -class HDecLayer(nn.Module): - def __init__(self, chin, chout, last=False, kernel_size=8, stride=4, norm_groups=1, empty=False, freq=True, dconv=True, norm=True, context=1, dconv_kw={}, pad=True, context_freq=True, rewrite=True): - super().__init__() - norm_fn = lambda d: nn.Identity() - - if norm: norm_fn = lambda d: nn.GroupNorm(norm_groups, d) - pad = kernel_size // 4 if pad else 0 - - self.pad = pad - self.last = last - self.freq = freq - self.chin = chin - self.empty = empty - self.stride = stride - self.kernel_size = kernel_size - self.norm = norm - self.context_freq = context_freq - klass = nn.Conv1d - klass_tr = nn.ConvTranspose1d - - if freq: - kernel_size = [kernel_size, 1] - stride = [stride, 1] - klass = nn.Conv2d - klass_tr = nn.ConvTranspose2d - - self.conv_tr = klass_tr(chin, chout, kernel_size, stride) - self.norm2 = norm_fn(chout) - - if self.empty: return - self.rewrite = None - - if rewrite: - if context_freq: self.rewrite = klass(chin, 2 * chin, 1 + 2 * context, 1, context) - else: self.rewrite = klass(chin, 2 * chin, [1, 1 + 2 * context], 1, [0, context]) - - self.norm1 = norm_fn(2 * chin) - - self.dconv = None - if dconv: self.dconv = DConv(chin, **dconv_kw) - - def forward(self, x, skip, length): - if self.freq and x.dim() == 3: - B, C, T = x.shape - x = x.view(B, self.chin, -1, T) - - if not self.empty: - x = x + skip - - y = F.glu(self.norm1(self.rewrite(x)), dim=1) if self.rewrite else x - - if self.dconv: - if self.freq: - B, C, Fr, T = y.shape - y = y.permute(0, 2, 1, 3).reshape(-1, C, T) - - y = self.dconv(y) - - if self.freq: y = y.view(B, Fr, C, T).permute(0, 2, 1, 3) - else: - y = x - assert skip is None - - z = self.norm2(self.conv_tr(y)) - - if self.freq: - if self.pad: z = z[..., self.pad : -self.pad, :] - else: - z = z[..., self.pad : self.pad + length] - assert z.shape[-1] == length, (z.shape[-1], length) - - if not self.last: z = F.gelu(z) - return z, y - -class HDemucs(nn.Module): - @capture_init - def __init__(self, sources, audio_channels=2, channels=48, channels_time=None, growth=2, nfft=4096, wiener_iters=0, end_iters=0, wiener_residual=False, cac=True, depth=6, rewrite=True, hybrid=True, hybrid_old=False, multi_freqs=None, multi_freqs_depth=2, freq_emb=0.2, emb_scale=10, emb_smooth=True, kernel_size=8, time_stride=2, stride=4, context=1, context_enc=0, norm_starts=4, norm_groups=4, dconv_mode=1, dconv_depth=2, dconv_comp=4, dconv_attn=4, dconv_lstm=4, dconv_init=1e-4, rescale=0.1, 
samplerate=44100, segment=4 * 10): - super().__init__() - self.cac = cac - self.wiener_residual = wiener_residual - self.audio_channels = audio_channels - self.sources = sources - self.kernel_size = kernel_size - self.context = context - self.stride = stride - self.depth = depth - self.channels = channels - self.samplerate = samplerate - self.segment = segment - self.nfft = nfft - self.hop_length = nfft // 4 - self.wiener_iters = wiener_iters - self.end_iters = end_iters - self.freq_emb = None - self.hybrid = hybrid - self.hybrid_old = hybrid_old - if hybrid_old: assert hybrid - if hybrid: assert wiener_iters == end_iters - self.encoder = nn.ModuleList() - self.decoder = nn.ModuleList() - - if hybrid: - self.tencoder = nn.ModuleList() - self.tdecoder = nn.ModuleList() - - chin = audio_channels - chin_z = chin - - if self.cac: chin_z *= 2 - - chout = channels_time or channels - chout_z = channels - freqs = nfft // 2 - - for index in range(depth): - lstm = index >= dconv_lstm - attn = index >= dconv_attn - norm = index >= norm_starts - freq = freqs > 1 - stri = stride - ker = kernel_size - - if not freq: - assert freqs == 1 - - ker = time_stride * 2 - stri = time_stride - - pad = True - last_freq = False - - if freq and freqs <= kernel_size: - ker = freqs - pad = False - last_freq = True - - kw = { - "kernel_size": ker, - "stride": stri, - "freq": freq, - "pad": pad, - "norm": norm, - "rewrite": rewrite, - "norm_groups": norm_groups, - "dconv_kw": {"lstm": lstm, "attn": attn, "depth": dconv_depth, "compress": dconv_comp, "init": dconv_init, "gelu": True}, - } - - kwt = dict(kw) - kwt["freq"] = 0 - kwt["kernel_size"] = kernel_size - kwt["stride"] = stride - kwt["pad"] = True - kw_dec = dict(kw) - - multi = False - - if multi_freqs and index < multi_freqs_depth: - multi = True - kw_dec["context_freq"] = False - - if last_freq: - chout_z = max(chout, chout_z) - chout = chout_z - - enc = HEncLayer(chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw) - if hybrid and freq: - tenc = HEncLayer(chin, chout, dconv=dconv_mode & 1, context=context_enc, empty=last_freq, **kwt) - self.tencoder.append(tenc) - - if multi: enc = MultiWrap(enc, multi_freqs) - - self.encoder.append(enc) - if index == 0: - chin = self.audio_channels * len(self.sources) - chin_z = chin - - if self.cac: chin_z *= 2 - - dec = HDecLayer(chout_z, chin_z, dconv=dconv_mode & 2, last=index == 0, context=context, **kw_dec) - if multi: dec = MultiWrap(dec, multi_freqs) - - if hybrid and freq: - tdec = HDecLayer(chout, chin, dconv=dconv_mode & 2, empty=last_freq, last=index == 0, context=context, **kwt) - self.tdecoder.insert(0, tdec) - - self.decoder.insert(0, dec) - chin = chout - chin_z = chout_z - chout = int(growth * chout) - chout_z = int(growth * chout_z) - - if freq: - if freqs <= kernel_size: freqs = 1 - else: freqs //= stride - - if index == 0 and freq_emb: - self.freq_emb = ScaledEmbedding(freqs, chin_z, smooth=emb_smooth, scale=emb_scale) - self.freq_emb_scale = freq_emb - - if rescale: rescale_module(self, reference=rescale) - - def _spec(self, x): - hl = self.hop_length - nfft = self.nfft - - if self.hybrid: - assert hl == nfft // 4 - le = int(math.ceil(x.shape[-1] / hl)) - pad = hl // 2 * 3 - x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode="reflect") if not self.hybrid_old else pad1d(x, (pad, pad + le * hl - x.shape[-1])) - - z = spectro(x, nfft, hl)[..., :-1, :] - if self.hybrid: - assert z.shape[-1] == le + 4, (z.shape, x.shape, le) - z = z[..., 2 : 2 + le] - - return z - - def _ispec(self, z, 
length=None, scale=0): - hl = self.hop_length // (4**scale) - z = F.pad(z, (0, 0, 0, 1)) - - if self.hybrid: - z = F.pad(z, (2, 2)) - pad = hl // 2 * 3 - le = hl * int(math.ceil(length / hl)) + 2 * pad if not self.hybrid_old else hl * int(math.ceil(length / hl)) - x = ispectro(z, hl, length=le) - x = x[..., pad : pad + length] if not self.hybrid_old else x[..., :length] - else: x = ispectro(z, hl, length) - - return x - - def _magnitude(self, z): - if self.cac: - B, C, Fr, T = z.shape - m = torch.view_as_real(z).permute(0, 1, 4, 2, 3) - m = m.reshape(B, C * 2, Fr, T) - else: m = z.abs() - - return m - - def _mask(self, z, m): - niters = self.wiener_iters - if self.cac: - B, S, C, Fr, T = m.shape - out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3) - out = torch.view_as_complex(out.contiguous()) - return out - - if self.training: niters = self.end_iters - - if niters < 0: - z = z[:, None] - return z / (1e-8 + z.abs()) * m - else: return self._wiener(m, z, niters) - - def _wiener(self, mag_out, mix_stft, niters): - init = mix_stft.dtype - wiener_win_len = 300 - residual = self.wiener_residual - B, S, C, Fq, T = mag_out.shape - mag_out = mag_out.permute(0, 4, 3, 2, 1) - mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1)) - outs = [] - - for sample in range(B): - pos = 0 - out = [] - - for pos in range(0, T, wiener_win_len): - frame = slice(pos, pos + wiener_win_len) - z_out = wiener(mag_out[sample, frame], mix_stft[sample, frame], niters, residual=residual) - out.append(z_out.transpose(-1, -2)) - - outs.append(torch.cat(out, dim=0)) - - out = torch.view_as_complex(torch.stack(outs, 0)) - out = out.permute(0, 4, 3, 2, 1).contiguous() - - if residual: out = out[:, :-1] - assert list(out.shape) == [B, S, C, Fq, T] - return out.to(init) - - def forward(self, mix): - x = mix - length = x.shape[-1] - z = self._spec(mix) - mag = self._magnitude(z).to(mix.device) - x = mag - B, C, Fq, T = x.shape - mean = x.mean(dim=(1, 2, 3), keepdim=True) - std = x.std(dim=(1, 2, 3), keepdim=True) - x = (x - mean) / (1e-5 + std) - - if self.hybrid: - xt = mix - meant = xt.mean(dim=(1, 2), keepdim=True) - stdt = xt.std(dim=(1, 2), keepdim=True) - xt = (xt - meant) / (1e-5 + stdt) - - saved, saved_t, lengths, lengths_t = [], [], [], [] - - for idx, encode in enumerate(self.encoder): - lengths.append(x.shape[-1]) - inject = None - - if self.hybrid and idx < len(self.tencoder): - lengths_t.append(xt.shape[-1]) - tenc = self.tencoder[idx] - xt = tenc(xt) - - if not tenc.empty: saved_t.append(xt) - else: inject = xt - - x = encode(x, inject) - - if idx == 0 and self.freq_emb is not None: - frs = torch.arange(x.shape[-2], device=x.device) - emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x) - x = x + self.freq_emb_scale * emb - - saved.append(x) - - x = torch.zeros_like(x) - if self.hybrid: xt = torch.zeros_like(x) - - for idx, decode in enumerate(self.decoder): - skip = saved.pop(-1) - x, pre = decode(x, skip, lengths.pop(-1)) - - if self.hybrid: offset = self.depth - len(self.tdecoder) - - if self.hybrid and idx >= offset: - tdec = self.tdecoder[idx - offset] - length_t = lengths_t.pop(-1) - - if tdec.empty: - assert pre.shape[2] == 1, pre.shape - - pre = pre[:, :, 0] - xt, _ = tdec(pre, None, length_t) - else: - skip = saved_t.pop(-1) - xt, _ = tdec(xt, skip, length_t) - - assert len(saved) == 0 - assert len(lengths_t) == 0 - assert len(saved_t) == 0 - - S = len(self.sources) - x = x.view(B, S, -1, Fq, T) - x = x * std[:, None] + mean[:, None] - device_type = x.device.type - device_load = 
f"{device_type}:{x.device.index}" if not device_type == "mps" else device_type - x_is_other_gpu = not device_type in ["cuda", "cpu"] - if x_is_other_gpu: x = x.cpu() - zout = self._mask(z, x) - x = self._ispec(zout, length) - if x_is_other_gpu: x = x.to(device_load) - - if self.hybrid: - xt = xt.view(B, S, -1, length) - xt = xt * stdt[:, None] + meant[:, None] - x = xt + x - - return x \ No newline at end of file diff --git a/main/library/uvr5_separator/demucs/htdemucs.py b/main/library/uvr5_separator/demucs/htdemucs.py deleted file mode 100644 index e63a0cdc291cce63e76b305d41fdbce4d114ca1a..0000000000000000000000000000000000000000 --- a/main/library/uvr5_separator/demucs/htdemucs.py +++ /dev/null @@ -1,600 +0,0 @@ -import os -import sys -import math -import torch -import random - -import numpy as np - -from torch import nn -from einops import rearrange -from fractions import Fraction -from torch.nn import functional as F - -sys.path.append(os.getcwd()) - -from .states import capture_init -from .demucs import rescale_module -from main.configs.config import Config -from .hdemucs import pad1d, spectro, ispectro, wiener, ScaledEmbedding, HEncLayer, MultiWrap, HDecLayer - -translations = Config().translations - -def create_sin_embedding(length, dim, shift = 0, device="cpu", max_period=10000): - assert dim % 2 == 0 - pos = shift + torch.arange(length, device=device).view(-1, 1, 1) - half_dim = dim // 2 - adim = torch.arange(dim // 2, device=device).view(1, 1, -1) - phase = pos / (max_period ** (adim / (half_dim - 1))) - return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1) - -def create_2d_sin_embedding(d_model, height, width, device="cpu", max_period=10000): - if d_model % 4 != 0: raise ValueError(translations["dims"].format(dims=d_model)) - pe = torch.zeros(d_model, height, width) - d_model = int(d_model / 2) - div_term = torch.exp(torch.arange(0.0, d_model, 2) * -(math.log(max_period) / d_model)) - pos_w = torch.arange(0.0, width).unsqueeze(1) - pos_h = torch.arange(0.0, height).unsqueeze(1) - pe[0:d_model:2, :, :] = torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1) - pe[1:d_model:2, :, :] = torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1) - pe[d_model::2, :, :] = torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width) - pe[d_model + 1 :: 2, :, :] = torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width) - - return pe[None, :].to(device) - -def create_sin_embedding_cape(length, dim, batch_size, mean_normalize, augment, max_global_shift = 0.0, max_local_shift = 0.0, max_scale = 1.0, device = "cpu", max_period = 10000.0): - assert dim % 2 == 0 - pos = 1.0 * torch.arange(length).view(-1, 1, 1) - pos = pos.repeat(1, batch_size, 1) - if mean_normalize: pos -= torch.nanmean(pos, dim=0, keepdim=True) - - if augment: - delta = np.random.uniform(-max_global_shift, +max_global_shift, size=[1, batch_size, 1]) - delta_local = np.random.uniform(-max_local_shift, +max_local_shift, size=[length, batch_size, 1]) - log_lambdas = np.random.uniform(-np.log(max_scale), +np.log(max_scale), size=[1, batch_size, 1]) - pos = (pos + delta + delta_local) * np.exp(log_lambdas) - - pos = pos.to(device) - half_dim = dim // 2 - adim = torch.arange(dim // 2, device=device).view(1, 1, -1) - phase = pos / (max_period ** (adim / (half_dim - 1))) - return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1).float() - -class MyGroupNorm(nn.GroupNorm): - def __init__(self, *args, **kwargs): - super().__init__(*args, 
**kwargs) - - def forward(self, x): - x = x.transpose(1, 2) - return super().forward(x).transpose(1, 2) - -class LayerScale(nn.Module): - def __init__(self, channels, init = 0, channel_last=False): - super().__init__() - self.channel_last = channel_last - self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True)) - self.scale.data[:] = init - - def forward(self, x): - if self.channel_last: return self.scale * x - else: return self.scale[:, None] * x - -class MyTransformerEncoderLayer(nn.TransformerEncoderLayer): - def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=F.relu, group_norm=0, norm_first=False, norm_out=False, layer_norm_eps=1e-5, layer_scale=False, init_values=1e-4, device=None, dtype=None, sparse=False, mask_type="diag", mask_random_seed=42, sparse_attn_window=500, global_window=50, auto_sparsity=False, sparsity=0.95, batch_first=False): - factory_kwargs = {"device": device, "dtype": dtype} - super().__init__(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, activation=activation, layer_norm_eps=layer_norm_eps, batch_first=batch_first, norm_first=norm_first, device=device, dtype=dtype) - self.auto_sparsity = auto_sparsity - - if group_norm: - self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) - self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) - - self.norm_out = None - if self.norm_first & norm_out: self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model) - - self.gamma_1 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity() - self.gamma_2 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity() - - def forward(self, src, src_mask=None, src_key_padding_mask=None): - x = src - T, B, C = x.shape - - if self.norm_first: - x = x + self.gamma_1(self._sa_block(self.norm1(x), src_mask, src_key_padding_mask)) - x = x + self.gamma_2(self._ff_block(self.norm2(x))) - if self.norm_out: x = self.norm_out(x) - else: - x = self.norm1(x + self.gamma_1(self._sa_block(x, src_mask, src_key_padding_mask))) - x = self.norm2(x + self.gamma_2(self._ff_block(x))) - - return x - -class CrossTransformerEncoder(nn.Module): - def __init__(self, dim, emb = "sin", hidden_scale = 4.0, num_heads = 8, num_layers = 6, cross_first = False, dropout = 0.0, max_positions = 1000, norm_in = True, norm_in_group = False, group_norm = False, norm_first = False, norm_out = False, max_period = 10000.0, weight_decay = 0.0, lr = None, layer_scale = False, gelu = True, sin_random_shift = 0, weight_pos_embed = 1.0, cape_mean_normalize = True, cape_augment = True, cape_glob_loc_scale = [5000.0, 1.0, 1.4], sparse_self_attn = False, sparse_cross_attn = False, mask_type = "diag", mask_random_seed = 42, sparse_attn_window = 500, global_window = 50, auto_sparsity = False, sparsity = 0.95): - super().__init__() - assert dim % num_heads == 0 - hidden_dim = int(dim * hidden_scale) - self.num_layers = num_layers - self.classic_parity = 1 if cross_first else 0 - self.emb = emb - self.max_period = max_period - self.weight_decay = weight_decay - self.weight_pos_embed = weight_pos_embed - self.sin_random_shift = sin_random_shift - - if emb == "cape": - self.cape_mean_normalize = cape_mean_normalize - self.cape_augment = cape_augment - self.cape_glob_loc_scale = cape_glob_loc_scale - - if emb == "scaled": self.position_embeddings = ScaledEmbedding(max_positions, dim, scale=0.2) - - self.lr = lr - activation = F.gelu if gelu else F.relu - - 
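- # Annotation: the layer stack assembled below alternates between classic
- # self-attention layers (applied separately to the spectral branch x and
- # the temporal branch xt) and cross-attention layers that exchange
- # information between the two branches; `classic_parity`, derived from
- # `cross_first`, decides which kind occupies the even indices.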
if norm_in: - self.norm_in = nn.LayerNorm(dim) - self.norm_in_t = nn.LayerNorm(dim) - elif norm_in_group: - self.norm_in = MyGroupNorm(int(norm_in_group), dim) - self.norm_in_t = MyGroupNorm(int(norm_in_group), dim) - else: - self.norm_in = nn.Identity() - self.norm_in_t = nn.Identity() - - self.layers = nn.ModuleList() - self.layers_t = nn.ModuleList() - - kwargs_common = { - "d_model": dim, - "nhead": num_heads, - "dim_feedforward": hidden_dim, - "dropout": dropout, - "activation": activation, - "group_norm": group_norm, - "norm_first": norm_first, - "norm_out": norm_out, - "layer_scale": layer_scale, - "mask_type": mask_type, - "mask_random_seed": mask_random_seed, - "sparse_attn_window": sparse_attn_window, - "global_window": global_window, - "sparsity": sparsity, - "auto_sparsity": auto_sparsity, - "batch_first": True, - } - - kwargs_classic_encoder = dict(kwargs_common) - kwargs_classic_encoder.update({"sparse": sparse_self_attn}) - kwargs_cross_encoder = dict(kwargs_common) - kwargs_cross_encoder.update({"sparse": sparse_cross_attn}) - - for idx in range(num_layers): - if idx % 2 == self.classic_parity: - self.layers.append(MyTransformerEncoderLayer(**kwargs_classic_encoder)) - self.layers_t.append(MyTransformerEncoderLayer(**kwargs_classic_encoder)) - else: - self.layers.append(CrossTransformerEncoderLayer(**kwargs_cross_encoder)) - self.layers_t.append(CrossTransformerEncoderLayer(**kwargs_cross_encoder)) - - def forward(self, x, xt): - B, C, Fr, T1 = x.shape - - pos_emb_2d = create_2d_sin_embedding(C, Fr, T1, x.device, self.max_period) - pos_emb_2d = rearrange(pos_emb_2d, "b c fr t1 -> b (t1 fr) c") - - x = rearrange(x, "b c fr t1 -> b (t1 fr) c") - x = self.norm_in(x) - x = x + self.weight_pos_embed * pos_emb_2d - - B, C, T2 = xt.shape - xt = rearrange(xt, "b c t2 -> b t2 c") - - pos_emb = self._get_pos_embedding(T2, B, C, x.device) - pos_emb = rearrange(pos_emb, "t2 b c -> b t2 c") - - xt = self.norm_in_t(xt) - xt = xt + self.weight_pos_embed * pos_emb - - for idx in range(self.num_layers): - if idx % 2 == self.classic_parity: - x = self.layers[idx](x) - xt = self.layers_t[idx](xt) - else: - old_x = x - x = self.layers[idx](x, xt) - xt = self.layers_t[idx](xt, old_x) - - x = rearrange(x, "b (t1 fr) c -> b c fr t1", t1=T1) - xt = rearrange(xt, "b t2 c -> b c t2") - return x, xt - - def _get_pos_embedding(self, T, B, C, device): - if self.emb == "sin": - shift = random.randrange(self.sin_random_shift + 1) - pos_emb = create_sin_embedding(T, C, shift=shift, device=device, max_period=self.max_period) - elif self.emb == "cape": - if self.training: pos_emb = create_sin_embedding_cape(T, C, B, device=device, max_period=self.max_period, mean_normalize=self.cape_mean_normalize, augment=self.cape_augment, max_global_shift=self.cape_glob_loc_scale[0], max_local_shift=self.cape_glob_loc_scale[1], max_scale=self.cape_glob_loc_scale[2]) - else: pos_emb = create_sin_embedding_cape(T, C, B, device=device, max_period=self.max_period, mean_normalize=self.cape_mean_normalize, augment=False) - elif self.emb == "scaled": - pos = torch.arange(T, device=device) - pos_emb = self.position_embeddings(pos)[:, None] - - return pos_emb - - def make_optim_group(self): - group = {"params": list(self.parameters()), "weight_decay": self.weight_decay} - if self.lr is not None: group["lr"] = self.lr - return group - -class CrossTransformerEncoderLayer(nn.Module): - def __init__(self, d_model, nhead, dim_feedforward = 2048, dropout = 0.1, activation=F.relu, layer_norm_eps = 1e-5, layer_scale = False, init_values 
= 1e-4, norm_first = False, group_norm = False, norm_out = False, sparse=False, mask_type="diag", mask_random_seed=42, sparse_attn_window=500, global_window=50, sparsity=0.95, auto_sparsity=None, device=None, dtype=None, batch_first=False): - factory_kwargs = {"device": device, "dtype": dtype} - super().__init__() - self.auto_sparsity = auto_sparsity - self.cross_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first) - self.linear1 = nn.Linear(d_model, dim_feedforward, **factory_kwargs) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model, **factory_kwargs) - self.norm_first = norm_first - - if group_norm: - self.norm1 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) - self.norm2 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) - self.norm3 = MyGroupNorm(int(group_norm), d_model, eps=layer_norm_eps, **factory_kwargs) - else: - self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) - self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) - self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) - - self.norm_out = None - if self.norm_first & norm_out: - self.norm_out = MyGroupNorm(num_groups=int(norm_out), num_channels=d_model) - - self.gamma_1 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity() - self.gamma_2 = LayerScale(d_model, init_values, True) if layer_scale else nn.Identity() - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - - if isinstance(activation, str): self.activation = self._get_activation_fn(activation) - else: self.activation = activation - - def forward(self, q, k, mask=None): - if self.norm_first: - x = q + self.gamma_1(self._ca_block(self.norm1(q), self.norm2(k), mask)) - x = x + self.gamma_2(self._ff_block(self.norm3(x))) - - if self.norm_out: x = self.norm_out(x) - else: - x = self.norm1(q + self.gamma_1(self._ca_block(q, k, mask))) - x = self.norm2(x + self.gamma_2(self._ff_block(x))) - - return x - - def _ca_block(self, q, k, attn_mask=None): - x = self.cross_attn(q, k, k, attn_mask=attn_mask, need_weights=False)[0] - return self.dropout1(x) - - def _ff_block(self, x): - x = self.linear2(self.dropout(self.activation(self.linear1(x)))) - return self.dropout2(x) - - def _get_activation_fn(self, activation): - if activation == "relu": return F.relu - elif activation == "gelu": return F.gelu - raise RuntimeError(translations["activation"].format(activation=activation)) - -class HTDemucs(nn.Module): - @capture_init - def __init__(self, sources, audio_channels=2, channels=48, channels_time=None, growth=2, nfft=4096, wiener_iters=0, end_iters=0, wiener_residual=False, cac=True, depth=4, rewrite=True, multi_freqs=None, multi_freqs_depth=3, freq_emb=0.2, emb_scale=10, emb_smooth=True, kernel_size=8, time_stride=2, stride=4, context=1, context_enc=0, norm_starts=4, norm_groups=4, dconv_mode=1, dconv_depth=2, dconv_comp=8, dconv_init=1e-3, bottom_channels=0, t_layers=5, t_emb="sin", t_hidden_scale=4.0, t_heads=8, t_dropout=0.0, t_max_positions=10000, t_norm_in=True, t_norm_in_group=False, t_group_norm=False, t_norm_first=True, t_norm_out=True, t_max_period=10000.0, t_weight_decay=0.0, t_lr=None, t_layer_scale=True, t_gelu=True, t_weight_pos_embed=1.0, t_sin_random_shift=0, t_cape_mean_normalize=True, t_cape_augment=True, t_cape_glob_loc_scale=[5000.0, 1.0, 1.4], t_sparse_self_attn=False, t_sparse_cross_attn=False, t_mask_type="diag", 
t_mask_random_seed=42, t_sparse_attn_window=500, t_global_window=100, t_sparsity=0.95, t_auto_sparsity=False, t_cross_first=False, rescale=0.1, samplerate=44100, segment=4 * 10, use_train_segment=True): - super().__init__() - self.cac = cac - self.wiener_residual = wiener_residual - self.audio_channels = audio_channels - self.sources = sources - self.kernel_size = kernel_size - self.context = context - self.stride = stride - self.depth = depth - self.bottom_channels = bottom_channels - self.channels = channels - self.samplerate = samplerate - self.segment = segment - self.use_train_segment = use_train_segment - self.nfft = nfft - self.hop_length = nfft // 4 - self.wiener_iters = wiener_iters - self.end_iters = end_iters - self.freq_emb = None - assert wiener_iters == end_iters - self.encoder = nn.ModuleList() - self.decoder = nn.ModuleList() - self.tencoder = nn.ModuleList() - self.tdecoder = nn.ModuleList() - chin = audio_channels - chin_z = chin - if self.cac: chin_z *= 2 - chout = channels_time or channels - chout_z = channels - freqs = nfft // 2 - - for index in range(depth): - norm = index >= norm_starts - freq = freqs > 1 - stri = stride - ker = kernel_size - - if not freq: - assert freqs == 1 - ker = time_stride * 2 - stri = time_stride - - pad = True - last_freq = False - - if freq and freqs <= kernel_size: - ker = freqs - pad = False - last_freq = True - - kw = { - "kernel_size": ker, - "stride": stri, - "freq": freq, - "pad": pad, - "norm": norm, - "rewrite": rewrite, - "norm_groups": norm_groups, - "dconv_kw": {"depth": dconv_depth, "compress": dconv_comp, "init": dconv_init, "gelu": True}, - } - - kwt = dict(kw) - kwt["freq"] = 0 - kwt["kernel_size"] = kernel_size - kwt["stride"] = stride - kwt["pad"] = True - kw_dec = dict(kw) - multi = False - - if multi_freqs and index < multi_freqs_depth: - multi = True - kw_dec["context_freq"] = False - - if last_freq: - chout_z = max(chout, chout_z) - chout = chout_z - - enc = HEncLayer(chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw) - if freq: - tenc = HEncLayer(chin, chout, dconv=dconv_mode & 1, context=context_enc, empty=last_freq, **kwt) - self.tencoder.append(tenc) - - if multi: enc = MultiWrap(enc, multi_freqs) - - self.encoder.append(enc) - if index == 0: - chin = self.audio_channels * len(self.sources) - chin_z = chin - if self.cac: chin_z *= 2 - - dec = HDecLayer(chout_z, chin_z, dconv=dconv_mode & 2, last=index == 0, context=context, **kw_dec) - if multi: dec = MultiWrap(dec, multi_freqs) - - if freq: - tdec = HDecLayer(chout, chin, dconv=dconv_mode & 2, empty=last_freq, last=index == 0, context=context, **kwt) - self.tdecoder.insert(0, tdec) - - self.decoder.insert(0, dec) - chin = chout - chin_z = chout_z - chout = int(growth * chout) - chout_z = int(growth * chout_z) - - if freq: - if freqs <= kernel_size: freqs = 1 - else: freqs //= stride - - if index == 0 and freq_emb: - self.freq_emb = ScaledEmbedding(freqs, chin_z, smooth=emb_smooth, scale=emb_scale) - self.freq_emb_scale = freq_emb - - if rescale: rescale_module(self, reference=rescale) - transformer_channels = channels * growth ** (depth - 1) - - if bottom_channels: - self.channel_upsampler = nn.Conv1d(transformer_channels, bottom_channels, 1) - self.channel_downsampler = nn.Conv1d(bottom_channels, transformer_channels, 1) - self.channel_upsampler_t = nn.Conv1d(transformer_channels, bottom_channels, 1) - self.channel_downsampler_t = nn.Conv1d(bottom_channels, transformer_channels, 1) - transformer_channels = bottom_channels - - if t_layers > 0: 
self.crosstransformer = CrossTransformerEncoder(dim=transformer_channels, emb=t_emb, hidden_scale=t_hidden_scale, num_heads=t_heads, num_layers=t_layers, cross_first=t_cross_first, dropout=t_dropout, max_positions=t_max_positions, norm_in=t_norm_in, norm_in_group=t_norm_in_group, group_norm=t_group_norm, norm_first=t_norm_first, norm_out=t_norm_out, max_period=t_max_period, weight_decay=t_weight_decay, lr=t_lr, layer_scale=t_layer_scale, gelu=t_gelu, sin_random_shift=t_sin_random_shift, weight_pos_embed=t_weight_pos_embed, cape_mean_normalize=t_cape_mean_normalize, cape_augment=t_cape_augment, cape_glob_loc_scale=t_cape_glob_loc_scale, sparse_self_attn=t_sparse_self_attn, sparse_cross_attn=t_sparse_cross_attn, mask_type=t_mask_type, mask_random_seed=t_mask_random_seed, sparse_attn_window=t_sparse_attn_window, global_window=t_global_window, sparsity=t_sparsity, auto_sparsity=t_auto_sparsity) - else: self.crosstransformer = None - - def _spec(self, x): - hl = self.hop_length - nfft = self.nfft - assert hl == nfft // 4 - le = int(math.ceil(x.shape[-1] / hl)) - pad = hl // 2 * 3 - x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode="reflect") - z = spectro(x, nfft, hl)[..., :-1, :] - assert z.shape[-1] == le + 4, (z.shape, x.shape, le) - z = z[..., 2 : 2 + le] - return z - - def _ispec(self, z, length=None, scale=0): - hl = self.hop_length // (4**scale) - z = F.pad(z, (0, 0, 0, 1)) - z = F.pad(z, (2, 2)) - pad = hl // 2 * 3 - le = hl * int(math.ceil(length / hl)) + 2 * pad - x = ispectro(z, hl, length=le) - x = x[..., pad : pad + length] - return x - - def _magnitude(self, z): - if self.cac: - B, C, Fr, T = z.shape - m = torch.view_as_real(z).permute(0, 1, 4, 2, 3) - m = m.reshape(B, C * 2, Fr, T) - else: m = z.abs() - return m - - def _mask(self, z, m): - niters = self.wiener_iters - if self.cac: - B, S, C, Fr, T = m.shape - out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3) - out = torch.view_as_complex(out.contiguous()) - return out - - if self.training: niters = self.end_iters - - if niters < 0: - z = z[:, None] - return z / (1e-8 + z.abs()) * m - else: return self._wiener(m, z, niters) - - def _wiener(self, mag_out, mix_stft, niters): - init = mix_stft.dtype - wiener_win_len = 300 - residual = self.wiener_residual - B, S, C, Fq, T = mag_out.shape - mag_out = mag_out.permute(0, 4, 3, 2, 1) - mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1)) - - outs = [] - - for sample in range(B): - pos = 0 - out = [] - - for pos in range(0, T, wiener_win_len): - frame = slice(pos, pos + wiener_win_len) - z_out = wiener(mag_out[sample, frame], mix_stft[sample, frame], niters, residual=residual) - out.append(z_out.transpose(-1, -2)) - - outs.append(torch.cat(out, dim=0)) - - out = torch.view_as_complex(torch.stack(outs, 0)) - out = out.permute(0, 4, 3, 2, 1).contiguous() - - if residual: out = out[:, :-1] - assert list(out.shape) == [B, S, C, Fq, T] - return out.to(init) - - def valid_length(self, length): - if not self.use_train_segment: return length - - training_length = int(self.segment * self.samplerate) - if training_length < length: raise ValueError(translations["length_or_training_length"].format(length=length, training_length=training_length)) - - return training_length - - def forward(self, mix): - length = mix.shape[-1] - length_pre_pad = None - - if self.use_train_segment: - if self.training: self.segment = Fraction(mix.shape[-1], self.samplerate) - else: - training_length = int(self.segment * self.samplerate) - - if mix.shape[-1] < training_length: - length_pre_pad = 
mix.shape[-1] - mix = F.pad(mix, (0, training_length - length_pre_pad)) - - z = self._spec(mix) - mag = self._magnitude(z).to(mix.device) - x = mag - B, C, Fq, T = x.shape - mean = x.mean(dim=(1, 2, 3), keepdim=True) - std = x.std(dim=(1, 2, 3), keepdim=True) - x = (x - mean) / (1e-5 + std) - xt = mix - meant = xt.mean(dim=(1, 2), keepdim=True) - stdt = xt.std(dim=(1, 2), keepdim=True) - xt = (xt - meant) / (1e-5 + stdt) - - saved, saved_t, lengths, lengths_t = [], [], [], [] - - for idx, encode in enumerate(self.encoder): - lengths.append(x.shape[-1]) - inject = None - - if idx < len(self.tencoder): - lengths_t.append(xt.shape[-1]) - tenc = self.tencoder[idx] - xt = tenc(xt) - - if not tenc.empty: saved_t.append(xt) - else: inject = xt - - x = encode(x, inject) - if idx == 0 and self.freq_emb is not None: - frs = torch.arange(x.shape[-2], device=x.device) - emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x) - x = x + self.freq_emb_scale * emb - - saved.append(x) - - if self.crosstransformer: - if self.bottom_channels: - b, c, f, t = x.shape - x = rearrange(x, "b c f t-> b c (f t)") - x = self.channel_upsampler(x) - x = rearrange(x, "b c (f t)-> b c f t", f=f) - xt = self.channel_upsampler_t(xt) - - x, xt = self.crosstransformer(x, xt) - - if self.bottom_channels: - x = rearrange(x, "b c f t-> b c (f t)") - x = self.channel_downsampler(x) - x = rearrange(x, "b c (f t)-> b c f t", f=f) - xt = self.channel_downsampler_t(xt) - - for idx, decode in enumerate(self.decoder): - skip = saved.pop(-1) - x, pre = decode(x, skip, lengths.pop(-1)) - offset = self.depth - len(self.tdecoder) - - if idx >= offset: - tdec = self.tdecoder[idx - offset] - length_t = lengths_t.pop(-1) - - if tdec.empty: - assert pre.shape[2] == 1, pre.shape - pre = pre[:, :, 0] - xt, _ = tdec(pre, None, length_t) - else: - skip = saved_t.pop(-1) - xt, _ = tdec(xt, skip, length_t) - - assert len(saved) == 0 - assert len(lengths_t) == 0 - assert len(saved_t) == 0 - - S = len(self.sources) - x = x.view(B, S, -1, Fq, T) - x = x * std[:, None] + mean[:, None] - device_type = x.device.type - device_load = f"{device_type}:{x.device.index}" if not device_type == "mps" else device_type - x_is_other_gpu = not device_type in ["cuda", "cpu"] - if x_is_other_gpu: x = x.cpu() - zout = self._mask(z, x) - - if self.use_train_segment: x = self._ispec(zout, length) if self.training else self._ispec(zout, training_length) - else: x = self._ispec(zout, length) - - if x_is_other_gpu: x = x.to(device_load) - - if self.use_train_segment: xt = xt.view(B, S, -1, length) if self.training else xt.view(B, S, -1, training_length) - else: xt = xt.view(B, S, -1, length) - - xt = xt * stdt[:, None] + meant[:, None] - x = xt + x - - if length_pre_pad: x = x[..., :length_pre_pad] - return x \ No newline at end of file diff --git a/main/library/uvr5_separator/demucs/states.py b/main/library/uvr5_separator/demucs/states.py deleted file mode 100644 index da1fbc71ca83086c4d6472d229de82d1c43fa651..0000000000000000000000000000000000000000 --- a/main/library/uvr5_separator/demucs/states.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -import sys -import torch -import inspect -import warnings -import functools - -from pathlib import Path - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -translations = Config().translations - -def load_model(path_or_package, strict=False): - if isinstance(path_or_package, dict): package = path_or_package - elif isinstance(path_or_package, (str, Path)): - with warnings.catch_warnings(): - 
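- # Annotation: loading a pickled model package with torch.load tends to
- # emit deprecation/pickle warnings, so they are silenced for this call only.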
warnings.simplefilter("ignore") - package = torch.load(path_or_package, map_location="cpu") - else: raise ValueError(f"{translations['type_not_valid']} {path_or_package}.") - klass = package["klass"] - args = package["args"] - kwargs = package["kwargs"] - if strict: model = klass(*args, **kwargs) - else: - sig = inspect.signature(klass) - for key in list(kwargs): - if key not in sig.parameters: - warnings.warn(translations["del_parameter"] + key) - del kwargs[key] - model = klass(*args, **kwargs) - state = package["state"] - set_state(model, state) - return model - -def restore_quantized_state(model, state): - assert "meta" in state - quantizer = state["meta"]["klass"](model, **state["meta"]["init_kwargs"]) - quantizer.restore_quantized_state(state) - quantizer.detach() - -def set_state(model, state, quantizer=None): - if state.get("__quantized"): - if quantizer is not None: quantizer.restore_quantized_state(model, state["quantized"]) - else: restore_quantized_state(model, state) - else: model.load_state_dict(state) - return state - -def capture_init(init): - @functools.wraps(init) - def __init__(self, *args, **kwargs): - self._init_args_kwargs = (args, kwargs) - init(self, *args, **kwargs) - return __init__ \ No newline at end of file diff --git a/main/library/uvr5_separator/demucs/utils.py b/main/library/uvr5_separator/demucs/utils.py deleted file mode 100644 index f01fc79a1c4c0af99814bb5923015cb426aaf105..0000000000000000000000000000000000000000 --- a/main/library/uvr5_separator/demucs/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -import torch - -def center_trim(tensor, reference): - ref_size = reference.size(-1) if isinstance(reference, torch.Tensor) else reference - delta = tensor.size(-1) - ref_size - if delta < 0: raise ValueError(f"tensor > parameter: {delta}.") - if delta: tensor = tensor[..., delta // 2 : -(delta - delta // 2)] - return tensor \ No newline at end of file diff --git a/main/library/uvr5_separator/spec_utils.py b/main/library/uvr5_separator/spec_utils.py deleted file mode 100644 index 803265b18fd8b18162d7b0bf6b3e5eeb026122b4..0000000000000000000000000000000000000000 --- a/main/library/uvr5_separator/spec_utils.py +++ /dev/null @@ -1,900 +0,0 @@ -import os -import six -import sys -import librosa -import tempfile -import platform -import subprocess - -import numpy as np -import soundfile as sf - -from scipy.signal import correlate, hilbert - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -translations = Config().translations - -OPERATING_SYSTEM = platform.system() -SYSTEM_ARCH = platform.platform() -SYSTEM_PROC = platform.processor() -ARM = "arm" -AUTO_PHASE = "Automatic" -POSITIVE_PHASE = "Positive Phase" -NEGATIVE_PHASE = "Negative Phase" -NONE_P = ("None",) -LOW_P = ("Shifts: Low",) -MED_P = ("Shifts: Medium",) -HIGH_P = ("Shifts: High",) -VHIGH_P = "Shifts: Very High" -MAXIMUM_P = "Shifts: Maximum" -BASE_PATH_RUB = sys._MEIPASS if getattr(sys, 'frozen', False) else os.path.dirname(os.path.abspath(__file__)) -DEVNULL = open(os.devnull, 'w') if six.PY2 else subprocess.DEVNULL -MAX_SPEC = "Max Spec" -MIN_SPEC = "Min Spec" -LIN_ENSE = "Linear Ensemble" -MAX_WAV = MAX_SPEC -MIN_WAV = MIN_SPEC -AVERAGE = "Average" - -progress_value, last_update_time = 0, 0 -wav_resolution = "sinc_fastest" -wav_resolution_float_resampling = wav_resolution - -def crop_center(h1, h2): - h1_shape = h1.size() - h2_shape = h2.size() - - if h1_shape[3] == h2_shape[3]: return h1 - elif h1_shape[3] < h2_shape[3]: raise ValueError("h1_shape[3] > h2_shape[3]") - - s_time = 
(h1_shape[3] - h2_shape[3]) // 2 - - h1 = h1[:, :, :, s_time:s_time + h2_shape[3]] - return h1 - -def preprocess(X_spec): - return np.abs(X_spec), np.angle(X_spec) - -def make_padding(width, cropsize, offset): - roi_size = cropsize - offset * 2 - - if roi_size == 0: roi_size = cropsize - return offset, roi_size - (width % roi_size) + offset, roi_size - -def normalize(wave, max_peak=1.0): - maxv = np.abs(wave).max() - - if maxv > max_peak: wave *= max_peak / maxv - return wave - -def auto_transpose(audio_array): - if audio_array.shape[1] == 2: return audio_array.T - return audio_array - -def write_array_to_mem(audio_data, subtype): - if isinstance(audio_data, np.ndarray): - import io - - audio_buffer = io.BytesIO() - sf.write(audio_buffer, audio_data, 44100, subtype=subtype, format="WAV") - - audio_buffer.seek(0) - return audio_buffer - else: return audio_data - -def spectrogram_to_image(spec, mode="magnitude"): - if mode == "magnitude": y = np.log10((np.abs(spec) if np.iscomplexobj(spec) else spec)**2 + 1e-8) - elif mode == "phase": y = np.angle(spec) if np.iscomplexobj(spec) else spec - - y -= y.min() - y *= 255 / y.max() - img = np.uint8(y) - - if y.ndim == 3: - img = img.transpose(1, 2, 0) - img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) - - return img - -def reduce_vocal_aggressively(X, y, softmask): - y_mag_tmp = np.abs(y) - v_mag_tmp = np.abs(X - y) - - return np.clip(y_mag_tmp - v_mag_tmp * (v_mag_tmp > y_mag_tmp) * softmask, 0, np.inf) * np.exp(1.0j * np.angle(y)) - -def merge_artifacts(y_mask, thres=0.01, min_range=64, fade_size=32): - mask = y_mask - - try: - if min_range < fade_size * 2: raise ValueError("min_range >= fade_size * 2") - - idx = np.where(y_mask.min(axis=(0, 1)) > thres)[0] - start_idx = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0]) - end_idx = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1]) - artifact_idx = np.where(end_idx - start_idx > min_range)[0] - weight = np.zeros_like(y_mask) - - if len(artifact_idx) > 0: - start_idx = start_idx[artifact_idx] - end_idx = end_idx[artifact_idx] - old_e = None - - for s, e in zip(start_idx, end_idx): - if old_e is not None and s - old_e < fade_size: s = old_e - fade_size * 2 - - if s != 0: weight[:, :, s : s + fade_size] = np.linspace(0, 1, fade_size) - else: s -= fade_size - - if e != y_mask.shape[2]: weight[:, :, e - fade_size : e] = np.linspace(1, 0, fade_size) - else: e += fade_size - - weight[:, :, s + fade_size : e - fade_size] = 1 - old_e = e - - v_mask = 1 - y_mask - y_mask += weight * v_mask - mask = y_mask - except Exception as e: - import traceback - print(translations["not_success"], f'{type(e).__name__}: "{e}"\n{traceback.format_exc()}"') - - return mask - -def align_wave_head_and_tail(a, b): - l = min([a[0].size, b[0].size]) - return a[:l, :l], b[:l, :l] - -def convert_channels(spec, mp, band): - cc = mp.param["band"][band].get("convert_channels") - - if "mid_side_c" == cc: - spec_left = np.add(spec[0], spec[1] * 0.25) - spec_right = np.subtract(spec[1], spec[0] * 0.25) - elif "mid_side" == cc: - spec_left = np.add(spec[0], spec[1]) / 2 - spec_right = np.subtract(spec[0], spec[1]) - elif "stereo_n" == cc: - spec_left = np.add(spec[0], spec[1] * 0.25) / 0.9375 - spec_right = np.add(spec[1], spec[0] * 0.25) / 0.9375 - else: return spec - - return np.asfortranarray([spec_left, spec_right]) - -def combine_spectrograms(specs, mp, is_v51_model=False): - l = min([specs[i].shape[2] for i in specs]) - spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), 
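# A quick doctest-style check of normalize above: peaks above max_peak are scaled down, quieter signals pass through untouched.
#
#     >>> normalize(np.array([0.0, 2.0, -4.0]), max_peak=1.0)
#     array([ 0. ,  0.5, -1. ])
#     >>> normalize(np.array([0.0, 0.5]), max_peak=1.0)
#     array([0. , 0.5])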
dtype=np.complex64) - offset = 0 - bands_n = len(mp.param["band"]) - - for d in range(1, bands_n + 1): - h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] - spec_c[:, offset : offset + h, :l] = specs[d][:, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l] - offset += h - - if offset > mp.param["bins"]: raise ValueError("offset > mp.param['bins']") - - if mp.param["pre_filter_start"] > 0: - if is_v51_model: spec_c *= get_lp_filter_mask(spec_c.shape[1], mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) - else: - if bands_n == 1: spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) - else: - import math - gp = 1 - - for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]): - g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0) - gp = g - spec_c[:, b, :] *= g - - return np.asfortranarray(spec_c) - -def wave_to_spectrogram(wave, hop_length, n_fft, mp, band, is_v51_model=False): - if wave.ndim == 1: wave = np.asfortranarray([wave, wave]) - - if not is_v51_model: - if mp.param["reverse"]: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mp.param["mid_side"]: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mp.param["mid_side_b2"]: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) - spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) - - spec = np.asfortranarray([spec_left, spec_right]) - - if is_v51_model: spec = convert_channels(spec, mp, band) - return spec - -def spectrogram_to_wave(spec, hop_length=1024, mp={}, band=0, is_v51_model=True): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - wave_left = librosa.istft(spec_left, hop_length=hop_length) - wave_right = librosa.istft(spec_right, hop_length=hop_length) - - if is_v51_model: - cc = mp.param["band"][band].get("convert_channels") - - if "mid_side_c" == cc: return np.asfortranarray([np.subtract(wave_left / 1.0625, wave_right / 4.25), np.add(wave_right / 1.0625, wave_left / 4.25)]) - elif "mid_side" == cc: return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) - elif "stereo_n" == cc: return np.asfortranarray([np.subtract(wave_left, wave_right * 0.25), np.subtract(wave_right, wave_left * 0.25)]) - else: - if mp.param["reverse"]: return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mp.param["mid_side"]: return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) - elif mp.param["mid_side_b2"]: return np.asfortranarray([np.add(wave_right / 1.25, 0.4 * wave_left), np.subtract(wave_left / 1.25, 0.4 * wave_right)]) - - return np.asfortranarray([wave_left, wave_right]) - -def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None, is_v51_model=False): - bands_n = len(mp.param["band"]) - offset = 0 - - for d in range(1, bands_n + 1): - bp = mp.param["band"][d] - spec_s = np.zeros(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex) - h = 
bp["crop_stop"] - bp["crop_start"] - spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :] - offset += h - - if d == bands_n: - if extra_bins_h: - max_bin = bp["n_fft"] // 2 - spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :] - - if bp["hpf_start"] > 0: - if is_v51_model: spec_s *= get_hp_filter_mask(spec_s.shape[1], bp["hpf_start"], bp["hpf_stop"] - 1) - else: spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - - wave = spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model) if bands_n == 1 else np.add(wave, spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model)) - else: - sr = mp.param["band"][d + 1]["sr"] - if d == 1: - if is_v51_model: spec_s *= get_lp_filter_mask(spec_s.shape[1], bp["lpf_start"], bp["lpf_stop"]) - else: spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - - try: - wave = librosa.resample(spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model), orig_sr=bp["sr"], target_sr=sr, res_type=wav_resolution) - except ValueError as e: - print(f"{translations['resample_error']}: {e}") - print(f"{translations['shapes']} Spec_s: {spec_s.shape}, SR: {sr}, {translations['wav_resolution']}: {wav_resolution}") - else: - if is_v51_model: - spec_s *= get_hp_filter_mask(spec_s.shape[1], bp["hpf_start"], bp["hpf_stop"] - 1) - spec_s *= get_lp_filter_mask(spec_s.shape[1], bp["lpf_start"], bp["lpf_stop"]) - else: - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - - try: - wave = librosa.resample(np.add(wave, spectrogram_to_wave(spec_s, bp["hl"], mp, d, is_v51_model)), orig_sr=bp["sr"], target_sr=sr, res_type=wav_resolution) - except ValueError as e: - print(f"{translations['resample_error']}: {e}") - print(f"{translations['shapes']} Spec_s: {spec_s.shape}, SR: {sr}, {translations['wav_resolution']}: {wav_resolution}") - - return wave - -def get_lp_filter_mask(n_bins, bin_start, bin_stop): - return np.concatenate([np.ones((bin_start - 1, 1)), np.linspace(1, 0, bin_stop - bin_start + 1)[:, None], np.zeros((n_bins - bin_stop, 1))], axis=0) - -def get_hp_filter_mask(n_bins, bin_start, bin_stop): - return np.concatenate([np.zeros((bin_stop + 1, 1)), np.linspace(0, 1, 1 + bin_start - bin_stop)[:, None], np.ones((n_bins - bin_start - 2, 1))], axis=0) - -def fft_lp_filter(spec, bin_start, bin_stop): - g = 1.0 - - for b in range(bin_start, bin_stop): - g -= 1 / (bin_stop - bin_start) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, bin_stop:, :] *= 0 - return spec - -def fft_hp_filter(spec, bin_start, bin_stop): - g = 1.0 - - for b in range(bin_start, bin_stop, -1): - g -= 1 / (bin_start - bin_stop) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, 0 : bin_stop + 1, :] *= 0 - return spec - -def spectrogram_to_wave_old(spec, hop_length=1024): - if spec.ndim == 2: wave = librosa.istft(spec, hop_length=hop_length) - elif spec.ndim == 3: wave = np.asfortranarray([librosa.istft(np.asfortranarray(spec[0]), hop_length=hop_length), librosa.istft(np.asfortranarray(spec[1]), hop_length=hop_length)]) - - return wave - -def wave_to_spectrogram_old(wave, hop_length, n_fft): - return np.asfortranarray([librosa.stft(np.asfortranarray(wave[0]), n_fft=n_fft, hop_length=hop_length), librosa.stft(np.asfortranarray(wave[1]), n_fft=n_fft, hop_length=hop_length)]) - -def mirroring(a, spec_m, input_high_end, mp): - if "mirroring" == a: - mirror = np.flip(np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : 
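# fft_lp_filter above fades the transition band linearly and mutes everything from bin_stop upward. A worked check with bin_start=100, bin_stop=110: the per-bin gains run
#
#     bin 100 -> 0.9, bin 101 -> 0.8, ..., bin 109 -> 0.0, bins >= 110 -> 0.0
#
# so the cutoff is a smooth ramp rather than a hard edge; fft_hp_filter applies the same ramp mirrored downward.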
mp.param["pre_filter_start"] - 10, :]), 1) * np.exp(1.0j * np.angle(input_high_end)) - - return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) - - if "mirroring2" == a: - mi = np.multiply(np.flip(np.abs(spec_m[:, mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :]), 1), input_high_end * 1.7) - - return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) - -def adjust_aggr(mask, is_non_accom_stem, aggressiveness): - aggr = aggressiveness["value"] * 2 - - if aggr != 0: - if is_non_accom_stem: - aggr = 1 - aggr - - if np.any(aggr > 10) or np.any(aggr < -10): print(f"{translations['warnings']}: {aggr}") - - aggr = [aggr, aggr] - - if aggressiveness["aggr_correction"] is not None: - aggr[0] += aggressiveness["aggr_correction"]["left"] - aggr[1] += aggressiveness["aggr_correction"]["right"] - - for ch in range(2): - mask[ch, : aggressiveness["split_bin"]] = np.power(mask[ch, : aggressiveness["split_bin"]], 1 + aggr[ch] / 3) - mask[ch, aggressiveness["split_bin"] :] = np.power(mask[ch, aggressiveness["split_bin"] :], 1 + aggr[ch]) - - return mask - -def stft(wave, nfft, hl): - return np.asfortranarray([librosa.stft(np.asfortranarray(wave[0]), n_fft=nfft, hop_length=hl), librosa.stft(np.asfortranarray(wave[1]), n_fft=nfft, hop_length=hl)]) - -def istft(spec, hl): - return np.asfortranarray([librosa.istft(np.asfortranarray(spec[0]), hop_length=hl), librosa.istft(np.asfortranarray(spec[1]), hop_length=hl)]) - -def spec_effects(wave, algorithm="Default", value=None): - if np.isnan(wave).any() or np.isinf(wave).any(): print(f"{translations['warnings_2']}: {wave.shape}") - spec = [stft(wave[0], 2048, 1024), stft(wave[1], 2048, 1024)] - - if algorithm == "Min_Mag": wave = istft(np.where(np.abs(spec[1]) <= np.abs(spec[0]), spec[1], spec[0]), 1024) - elif algorithm == "Max_Mag": wave = istft(np.where(np.abs(spec[1]) >= np.abs(spec[0]), spec[1], spec[0]), 1024) - elif algorithm == "Default": wave = (wave[1] * value) + (wave[0] * (1 - value)) - elif algorithm == "Invert_p": - X_mag, y_mag = np.abs(spec[0]), np.abs(spec[1]) - wave = istft(spec[1] - np.where(X_mag >= y_mag, X_mag, y_mag) * np.exp(1.0j * np.angle(spec[0])), 1024) - - return wave - -def spectrogram_to_wave_no_mp(spec, n_fft=2048, hop_length=1024): - wave = librosa.istft(spec, n_fft=n_fft, hop_length=hop_length) - if wave.ndim == 1: wave = np.asfortranarray([wave, wave]) - - return wave - -def wave_to_spectrogram_no_mp(wave): - spec = librosa.stft(wave, n_fft=2048, hop_length=1024) - - if spec.ndim == 1: spec = np.asfortranarray([spec, spec]) - return spec - -def invert_audio(specs, invert_p=True): - ln = min([specs[0].shape[2], specs[1].shape[2]]) - specs[0] = specs[0][:, :, :ln] - specs[1] = specs[1][:, :, :ln] - - if invert_p: - X_mag, y_mag = np.abs(specs[0]), np.abs(specs[1]) - v_spec = specs[1] - np.where(X_mag >= y_mag, X_mag, y_mag) * np.exp(1.0j * np.angle(specs[0])) - else: - specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) - v_spec = specs[0] - specs[1] - - return v_spec - -def invert_stem(mixture, stem): - return -spectrogram_to_wave_no_mp(invert_audio([wave_to_spectrogram_no_mp(mixture), wave_to_spectrogram_no_mp(stem)])).T - -def ensembling(a, inputs, is_wavs=False): - for i in range(1, len(inputs)): - if i == 1: input = inputs[0] - - if is_wavs: - ln = min([input.shape[1], inputs[i].shape[1]]) - input = input[:, :ln] - inputs[i] = inputs[i][:, :ln] - else: - ln = min([input.shape[2], inputs[i].shape[2]]) - input = input[:, :, 
:ln] - inputs[i] = inputs[i][:, :, :ln] - - if MIN_SPEC == a: input = np.where(np.abs(inputs[i]) <= np.abs(input), inputs[i], input) - if MAX_SPEC == a: input = np.where(np.abs(inputs[i]) >= np.abs(input), inputs[i], input) - - return input - -def ensemble_for_align(waves): - specs = [] - - for wav in waves: - spec = wave_to_spectrogram_no_mp(wav.T) - specs.append(spec) - - wav_aligned = spectrogram_to_wave_no_mp(ensembling(MIN_SPEC, specs)).T - wav_aligned = match_array_shapes(wav_aligned, waves[1], is_swap=True) - - return wav_aligned - -def ensemble_inputs(audio_input, algorithm, is_normalization, wav_type_set, save_path, is_wave=False, is_array=False): - wavs_ = [] - - if algorithm == AVERAGE: - output = average_audio(audio_input) - samplerate = 44100 - else: - specs = [] - - for i in range(len(audio_input)): - wave, samplerate = librosa.load(audio_input[i], mono=False, sr=44100) - wavs_.append(wave) - specs.append( wave if is_wave else wave_to_spectrogram_no_mp(wave)) - - wave_shapes = [w.shape[1] for w in wavs_] - target_shape = wavs_[wave_shapes.index(max(wave_shapes))] - - output = ensembling(algorithm, specs, is_wavs=True) if is_wave else spectrogram_to_wave_no_mp(ensembling(algorithm, specs)) - output = to_shape(output, target_shape.shape) - - sf.write(save_path, normalize(output.T, is_normalization), samplerate, subtype=wav_type_set) - -def to_shape(x, target_shape): - padding_list = [] - - for x_dim, target_dim in zip(x.shape, target_shape): - padding_list.append((0, target_dim - x_dim)) - - return np.pad(x, tuple(padding_list), mode="constant") - -def to_shape_minimize(x, target_shape): - padding_list = [] - - for x_dim, target_dim in zip(x.shape, target_shape): - padding_list.append((0, target_dim - x_dim)) - - return np.pad(x, tuple(padding_list), mode="constant") - -def detect_leading_silence(audio, sr, silence_threshold=0.007, frame_length=1024): - if len(audio.shape) == 2: - channel = np.argmax(np.sum(np.abs(audio), axis=1)) - audio = audio[channel] - - for i in range(0, len(audio), frame_length): - if np.max(np.abs(audio[i : i + frame_length])) > silence_threshold: return (i / sr) * 1000 - - return (len(audio) / sr) * 1000 - -def adjust_leading_silence(target_audio, reference_audio, silence_threshold=0.01, frame_length=1024): - def find_silence_end(audio): - if len(audio.shape) == 2: - channel = np.argmax(np.sum(np.abs(audio), axis=1)) - audio_mono = audio[channel] - else: audio_mono = audio - - for i in range(0, len(audio_mono), frame_length): - if np.max(np.abs(audio_mono[i : i + frame_length])) > silence_threshold: return i - - return len(audio_mono) - - ref_silence_end = find_silence_end(reference_audio) - target_silence_end = find_silence_end(target_audio) - silence_difference = ref_silence_end - target_silence_end - - try: - silence_difference_p = ((ref_silence_end / 44100) * 1000) - ((target_silence_end / 44100) * 1000) - except Exception as e: - pass - - if silence_difference > 0: return np.hstack((np.zeros((target_audio.shape[0], silence_difference))if len(target_audio.shape) == 2 else np.zeros(silence_difference), target_audio)) - elif silence_difference < 0: return target_audio[:, -silence_difference:]if len(target_audio.shape) == 2 else target_audio[-silence_difference:] - else: return target_audio - -def match_array_shapes(array_1, array_2, is_swap=False): - - if is_swap: array_1, array_2 = array_1.T, array_2.T - - if array_1.shape[1] > array_2.shape[1]: array_1 = array_1[:, : array_2.shape[1]] - elif array_1.shape[1] < array_2.shape[1]: - padding = 
array_2.shape[1] - array_1.shape[1] - array_1 = np.pad(array_1, ((0, 0), (0, padding)), "constant", constant_values=0) - - if is_swap: array_1, array_2 = array_1.T, array_2.T - - return array_1 - -def match_mono_array_shapes(array_1, array_2): - if len(array_1) > len(array_2): array_1 = array_1[: len(array_2)] - elif len(array_1) < len(array_2): - padding = len(array_2) - len(array_1) - array_1 = np.pad(array_1, (0, padding), "constant", constant_values=0) - - return array_1 - -def change_pitch_semitones(y, sr, semitone_shift): - factor = 2 ** (semitone_shift / 12) - y_pitch_tuned = [] - - for y_channel in y: - y_pitch_tuned.append(librosa.resample(y_channel, orig_sr=sr, target_sr=sr * factor, res_type=wav_resolution_float_resampling)) - - y_pitch_tuned = np.array(y_pitch_tuned) - new_sr = sr * factor - - return y_pitch_tuned, new_sr - -def augment_audio(export_path, audio_file, rate, is_normalization, wav_type_set, save_format=None, is_pitch=False, is_time_correction=True): - wav, sr = librosa.load(audio_file, sr=44100, mono=False) - if wav.ndim == 1: wav = np.asfortranarray([wav, wav]) - - if not is_time_correction: wav_mix = change_pitch_semitones(wav, 44100, semitone_shift=-rate)[0] - else: - if is_pitch: wav_1, wav_2 = pitch_shift(wav[0], sr, rate, rbargs=None), pitch_shift(wav[1], sr, rate, rbargs=None) - else: wav_1, wav_2 = time_stretch(wav[0], sr, rate, rbargs=None), time_stretch(wav[1], sr, rate, rbargs=None) - - if wav_1.shape > wav_2.shape: wav_2 = to_shape(wav_2, wav_1.shape) - if wav_1.shape < wav_2.shape: wav_1 = to_shape(wav_1, wav_2.shape) - - wav_mix = np.asfortranarray([wav_1, wav_2]) - - sf.write(export_path, normalize(wav_mix.T, is_normalization), sr, subtype=wav_type_set) - save_format(export_path) - - -def average_audio(audio): - waves, wave_shapes, final_waves = [], [], [] - - for i in range(len(audio)): - wave = librosa.load(audio[i], sr=44100, mono=False) - waves.append(wave[0]) - wave_shapes.append(wave[0].shape[1]) - - wave_shapes_index = wave_shapes.index(max(wave_shapes)) - target_shape = waves[wave_shapes_index] - - waves.pop(wave_shapes_index) - final_waves.append(target_shape) - - for n_array in waves: - wav_target = to_shape(n_array, target_shape.shape) - final_waves.append(wav_target) - - waves = sum(final_waves) - return waves / len(audio) - -def average_dual_sources(wav_1, wav_2, value): - if wav_1.shape > wav_2.shape: wav_2 = to_shape(wav_2, wav_1.shape) - if wav_1.shape < wav_2.shape: wav_1 = to_shape(wav_1, wav_2.shape) - - return (wav_1 * value) + (wav_2 * (1 - value)) - -def reshape_sources(wav_1, wav_2): - if wav_1.shape > wav_2.shape: wav_2 = to_shape(wav_2, wav_1.shape) - - if wav_1.shape < wav_2.shape: - ln = min([wav_1.shape[1], wav_2.shape[1]]) - wav_2 = wav_2[:, :ln] - - ln = min([wav_1.shape[1], wav_2.shape[1]]) - wav_1 = wav_1[:, :ln] - wav_2 = wav_2[:, :ln] - - return wav_2 - -def reshape_sources_ref(wav_1_shape, wav_2): - if wav_1_shape > wav_2.shape: wav_2 = to_shape(wav_2, wav_1_shape) - return wav_2 - -def combine_arrarys(audio_sources, is_swap=False): - source = np.zeros_like(max(audio_sources, key=np.size)) - - for v in audio_sources: - v = match_array_shapes(v, source, is_swap=is_swap) - source += v - - return source - -def combine_audio(paths, audio_file_base=None, wav_type_set="FLOAT", save_format=None): - source = combine_arrarys([load_audio(i) for i in paths]) - save_path = f"{audio_file_base}_combined.wav" - sf.write(save_path, source.T, 44100, subtype=wav_type_set) - save_format(save_path) - -def reduce_mix_bv(inst_source, 
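# A doctest-style check of match_mono_array_shapes above: the first array is truncated or zero-padded to the length of the second.
#
#     >>> match_mono_array_shapes(np.array([1., 2., 3.]), np.zeros(5))
#     array([1., 2., 3., 0., 0.])
#     >>> match_mono_array_shapes(np.array([1., 2., 3.]), np.zeros(2))
#     array([1., 2.])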
voc_source, reduction_rate=0.9): - return combine_arrarys([inst_source * (1 - reduction_rate), voc_source], is_swap=True) - -def organize_inputs(inputs): - input_list = {"target": None, "reference": None, "reverb": None, "inst": None} - - for i in inputs: - if i.endswith("_(Vocals).wav"): input_list["reference"] = i - elif "_RVC_" in i: input_list["target"] = i - elif i.endswith("reverbed_stem.wav"): input_list["reverb"] = i - elif i.endswith("_(Instrumental).wav"): input_list["inst"] = i - - return input_list - -def check_if_phase_inverted(wav1, wav2, is_mono=False): - if not is_mono: - wav1 = np.mean(wav1, axis=0) - wav2 = np.mean(wav2, axis=0) - - return np.corrcoef(wav1[:1000], wav2[:1000])[0, 1] < 0 - -def align_audio(file1, file2, file2_aligned, file_subtracted, wav_type_set, is_save_aligned, command_Text, save_format, align_window, align_intro_val, db_analysis, set_progress_bar, phase_option, phase_shifts, is_match_silence, is_spec_match): - global progress_value - progress_value = 0 - is_mono = False - - def get_diff(a, b): - return np.correlate(a, b, "full").argmax() - (b.shape[0] - 1) - - def progress_bar(length): - global progress_value - progress_value += 1 - - if (0.90 / length * progress_value) >= 0.9: length = progress_value + 1 - set_progress_bar(0.1, (0.9 / length * progress_value)) - - wav1, sr1 = librosa.load(file1, sr=44100, mono=False) - wav2, sr2 = librosa.load(file2, sr=44100, mono=False) - - if wav1.ndim == 1 and wav2.ndim == 1: is_mono = True - elif wav1.ndim == 1: wav1 = np.asfortranarray([wav1, wav1]) - elif wav2.ndim == 1: wav2 = np.asfortranarray([wav2, wav2]) - - if phase_option == AUTO_PHASE: - if check_if_phase_inverted(wav1, wav2, is_mono=is_mono): wav2 = -wav2 - elif phase_option == POSITIVE_PHASE: wav2 = +wav2 - elif phase_option == NEGATIVE_PHASE: wav2 = -wav2 - - if is_match_silence: wav2 = adjust_leading_silence(wav2, wav1) - - wav1_length = int(librosa.get_duration(y=wav1, sr=44100)) - wav2_length = int(librosa.get_duration(y=wav2, sr=44100)) - - if not is_mono: - wav1 = wav1.transpose() - wav2 = wav2.transpose() - - wav2_org = wav2.copy() - - command_Text(translations["process_file"]) - seconds_length = min(wav1_length, wav2_length) - wav2_aligned_sources = [] - - for sec_len in align_intro_val: - sec_seg = 1 if sec_len == 1 else int(seconds_length // sec_len) - index = sr1 * sec_seg - - if is_mono: - samp1, samp2 = wav1[index : index + sr1], wav2[index : index + sr1] - diff = get_diff(samp1, samp2) - else: - index = sr1 * sec_seg - samp1, samp2 = wav1[index : index + sr1, 0], wav2[index : index + sr1, 0] - samp1_r, samp2_r = wav1[index : index + sr1, 1], wav2[index : index + sr1, 1] - diff, _ = get_diff(samp1, samp2), get_diff(samp1_r, samp2_r) - - if diff > 0: wav2_aligned = np.append(np.zeros(diff) if is_mono else np.zeros((diff, 2)), wav2_org, axis=0) - elif diff < 0: wav2_aligned = wav2_org[-diff:] - else: wav2_aligned = wav2_org - - if not any(np.array_equal(wav2_aligned, source) for source in wav2_aligned_sources): wav2_aligned_sources.append(wav2_aligned) - - unique_sources = len(wav2_aligned_sources) - sub_mapper_big_mapper = {} - - for s in wav2_aligned_sources: - wav2_aligned = match_mono_array_shapes(s, wav1) if is_mono else match_array_shapes(s, wav1, is_swap=True) - - if align_window: - wav_sub = time_correction(wav1, wav2_aligned, seconds_length, align_window=align_window, db_analysis=db_analysis, progress_bar=progress_bar, unique_sources=unique_sources, phase_shifts=phase_shifts) - sub_mapper_big_mapper = {**sub_mapper_big_mapper, 
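# check_if_phase_inverted above infers polarity from the correlation of the first 1000 samples, so a sign-flipped copy of any signal is caught. A minimal check:
#
#     >>> t = np.linspace(0, 1, 1000)
#     >>> s = np.sin(2 * np.pi * 5 * t)
#     >>> check_if_phase_inverted(s, -s, is_mono=True)
#     True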
**{np.abs(wav_sub).mean(): wav_sub}}
- else:
- wav2_aligned = wav2_aligned * np.power(10, db_analysis[0] / 20)
- 
- for db_adjustment in db_analysis[1]:
- wav_sub = wav1 - (wav2_aligned * (10 ** (db_adjustment / 20)))
- sub_mapper_big_mapper = {**sub_mapper_big_mapper, **{np.abs(wav_sub).mean(): wav_sub}}
- 
- wav_sub = ensemble_for_align(list(sub_mapper_big_mapper.values())) if is_spec_match and len(list(sub_mapper_big_mapper.values())) >= 2 else ensemble_wav(list(sub_mapper_big_mapper.values()))
- wav_sub = np.clip(wav_sub, -1, +1)
- 
- command_Text(translations["save_instruments"])
- 
- if is_save_aligned or is_spec_match:
- wav1 = match_mono_array_shapes(wav1, wav_sub) if is_mono else match_array_shapes(wav1, wav_sub, is_swap=True)
- wav2_aligned = wav1 - wav_sub
- 
- if is_spec_match:
- if wav1.ndim == 1 and wav2.ndim == 1:
- wav2_aligned = np.asfortranarray([wav2_aligned, wav2_aligned]).T
- wav1 = np.asfortranarray([wav1, wav1]).T
- 
- wav2_aligned = ensemble_for_align([wav2_aligned, wav1])
- wav_sub = wav1 - wav2_aligned
- 
- if is_save_aligned:
- sf.write(file2_aligned, wav2_aligned, sr1, subtype=wav_type_set)
- save_format(file2_aligned)
- 
- sf.write(file_subtracted, wav_sub, sr1, subtype=wav_type_set)
- save_format(file_subtracted)
- 
-def phase_shift_hilbert(signal, degree):
- analytic_signal = hilbert(signal)
- return np.cos(np.radians(degree)) * analytic_signal.real - np.sin(np.radians(degree)) * analytic_signal.imag
- 
-def get_phase_shifted_tracks(track, phase_shift):
- if phase_shift == 180: return [track, -track]
- 
- step = phase_shift
- end = 180 - (180 % step) if 180 % step == 0 else 181
- phase_range = range(step, end, step)
- flipped_list = [track, -track]
- 
- for i in phase_range:
- flipped_list.extend([phase_shift_hilbert(track, i), phase_shift_hilbert(track, -i)])
- 
- return flipped_list
- 
-def time_correction(mix, instrumental, seconds_length, align_window, db_analysis, sr=44100, progress_bar=None, unique_sources=None, phase_shifts=NONE_P):
- def align_tracks(track1, track2):
- shifted_tracks = {}
- track2 = track2 * np.power(10, db_analysis[0] / 20)
- track2_flipped = [track2] if phase_shifts == 190 else get_phase_shifted_tracks(track2, phase_shifts)
- 
- for db_adjustment in db_analysis[1]:
- for t in track2_flipped:
- track2_adjusted = t * (10 ** (db_adjustment / 20))
- track2_shifted = np.roll(track2_adjusted, shift=np.argmax(np.abs(correlate(track1, track2_adjusted))) - (len(track1) - 1))
- shifted_tracks[np.abs(track1 - track2_shifted).mean()] = track2_shifted
- 
- return shifted_tracks[min(shifted_tracks.keys())]
- 
- assert mix.shape == instrumental.shape, translations["assert"].format(mixshape=mix.shape, instrumentalshape=instrumental.shape)
- seconds_length = seconds_length // 2
- 
- sub_mapper = {}
- progress_update_interval, total_iterations = 120, 0
- 
- if len(align_window) > 2: progress_update_interval = 320
- 
- for secs in align_window:
- step = secs / 2
- window_size = int(sr * secs)
- step_size = int(sr * step)
- 
- if len(mix.shape) == 1: total_iterations += ((len(range(0, len(mix) - window_size, step_size)) // progress_update_interval) * unique_sources)
- else: total_iterations += ((len(range(0, len(mix[:, 0]) - window_size, step_size)) * 2 // progress_update_interval) * unique_sources)
- 
- for secs in align_window:
- sub = np.zeros_like(mix)
- divider = np.zeros_like(mix)
- window_size = int(sr * secs)
- step_size = int(sr * secs / 2)
- window = np.hanning(window_size)
- 
- if len(mix.shape) == 1:
- counter = 0
- 
- for i in range(0, len(mix) - window_size, step_size):
- counter
+= 1 - if counter % progress_update_interval == 0: progress_bar(total_iterations) - - window_mix = mix[i : i + window_size] * window - window_instrumental = instrumental[i : i + window_size] * window - window_instrumental_aligned = align_tracks(window_mix, window_instrumental) - sub[i : i + window_size] += window_mix - window_instrumental_aligned - divider[i : i + window_size] += window - else: - counter = 0 - - for ch in range(mix.shape[1]): - for i in range(0, len(mix[:, ch]) - window_size, step_size): - counter += 1 - - if counter % progress_update_interval == 0: progress_bar(total_iterations) - - window_mix = mix[i : i + window_size, ch] * window - window_instrumental = instrumental[i : i + window_size, ch] * window - window_instrumental_aligned = align_tracks(window_mix, window_instrumental) - sub[i : i + window_size, ch] += window_mix - window_instrumental_aligned - divider[i : i + window_size, ch] += window - - return ensemble_wav(list({**sub_mapper, **{np.abs(sub).mean(): np.where(divider > 1e-6, sub / divider, sub)}}.values()), split_size=12) - -def ensemble_wav(waveforms, split_size=240): - waveform_thirds = {i: np.array_split(waveform, split_size) for i, waveform in enumerate(waveforms)} - final_waveform = [] - for third_idx in range(split_size): - final_waveform.append(waveform_thirds[np.argmin([np.abs(waveform_thirds[i][third_idx]).mean() for i in range(len(waveforms))])][third_idx]) - - return np.concatenate(final_waveform) - -def ensemble_wav_min(waveforms): - for i in range(1, len(waveforms)): - if i == 1: wave = waveforms[0] - ln = min(len(wave), len(waveforms[i])) - wave = wave[:ln] - waveforms[i] = waveforms[i][:ln] - wave = np.where(np.abs(waveforms[i]) <= np.abs(wave), waveforms[i], wave) - - return wave - -def align_audio_test(wav1, wav2, sr1=44100): - def get_diff(a, b): - return np.correlate(a, b, "full").argmax() - (b.shape[0] - 1) - - wav1 = wav1.transpose() - wav2 = wav2.transpose() - wav2_org = wav2.copy() - index = sr1 - diff = get_diff(wav1[index : index + sr1, 0], wav2[index : index + sr1, 0]) - - if diff > 0: wav2_aligned = np.append(np.zeros((diff, 1)), wav2_org, axis=0) - elif diff < 0: wav2_aligned = wav2_org[-diff:] - else: wav2_aligned = wav2_org - return wav2_aligned - -def load_audio(audio_file): - wav, _ = librosa.load(audio_file, sr=44100, mono=False) - if wav.ndim == 1: wav = np.asfortranarray([wav, wav]) - return wav - -def __rubberband(y, sr, **kwargs): - assert sr > 0 - fd, infile = tempfile.mkstemp(suffix='.wav') - os.close(fd) - fd, outfile = tempfile.mkstemp(suffix='.wav') - os.close(fd) - - sf.write(infile, y, sr) - - try: - arguments = [os.path.join(BASE_PATH_RUB, 'rubberband'), '-q'] - for key, value in six.iteritems(kwargs): - arguments.append(str(key)) - arguments.append(str(value)) - - arguments.extend([infile, outfile]) - subprocess.check_call(arguments, stdout=DEVNULL, stderr=DEVNULL) - - y_out, _ = sf.read(outfile, always_2d=True) - if y.ndim == 1: y_out = np.squeeze(y_out) - except OSError as exc: - six.raise_from(RuntimeError(translations["rubberband"]), exc) - finally: - os.unlink(infile) - os.unlink(outfile) - - return y_out - -def time_stretch(y, sr, rate, rbargs=None): - if rate <= 0: raise ValueError(translations["rate"]) - if rate == 1.0: return y - if rbargs is None: rbargs = dict() - - rbargs.setdefault('--tempo', rate) - return __rubberband(y, sr, **rbargs) - -def pitch_shift(y, sr, n_steps, rbargs=None): - if n_steps == 0: return y - if rbargs is None: rbargs = dict() - - rbargs.setdefault('--pitch', n_steps) - return 
__rubberband(y, sr, **rbargs) \ No newline at end of file diff --git a/main/tools/edge_tts.py b/main/tools/edge_tts.py deleted file mode 100644 index c38fe63bf8ad20306f8a921d49a92256d97bb737..0000000000000000000000000000000000000000 --- a/main/tools/edge_tts.py +++ /dev/null @@ -1,180 +0,0 @@
-import re
-import ssl
-import json
-import time
-import uuid
-import codecs
-import certifi
-import aiohttp
- 
-from io import TextIOWrapper
-from dataclasses import dataclass
-from contextlib import nullcontext
-from xml.sax.saxutils import escape
- 
- 
-@dataclass
-class TTSConfig:
- def __init__(self, voice, rate, volume, pitch):
- self.voice = voice
- self.rate = rate
- self.volume = volume
- self.pitch = pitch
- 
- @staticmethod
- def validate_string_param(param_name, param_value, pattern):
- if re.match(pattern, param_value) is None: raise ValueError(f"{param_name} '{param_value}'.")
- return param_value
- 
- def __post_init__(self):
- match = re.match(r"^([a-z]{2,})-([A-Z]{2,})-(.+Neural)$", self.voice)
- if match is not None:
- region = match.group(2)
- name = match.group(3)
- 
- if name.find("-") != -1:
- region = region + "-" + name[: name.find("-")]
- name = name[name.find("-") + 1 :]
- 
- self.voice = ("Microsoft Server Speech Text to Speech Voice" + f" ({match.group(1)}-{region}, {name})")
- 
- self.validate_string_param("voice", self.voice, r"^Microsoft Server Speech Text to Speech Voice \(.+,.+\)$")
- self.validate_string_param("rate", self.rate, r"^[+-]\d+%$")
- self.validate_string_param("volume", self.volume, r"^[+-]\d+%$")
- self.validate_string_param("pitch", self.pitch, r"^[+-]\d+Hz$")
- 
-def get_headers_and_data(data, header_length):
- headers = {}
- 
- for line in data[:header_length].split(b"\r\n"):
- key, value = line.split(b":", 1)
- headers[key] = value
- 
- return headers, data[header_length + 2 :]
- 
-def date_to_string():
- return time.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (Coordinated Universal Time)", time.gmtime())
- 
-def mkssml(tc, escaped_text):
- if isinstance(escaped_text, bytes): escaped_text = escaped_text.decode("utf-8")
- return (f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>" f"<voice name='{tc.voice}'>" f"<prosody pitch='{tc.pitch}' rate='{tc.rate}' volume='{tc.volume}'>" f"{escaped_text}" "</prosody>" "</voice>" "</speak>")
- 
-def connect_id():
- return str(uuid.uuid4()).replace("-", "")
- 
-def ssml_headers_plus_data(request_id, timestamp, ssml):
- return (f"X-RequestId:{request_id}\r\n" "Content-Type:application/ssml+xml\r\n" f"X-Timestamp:{timestamp}Z\r\n" "Path:ssml\r\n\r\n" f"{ssml}")
- 
-def remove_incompatible_characters(string):
- if isinstance(string, bytes): string = string.decode("utf-8")
- chars = list(string)
- 
- for idx, char in enumerate(chars):
- code = ord(char)
- if (0 <= code <= 8) or (11 <= code <= 12) or (14 <= code <= 31): chars[idx] = " "
- 
- return "".join(chars)
- 
-def split_text_by_byte_length(text, byte_length):
- if isinstance(text, str): text = text.encode("utf-8")
- if byte_length <= 0: raise ValueError("byte_length > 0")
- 
- while len(text) > byte_length:
- split_at = text.rfind(b" ", 0, byte_length)
- split_at = split_at if split_at != -1 else byte_length
- 
- while b"&" in text[:split_at]:
- ampersand_index = text.rindex(b"&", 0, split_at)
- if text.find(b";", ampersand_index, split_at) != -1: break
- 
- split_at = ampersand_index - 1
- if split_at == 0: break
- 
- new_text = text[:split_at].strip()
- 
- if new_text: yield new_text
- if split_at == 0: split_at = 1
- 
- text = text[split_at:]
- 
- new_text = text.strip()
- if new_text: yield new_text
- 
-class Communicate:
- def __init__(self, text, voice, *, rate="+0%", volume="+0%", pitch="+0Hz", proxy=None, connect_timeout=10, receive_timeout=60):
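# A minimal usage sketch for the Communicate class defined here, assuming the module is importable as main.tools.edge_tts; the voice and file names are illustrative:
#
#     import asyncio
#     from main.tools.edge_tts import Communicate
#
#     async def demo():
#         tts = Communicate("xin chào", "vi-VN-HoaiMyNeural", rate="+0%", volume="+0%", pitch="+0Hz")
#         await tts.save("hello.mp3", metadata_fname="hello.jsonl")
#
#     asyncio.run(demo())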
- self.tts_config = TTSConfig(voice, rate, volume, pitch) - self.texts = split_text_by_byte_length(escape(remove_incompatible_characters(text)), 2**16 - (len(ssml_headers_plus_data(connect_id(), date_to_string(), mkssml(self.tts_config, ""))) + 50)) - self.proxy = proxy - self.session_timeout = aiohttp.ClientTimeout(total=None, connect=None, sock_connect=connect_timeout, sock_read=receive_timeout) - self.state = {"partial_text": None, "offset_compensation": 0, "last_duration_offset": 0, "stream_was_called": False} - - def __parse_metadata(self, data): - for meta_obj in json.loads(data)["Metadata"]: - meta_type = meta_obj["Type"] - if meta_type == "WordBoundary": return {"type": meta_type, "offset": (meta_obj["Data"]["Offset"] + self.state["offset_compensation"]), "duration": meta_obj["Data"]["Duration"], "text": meta_obj["Data"]["text"]["Text"]} - if meta_type in ("SessionEnd",): continue - - async def __stream(self): - async def send_command_request(): - await websocket.send_str(f"X-Timestamp:{date_to_string()}\r\n" "Content-Type:application/json; charset=utf-8\r\n" "Path:speech.config\r\n\r\n" '{"context":{"synthesis":{"audio":{"metadataoptions":{' '"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},' '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"' "}}}}\r\n") - - async def send_ssml_request(): - await websocket.send_str(ssml_headers_plus_data(connect_id(), date_to_string(), mkssml(self.tts_config, self.state["partial_text"]))) - - audio_was_received = False - ssl_ctx = ssl.create_default_context(cafile=certifi.where()) - - async with aiohttp.ClientSession(trust_env=True, timeout=self.session_timeout) as session, session.ws_connect(f"wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=6A5AA1D4EAFF4E9FB37E23D68491D6F4&ConnectionId={connect_id()}", compress=15, proxy=self.proxy, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" " (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36" " Edg/130.0.0.0", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9", "Pragma": "no-cache", "Cache-Control": "no-cache", "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold"}, ssl=ssl_ctx) as websocket: - await send_command_request() - await send_ssml_request() - - async for received in websocket: - if received.type == aiohttp.WSMsgType.TEXT: - encoded_data: bytes = received.data.encode("utf-8") - parameters, data = get_headers_and_data(encoded_data, encoded_data.find(b"\r\n\r\n")) - path = parameters.get(b"Path", None) - - if path == b"audio.metadata": - parsed_metadata = self.__parse_metadata(data) - yield parsed_metadata - self.state["last_duration_offset"] = (parsed_metadata["offset"] + parsed_metadata["duration"]) - elif path == b"turn.end": - self.state["offset_compensation"] = self.state["last_duration_offset"] - self.state["offset_compensation"] += 8_750_000 - break - elif received.type == aiohttp.WSMsgType.BINARY: - if len(received.data) < 2: raise Exception("received.data < 2") - - header_length = int.from_bytes(received.data[:2], "big") - if header_length > len(received.data): raise Exception("header_length > received.data") - - parameters, data = get_headers_and_data(received.data, header_length) - if parameters.get(b"Path") != b"audio": raise Exception("Path != audio") - - content_type = parameters.get(b"Content-Type", None) - if content_type not in [b"audio/mpeg", None]: raise Exception("content_type != audio/mpeg") - - if content_type is None and len(data) == 0: continue - 
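# Each binary frame above carries a 2-byte big-endian header length, then the headers, then the raw MP3 payload. For example, a frame beginning with b"\x00\x82" declares a 130-byte header block:
#
#     >>> int.from_bytes(b"\x00\x82", "big")
#     130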
- if len(data) == 0: raise Exception("data = 0")
- audio_was_received = True
- yield {"type": "audio", "data": data}
- 
- if not audio_was_received: raise Exception("!audio_was_received")
- 
- async def stream(self):
- if self.state["stream_was_called"]: raise RuntimeError("stream_was_called")
- self.state["stream_was_called"] = True
- 
- for self.state["partial_text"] in self.texts:
- async for message in self.__stream():
- yield message
- 
- async def save(self, audio_fname, metadata_fname = None):
- metadata = (open(metadata_fname, "w", encoding="utf-8") if metadata_fname is not None else nullcontext())
- with metadata, open(audio_fname, "wb") as audio:
- async for message in self.stream():
- if message["type"] == "audio": audio.write(message["data"])
- elif (isinstance(metadata, TextIOWrapper) and message["type"] == "WordBoundary"):
- json.dump(message, metadata)
- metadata.write("\n") \ No newline at end of file diff --git a/main/tools/gdown.py b/main/tools/gdown.py deleted file mode 100644 index fc940dea47436144c2f343c4b8bc0e0926f90b06..0000000000000000000000000000000000000000 --- a/main/tools/gdown.py +++ /dev/null @@ -1,110 +0,0 @@
-import os
-import re
-import sys
-import json
-import tqdm
-import codecs
-import tempfile
-import requests
- 
-from urllib.parse import urlparse, parse_qs, unquote
- 
-sys.path.append(os.getcwd())
- 
-from main.configs.config import Config
-translations = Config().translations
- 
-def parse_url(url):
- parsed = urlparse(url)
- is_download_link = parsed.path.endswith("/uc")
- if not parsed.hostname in ("drive.google.com", "docs.google.com"): return None, is_download_link
- file_id = parse_qs(parsed.query).get("id", [None])[0]
- 
- if file_id is None:
- for pattern in (r"^/file/d/(.*?)/(edit|view)$", r"^/file/u/[0-9]+/d/(.*?)/(edit|view)$", r"^/document/d/(.*?)/(edit|htmlview|view)$", r"^/document/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$", r"^/presentation/d/(.*?)/(edit|htmlview|view)$", r"^/presentation/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$", r"^/spreadsheets/d/(.*?)/(edit|htmlview|view)$", r"^/spreadsheets/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$"):
- match = re.match(pattern, parsed.path)
- if match:
- file_id = match.group(1)
- break
- return file_id, is_download_link
- 
-def get_url_from_gdrive_confirmation(contents):
- for pattern in (r'href="(\/uc\?export=download[^"]+)', r'href="/open\?id=([^"]+)"', r'"downloadUrl":"([^"]+)'):
- match = re.search(pattern, contents)
- if match:
- url = match.group(1)
- if pattern == r'href="/open\?id=([^"]+)"': url = (codecs.decode("uggcf://qevir.hfrepbagrag.tbbtyr.pbz/qbjaybnq?vq=", "rot13") + url + "&confirm=t&uuid=" + re.search(r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"', contents).group(1))
- return url
- 
- match = re.search(r'<p class="uc-error-subcaption">(.*)</p>
', contents) - if match: raise Exception(match.group(1)) - raise Exception(translations["gdown_error"]) - -def _get_session(use_cookies, return_cookies_file=False): - sess = requests.session() - sess.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}) - cookies_file = os.path.join(os.path.expanduser("~"), ".cache/gdown/cookies.json") - - if os.path.exists(cookies_file) and use_cookies: - with open(cookies_file) as f: - for k, v in json.load(f): - sess.cookies[k] = v - return (sess, cookies_file) if return_cookies_file else sess - -def gdown_download(url=None, id=None, output=None): - if not (id is None) ^ (url is None): raise ValueError(translations["gdown_value_error"]) - if id is not None: url = f"{codecs.decode('uggcf://qevir.tbbtyr.pbz/hp?vq=', 'rot13')}{id}" - - url_origin = url - sess, cookies_file = _get_session(use_cookies=True, return_cookies_file=True) - gdrive_file_id, is_gdrive_download_link = parse_url(url) - - if gdrive_file_id: - url = f"{codecs.decode('uggcf://qevir.tbbtyr.pbz/hp?vq=', 'rot13')}{gdrive_file_id}" - url_origin = url - is_gdrive_download_link = True - - while 1: - res = sess.get(url, stream=True, verify=True) - if url == url_origin and res.status_code == 500: - url = f"{codecs.decode('uggcf://qevir.tbbtyr.pbz/bcra?vq=', 'rot13')}{gdrive_file_id}" - continue - - os.makedirs(os.path.dirname(cookies_file), exist_ok=True) - with open(cookies_file, "w") as f: - json.dump([(k, v) for k, v in sess.cookies.items() if not k.startswith("download_warning_")], f, indent=2) - - if "Content-Disposition" in res.headers: break - if not (gdrive_file_id and is_gdrive_download_link): break - - try: - url = get_url_from_gdrive_confirmation(res.text) - except Exception as e: - raise Exception(e) - - if gdrive_file_id and is_gdrive_download_link: - content_disposition = unquote(res.headers["Content-Disposition"]) - filename_from_url = (re.search(r"filename\*=UTF-8''(.*)", content_disposition) or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition)).group(1).replace(os.path.sep, "_") - else: filename_from_url = os.path.basename(url) - - output = os.path.join(output or ".", filename_from_url) - tmp_file = tempfile.mktemp(suffix=tempfile.template, prefix=os.path.basename(output), dir=os.path.dirname(output)) - f = open(tmp_file, "ab") - - if tmp_file is not None and f.tell() != 0: res = sess.get(url, headers={"Range": f"bytes={f.tell()}-"}, stream=True, verify=True) - print(translations["to"], os.path.abspath(output), file=sys.stderr) - - try: - with tqdm.tqdm(total=int(res.headers.get("Content-Length", 0)), ncols=100, unit="byte") as pbar: - for chunk in res.iter_content(chunk_size=512 * 1024): - f.write(chunk) - pbar.update(len(chunk)) - - pbar.close() - if tmp_file: f.close() - finally: - os.rename(tmp_file, output) - sess.close() - return output \ No newline at end of file diff --git a/main/tools/google_tts.py b/main/tools/google_tts.py deleted file mode 100644 index 1e7375d8adf4158058a71d43b3e683aea34230a9..0000000000000000000000000000000000000000 --- a/main/tools/google_tts.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -import codecs -import librosa -import requests - -import soundfile as sf - -def google_tts(text, lang="vi", speed=0, pitch=0, output_file="output.mp3"): - try: - response = requests.get(codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"), params={"ie": "UTF-8", "q": text, "tl": lang, "ttsspeed": speed, "client": "tw-ob"}, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}) - - if response.status_code == 200: - with open(output_file, "wb") as f: - f.write(response.content) - - format = os.path.splitext(os.path.basename(output_file))[-1].lower().replace('.', '') - - if pitch != 0: pitch_shift(input_file=output_file, output_file=output_file, pitch=pitch, export_format=format) - if speed != 0: change_speed(input_file=output_file, output_file=output_file, speed=speed, export_format=format) - else: raise ValueError(f"{response.status_code}, {response.text}") - except Exception as e: - raise RuntimeError(e) - -def pitch_shift(input_file, output_file, pitch, export_format): - y, sr = librosa.load(input_file, sr=None) - sf.write(file=output_file, data=librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch), samplerate=sr, format=export_format) - -def change_speed(input_file, output_file, speed, export_format): - y, sr = librosa.load(input_file, sr=None) - sf.write(file=output_file, data=librosa.effects.time_stretch(y, rate=speed), samplerate=sr, format=export_format) \ No newline at end of file diff --git a/main/tools/huggingface.py b/main/tools/huggingface.py deleted file mode 100644 index 8e18dcaca67c0cc988c27a285c150c63a79bd4d5..0000000000000000000000000000000000000000 --- a/main/tools/huggingface.py +++ /dev/null @@ -1,24 +0,0 @@ -import os -import requests -import tqdm - - -def HF_download_file(url, output_path=None): - url = url.replace("/blob/", "/resolve/").replace("?download=true", "").strip() - - if output_path is None: output_path = os.path.basename(url) - else: output_path = os.path.join(output_path, os.path.basename(url)) if os.path.isdir(output_path) else output_path - - response = requests.get(url, stream=True, timeout=300) - - if response.status_code == 200: - progress_bar = tqdm.tqdm(total=int(response.headers.get("content-length", 0)), ncols=100, unit="byte") - - with open(output_path, "wb") as f: - for chunk in response.iter_content(chunk_size=8192): - progress_bar.update(len(chunk)) - f.write(chunk) - - progress_bar.close() - return output_path - else: raise ValueError(response.status_code) \ No newline at end of file diff --git a/main/tools/mediafire.py b/main/tools/mediafire.py deleted file mode 100644 index 05ca070e905662b92d50cd78714d26e0660b5fa5..0000000000000000000000000000000000000000 --- a/main/tools/mediafire.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -import sys -import requests -from bs4 import BeautifulSoup - - -def Mediafire_Download(url, output=None, filename=None): - if not filename: filename = url.split('/')[-2] - if not output: output = os.path.dirname(os.path.realpath(__file__)) - output_file = os.path.join(output, filename) - - sess = requests.session() - sess.headers.update({"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}) - - try: - with requests.get(BeautifulSoup(sess.get(url).content, "html.parser").find(id="downloadButton").get("href"), stream=True) as r: - r.raise_for_status() - with open(output_file, "wb") as f: - total_length = int(r.headers.get('content-length')) - download_progress = 0 - - for chunk in r.iter_content(chunk_size=1024): - download_progress += len(chunk) - f.write(chunk) - sys.stdout.write(f"\r[{filename}]: {int(100 * download_progress/total_length)}% ({round(download_progress/1024/1024, 2)}mb/{round(total_length/1024/1024, 2)}mb)") - sys.stdout.flush() - sys.stdout.write("\n") - return output_file - except Exception as e: - raise RuntimeError(e) \ No newline at end of file diff --git a/main/tools/meganz.py b/main/tools/meganz.py 
deleted file mode 100644 index 8345986dc84daa922bf4f697ca9d15e79f8468fa..0000000000000000000000000000000000000000 --- a/main/tools/meganz.py +++ /dev/null @@ -1,160 +0,0 @@ -import os -import re -import sys -import json -import tqdm -import time -import codecs -import random -import base64 -import struct -import shutil -import requests -import tempfile - -from Crypto.Cipher import AES -from Crypto.Util import Counter - -sys.path.append(os.getcwd()) - -from main.configs.config import Config -translations = Config().translations - -def makebyte(x): - return codecs.latin_1_encode(x)[0] - -def a32_to_str(a): - return struct.pack('>%dI' % len(a), *a) - -def get_chunks(size): - p, s = 0, 0x20000 - - while p + s < size: - yield (p, s) - p += s - - if s < 0x100000: s += 0x20000 - - yield (p, size - p) - -def decrypt_attr(attr, key): - attr = codecs.latin_1_decode(AES.new(a32_to_str(key), AES.MODE_CBC, makebyte('\0' * 16)).decrypt(attr))[0].rstrip('\0') - - return json.loads(attr[4:]) if attr[:6] == 'MEGA{"' else False - -def _api_request(data): - sequence_num = random.randint(0, 0xFFFFFFFF) - params = {'id': sequence_num} - sequence_num += 1 - - if not isinstance(data, list): data = [data] - - for attempt in range(60): - try: - json_resp = json.loads(requests.post(f'https://g.api.mega.co.nz/cs', params=params, data=json.dumps(data), timeout=160).text) - - try: - if isinstance(json_resp, list): int_resp = json_resp[0] if isinstance(json_resp[0], int) else None - elif isinstance(json_resp, int): int_resp = json_resp - except IndexError: - int_resp = None - - if int_resp is not None: - if int_resp == 0: return int_resp - if int_resp == -3: raise RuntimeError('int_resp==-3') - raise Exception(int_resp) - - return json_resp[0] - except (RuntimeError, requests.exceptions.RequestException): - if attempt == 60 - 1: raise - delay = 2 * (2 ** attempt) - time.sleep(delay) - -def base64_url_decode(data): - data += '=='[(2 - len(data) * 3) % 4:] - - for search, replace in (('-', '+'), ('_', '/'), (',', '')): - data = data.replace(search, replace) - - return base64.b64decode(data) - -def str_to_a32(b): - if isinstance(b, str): b = makebyte(b) - if len(b) % 4: b += b'\0' * (4 - len(b) % 4) - - return struct.unpack('>%dI' % (len(b) / 4), b) - -def mega_download_file(file_handle, file_key, dest_path=None, dest_filename=None, file=None): - if file is None: - file_key = str_to_a32(base64_url_decode(file_key)) - file_data = _api_request({'a': 'g', 'g': 1, 'p': file_handle}) - - k = (file_key[0] ^ file_key[4], file_key[1] ^ file_key[5], file_key[2] ^ file_key[6], file_key[3] ^ file_key[7]) - iv = file_key[4:6] + (0, 0) - meta_mac = file_key[6:8] - else: - file_data = _api_request({'a': 'g', 'g': 1, 'n': file['h']}) - k = file['k'] - iv = file['iv'] - meta_mac = file['meta_mac'] - - if 'g' not in file_data: raise Exception(translations["file_not_access"]) - file_size = file_data['s'] - - attribs = decrypt_attr(base64_url_decode(file_data['at']), k) - - file_name = dest_filename if dest_filename is not None else attribs['n'] - input_file = requests.get(file_data['g'], stream=True).raw - - if dest_path is None: dest_path = '' - else: dest_path += '/' - - temp_output_file = tempfile.NamedTemporaryFile(mode='w+b', prefix='megapy_', delete=False) - k_str = a32_to_str(k) - - aes = AES.new(k_str, AES.MODE_CTR, counter=Counter.new(128, initial_value=((iv[0] << 32) + iv[1]) << 64)) - mac_str = b'\0' * 16 - mac_encryptor = AES.new(k_str, AES.MODE_CBC, mac_str) - - iv_str = a32_to_str([iv[0], iv[1], iv[0], iv[1]]) - pbar = 
tqdm.tqdm(total=file_size, ncols=100, unit="byte") - - for _, chunk_size in get_chunks(file_size): - chunk = aes.decrypt(input_file.read(chunk_size)) - temp_output_file.write(chunk) - - pbar.update(len(chunk)) - encryptor = AES.new(k_str, AES.MODE_CBC, iv_str) - - for i in range(0, len(chunk)-16, 16): - block = chunk[i:i + 16] - encryptor.encrypt(block) - - if file_size > 16: i += 16 - else: i = 0 - - block = chunk[i:i + 16] - if len(block) % 16: block += b'\0' * (16 - (len(block) % 16)) - - mac_str = mac_encryptor.encrypt(encryptor.encrypt(block)) - - file_mac = str_to_a32(mac_str) - temp_output_file.close() - - if (file_mac[0] ^ file_mac[1], file_mac[2] ^ file_mac[3]) != meta_mac: raise ValueError(translations["mac_not_match"]) - - file_path = os.path.join(dest_path, file_name) - if os.path.exists(file_path): os.remove(file_path) - - shutil.move(temp_output_file.name, file_path) - -def mega_download_url(url, dest_path=None, dest_filename=None): - if '/file/' in url: - url = url.replace(' ', '') - file_id = re.findall(r'\W\w\w\w\w\w\w\w\w\W', url)[0][1:-1] - - path = f'{file_id}!{url[re.search(file_id, url).end() + 1:]}'.split('!') - elif '!' in url: path = re.findall(r'/#!(.*)', url)[0].split('!') - else: raise Exception(translations["missing_url"]) - - return mega_download_file(file_handle=path[0], file_key=path[1], dest_path=dest_path, dest_filename=dest_filename) \ No newline at end of file diff --git a/main/tools/noisereduce.py b/main/tools/noisereduce.py deleted file mode 100644 index e052141da5d144d24a76db7a00eb72fbc9f12740..0000000000000000000000000000000000000000 --- a/main/tools/noisereduce.py +++ /dev/null @@ -1,200 +0,0 @@ -import torch -import tempfile -import numpy as np - -from tqdm.auto import tqdm -from joblib import Parallel, delayed -from torch.nn.functional import conv1d, conv2d - -from main.configs.config import Config -translations = Config().translations - -@torch.no_grad() -def amp_to_db(x, eps = torch.finfo(torch.float32).eps, top_db = 40): - x_db = 20 * torch.log10(x.abs() + eps) - return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1)) - -@torch.no_grad() -def temperature_sigmoid(x, x0, temp_coeff): - return torch.sigmoid((x - x0) / temp_coeff) - -@torch.no_grad() -def linspace(start, stop, num = 50, endpoint = True, **kwargs): - return torch.linspace(start, stop, num, **kwargs) if endpoint else torch.linspace(start, stop, num + 1, **kwargs)[:-1] - -def _smoothing_filter(n_grad_freq, n_grad_time): - smoothing_filter = np.outer(np.concatenate([np.linspace(0, 1, n_grad_freq + 1, endpoint=False), np.linspace(1, 0, n_grad_freq + 2)])[1:-1], np.concatenate([np.linspace(0, 1, n_grad_time + 1, endpoint=False), np.linspace(1, 0, n_grad_time + 2)])[1:-1]) - return smoothing_filter / np.sum(smoothing_filter) - -class SpectralGate: - def __init__(self, y, sr, prop_decrease, chunk_size, padding, n_fft, win_length, hop_length, time_constant_s, freq_mask_smooth_hz, time_mask_smooth_ms, tmp_folder, use_tqdm, n_jobs): - self.sr = sr - self.flat = False - y = np.array(y) - - if len(y.shape) == 1: - self.y = np.expand_dims(y, 0) - self.flat = True - elif len(y.shape) > 2: raise ValueError(translations["waveform"]) - else: self.y = y - - self._dtype = y.dtype - self.n_channels, self.n_frames = self.y.shape - self._chunk_size = chunk_size - self.padding = padding - self.n_jobs = n_jobs - self.use_tqdm = use_tqdm - self._tmp_folder = tmp_folder - self._n_fft = n_fft - self._win_length = self._n_fft if win_length is None else win_length - self._hop_length = 
(self._win_length // 4) if hop_length is None else hop_length - self._time_constant_s = time_constant_s - self._prop_decrease = prop_decrease - - if (freq_mask_smooth_hz is None) & (time_mask_smooth_ms is None): self.smooth_mask = False - else: self._generate_mask_smoothing_filter(freq_mask_smooth_hz, time_mask_smooth_ms) - - def _generate_mask_smoothing_filter(self, freq_mask_smooth_hz, time_mask_smooth_ms): - if freq_mask_smooth_hz is None: n_grad_freq = 1 - else: - n_grad_freq = int(freq_mask_smooth_hz / (self.sr / (self._n_fft / 2))) - if n_grad_freq < 1: raise ValueError(translations["freq_mask_smooth_hz"].format(hz=int((self.sr / (self._n_fft / 2))))) - - if time_mask_smooth_ms is None: n_grad_time = 1 - else: - n_grad_time = int(time_mask_smooth_ms / ((self._hop_length / self.sr) * 1000)) - if n_grad_time < 1: raise ValueError(translations["time_mask_smooth_ms"].format(ms=int((self._hop_length / self.sr) * 1000))) - - if (n_grad_time == 1) & (n_grad_freq == 1): self.smooth_mask = False - else: - self.smooth_mask = True - self._smoothing_filter = _smoothing_filter(n_grad_freq, n_grad_time) - - def _read_chunk(self, i1, i2): - i1b = 0 if i1 < 0 else i1 - i2b = self.n_frames if i2 > self.n_frames else i2 - chunk = np.zeros((self.n_channels, i2 - i1)) - chunk[:, i1b - i1: i2b - i1] = self.y[:, i1b:i2b] - return chunk - - def filter_chunk(self, start_frame, end_frame): - i1 = start_frame - self.padding - return self._do_filter(self._read_chunk(i1, (end_frame + self.padding)))[:, start_frame - i1: end_frame - i1] - - def _get_filtered_chunk(self, ind): - start0 = ind * self._chunk_size - end0 = (ind + 1) * self._chunk_size - return self.filter_chunk(start_frame=start0, end_frame=end0) - - def _do_filter(self, chunk): - pass - - def _iterate_chunk(self, filtered_chunk, pos, end0, start0, ich): - filtered_chunk[:, pos: pos + end0 - start0] = self._get_filtered_chunk(ich)[:, start0:end0] - pos += end0 - start0 - - def get_traces(self, start_frame=None, end_frame=None): - if start_frame is None: start_frame = 0 - if end_frame is None: end_frame = self.n_frames - - if self._chunk_size is not None: - if end_frame - start_frame > self._chunk_size: - ich1 = int(start_frame / self._chunk_size) - ich2 = int((end_frame - 1) / self._chunk_size) - - with tempfile.NamedTemporaryFile(prefix=self._tmp_folder) as fp: - filtered_chunk = np.memmap(fp, dtype=self._dtype, shape=(self.n_channels, int(end_frame - start_frame)), mode="w+") - pos_list, start_list, end_list = [], [], [] - pos = 0 - - for ich in range(ich1, ich2 + 1): - start0 = (start_frame - ich * self._chunk_size) if ich == ich1 else 0 - end0 = end_frame - ich * self._chunk_size if ich == ich2 else self._chunk_size - pos_list.append(pos) - start_list.append(start0) - end_list.append(end0) - pos += end0 - start0 - - Parallel(n_jobs=self.n_jobs)(delayed(self._iterate_chunk)(filtered_chunk, pos, end0, start0, ich) for pos, start0, end0, ich in zip(tqdm(pos_list, disable=not (self.use_tqdm)), start_list, end_list, range(ich1, ich2 + 1))) - return filtered_chunk.astype(self._dtype).flatten() if self.flat else filtered_chunk.astype(self._dtype) - - filtered_chunk = self.filter_chunk(start_frame=0, end_frame=end_frame) - return filtered_chunk.astype(self._dtype).flatten() if self.flat else filtered_chunk.astype(self._dtype) - -class TG(torch.nn.Module): - @torch.no_grad() - def __init__(self, sr, nonstationary = False, n_std_thresh_stationary = 1.5, n_thresh_nonstationary = 1.3, temp_coeff_nonstationary = 0.1, n_movemean_nonstationary = 20, 
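# A quick check of amp_to_db defined earlier in this file: amplitudes map to dB, floored at top_db (default 40) below the per-row maximum.
#
#     >>> amp_to_db(torch.tensor([[1.0, 0.1, 0.001]]))   # ~ tensor([[0., -20., -40.]]); 0.001 is -60 dB but gets floored at -40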
-class TG(torch.nn.Module):
-    @torch.no_grad()
-    def __init__(self, sr, nonstationary = False, n_std_thresh_stationary = 1.5, n_thresh_nonstationary = 1.3, temp_coeff_nonstationary = 0.1, n_movemean_nonstationary = 20, prop_decrease = 1.0, n_fft = 1024, win_length = None, hop_length = None, freq_mask_smooth_hz = 500, time_mask_smooth_ms = 50):
-        super().__init__()
-        self.sr = sr
-        self.nonstationary = nonstationary
-        assert 0.0 <= prop_decrease <= 1.0
-        self.prop_decrease = prop_decrease
-        self.n_fft = n_fft
-        self.win_length = self.n_fft if win_length is None else win_length
-        self.hop_length = self.win_length // 4 if hop_length is None else hop_length
-        self.n_std_thresh_stationary = n_std_thresh_stationary
-        self.temp_coeff_nonstationary = temp_coeff_nonstationary
-        self.n_movemean_nonstationary = n_movemean_nonstationary
-        self.n_thresh_nonstationary = n_thresh_nonstationary
-        self.freq_mask_smooth_hz = freq_mask_smooth_hz
-        self.time_mask_smooth_ms = time_mask_smooth_ms
-        self.register_buffer("smoothing_filter", self._generate_mask_smoothing_filter())
-
-    @torch.no_grad()
-    def _generate_mask_smoothing_filter(self):
-        if self.freq_mask_smooth_hz is None and self.time_mask_smooth_ms is None: return None
-        n_grad_freq = (1 if self.freq_mask_smooth_hz is None else int(self.freq_mask_smooth_hz / (self.sr / (self.n_fft / 2))))
-        if n_grad_freq < 1: raise ValueError(translations["freq_mask_smooth_hz"].format(hz=int((self.sr / (self.n_fft / 2)))))
-
-        n_grad_time = (1 if self.time_mask_smooth_ms is None else int(self.time_mask_smooth_ms / ((self.hop_length / self.sr) * 1000)))
-        if n_grad_time < 1: raise ValueError(translations["time_mask_smooth_ms"].format(ms=int((self.hop_length / self.sr) * 1000)))
-        if n_grad_time == 1 and n_grad_freq == 1: return None
-
-        smoothing_filter = torch.outer(torch.cat([linspace(0, 1, n_grad_freq + 1, endpoint=False), linspace(1, 0, n_grad_freq + 2)])[1:-1], torch.cat([linspace(0, 1, n_grad_time + 1, endpoint=False), linspace(1, 0, n_grad_time + 2)])[1:-1]).unsqueeze(0).unsqueeze(0)
-        return smoothing_filter / smoothing_filter.sum()
-
-    @torch.no_grad()
-    def _stationary_mask(self, X_db, xn = None):
-        XN_db = amp_to_db(torch.stft(xn, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, return_complex=True, pad_mode="constant", center=True, window=torch.hann_window(self.win_length).to(xn.device))).to(dtype=X_db.dtype) if xn is not None else X_db
-        std_freq_noise, mean_freq_noise = torch.std_mean(XN_db, dim=-1)
-        return torch.gt(X_db, (mean_freq_noise + std_freq_noise * self.n_std_thresh_stationary).unsqueeze(2))
-
-    @torch.no_grad()
-    def _nonstationary_mask(self, X_abs):
-        X_smoothed = (conv1d(X_abs.reshape(-1, 1, X_abs.shape[-1]), torch.ones(self.n_movemean_nonstationary, dtype=X_abs.dtype, device=X_abs.device).view(1, 1, -1), padding="same").view(X_abs.shape) / self.n_movemean_nonstationary)
-        return temperature_sigmoid(((X_abs - X_smoothed) / X_smoothed), self.n_thresh_nonstationary, self.temp_coeff_nonstationary)
-
-    def forward(self, x, xn = None):
-        assert x.ndim == 2
-        if x.shape[-1] < self.win_length * 2: raise Exception(f"{translations['x']} {self.win_length * 2}")
-        assert xn is None or xn.ndim == 1 or xn.ndim == 2
-        if xn is not None and xn.shape[-1] < self.win_length * 2: raise Exception(f"{translations['xn']} {self.win_length * 2}")
-
-        X = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, return_complex=True, pad_mode="constant", center=True, window=torch.hann_window(self.win_length).to(x.device))
-        sig_mask = self._nonstationary_mask(X.abs()) if self.nonstationary else self._stationary_mask(amp_to_db(X), xn)
-
-        sig_mask = self.prop_decrease * (sig_mask * 1.0 - 1.0) + 1.0
-        if self.smoothing_filter is not None: sig_mask = conv2d(sig_mask.unsqueeze(1), self.smoothing_filter.to(sig_mask.dtype), padding="same")
-
-        Y = X * sig_mask.squeeze(1)
-        return torch.istft(Y, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, center=True, window=torch.hann_window(self.win_length).to(Y.device)).to(dtype=x.dtype)
-
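The stationary branch of `TG` reduces to a per-frequency-bin threshold: a time-frequency cell survives when its dB level exceeds that bin's noise mean plus `n_std_thresh_stationary` noise standard deviations, with the statistics taken along the time axis. A self-contained illustration with made-up numbers:

    import torch

    n_std = 1.5
    # Noise-only spectrogram in dB, shape (channels, freq_bins, time_frames).
    noise_db = torch.tensor([[[-61., -60., -59., -60.],
                              [-41., -40., -39., -40.]]])
    std, mean = torch.std_mean(noise_db, dim=-1)  # per-bin noise statistics
    # Signal spectrogram with one -20 dB event in bin 0, frame 2.
    X_db = torch.tensor([[[-60., -59., -20., -60.],
                          [-40., -41., -39., -40.]]])
    mask = X_db > (mean + n_std * std).unsqueeze(-1)
    # Only the -20 dB event clears its bin's threshold of roughly -58.8 dB,
    # so mask is True at exactly that one cell; everything else is gated.

`_stationary_mask` does the same comparison with `torch.gt` and an `unsqueeze(2)` broadcast, optionally estimating the statistics from a separate noise clip `xn` instead of the signal itself.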
-class StreamedTorchGate(SpectralGate):
-    def __init__(self, y, sr, stationary=False, y_noise=None, prop_decrease=1.0, time_constant_s=2.0, freq_mask_smooth_hz=500, time_mask_smooth_ms=50, thresh_n_mult_nonstationary=2, sigmoid_slope_nonstationary=10, n_std_thresh_stationary=1.5, tmp_folder=None, chunk_size=600000, padding=30000, n_fft=1024, win_length=None, hop_length=None, clip_noise_stationary=True, use_tqdm=False, n_jobs=1, device="cpu"):
-        super().__init__(y=y, sr=sr, chunk_size=chunk_size, padding=padding, n_fft=n_fft, win_length=win_length, hop_length=hop_length, time_constant_s=time_constant_s, freq_mask_smooth_hz=freq_mask_smooth_hz, time_mask_smooth_ms=time_mask_smooth_ms, tmp_folder=tmp_folder, prop_decrease=prop_decrease, use_tqdm=use_tqdm, n_jobs=n_jobs)
-        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
-
-        if y_noise is not None:
-            if y_noise.shape[-1] > y.shape[-1] and clip_noise_stationary: y_noise = y_noise[: y.shape[-1]]
-            y_noise = torch.from_numpy(y_noise).to(device)
-            if len(y_noise.shape) == 1: y_noise = y_noise.unsqueeze(0)
-
-        self.y_noise = y_noise
-        self.tg = TG(sr=sr, nonstationary=not stationary, n_std_thresh_stationary=n_std_thresh_stationary, n_thresh_nonstationary=thresh_n_mult_nonstationary, temp_coeff_nonstationary=1 / sigmoid_slope_nonstationary, n_movemean_nonstationary=int(time_constant_s / self._hop_length * sr), prop_decrease=prop_decrease, n_fft=self._n_fft, win_length=self._win_length, hop_length=self._hop_length, freq_mask_smooth_hz=freq_mask_smooth_hz, time_mask_smooth_ms=time_mask_smooth_ms).to(device)
-
-    def _do_filter(self, chunk):
-        if isinstance(chunk, np.ndarray): chunk = torch.from_numpy(chunk).to(self.device)
-        return self.tg(x=chunk, xn=self.y_noise).cpu().detach().numpy()
-
-def reduce_noise(y, sr, stationary=False, y_noise=None, prop_decrease=1.0, time_constant_s=2.0, freq_mask_smooth_hz=500, time_mask_smooth_ms=50, thresh_n_mult_nonstationary=2, sigmoid_slope_nonstationary=10, tmp_folder=None, chunk_size=600000, padding=30000, n_fft=1024, win_length=None, hop_length=None, clip_noise_stationary=True, use_tqdm=False, device="cpu"):
-    return StreamedTorchGate(y=y, sr=sr, stationary=stationary, y_noise=y_noise, prop_decrease=prop_decrease, time_constant_s=time_constant_s, freq_mask_smooth_hz=freq_mask_smooth_hz, time_mask_smooth_ms=time_mask_smooth_ms, thresh_n_mult_nonstationary=thresh_n_mult_nonstationary, sigmoid_slope_nonstationary=sigmoid_slope_nonstationary, tmp_folder=tmp_folder, chunk_size=chunk_size, padding=padding, n_fft=n_fft, win_length=win_length, hop_length=hop_length, clip_noise_stationary=clip_noise_stationary, use_tqdm=use_tqdm, n_jobs=1, device=device).get_traces()
\ No newline at end of file
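For reference, a usage sketch of the deleted module's public entry point; the file names are hypothetical and `soundfile` is assumed only as a convenient loader:

    import soundfile as sf

    from main.tools.noisereduce import reduce_noise

    y, sr = sf.read("audios/noisy.wav")      # hypothetical input
    y = y.T if y.ndim == 2 else y            # reduce_noise expects (channels, frames) or (frames,)
    denoised = reduce_noise(y=y, sr=sr, stationary=True, prop_decrease=0.9, device="cpu")
    sf.write("audios/denoised.wav", denoised.T if denoised.ndim == 2 else denoised, sr)

The output matches the input shape (flattened back to 1-D for mono), and the input must span at least twice the STFT window, 2048 samples at the defaults, or `TG.forward` raises.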
diff --git a/main/tools/pixeldrain.py b/main/tools/pixeldrain.py
deleted file mode 100644
index 64741b6d9417055efb66ffddae84087f8d52552d..0000000000000000000000000000000000000000
--- a/main/tools/pixeldrain.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import os
-import requests
-
-def pixeldrain(url, output_dir):
-    try:
-        response = requests.get(f"https://pixeldrain.com/api/file/{url.split('pixeldrain.com/u/')[1]}")
-
-        if response.status_code == 200:
-            file_path = os.path.join(output_dir, (response.headers.get("Content-Disposition").split("filename=")[-1].strip('";')))
-
-            with open(file_path, "wb") as newfile:
-                newfile.write(response.content)
-            return file_path
-        else: return None
-    except Exception as e:
-        raise RuntimeError(e)
\ No newline at end of file
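Usage is a one-liner; the share link below is hypothetical:

    from main.tools.pixeldrain import pixeldrain

    # Returns the saved file's path on success, None on a non-200 response.
    saved = pixeldrain("https://pixeldrain.com/u/abcd1234", output_dir="audios")
    print(saved)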