{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "view-in-github" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "8lw0EgLex-YZ" }, "source": [ "# SoniTranslate embedded app\n", "\n", "`This notebook embeds the Gradio app directly into a cell, allowing you to interact with it without needing to open a separate browser window or navigate to a public/local URL.`\n", "\n", "| Description | Link |\n", "| ----------- | ---- |\n", "| 🎉 Repository | [![GitHub Repository](https://img.shields.io/badge/GitHub-Repository-black?style=flat-square&logo=github)](https://github.com/R3gm/SoniTranslate/) |\n", "| 🚀 Online Demo in HF | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content) |\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "LUgwm0rfx0_J" }, "outputs": [], "source": [ "# @title Install requirements for SoniTranslate\n", "!git clone https://github.com/r3gm/SoniTranslate.git\n", "%cd SoniTranslate\n", "\n", "!pip uninstall chex pandas-stubs ibis-framework albumentations albucore -y -q\n", "!python -m pip install -q pip==23.1.2\n", "!apt install git-lfs\n", "!git lfs install\n", "\n", "!sed -i 's|git+https://github.com/R3gm/whisperX.git@cuda_11_8|git+https://github.com/R3gm/whisperX.git@cuda_12_x|' requirements_base.txt\n", "!pip install -q -r requirements_base.txt\n", "!pip install -q -r requirements_extra.txt\n", "!pip install -q ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/\n", "\n", "Install_PIPER_TTS = True # @param {type:\"boolean\"}\n", "\n", "if Install_PIPER_TTS:\n", " !pip install -q piper-tts==1.2.0\n", "\n", "Install_Coqui_XTTS = True # @param {type:\"boolean\"}\n", "\n", "if Install_Coqui_XTTS:\n", " !pip install -q -r 
requirements_xtts.txt\n", " !pip install -q TTS==0.21.1 --no-deps" ] }, { "cell_type": "markdown", "metadata": { "id": "LTaTstXPXNg2" }, "source": [ "One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation\n", "\n", "\n", "Get your KEY TOKEN here: https://hf.co/settings/tokens\n", "\n", "When you are creating the new Access Token in Hugging Face, make sure to tick \"Read access to contents of all public gated repos you can access\"." ] }, { "cell_type": "markdown", "metadata": { "id": "NRAsK95dJSgq" }, "source": [ "Directory output: /content/SoniTranslate/outputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "XkhXfaFw4R4J" }, "outputs": [], "source": [ "#@markdown # `RUN THE WEB APP`\n", "YOUR_HF_TOKEN = \"\" #@param {type:'string'}\n", "%env YOUR_HF_TOKEN={YOUR_HF_TOKEN}\n", "theme_var = \"Taithrah/Minimal\" # @param [\"Taithrah/Minimal\", \"aliabid94/new-theme\", \"gstaff/xkcd\", \"ParityError/LimeFace\", \"abidlabs/pakistan\", \"rottenlittlecreature/Moon_Goblin\", \"ysharma/llamas\", \"gradio/dracula_revamped\"]\n", "interface_language_var = \"english\" # @param ['afrikaans', 'arabic', 'azerbaijani', 'chinese_zh_cn', 'english', 'french', 'german', 'hindi', 'indonesian', 'italian', 'japanese', 'korean', 'marathi', 'persian', 'polish', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish', 'ukrainian', 'vietnamese']\n", "verbosity_level_var = \"error\" # @param [\"debug\", \"info\", \"warning\", \"error\", \"critical\"]\n", "#@markdown ### `The interface will appear down here 👇‍‍`\n", "\n", "%cd /content/SoniTranslate\n", "import gradio as gr\n", "from soni_translate.logging_setup import (\n", " logger,\n", " set_logging_level,\n", " configure_logging_libs,\n", "); configure_logging_libs() # noqa\n", "import 
whisperx\n", "import torch\n", "import os\n", "from soni_translate.audio_segments import create_translated_audio\n", "from soni_translate.text_to_speech import (\n", " audio_segmentation_to_voice,\n", " edge_tts_voices_list,\n", " coqui_xtts_voices_list,\n", " piper_tts_voices_list,\n", " create_wav_file_vc,\n", " accelerate_segments,\n", ")\n", "from soni_translate.translate_segments import (\n", " translate_text,\n", " TRANSLATION_PROCESS_OPTIONS,\n", " DOCS_TRANSLATION_PROCESS_OPTIONS\n", ")\n", "from soni_translate.preprocessor import (\n", " audio_video_preprocessor,\n", " audio_preprocessor,\n", ")\n", "from soni_translate.postprocessor import (\n", " OUTPUT_TYPE_OPTIONS,\n", " DOCS_OUTPUT_TYPE_OPTIONS,\n", " sound_separate,\n", " get_no_ext_filename,\n", " media_out,\n", " get_subtitle_speaker,\n", ")\n", "from soni_translate.language_configuration import (\n", " LANGUAGES,\n", " UNIDIRECTIONAL_L_LIST,\n", " LANGUAGES_LIST,\n", " BARK_VOICES_LIST,\n", " VITS_VOICES_LIST,\n", " OPENAI_TTS_MODELS,\n", ")\n", "from soni_translate.utils import (\n", " remove_files,\n", " download_list,\n", " upload_model_list,\n", " download_manager,\n", " run_command,\n", " is_audio_file,\n", " is_subtitle_file,\n", " copy_files,\n", " get_valid_files,\n", " get_link_list,\n", " remove_directory_contents,\n", ")\n", "from soni_translate.mdx_net import (\n", " UVR_MODELS,\n", " MDX_DOWNLOAD_LINK,\n", " mdxnet_models_dir,\n", ")\n", "from soni_translate.speech_segmentation import (\n", " ASR_MODEL_OPTIONS,\n", " COMPUTE_TYPE_GPU,\n", " COMPUTE_TYPE_CPU,\n", " find_whisper_models,\n", " transcribe_speech,\n", " align_speech,\n", " diarize_speech,\n", " diarization_models,\n", ")\n", "from soni_translate.text_multiformat_processor import (\n", " BORDER_COLORS,\n", " srt_file_to_segments,\n", " document_preprocessor,\n", " determine_chunk_size,\n", " plain_text_to_segments,\n", " segments_to_plain_text,\n", " process_subtitles,\n", " linguistic_level_segments,\n", " 
class TTS_Info:
    """Catalogue of the text-to-speech voices the app can offer."""

    def __init__(self, piper_enabled, xtts_enabled):
        """Collect the voice lists that are read once at construction time.

        Args:
            piper_enabled: when true, ``piper_tts_voices_list()`` is queried
                for the ONNX VITS voices; otherwise that list is empty.
            xtts_enabled: when true, ``tts_list()`` includes the voices
                returned by ``coqui_xtts_voices_list()``.
        """
        self.list_edge = edge_tts_voices_list()
        self.list_bark = list(BARK_VOICES_LIST.keys())
        self.list_vits = list(VITS_VOICES_LIST.keys())
        self.list_openai_tts = OPENAI_TTS_MODELS
        self.piper_enabled = piper_enabled
        self.list_vits_onnx = (
            piper_tts_voices_list() if self.piper_enabled else []
        )
        self.xtts_enabled = xtts_enabled

    def tts_list(self):
        """Return every selectable voice.

        The Coqui XTTS list is re-queried on each call and placed first,
        unsorted; all other voices follow in sorted order.
        """
        self.list_coqui_xtts = (
            coqui_xtts_voices_list() if self.xtts_enabled else []
        )
        other_voices = sorted(
            self.list_edge
            + self.list_bark
            + self.list_vits
            + self.list_openai_tts
            + self.list_vits_onnx
        )
        return self.list_coqui_xtts + other_voices


def prog_disp(msg, percent, is_gui, progress=None):
    """Log a progress message and, in GUI mode, report it to ``progress``.

    Args:
        msg: text to log and to show as the progress description.
        percent: progress value handed to ``progress`` as first argument.
        is_gui: only when true is ``progress`` invoked.
        progress: callable accepting ``(percent, desc=msg)``; may be ``None``.
    """
    logger.info(msg)
    # ``progress`` defaults to None, so GUI mode alone must not imply that a
    # callback exists; calling None would raise TypeError.
    if is_gui and progress is not None:
        progress(percent, desc=msg)


def warn_disp(wrn_lang, is_gui):
    """Log a warning and, in GUI mode, also raise it as a ``gr.Warning``."""
    logger.warning(wrn_lang)
    if is_gui:
        gr.Warning(wrn_lang)


class SoniTrCache:
    """Per-step memo used to skip pipeline steps whose inputs did not change.

    ``cache`` maps every step name to the parameter list the step last ran
    with.  ``cache_data`` maps every step name to a ``{attribute: value}``
    snapshot that is copied back onto the instance when the step is skipped.
    """

    # Step names in execution order: a miss on one step invalidates that step
    # and every step listed after it.
    _STEPS = (
        'media',
        'refine_vocals',
        'transcript_align',
        'break_align',
        'diarize',
        'translate',
        'subs_and_edit',
        'tts',
        'acc_and_vc',
        'mix_aud',
        'output',
    )

    def __init__(self):
        self.cache = {step: [] for step in self._STEPS}
        # 'media' holds a nested list so clear_cache() can always read
        # index 0 of it.
        self.cache['media'] = [[]]

        # Snapshots are dicts everywhere: task_in_cache() iterates them with
        # .items() and resets them to {} on a miss.
        self.cache_data = {step: {} for step in self._STEPS}

        self.cache_keys = list(self.cache.keys())
        self.first_task = self.cache_keys[0]
        self.last_task = self.cache_keys[-1]

        self.pre_step = None
        self.pre_params = []

    def set_variable(self, variable_name, value):
        """Set attribute ``variable_name`` of this instance to ``value``."""
        setattr(self, variable_name, value)

    def task_in_cache(self, step: str, params: list, previous_step_data: dict):
        """Tell whether ``step`` can be skipped for the given ``params``.

        As a side effect the parameters of the previously checked step are
        committed to ``cache`` together with a deep copy of
        ``previous_step_data`` (that step is considered finished once the
        next one is checked).

        Args:
            step: name of the step about to run; must be a key of ``cache``.
            params: values the step depends on, compared with ``==`` against
                the list stored for the step.
            previous_step_data: ``{attribute: value}`` produced by the
                previous step, stored as that step's snapshot.

        Returns:
            True when ``params`` equals the stored list; the step's snapshot
            is then deep-copied back onto the instance.  False otherwise; the
            entries of ``step`` and of all later steps are then emptied.
        """
        # Kept for backward compatibility; nothing in this class reads it.
        self.pre_step_cache = None

        # A new run starts at the first step, so there is no previous step
        # whose results could be committed.
        if step == self.first_task:
            self.pre_step = None

        if self.pre_step:
            self.cache[self.pre_step] = self.pre_params
            self.cache_data[self.pre_step] = copy.deepcopy(previous_step_data)

        self.pre_params = params
        # logger.debug(f"Step: {str(step)}, Cache params: {str(self.cache)}")
        if params == self.cache[step]:
            logger.debug(f"In cache: {str(step)}")

            # Restore the attributes the following step needs.
            for key, value in self.cache_data[step].items():
                self.set_variable(key, copy.deepcopy(value))
                logger.debug(
                    f"Chache load: {str(key)}"
                )

            self.pre_step = step
            return True

        logger.debug(f"Flush next and caching {str(step)}")
        selected_index = self.cache_keys.index(step)
        for key in self.cache_keys[selected_index:]:
            self.cache[key] = []
            self.cache_data[key] = {}

        # The current step becomes the previous one for the next check.
        self.pre_step = step
        return False

    def clear_cache(self, media, force=False):
        """Empty the cache when ``media`` differs from the cached one.

        Args:
            media: identifier of the media about to be processed, compared
                with the first element stored under ``cache['media']``.
            force: flush even when the identifier is unchanged.
        """
        if not len(self.cache["media"]):
            self.cache["media"] = [[]]

        if force or media != self.cache["media"][0]:
            self.cache = {key: [] for key in self.cache}
            self.cache["media"] = [[]]
            # Drop the snapshots as well: with every parameter list emptied
            # they can no longer be matched, and keeping them would only
            # hold the previous media's deep-copied results in memory.
            self.cache_data = {key: {} for key in self.cache}

            logger.info("Cache flushed")


def get_hash(filepath):
    """Return the first 18 hex characters of the file's BLAKE2b digest.

    The file is read in 8192-byte chunks, so it is never loaded whole.
    """
    with open(filepath, 'rb') as f:
        file_hash = hashlib.blake2b()
        while chunk := f.read(8192):
            file_hash.update(chunk)

    return file_hash.hexdigest()[:18]


def check_openai_api_key():
    """Raise ``ValueError`` when ``OPENAI_API_KEY`` is unset or empty."""
    if not os.environ.get("OPENAI_API_KEY"):
        raise ValueError(
            "To use GPT for translation, please set up your OpenAI API key "
            "as an environment variable in Linux as follows: "
            "export OPENAI_API_KEY='your-api-key-here'. Or change the "
            "translation process in Advanced settings."
        )
def batch_multilingual_media_conversion(self, *kwargs):
    """Run ``multilingual_media_conversion`` for every requested media item.

    Despite its name, ``kwargs`` is a tuple of positional values:

    * ``kwargs[0]``: uploaded media; a list, a single item, or ``None``.
    * ``kwargs[1]``: comma-separated links, filtered by ``get_link_list``.
    * ``kwargs[2]``: comma-separated paths, filtered by ``get_valid_files``.
    * ``kwargs[3:]``: forwarded unchanged to
      ``multilingual_media_conversion`` after ``(media, "", "")``.
    * ``kwargs[31]`` / ``kwargs[32]``: the two subtitle-editing switches
      (they line up with ``get_translated_text`` and
      ``get_video_from_text_json`` of ``multilingual_media_conversion``).
    * ``kwargs[-1]``: the GUI flag.

    The contents of the "outputs" directory are removed before processing.

    Returns:
        When one of the subtitle-editing switches is set, whatever
        ``multilingual_media_conversion`` returns for the first item.
        Otherwise a flat list with the outputs of every processed item.
    """
    # logger.debug(str(kwargs))

    media_file_arg = kwargs[0] if kwargs[0] is not None else []
    # Accept a single path / file object as well as a list of them; the
    # concatenation below needs a list.
    if not isinstance(media_file_arg, (list, tuple)):
        media_file_arg = [media_file_arg]
    media_file_arg = list(media_file_arg)

    # ``or ""`` tolerates None for an empty text field.
    link_media_arg = [x.strip() for x in (kwargs[1] or "").split(',')]
    link_media_arg = get_link_list(link_media_arg)

    path_arg = [x.strip() for x in (kwargs[2] or "").split(',')]
    path_arg = get_valid_files(path_arg)

    get_translated_text_arg = kwargs[31]
    video_from_text_json_arg = kwargs[32]

    is_gui_arg = kwargs[-1]

    # Drop the three input selectors; they are replaced per item below.
    kwargs = kwargs[3:]

    media_batch = media_file_arg + link_media_arg + path_arg
    media_batch = [item for item in media_batch if item != ""]
    # A lone None lets the conversion raise its own "no input" error.
    media_batch = media_batch if media_batch else [None]
    logger.debug(str(media_batch))

    remove_directory_contents("outputs")

    # Subtitle editing is a two-call workflow on one media item, so only
    # the first item is used and its raw result is returned.
    if get_translated_text_arg or video_from_text_json_arg:
        return self.multilingual_media_conversion(
            media_batch[0], "", "", *kwargs
        )

    if "SET_LIMIT" == os.getenv("DEMO"):
        media_batch = [media_batch[0]]

    result = []
    for media in media_batch:
        output_file = self.multilingual_media_conversion(
            media, "", "", *kwargs
        )

        # Normalise to a list; a conversion that yields nothing (None or an
        # empty list) contributes no outputs and no notification.
        if isinstance(output_file, str):
            outputs = [output_file]
        else:
            outputs = list(output_file or [])
        result.extend(outputs)

        if is_gui_arg and len(media_batch) > 1 and outputs:
            gr.Info(f"Done: {os.path.basename(outputs[0])}")

    return result
segment_duration_limit=15,\n", " diarization_model=\"pyannote_2.1\",\n", " translate_process=\"google_translator_batch\",\n", " subtitle_file=None,\n", " output_type=\"video (mp4)\",\n", " voiceless_track=False,\n", " voice_imitation=False,\n", " voice_imitation_max_segments=3,\n", " voice_imitation_vocals_dereverb=False,\n", " voice_imitation_remove_previous=True,\n", " voice_imitation_method=\"freevc\",\n", " dereverb_automatic_xtts=True,\n", " text_segmentation_scale=\"sentence\",\n", " divide_text_segments_by=\"\",\n", " soft_subtitles_to_video=True,\n", " burn_subtitles_to_video=False,\n", " enable_cache=True,\n", " custom_voices=False,\n", " custom_voices_workers=1,\n", " is_gui=False,\n", " progress=gr.Progress(),\n", " ):\n", " if not YOUR_HF_TOKEN:\n", " YOUR_HF_TOKEN = os.getenv(\"YOUR_HF_TOKEN\")\n", " if diarization_model == \"disable\" or max_speakers == 1:\n", " if YOUR_HF_TOKEN is None:\n", " YOUR_HF_TOKEN = \"\"\n", " elif not YOUR_HF_TOKEN:\n", " raise ValueError(\"No valid Hugging Face token\")\n", " else:\n", " os.environ[\"YOUR_HF_TOKEN\"] = YOUR_HF_TOKEN\n", "\n", " if (\n", " \"gpt\" in translate_process\n", " or transcriber_model == \"OpenAI_API_Whisper\"\n", " or \"OpenAI-TTS\" in tts_voice00\n", " ):\n", " check_openai_api_key()\n", "\n", " if media_file is None:\n", " media_file = (\n", " directory_input\n", " if os.path.exists(directory_input)\n", " else link_media\n", " )\n", " media_file = (\n", " media_file if isinstance(media_file, str) else media_file.name\n", " )\n", "\n", " if is_subtitle_file(media_file):\n", " subtitle_file = media_file\n", " media_file = \"\"\n", "\n", " if media_file is None:\n", " media_file = \"\"\n", "\n", " if not origin_language:\n", " origin_language = \"Automatic detection\"\n", "\n", " if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file:\n", " raise ValueError(\n", " f\"The language '{origin_language}' \"\n", " \"is not supported for transcription (ASR).\"\n", " )\n", "\n", " if 
get_translated_text:\n", " self.edit_subs_complete = False\n", " if get_video_from_text_json:\n", " if not self.edit_subs_complete:\n", " raise ValueError(\"Generate the transcription first.\")\n", "\n", " if (\n", " (\"sound\" in output_type or output_type == \"raw media\")\n", " and (get_translated_text or get_video_from_text_json)\n", " ):\n", " raise ValueError(\n", " \"Please disable 'edit generate subtitles' \"\n", " f\"first to acquire the {output_type}.\"\n", " )\n", "\n", " TRANSLATE_AUDIO_TO = LANGUAGES[target_language]\n", " SOURCE_LANGUAGE = LANGUAGES[origin_language]\n", "\n", " if (\n", " transcriber_model == \"OpenAI_API_Whisper\"\n", " and SOURCE_LANGUAGE == \"zh-TW\"\n", " ):\n", " logger.warning(\n", " \"OpenAI API Whisper only supports Chinese (Simplified).\"\n", " )\n", " SOURCE_LANGUAGE = \"zh\"\n", "\n", " if (\n", " text_segmentation_scale in [\"word\", \"character\"]\n", " and \"subtitle\" not in output_type\n", " ):\n", " wrn_lang = (\n", " \"Text segmentation by words or characters is typically\"\n", " \" used for generating subtitles. If subtitles are not the\"\n", " \" intended output, consider selecting 'sentence' \"\n", " \"segmentation method to ensure optimal results.\"\n", "\n", " )\n", " warn_disp(wrn_lang, is_gui)\n", "\n", " if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():\n", " wrn_lang = (\n", " \"Make sure to select a 'TTS Speaker' suitable for\"\n", " \" the translation language to avoid errors with the TTS.\"\n", " )\n", " warn_disp(wrn_lang, is_gui)\n", "\n", " if \"_XTTS_\" in tts_voice00 and voice_imitation:\n", " wrn_lang = (\n", " \"When you select XTTS, it is advisable \"\n", " \"to disable Voice Imitation.\"\n", " )\n", " warn_disp(wrn_lang, is_gui)\n", "\n", " if custom_voices and voice_imitation:\n", " wrn_lang = (\n", " \"When you use R.V.C. 
models, it is advisable\"\n", " \" to disable Voice Imitation.\"\n", " )\n", " warn_disp(wrn_lang, is_gui)\n", "\n", " if not media_file and not subtitle_file:\n", " raise ValueError(\n", " \"Specifify a media or SRT file in advanced settings\"\n", " )\n", "\n", " if subtitle_file:\n", " subtitle_file = (\n", " subtitle_file\n", " if isinstance(subtitle_file, str)\n", " else subtitle_file.name\n", " )\n", "\n", " if subtitle_file and SOURCE_LANGUAGE == \"Automatic detection\":\n", " raise Exception(\n", " \"To use an SRT file, you need to specify its \"\n", " \"original language (Source language)\"\n", " )\n", "\n", " if not media_file and subtitle_file:\n", " diarization_model = \"disable\"\n", " media_file = \"audio_support.wav\"\n", " if not get_video_from_text_json:\n", " remove_files(media_file)\n", " srt_data = srt_file_to_segments(subtitle_file)\n", " total_duration = srt_data[\"segments\"][-1][\"end\"] + 30.\n", " support_audio = AudioSegment.silent(\n", " duration=int(total_duration * 1000)\n", " )\n", " support_audio.export(\n", " media_file, format=\"wav\"\n", " )\n", " logger.info(\"Supporting audio for the SRT file, created.\")\n", "\n", " if \"SET_LIMIT\" == os.getenv(\"DEMO\"):\n", " preview = True\n", " mix_method_audio = \"Adjusting volumes and mixing audio\"\n", " transcriber_model = \"medium\"\n", " logger.info(\n", " \"DEMO; set preview=True; Generation is limited to \"\n", " \"10 seconds to prevent CPU errors. 
No limitations with GPU.\\n\"\n", " \"DEMO; set Adjusting volumes and mixing audio\\n\"\n", " \"DEMO; set whisper model to medium\"\n", " )\n", "\n", " # Check GPU\n", " if self.device == \"cpu\" and compute_type not in COMPUTE_TYPE_CPU:\n", " logger.info(\"Compute type changed to float32\")\n", " compute_type = \"float32\"\n", "\n", " base_video_file = \"Video.mp4\"\n", " base_audio_wav = \"audio.wav\"\n", " dub_audio_file = \"audio_dub_solo.ogg\"\n", " vocals_audio_file = \"audio_Vocals_DeReverb.wav\"\n", " voiceless_audio_file = \"audio_Voiceless.wav\"\n", " mix_audio_file = \"audio_mix.mp3\"\n", " vid_subs = \"video_subs_file.mp4\"\n", " video_output_file = \"video_dub.mp4\"\n", "\n", " if os.path.exists(media_file):\n", " media_base_hash = get_hash(media_file)\n", " else:\n", " media_base_hash = media_file\n", " self.clear_cache(media_base_hash, force=(not enable_cache))\n", "\n", " if not get_video_from_text_json:\n", " self.result_diarize = (\n", " self.align_language\n", " ) = self.result_source_lang = None\n", " if not self.task_in_cache(\"media\", [media_base_hash, preview], {}):\n", " if is_audio_file(media_file):\n", " prog_disp(\n", " \"Processing audio...\", 0.15, is_gui, progress=progress\n", " )\n", " audio_preprocessor(preview, media_file, base_audio_wav)\n", " else:\n", " prog_disp(\n", " \"Processing video...\", 0.15, is_gui, progress=progress\n", " )\n", " audio_video_preprocessor(\n", " preview, media_file, base_video_file, base_audio_wav\n", " )\n", " logger.debug(\"Set file complete.\")\n", "\n", " if \"sound\" in output_type:\n", " prog_disp(\n", " \"Separating sounds in the file...\",\n", " 0.50,\n", " is_gui,\n", " progress=progress\n", " )\n", " separate_out = sound_separate(base_audio_wav, output_type)\n", " final_outputs = []\n", " for out in separate_out:\n", " final_name = media_out(\n", " media_file,\n", " f\"{get_no_ext_filename(out)}\",\n", " video_output_name,\n", " \"wav\",\n", " file_obj=out,\n", " )\n", " 
final_outputs.append(final_name)\n", " logger.info(f\"Done: {str(final_outputs)}\")\n", " return final_outputs\n", "\n", " if output_type == \"raw media\":\n", " output = media_out(\n", " media_file,\n", " \"raw_media\",\n", " video_output_name,\n", " \"wav\" if is_audio_file(media_file) else \"mp4\",\n", " file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,\n", " )\n", " logger.info(f\"Done: {output}\")\n", " return output\n", "\n", " if not self.task_in_cache(\"refine_vocals\", [vocal_refinement], {}):\n", " self.vocals = None\n", " if vocal_refinement:\n", " try:\n", " from soni_translate.mdx_net import process_uvr_task\n", " _, _, _, _, file_vocals = process_uvr_task(\n", " orig_song_path=base_audio_wav,\n", " main_vocals=False,\n", " dereverb=True,\n", " remove_files_output_dir=True,\n", " )\n", " remove_files(vocals_audio_file)\n", " copy_files(file_vocals, \".\")\n", " self.vocals = vocals_audio_file\n", " except Exception as error:\n", " logger.error(str(error))\n", "\n", " if not self.task_in_cache(\"transcript_align\", [\n", " subtitle_file,\n", " SOURCE_LANGUAGE,\n", " transcriber_model,\n", " compute_type,\n", " batch_size,\n", " literalize_numbers,\n", " segment_duration_limit,\n", " (\n", " \"l_unit\"\n", " if text_segmentation_scale in [\"word\", \"character\"]\n", " and subtitle_file\n", " else \"sentence\"\n", " )\n", " ], {\"vocals\": self.vocals}):\n", " if subtitle_file:\n", " prog_disp(\n", " \"From SRT file...\", 0.30, is_gui, progress=progress\n", " )\n", " audio = whisperx.load_audio(\n", " base_audio_wav if not self.vocals else self.vocals\n", " )\n", " self.result = srt_file_to_segments(subtitle_file)\n", " self.result[\"language\"] = SOURCE_LANGUAGE\n", " else:\n", " prog_disp(\n", " \"Transcribing...\", 0.30, is_gui, progress=progress\n", " )\n", " SOURCE_LANGUAGE = (\n", " None\n", " if SOURCE_LANGUAGE == \"Automatic detection\"\n", " else SOURCE_LANGUAGE\n", " )\n", " audio, self.result = transcribe_speech(\n", " 
base_audio_wav if not self.vocals else self.vocals,\n", " transcriber_model,\n", " compute_type,\n", " batch_size,\n", " SOURCE_LANGUAGE,\n", " literalize_numbers,\n", " segment_duration_limit,\n", " )\n", " logger.debug(\n", " \"Transcript complete, \"\n", " f\"segments count {len(self.result['segments'])}\"\n", " )\n", "\n", " self.align_language = self.result[\"language\"]\n", " if (\n", " not subtitle_file\n", " or text_segmentation_scale in [\"word\", \"character\"]\n", " ):\n", " prog_disp(\"Aligning...\", 0.45, is_gui, progress=progress)\n", " try:\n", " if self.align_language in [\"vi\"]:\n", " logger.info(\n", " \"Deficient alignment for the \"\n", " f\"{self.align_language} language, skipping the\"\n", " \" process. It is suggested to reduce the \"\n", " \"duration of the segments as an alternative.\"\n", " )\n", " else:\n", " self.result = align_speech(audio, self.result)\n", " logger.debug(\n", " \"Align complete, \"\n", " f\"segments count {len(self.result['segments'])}\"\n", " )\n", " except Exception as error:\n", " logger.error(str(error))\n", "\n", " if self.result[\"segments\"] == []:\n", " raise ValueError(\"No active speech found in audio\")\n", "\n", " if not self.task_in_cache(\"break_align\", [\n", " divide_text_segments_by,\n", " text_segmentation_scale,\n", " self.align_language\n", " ], {\n", " \"result\": self.result,\n", " \"align_language\": self.align_language\n", " }):\n", " if self.align_language in [\"ja\", \"zh\", \"zh-TW\"]:\n", " divide_text_segments_by += \"|!|?|...|。\"\n", " if text_segmentation_scale in [\"word\", \"character\"]:\n", " self.result = linguistic_level_segments(\n", " self.result,\n", " text_segmentation_scale,\n", " )\n", " elif divide_text_segments_by:\n", " try:\n", " self.result = break_aling_segments(\n", " self.result,\n", " break_characters=divide_text_segments_by,\n", " )\n", " except Exception as error:\n", " logger.error(str(error))\n", "\n", " if not self.task_in_cache(\"diarize\", [\n", " 
min_speakers,\n", " max_speakers,\n", " YOUR_HF_TOKEN[:len(YOUR_HF_TOKEN)//2],\n", " diarization_model\n", " ], {\n", " \"result\": self.result\n", " }):\n", " prog_disp(\"Diarizing...\", 0.60, is_gui, progress=progress)\n", " diarize_model_select = diarization_models[diarization_model]\n", " self.result_diarize = diarize_speech(\n", " base_audio_wav if not self.vocals else self.vocals,\n", " self.result,\n", " min_speakers,\n", " max_speakers,\n", " YOUR_HF_TOKEN,\n", " diarize_model_select,\n", " )\n", " logger.debug(\"Diarize complete\")\n", " self.result_source_lang = copy.deepcopy(self.result_diarize)\n", "\n", " if not self.task_in_cache(\"translate\", [\n", " TRANSLATE_AUDIO_TO,\n", " translate_process\n", " ], {\n", " \"result_diarize\": self.result_diarize\n", " }):\n", " prog_disp(\"Translating...\", 0.70, is_gui, progress=progress)\n", " lang_source = (\n", " self.align_language\n", " if self.align_language\n", " else SOURCE_LANGUAGE\n", " )\n", " self.result_diarize[\"segments\"] = translate_text(\n", " self.result_diarize[\"segments\"],\n", " TRANSLATE_AUDIO_TO,\n", " translate_process,\n", " chunk_size=1800,\n", " source=lang_source,\n", " )\n", " logger.debug(\"Translation complete\")\n", " logger.debug(self.result_diarize)\n", "\n", " if get_translated_text:\n", "\n", " json_data = []\n", " for segment in self.result_diarize[\"segments\"]:\n", " start = segment[\"start\"]\n", " text = segment[\"text\"]\n", " speaker = int(segment.get(\"speaker\", \"SPEAKER_00\")[-2:]) + 1\n", " json_data.append(\n", " {\"start\": start, \"text\": text, \"speaker\": speaker}\n", " )\n", "\n", " # Convert list of dictionaries to a JSON string with indentation\n", " json_string = json.dumps(json_data, indent=2)\n", " logger.info(\"Done\")\n", " self.edit_subs_complete = True\n", " return json_string.encode().decode(\"unicode_escape\")\n", "\n", " if get_video_from_text_json:\n", "\n", " if self.result_diarize is None:\n", " raise ValueError(\"Generate the transcription 
first.\")\n", " # with open('text_json.json', 'r') as file:\n", " text_json_loaded = json.loads(text_json)\n", " for i, segment in enumerate(self.result_diarize[\"segments\"]):\n", " segment[\"text\"] = text_json_loaded[i][\"text\"]\n", " segment[\"speaker\"] = \"SPEAKER_{:02d}\".format(\n", " int(text_json_loaded[i][\"speaker\"]) - 1\n", " )\n", "\n", " # Write subtitle\n", " if not self.task_in_cache(\"subs_and_edit\", [\n", " copy.deepcopy(self.result_diarize),\n", " output_format_subtitle,\n", " TRANSLATE_AUDIO_TO\n", " ], {\n", " \"result_diarize\": self.result_diarize\n", " }):\n", " if output_format_subtitle == \"disable\":\n", " self.sub_file = \"sub_tra.srt\"\n", " elif output_format_subtitle != \"ass\":\n", " self.sub_file = process_subtitles(\n", " self.result_source_lang,\n", " self.align_language,\n", " self.result_diarize,\n", " output_format_subtitle,\n", " TRANSLATE_AUDIO_TO,\n", " )\n", "\n", " # Need task\n", " if output_format_subtitle != \"srt\":\n", " _ = process_subtitles(\n", " self.result_source_lang,\n", " self.align_language,\n", " self.result_diarize,\n", " \"srt\",\n", " TRANSLATE_AUDIO_TO,\n", " )\n", "\n", " if output_format_subtitle == \"ass\":\n", " convert_ori = \"ffmpeg -i sub_ori.srt sub_ori.ass -y\"\n", " convert_tra = \"ffmpeg -i sub_tra.srt sub_tra.ass -y\"\n", " self.sub_file = \"sub_tra.ass\"\n", " run_command(convert_ori)\n", " run_command(convert_tra)\n", "\n", " format_sub = (\n", " output_format_subtitle\n", " if output_format_subtitle != \"disable\"\n", " else \"srt\"\n", " )\n", "\n", " if output_type == \"subtitle\":\n", "\n", " out_subs = []\n", " tra_subs = media_out(\n", " media_file,\n", " TRANSLATE_AUDIO_TO,\n", " video_output_name,\n", " format_sub,\n", " file_obj=self.sub_file,\n", " )\n", " out_subs.append(tra_subs)\n", "\n", " ori_subs = media_out(\n", " media_file,\n", " self.align_language,\n", " video_output_name,\n", " format_sub,\n", " file_obj=f\"sub_ori.{format_sub}\",\n", " )\n", " 
out_subs.append(ori_subs)\n", " logger.info(f\"Done: {out_subs}\")\n", " return out_subs\n", "\n", " if output_type == \"subtitle [by speaker]\":\n", " output = get_subtitle_speaker(\n", " media_file,\n", " result=self.result_diarize,\n", " language=TRANSLATE_AUDIO_TO,\n", " extension=format_sub,\n", " base_name=video_output_name,\n", " )\n", " logger.info(f\"Done: {str(output)}\")\n", " return output\n", "\n", " if \"video [subtitled]\" in output_type:\n", " output = media_out(\n", " media_file,\n", " TRANSLATE_AUDIO_TO + \"_subtitled\",\n", " video_output_name,\n", " \"wav\" if is_audio_file(media_file) else (\n", " \"mkv\" if \"mkv\" in output_type else \"mp4\"\n", " ),\n", " file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,\n", " soft_subtitles=False if is_audio_file(media_file) else True,\n", " subtitle_files=output_format_subtitle,\n", " )\n", " msg_out = output[0] if isinstance(output, list) else output\n", " logger.info(f\"Done: {msg_out}\")\n", " return output\n", "\n", " if not self.task_in_cache(\"tts\", [\n", " TRANSLATE_AUDIO_TO,\n", " tts_voice00,\n", " tts_voice01,\n", " tts_voice02,\n", " tts_voice03,\n", " tts_voice04,\n", " tts_voice05,\n", " tts_voice06,\n", " tts_voice07,\n", " tts_voice08,\n", " tts_voice09,\n", " tts_voice10,\n", " tts_voice11,\n", " dereverb_automatic_xtts\n", " ], {\n", " \"sub_file\": self.sub_file\n", " }):\n", " prog_disp(\"Text to speech...\", 0.80, is_gui, progress=progress)\n", " self.valid_speakers = audio_segmentation_to_voice(\n", " self.result_diarize,\n", " TRANSLATE_AUDIO_TO,\n", " is_gui,\n", " tts_voice00,\n", " tts_voice01,\n", " tts_voice02,\n", " tts_voice03,\n", " tts_voice04,\n", " tts_voice05,\n", " tts_voice06,\n", " tts_voice07,\n", " tts_voice08,\n", " tts_voice09,\n", " tts_voice10,\n", " tts_voice11,\n", " dereverb_automatic_xtts,\n", " )\n", "\n", " if not self.task_in_cache(\"acc_and_vc\", [\n", " max_accelerate_audio,\n", " acceleration_rate_regulation,\n", " 
voice_imitation,\n", " voice_imitation_max_segments,\n", " voice_imitation_remove_previous,\n", " voice_imitation_vocals_dereverb,\n", " voice_imitation_method,\n", " custom_voices,\n", " custom_voices_workers,\n", " copy.deepcopy(self.vci.model_config),\n", " avoid_overlap\n", " ], {\n", " \"valid_speakers\": self.valid_speakers\n", " }):\n", " audio_files, speakers_list = accelerate_segments(\n", " self.result_diarize,\n", " max_accelerate_audio,\n", " self.valid_speakers,\n", " acceleration_rate_regulation,\n", " )\n", "\n", " # Voice Imitation (Tone color converter)\n", " if voice_imitation:\n", " prog_disp(\n", " \"Voice Imitation...\", 0.85, is_gui, progress=progress\n", " )\n", " from soni_translate.text_to_speech import toneconverter\n", "\n", " try:\n", " toneconverter(\n", " copy.deepcopy(self.result_diarize),\n", " voice_imitation_max_segments,\n", " voice_imitation_remove_previous,\n", " voice_imitation_vocals_dereverb,\n", " voice_imitation_method,\n", " )\n", " except Exception as error:\n", " logger.error(str(error))\n", "\n", " # custom voice\n", " if custom_voices:\n", " prog_disp(\n", " \"Applying customized voices...\",\n", " 0.90,\n", " is_gui,\n", " progress=progress,\n", " )\n", "\n", " try:\n", " self.vci(\n", " audio_files,\n", " speakers_list,\n", " overwrite=True,\n", " parallel_workers=custom_voices_workers,\n", " )\n", " self.vci.unload_models()\n", " except Exception as error:\n", " logger.error(str(error))\n", "\n", " prog_disp(\n", " \"Creating final translated video...\",\n", " 0.95,\n", " is_gui,\n", " progress=progress,\n", " )\n", " remove_files(dub_audio_file)\n", " create_translated_audio(\n", " self.result_diarize,\n", " audio_files,\n", " dub_audio_file,\n", " False,\n", " avoid_overlap,\n", " )\n", "\n", " # Voiceless track, change with file\n", " hash_base_audio_wav = get_hash(base_audio_wav)\n", " if voiceless_track:\n", " if self.voiceless_id != hash_base_audio_wav:\n", " from soni_translate.mdx_net import 
process_uvr_task\n", "\n", " try:\n", " # voiceless_audio_file_dir = \"clean_song_output/voiceless\"\n", " remove_files(voiceless_audio_file)\n", " uvr_voiceless_audio_wav, _ = process_uvr_task(\n", " orig_song_path=base_audio_wav,\n", " song_id=\"voiceless\",\n", " only_voiceless=True,\n", " remove_files_output_dir=False,\n", " )\n", " copy_files(uvr_voiceless_audio_wav, \".\")\n", " base_audio_wav = voiceless_audio_file\n", " self.voiceless_id = hash_base_audio_wav\n", "\n", " except Exception as error:\n", " logger.error(str(error))\n", " else:\n", " base_audio_wav = voiceless_audio_file\n", "\n", " if not self.task_in_cache(\"mix_aud\", [\n", " mix_method_audio,\n", " volume_original_audio,\n", " volume_translated_audio,\n", " voiceless_track\n", " ], {}):\n", " # TYPE MIX AUDIO\n", " remove_files(mix_audio_file)\n", " command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex \"[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest\" -c:a libmp3lame {mix_audio_file}'\n", " command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex \"[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]\" -map [final] {mix_audio_file}'\n", " if mix_method_audio == \"Adjusting volumes and mixing audio\":\n", " # volume mix\n", " run_command(command_volume_mix)\n", " else:\n", " try:\n", " # background mix\n", " run_command(command_background_mix)\n", " except Exception as error_mix:\n", " # volume mix except\n", " logger.error(str(error_mix))\n", " run_command(command_volume_mix)\n", "\n", " if \"audio\" in output_type or is_audio_file(media_file):\n", " output = media_out(\n", " media_file,\n", " TRANSLATE_AUDIO_TO,\n", " video_output_name,\n", " \"wav\" if \"wav\" in output_type else (\n", " \"ogg\" if \"ogg\" in output_type else \"mp3\"\n", " ),\n", " file_obj=mix_audio_file,\n", " 
def hook_beta_processor(
    self,
    document,
    tgt_lang,
    translate_process,
    ori_lang,
    tts,
    name_final_file,
    custom_voices,
    custom_voices_workers,
    output_type,
    chunk_size,
    width,
    height,
    start_page,
    end_page,
    bcolor,
    is_gui,
    progress
):
    """Turn a paged document into a narrated video file.

    Called by ``multilingual_docs_conversion`` when the requested output
    type contains "videobook". The steps run strictly in this order:
    extract pages, translate, synthesize speech, optionally apply custom
    voices, rebuild timestamps, render the audio, render the video from the
    page images, merge both, and hand the result to ``media_out``.

    Args:
        document: Path of the document to convert.
        tgt_lang: Target language passed to translation, TTS and output.
        translate_process: Translation backend identifier.
        ori_lang: Source language passed to ``translate_text``.
        tts: TTS voice used for every segment.
        name_final_file: Output name forwarded to ``media_out``.
        custom_voices: When truthy, run ``self.vci`` over the audio files.
        custom_voices_workers: ``parallel_workers`` value for ``self.vci``.
        output_type: Output selector; "mkv" in it picks mkv, otherwise mp4.
        chunk_size: TTS segment size; a falsy value falls back to
            ``determine_chunk_size(tts)``.
        width, height, start_page, end_page, bcolor: Forwarded unchanged to
            ``doc_to_txtximg_pages``.
        is_gui: Forwarded to ``prog_disp`` and the TTS step.
        progress: Progress object forwarded to ``prog_disp``.

    Returns:
        Whatever ``media_out`` returns for the merged video.
    """
    prog_disp("Processing pages...", 0.10, is_gui, progress=progress)
    pages = doc_to_txtximg_pages(
        document, width, height, start_page, end_page, bcolor
    )
    # Translation works on fixed 1700-sized segments, independent of the
    # TTS chunk size chosen below.
    translation_segments = page_data_to_segments(pages, 1700)

    prog_disp("Translating...", 0.20, is_gui, progress=progress)
    translated = translate_text(
        translation_segments["segments"],
        tgt_lang,
        translate_process,
        chunk_size=0,
        source=ori_lang,
    )
    translation_segments["segments"] = translated
    tts_chunk_size = chunk_size or determine_chunk_size(tts)
    pages = update_page_data(translation_segments, pages)

    prog_disp("Text to speech...", 0.30, is_gui, progress=progress)
    speech_segments = page_data_to_segments(pages, tts_chunk_size)
    valid_speakers = audio_segmentation_to_voice(
        speech_segments, tgt_lang, is_gui, tts
    )

    # A rate of 1.0 is passed: this call is used for its returned file and
    # speaker lists (the original notes it fixes format / output folder).
    audio_files, speakers_list = accelerate_segments(
        speech_segments, 1.0, valid_speakers
    )

    if custom_voices:
        prog_disp(
            "Applying customized voices...", 0.60, is_gui, progress=progress
        )
        self.vci(
            audio_files,
            speakers_list,
            overwrite=True,
            parallel_workers=custom_voices_workers,
        )
        self.vci.unload_models()

    # Timestamps are recomputed from the generated audio files.
    speech_segments = fix_timestamps_docs(speech_segments, audio_files)
    narration_wav = "audio_book.wav"
    remove_files(narration_wav)

    prog_disp("Creating audio file...", 0.70, is_gui, progress=progress)
    create_translated_audio(
        speech_segments, audio_files, narration_wav, False
    )

    prog_disp("Creating video file...", 0.80, is_gui, progress=progress)
    silent_video = create_video_from_images(pages, speech_segments)

    prog_disp("Merging...", 0.90, is_gui, progress=progress)
    merged_video = merge_video_and_audio(silent_video, narration_wav)

    container = "mkv" if "mkv" in output_type else "mp4"
    output = media_out(
        document,
        tgt_lang,
        name_final_file,
        container,
        file_obj=merged_video,
    )
    logger.info(f"Done: {output}")
    return output


def multilingual_docs_conversion(
    self,
    string_text="",  # string
    document=None,  # doc path gui
    directory_input="",  # doc path
    origin_language="English (en)",
    target_language="English (en)",
    tts_voice00="en-US-EmmaMultilingualNeural-Female",
    name_final_file="",
    translate_process="google_translator",
    output_type="audio",
    chunk_size=None,
    custom_voices=False,
    custom_voices_workers=1,
    start_page=1,
    end_page=99999,
    width=1280,
    height=720,
    bcolor="dynamic",
    is_gui=False,
    progress=gr.Progress(),
):
    """Convert text or a document into translated text, audio or a videobook.

    Input resolution: an explicit ``document`` wins; otherwise
    ``directory_input`` is used when that path exists; otherwise
    ``string_text`` is treated as the raw text to process.

    Args:
        string_text: Raw text, used when no document or valid path is given.
        document: A path string or an object exposing ``.name``.
        directory_input: Filesystem path to a document.
        origin_language, target_language: Keys into ``LANGUAGES``.
        tts_voice00: TTS voice used for every segment.
        name_final_file: Output name forwarded to ``media_out``.
        translate_process: Translation backend; "disable_translation" keeps
            the source language.
        output_type: Output selector ("book (txt)", "videobook", or an audio
            type containing "mp3"/"ogg"; anything else yields wav).
        chunk_size: TTS segment size; a falsy value falls back to
            ``determine_chunk_size(tts_voice00)``.
        custom_voices, custom_voices_workers: Control the ``self.vci`` step.
        start_page, end_page: Page range forwarded to the preprocessors.
        width, height, bcolor: Videobook settings, used only on that path.
        is_gui, progress: Progress-reporting plumbing for ``prog_disp``.

    Returns:
        The preprocessed file path (text output without translation), or the
        value returned by ``media_out`` / ``hook_beta_processor``.

    Raises:
        Exception: When no input data could be resolved.
        ValueError: When a videobook is requested for a non-PDF document.
    """
    if "gpt" in translate_process:
        check_openai_api_key()

    translating = translate_process != "disable_translation"
    source_code = LANGUAGES[origin_language]
    if translating:
        target_code = LANGUAGES[target_language]
    else:
        target_code = source_code
        logger.info("No translation")

    voice_prefix = tts_voice00[:2].lower()
    if voice_prefix != target_code[:2].lower():
        logger.debug(
            "Make sure to select a 'TTS Speaker' suitable for the "
            "translation language to avoid errors with the TTS."
        )

    self.clear_cache(string_text, force=True)

    # Resolve the input: uploaded document > existing path > raw string.
    is_string = False
    if document is None:
        is_string = not os.path.exists(directory_input)
        document = string_text if is_string else directory_input
    if not isinstance(document, str):
        document = document.name
    if not document:
        raise Exception("No data found")

    if "videobook" in output_type:
        if not document.lower().endswith(".pdf"):
            raise ValueError(
                "Videobooks are only compatible with PDF files."
            )
        return self.hook_beta_processor(
            document,
            target_code,
            translate_process,
            source_code,
            tts_voice00,
            name_final_file,
            custom_voices,
            custom_voices_workers,
            output_type,
            chunk_size,
            width,
            height,
            start_page,
            end_page,
            bcolor,
            is_gui,
            progress
        )

    prog_disp("Processing text...", 0.15, is_gui, progress=progress)
    result_file_path, result_text = document_preprocessor(
        document, is_string, start_page, end_page
    )
    # Path that names the output: the generated file for raw-string input,
    # the document itself otherwise.
    reference_path = result_file_path if is_string else document
    wants_text_only = output_type == "book (txt)"

    if wants_text_only and not translating:
        return result_file_path

    if os.getenv("DEMO") == "SET_LIMIT":
        result_text = result_text[:50]
        logger.info(
            "DEMO; Generation is limited to 50 characters to prevent "
            "CPU errors. No limitations with GPU.\n"
        )

    if translating:
        # Text is split into 1700-sized segments for translation only.
        translation_segments = plain_text_to_segments(result_text, 1700)
        prog_disp("Translating...", 0.30, is_gui, progress=progress)
        translated = translate_text(
            translation_segments["segments"],
            target_code,
            translate_process,
            chunk_size=0,
            source=source_code,
        )
        translation_segments["segments"] = translated

        txt_file_path, result_text = segments_to_plain_text(
            translation_segments
        )

        if wants_text_only:
            return media_out(
                reference_path,
                target_code,
                name_final_file,
                "txt",
                file_obj=txt_file_path,
            )

    # Re-segment the (possibly translated) text to fit the TTS limits.
    tts_chunk_size = chunk_size or determine_chunk_size(tts_voice00)
    speech_segments = plain_text_to_segments(result_text, tts_chunk_size)
    logger.debug(speech_segments)

    prog_disp("Text to speech...", 0.45, is_gui, progress=progress)
    valid_speakers = audio_segmentation_to_voice(
        speech_segments, target_code, is_gui, tts_voice00
    )

    # A rate of 1.0 is passed: this call is used for its returned file and
    # speaker lists (the original notes it fixes format / output folder).
    audio_files, speakers_list = accelerate_segments(
        speech_segments, 1.0, valid_speakers
    )

    if custom_voices:
        prog_disp(
            "Applying customized voices...", 0.80, is_gui, progress=progress
        )
        self.vci(
            audio_files,
            speakers_list,
            overwrite=True,
            parallel_workers=custom_voices_workers,
        )
        self.vci.unload_models()

    prog_disp(
        "Creating final audio file...", 0.90, is_gui, progress=progress
    )
    final_wav_file = "audio_book.wav"
    remove_files(final_wav_file)
    create_translated_audio(
        speech_segments, audio_files, final_wav_file, True
    )

    # mp3 takes precedence over ogg; wav is the fallback.
    extension = "wav"
    for candidate in ("mp3", "ogg"):
        if candidate in output_type:
            extension = candidate
            break

    output = media_out(
        reference_path,
        target_code,
        name_final_file,
        extension,
        file_obj=final_wav_file,
    )

    logger.info(f"Done: {output}")

    return output
📽️ SoniTranslate 🈷️
def swap_visibility(data_type):
    """Show only the media input that matches the selected source type.

    Returns a 3-tuple of ``gr.update`` objects in the order the change
    handler wires them: file upload, URL textbox, path textbox. Every
    component's value is reset as well (``None`` for the file input, ``""``
    for the textboxes). An unrecognised ``data_type`` yields ``None``.
    """
    source_kinds = ("SUBMIT VIDEO", "URL", "Find Video Path")
    if data_type not in source_kinds:
        return None
    show_file, show_url, show_path = (
        data_type == kind for kind in source_kinds
    )
    return (
        gr.update(visible=show_file, value=None),
        gr.update(visible=show_url, value=""),
        gr.update(visible=show_path, value=""),
    )
label=lg_conf[\"tat_label\"],\n", " info=lg_conf[\"tat_info\"],\n", " )\n", "\n", " gr.HTML(\"
def submit(value):
    """Build one visibility update per TTS voice dropdown.

    The dropdown at position ``slot`` is visible only while ``slot`` is
    below ``value``. The list has ``MAX_TTS`` entries, in slot order.
    """
    return [gr.update(visible=slot < value) for slot in range(MAX_TTS)]
visible=False,\n", " interactive=True,\n", " )\n", " tts_voice07 = gr.Dropdown(\n", " SoniTr.tts_info.tts_list(),\n", " value=\"fr-FR-RemyMultilingualNeural-Male\",\n", " label=lg_conf[\"sk8\"],\n", " visible=False,\n", " interactive=True,\n", " )\n", " tts_voice08 = gr.Dropdown(\n", " SoniTr.tts_info.tts_list(),\n", " value=\"en-US-EmmaMultilingualNeural-Female\",\n", " label=lg_conf[\"sk9\"],\n", " visible=False,\n", " interactive=True,\n", " )\n", " tts_voice09 = gr.Dropdown(\n", " SoniTr.tts_info.tts_list(),\n", " value=\"en-US-AndrewMultilingualNeural-Male\",\n", " label=lg_conf[\"sk10\"],\n", " visible=False,\n", " interactive=True,\n", " )\n", " tts_voice10 = gr.Dropdown(\n", " SoniTr.tts_info.tts_list(),\n", " value=\"en-US-EmmaMultilingualNeural-Female\",\n", " label=lg_conf[\"sk11\"],\n", " visible=False,\n", " interactive=True,\n", " )\n", " tts_voice11 = gr.Dropdown(\n", " SoniTr.tts_info.tts_list(),\n", " value=\"en-US-AndrewMultilingualNeural-Male\",\n", " label=lg_conf[\"sk12\"],\n", " visible=False,\n", " interactive=True,\n", " )\n", " max_speakers.change(\n", " submit,\n", " max_speakers,\n", " [\n", " tts_voice00,\n", " tts_voice01,\n", " tts_voice02,\n", " tts_voice03,\n", " tts_voice04,\n", " tts_voice05,\n", " tts_voice06,\n", " tts_voice07,\n", " tts_voice08,\n", " tts_voice09,\n", " tts_voice10,\n", " tts_voice11,\n", " ],\n", " )\n", "\n", " with gr.Column():\n", " with gr.Accordion(\n", " lg_conf[\"vc_title\"],\n", " open=False,\n", " ):\n", " gr.Markdown(lg_conf[\"vc_subtitle\"])\n", " voice_imitation_gui = gr.Checkbox(\n", " False,\n", " label=lg_conf[\"vc_active_label\"],\n", " info=lg_conf[\"vc_active_info\"],\n", " )\n", " openvoice_models = [\"openvoice\", \"openvoice_v2\"]\n", " voice_imitation_method_options = (\n", " [\"freevc\"] + openvoice_models\n", " if SoniTr.tts_info.xtts_enabled\n", " else openvoice_models\n", " )\n", " voice_imitation_method_gui = gr.Dropdown(\n", " voice_imitation_method_options,\n", " 
value=voice_imitation_method_options[0],\n", " label=lg_conf[\"vc_method_label\"],\n", " info=lg_conf[\"vc_method_info\"],\n", " )\n", " voice_imitation_max_segments_gui = gr.Slider(\n", " label=lg_conf[\"vc_segments_label\"],\n", " info=lg_conf[\"vc_segments_info\"],\n", " value=3,\n", " step=1,\n", " minimum=1,\n", " maximum=10,\n", " visible=True,\n", " interactive=True,\n", " )\n", " voice_imitation_vocals_dereverb_gui = gr.Checkbox(\n", " False,\n", " label=lg_conf[\"vc_dereverb_label\"],\n", " info=lg_conf[\"vc_dereverb_info\"],\n", " )\n", " voice_imitation_remove_previous_gui = gr.Checkbox(\n", " True,\n", " label=lg_conf[\"vc_remove_label\"],\n", " info=lg_conf[\"vc_remove_info\"],\n", " )\n", "\n", " if SoniTr.tts_info.xtts_enabled:\n", " with gr.Column():\n", " with gr.Accordion(\n", " lg_conf[\"xtts_title\"],\n", " open=False,\n", " ):\n", " gr.Markdown(lg_conf[\"xtts_subtitle\"])\n", " wav_speaker_file = gr.File(\n", " label=lg_conf[\"xtts_file_label\"]\n", " )\n", " wav_speaker_name = gr.Textbox(\n", " label=lg_conf[\"xtts_name_label\"],\n", " value=\"\",\n", " info=lg_conf[\"xtts_name_info\"],\n", " placeholder=\"default_name\",\n", " lines=1,\n", " )\n", " wav_speaker_start = gr.Number(\n", " label=\"Time audio start\",\n", " value=0,\n", " visible=False,\n", " )\n", " wav_speaker_end = gr.Number(\n", " label=\"Time audio end\",\n", " value=0,\n", " visible=False,\n", " )\n", " wav_speaker_dir = gr.Textbox(\n", " label=\"Directory save\",\n", " value=\"_XTTS_\",\n", " visible=False,\n", " )\n", " wav_speaker_dereverb = gr.Checkbox(\n", " True,\n", " label=lg_conf[\"xtts_dereverb_label\"],\n", " info=lg_conf[\"xtts_dereverb_info\"]\n", " )\n", " wav_speaker_output = gr.HTML()\n", " create_xtts_wav = gr.Button(\n", " lg_conf[\"xtts_button\"]\n", " )\n", " gr.Markdown(lg_conf[\"xtts_footer\"])\n", " else:\n", " wav_speaker_dereverb = gr.Checkbox(\n", " False,\n", " label=lg_conf[\"xtts_dereverb_label\"],\n", " info=lg_conf[\"xtts_dereverb_info\"],\n", 
" visible=False\n", " )\n", "\n", " with gr.Column():\n", " with gr.Accordion(\n", " lg_conf[\"extra_setting\"], open=False\n", " ):\n", " audio_accelerate = gr.Slider(\n", " label=lg_conf[\"acc_max_label\"],\n", " value=1.9,\n", " step=0.1,\n", " minimum=1.0,\n", " maximum=2.5,\n", " visible=True,\n", " interactive=True,\n", " info=lg_conf[\"acc_max_info\"],\n", " )\n", " acceleration_rate_regulation_gui = gr.Checkbox(\n", " False,\n", " label=lg_conf[\"acc_rate_label\"],\n", " info=lg_conf[\"acc_rate_info\"],\n", " )\n", " avoid_overlap_gui = gr.Checkbox(\n", " False,\n", " label=lg_conf[\"or_label\"],\n", " info=lg_conf[\"or_info\"],\n", " )\n", "\n", " gr.HTML(\"
\")\n", "\n", " audio_mix_options = [\n", " \"Mixing audio with sidechain compression\",\n", " \"Adjusting volumes and mixing audio\",\n", " ]\n", " AUDIO_MIX = gr.Dropdown(\n", " audio_mix_options,\n", " value=audio_mix_options[1],\n", " label=lg_conf[\"aud_mix_label\"],\n", " info=lg_conf[\"aud_mix_info\"],\n", " )\n", " volume_original_mix = gr.Slider(\n", " label=lg_conf[\"vol_ori\"],\n", " info=\"for Adjusting volumes and mixing audio\",\n", " value=0.25,\n", " step=0.05,\n", " minimum=0.0,\n", " maximum=2.50,\n", " visible=True,\n", " interactive=True,\n", " )\n", " volume_translated_mix = gr.Slider(\n", " label=lg_conf[\"vol_tra\"],\n", " info=\"for Adjusting volumes and mixing audio\",\n", " value=1.80,\n", " step=0.05,\n", " minimum=0.0,\n", " maximum=2.50,\n", " visible=True,\n", " interactive=True,\n", " )\n", " main_voiceless_track = gr.Checkbox(\n", " label=lg_conf[\"voiceless_tk_label\"],\n", " info=lg_conf[\"voiceless_tk_info\"],\n", " )\n", "\n", " gr.HTML(\"
\")\n", " sub_type_options = [\n", " \"disable\",\n", " \"srt\",\n", " \"vtt\",\n", " \"ass\",\n", " \"txt\",\n", " \"tsv\",\n", " \"json\",\n", " \"aud\",\n", " ]\n", "\n", " sub_type_output = gr.Dropdown(\n", " sub_type_options,\n", " value=sub_type_options[1],\n", " label=lg_conf[\"sub_type\"],\n", " )\n", " soft_subtitles_to_video_gui = gr.Checkbox(\n", " label=lg_conf[\"soft_subs_label\"],\n", " info=lg_conf[\"soft_subs_info\"],\n", " )\n", " burn_subtitles_to_video_gui = gr.Checkbox(\n", " label=lg_conf[\"burn_subs_label\"],\n", " info=lg_conf[\"burn_subs_info\"],\n", " )\n", "\n", " gr.HTML(\"
\")\n", " gr.Markdown(lg_conf[\"whisper_title\"])\n", " literalize_numbers_gui = gr.Checkbox(\n", " True,\n", " label=lg_conf[\"lnum_label\"],\n", " info=lg_conf[\"lnum_info\"],\n", " )\n", " vocal_refinement_gui = gr.Checkbox(\n", " False,\n", " label=lg_conf[\"scle_label\"],\n", " info=lg_conf[\"scle_info\"],\n", " )\n", " segment_duration_limit_gui = gr.Slider(\n", " label=lg_conf[\"sd_limit_label\"],\n", " info=lg_conf[\"sd_limit_info\"],\n", " value=15,\n", " step=1,\n", " minimum=1,\n", " maximum=30,\n", " )\n", " whisper_model_default = (\n", " \"large-v3\"\n", " if SoniTr.device == \"cuda\"\n", " else \"medium\"\n", " )\n", "\n", " WHISPER_MODEL_SIZE = gr.Dropdown(\n", " ASR_MODEL_OPTIONS + find_whisper_models(),\n", " value=whisper_model_default,\n", " label=\"Whisper ASR model\",\n", " info=lg_conf[\"asr_model_info\"],\n", " allow_custom_value=True,\n", " )\n", " com_t_opt, com_t_default = (\n", " [COMPUTE_TYPE_GPU, \"float16\"]\n", " if SoniTr.device == \"cuda\"\n", " else [COMPUTE_TYPE_CPU, \"float32\"]\n", " )\n", " compute_type = gr.Dropdown(\n", " com_t_opt,\n", " value=com_t_default,\n", " label=lg_conf[\"ctype_label\"],\n", " info=lg_conf[\"ctype_info\"],\n", " )\n", " batch_size = gr.Slider(\n", " minimum=1,\n", " maximum=32,\n", " value=8,\n", " label=lg_conf[\"batchz_label\"],\n", " info=lg_conf[\"batchz_info\"],\n", " step=1,\n", " )\n", " input_srt = gr.File(\n", " label=lg_conf[\"srt_file_label\"],\n", " file_types=[\".srt\", \".ass\", \".vtt\"],\n", " height=130,\n", " )\n", "\n", " gr.HTML(\"
\")\n", " text_segmentation_options = [\n", " \"sentence\",\n", " \"word\",\n", " \"character\"\n", " ]\n", " text_segmentation_scale_gui = gr.Dropdown(\n", " text_segmentation_options,\n", " value=text_segmentation_options[0],\n", " label=lg_conf[\"tsscale_label\"],\n", " info=lg_conf[\"tsscale_info\"],\n", " )\n", " divide_text_segments_by_gui = gr.Textbox(\n", " label=lg_conf[\"divide_text_label\"],\n", " value=\"\",\n", " info=lg_conf[\"divide_text_info\"],\n", " )\n", "\n", " gr.HTML(\"
\")\n", " pyannote_models_list = list(\n", " diarization_models.keys()\n", " )\n", " diarization_process_dropdown = gr.Dropdown(\n", " pyannote_models_list,\n", " value=pyannote_models_list[1],\n", " label=lg_conf[\"diarization_label\"],\n", " )\n", " translate_process_dropdown = gr.Dropdown(\n", " TRANSLATION_PROCESS_OPTIONS,\n", " value=TRANSLATION_PROCESS_OPTIONS[0],\n", " label=lg_conf[\"tr_process_label\"],\n", " )\n", "\n", " gr.HTML(\"
def visible_component_subs(input_bool):
    """Return two visibility updates that follow the truthiness of the flag.

    The change handler applies them to the subtitle button and the subtitle
    editor textbox, in that order.
    """
    shown = True if input_bool else False
    return gr.update(visible=shown), gr.update(visible=shown)
gr.Video()\n", "\n", " gr.HTML(\"
def swap_visibility(data_type):
    """Show only the document input that matches the selected source type.

    Returns a 3-tuple of ``gr.update`` objects in the order the change
    handler wires them: text box, file upload, path textbox. Every
    component's value is reset as well (``""`` for the textboxes, ``None``
    for the file input). An unrecognised ``data_type`` yields ``None``.
    """
    source_kinds = ("WRITE TEXT", "SUBMIT DOCUMENT", "Find Document Path")
    if data_type not in source_kinds:
        return None
    show_text, show_file, show_path = (
        data_type == kind for kind in source_kinds
    )
    return (
        gr.update(visible=show_text, value=""),
        gr.update(visible=show_file, value=None),
        gr.update(visible=show_path, value=""),
    )
\")\n", "\n", " docs_output_type = gr.Dropdown(\n", " DOCS_OUTPUT_TYPE_OPTIONS,\n", " value=DOCS_OUTPUT_TYPE_OPTIONS[2],\n", " label=\"Output type\",\n", " )\n", " docs_OUTPUT_NAME = gr.Textbox(\n", " label=\"Final file name\",\n", " value=\"\",\n", " info=lg_conf[\"out_name_info\"],\n", " )\n", " docs_chunk_size = gr.Number(\n", " label=lg_conf[\"chunk_size_label\"],\n", " value=0,\n", " visible=True,\n", " interactive=True,\n", " info=lg_conf[\"chunk_size_info\"],\n", " )\n", " gr.HTML(\"
\")\n", " start_page_gui = gr.Number(\n", " step=1,\n", " value=1,\n", " minimum=1,\n", " maximum=99999,\n", " label=\"Start page\",\n", " )\n", " end_page_gui = gr.Number(\n", " step=1,\n", " value=99999,\n", " minimum=1,\n", " maximum=99999,\n", " label=\"End page\",\n", " )\n", " gr.HTML(\"
def update_models():
    """Refresh the choices of every R.V.C. model and index dropdown.

    Re-scans the available files through ``upload_model_list()`` and builds
    one update per dropdown: ``MAX_TTS`` per-speaker widgets plus the extra
    one used by the "Test R.V.C." panel, hence ``MAX_TTS + 1``.

    Returns:
        A flat list holding ``MAX_TTS + 1`` model updates followed by
        ``MAX_TTS + 1`` index updates. Index updates also clear the current
        selection (``value=None``).
    """
    model_files, index_files = upload_model_list()
    slot_count = MAX_TTS + 1

    model_updates = [
        gr.update(choices=model_files) for _ in range(slot_count)
    ]
    index_updates = [
        gr.update(choices=index_files, value=None)
        for _ in range(slot_count)
    ]
    return model_updates + index_updates
# Pitch extraction algorithms offered in the voice-conversion settings.
PITCH_ALGO_OPT = ["pm", "harvest", "crepe", "rmvpe", "rmvpe+"]


def model_conf():
    """Create the "Model" dropdown; choices come from ``models_path``."""
    widget_options = {
        "label": "Model",
        "visible": True,
        "interactive": True,
    }
    return gr.Dropdown(models_path, **widget_options)


def pitch_algo_conf():
    """Create the "Pitch algorithm" dropdown, preselecting ``PITCH_ALGO_OPT[3]``."""
    widget_options = {
        "value": PITCH_ALGO_OPT[3],
        "label": "Pitch algorithm",
        "visible": True,
        "interactive": True,
    }
    return gr.Dropdown(PITCH_ALGO_OPT, **widget_options)


def pitch_lvl_conf():
    """Create the "Pitch level" slider: integer steps in [-24, 24], default 0."""
    widget_options = {
        "label": "Pitch level",
        "minimum": -24,
        "maximum": 24,
        "step": 1,
        "value": 0,
        "visible": True,
        "interactive": True,
    }
    return gr.Slider(**widget_options)


def index_conf():
    """Create the "Index" dropdown; choices come from ``index_path``, nothing preselected."""
    widget_options = {
        "value": None,
        "label": "Index",
        "visible": True,
        "interactive": True,
    }
    return gr.Dropdown(index_path, **widget_options)


def index_inf_conf():
    """Create the "Index influence" slider in [0, 1], default 0.75."""
    widget_options = {
        "minimum": 0,
        "maximum": 1,
        "label": "Index influence",
        "value": 0.75,
    }
    return gr.Slider(**widget_options)


def respiration_filter_conf():
    """Create the "Respiration median filtering" slider: integer steps in [0, 7], default 3."""
    widget_options = {
        "minimum": 0,
        "maximum": 7,
        "label": "Respiration median filtering",
        "value": 3,
        "step": 1,
        "interactive": True,
    }
    return gr.Slider(**widget_options)


def envelope_ratio_conf():
    """Create the "Envelope ratio" slider in [0, 1], default 0.25."""
    widget_options = {
        "minimum": 0,
        "maximum": 1,
        "label": "Envelope ratio",
        "value": 0.25,
        "interactive": True,
    }
    return gr.Slider(**widget_options)


def consonant_protec_conf():
    """Create the "Consonant breath protection" slider in [0, 0.5], default 0.5."""
    widget_options = {
        "minimum": 0,
        "maximum": 0.5,
        "label": "Consonant breath protection",
        "value": 0.5,
        "interactive": True,
    }
    return gr.Slider(**widget_options)


def button_conf(tts_name):
    """Create the primary "apply" button for one TTS speaker.

    Args:
        tts_name: Speaker tag appended to the localized button caption.
    """
    caption = " ".join((lg_conf["cv_button_apply"], tts_name))
    return gr.Button(caption, variant="primary")
{:02d}'.format(i) for i in range(1, MAX_TTS+1)\n", " ]\n", "\n", " CV_SUBTITLES = [\n", " lg_conf[\"cv_tts1\"],\n", " lg_conf[\"cv_tts2\"],\n", " lg_conf[\"cv_tts3\"],\n", " lg_conf[\"cv_tts4\"],\n", " lg_conf[\"cv_tts5\"],\n", " lg_conf[\"cv_tts6\"],\n", " lg_conf[\"cv_tts7\"],\n", " lg_conf[\"cv_tts8\"],\n", " lg_conf[\"cv_tts9\"],\n", " lg_conf[\"cv_tts10\"],\n", " lg_conf[\"cv_tts11\"],\n", " lg_conf[\"cv_tts12\"],\n", " ]\n", "\n", " configs_storage = []\n", "\n", " for i in range(MAX_TTS): # Loop from 00 to 11\n", " with gr.Accordion(CV_SUBTITLES[i], open=False):\n", " gr.Markdown(TTS_TABS[i])\n", " with gr.Column():\n", " tag_gui = gr.Textbox(\n", " value=TTS_TABS[i], visible=False\n", " )\n", " model_gui = model_conf()\n", " pitch_algo_gui = pitch_algo_conf()\n", " pitch_lvl_gui = pitch_lvl_conf()\n", " index_gui = index_conf()\n", " index_inf_gui = index_inf_conf()\n", " rmf_gui = respiration_filter_conf()\n", " er_gui = envelope_ratio_conf()\n", " cbp_gui = consonant_protec_conf()\n", "\n", " with gr.Row(variant=\"compact\"):\n", " button_config = button_conf(\n", " TTS_TABS[i]\n", " )\n", "\n", " confirm_conf = gr.HTML()\n", "\n", " button_config.click(\n", " SoniTr.vci.apply_conf,\n", " inputs=[\n", " tag_gui,\n", " model_gui,\n", " pitch_algo_gui,\n", " pitch_lvl_gui,\n", " index_gui,\n", " index_inf_gui,\n", " rmf_gui,\n", " er_gui,\n", " cbp_gui,\n", " ],\n", " outputs=[confirm_conf],\n", " )\n", "\n", " configs_storage.append({\n", " \"tag\": tag_gui,\n", " \"model\": model_gui,\n", " \"index\": index_gui,\n", " })\n", "\n", " with gr.Column():\n", " with gr.Accordion(\"Test R.V.C.\", open=False):\n", " with gr.Row(variant=\"compact\"):\n", " text_test = gr.Textbox(\n", " label=\"Text\",\n", " value=\"This is an example\",\n", " info=\"write a text\",\n", " placeholder=\"...\",\n", " lines=5,\n", " )\n", " with gr.Column():\n", " tts_test = gr.Dropdown(\n", " sorted(SoniTr.tts_info.list_edge),\n", " value=\"en-GB-ThomasNeural-Male\",\n", " 
class Logger:
    """Tee-style stand-in for ``sys.stdout`` that also writes to a log file.

    Every message is forwarded to the stream that was installed as
    ``sys.stdout`` when the instance was created and is additionally
    appended to ``filename`` (opened in ``"w"`` mode, so the file is
    truncated on creation).

    Improvements over a bare tee:
      * If ``sys.stdout`` is already a tee of this kind (for example the GUI
        was built twice in the same kernel), the innermost real stream is
        used as the terminal. Without this, loggers stack and each message
        is written to the log file once per layer.
      * Attributes not defined here (``encoding``, ``fileno`` ...) are
        delegated to the wrapped terminal stream instead of raising
        ``AttributeError``.
      * ``write`` returns the terminal's result, as text streams do.
      * ``close`` releases the log file; afterwards the object keeps
        working as a plain pass-through to the terminal.

    Args:
        filename: Path of the log file to create/truncate.
    """

    def __init__(self, filename):
        # Walk down through any previously installed tee objects so that we
        # always wrap the real terminal stream, never another Logger.
        stream = sys.stdout
        while hasattr(stream, "terminal") and hasattr(stream, "log"):
            stream = stream.terminal
        self.terminal = stream
        self.log = open(filename, "w")

    def write(self, message):
        """Write ``message`` to the terminal and, while open, to the log file."""
        written = self.terminal.write(message)
        if not self.log.closed:
            self.log.write(message)
        return written

    def flush(self):
        """Flush the terminal and, while open, the log file."""
        self.terminal.flush()
        if not self.log.closed:
            self.log.flush()

    def isatty(self):
        """Always report a non-interactive stream."""
        return False

    def close(self):
        """Close the log file only; the wrapped terminal is left untouched."""
        if not self.log.closed:
            self.log.close()

    def __getattr__(self, name):
        # Only reached when normal lookup fails. Refuse our own attributes so
        # a half-initialised instance cannot recurse forever.
        if name in ("terminal", "log"):
            raise AttributeError(name)
        return getattr(self.terminal, name)
tts_voice11,\n", " VIDEO_OUTPUT_NAME,\n", " AUDIO_MIX,\n", " audio_accelerate,\n", " acceleration_rate_regulation_gui,\n", " volume_original_mix,\n", " volume_translated_mix,\n", " sub_type_output,\n", " edit_sub_check, # TRUE BY DEFAULT\n", " dummy_false_check, # dummy false\n", " subs_edit_space,\n", " avoid_overlap_gui,\n", " vocal_refinement_gui,\n", " literalize_numbers_gui,\n", " segment_duration_limit_gui,\n", " diarization_process_dropdown,\n", " translate_process_dropdown,\n", " input_srt,\n", " main_output_type,\n", " main_voiceless_track,\n", " voice_imitation_gui,\n", " voice_imitation_max_segments_gui,\n", " voice_imitation_vocals_dereverb_gui,\n", " voice_imitation_remove_previous_gui,\n", " voice_imitation_method_gui,\n", " wav_speaker_dereverb,\n", " text_segmentation_scale_gui,\n", " divide_text_segments_by_gui,\n", " soft_subtitles_to_video_gui,\n", " burn_subtitles_to_video_gui,\n", " enable_cache_gui,\n", " enable_custom_voice,\n", " workers_custom_voice,\n", " is_gui_dummy_check,\n", " ],\n", " outputs=subs_edit_space,\n", " ).then(\n", " play_sound_alert, [play_sound_gui], [sound_alert_notification]\n", " )\n", "\n", " # Run translate tts and complete\n", " video_button.click(\n", " SoniTr.batch_multilingual_media_conversion,\n", " inputs=[\n", " video_input,\n", " blink_input,\n", " directory_input,\n", " HFKEY,\n", " PREVIEW,\n", " WHISPER_MODEL_SIZE,\n", " batch_size,\n", " compute_type,\n", " SOURCE_LANGUAGE,\n", " TRANSLATE_AUDIO_TO,\n", " min_speakers,\n", " max_speakers,\n", " tts_voice00,\n", " tts_voice01,\n", " tts_voice02,\n", " tts_voice03,\n", " tts_voice04,\n", " tts_voice05,\n", " tts_voice06,\n", " tts_voice07,\n", " tts_voice08,\n", " tts_voice09,\n", " tts_voice10,\n", " tts_voice11,\n", " VIDEO_OUTPUT_NAME,\n", " AUDIO_MIX,\n", " audio_accelerate,\n", " acceleration_rate_regulation_gui,\n", " volume_original_mix,\n", " volume_translated_mix,\n", " sub_type_output,\n", " dummy_false_check,\n", " edit_sub_check,\n", " 
def get_language_config(language_data, language=None, base_key="english"):
    """Return the interface strings for ``language`` with base-language fallbacks.

    Keys missing from the selected language are filled in from the
    ``base_key`` entry. The result is a new dict: ``language_data`` and its
    nested dicts are never modified (the previous implementation updated
    the selected language's dict in place).

    Args:
        language_data: Mapping of language name -> dict of UI strings.
        language: Name of the requested language.
        base_key: Name of the fallback language.

    Returns:
        The merged dict for ``language``. If ``language`` is not a key of
        ``language_data``, an error is logged and the ``base_key`` entry is
        returned as stored (``None`` if that entry does not exist either).
    """
    base_lang = language_data.get(base_key)

    if language not in language_data:
        logger.error(
            f"Language {language} not found, defaulting to {base_key}"
        )
        return base_lang

    # Copy first so the caller's data stays untouched, then add only the
    # keys the selected language does not define itself.
    merged = dict(language_data[language] or {})
    for key, value in (base_lang or {}).items():
        merged.setdefault(key, value)

    return merged


def create_parser():
    """Build the command-line parser for the app.

    Options: ``--theme``, ``--public_url``, ``--logs_in_gui``,
    ``--verbosity_level``, ``--language`` and ``--cpu_mode``. Defaults are
    shown in ``--help`` through ``ArgumentDefaultsHelpFormatter``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--theme",
        type=str,
        default="Taithrah/Minimal",
        help=(
            "Specify the theme; find themes in "
            "https://huggingface.co/spaces/gradio/theme-gallery;"
            " Example: --theme aliabid94/new-theme"
        ),
    )
    parser.add_argument(
        "--public_url",
        action="store_true",
        default=False,
        help="Enable public link",
    )
    parser.add_argument(
        "--logs_in_gui",
        action="store_true",
        default=False,
        help="Displays the operations performed in Logs",
    )
    parser.add_argument(
        "--verbosity_level",
        type=str,
        default="info",
        help=(
            "Set logger verbosity level: "
            "debug, info, warning, error, or critical"
        ),
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        help=" Select the language of the interface: english, spanish",
    )
    parser.add_argument(
        "--cpu_mode",
        action="store_true",
        default=False,
        help="Enable CPU mode to run the program without utilizing GPU acceleration.",
    )
    return parser


if __name__ == "__main__":

    parser = create_parser()

    # The notebook has no real command line: the form fields defined at the
    # top of this cell (theme_var, verbosity_level_var,
    # interface_language_var) are turned into an argv-style list. The list is
    # built element by element rather than by splitting a formatted string,
    # so a value that is empty or contains whitespace still reaches argparse
    # as a single argument.
    args_list = [
        "--theme", theme_var,
        "--verbosity_level", verbosity_level_var,
        "--language", interface_language_var,
    ]
    args = parser.parse_args(args_list)

    set_logging_level(args.verbosity_level)

    # Fetch every model listed in UVR_MODELS into mdxnet_models_dir.
    for id_model in UVR_MODELS:
        download_manager(
            os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
        )

    # Module-level names read by the widget factories inside create_gui.
    models_path, index_path = upload_model_list()

    SoniTr = SoniTranslate(cpu_mode=args.cpu_mode)

    lg_conf = get_language_config(language_data, language=args.language)

    app = create_gui(args.theme, logs_in_gui=args.logs_in_gui)

    app.queue()

    # Clear the cell output produced so far before the interface is shown.
    from IPython.display import clear_output
    clear_output()

    app.launch(
        max_threads=6,
        # share=args.public_url,
        show_error=True,
        quiet=False,
        debug=True,
    )