|
import os |
|
import torch |
|
import gradio as gr |
|
from huggingface_hub import hf_hub_download |
|
import langid |
|
from openvoice.api import BaseSpeakerTTS, ToneColorConverter |
|
import openvoice.se_extractor as se_extractor |
|
|
|
|
|
# Root directory for model checkpoints; override via the CHECKPOINT_PATH env var.
CKPT_BASE_PATH = os.getenv('CHECKPOINT_PATH', './checkpoints')

# English base-speaker assets (TTS checkpoint/config + source speaker embeddings).
EN_SUFFIX = f"{CKPT_BASE_PATH}/base_speakers/EN"

# Tone-color converter checkpoint/config location.
CONVERTER_SUFFIX = f"{CKPT_BASE_PATH}/converter"

# Generated and intermediate audio files are written here.
OUTPUT_DIR = "./outputs"


# Create both directories up front so later downloads/writes never fail on a
# missing parent directory.
os.makedirs(CKPT_BASE_PATH, exist_ok=True)

os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
def download_files():
    """Download converter and EN base-speaker checkpoints from the Hub.

    Each entry pairs the expected on-disk path with its repo-relative path;
    ``hf_hub_download`` with ``local_dir=CKPT_BASE_PATH`` preserves the
    repo-relative layout, so both paths refer to the same file. Files that
    already exist locally are skipped, so repeated startups do not hit the
    network. Any download failure is logged and re-raised so startup aborts
    loudly rather than continuing with missing checkpoints.
    """
    files_to_download = [
        (f"{CONVERTER_SUFFIX}/checkpoint.pth", "converter/checkpoint.pth"),
        (f"{CONVERTER_SUFFIX}/config.json", "converter/config.json"),
        (f"{EN_SUFFIX}/checkpoint.pth", "base_speakers/EN/checkpoint.pth"),
        (f"{EN_SUFFIX}/config.json", "base_speakers/EN/config.json"),
        (f"{EN_SUFFIX}/en_default_se.pth", "base_speakers/EN/en_default_se.pth"),
        (f"{EN_SUFFIX}/en_style_se.pth", "base_speakers/EN/en_style_se.pth"),
    ]

    for local_path, remote_path in files_to_download:
        # Skip files already present (e.g. from a previous run) to avoid
        # redundant Hub round-trips on every startup.
        if os.path.exists(local_path):
            continue
        try:
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            hf_hub_download(
                repo_id="myshell-ai/OpenVoice",
                filename=remote_path,
                local_dir=CKPT_BASE_PATH,
            )
        except Exception as e:
            print(f"Error downloading {remote_path}: {e}")
            raise
|
|
|
|
|
# Fetch all checkpoints before constructing any models.
download_files()


try:
    # CPU keeps the demo runnable on machines without CUDA.
    pt_device = "cpu"

    # English base TTS: synthesizes speech in the source speaker's voice.
    en_base_speaker_tts = BaseSpeakerTTS(f"{EN_SUFFIX}/config.json", device=pt_device)
    en_base_speaker_tts.load_ckpt(f"{EN_SUFFIX}/checkpoint.pth")

    # Tone-color converter: transfers a reference speaker's timbre onto the
    # synthesized audio.
    tone_color_converter = ToneColorConverter(f"{CONVERTER_SUFFIX}/config.json", device=pt_device)
    tone_color_converter.load_ckpt(f"{CONVERTER_SUFFIX}/checkpoint.pth")

    # map_location keeps CPU-only hosts working even if these embedding
    # tensors were serialized from a GPU device.
    en_source_default_se = torch.load(f"{EN_SUFFIX}/en_default_se.pth", map_location=pt_device)
    en_source_style_se = torch.load(f"{EN_SUFFIX}/en_style_se.pth", map_location=pt_device)
except Exception as model_init_error:
    # Abort startup loudly: a half-initialized app would only fail later
    # with a more confusing error inside predict().
    print(f"Model initialization error: {model_init_error}")
    raise
|
|
|
def predict(prompt, style, audio_file_pth, tau):
    """Clone the reference speaker's voice onto synthesized English speech.

    Args:
        prompt: Text to speak; must be 2-200 characters.
        style: Base-speaker style ("default" or one of the emotion styles).
        audio_file_pth: Path to the reference audio whose tone is cloned.
        tau: Tone-conversion strength forwarded to the converter.

    Returns:
        Tuple of (status message, path to the output wav or None on failure).
    """
    # Validate UI input before any model work. Gradio delivers None for an
    # empty textbox, which would crash the original len() check.
    if not prompt or not 2 <= len(prompt) <= 200:
        return "Text should be between 2 and 200 characters.", None

    # Fail fast with a clear message instead of a cryptic extractor error
    # when no reference clip was uploaded.
    if not audio_file_pth:
        return "Please provide a reference audio file.", None

    try:
        # Extract the target speaker embedding (tone color) from the
        # reference clip; VAD trims leading/trailing non-speech first.
        target_se, _ = se_extractor.get_se(
            audio_file_pth,
            tone_color_converter,
            target_dir=OUTPUT_DIR,
            vad=True
        )
    except Exception as e:
        return f"Error extracting tone: {str(e)}", None

    try:
        # Step 1: synthesize the prompt with the English base speaker.
        src_path = f"{OUTPUT_DIR}/tmp.wav"
        en_base_speaker_tts.tts(prompt, src_path, speaker=style, language="English")

        # Step 2: convert the base speaker's tone color to the target's.
        # The "default" style has its own source embedding; all other
        # styles share the style embedding.
        save_path = f"{OUTPUT_DIR}/output.wav"
        tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=en_source_style_se if style != "default" else en_source_default_se,
            tgt_se=target_se,
            output_path=save_path,
            tau=tau
        )

        return "Voice cloning completed successfully.", save_path
    except Exception as conversion_error:
        return f"Voice conversion error: {conversion_error}", None
|
|
|
def create_demo():
    """Assemble and return the Gradio Blocks UI for the voice-cloning demo."""
    with gr.Blocks() as ui:
        gr.Markdown("# OpenVoice: Instant Voice Cloning")

        # Row 1: what to say and in which base style.
        with gr.Row():
            prompt_box = gr.Textbox(
                label="Text to speak",
                placeholder="Enter text (2-200 chars)",
            )
            style_dropdown = gr.Dropdown(
                label="Style",
                choices=[
                    "default",
                    "whispering",
                    "cheerful",
                    "terrified",
                    "angry",
                    "sad",
                    "friendly",
                ],
                value="default",
            )

        # Row 2: whose voice to clone and how strongly.
        with gr.Row():
            ref_audio = gr.Audio(label="Reference Audio", type="filepath")
            similarity_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                label="Voice Similarity",
                info="Higher values = more similar to reference",
            )

        generate_btn = gr.Button("Generate Voice")

        # Outputs: a status line plus the generated clip.
        status_box = gr.Textbox(label="Status")
        result_audio = gr.Audio(label="Generated Audio")

        generate_btn.click(
            predict,
            inputs=[prompt_box, style_dropdown, ref_audio, similarity_slider],
            outputs=[status_box, result_audio],
        )

    return ui
|
|
|
|
|
demo = create_demo() |