# OpenVoice instant voice-cloning demo (Gradio app for Hugging Face Spaces).
import os
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import langid
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
import openvoice.se_extractor as se_extractor
# Use environment variables or predefined paths
# Root directory for model checkpoints; overridable via the CHECKPOINT_PATH env var.
CKPT_BASE_PATH = os.getenv('CHECKPOINT_PATH', './checkpoints')
# English base-speaker TTS assets (config, checkpoint, speaker embeddings).
EN_SUFFIX = f"{CKPT_BASE_PATH}/base_speakers/EN"
# Tone-color converter assets (config + checkpoint).
CONVERTER_SUFFIX = f"{CKPT_BASE_PATH}/converter"
# All generated audio (intermediate TTS output and final converted audio) is written here.
OUTPUT_DIR = "./outputs"
# Ensure directories exist
os.makedirs(CKPT_BASE_PATH, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
def download_files():
    """Download all required OpenVoice checkpoints from the Hugging Face Hub.

    Fetches the tone-color converter and English base-speaker files into
    CKPT_BASE_PATH, preserving the repo's directory layout — which is exactly
    the layout the EN_SUFFIX / CONVERTER_SUFFIX constants expect.

    Raises:
        Exception: re-raised from hf_hub_download after logging, so a failed
            download aborts startup instead of failing obscurely later.
    """
    # Repo-relative filenames; hf_hub_download mirrors this layout under
    # local_dir and creates intermediate directories itself, so no separate
    # local-path bookkeeping or makedirs calls are needed.
    required_files = [
        "converter/checkpoint.pth",
        "converter/config.json",
        "base_speakers/EN/checkpoint.pth",
        "base_speakers/EN/config.json",
        "base_speakers/EN/en_default_se.pth",
        "base_speakers/EN/en_style_se.pth",
    ]
    for remote_path in required_files:
        try:
            hf_hub_download(
                repo_id="myshell-ai/OpenVoice",
                filename=remote_path,
                local_dir=CKPT_BASE_PATH
            )
        except Exception as e:
            print(f"Error downloading {remote_path}: {e}")
            raise
# Download files early so model initialization below can rely on them.
download_files()
# Model Initialization with Error Handling
try:
    pt_device = "cpu"  # Explicitly use CPU for Hugging Face deployment
    en_base_speaker_tts = BaseSpeakerTTS(f"{EN_SUFFIX}/config.json", device=pt_device)
    en_base_speaker_tts.load_ckpt(f"{EN_SUFFIX}/checkpoint.pth")
    tone_color_converter = ToneColorConverter(f"{CONVERTER_SUFFIX}/config.json", device=pt_device)
    tone_color_converter.load_ckpt(f"{CONVERTER_SUFFIX}/checkpoint.pth")
    # map_location keeps GPU-saved speaker embeddings loadable on this
    # CPU-only host; without it torch.load fails on CUDA-serialized tensors.
    en_source_default_se = torch.load(f"{EN_SUFFIX}/en_default_se.pth", map_location=pt_device)
    en_source_style_se = torch.load(f"{EN_SUFFIX}/en_style_se.pth", map_location=pt_device)
except Exception as model_init_error:
    print(f"Model initialization error: {model_init_error}")
    raise
def predict(prompt, style, audio_file_pth, tau):
    """Clone the voice from a reference clip and speak *prompt* with it.

    Args:
        prompt: Text to synthesize (must be 2-200 characters).
        style: Base-speaker style name; any value other than "default"
            selects the style speaker embedding as the conversion source.
        audio_file_pth: Filesystem path to the reference audio clip.
        tau: Tone-conversion strength forwarded to the converter.

    Returns:
        Tuple of (status_message, audio_path); audio_path is None on failure.
    """
    # Guard against a cleared Gradio textbox (None) as well as bad lengths —
    # len(None) would raise TypeError before validation could report anything.
    if prompt is None or len(prompt) < 2 or len(prompt) > 200:
        return "Text should be between 2 and 200 characters.", None
    try:
        # Extract the target speaker's tone-color embedding from the reference.
        target_se, _ = se_extractor.get_se(
            audio_file_pth,
            tone_color_converter,
            target_dir=OUTPUT_DIR,
            vad=True
        )
    except Exception as e:
        return f"Error extracting tone: {str(e)}", None
    try:
        # Step 1: synthesize the prompt with the English base speaker.
        src_path = f"{OUTPUT_DIR}/tmp.wav"
        en_base_speaker_tts.tts(prompt, src_path, speaker=style, language="English")
        # Step 2: re-color the synthesized audio with the target embedding.
        save_path = f"{OUTPUT_DIR}/output.wav"
        tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=en_source_style_se if style != "default" else en_source_default_se,
            tgt_se=target_se,
            output_path=save_path,
            tau=tau
        )
        return "Voice cloning completed successfully.", save_path
    except Exception as conversion_error:
        return f"Voice conversion error: {conversion_error}", None
def create_demo():
    """Build and return the Gradio Blocks UI for the voice-cloning demo."""
    style_choices = ["default", "whispering", "cheerful", "terrified", "angry", "sad", "friendly"]
    with gr.Blocks() as demo:
        gr.Markdown("# OpenVoice: Instant Voice Cloning")
        # Row 1: text to speak plus the base-speaker style selector.
        with gr.Row():
            text_input = gr.Textbox(label="Text to speak", placeholder="Enter text (2-200 chars)")
            style_dropdown = gr.Dropdown(
                label="Style",
                choices=style_choices,
                value="default"
            )
        # Row 2: reference clip and how strongly to match its tone.
        with gr.Row():
            ref_audio = gr.Audio(label="Reference Audio", type="filepath")
            similarity_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                label="Voice Similarity",
                info="Higher values = more similar to reference"
            )
        generate_btn = gr.Button("Generate Voice")
        status_box = gr.Textbox(label="Status")
        audio_output = gr.Audio(label="Generated Audio")
        # Wire the button to the prediction pipeline.
        generate_btn.click(
            predict,
            inputs=[text_input, style_dropdown, ref_audio, similarity_slider],
            outputs=[status_box, audio_output]
        )
    return demo
# Hugging Face Space compatibility: Spaces serves the module-level `demo`.
demo = create_demo()

if __name__ == "__main__":
    # Also allow running locally with `python app.py`.
    demo.launch()