"""Gradio app for OpenVoice instant voice cloning (CPU-only, Hugging Face Space)."""

import os

import gradio as gr
import langid  # noqa: F401 -- kept: present in original; may be used by openvoice internals
import torch
from huggingface_hub import hf_hub_download

import openvoice.se_extractor as se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

# Paths are configurable via CHECKPOINT_PATH so local runs and Spaces can differ.
CKPT_BASE_PATH = os.getenv("CHECKPOINT_PATH", "./checkpoints")
EN_SUFFIX = f"{CKPT_BASE_PATH}/base_speakers/EN"
CONVERTER_SUFFIX = f"{CKPT_BASE_PATH}/converter"
OUTPUT_DIR = "./outputs"

# Ensure directories exist
os.makedirs(CKPT_BASE_PATH, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)


def download_files():
    """Download all required OpenVoice checkpoints/configs from the Hub.

    Logs and re-raises any download failure so startup fails loudly instead
    of continuing with missing model files.
    """
    # Repo-relative paths; hf_hub_download with local_dir recreates this
    # layout under CKPT_BASE_PATH itself, so no per-file makedirs is needed
    # (the original computed local paths and made their dirs, but that
    # result was never passed to hf_hub_download).
    remote_paths = [
        "converter/checkpoint.pth",
        "converter/config.json",
        "base_speakers/EN/checkpoint.pth",
        "base_speakers/EN/config.json",
        "base_speakers/EN/en_default_se.pth",
        "base_speakers/EN/en_style_se.pth",
    ]
    for remote_path in remote_paths:
        try:
            hf_hub_download(
                repo_id="myshell-ai/OpenVoice",
                filename=remote_path,
                local_dir=CKPT_BASE_PATH,
            )
        except Exception as e:
            print(f"Error downloading {remote_path}: {e}")
            raise


# Fetch checkpoints before any model construction.
download_files()

# Model initialization with error handling.
try:
    pt_device = "cpu"  # Explicitly use CPU for Hugging Face deployment

    en_base_speaker_tts = BaseSpeakerTTS(f"{EN_SUFFIX}/config.json", device=pt_device)
    en_base_speaker_tts.load_ckpt(f"{EN_SUFFIX}/checkpoint.pth")

    tone_color_converter = ToneColorConverter(
        f"{CONVERTER_SUFFIX}/config.json", device=pt_device
    )
    tone_color_converter.load_ckpt(f"{CONVERTER_SUFFIX}/checkpoint.pth")

    # map_location guards against embeddings that were saved on a GPU host;
    # without it torch.load would fail on this CPU-only deployment.
    en_source_default_se = torch.load(
        f"{EN_SUFFIX}/en_default_se.pth", map_location=pt_device
    )
    en_source_style_se = torch.load(
        f"{EN_SUFFIX}/en_style_se.pth", map_location=pt_device
    )
except Exception as model_init_error:
    print(f"Model initialization error: {model_init_error}")
    raise


def predict(prompt, style, audio_file_pth, tau):
    """Synthesize *prompt* in *style* and clone the reference speaker's voice.

    Args:
        prompt: Text to speak (2-200 characters).
        style: One of the base-speaker styles ("default", "whispering", ...).
        audio_file_pth: Path to the reference audio file.
        tau: Tone-conversion strength passed to the converter.

    Returns:
        (status_message, output_wav_path_or_None).
    """
    # Guard None as well: Gradio can submit an empty textbox, and len(None)
    # would raise TypeError instead of returning the validation message.
    if not prompt or len(prompt) < 2 or len(prompt) > 200:
        return "Text should be between 2 and 200 characters.", None

    # Extract the target speaker embedding from the reference audio.
    try:
        target_se, _ = se_extractor.get_se(
            audio_file_pth,
            tone_color_converter,
            target_dir=OUTPUT_DIR,
            vad=True,
        )
    except Exception as e:
        return f"Error extracting tone: {str(e)}", None

    # Synthesize with the base speaker, then convert tone color to the target.
    try:
        src_path = f"{OUTPUT_DIR}/tmp.wav"
        en_base_speaker_tts.tts(prompt, src_path, speaker=style, language="English")

        save_path = f"{OUTPUT_DIR}/output.wav"
        tone_color_converter.convert(
            audio_src_path=src_path,
            # The "default" style has its own source embedding; every other
            # style shares the "style" embedding.
            src_se=en_source_style_se if style != "default" else en_source_default_se,
            tgt_se=target_se,
            output_path=save_path,
            tau=tau,
        )
        return "Voice cloning completed successfully.", save_path
    except Exception as conversion_error:
        return f"Voice conversion error: {conversion_error}", None


def create_demo():
    """Build and return the Gradio Blocks UI for the cloning pipeline."""
    with gr.Blocks() as demo:
        gr.Markdown("# OpenVoice: Instant Voice Cloning")

        with gr.Row():
            input_text = gr.Textbox(
                label="Text to speak", placeholder="Enter text (2-200 chars)"
            )
            style = gr.Dropdown(
                label="Style",
                choices=[
                    "default",
                    "whispering",
                    "cheerful",
                    "terrified",
                    "angry",
                    "sad",
                    "friendly",
                ],
                value="default",
            )

        with gr.Row():
            reference_audio = gr.Audio(label="Reference Audio", type="filepath")
            tau_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                label="Voice Similarity",
                info="Higher values = more similar to reference",
            )

        submit_button = gr.Button("Generate Voice")
        output_text = gr.Textbox(label="Status")
        output_audio = gr.Audio(label="Generated Audio")

        submit_button.click(
            predict,
            inputs=[input_text, style, reference_audio, tau_slider],
            outputs=[output_text, output_audio],
        )
    return demo


# Hugging Face Space compatibility: Spaces pick up the module-level `demo`.
demo = create_demo()

if __name__ == "__main__":
    # Local execution: launch the server explicitly.
    demo.launch()