# OpenVoice instant voice-cloning demo (Gradio app for Hugging Face Spaces).
import os
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import langid
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
import openvoice.se_extractor as se_extractor
# Use environment variables or predefined paths
# Root directory for model checkpoints; overridable via the CHECKPOINT_PATH env var.
CKPT_BASE_PATH = os.getenv('CHECKPOINT_PATH', './checkpoints')
# English base-speaker TTS assets (config, checkpoint, speaker embeddings).
EN_SUFFIX = f"{CKPT_BASE_PATH}/base_speakers/EN"
# Tone-color converter assets (config + checkpoint).
CONVERTER_SUFFIX = f"{CKPT_BASE_PATH}/converter"
# All generated audio (intermediate TTS output and final converted audio) is written here.
OUTPUT_DIR = "./outputs"
# Ensure directories exist
os.makedirs(CKPT_BASE_PATH, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
def download_files():
    """Download all required OpenVoice checkpoints from the Hugging Face Hub.

    Fetches the tone-color converter and English base-speaker files into
    CKPT_BASE_PATH, preserving the repo's directory layout — which is exactly
    the layout the EN_SUFFIX / CONVERTER_SUFFIX constants expect.

    Raises:
        Exception: re-raised from hf_hub_download after logging, so a failed
            download aborts startup instead of failing obscurely later.
    """
    # Repo-relative filenames; hf_hub_download mirrors this layout under
    # local_dir and creates intermediate directories itself, so no separate
    # local-path bookkeeping or makedirs calls are needed.
    required_files = [
        "converter/checkpoint.pth",
        "converter/config.json",
        "base_speakers/EN/checkpoint.pth",
        "base_speakers/EN/config.json",
        "base_speakers/EN/en_default_se.pth",
        "base_speakers/EN/en_style_se.pth",
    ]
    for remote_path in required_files:
        try:
            hf_hub_download(
                repo_id="myshell-ai/OpenVoice",
                filename=remote_path,
                local_dir=CKPT_BASE_PATH
            )
        except Exception as e:
            print(f"Error downloading {remote_path}: {e}")
            raise
# Download files early so model initialization below can rely on them.
download_files()
# Model Initialization with Error Handling
try:
    pt_device = "cpu"  # Explicitly use CPU for Hugging Face deployment
    en_base_speaker_tts = BaseSpeakerTTS(f"{EN_SUFFIX}/config.json", device=pt_device)
    en_base_speaker_tts.load_ckpt(f"{EN_SUFFIX}/checkpoint.pth")
    tone_color_converter = ToneColorConverter(f"{CONVERTER_SUFFIX}/config.json", device=pt_device)
    tone_color_converter.load_ckpt(f"{CONVERTER_SUFFIX}/checkpoint.pth")
    # map_location keeps GPU-saved speaker embeddings loadable on this
    # CPU-only host; without it torch.load fails on CUDA-serialized tensors.
    en_source_default_se = torch.load(f"{EN_SUFFIX}/en_default_se.pth", map_location=pt_device)
    en_source_style_se = torch.load(f"{EN_SUFFIX}/en_style_se.pth", map_location=pt_device)
except Exception as model_init_error:
    print(f"Model initialization error: {model_init_error}")
    raise
def predict(prompt, style, audio_file_pth, tau):
    """Clone the voice from a reference clip and speak *prompt* with it.

    Args:
        prompt: Text to synthesize (must be 2-200 characters).
        style: Base-speaker style name; any value other than "default"
            selects the style speaker embedding as the conversion source.
        audio_file_pth: Filesystem path to the reference audio clip.
        tau: Tone-conversion strength forwarded to the converter.

    Returns:
        Tuple of (status_message, audio_path); audio_path is None on failure.
    """
    # Guard against a cleared Gradio textbox (None) as well as bad lengths —
    # len(None) would raise TypeError before validation could report anything.
    if prompt is None or len(prompt) < 2 or len(prompt) > 200:
        return "Text should be between 2 and 200 characters.", None
    try:
        # Extract the target speaker's tone-color embedding from the reference.
        target_se, _ = se_extractor.get_se(
            audio_file_pth,
            tone_color_converter,
            target_dir=OUTPUT_DIR,
            vad=True
        )
    except Exception as e:
        return f"Error extracting tone: {str(e)}", None
    try:
        # Step 1: synthesize the prompt with the English base speaker.
        src_path = f"{OUTPUT_DIR}/tmp.wav"
        en_base_speaker_tts.tts(prompt, src_path, speaker=style, language="English")
        # Step 2: re-color the synthesized audio with the target embedding.
        save_path = f"{OUTPUT_DIR}/output.wav"
        tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=en_source_style_se if style != "default" else en_source_default_se,
            tgt_se=target_se,
            output_path=save_path,
            tau=tau
        )
        return "Voice cloning completed successfully.", save_path
    except Exception as conversion_error:
        return f"Voice conversion error: {conversion_error}", None
def create_demo():
    """Build and return the Gradio Blocks UI for the voice-cloning demo."""
    style_choices = ["default", "whispering", "cheerful", "terrified", "angry", "sad", "friendly"]
    with gr.Blocks() as demo:
        gr.Markdown("# OpenVoice: Instant Voice Cloning")
        # Row 1: text to speak plus the base-speaker style selector.
        with gr.Row():
            text_input = gr.Textbox(label="Text to speak", placeholder="Enter text (2-200 chars)")
            style_dropdown = gr.Dropdown(
                label="Style",
                choices=style_choices,
                value="default"
            )
        # Row 2: reference clip and how strongly to match its tone.
        with gr.Row():
            ref_audio = gr.Audio(label="Reference Audio", type="filepath")
            similarity_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                label="Voice Similarity",
                info="Higher values = more similar to reference"
            )
        generate_btn = gr.Button("Generate Voice")
        status_box = gr.Textbox(label="Status")
        audio_output = gr.Audio(label="Generated Audio")
        # Wire the button to the prediction pipeline.
        generate_btn.click(
            predict,
            inputs=[text_input, style_dropdown, ref_audio, similarity_slider],
            outputs=[status_box, audio_output]
        )
    return demo
# Hugging Face Space compatibility: Spaces serves the module-level `demo`.
demo = create_demo()

if __name__ == "__main__":
    # Also allow running locally with `python app.py`.
    demo.launch()