File size: 4,560 Bytes
7c9c787
 
 
3c51bbc
7c9c787
3c51bbc
918c36e
 
edd5fdd
 
3c51bbc
 
edd5fdd
 
 
 
3c51bbc
f712093
edd5fdd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f712093
edd5fdd
 
f712093
edd5fdd
 
 
 
 
3c51bbc
edd5fdd
 
3c51bbc
edd5fdd
 
 
 
 
3c51bbc
 
 
 
 
 
edd5fdd
 
 
 
 
 
3c51bbc
edd5fdd
3c51bbc
edd5fdd
 
 
 
 
 
 
 
 
 
 
 
f712093
edd5fdd
 
 
f712093
3c51bbc
 
edd5fdd
7c9c787
3c51bbc
edd5fdd
3c51bbc
 
 
 
 
 
 
 
edd5fdd
 
 
 
 
 
 
f712093
3c51bbc
 
 
 
7c9c787
3c51bbc
 
 
 
 
7c9c787
3c51bbc
7c9c787
edd5fdd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import langid
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
import openvoice.se_extractor as se_extractor

# Checkpoint root: taken from the CHECKPOINT_PATH environment variable when it
# is set, otherwise a folder next to the working directory.
CKPT_BASE_PATH = os.environ.get('CHECKPOINT_PATH', './checkpoints')
# Sub-folders of the checkpoint root for the English base speaker and the
# tone-color converter.
EN_SUFFIX = CKPT_BASE_PATH + "/base_speakers/EN"
CONVERTER_SUFFIX = CKPT_BASE_PATH + "/converter"
# Folder that receives intermediate and final audio files.
OUTPUT_DIR = "./outputs"

# Create both folders up front; existing folders are left untouched.
for _required_dir in (CKPT_BASE_PATH, OUTPUT_DIR):
    os.makedirs(_required_dir, exist_ok=True)

def download_files():
    """Fetch every required checkpoint file from the OpenVoice hub repository.

    Files are requested one at a time, converter files first, then the
    English base-speaker files. Any failure is reported on stdout and then
    re-raised, so the caller sees the original exception.
    """
    # (local folder, folder inside the hub repo, file names to fetch)
    file_groups = (
        (CONVERTER_SUFFIX, "converter", ("checkpoint.pth", "config.json")),
        (
            EN_SUFFIX,
            "base_speakers/EN",
            ("checkpoint.pth", "config.json", "en_default_se.pth", "en_style_se.pth"),
        ),
    )

    for local_root, remote_root, file_names in file_groups:
        for file_name in file_names:
            remote_path = f"{remote_root}/{file_name}"
            target_dir = os.path.dirname(f"{local_root}/{file_name}")
            try:
                # Make sure the destination folder exists before downloading.
                os.makedirs(target_dir, exist_ok=True)
                hf_hub_download(
                    repo_id="myshell-ai/OpenVoice",
                    filename=remote_path,
                    local_dir=CKPT_BASE_PATH,
                )
            except Exception as e:
                print(f"Error downloading {remote_path}: {e}")
                raise

# Download files early so the checkpoints exist before any model is built.
download_files()

# Model initialization: build the English base speaker and the tone-color
# converter, load their checkpoints, and load the two source speaker
# embeddings. Any failure is printed and re-raised so startup aborts loudly.
try:
    pt_device = "cpu"  # Explicitly use CPU for Hugging Face deployment
    en_base_speaker_tts = BaseSpeakerTTS(f"{EN_SUFFIX}/config.json", device=pt_device)
    en_base_speaker_tts.load_ckpt(f"{EN_SUFFIX}/checkpoint.pth")

    tone_color_converter = ToneColorConverter(f"{CONVERTER_SUFFIX}/config.json", device=pt_device)
    tone_color_converter.load_ckpt(f"{CONVERTER_SUFFIX}/checkpoint.pth")

    # map_location pins the loaded embeddings to the same device as the
    # models; without it, tensors that were saved from a CUDA device cannot
    # be deserialized on a CPU-only host.
    en_source_default_se = torch.load(
        f"{EN_SUFFIX}/en_default_se.pth", map_location=pt_device
    )
    en_source_style_se = torch.load(
        f"{EN_SUFFIX}/en_style_se.pth", map_location=pt_device
    )
except Exception as model_init_error:
    print(f"Model initialization error: {model_init_error}")
    raise

def predict(prompt, style, audio_file_pth, tau):
    """Synthesize ``prompt`` in English and convert it to the reference voice.

    Args:
        prompt: Text to speak. Must be 2 to 200 characters long.
        style: Speaker style handed to the base TTS model. ``"default"``
            selects the default source embedding for conversion; any other
            value selects the style source embedding.
        audio_file_pth: Path of the reference audio whose tone color is
            extracted.
        tau: Forwarded unchanged as ``tau`` to the tone-color converter.

    Returns:
        A ``(status_message, output_path)`` tuple. ``output_path`` is the
        generated wav file on success and ``None`` on any failure; failures
        are reported through ``status_message`` rather than raised.
    """
    # A missing prompt is treated like a too-short one instead of crashing
    # on len(None).
    if prompt is None or len(prompt) < 2 or len(prompt) > 200:
        return "Text should be between 2 and 200 characters.", None

    # Without a reference clip there is nothing to extract; say so directly
    # instead of surfacing an internal error from the extractor.
    if not audio_file_pth:
        return "Error extracting tone: no reference audio provided.", None

    try:
        target_se, _ = se_extractor.get_se(
            audio_file_pth,
            tone_color_converter,
            target_dir=OUTPUT_DIR,
            vad=True
        )
    except Exception as e:
        return f"Error extracting tone: {str(e)}", None

    try:
        # Step 1: synthesize the text with the base speaker.
        src_path = f"{OUTPUT_DIR}/tmp.wav"
        en_base_speaker_tts.tts(prompt, src_path, speaker=style, language="English")

        # Step 2: convert the synthesized audio to the reference tone color.
        save_path = f"{OUTPUT_DIR}/output.wav"
        tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=en_source_style_se if style != "default" else en_source_default_se,
            tgt_se=target_se,
            output_path=save_path,
            tau=tau
        )

        return "Voice cloning completed successfully.", save_path
    except Exception as conversion_error:
        return f"Voice conversion error: {conversion_error}", None

def create_demo():
    """Assemble the Gradio Blocks interface and return it.

    The layout is a title, a row with the text box and style dropdown, a row
    with the reference-audio input and the similarity slider, a submit
    button, and the status/audio outputs. Clicking the button calls
    ``predict`` with the four inputs and fills the two outputs.
    """
    style_options = ["default", "whispering", "cheerful", "terrified", "angry", "sad", "friendly"]

    with gr.Blocks() as app:
        gr.Markdown("# OpenVoice: Instant Voice Cloning")

        # First row: what to say and in which style.
        with gr.Row():
            text_box = gr.Textbox(
                label="Text to speak",
                placeholder="Enter text (2-200 chars)",
            )
            style_picker = gr.Dropdown(label="Style", choices=style_options, value="default")

        # Second row: the reference clip and the tau slider.
        with gr.Row():
            reference_clip = gr.Audio(label="Reference Audio", type="filepath")
            similarity = gr.Slider(
                label="Voice Similarity",
                info="Higher values = more similar to reference",
                minimum=0.1,
                maximum=1.0,
                value=0.7,
            )

        run_button = gr.Button("Generate Voice")

        status_box = gr.Textbox(label="Status")
        result_audio = gr.Audio(label="Generated Audio")

        # Input order must match predict's parameter order.
        click_inputs = [text_box, style_picker, reference_clip, similarity]
        click_outputs = [status_box, result_audio]
        run_button.click(predict, inputs=click_inputs, outputs=click_outputs)

    return app

# Hugging Face Space compatibility: expose the UI as a module-level `demo`.
demo = create_demo()

# Start the server when this file is executed directly (e.g. `python app.py`);
# importing the module still only builds `demo` and does not launch anything.
if __name__ == "__main__":
    demo.launch()