"""Gradio app for OpenVoice instant voice cloning (CPU-only, Hugging Face Space)."""

import os

import gradio as gr
import langid  # noqa: F401 -- kept: present in original; may be used by openvoice internals
import torch
from huggingface_hub import hf_hub_download

import openvoice.se_extractor as se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

# Paths are configurable via CHECKPOINT_PATH so local runs and Spaces can differ.
CKPT_BASE_PATH = os.getenv("CHECKPOINT_PATH", "./checkpoints")
EN_SUFFIX = f"{CKPT_BASE_PATH}/base_speakers/EN"
CONVERTER_SUFFIX = f"{CKPT_BASE_PATH}/converter"
OUTPUT_DIR = "./outputs"

# Ensure directories exist
os.makedirs(CKPT_BASE_PATH, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)


def download_files():
    """Download all required OpenVoice checkpoints/configs from the Hub.

    Logs and re-raises any download failure so startup fails loudly instead
    of continuing with missing model files.
    """
    # Repo-relative paths; hf_hub_download with local_dir recreates this
    # layout under CKPT_BASE_PATH itself, so no per-file makedirs is needed
    # (the original computed local paths and made their dirs, but that
    # result was never passed to hf_hub_download).
    remote_paths = [
        "converter/checkpoint.pth",
        "converter/config.json",
        "base_speakers/EN/checkpoint.pth",
        "base_speakers/EN/config.json",
        "base_speakers/EN/en_default_se.pth",
        "base_speakers/EN/en_style_se.pth",
    ]
    for remote_path in remote_paths:
        try:
            hf_hub_download(
                repo_id="myshell-ai/OpenVoice",
                filename=remote_path,
                local_dir=CKPT_BASE_PATH,
            )
        except Exception as e:
            print(f"Error downloading {remote_path}: {e}")
            raise


# Fetch checkpoints before any model construction.
download_files()

# Model initialization with error handling.
try:
    pt_device = "cpu"  # Explicitly use CPU for Hugging Face deployment

    en_base_speaker_tts = BaseSpeakerTTS(f"{EN_SUFFIX}/config.json", device=pt_device)
    en_base_speaker_tts.load_ckpt(f"{EN_SUFFIX}/checkpoint.pth")

    tone_color_converter = ToneColorConverter(
        f"{CONVERTER_SUFFIX}/config.json", device=pt_device
    )
    tone_color_converter.load_ckpt(f"{CONVERTER_SUFFIX}/checkpoint.pth")

    # map_location guards against embeddings that were saved on a GPU host;
    # without it torch.load would fail on this CPU-only deployment.
    en_source_default_se = torch.load(
        f"{EN_SUFFIX}/en_default_se.pth", map_location=pt_device
    )
    en_source_style_se = torch.load(
        f"{EN_SUFFIX}/en_style_se.pth", map_location=pt_device
    )
except Exception as model_init_error:
    print(f"Model initialization error: {model_init_error}")
    raise


def predict(prompt, style, audio_file_pth, tau):
    """Synthesize *prompt* in *style* and clone the reference speaker's voice.

    Args:
        prompt: Text to speak (2-200 characters).
        style: One of the base-speaker styles ("default", "whispering", ...).
        audio_file_pth: Path to the reference audio file.
        tau: Tone-conversion strength passed to the converter.

    Returns:
        (status_message, output_wav_path_or_None).
    """
    # Guard None as well: Gradio can submit an empty textbox, and len(None)
    # would raise TypeError instead of returning the validation message.
    if not prompt or len(prompt) < 2 or len(prompt) > 200:
        return "Text should be between 2 and 200 characters.", None

    # Extract the target speaker embedding from the reference audio.
    try:
        target_se, _ = se_extractor.get_se(
            audio_file_pth,
            tone_color_converter,
            target_dir=OUTPUT_DIR,
            vad=True,
        )
    except Exception as e:
        return f"Error extracting tone: {str(e)}", None

    # Synthesize with the base speaker, then convert tone color to the target.
    try:
        src_path = f"{OUTPUT_DIR}/tmp.wav"
        en_base_speaker_tts.tts(prompt, src_path, speaker=style, language="English")

        save_path = f"{OUTPUT_DIR}/output.wav"
        tone_color_converter.convert(
            audio_src_path=src_path,
            # The "default" style has its own source embedding; every other
            # style shares the "style" embedding.
            src_se=en_source_style_se if style != "default" else en_source_default_se,
            tgt_se=target_se,
            output_path=save_path,
            tau=tau,
        )
        return "Voice cloning completed successfully.", save_path
    except Exception as conversion_error:
        return f"Voice conversion error: {conversion_error}", None


def create_demo():
    """Build and return the Gradio Blocks UI for the cloning pipeline."""
    with gr.Blocks() as demo:
        gr.Markdown("# OpenVoice: Instant Voice Cloning")

        with gr.Row():
            input_text = gr.Textbox(
                label="Text to speak", placeholder="Enter text (2-200 chars)"
            )
            style = gr.Dropdown(
                label="Style",
                choices=[
                    "default",
                    "whispering",
                    "cheerful",
                    "terrified",
                    "angry",
                    "sad",
                    "friendly",
                ],
                value="default",
            )

        with gr.Row():
            reference_audio = gr.Audio(label="Reference Audio", type="filepath")
            tau_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                label="Voice Similarity",
                info="Higher values = more similar to reference",
            )

        submit_button = gr.Button("Generate Voice")
        output_text = gr.Textbox(label="Status")
        output_audio = gr.Audio(label="Generated Audio")

        submit_button.click(
            predict,
            inputs=[input_text, style, reference_audio, tau_slider],
            outputs=[output_text, output_audio],
        )
    return demo


# Hugging Face Space compatibility: Spaces pick up the module-level `demo`.
demo = create_demo()

if __name__ == "__main__":
    # Local execution: launch the server explicitly.
    demo.launch()