|
import os |
|
import torch |
|
import gradio as gr |
|
from huggingface_hub import hf_hub_download |
|
import langid |
|
from openvoice.api import BaseSpeakerTTS, ToneColorConverter |
|
import openvoice.se_extractor as se_extractor |
|
|
|
|
|
# Root directory for model checkpoints; override via the CHECKPOINT_PATH env var.
CKPT_BASE_PATH = os.getenv('CHECKPOINT_PATH', './checkpoints')

# English base-speaker assets (TTS checkpoint/config + source speaker embeddings).
EN_SUFFIX = f"{CKPT_BASE_PATH}/base_speakers/EN"

# Tone-color converter checkpoint/config location.
CONVERTER_SUFFIX = f"{CKPT_BASE_PATH}/converter"

# Generated and intermediate audio files are written here.
OUTPUT_DIR = "./outputs"


# Create both directories up front so later downloads/writes never fail on a
# missing parent directory.
os.makedirs(CKPT_BASE_PATH, exist_ok=True)

os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
def download_files():
    """Download converter and EN base-speaker checkpoints from the Hub.

    Each entry pairs the expected on-disk path with its repo-relative path;
    ``hf_hub_download`` with ``local_dir=CKPT_BASE_PATH`` preserves the
    repo-relative layout, so both paths refer to the same file. Files that
    already exist locally are skipped, so repeated startups do not hit the
    network. Any download failure is logged and re-raised so startup aborts
    loudly rather than continuing with missing checkpoints.
    """
    files_to_download = [
        (f"{CONVERTER_SUFFIX}/checkpoint.pth", "converter/checkpoint.pth"),
        (f"{CONVERTER_SUFFIX}/config.json", "converter/config.json"),
        (f"{EN_SUFFIX}/checkpoint.pth", "base_speakers/EN/checkpoint.pth"),
        (f"{EN_SUFFIX}/config.json", "base_speakers/EN/config.json"),
        (f"{EN_SUFFIX}/en_default_se.pth", "base_speakers/EN/en_default_se.pth"),
        (f"{EN_SUFFIX}/en_style_se.pth", "base_speakers/EN/en_style_se.pth"),
    ]

    for local_path, remote_path in files_to_download:
        # Skip files already present (e.g. from a previous run) to avoid
        # redundant Hub round-trips on every startup.
        if os.path.exists(local_path):
            continue
        try:
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            hf_hub_download(
                repo_id="myshell-ai/OpenVoice",
                filename=remote_path,
                local_dir=CKPT_BASE_PATH,
            )
        except Exception as e:
            print(f"Error downloading {remote_path}: {e}")
            raise
|
|
|
|
|
# Fetch all checkpoints before constructing any models.
download_files()


try:
    # CPU keeps the demo runnable on machines without CUDA.
    pt_device = "cpu"

    # English base TTS: synthesizes speech in the source speaker's voice.
    en_base_speaker_tts = BaseSpeakerTTS(f"{EN_SUFFIX}/config.json", device=pt_device)
    en_base_speaker_tts.load_ckpt(f"{EN_SUFFIX}/checkpoint.pth")

    # Tone-color converter: transfers a reference speaker's timbre onto the
    # synthesized audio.
    tone_color_converter = ToneColorConverter(f"{CONVERTER_SUFFIX}/config.json", device=pt_device)
    tone_color_converter.load_ckpt(f"{CONVERTER_SUFFIX}/checkpoint.pth")

    # map_location keeps CPU-only hosts working even if these embedding
    # tensors were serialized from a GPU device.
    en_source_default_se = torch.load(f"{EN_SUFFIX}/en_default_se.pth", map_location=pt_device)
    en_source_style_se = torch.load(f"{EN_SUFFIX}/en_style_se.pth", map_location=pt_device)
except Exception as model_init_error:
    # Abort startup loudly: a half-initialized app would only fail later
    # with a more confusing error inside predict().
    print(f"Model initialization error: {model_init_error}")
    raise
|
|
|
def predict(prompt, style, audio_file_pth, tau):
    """Clone the reference speaker's voice onto synthesized English speech.

    Args:
        prompt: Text to speak; must be 2-200 characters.
        style: Base-speaker style ("default" or one of the emotion styles).
        audio_file_pth: Path to the reference audio whose tone is cloned.
        tau: Tone-conversion strength forwarded to the converter.

    Returns:
        Tuple of (status message, path to the output wav or None on failure).
    """
    # Validate UI input before any model work. Gradio delivers None for an
    # empty textbox, which would crash the original len() check.
    if not prompt or not 2 <= len(prompt) <= 200:
        return "Text should be between 2 and 200 characters.", None

    # Fail fast with a clear message instead of a cryptic extractor error
    # when no reference clip was uploaded.
    if not audio_file_pth:
        return "Please provide a reference audio file.", None

    try:
        # Extract the target speaker embedding (tone color) from the
        # reference clip; VAD trims leading/trailing non-speech first.
        target_se, _ = se_extractor.get_se(
            audio_file_pth,
            tone_color_converter,
            target_dir=OUTPUT_DIR,
            vad=True
        )
    except Exception as e:
        return f"Error extracting tone: {str(e)}", None

    try:
        # Step 1: synthesize the prompt with the English base speaker.
        src_path = f"{OUTPUT_DIR}/tmp.wav"
        en_base_speaker_tts.tts(prompt, src_path, speaker=style, language="English")

        # Step 2: convert the base speaker's tone color to the target's.
        # The "default" style has its own source embedding; all other
        # styles share the style embedding.
        save_path = f"{OUTPUT_DIR}/output.wav"
        tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=en_source_style_se if style != "default" else en_source_default_se,
            tgt_se=target_se,
            output_path=save_path,
            tau=tau
        )

        return "Voice cloning completed successfully.", save_path
    except Exception as conversion_error:
        return f"Voice conversion error: {conversion_error}", None
|
|
|
def create_demo():
    """Assemble and return the Gradio Blocks UI for the voice-cloning demo."""
    with gr.Blocks() as ui:
        gr.Markdown("# OpenVoice: Instant Voice Cloning")

        # Row 1: what to say and in which base style.
        with gr.Row():
            prompt_box = gr.Textbox(
                label="Text to speak",
                placeholder="Enter text (2-200 chars)",
            )
            style_dropdown = gr.Dropdown(
                label="Style",
                choices=[
                    "default",
                    "whispering",
                    "cheerful",
                    "terrified",
                    "angry",
                    "sad",
                    "friendly",
                ],
                value="default",
            )

        # Row 2: whose voice to clone and how strongly.
        with gr.Row():
            ref_audio = gr.Audio(label="Reference Audio", type="filepath")
            similarity_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                label="Voice Similarity",
                info="Higher values = more similar to reference",
            )

        generate_btn = gr.Button("Generate Voice")

        # Outputs: a status line plus the generated clip.
        status_box = gr.Textbox(label="Status")
        result_audio = gr.Audio(label="Generated Audio")

        generate_btn.click(
            predict,
            inputs=[prompt_box, style_dropdown, ref_audio, similarity_slider],
            outputs=[status_box, result_audio],
        )

    return ui
|
|
|
|
|
demo = create_demo() |