Spaces:

SamratBarai
/

Kokoro-82M_Interface

Sleeping

App Files Files Community

Kokoro-82M_Interface / app.py

SamratBarai

Update app.py

081ce5b verified about 2 months ago

raw

history blame contribute delete

2.98 kB

	import gradio as gr
	from kokoro import generate
	from models import build_model
	from scipy.io.wavfile import write
	from pydub import AudioSegment
	import torch
	import numpy as np
	import os
	import shortuuid

	# Build the TTS model a single time at import so every request reuses it.
	MODEL_PATH = 'kokoro-v0_19.pth'

	# Fail fast with an explicit message when the checkpoint file is missing.
	if not os.path.exists(MODEL_PATH):
	    raise FileNotFoundError(f"Error: Model file '{MODEL_PATH}' does not exist.")

	# The model is built for the 'cpu' device.
	MODEL = build_model(MODEL_PATH, 'cpu')
	print("\n-------------\nModel loaded.")

	# Voices offered in the UI. Each name is looked up as voices/<name>.pt, and
	# its first character is passed to generate() as the language code.
	VOICE_NAMES = [
	    'af',  # Default voice is a 50-50 mix of Bella & Sarah
	    'af_bella',
	    'af_sarah',
	    'am_adam',
	    'am_michael',
	    'bf_emma',
	    'bf_isabella',
	    'bm_george',
	    'bm_lewis',
	    'af_nicole',
	    'af_sky',
	]

	def _filename_stem(text, max_length=32):
	    """Return a filesystem-safe stem derived from the first word of *text*."""
	    words = text.split()
	    first_word = words[0] if words else ""
	    # Drop everything that could form a path separator or a ".." sequence,
	    # since the text comes straight from the web form.
	    safe = "".join(ch for ch in first_word if ch.isalnum() or ch in "-_")
	    return safe[:max_length] or "audio"


	def text_to_speech(text, voice_name, output_folder):
	    """Synthesize *text* with the chosen voice and save it as a WAV file.

	    Args:
	        text: The text to convert to speech.
	        voice_name: One of ``VOICE_NAMES``; its first character is passed to
	            ``generate`` as the language code.
	        output_folder: Directory the WAV file is written to. It is created
	            when it does not exist yet.

	    Returns:
	        A ``(wav_path, status_message)`` tuple. ``wav_path`` is ``None`` when
	        nothing was written, and ``status_message`` then explains why.
	    """
	    # Validate the cheap inputs first so bad requests never touch the model.
	    if not text or not text.strip():
	        return None, "No text provided."
	    if not output_folder:
	        return None, "No output folder provided."
	    if voice_name not in VOICE_NAMES:
	        return None, "Invalid voice name."

	    # Load the selected voicepack
	    voicepack_path = f'voices/{voice_name}.pt'
	    if not os.path.exists(voicepack_path):
	        return None, f"Voicepack '{voice_name}' not found."

	    # map_location keeps the load working on CPU-only hosts even when the
	    # voicepack was saved from another device.
	    voicepack = torch.load(voicepack_path, map_location='cpu', weights_only=True)
	    print(f'Loaded voice: {voice_name}')

	    # Generate audio
	    # NOTE(review): generate() is assumed to possibly return None when the
	    # text yields nothing to synthesize - confirm against kokoro.generate.
	    result = generate(MODEL, text, voicepack, lang=voice_name[0])
	    if result is None:
	        return None, "No audio was generated for the given text."
	    audio_data, _ = result

	    audio_data = np.asarray(audio_data)
	    if audio_data.size == 0:
	        return None, "No audio was generated for the given text."

	    # Peak-normalize into the int16 range. Silent audio has a zero peak, so
	    # the division is skipped there to avoid producing NaNs.
	    peak = np.max(np.abs(audio_data))
	    normalized_audio = audio_data / peak if peak > 0 else audio_data
	    scaled_audio = np.int16(normalized_audio * 32767)

	    # Save files
	    os.makedirs(output_folder, exist_ok=True)
	    file_name = f'{_filename_stem(text)}-{shortuuid.uuid()}.wav'
	    wav_path = os.path.join(output_folder, file_name)
	    # 24000 is the sample rate handed to the WAV writer.
	    write(wav_path, 24000, scaled_audio)

	    return wav_path, f"Audio saved at: {wav_path}"

	# Web UI: inputs and the Generate button in the first column, the generated
	# audio player and status message in the second.
	with gr.Blocks(theme='gradio/soft') as app:
	    gr.Markdown(
	        """
	<h1 align="center">Kokoro-82M TTS Engine</h1>
	<h4 align="left">A TTS engine with only 82M parameters. Enter the Text, voice and output folder and click generate to generate audio</h4>
	"""
	    )

	    with gr.Row():
	        with gr.Column():
	            text_input = gr.Textbox(label="Text to Convert")
	            voice_selector = gr.Dropdown(
	                label="Select Voice",
	                choices=VOICE_NAMES,
	            )
	            output_folder_input = gr.Textbox(
	                value="./outputs",
	                label="Output Folder",
	            )
	            submit_button = gr.Button("Generate")

	        with gr.Column():
	            audio_output = gr.Audio(type="filepath", label="Generated Audio")
	            status_output = gr.Textbox(interactive=False, label="Status")

	    def process_text_to_speech(text, voice_name, output_folder):
	        """Click handler: forward the form values to text_to_speech.

	        Returns the (wav_path, status_message) pair that feeds the audio
	        player and the status box.
	        """
	        return text_to_speech(text, voice_name, output_folder)

	    # Wire the button: three form inputs in, audio path and status text out.
	    submit_button.click(
	        process_text_to_speech,
	        inputs=[text_input, voice_selector, output_folder_input],
	        outputs=[audio_output, status_output],
	    )


	# Start the Gradio server only when executed as a script, not on import.
	if __name__ == "__main__":
	    app.launch()