Spaces:
Sleeping
Sleeping
import gradio as gr | |
from kokoro import generate | |
from models import build_model | |
from scipy.io.wavfile import write | |
from pydub import AudioSegment | |
import torch | |
import numpy as np | |
import os | |
import shortuuid | |
# Load the model a single time at import so every request reuses it.
MODEL_PATH = 'kokoro-v0_19.pth'

# Fail fast with a clear message when the checkpoint is missing.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Error: Model file '{MODEL_PATH}' does not exist.")

# Inference runs on CPU; the same device string is used for the voicepacks.
MODEL = build_model(MODEL_PATH, 'cpu')
print("\n-------------\nModel loaded.")
# Voices offered in the UI, in display order. Each name must have a matching
# ``voices/<name>.pt`` voicepack, and its first character is passed to
# ``generate`` as the ``lang`` argument.
VOICE_NAMES = [
    'af',  # Default voice is a 50-50 mix of Bella & Sarah
    'af_bella',
    'af_sarah',
    'am_adam',
    'am_michael',
    'bf_emma',
    'bf_isabella',
    'bm_george',
    'bm_lewis',
    'af_nicole',
    'af_sky',
]
def _filename_stem(text, max_len=32):
    """Return a filesystem-safe file-name stem built from the first word of *text*."""
    words = text.split()
    first_word = words[0] if words else ""
    # Keep only alphanumerics, '-' and '_' so user text can never inject path
    # separators or other characters that are unsafe in file names.
    stem = "".join(ch for ch in first_word if ch.isalnum() or ch in "-_")
    return stem[:max_len] or "audio"


def _to_int16_pcm(samples):
    """Peak-normalize *samples* and scale them to the int16 range."""
    peak = np.max(np.abs(samples))
    # Silent output has no peak to normalize against; dividing by zero would
    # turn every sample into NaN before the int16 cast.
    if peak > 0:
        samples = samples / peak
    return np.int16(samples * 32767)


def text_to_speech(text, voice_name, output_folder):
    """Synthesize *text* with the selected voice and save it as a WAV file.

    Args:
        text: Text to convert to speech.
        voice_name: One of ``VOICE_NAMES``; its first character is passed to
            ``generate`` as the ``lang`` argument.
        output_folder: Directory the WAV file is written to. It is created
            when it does not exist yet.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, error_message)``
        when the input is rejected or no audio was produced.
    """
    if not text or not text.strip():
        return None, "Please enter some text to convert."
    if voice_name not in VOICE_NAMES:
        return None, "Invalid voice name."
    if not output_folder or not output_folder.strip():
        return None, "Please provide an output folder."

    # Load the selected voicepack
    voicepack_path = os.path.join('voices', f'{voice_name}.pt')
    if not os.path.exists(voicepack_path):
        return None, f"Voicepack '{voice_name}' not found."
    voicepack = torch.load(voicepack_path, weights_only=True).to('cpu')
    print(f'Loaded voice: {voice_name}')

    # Generate audio; the second return value is not used here.
    audio_data, _ = generate(MODEL, text, voicepack, lang=voice_name[0])
    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        # np.max raises on an empty array, so stop before normalizing.
        return None, "No audio was generated."
    scaled_audio = _to_int16_pcm(audio_data)

    # Save the file.
    # NOTE(review): output_folder comes straight from the UI, so a remote user
    # can make the server write into any directory it has access to; consider
    # restricting it to a fixed base directory.
    os.makedirs(output_folder, exist_ok=True)
    wav_path = os.path.join(
        output_folder, f'{_filename_stem(text)}-{shortuuid.uuid()}.wav'
    )
    write(wav_path, 24000, scaled_audio)
    return wav_path, f"Audio saved at: {wav_path}"
# Gradio Blocks implementation: inputs in the left column, results on the right.
with gr.Blocks(theme='gradio/soft') as app:
    gr.Markdown(
        """
        <h1 align="center">Kokoro-82M TTS Engine</h1>
        <h4 align="left">A TTS engine with only 82M parameters. Enter the Text, voice and output folder and click generate to generate audio</h4>
        """
    )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Text to Convert")
            # Preselect the first voice so a click on "Generate" with an
            # untouched form does not send None and fail voice validation.
            voice_selector = gr.Dropdown(
                choices=VOICE_NAMES,
                value=VOICE_NAMES[0],
                label="Select Voice",
            )
            output_folder_input = gr.Textbox(label="Output Folder", value="./outputs")
            submit_button = gr.Button("Generate")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            status_output = gr.Textbox(label="Status", interactive=False)

    def process_text_to_speech(text, voice_name, output_folder):
        """Click handler: run text_to_speech and return (audio path, status)."""
        wav_path, status_message = text_to_speech(text, voice_name, output_folder)
        return wav_path, status_message

    # The two return values map onto the audio player and the status box.
    submit_button.click(
        fn=process_text_to_speech,
        inputs=[text_input, voice_selector, output_folder_input],
        outputs=[audio_output, status_output],
    )

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    app.launch()