Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from kokoro import generate | |
| from models import build_model | |
| from scipy.io.wavfile import write | |
| from pydub import AudioSegment | |
| import torch | |
| import numpy as np | |
| import os | |
| import shortuuid | |
# Load the model exactly once, at import time, so every request reuses it.
# Importing this module therefore fails fast with FileNotFoundError when the
# checkpoint file is missing from the working directory.
MODEL_PATH = 'kokoro-v0_19.pth'
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Error: Model file '{MODEL_PATH}' does not exist.")

# build_model receives the checkpoint path and the 'cpu' device string.
MODEL = build_model(MODEL_PATH, 'cpu')
print("\n-------------\nModel loaded.")
# Voices offered in the UI. Each name maps to a voicepack file
# 'voices/<name>.pt', and its first character is passed to generate() as the
# `lang` argument.
VOICE_NAMES = [
    'af',  # Default voice is a 50-50 mix of Bella & Sarah
    'af_bella',
    'af_sarah',
    'am_adam',
    'am_michael',
    'bf_emma',
    'bf_isabella',
    'bm_george',
    'bm_lewis',
    'af_nicole',
    'af_sky',
]
def _filename_stem(text, max_length=32):
    """Return a filesystem-safe file-name stem built from the first word of *text*."""
    words = text.split()
    first_word = words[0] if words else ""
    # Replace anything that is not alphanumeric, '-' or '_' so that user text
    # such as "../x" or "a/b" cannot break or escape the output path.
    stem = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in first_word)
    return stem[:max_length] or "audio"


def _to_int16_pcm(audio):
    """Peak-normalize *audio* and convert it to 16-bit integer samples."""
    peak = np.max(np.abs(audio))
    # All-zero audio has no peak to normalize against; dividing would yield NaNs.
    if peak > 0:
        audio = audio / peak
    return (audio * 32767).astype(np.int16)


def text_to_speech(text, voice_name, output_folder):
    """Synthesize *text* with the given voice and save it as a WAV file.

    Args:
        text: The text to convert to speech.
        voice_name: One of the entries in VOICE_NAMES; its voicepack is read
            from 'voices/<voice_name>.pt'.
        output_folder: Directory the WAV file is written to. It is created if
            missing; an empty value falls back to the current directory.

    Returns:
        A ``(wav_path, status_message)`` tuple. On any validation failure
        ``wav_path`` is ``None`` and the message describes the problem.
    """
    # Validate cheap inputs first so bad requests never reach the model.
    if not text or not text.strip():
        return None, "No text provided."
    if voice_name not in VOICE_NAMES:
        return None, "Invalid voice name."

    # Load the selected voicepack
    voicepack_path = f'voices/{voice_name}.pt'
    if not os.path.exists(voicepack_path):
        return None, f"Voicepack '{voice_name}' not found."
    # map_location keeps loading working on CPU-only hosts even when the
    # voicepack was saved from another device.
    voicepack = torch.load(voicepack_path, map_location='cpu', weights_only=True)
    print(f'Loaded voice: {voice_name}')

    # Generate audio; the second return value of generate() is not used here.
    audio_data, _ = generate(MODEL, text, voicepack, lang=voice_name[0])
    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        return None, "No audio was generated."
    scaled_audio = _to_int16_pcm(audio_data)

    # Save the file under a sanitized, unique name.
    output_folder = output_folder or "."
    os.makedirs(output_folder, exist_ok=True)
    wav_name = f'{_filename_stem(text)}-{shortuuid.uuid()}.wav'
    wav_path = os.path.join(output_folder, wav_name)
    write(wav_path, 24000, scaled_audio)
    return wav_path, f"Audio saved at: {wav_path}"
# Gradio Blocks implementation: inputs on the left, results on the right.
with gr.Blocks(theme='gradio/soft') as app:
    gr.Markdown(
        """
        <h1 align="center">Kokoro-82M TTS Engine</h1>
        <h4 align="left">A TTS engine with only 82M parameters. Enter the Text, voice and output folder and click generate to generate audio</h4>
        """
    )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Text to Convert")
            # Preselect the first (default) voice; without an initial value the
            # dropdown submits None and every request is rejected as invalid.
            voice_selector = gr.Dropdown(
                choices=VOICE_NAMES,
                value=VOICE_NAMES[0],
                label="Select Voice",
            )
            output_folder_input = gr.Textbox(label="Output Folder", value="./outputs")
            submit_button = gr.Button("Generate")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            status_output = gr.Textbox(label="Status", interactive=False)

    def process_text_to_speech(text, voice_name, output_folder):
        """Click handler: run text_to_speech and return (wav_path, status_message)."""
        wav_path, status_message = text_to_speech(text, voice_name, output_folder)
        return wav_path, status_message

    # The handler's two return values feed the audio player and the status box.
    submit_button.click(
        fn=process_text_to_speech,
        inputs=[text_input, voice_selector, output_folder_input],
        outputs=[audio_output, status_output],
    )

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    app.launch()