import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
from kokoro import KPipeline

# Load Kokoro-82M.
# Note: Kokoro-82M (hexgrad/Kokoro-82M) is not a standard Transformers checkpoint, so
# AutoTokenizer/AutoModel classes cannot load it. The official `kokoro` package is used
# instead; KPipeline downloads the hexgrad/Kokoro-82M weights on first use.
# lang_code "a" selects American English; the pipeline runs on CPU by default.
pipeline = KPipeline(lang_code="a")

SAMPLE_RATE = 24000  # Kokoro-82M generates audio at 24 kHz

# A subset of the published American English voice packs
# (af_* = American female, am_* = American male).
voices = ["af_heart", "af_bella", "af_nicole", "af_sarah", "am_adam", "am_michael"]


def generate_speech(text, voice, speed, show_transcript):
    """Convert input text to speech with Kokoro-82M; return a WAV path and an optional transcript."""
    # The pipeline yields (graphemes, phonemes, audio) chunks, one per text segment;
    # collect them and concatenate into a single waveform.
    audio_chunks = [np.asarray(audio) for _, _, audio in pipeline(text, voice=voice, speed=speed)]
    if not audio_chunks:
        raise gr.Error("Please enter some text to synthesize.")
    waveform = np.concatenate(audio_chunks)

    # Write the waveform to a temporary WAV file that Gradio can serve.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        sf.write(tmp.name, waveform, SAMPLE_RATE)
        temp_file = tmp.name

    # Return the audio file and, if requested, the original text as a transcript.
    return temp_file, text if show_transcript else ""


# Gradio UI
interface = gr.Interface(
    fn=generate_speech,
    inputs=[
        gr.Textbox(label="Input Text", lines=5, placeholder="Type here..."),
        gr.Dropdown(choices=voices, label="Select Voice", value="af_heart"),
        gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speech Speed"),
        gr.Checkbox(label="Show Transcript", value=True),
    ],
    outputs=[
        gr.Audio(label="Generated Speech"),
        gr.Textbox(label="Transcript"),
    ],
    title="Educational Text-to-Speech",
    description="Enter text, choose a voice, and generate speech. Use the transcript option to follow along while listening.",
    allow_flagging="never",
)

# Launch the app
if __name__ == "__main__":
    interface.launch()
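
# --- Optional smoke test (a sketch; assumes `pip install kokoro soundfile gradio numpy`
# has been run, and that espeak-ng is installed if the input contains out-of-dictionary words) ---
# Calling generate_speech() directly in a Python shell confirms the model downloads and
# produces audio before the Gradio UI is involved:
#
#   >>> wav_path, transcript = generate_speech(
#   ...     "Photosynthesis converts light energy into chemical energy.",
#   ...     voice="af_heart", speed=1.0, show_transcript=True)
#   >>> wav_path   # path to a 24 kHz mono WAV file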