import gradio as gr
import tempfile

import soundfile as sf
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio

# 1) Initialize the Tortoise TTS engine at startup
tts = TextToSpeech()  # downloads and caches the models automatically


# 2) Define a helper to generate speech from a reference clip + text
def generate_speech(reference_audio_path, text):
    """
    reference_audio_path: filepath to a WAV reference clip (resampled to 22,050 Hz)
    text: the string to synthesize
    returns: path to a 24 kHz WAV file spoken in the cloned voice
    """
    # Load and resample the reference clip to 22,050 Hz as a torch tensor
    # (load_audio handles the mono conversion); the sampling rate is passed
    # positionally to match the tortoise helper's signature
    ref_waveform = load_audio(reference_audio_path, 22050)

    # Synthesize one clip; the "fast" preset is a decent speed/quality tradeoff.
    # The result is a torch tensor of audio samples at 24 kHz.
    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )

    # Convert to NumPy and save to a temporary WAV (float32, 24 kHz)
    wav_np = output_tensor.squeeze().cpu().numpy()
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()  # close the handle so soundfile can write to the path on all platforms
    sf.write(tmp.name, wav_np, samplerate=24000)  # Tortoise outputs audio at 24 kHz
    return tmp.name


# 3) Build the Gradio interface
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    gr.Markdown("## Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 second WAV clip (22,050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )
    with gr.Row():
        voice_sample = gr.Audio(type="filepath", label="Upload Reference Voice (22,050 Hz WAV)")
        text_input = gr.Textbox(label="Text to Synthesize", placeholder="e.g., Hello, world!")
    generate_btn = gr.Button("Generate Speech")
    output_audio = gr.Audio(label="Cloned Speech Output (24 kHz)", interactive=False)

    generate_btn.click(
        fn=generate_speech,
        inputs=[voice_sample, text_input],
        outputs=output_audio,
    )

if __name__ == "__main__":
    app.launch()
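
# Optional: a minimal smoke test of generate_speech() without launching the UI.
# This is only a sketch; "my_voice_sample.wav" is a hypothetical placeholder for
# any short reference recording on disk. Run it instead of app.launch() if you
# just want to confirm the model pipeline works end to end.
#
#     out_path = generate_speech("my_voice_sample.wav", "Hello, this is a cloning test.")
#     print("Wrote cloned speech to", out_path)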