import os
import time
from datetime import datetime

import gradio as gr
import torch
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices

VOICE_OPTIONS = [
    "angie",
    "deniro",
    "freeman",
    "random",  # special option for random voice
]

# Sentinel entry of the second-voice dropdown meaning "do not add a second voice".
# Kept out of VOICE_OPTIONS so it can never be picked as the primary voice.
DISABLED_VOICE = "disabled"

# Preferred initial primary voice; only honoured when listed in VOICE_OPTIONS.
DEFAULT_VOICE = "jane_eyre"

# Rate paired with every streamed chunk handed to the audio output.
# NOTE(review): presumably the sample rate of the frames produced by `tts` — confirm.
OUTPUT_SAMPLE_RATE = 24000


def inference(
    text,
    voice,
    voice_b,
):
    """Stream synthesized speech for ``text`` chunk by chunk.

    This is a generator: each item is a ``(OUTPUT_SAMPLE_RATE, samples)``
    tuple, where ``samples`` is the current frame moved to the CPU, detached
    and converted to a NumPy array.

    Args:
        text: The text to synthesize.
        voice: Name of the primary voice.
        voice_b: Name of an optional second voice. ``"disabled"``, ``None``
            and the empty string all mean "no second voice".

    Yields:
        ``(OUTPUT_SAMPLE_RATE, numpy_array)`` for every frame produced by
        ``tts.tts_with_preset``.

    Note:
        ``tts`` is a module-level object that is not defined in this part of
        the file; it must exist before the first request is served.
    """
    voices = [voice]
    # A cleared dropdown yields a falsy value; treat it like the explicit
    # "disabled" choice instead of passing it on as a voice name.
    if voice_b and voice_b != DISABLED_VOICE:
        voices.append(voice_b)

    if len(voices) == 1:
        voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)

    for audio_frame in tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset="ultra_fast",
        k=1,
    ):
        yield (OUTPUT_SAMPLE_RATE, audio_frame.cpu().detach().numpy())


def main():
    """Build the Gradio interface around ``inference`` and launch it queued."""
    title = "Tortoise TTS "

    # A dropdown default must be one of its own choices. Keep the preferred
    # default when it is listed, otherwise fall back to the first option.
    if DEFAULT_VOICE in VOICE_OPTIONS:
        primary_default = DEFAULT_VOICE
    else:
        primary_default = VOICE_OPTIONS[0] if VOICE_OPTIONS else None

    text = gr.Textbox(
        lines=4,
        label="Text:",
    )
    voice = gr.Dropdown(
        VOICE_OPTIONS,
        value=primary_default,
        label="Select voice:",
        type="value",
    )
    # "disabled" is offered only here, so the user can switch the second
    # voice back off after having selected one.
    voice_b = gr.Dropdown(
        [*VOICE_OPTIONS, DISABLED_VOICE],
        value=DISABLED_VOICE,
        label="(Optional) Select second voice:",
        type="value",
    )
    output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)

    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            voice,
            voice_b,
        ],
        title=title,
        outputs=[output_audio],
    )
    interface.queue().launch()