Text_to_Speech / app.py
Pranjal12345's picture
Update app.py
bd104fa
raw
history blame
2.05 kB
import os
import torch
import gradio as gr
import torchaudio
import time
from datetime import datetime
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices
# Voice names offered in the UI dropdowns. "random" is a special option
# that asks for a random voice instead of naming a specific one.
VOICE_OPTIONS = ["angie", "deniro", "freeman", "random"]
def inference(
    text,
    voice,
    voice_b,
):
    """Stream synthesized speech for ``text`` as ``(sample_rate, samples)`` tuples.

    The whole ``text`` is sent to the model as a single prompt (it is not
    split by newline).

    Args:
        text: Text to synthesize. Blank / ``None`` input yields nothing.
        voice: Name of the primary voice, passed to ``load_voice`` (or to
            ``load_voices`` together with ``voice_b``).
        voice_b: Optional second voice name. ``"disabled"`` -- or an empty
            selection -- means only ``voice`` is used.

    Yields:
        ``(24000, samples)`` for every audio frame produced by
        ``tts.tts_with_preset``, where ``samples`` is the frame converted
        to a NumPy array.
    """
    # Nothing to synthesize: finish immediately instead of loading voices
    # and invoking the model on an empty prompt.
    if not text or not text.strip():
        return

    voices = [voice]
    # Treat a cleared dropdown (None / "") like the "disabled" sentinel so a
    # non-voice value is never forwarded to load_voices().
    if voice_b and voice_b != "disabled":
        voices.append(voice_b)

    if len(voices) == 1:
        voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)

    # Sample rate reported alongside each chunk handed to the audio output.
    sample_rate = 24000

    # `tts` is the module-level TextToSpeech instance created in the
    # `__main__` guard of this file.
    for audio_frame in tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset="ultra_fast",
        k=1,
    ):
        yield (sample_rate, audio_frame.cpu().detach().numpy())
def main():
    """Build the Gradio interface for streaming Tortoise TTS and launch it.

    The interface wires three inputs (text box, primary voice dropdown,
    optional second voice dropdown) to ``inference`` and plays the result
    through a streaming, auto-playing audio component.
    """
    title = "Tortoise TTS "

    default_voice = "jane_eyre"
    no_second_voice = "disabled"

    # A dropdown's initial value must be one of its choices, otherwise the
    # default cannot be re-selected once the user changes it. Neither default
    # is listed in VOICE_OPTIONS, so build per-dropdown choice lists that
    # contain them.
    # NOTE(review): confirm a "jane_eyre" voice is actually installed;
    # if not, switch default_voice to an entry of VOICE_OPTIONS.
    primary_choices = list(VOICE_OPTIONS)
    if default_voice not in primary_choices:
        primary_choices.insert(0, default_voice)

    secondary_choices = [no_second_voice] + [
        name for name in VOICE_OPTIONS if name != no_second_voice
    ]

    text = gr.Textbox(
        lines=4,
        label="Text:",
    )
    voice = gr.Dropdown(
        primary_choices,
        value=default_voice,
        label="Select voice:",
        type="value",
    )
    voice_b = gr.Dropdown(
        secondary_choices,
        value=no_second_voice,
        label="(Optional) Select second voice:",
        type="value",
    )
    output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)

    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            voice,
            voice_b,
        ],
        title=title,
        outputs=[output_audio],
    )
    # queue() is enabled before launch(); inference is a generator function.
    interface.queue().launch()
if __name__ == "__main__":
    # Module-level model handle: inference() resolves the global name `tts`
    # when it is called, so it must exist before the UI is launched.
    tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

    # Append a timestamped banner to the run log so each launch is delimited.
    log_path = "Tortoise_TTS_Runs_Scripts.log"
    with open(log_path, "a") as log_file:
        run_banner = f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
        log_file.write(run_banner)

    main()