File size: 1,985 Bytes
97e4faf
 
 
 
 
 
 
bd104fa
97e4faf
 
 
 
 
b38366a
97e4faf
 
 
 
 
1d9f047
97e4faf
 
bd104fa
97e4faf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d9f047
97e4faf
 
 
 
 
deb14ad
e9c4729
bd104fa
 
 
97e4faf
 
 
 
 
 
1d9f047
 
 
 
 
 
 
97e4faf
 
 
 
 
 
1d9f047
97e4faf
 
 
 
bd104fa
944b0be
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import torch
import gradio as gr
import torchaudio
import time
from datetime import datetime
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices

# Voice names offered in the "Select voice:" dropdown; the chosen name is
# handed to `load_voice` unchanged.
# NOTE(review): "random" is presumably a special value understood by
# Tortoise rather than a voice folder — confirm against `load_voice`.
VOICE_OPTIONS = ["angie", "deniro", "freeman", "random"]

def inference(
    text,
    voice,
    preset_option,
):
    """Stream synthesized speech for ``text`` as Gradio audio chunks.

    Args:
        text: Text to synthesize. Empty or whitespace-only text produces no
            audio at all.
        voice: Voice name passed to ``load_voice``.
        preset_option: Preset name forwarded to ``tts.tts_with_preset``.

    Yields:
        ``(24000, samples)`` tuples, one per frame produced by
        ``tts.tts_with_preset``. ``samples`` is the frame moved to the CPU,
        detached and converted to a NumPy array; 24000 is the sample rate
        reported to Gradio.

    Note:
        Uses the module-level ``tts`` object, which this file only creates
        inside its ``__main__`` guard.
    """
    # Nothing to synthesize: finish before loading the voice or running
    # the model.
    if not text or not text.strip():
        return

    voice_samples, conditioning_latents = load_voice(voice)

    for audio_frame in tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset=preset_option,
        k=1,
    ):
        yield (24000, audio_frame.cpu().detach().numpy())

def main():
    """Build the Gradio interface for Tortoise TTS and launch it.

    The interface has a text box, a voice dropdown and a preset selector,
    wired to ``inference``, with a streaming, auto-playing audio output.
    """
    title = "Tortoise TTS "

    text = gr.Textbox(
        lines=4,
        label="Text:",
    )

    # The default must be one of the offered choices; the previous default
    # ("jane_eyre") was not in VOICE_OPTIONS.
    voice = gr.Dropdown(
        VOICE_OPTIONS, value=VOICE_OPTIONS[0], label="Select voice:", type="value"
    )

    preset_option = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        label="ultra_fast for quick inference and high_quality for better inference",
        type="value",
        value="ultra_fast",
    )

    output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)
    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            voice,
            preset_option,
        ],
        title=title,
        outputs=[output_audio],
    )
    # `inference` is a generator, so the queue is enabled before launching.
    interface.queue().launch()

if __name__ == "__main__":
    # Create the synthesizer first: `inference` looks `tts` up as a module
    # global when the UI calls it.
    tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

    # Append a timestamped banner to the run log before starting the UI.
    with open("Tortoise_TTS_Runs_Scripts.log", "a") as log_file:
        banner_head = "\n\n-------------------------Tortoise TTS Scripts Logs, "
        banner_tail = "-------------------------\n"
        log_file.write(f"{banner_head}{datetime.now()}{banner_tail}")

    main()