import gradio as gr
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices
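
# Instantiate the Tortoise model once at module load so every request reuses the
# same loaded weights (inference() below calls tts.tts_with_preset).
tts = TextToSpeech()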

VOICE_OPTIONS = [
    "angie",
    "deniro",
    "freeman",
    "random",  # special option for random voice
]

def inference(
    text,
    voice,
    voice_b,
):
    # Treat the whole input as a single chunk; newline splitting is intentionally disabled.
    texts = [text]

    voices = [voice]
    if voice_b != "disabled":
        voices.append(voice_b)

    # A single voice is loaded directly; two voices are blended via load_voices.
    if len(voices) == 1:
        voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)

    for chunk in texts:
        # Yield audio frames as they are produced; Tortoise generates 24 kHz audio.
        for audio_frame in tts.tts_with_preset(
            chunk,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset="ultra_fast",
            k=1,
        ):
            yield (24000, audio_frame.cpu().detach().numpy())

def main():
    title = "Tortoise TTS"

    text = gr.Textbox(
        lines=4,
        label="Text:",
    )

    # The default value must be one of VOICE_OPTIONS.
    voice = gr.Dropdown(
        VOICE_OPTIONS, value="random", label="Select voice:", type="value"
    )
    # "disabled" is the sentinel checked in inference(), so it must be offered here.
    voice_b = gr.Dropdown(
        VOICE_OPTIONS + ["disabled"],
        value="disabled",
        label="(Optional) Select second voice:",
        type="value",
    )

    # streaming=True lets the generator returned by inference() play incrementally.
    output_audio = gr.Audio(label="Streaming audio:", streaming=True, autoplay=True)
    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            voice,
            voice_b,
        ],
        title=title,
        outputs=[output_audio],
    )
    # Queueing enables the streamed, generator-based output.
    interface.queue().launch()
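

# Launch the demo only when this file is run directly.
if __name__ == "__main__":
    main()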