File size: 4,412 Bytes
ae81afb
49fb505
bb53d91
 
ae81afb
49fb505
ae81afb
49fb505
fd4bbc5
ae81afb
49fb505
ae81afb
49fb505
ae81afb
49fb505
 
 
 
 
 
 
 
 
 
ae81afb
49fb505
ae81afb
49fb505
e1be1d0
ae81afb
e1be1d0
 
 
 
 
 
49fb505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a09114e
 
 
49fb505
 
 
 
 
 
 
a09114e
49fb505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec4ac80
 
 
af59381
 
 
49fb505
 
66e46f9
fd4bbc5
 
 
 
 
70c7cd0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Startup trace: a marker is printed before each group of imports.
# NOTE(review): presumably this is to show in the logs which heavy import is
# slow or failing -- the prints have no other effect in this file.
print("import gradio")
import gradio as gr
from scipy.io.wavfile import write
import tempfile
print("import ppn")
from pypinyin import lazy_pinyin, Style
print("import torch")
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Every model and tensor below is placed on this device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("import ttts")
from ttts.utils.infer_utils import load_model
print("import mel")
from ttts.vocoder.feature_extractors import MelSpectrogramFeatures
print("import torchaudio")
import torchaudio
# Checkpoint table: logical model name -> path of the weights file.
# An empty string means no checkpoint is configured for that entry.
# In the code visible in this file only 'gpt.pth' and 'diffusion.pth' are read.
MODELS = {
    'vqvae.pth': './TTTS/vae-30.pt',
    'gpt.pth': './TTTS/gpt-70.pt',
    'clvp2.pth': '',
    'diffusion.pth': './TTTS/diffusion-855.pt',
    'vocoder.pth': './ttts/pretrained_models/pytorch_model.bin',
    'rlg_auto.pth': '',
    'rlg_diffuser.pth': '',
}
print("import tokenizer")
from ttts.gpt.voice_tokenizer import VoiceBpeTokenizer
print("import f")
import torch.nn.functional as F
# Reference clip: its mel spectrogram is the conditioning input that speak()
# passes to the GPT model and (after normalization) to the diffusion stage.
cond_audio = 'ttts/3.wav'
print("load audio")
audio, sr = torchaudio.load(cond_audio)

# A multi-channel clip is reduced to its first channel; the leading channel
# axis is kept so the tensor stays 2-D.
if audio.size(0) > 1:
    audio = audio[:1]

# Resample from the file's native rate to 24 kHz before extracting features.
audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=24000)(audio)

cond_mel = MelSpectrogramFeatures()(audio).to(device)
auto_conditioning = cond_mel  # same tensor, under the name the GPT calls use
print(cond_mel.shape)
# Sampling hyper-parameters read by speak() when it calls the GPT model.
top_p = 0.8
temperature = 0.8
autoregressive_batch_size = 1
length_penalty = 1.0
repetition_penalty = 2.0
max_mel_tokens = 600

# The same values bundled as a dict, plus two diffusion-related entries.
# The code visible in this file reads the scalars above, not this dict.
settings = {
    'temperature': temperature,
    'length_penalty': length_penalty,
    'repetition_penalty': repetition_penalty,
    'top_p': top_p,
    'cond_free_k': 2.0,
    'diffusion_temperature': 1.0,
}
from vocos import Vocos
from ttts.diffusion.train import do_spectrogram_diffusion
from ttts.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
from ttts.diffusion.aa_model import denormalize_tacotron_mel, normalize_tacotron_mel
# print(device)

# Vocoder used by speak() to decode the generated mel into a waveform.
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")

# Autoregressive GPT model: built from its JSON config and the checkpoint
# listed in MODELS, then placed on `device`.
gpt = load_model(
    'gpt',
    MODELS['gpt.pth'],
    './ttts/gpt/config.json',
    device,
)
# Plain inference setup: no DeepSpeed, no KV cache, no half precision.
gpt.post_init_gpt2_config(
    use_deepspeed=False,
    kv_cache=False,
    half=False,
)

# Tokenizer that turns the pinyin string into token ids for the GPT model.
tokenizer = VoiceBpeTokenizer('ttts/gpt/gpt_tts_tokenizer.json')
# Lazily-populated cache so the diffusion checkpoint is loaded once per
# process instead of once per request.
_DIFFUSION_CACHE = {}


def _get_diffusion_model():
    """Return the diffusion model, loading it on first use and reusing it after."""
    if 'model' not in _DIFFUSION_CACHE:
        _DIFFUSION_CACHE['model'] = load_model(
            'diffusion', MODELS['diffusion.pth'], './ttts/diffusion/config.yaml', device)
    return _DIFFUSION_CACHE['model']


def speak(text):
    """Synthesize speech for *text* and return it in Gradio's audio format.

    Pipeline: text -> pinyin (tone-numbered) -> token ids -> GPT mel codes
    -> GPT latents -> diffusion mel -> Vocos waveform.  The module-level
    reference clip (``cond_mel`` / ``auto_conditioning``) conditions both the
    GPT and the diffusion stages.

    Args:
        text: The text to speak.  It is converted with ``lazy_pinyin``, so it
            is expected to be Chinese.

    Returns:
        A ``(24000, samples)`` tuple: the sample rate in Hz and the waveform
        as a NumPy array.
    """
    pinyin = ' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True))
    text_tokens = torch.IntTensor(tokenizer.encode(pinyin)).unsqueeze(0).to(device)
    text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
    print(pinyin)
    print(text_tokens)

    # Pure inference: disabling autograd avoids building a graph (and holding
    # its activations) for the latent pass and everything downstream.
    with torch.no_grad():
        codes = gpt.inference_speech(auto_conditioning, text_tokens,
                                     do_sample=True,
                                     top_p=top_p,
                                     temperature=temperature,
                                     num_return_sequences=autoregressive_batch_size,
                                     length_penalty=length_penalty,
                                     repetition_penalty=repetition_penalty,
                                     max_generate_length=max_mel_tokens)
        # Second GPT pass over the sampled codes to obtain the latents that
        # drive the diffusion decoder.
        latent = gpt(auto_conditioning, text_tokens,
                     torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
                     torch.tensor([codes.shape[-1]*gpt.mel_length_compression], device=text_tokens.device),
                     return_latent=True, clip_inputs=False).transpose(1, 2)

        diffusion = _get_diffusion_model()
        diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [50]), model_mean_type='epsilon',
                                   model_var_type='learned_range', loss_type='mse',
                                   betas=get_named_beta_schedule('linear', 1000),
                                   conditioning_free=True, conditioning_free_k=2., sampler='dpm++2m')
        diffusion_conditioning = normalize_tacotron_mel(cond_mel)
        mel = do_spectrogram_diffusion(diffusion, diffuser, latent, diffusion_conditioning,
                                       temperature=1.0).detach().cpu()
        # The mel is moved to the CPU first: `vocos` is never moved to `device`.
        wav = vocos.decode(mel).detach().cpu()
    print(wav)
    # NOTE(review): the array is returned exactly as vocos.decode produced it
    # (not squeezed) -- confirm its shape is what gr.Audio expects.
    return (24000, wav.numpy())

# Single-page UI: a text box, a button, and an audio player fed by speak().
with gr.Blocks() as demo:
    gr.Markdown(
        '# TTTS\n\nAn **unofficial** demo of [TTTS](https://github.com/adelacvg/ttts) based on XTTS. TTTS only supports Chinese.'
    )
    txt = gr.Textbox(
        label="Text to say",
        interactive=True,
        value="大家好,今天来点大家想看的东西。",
    )
    btn = gr.Button("Say")
    aud = gr.Audio(interactive=False)
    # Clicking the button synthesizes the text box content into the player.
    btn.click(speak, inputs=txt, outputs=aud)

# Queue requests (at most 20 waiting, API access closed), then start the server.
demo.queue(max_size=20, api_open=False)
demo.launch()