# (Removed: Hugging Face Spaces page-scrape residue — space status text,
#  file size, commit hashes, and the line-number gutter. Not program code.)
# The interleaved print() calls trace startup progress: several of these
# imports (torch, ttts) are slow, so they show which step is in flight.
print("import gradio")
import gradio as gr
from scipy.io.wavfile import write
import tempfile
print("import ppn")
from pypinyin import lazy_pinyin, Style
print("import torch")
import torch
# Prefer GPU when available; every model/tensor below is moved to this device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("import ttts")
from ttts.utils.infer_utils import load_model
print("import mel")
from ttts.vocoder.feature_extractors import MelSpectrogramFeatures
print("import torchaudio")
import torchaudio
# Checkpoint paths for each TTTS component, keyed by the canonical file name
# the loader expects. Empty strings mark components this demo does not use.
MODELS = {
    'vqvae.pth': './TTTS/vae-30.pt',
    'gpt.pth': './TTTS/gpt-70.pt',
    'clvp2.pth': '',
    'diffusion.pth': './TTTS/diffusion-855.pt',
    'vocoder.pth': './ttts/pretrained_models/pytorch_model.bin',
    'rlg_auto.pth': '',
    'rlg_diffuser.pth': '',
}
print("import tokenizer")
from ttts.gpt.voice_tokenizer import VoiceBpeTokenizer
print("import f")
import torch.nn.functional as F
# Reference speaker audio used to condition both the GPT sampler and the
# diffusion model (voice cloning prompt).
cond_audio = 'ttts/3.wav'
print("load audio")
waveform, sample_rate = torchaudio.load(cond_audio)
# Downmix to mono by keeping the first channel only.
if waveform.shape[0] > 1:
    waveform = waveform[0].unsqueeze(0)
# The whole pipeline operates at 24 kHz.
waveform = torchaudio.transforms.Resample(sample_rate, 24000)(waveform)
cond_mel = MelSpectrogramFeatures()(waveform).to(device)
print(cond_mel.shape)
auto_conditioning = cond_mel
# Sampling hyper-parameters for generation. The individual module-level
# names below are the ones `speak()` actually reads.
settings = {
    'temperature': 0.8,
    'length_penalty': 1.0,
    'repetition_penalty': 2.0,
    'top_p': 0.8,
    'cond_free_k': 2.0,
    'diffusion_temperature': 1.0,
}
top_p = 0.8
temperature = 0.8
autoregressive_batch_size = 1
length_penalty = 1.0
repetition_penalty = 2.0
max_mel_tokens = 600
from vocos import Vocos
from ttts.diffusion.train import do_spectrogram_diffusion
from ttts.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
from ttts.diffusion.aa_model import denormalize_tacotron_mel, normalize_tacotron_mel
# print(device)
# Neural vocoder (mel spectrogram -> 24 kHz waveform), fetched from the HF hub.
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
# Autoregressive GPT that maps text tokens to mel-token codes.
gpt = load_model('gpt',MODELS['gpt.pth'], './ttts/gpt/config.json',device)
gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)
# BPE tokenizer; operates on pinyin strings, not raw hanzi (see speak()).
tokenizer = VoiceBpeTokenizer('ttts/gpt/gpt_tts_tokenizer.json')
# Lazily-initialized diffusion components. Loading the diffusion checkpoint
# and building the sampler is expensive, so do it once on the first request
# instead of on every call to speak() (the original reloaded it per call).
_diffusion_cache = {}

def _get_diffusion():
    """Load (once) and return the (diffusion_model, diffuser) pair."""
    if not _diffusion_cache:
        _diffusion_cache['model'] = load_model(
            'diffusion', MODELS['diffusion.pth'], './ttts/diffusion/config.yaml', device)
        _diffusion_cache['diffuser'] = SpacedDiffusion(
            use_timesteps=space_timesteps(1000, [50]), model_mean_type='epsilon',
            model_var_type='learned_range', loss_type='mse',
            betas=get_named_beta_schedule('linear', 1000),
            conditioning_free=True, conditioning_free_k=2., sampler='dpm++2m')
    return _diffusion_cache['model'], _diffusion_cache['diffuser']

def speak(text):
    """Synthesize Chinese speech for `text`.

    Returns a (sample_rate, waveform_ndarray) tuple, the format gradio's
    Audio output component accepts.
    """
    # TTTS's tokenizer works on tone-numbered pinyin, so transliterate first.
    pinyin = ' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True))
    text_tokens = torch.IntTensor(tokenizer.encode(pinyin)).unsqueeze(0).to(device)
    text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
    text_tokens = text_tokens.to(device)
    print(pinyin)
    print(text_tokens)
    # Stage 1: autoregressively sample mel-token codes conditioned on the
    # reference speaker mel and the text.
    codes = gpt.inference_speech(auto_conditioning, text_tokens,
                                 do_sample=True,
                                 top_p=top_p,
                                 temperature=temperature,
                                 num_return_sequences=autoregressive_batch_size,
                                 length_penalty=length_penalty,
                                 repetition_penalty=repetition_penalty,
                                 max_generate_length=max_mel_tokens)
    # Stage 2: run the GPT once more over the sampled codes to obtain the
    # continuous latent the diffusion model consumes.
    latent = gpt(auto_conditioning, text_tokens,
                 torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
                 torch.tensor([codes.shape[-1] * gpt.mel_length_compression], device=text_tokens.device),
                 return_latent=True, clip_inputs=False).transpose(1, 2)
    # Stage 3: diffuse the latent into a mel spectrogram, then vocode to audio.
    diffusion, diffuser = _get_diffusion()
    diffusion_conditioning = normalize_tacotron_mel(cond_mel)
    mel = do_spectrogram_diffusion(diffusion, diffuser, latent,
                                   diffusion_conditioning, temperature=1.0).detach().cpu()
    wav = vocos.decode(mel).detach().cpu()
    print(wav)
    return (24000, wav.numpy())
# Gradio UI: a single textbox wired to speak(), with an audio player for the
# result. (A stray trailing "|" scrape artifact on the launch line — a syntax
# error — has been removed.)
with gr.Blocks() as demo:
    gr.Markdown('# TTTS\n\nAn **unofficial** demo of [TTTS](https://github.com/adelacvg/ttts) based on XTTS. TTTS only supports Chinese.')
    txt = gr.Textbox(label="Text to say", interactive=True, value="大家好,今天来点大家想看的东西。")
    btn = gr.Button("Say")
    aud = gr.Audio(interactive=False)
    btn.click(speak, inputs=txt, outputs=aud)

# Queue requests (one GPU, long-running inference) and start the server.
demo.queue(max_size=20, api_open=False).launch()