mrfakename committed on
Commit
a09114e
·
verified ·
1 Parent(s): dcecee1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -7
app.py CHANGED
@@ -51,18 +51,17 @@ from ttts.diffusion.aa_model import denormalize_tacotron_mel, normalize_tacotron
51
  # print(device)
52
 
53
  vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
54
-
55
-
 
56
  def speak(text):
57
  pinyin = ' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True))
58
- tokenizer = VoiceBpeTokenizer('ttts/gpt/gpt_tts_tokenizer.json')
59
  text_tokens = torch.IntTensor(tokenizer.encode(pinyin)).unsqueeze(0).to(device)
60
  text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary.
61
  text_tokens = text_tokens.to(device)
62
  print(pinyin)
63
  print(text_tokens)
64
- gpt = load_model('gpt',MODELS['gpt.pth'], './ttts/gpt/config.json',device)
65
- gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)
66
  codes = gpt.inference_speech(auto_conditioning, text_tokens,
67
  do_sample=True,
68
  top_p=top_p,
@@ -81,8 +80,7 @@ def speak(text):
81
  conditioning_free=True, conditioning_free_k=2., sampler='dpm++2m')
82
  diffusion_conditioning = normalize_tacotron_mel(cond_mel)
83
  mel = do_spectrogram_diffusion(diffusion, diffuser, latent, diffusion_conditioning, temperature=1.0).detach().cpu()
84
- wav = vocos.decode(mel).detach().cpu()
85
- print(wav)
86
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
87
  write(f.name, data=wav, rate=24000)
88
  return f.name
 
51
  # print(device)
52
 
53
  vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
54
+ gpt = load_model('gpt',MODELS['gpt.pth'], './ttts/gpt/config.json',device)
55
+ gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)
56
+ tokenizer = VoiceBpeTokenizer('ttts/gpt/gpt_tts_tokenizer.json')
57
  def speak(text):
58
  pinyin = ' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True))
 
59
  text_tokens = torch.IntTensor(tokenizer.encode(pinyin)).unsqueeze(0).to(device)
60
  text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary.
61
  text_tokens = text_tokens.to(device)
62
  print(pinyin)
63
  print(text_tokens)
64
+
 
65
  codes = gpt.inference_speech(auto_conditioning, text_tokens,
66
  do_sample=True,
67
  top_p=top_p,
 
80
  conditioning_free=True, conditioning_free_k=2., sampler='dpm++2m')
81
  diffusion_conditioning = normalize_tacotron_mel(cond_mel)
82
  mel = do_spectrogram_diffusion(diffusion, diffuser, latent, diffusion_conditioning, temperature=1.0).detach().cpu()
83
+ wav = vocos.decode(mel).detach().cpu().numpy()
 
84
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
85
  write(f.name, data=wav, rate=24000)
86
  return f.name