Spaces:
Sleeping
Sleeping
File size: 1,832 Bytes
bfcd0c0 080259f bfcd0c0 080259f cb82d78 080259f cb82d78 4d5330a 848f2f7 261114e f95a0dc 848f2f7 261114e 848f2f7 f95a0dc 848f2f7 cb82d78 f95a0dc cb82d78 c472fbf f95a0dc c472fbf cb82d78 848f2f7 cb82d78 c472fbf 848f2f7 c472fbf 848f2f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import gradio as gr
from hyper_parameters import tacotron_params as hparams
from training import load_model
from text import text_to_sequence
from melgan.model.generator import Generator
from melgan.utils.hparams import load_hparam
import torch
import numpy as np
# Seed PyTorch's random number generator with a fixed value at import time.
torch.manual_seed(1234)
# NOTE(review): not referenced in the visible part of this file; presumably the
# int16 full-scale amplitude (2**15) used when scaling waveforms -- confirm
# before relying on or removing it.
MAX_WAV_VALUE = 32768.0
def init_models(hparams):
    """Load the Tacotron2 + GST synthesizer and the MelGAN vocoder on CPU.

    The loaded networks are bound to the module-level names ``model`` and
    ``vocoder_model``, because ``synthesize`` looks them up as globals.
    Without the ``global`` declaration they would be locals of this function
    and the first synthesis request would fail with ``NameError``.

    Args:
        hparams: Hyper-parameter object forwarded unchanged to ``load_model``.

    Returns:
        tuple: ``(model, vocoder_model)`` after their weights have been
        restored and their ``eval`` methods called.
    """
    global model, vocoder_model

    # Load the trained Tacotron2 + GST model; weights are mapped to CPU.
    model = load_model(hparams)
    checkpoint_path = "trained_models/checkpoint_78000.model"
    tacotron_checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(tacotron_checkpoint['state_dict'])
    model.eval()

    # Load the pre-trained MelGAN generator used for mel -> audio conversion.
    vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
    checkpoint = torch.load(vocoder_checkpoint_path, map_location="cpu")
    vocoder_model = Generator(80)
    vocoder_model.load_state_dict(checkpoint['model_g'])
    # Generator.eval takes a project-specific ``inference`` flag; keep the
    # value the original code passed.
    vocoder_model.eval(inference=False)

    return model, vocoder_model
def synthesize(text):
    """Turn ``text`` into speech and return it as a ``(rate, samples)`` tuple.

    Reads the module-level ``model`` (text -> mel spectrogram) and
    ``vocoder_model`` (mel spectrogram -> waveform); both must be bound
    before this function is called.

    Args:
        text: The sentence to synthesize, passed to ``text_to_sequence`` with
            the ``english_cleaners`` cleaner.

    Returns:
        tuple: ``(22050, audio_numpy)`` where ``audio_numpy`` is the vocoder
        output converted to a NumPy array. This is the tuple form consumed by
        the ``gr.Audio(type="numpy")`` output component.
    """
    # Encode the text and add a leading batch axis of size 1.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

    # Fixed scores handed to the model's inference call as the GST input.
    gst_head_scores = np.array([0.5, 0.15, 0.35])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    # Run both networks without autograd tracking: nothing is trained here,
    # so recording a graph for the synthesizer would only cost memory and time.
    with torch.no_grad():
        _, mel_outputs_postnet, _, _ = model.inference(sequence, gst_scores)
        # mel2wav inference:
        audio = vocoder_model.inference(mel_outputs_postnet)

    audio_numpy = audio.detach().cpu().numpy()
    return (22050, audio_numpy)
# Load both networks once at start-up, before any request is served.
init_models(hparams)

# Single text box in, single audio player out.
speech_output = gr.Audio(label="Generated Speech", type="numpy")
iface = gr.Interface(
    fn=synthesize,
    inputs="text",
    outputs=[speech_output],
)
iface.launch()
|