Update app.py
app.py CHANGED
@@ -14,22 +14,22 @@ import numpy as np
 torch.manual_seed(1234)
 MAX_WAV_VALUE = 32768.0
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# load trained tacotron2 + GST model:
+model = load_model(hparams)
+checkpoint_path = "trained_models/checkpoint_78000.model"
+model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
+# model.to('cuda')
+_ = model.eval()
+
+# load pre trained MelGAN model for mel2audio:
+vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
+checkpoint = torch.load(vocoder_checkpoint_path, map_location="cpu")
+hp_melgan = load_hparam("melgan/config/default.yaml")
+vocoder_model = Generator(80)
+vocoder_model.load_state_dict(checkpoint['model_g'])
+# vocoder_model = vocoder_model.to('cuda')
+vocoder_model.eval(inference=False)
+
 
 def synthesize(text):
     sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
@@ -43,13 +43,11 @@ def synthesize(text):
     # mel2wav inference:
     with torch.no_grad():
         audio = vocoder_model.inference(mel_outputs_postnet)
-
     audio_numpy = audio.data.cpu().detach().numpy()
 
     return (22050, audio_numpy)
 
 
-init_models(hparams)
 iface = gr.Interface(fn=synthesize, inputs="text", outputs=[gr.Audio(label="Generated Speech", type="numpy"),])
 iface.launch()
 
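For reference, the `(sample_rate, numpy_array)` tuple that `synthesize` returns is the format `gr.Audio(type="numpy")` consumes. The following minimal, self-contained sketch exercises that contract with `fake_synthesize`, a hypothetical stand-in for the Tacotron2 + MelGAN pipeline above, so the Gradio wiring can be checked without the trained checkpoints:

import numpy as np
import gradio as gr

def fake_synthesize(text):
    # Hypothetical stand-in for synthesize(): returns one second of a 220 Hz tone
    # as (sample_rate, float32 array), the same output shape as the real app.
    sr = 22050
    t = np.linspace(0.0, 1.0, sr, endpoint=False)
    audio = 0.2 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)
    return (sr, audio)

iface = gr.Interface(fn=fake_synthesize, inputs="text",
                     outputs=[gr.Audio(label="Generated Speech", type="numpy")])
iface.launch()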