Spaces:

AlexK-PL
/

Tacotron2_GST_eng

Sleeping

App Files Files Community

AlexK-PL committed on Sep 25, 2023

Commit

378de71

1 Parent(s): 30b323b

Delete melgan

Browse files

Files changed (26) hide show

melgan/.gitignore +0 -119
melgan/LICENSE +0 -29
melgan/README.md +0 -83
melgan/assets/gd.png +0 -0
melgan/assets/lj-tensorboard-v0.3-alpha.png +0 -0
melgan/assets/lj-tensorboard.png +0 -0
melgan/config/default.yaml +0 -34
melgan/datasets/dataloader.py +0 -67
melgan/hubconf.py +0 -41
melgan/inference.py +0 -49
melgan/model/discriminator.py +0 -64
melgan/model/generator.py +0 -99
melgan/model/identity.py +0 -12
melgan/model/multiscale.py +0 -29
melgan/model/res_stack.py +0 -36
melgan/preprocess.py +0 -50
melgan/requirements.txt +0 -9
melgan/trainer.py +0 -52
melgan/utils/audio_processing.py +0 -93
melgan/utils/hparams.py +0 -67
melgan/utils/plotting.py +0 -29
melgan/utils/stft.py +0 -184
melgan/utils/train.py +0 -131
melgan/utils/utils.py +0 -26
melgan/utils/validation.py +0 -41
melgan/utils/writer.py +0 -33

melgan/.gitignore DELETED Viewed

@@ -1,119 +0,0 @@
-# IDE configuration
-.idea/
-# configuration
-config/*
-!config/default.yaml
-temp-restore.yaml
-# logs, checkpoints
-chkpt/
-logs/
-# just a temporary folder
-temp/
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-# C extensions
-*.so
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-.hypothesis/
-.pytest_cache/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# pyenv
-.python-version
-# celery beat schedule file
-celerybeat-schedule
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/

melgan/LICENSE DELETED Viewed

@@ -1,29 +0,0 @@
-BSD 3-Clause License
-Copyright (c) 2019, Seungwon Park 박승원
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-1. Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-3. Neither the name of the copyright holder nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

melgan/README.md DELETED Viewed

@@ -1,83 +0,0 @@
-# MelGAN
-Unofficial PyTorch implementation of [MelGAN vocoder](https://arxiv.org/abs/1910.06711)
-## Key Features
-- MelGAN is lighter, faster, and better at generalizing to unseen speakers than [WaveGlow](https://github.com/NVIDIA/waveglow).
-- This repository uses the same mel-spectrogram function as [NVIDIA/tacotron2](https://github.com/NVIDIA/tacotron2), so it can be used directly to convert the output of NVIDIA's tacotron2 into raw audio.
-- Pretrained model on LJSpeech-1.1 via [PyTorch Hub](https://pytorch.org/hub).
-![](./assets/gd.png)
-## Prerequisites
-Tested on Python 3.6
-```bash
-pip install -r requirements.txt
-```
-## Prepare Dataset
-- Download a dataset for training. Any set of wav files with a 22050Hz sample rate will do (e.g. LJSpeech was used in the paper).
-- preprocess: `python preprocess.py -c config/default.yaml -d [data's root path]`
-- Edit configuration `yaml` file
-## Train & Tensorboard
-- `python trainer.py -c [config yaml file] -n [name of the run]`
-  - `cp config/default.yaml config/config.yaml` and then edit `config.yaml`
-  - Write the root paths of the train/validation files on the 2nd/3rd lines.
-  - Each path should contain pairs of `*.wav` with corresponding (preprocessed) `*.mel` file.
-  - The data loader parses list of files within the path recursively.
-- `tensorboard --logdir logs/`
-## Pretrained model
-Try with Google Colab: TODO
-```python
-import torch
-vocoder = torch.hub.load('seungwonpark/melgan', 'melgan')
-vocoder.eval()
-mel = torch.randn(1, 80, 234) # use your own mel-spectrogram here
-if torch.cuda.is_available():
-    vocoder = vocoder.cuda()
-    mel = mel.cuda()
-with torch.no_grad():
-    audio = vocoder.inference(mel)
-```
-## Inference
-- `python inference.py -p [checkpoint path] -i [input mel path]`
-## Results
-See audio samples at: http://swpark.me/melgan/.
-Model was trained on a V100 GPU for 14 days using LJSpeech-1.1.
-![](./assets/lj-tensorboard-v0.3-alpha.png)
-## Implementation Authors
-- [Seungwon Park](http://swpark.me) @ MINDsLab Inc. ([email protected], [email protected])
-- Myunchul Joe @ MINDsLab Inc.
-- [Rishikesh](https://github.com/rishikksh20) @ DeepSync Technologies Pvt Ltd.
-## License
-BSD 3-Clause License.
-- [utils/stft.py](./utils/stft.py) by Prem Seetharaman (BSD 3-Clause License)
-- [datasets/mel2samp.py](./datasets/mel2samp.py) from https://github.com/NVIDIA/waveglow (BSD 3-Clause License)
-- [utils/hparams.py](./utils/hparams.py) from https://github.com/HarryVolek/PyTorch_Speaker_Verification (No License specified)
-## Useful resources
-- [How to Train a GAN? Tips and tricks to make GANs work](https://github.com/soumith/ganhacks) by Soumith Chintala
-- [Official MelGAN implementation by original authors](https://github.com/descriptinc/melgan-neurips)
-- [Reproduction of MelGAN - NeurIPS 2019 Reproducibility Challenge (Ablation Track)](https://openreview.net/pdf?id=9jTbNbBNw0) by Yifei Zhao, Yichao Yang, and Yang Gao
-  - "replacing the average pooling layer with max pooling layer and replacing reflection padding with replication padding improves the performance significantly, while combining them produces worse results"

melgan/assets/gd.png DELETED Viewed

Binary file (114 kB)

melgan/assets/lj-tensorboard-v0.3-alpha.png DELETED Viewed

Binary file (61.1 kB)

melgan/assets/lj-tensorboard.png DELETED Viewed

Binary file (45 kB)

melgan/config/default.yaml DELETED Viewed

@@ -1,34 +0,0 @@
-data: # root path of train/validation data (either a relative or an absolute path is ok)
-  train: ''
-  validation: ''
----
-train:
-  rep_discriminator: 1
-  num_workers: 32
-  batch_size: 16
-  optimizer: 'adam'
-  adam:
-    lr: 0.0001
-    beta1: 0.5
-    beta2: 0.9
----
-audio:
-  n_mel_channels: 80
-  segment_length: 16000
-  pad_short: 2000
-  filter_length: 1024
-  hop_length: 256 # WARNING: this can't be changed.
-  win_length: 1024
-  sampling_rate: 22050
-  mel_fmin: 0.0
-  mel_fmax: 8000.0
----
-model:
-  feat_match: 10.0
----
-log:
-  summary_interval: 1
-  validation_interval: 5
-  save_interval: 25
-  chkpt_dir: 'chkpt'
-  log_dir: 'logs'

melgan/datasets/dataloader.py DELETED Viewed

@@ -1,67 +0,0 @@
-import os
-import glob
-import torch
-import random
-import numpy as np
-from torch.utils.data import Dataset, DataLoader
-from utils.utils import read_wav_np
-def create_dataloader(hp, args, train):
-    """Build the DataLoader for the training or the validation split.
-    Training batches use ``hp.train.batch_size``, are shuffled and drop the
-    last incomplete batch; validation yields one item at a time, in order.
-    """
-    shared = dict(
-        dataset=MelFromDisk(hp, args, train),
-        num_workers=hp.train.num_workers,
-        pin_memory=True,
-    )
-    if not train:
-        return DataLoader(batch_size=1, shuffle=False, drop_last=False, **shared)
-    return DataLoader(batch_size=hp.train.batch_size, shuffle=True, drop_last=True,
-                      **shared)
-class MelFromDisk(Dataset):
-    """Dataset of (mel, audio) pairs read from wav files and their ``.mel`` siblings.
-    Wav files are discovered recursively under ``hp.data.train`` or
-    ``hp.data.validation``; the mel for each one is loaded with ``torch.load``
-    from the same path with the extension swapped for ``.mel``.
-    """
-    def __init__(self, hp, args, train):
-        self.hp = hp
-        self.args = args
-        self.train = train
-        self.path = hp.data.train if train else hp.data.validation
-        self.wav_list = glob.glob(os.path.join(self.path, '**', '*.wav'), recursive=True)
-        # number of mel frames cropped for one training example
-        self.mel_segment_length = hp.audio.segment_length // hp.audio.hop_length + 2
-        # identity permutation until shuffle_mapping() is called
-        self.mapping = list(range(len(self.wav_list)))
-    def __len__(self):
-        return len(self.wav_list)
-    def __getitem__(self, idx):
-        """Return one (mel, audio) pair, or two pairs when in training mode."""
-        if self.train:
-            # the second pair is picked through the (shufflable) index mapping
-            idx1 = idx
-            idx2 = self.mapping[idx1]
-            return self.my_getitem(idx1), self.my_getitem(idx2)
-        else:
-            return self.my_getitem(idx)
-    def shuffle_mapping(self):
-        """Shuffle in place the mapping used to pick the second training pair."""
-        random.shuffle(self.mapping)
-    def my_getitem(self, idx):
-        """Load item ``idx``; in training mode crop a random, aligned mel/audio segment."""
-        wavpath = self.wav_list[idx]
-        # swap only the extension: str.replace would also rewrite a '.wav'
-        # that appears in a directory name, and would miss '.WAV'
-        melpath = os.path.splitext(wavpath)[0] + '.mel'
-        sr, audio = read_wav_np(wavpath)
-        min_length = self.hp.audio.segment_length + self.hp.audio.pad_short
-        if len(audio) < min_length:
-            # zero-pad short clips at the end
-            audio = np.pad(audio, (0, min_length - len(audio)),
-                           mode='constant', constant_values=0.0)
-        audio = torch.from_numpy(audio).unsqueeze(0)
-        mel = torch.load(melpath).squeeze(0)
-        if self.train:
-            max_mel_start = mel.size(1) - self.mel_segment_length
-            mel_start = random.randint(0, max_mel_start)
-            mel_end = mel_start + self.mel_segment_length
-            mel = mel[:, mel_start:mel_end]
-            # keep the audio crop aligned with the chosen mel frames
-            audio_start = mel_start * self.hp.audio.hop_length
-            audio = audio[:, audio_start:audio_start+self.hp.audio.segment_length]
-        # add noise with a standard deviation of 1/32768
-        audio = audio + (1/32768) * torch.randn_like(audio)
-        return mel, audio

melgan/hubconf.py DELETED Viewed

@@ -1,41 +0,0 @@
-dependencies = ['torch']
-import torch
-from model.generator import Generator
-# Registry of released checkpoints, keyed by the name accepted by melgan():
-# number of mel channels the generator expects and the URL of its weights.
-model_params = {
-    'nvidia_tacotron2_LJ11_epoch6400': {
-        'mel_channel': 80,
-        'model_url': 'https://github.com/seungwonpark/melgan/releases/download/v0.3-alpha/nvidia_tacotron2_LJ11_epoch6400.pt',
-    },
-}
-def melgan(model_name='nvidia_tacotron2_LJ11_epoch6400', pretrained=True, progress=True):
-    """torch.hub entry point: build a Generator, optionally loading released weights.
-    ``model_name`` must be a key of ``model_params``; with ``pretrained`` the
-    generator weights are downloaded from the registered URL.
-    """
-    cfg = model_params[model_name]
-    generator = Generator(cfg['mel_channel'])
-    if pretrained:
-        checkpoint = torch.hub.load_state_dict_from_url(cfg['model_url'], progress=progress)
-        generator.load_state_dict(checkpoint['model_g'])
-    # inference=True also strips weight norm from the generator
-    generator.eval(inference=True)
-    return generator
-if __name__ == '__main__':
-    # Smoke test: fetch the published vocoder through torch.hub and run it on
-    # a random mel-spectrogram, on GPU when one is available.
-    vocoder = torch.hub.load('seungwonpark/melgan', 'melgan')
-    mel = torch.randn(1, 80, 234) # use your own mel-spectrogram here
-    print('Input mel-spectrogram shape: {}'.format(mel.shape))
-    if torch.cuda.is_available():
-        print('Moving data & model to GPU')
-        vocoder = vocoder.cuda()
-        mel = mel.cuda()
-    with torch.no_grad():
-        audio = vocoder.inference(mel)
-    print('Output audio shape: {}'.format(audio.shape))

melgan/inference.py DELETED Viewed

@@ -1,49 +0,0 @@
-import os
-import glob
-import tqdm
-import torch
-import argparse
-from scipy.io.wavfile import write
-from model.generator import Generator
-from utils.hparams import HParam, load_hparam_str
-MAX_WAV_VALUE = 32768.0
-def main(args):
-    """Invert every ``*.mel`` file in ``args.input_folder`` into a wav file.
-    Hyper-parameters come from ``args.config`` when given, otherwise from the
-    ``hp_str`` stored in the checkpoint. Each output is written next to its
-    input as ``<name>_reconstructed_epoch<NNNN>.wav``. Runs on the GPU when
-    CUDA is available and falls back to the CPU otherwise.
-    """
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host
-    checkpoint = torch.load(args.checkpoint_path, map_location=device)
-    if args.config is not None:
-        hp = HParam(args.config)
-    else:
-        hp = load_hparam_str(checkpoint['hp_str'])
-    model = Generator(hp.audio.n_mel_channels).to(device)
-    model.load_state_dict(checkpoint['model_g'])
-    model.eval(inference=False)
-    with torch.no_grad():
-        for melpath in tqdm.tqdm(glob.glob(os.path.join(args.input_folder, '*.mel'))):
-            mel = torch.load(melpath, map_location=device)
-            if len(mel.shape) == 2:
-                # add the batch dimension the generator expects
-                mel = mel.unsqueeze(0)
-            mel = mel.to(device)
-            audio = model.inference(mel)
-            audio = audio.cpu().detach().numpy()
-            # swap only the extension; str.replace would also rewrite a '.mel'
-            # that appears earlier in the path
-            out_path = os.path.splitext(melpath)[0] + \
-                '_reconstructed_epoch%04d.wav' % checkpoint['epoch']
-            write(out_path, hp.audio.sampling_rate, audio)
-if __name__ == '__main__':
-    # Command-line entry point: the checkpoint and the input folder are
-    # required, the config is optional (the checkpoint's own hp_str is the fallback).
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--config', type=str, default=None,
-                        help="yaml file for config. will use hp_str from checkpoint if not given.")
-    parser.add_argument('-p', '--checkpoint_path', type=str, required=True,
-                        help="path of checkpoint pt file for evaluation")
-    parser.add_argument('-i', '--input_folder', type=str, required=True,
-                        help="directory of mel-spectrograms to invert into raw audio. ")
-    args = parser.parse_args()
-    main(args)

melgan/model/discriminator.py DELETED Viewed

@@ -1,64 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-class Discriminator(nn.Module):
-    """Single-scale waveform discriminator built from weight-normalized 1-D convolutions.
-    The input has one channel. Four grouped convolutions with stride 4
-    downsample it while widening to 1024 channels; the final convolution
-    produces a one-channel score map.
-    """
-    def __init__(self):
-        super(Discriminator, self).__init__()
-        # 7 stages: the first 6 yield the intermediate features, the last the score
-        self.discriminator = nn.ModuleList([
-            nn.Sequential(
-                nn.ReflectionPad1d(7),
-                nn.utils.weight_norm(nn.Conv1d(1, 16, kernel_size=15, stride=1)),
-                nn.LeakyReLU(0.2, inplace=True),
-            ),
-            nn.Sequential(
-                nn.utils.weight_norm(nn.Conv1d(16, 64, kernel_size=41, stride=4, padding=20, groups=4)),
-                nn.LeakyReLU(0.2, inplace=True),
-            ),
-            nn.Sequential(
-                nn.utils.weight_norm(nn.Conv1d(64, 256, kernel_size=41, stride=4, padding=20, groups=16)),
-                nn.LeakyReLU(0.2, inplace=True),
-            ),
-            nn.Sequential(
-                nn.utils.weight_norm(nn.Conv1d(256, 1024, kernel_size=41, stride=4, padding=20, groups=64)),
-                nn.LeakyReLU(0.2, inplace=True),
-            ),
-            nn.Sequential(
-                nn.utils.weight_norm(nn.Conv1d(1024, 1024, kernel_size=41, stride=4, padding=20, groups=256)),
-                nn.LeakyReLU(0.2, inplace=True),
-            ),
-            nn.Sequential(
-                nn.utils.weight_norm(nn.Conv1d(1024, 1024, kernel_size=5, stride=1, padding=2)),
-                nn.LeakyReLU(0.2, inplace=True),
-            ),
-            nn.utils.weight_norm(nn.Conv1d(1024, 1, kernel_size=3, stride=1, padding=1)),
-        ])
-    def forward(self, x):
-        '''
-            returns: (list of 6 features, discriminator score)
-            we directly predict score without last sigmoid function
-            since we're using Least Squares GAN (https://arxiv.org/abs/1611.04076)
-        '''
-        features = list()
-        # run the stages in sequence, keeping every intermediate output
-        for module in self.discriminator:
-            x = module(x)
-            features.append(x)
-        return features[:-1], features[-1]
-if __name__ == '__main__':
-    # Smoke test: print the shape of every feature map and of the score for a
-    # random batch, then the number of trainable parameters.
-    model = Discriminator()
-    x = torch.randn(3, 1, 22050)
-    print(x.shape)
-    features, score = model(x)
-    for feat in features:
-        print(feat.shape)
-    print(score.shape)
-    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    print(pytorch_total_params)

melgan/model/generator.py DELETED Viewed

@@ -1,99 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from .res_stack import ResStack
-# from res_stack import ResStack
-MAX_WAV_VALUE = 32768.0
-class Generator(nn.Module):
-    """MelGAN generator: maps a mel-spectrogram to a raw waveform.
-    The four transposed convolutions use strides 8, 8, 2 and 2, so every input
-    frame yields 256 output samples. The output has one channel and passes
-    through a tanh.
-    """
-    def __init__(self, mel_channel):
-        super(Generator, self).__init__()
-        self.mel_channel = mel_channel
-        self.generator = nn.Sequential(
-            nn.ReflectionPad1d(3),
-            nn.utils.weight_norm(nn.Conv1d(mel_channel, 512, kernel_size=7, stride=1)),
-            nn.LeakyReLU(0.2),
-            nn.utils.weight_norm(nn.ConvTranspose1d(512, 256, kernel_size=16, stride=8, padding=4)),
-            ResStack(256),
-            nn.LeakyReLU(0.2),
-            nn.utils.weight_norm(nn.ConvTranspose1d(256, 128, kernel_size=16, stride=8, padding=4)),
-            ResStack(128),
-            nn.LeakyReLU(0.2),
-            nn.utils.weight_norm(nn.ConvTranspose1d(128, 64, kernel_size=4, stride=2, padding=1)),
-            ResStack(64),
-            nn.LeakyReLU(0.2),
-            nn.utils.weight_norm(nn.ConvTranspose1d(64, 32, kernel_size=4, stride=2, padding=1)),
-            ResStack(32),
-            nn.LeakyReLU(0.2),
-            nn.ReflectionPad1d(3),
-            nn.utils.weight_norm(nn.Conv1d(32, 1, kernel_size=7, stride=1)),
-            nn.Tanh(),
-        )
-    def forward(self, mel):
-        """Generate audio of shape (batch, 1, 256 * frames) from ``mel``."""
-        mel = (mel + 5.0) / 5.0 # roughly normalize spectrogram
-        return self.generator(mel)
-    def eval(self, inference=False):
-        """Switch to evaluation mode; with ``inference=True`` also strip weight norm.
-        Returns ``self``, like ``nn.Module.eval``, so that calls can be chained.
-        """
-        super(Generator, self).eval()
-        # don't remove weight norm while validation in training loop
-        if inference:
-            self.remove_weight_norm()
-        return self
-    def remove_weight_norm(self):
-        """Remove weight normalization from every layer that has parameters."""
-        for layer in self.generator:
-            if len(layer.state_dict()) != 0:
-                try:
-                    nn.utils.remove_weight_norm(layer)
-                except ValueError:
-                    # the layer itself is not weight-normed (a ResStack):
-                    # let it strip the norm from its own convolutions
-                    layer.remove_weight_norm()
-    def inference(self, mel):
-        """Convert ``mel`` of shape (batch, mel_channel, frames) into int16 audio.
-        Returns a 1-D tensor for a batch of one, otherwise one row per item.
-        """
-        hop_length = 256
-        # pad input mel with zeros to cut artifact
-        # see https://github.com/seungwonpark/melgan/issues/8
-        # (-11.5129 is approximately log(1e-5); the padding matches the
-        # batch size, dtype and device of the input)
-        zero = mel.new_full((mel.size(0), self.mel_channel, 10), -11.5129)
-        mel = torch.cat((mel, zero), dim=2)
-        audio = self.forward(mel)
-        audio = audio.squeeze() # collapse all dimension except time axis
-        # drop the samples generated from the 10 padding frames; slicing the
-        # last axis keeps this correct for batches larger than one
-        audio = audio[..., :-(hop_length*10)]
-        audio = MAX_WAV_VALUE * audio
-        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1)
-        audio = audio.short()
-        return audio
-'''
-    to run this, fix
-    from .res_stack import ResStack
-    into
-    from res_stack import ResStack
-'''
-if __name__ == '__main__':
-    # Smoke test: 10 mel frames must be upsampled to 10 * 256 = 2560 samples.
-    model = Generator(80)
-    x = torch.randn(3, 80, 10)
-    print(x.shape)
-    y = model(x)
-    print(y.shape)
-    assert y.shape == torch.Size([3, 1, 2560])
-    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    print(pytorch_total_params)

melgan/model/identity.py DELETED Viewed

@@ -1,12 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-class Identity(nn.Module):
-    """No-op module: the forward pass hands its input back unchanged."""
-    def __init__(self):
-        super().__init__()
-    def forward(self, x):
-        return x

melgan/model/multiscale.py DELETED Viewed

@@ -1,29 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from .discriminator import Discriminator
-from .identity import Identity
-class MultiScaleDiscriminator(nn.Module):
-    """Three Discriminators applied to the input at successively pooled scales."""
-    def __init__(self):
-        super(MultiScaleDiscriminator, self).__init__()
-        self.discriminators = nn.ModuleList(
-            [Discriminator() for _ in range(3)]
-        )
-        # the first scale sees the raw input; each later one adds a stride-2 average pooling
-        self.pooling = nn.ModuleList(
-            [Identity()] +
-            [nn.AvgPool1d(kernel_size=4, stride=2, padding=1, count_include_pad=False) for _ in range(1, 3)]
-        )
-    def forward(self, x):
-        """Return one (features, score) tuple per scale."""
-        ret = list()
-        for pool, disc in zip(self.pooling, self.discriminators):
-            # pooling is cumulative: x is carried over from the previous scale
-            x = pool(x)
-            ret.append(disc(x))
-        return ret # [(feat, score), (feat, score), (feat, score)]

melgan/model/res_stack.py DELETED Viewed

@@ -1,36 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-class ResStack(nn.Module):
-    """Stack of three residual blocks with dilations 1, 3 and 9.
-    Every block and every shortcut maps ``channel`` channels to ``channel``
-    channels; the shortcut is a weight-normalized 1x1 convolution.
-    """
-    def __init__(self, channel):
-        super(ResStack, self).__init__()
-        self.blocks = nn.ModuleList([
-            nn.Sequential(
-                nn.LeakyReLU(0.2),
-                # pad by the dilation (3**i) on each side of the kernel-3 convolution
-                nn.ReflectionPad1d(3**i),
-                nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=3, dilation=3**i)),
-                nn.LeakyReLU(0.2),
-                nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)),
-            )
-            for i in range(3)
-        ])
-        self.shortcuts = nn.ModuleList([
-            nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1))
-            for i in range(3)
-        ])
-    def forward(self, x):
-        """Apply the three blocks in order, adding each block's output to its shortcut."""
-        for block, shortcut in zip(self.blocks, self.shortcuts):
-            x = shortcut(x) + block(x)
-        return x
-    def remove_weight_norm(self):
-        """Strip weight normalization from every convolution of the stack."""
-        for block, shortcut in zip(self.blocks, self.shortcuts):
-            # indices 2 and 4 are the two convolutions inside each block
-            nn.utils.remove_weight_norm(block[2])
-            nn.utils.remove_weight_norm(block[4])
-            nn.utils.remove_weight_norm(shortcut)

melgan/preprocess.py DELETED Viewed

@@ -1,50 +0,0 @@
-import os
-import glob
-import tqdm
-import torch
-import argparse
-import numpy as np
-from utils.stft import TacotronSTFT
-from utils.hparams import HParam
-from utils.utils import read_wav_np
-def main(hp, args):
-    """Compute a mel-spectrogram for every wav file under ``args.data_path``.
-    Files are searched recursively. Each mel is saved with ``torch.save`` next
-    to its wav, with the extension swapped for ``.mel``. Clips shorter than
-    ``segment_length + pad_short`` samples are zero-padded at the end first.
-    Raises AssertionError when a file's sample rate differs from
-    ``hp.audio.sampling_rate``.
-    """
-    stft = TacotronSTFT(filter_length=hp.audio.filter_length,
-                        hop_length=hp.audio.hop_length,
-                        win_length=hp.audio.win_length,
-                        n_mel_channels=hp.audio.n_mel_channels,
-                        sampling_rate=hp.audio.sampling_rate,
-                        mel_fmin=hp.audio.mel_fmin,
-                        mel_fmax=hp.audio.mel_fmax)
-    min_length = hp.audio.segment_length + hp.audio.pad_short
-    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'), recursive=True)
-    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
-        sr, wav = read_wav_np(wavpath)
-        assert sr == hp.audio.sampling_rate, \
-            "sample rate mismatch. expected %d, got %d at %s" % \
-            (hp.audio.sampling_rate, sr, wavpath)
-        if len(wav) < min_length:
-            wav = np.pad(wav, (0, min_length - len(wav)),
-                         mode='constant', constant_values=0.0)
-        wav = torch.from_numpy(wav).unsqueeze(0)
-        mel = stft.mel_spectrogram(wav)
-        # swap only the extension: str.replace leaves the path unchanged when
-        # the extension is spelled '.WAV' (the mel would then overwrite the
-        # wav) and rewrites a '.wav' that appears in a directory name
-        melpath = os.path.splitext(wavpath)[0] + '.mel'
-        torch.save(mel, melpath)
-if __name__ == '__main__':
-    # Command-line entry point: both the config file and the data root are required.
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--config', type=str, required=True,
-                        help="yaml file for config.")
-    parser.add_argument('-d', '--data_path', type=str, required=True,
-                        help="root directory of wav files")
-    args = parser.parse_args()
-    hp = HParam(args.config)
-    main(hp, args)

melgan/requirements.txt DELETED Viewed

@@ -1,9 +0,0 @@
-librosa
-matplotlib
-numpy
-scipy
-tensorboardX
-torch
-tqdm
-pillow
-pyyaml

melgan/trainer.py DELETED Viewed

@@ -1,52 +0,0 @@
-import os
-import time
-import logging
-import argparse
-from utils.train import train
-from utils.hparams import HParam
-from utils.writer import MyWriter
-from datasets.dataloader import create_dataloader
-if __name__ == '__main__':
-    # Training entry point: parse options, prepare the output folders and the
-    # logging, validate the config, then hand over to train().
-    arg_parser = argparse.ArgumentParser()
-    arg_parser.add_argument('-c', '--config', type=str, required=True,
-                            help="yaml file for configuration")
-    arg_parser.add_argument('-p', '--checkpoint_path', type=str, default=None,
-                            help="path of checkpoint pt file to resume training")
-    arg_parser.add_argument('-n', '--name', type=str, required=True,
-                            help="name of the model for logging, saving checkpoint")
-    args = arg_parser.parse_args()
-    hp = HParam(args.config)
-    # keep the raw yaml text too; it is passed on to train() together with hp
-    with open(args.config, 'r') as config_file:
-        hp_str = config_file.read()
-    pt_dir = os.path.join(hp.log.chkpt_dir, args.name)
-    log_dir = os.path.join(hp.log.log_dir, args.name)
-    for folder in (pt_dir, log_dir):
-        os.makedirs(folder, exist_ok=True)
-    # one log file per run, named after the model and the start time
-    log_file = os.path.join(log_dir, '%s-%d.log' % (args.name, time.time()))
-    logging.basicConfig(
-        level=logging.INFO,
-        format='%(asctime)s - %(levelname)s - %(message)s',
-        handlers=[logging.FileHandler(log_file), logging.StreamHandler()]
-    )
-    logger = logging.getLogger()
-    writer = MyWriter(hp, log_dir)
-    assert hp.audio.hop_length == 256, \
-        'hp.audio.hop_length must be equal to 256, got %d' % hp.audio.hop_length
-    assert hp.data.train != '' and hp.data.validation != '', \
-        'hp.data.train and hp.data.validation can\'t be empty: please fix %s' % args.config
-    trainloader = create_dataloader(hp, args, True)
-    valloader = create_dataloader(hp, args, False)
-    train(args, pt_dir, args.checkpoint_path, trainloader, valloader, writer, logger, hp, hp_str)

melgan/utils/audio_processing.py DELETED Viewed

@@ -1,93 +0,0 @@
-import torch
-import numpy as np
-from scipy.signal import get_window
-import librosa.util as librosa_util
-def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
-                     n_fft=800, dtype=np.float32, norm=None):
-    """
-    # from librosa 0.6
-    Compute the sum-square envelope of a window function at a given hop length.
-    This is used to estimate modulation effects induced by windowing
-    observations in short-time fourier transforms.
-    Parameters
-    ----------
-    window : string, tuple, number, callable, or list-like
-        Window specification, as in `get_window`
-    n_frames : int > 0
-        The number of analysis frames
-    hop_length : int > 0
-        The number of samples to advance between frames
-    win_length : int > 0 or None
-        The length of the window function (default 800).  When `None` is
-        passed explicitly, `n_fft` is used instead.
-    n_fft : int > 0
-        The length of each analysis frame.
-    dtype : np.dtype
-        The data type of the output
-    norm : passed through to `librosa.util.normalize`
-    Returns
-    -------
-    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
-        The sum-squared envelope of the window function
-    """
-    if win_length is None:
-        win_length = n_fft
-    n = n_fft + hop_length * (n_frames - 1)
-    x = np.zeros(n, dtype=dtype)
-    # Compute the squared window at the desired length
-    win_sq = get_window(window, win_length, fftbins=True)
-    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
-    # `size` is keyword-only in recent librosa releases; passing it by name
-    # works with the older releases as well
-    win_sq = librosa_util.pad_center(win_sq, size=n_fft)
-    # Fill the envelope
-    for i in range(n_frames):
-        sample = i * hop_length
-        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
-    return x
-def griffin_lim(magnitudes, stft_fn, n_iters=30):
-    """
-    Estimate a signal from spectrogram magnitudes: start from random phases,
-    then alternate `n_iters` times between re-estimating the phases of the
-    current signal and re-synthesizing it with the given magnitudes.
-    PARAMS
-    ------
-    magnitudes: spectrogram magnitudes
-    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
-    n_iters: number of refinement iterations
-    """
-    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
-    angles = angles.astype(np.float32)
-    # plain tensor: the torch.autograd.Variable wrapper is deprecated and a no-op
-    angles = torch.from_numpy(angles)
-    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
-    for _ in range(n_iters):
-        _, angles = stft_fn.transform(signal)
-        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
-    return signal
-def dynamic_range_compression(x, C=1, clip_val=1e-5):
-    """
-    Log-compress `x` after flooring it at `clip_val`.
-    PARAMS
-    ------
-    C: compression factor
-    clip_val: lower bound applied to `x` before the logarithm
-    """
-    floored = torch.clamp(x, min=clip_val)
-    return torch.log(floored * C)
-def dynamic_range_decompression(x, C=1):
-    """
-    Invert the log compression: exponentiate `x`, then divide by `C`.
-    PARAMS
-    ------
-    C: compression factor used to compress
-    """
-    expanded = torch.exp(x)
-    return expanded / C

melgan/utils/hparams.py DELETED Viewed

@@ -1,67 +0,0 @@
-# modified from https://github.com/HarryVolek/PyTorch_Speaker_Verification
-import os
-import yaml
-def load_hparam_str(hp_str):
-    """Parse a YAML hyper-parameter string into an HParam.
-    HParam only accepts a file path, so the text is written to a private
-    temporary file first. The file is removed even when parsing fails.
-    """
-    import tempfile
-    # a unique temp file instead of a fixed name in the working directory:
-    # two processes can no longer clobber each other's file
-    fd, path = tempfile.mkstemp(suffix='.yaml', text=True)
-    try:
-        with os.fdopen(fd, 'w') as f:
-            f.write(hp_str)
-        return HParam(path)
-    finally:
-        os.remove(path)
-def load_hparam(filename):
-    """Read a (possibly multi-document) YAML file into one flat dict.
-    The top-level keys of all documents are merged; a key appearing in a
-    later document overrides the earlier one. Empty documents are skipped.
-    """
-    hparam_dict = dict()
-    # the documents are parsed lazily, so consume them while the file is open
-    with open(filename, 'r') as stream:
-        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
-        # only load config files (or checkpoints) from trusted sources.
-        for doc in yaml.load_all(stream, Loader=yaml.Loader):
-            if doc is None:
-                # e.g. a trailing '---' with nothing after it
-                continue
-            for k, v in doc.items():
-                hparam_dict[k] = v
-    return hparam_dict
-def merge_dict(user, default):
-    """Recursively fill keys missing from `user` with the values of `default`.
-    `user` is modified in place and returned; values already present win.
-    When either argument is not a dict, `user` is returned untouched.
-    """
-    if not (isinstance(user, dict) and isinstance(default, dict)):
-        return user
-    for key, fallback in default.items():
-        user[key] = merge_dict(user[key], fallback) if key in user else fallback
-    return user
-class Dotdict(dict):
-    """
-    a dictionary that supports dot notation
-    as well as dictionary access notation
-    usage: d = Dotdict() or d = Dotdict({'val1':'first'})
-    set attributes: d.val2 = 'second' or d['val2'] = 'second'
-    get attributes: d.val2 or d['val2']
-    note: reading a missing attribute raises KeyError, as d['missing'] does
-    """
-    __getattr__ = dict.__getitem__
-    __setattr__ = dict.__setitem__
-    __delattr__ = dict.__delitem__
-    def __init__(self, dct=None):
-        for key, value in (dct or {}).items():
-            # anything exposing .keys() is wrapped recursively
-            self[key] = Dotdict(value) if hasattr(value, 'keys') else value
-class HParam(Dotdict):
-    """Dotdict filled with the hyper-parameters read from a YAML file."""
-    __getattr__ = Dotdict.__getitem__
-    __setattr__ = Dotdict.__setitem__
-    __delattr__ = Dotdict.__delitem__
-    def __init__(self, file):
-        # this resolves to dict.__init__, skipping Dotdict.__init__
-        super(Dotdict, self).__init__()
-        # nested sections become Dotdicts, then each top-level entry is stored on self
-        for name, section in Dotdict(load_hparam(file)).items():
-            self[name] = section

melgan/utils/plotting.py DELETED Viewed

@@ -1,29 +0,0 @@
-import matplotlib
-matplotlib.use("Agg")
-import matplotlib.pylab as plt
-import numpy as np
-def save_figure_to_numpy(fig):
-    """Convert a rendered matplotlib figure into a (3, height, width) uint8 array.
-    The figure's canvas must already have been drawn.
-    """
-    # np.frombuffer replaces np.fromstring(..., sep=''), whose binary mode is
-    # deprecated; the copy keeps the result writable, as it was before
-    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8).copy()
-    # the canvas reports (width, height); the pixel rows are laid out height-first
-    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-    # move the channel axis to the front
-    data = np.transpose(data, (2, 0, 1))
-    return data
-def plot_waveform_to_numpy(waveform):
-    """Plot `waveform` against its sample index and return the image as an array.
-    The y axis is fixed to [-1, 1]. The array comes from
-    save_figure_to_numpy, and the figure is closed afterwards.
-    """
-    fig, ax = plt.subplots(figsize=(12, 3))
-    ax.plot(range(len(waveform)), waveform,
-            linewidth=0.1, alpha=0.7, color='blue')
-    ax.set_xlabel("Samples")
-    ax.set_ylabel("Amplitude")
-    ax.set_ylim(-1, 1)
-    fig.tight_layout()
-    fig.canvas.draw()
-    data = save_figure_to_numpy(fig)
-    # close this figure explicitly rather than whichever one is current
-    plt.close(fig)
-    return data

melgan/utils/stft.py DELETED Viewed

@@ -1,184 +0,0 @@
-"""
-BSD 3-Clause License
-Copyright (c) 2017, Prem Seetharaman
-All rights reserved.
-* Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are met:
-* Redistributions of source code must retain the above copyright notice,
-  this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright notice, this
-  list of conditions and the following disclaimer in the
-  documentation and/or other materials provided with the distribution.
-* Neither the name of the copyright holder nor the names of its
-  contributors may be used to endorse or promote products derived from this
-  software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""
-import torch
-import numpy as np
-import torch.nn.functional as F
-from torch.autograd import Variable
-from scipy.signal import get_window
-from librosa.util import pad_center, tiny
-from .audio_processing import window_sumsquare, dynamic_range_compression, dynamic_range_decompression
-from librosa.filters import mel as librosa_mel_fn
-class STFT(torch.nn.Module):
-    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft
-
-    Short-time Fourier transform and its inverse, computed as 1-D
-    (transposed) convolutions against fixed Fourier bases that are stored
-    as module buffers.
-    """
-    def __init__(self, filter_length=800, hop_length=200, win_length=800,
-                 window='hann'):
-        """Build the forward and inverse bases.
-
-        filter_length: FFT size; hop_length: stride between frames;
-        win_length: window length (must not exceed filter_length);
-        window: window name handed to scipy's get_window, or None for
-        no windowing.
-        """
-        super(STFT, self).__init__()
-        self.filter_length = filter_length
-        self.hop_length = hop_length
-        self.win_length = win_length
-        self.window = window
-        self.forward_transform = None
-        scale = self.filter_length / self.hop_length
-        fourier_basis = np.fft.fft(np.eye(self.filter_length))
-        # keep the non-redundant half of the spectrum, real rows then imaginary rows
-        cutoff = int((self.filter_length / 2 + 1))
-        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
-                                   np.imag(fourier_basis[:cutoff, :])])
-        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
-        inverse_basis = torch.FloatTensor(
-            np.linalg.pinv(scale * fourier_basis).T[:, None, :])
-        if window is not None:
-            assert(filter_length >= win_length)
-            # get window and zero center pad it to filter_length
-            fft_window = get_window(window, win_length, fftbins=True)
-            # size is passed by keyword: newer librosa releases make it
-            # keyword-only, and older ones accept the keyword as well
-            fft_window = pad_center(fft_window, size=filter_length)
-            fft_window = torch.from_numpy(fft_window).float()
-            # window the bases
-            forward_basis *= fft_window
-            inverse_basis *= fft_window
-        self.register_buffer('forward_basis', forward_basis.float())
-        self.register_buffer('inverse_basis', inverse_basis.float())
-    def transform(self, input_data):
-        """Return (magnitude, phase) for a batch of signals.
-
-        input_data is viewed as (num_batches, num_samples). Both outputs
-        have filter_length / 2 + 1 rows on dim 1 and are on the CPU,
-        because the convolution result is moved there.
-        """
-        num_batches = input_data.size(0)
-        num_samples = input_data.size(1)
-        self.num_samples = num_samples
-        half = int(self.filter_length / 2)
-        # similar to librosa, reflect-pad the input
-        input_data = input_data.view(num_batches, 1, num_samples)
-        input_data = F.pad(
-            input_data.unsqueeze(1),
-            (half, half, 0, 0),
-            mode='reflect')
-        input_data = input_data.squeeze(1)
-        # https://github.com/NVIDIA/tacotron2/issues/125
-        # (the basis is a buffer and already carries no gradient, so the
-        # former Variable wrapper was a no-op)
-        forward_transform = F.conv1d(
-            input_data,  # cuda()
-            self.forward_basis,  # cuda()
-            stride=self.hop_length,
-            padding=0).cpu()
-        cutoff = half + 1
-        real_part = forward_transform[:, :cutoff, :]
-        imag_part = forward_transform[:, cutoff:, :]
-        magnitude = torch.sqrt(real_part**2 + imag_part**2)
-        # phase is computed on .data, i.e. outside the autograd graph
-        phase = torch.atan2(imag_part.data, real_part.data)
-        return magnitude, phase
-    def inverse(self, magnitude, phase):
-        """Rebuild a signal batch from magnitude and phase.
-
-        Returns a tensor with a singleton channel on dim 1; filter_length / 2
-        samples are trimmed from each end to undo the padding added by
-        ``transform``.
-        """
-        recombine_magnitude_phase = torch.cat(
-            [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
-        inverse_transform = F.conv_transpose1d(
-            recombine_magnitude_phase,
-            self.inverse_basis,
-            stride=self.hop_length,
-            padding=0)
-        if self.window is not None:
-            window_sum = window_sumsquare(
-                self.window, magnitude.size(-1), hop_length=self.hop_length,
-                win_length=self.win_length, n_fft=self.filter_length,
-                dtype=np.float32)
-            # remove modulation effects
-            device = inverse_transform.device
-            approx_nonzero_indices = torch.from_numpy(
-                np.where(window_sum > tiny(window_sum))[0]).to(device)
-            # keep the normaliser on the same device as the signal
-            window_sum = torch.from_numpy(window_sum).to(device)
-            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
-            # scale by hop ratio
-            inverse_transform *= float(self.filter_length) / self.hop_length
-        half = int(self.filter_length / 2)
-        inverse_transform = inverse_transform[:, :, half:]
-        inverse_transform = inverse_transform[:, :, :-half:]
-        return inverse_transform
-    def forward(self, input_data):
-        """Transform then invert ``input_data``; the intermediate magnitude
-        and phase are kept on ``self.magnitude`` / ``self.phase``."""
-        self.magnitude, self.phase = self.transform(input_data)
-        reconstruction = self.inverse(self.magnitude, self.phase)
-        return reconstruction
-class TacotronSTFT(torch.nn.Module):
-    """Mel-spectrogram front end: STFT magnitudes projected onto a mel
-    filterbank and passed through dynamic range compression."""
-    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
-                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
-                 mel_fmax=None):
-        """Create the STFT module and register the mel filterbank buffer."""
-        super(TacotronSTFT, self).__init__()
-        self.n_mel_channels = n_mel_channels
-        self.sampling_rate = sampling_rate
-        self.stft_fn = STFT(filter_length, hop_length, win_length)
-        # Arguments are passed by keyword: newer librosa releases make them
-        # keyword-only, and older ones accept the same keywords.
-        mel_basis = librosa_mel_fn(
-            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels,
-            fmin=mel_fmin, fmax=mel_fmax)
-        mel_basis = torch.from_numpy(mel_basis).float()
-        self.register_buffer('mel_basis', mel_basis)
-    def spectral_normalize(self, magnitudes):
-        """Apply ``dynamic_range_compression`` to ``magnitudes``."""
-        output = dynamic_range_compression(magnitudes)
-        return output
-    def spectral_de_normalize(self, magnitudes):
-        """Apply ``dynamic_range_decompression`` to ``magnitudes``."""
-        output = dynamic_range_decompression(magnitudes)
-        return output
-    def mel_spectrogram(self, y):
-        """Computes mel-spectrograms from a batch of waves
-        PARAMS
-        ------
-        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
-        RETURNS
-        -------
-        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
-        """
-        # range check on the input samples (AssertionError when violated)
-        assert(torch.min(y.data) >= -1)
-        assert(torch.max(y.data) <= 1)
-        magnitudes, phases = self.stft_fn.transform(y)
-        # .data detaches the magnitudes from the autograd graph
-        magnitudes = magnitudes.data
-        mel_output = torch.matmul(self.mel_basis, magnitudes)
-        mel_output = self.spectral_normalize(mel_output)
-        return mel_output

melgan/utils/train.py DELETED Viewed

@@ -1,131 +0,0 @@
-import os
-import math
-import tqdm
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import itertools
-import traceback
-from model.generator import Generator
-from model.multiscale import MultiScaleDiscriminator
-from .utils import get_commit_hash
-from .validation import validate
-def train(args, pt_dir, chkpt_path, trainloader, valloader, writer, logger, hp, hp_str):
-    """Run the generator/discriminator training loop.
-
-    The epoch loop is unbounded (itertools.count); the function only
-    returns after an exception has been caught, logged and printed.
-
-    Args:
-        args: run options; ``args.name`` is used in checkpoint file names
-            and the object is forwarded to ``validate``.
-        pt_dir: directory in which checkpoints are saved.
-        chkpt_path: checkpoint file to resume from, or None to start fresh.
-        trainloader: yields ``((melG, audioG), (melD, audioD))`` per step;
-            its ``dataset`` must provide ``shuffle_mapping()``.
-        valloader: forwarded to ``validate``.
-        writer: provides ``log_training(loss_g, loss_d, step)``; also
-            forwarded to ``validate``.
-        logger: receives info/warning/error messages.
-        hp: hyper-parameters (reads ``hp.audio``, ``hp.train``,
-            ``hp.model.feat_match`` and ``hp.log``).
-        hp_str: text form of the hyper-parameters; stored in checkpoints
-            and compared against the stored value on resume.
-    """
-    model_g = Generator(hp.audio.n_mel_channels)  # cuda()
-    model_d = MultiScaleDiscriminator()  # cuda()
-    # both optimizers share the same Adam settings
-    optim_g = torch.optim.Adam(model_g.parameters(),
-        lr=hp.train.adam.lr, betas=(hp.train.adam.beta1, hp.train.adam.beta2))
-    optim_d = torch.optim.Adam(model_d.parameters(),
-        lr=hp.train.adam.lr, betas=(hp.train.adam.beta1, hp.train.adam.beta2))
-    githash = get_commit_hash()
-    # -1 so that a fresh run starts at epoch 0 (the loop begins at init_epoch+1)
-    init_epoch = -1
-    step = 0
-    if chkpt_path is not None:
-        logger.info("Resuming from checkpoint: %s" % chkpt_path)
-        checkpoint = torch.load(chkpt_path)
-        model_g.load_state_dict(checkpoint['model_g'])
-        model_d.load_state_dict(checkpoint['model_d'])
-        optim_g.load_state_dict(checkpoint['optim_g'])
-        optim_d.load_state_dict(checkpoint['optim_d'])
-        step = checkpoint['step']
-        init_epoch = checkpoint['epoch']
-        # mismatches are only reported; training continues with the new values
-        if hp_str != checkpoint['hp_str']:
-            logger.warning("New hparams is different from checkpoint. Will use new.")
-        if githash != checkpoint['githash']:
-            logger.warning("Code might be different: git hash is different.")
-            logger.warning("%s -> %s" % (checkpoint['githash'], githash))
-    else:
-        logger.info("Starting new training run.")
-    # this accelerates training when the size of minibatch is always consistent.
-    # if not consistent, it'll horribly slow down.
-    torch.backends.cudnn.benchmark = True
-    try:
-        model_g.train()
-        model_d.train()
-        for epoch in itertools.count(init_epoch+1):
-            # validation runs before the epoch's training, without gradients
-            if epoch % hp.log.validation_interval == 0:
-                with torch.no_grad():
-                    validate(hp, args, model_g, model_d, valloader, writer, step)
-            trainloader.dataset.shuffle_mapping()
-            loader = tqdm.tqdm(trainloader, desc='Loading train data')
-            # each step gets one batch for the generator update and a
-            # separate batch for the discriminator update
-            for (melG, audioG), (melD, audioD) in loader:
-                # melG = melG.cuda()
-                # audioG = audioG.cuda()
-                # melD = melD.cuda()
-                # audioD = audioD.cuda()
-                # generator
-                optim_g.zero_grad()
-                # generated audio is cropped to the training segment length
-                fake_audio = model_g(melG)[:, :, :hp.audio.segment_length]
-                disc_fake = model_d(fake_audio)
-                disc_real = model_d(audioG)
-                loss_g = 0.0
-                # per discriminator output: squared distance of the fake score
-                # from 1, plus a weighted L1 distance between fake and real features
-                for (feats_fake, score_fake), (feats_real, _) in zip(disc_fake, disc_real):
-                    loss_g += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))
-                    for feat_f, feat_r in zip(feats_fake, feats_real):
-                        loss_g += hp.model.feat_match * torch.mean(torch.abs(feat_f - feat_r))
-                loss_g.backward()
-                optim_g.step()
-                # discriminator
-                fake_audio = model_g(melD)[:, :, :hp.audio.segment_length]
-                # detached so the discriminator loss does not backpropagate into the generator
-                fake_audio = fake_audio.detach()
-                loss_d_sum = 0.0
-                # the same fake/real batch is reused for every discriminator repetition
-                for _ in range(hp.train.rep_discriminator):
-                    optim_d.zero_grad()
-                    disc_fake = model_d(fake_audio)
-                    disc_real = model_d(audioD)
-                    loss_d = 0.0
-                    # real scores are pushed towards 1, fake scores towards 0
-                    for (_, score_fake), (_, score_real) in zip(disc_fake, disc_real):
-                        loss_d += torch.mean(torch.sum(torch.pow(score_real - 1.0, 2), dim=[1, 2]))
-                        loss_d += torch.mean(torch.sum(torch.pow(score_fake, 2), dim=[1, 2]))
-                    loss_d.backward()
-                    optim_d.step()
-                    loss_d_sum += loss_d
-                step += 1
-                # logging
-                loss_g = loss_g.item()
-                loss_d_avg = loss_d_sum / hp.train.rep_discriminator
-                loss_d_avg = loss_d_avg.item()
-                # abort on very large or NaN losses
-                if any([loss_g > 1e8, math.isnan(loss_g), loss_d_avg > 1e8, math.isnan(loss_d_avg)]):
-                    logger.error("loss_g %.01f loss_d_avg %.01f at step %d!" % (loss_g, loss_d_avg, step))
-                    raise Exception("Loss exploded")
-                if step % hp.log.summary_interval == 0:
-                    writer.log_training(loss_g, loss_d_avg, step)
-                    loader.set_description("g %.04f d %.04f | step %d" % (loss_g, loss_d_avg, step))
-            # checkpoint holds both models, both optimizers and the bookkeeping needed to resume
-            if epoch % hp.log.save_interval == 0:
-                save_path = os.path.join(pt_dir, '%s_%s_%04d.pt'
-                    % (args.name, githash, epoch))
-                torch.save({
-                    'model_g': model_g.state_dict(),
-                    'model_d': model_d.state_dict(),
-                    'optim_g': optim_g.state_dict(),
-                    'optim_d': optim_d.state_dict(),
-                    'step': step,
-                    'epoch': epoch,
-                    'hp_str': hp_str,
-                    'githash': githash,
-                }, save_path)
-                logger.info("Saved checkpoint to: %s" % save_path)
-    except Exception as e:
-        # any failure (including "Loss exploded") ends training here; it is
-        # logged and its traceback printed, but not re-raised
-        logger.info("Exiting due to exception: %s" % e)
-        traceback.print_exc()

melgan/utils/utils.py DELETED Viewed

@@ -1,26 +0,0 @@
-import random
-import subprocess
-import numpy as np
-from scipy.io.wavfile import read
-def get_commit_hash():
-    """Return the abbreviated hash of the current git HEAD as a str.
-
-    Runs ``git rev-parse --short HEAD``; a non-zero exit status raises
-    subprocess.CalledProcessError.
-    """
-    git_cmd = ["git", "rev-parse", "--short", "HEAD"]
-    completed = subprocess.run(git_cmd, stdout=subprocess.PIPE, check=True)
-    raw = completed.stdout
-    return raw.strip().decode('utf-8')
-def read_wav_np(path):
-    """Read a WAV file and return ``(sample_rate, samples)``.
-
-    Only the first channel of a 2-D (multi-channel) file is kept. Integer
-    PCM data is scaled by its full-scale value: int16 by 32768, int32 by
-    2147483648, and uint8 is re-centred around 128 then divided by 128.
-    Other dtypes are passed through unscaled. The samples are always
-    returned as float32.
-    """
-    sr, wav = read(path)
-    if wav.ndim == 2:
-        wav = wav[:, 0]
-    if wav.dtype == np.int16:
-        wav = wav / 32768.0
-    elif wav.dtype == np.int32:
-        wav = wav / 2147483648.0
-    elif wav.dtype == np.uint8:
-        # Widen before subtracting: in uint8 arithmetic ``wav - 128`` wraps
-        # around, which mapped sample 0 to +1.0 instead of -1.0.
-        wav = (wav.astype(np.float32) - 128.0) / 128.0
-    wav = wav.astype(np.float32)
-    return sr, wav

melgan/utils/validation.py DELETED Viewed

@@ -1,41 +0,0 @@
-import tqdm
-import torch
-def validate(hp, args, generator, discriminator, valloader, writer, step):
-    """Compute validation losses over ``valloader`` and log them.
-
-    Both models are evaluated in eval mode and are put back into whatever
-    mode they were in before the call, so a surrounding training loop
-    keeps training in train mode. cudnn benchmarking is disabled for the
-    duration and re-enabled afterwards.
-
-    Args:
-        hp: hyper-parameters; ``hp.model.feat_match`` weights the feature loss.
-        args: unused here.
-        generator, discriminator: the models under evaluation.
-        valloader: yields ``(mel, audio)`` batches and exposes ``dataset``.
-        writer: receives ``log_validation(...)`` with the averaged losses and
-            the first item of the last batch (real and generated audio).
-        step: global step used as the logging index.
-
-    Nothing is logged when the loader yields no batches.
-    """
-    gen_was_training = generator.training
-    disc_was_training = discriminator.training
-    generator.eval()
-    discriminator.eval()
-    torch.backends.cudnn.benchmark = False
-    try:
-        loader = tqdm.tqdm(valloader, desc='Validation loop')
-        loss_g_sum = 0.0
-        loss_d_sum = 0.0
-        audio = fake_audio = None
-        for mel, audio in loader:
-            # mel = mel.cuda()
-            # audio = audio.cuda()
-            # generator
-            fake_audio = generator(mel)
-            # crop the generated audio to the length of the reference
-            disc_fake = discriminator(fake_audio[:, :, :audio.size(2)])
-            disc_real = discriminator(audio)
-            loss_g = 0.0
-            loss_d = 0.0
-            for (feats_fake, score_fake), (feats_real, score_real) in zip(disc_fake, disc_real):
-                loss_g += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2]))
-                for feat_f, feat_r in zip(feats_fake, feats_real):
-                    loss_g += hp.model.feat_match * torch.mean(torch.abs(feat_f - feat_r))
-                loss_d += torch.mean(torch.sum(torch.pow(score_real - 1.0, 2), dim=[1, 2]))
-                loss_d += torch.mean(torch.sum(torch.pow(score_fake, 2), dim=[1, 2]))
-            loss_g_sum += loss_g.item()
-            loss_d_sum += loss_d.item()
-        if fake_audio is None:
-            # empty loader: there is nothing to average or to log
-            return
-        # NOTE(review): per-batch losses are averaged over the dataset size;
-        # this matches a per-batch mean only for batch size 1 - confirm.
-        loss_g_avg = loss_g_sum / len(valloader.dataset)
-        loss_d_avg = loss_d_sum / len(valloader.dataset)
-        audio = audio[0][0].cpu().detach().numpy()
-        fake_audio = fake_audio[0][0].cpu().detach().numpy()
-        writer.log_validation(loss_g_avg, loss_d_avg, generator, discriminator, audio, fake_audio, step)
-    finally:
-        # restore the modes the caller had set before validation
-        generator.train(gen_was_training)
-        discriminator.train(disc_was_training)
-        torch.backends.cudnn.benchmark = True

melgan/utils/writer.py DELETED Viewed

@@ -1,33 +0,0 @@
-from tensorboardX import SummaryWriter
-from .plotting import plot_waveform_to_numpy
-class MyWriter(SummaryWriter):
-    """SummaryWriter with helpers for logging training and validation data."""
-    def __init__(self, hp, logdir):
-        """Open the writer on ``logdir`` and remember the audio sample rate."""
-        super().__init__(logdir)
-        # the target audio is logged only on the first validation call
-        self.is_first = True
-        self.sample_rate = hp.audio.sampling_rate
-    def log_training(self, g_loss, d_loss, step):
-        """Record the generator and discriminator training losses at ``step``."""
-        for tag, loss in (('train.g_loss', g_loss), ('train.d_loss', d_loss)):
-            self.add_scalar(tag, loss, step)
-    def log_validation(self, g_loss, d_loss, generator, discriminator, target, prediction, step):
-        """Record validation losses, the predicted audio and its waveform
-        image, and parameter histograms of both models; the target audio
-        and waveform are added on the first call only."""
-        for tag, loss in (('validation.g_loss', g_loss), ('validation.d_loss', d_loss)):
-            self.add_scalar(tag, loss, step)
-        self.add_audio('raw_audio_predicted', prediction, step, self.sample_rate)
-        predicted_image = plot_waveform_to_numpy(prediction)
-        self.add_image('waveform_predicted', predicted_image, step)
-        for model in (generator, discriminator):
-            self.log_histogram(model, step)
-        if not self.is_first:
-            return
-        self.add_audio('raw_audio_target', target, step, self.sample_rate)
-        target_image = plot_waveform_to_numpy(target)
-        self.add_image('waveform_target', target_image, step)
-        self.is_first = False
-    def log_histogram(self, model, step):
-        """Add one histogram per named parameter of ``model``, using the
-        parameter name with dots replaced by slashes as the tag."""
-        for param_name, param in model.named_parameters():
-            tag = param_name.replace('.', '/')
-            values = param.cpu().detach().numpy()
-            self.add_histogram(tag, values, step)