Commit 3aa4060 · Parent: c79c2bf
Upload 39 files
- LICENSE +21 -0
- bigvgan/LICENSE +21 -0
- bigvgan/README.md +138 -0
- bigvgan/configs/nsf_bigvgan.yaml +60 -0
- bigvgan/inference.py +71 -0
- bigvgan/model/__init__.py +1 -0
- bigvgan/model/alias/__init__.py +6 -0
- bigvgan/model/alias/act.py +129 -0
- bigvgan/model/alias/filter.py +95 -0
- bigvgan/model/alias/resample.py +49 -0
- bigvgan/model/bigv.py +64 -0
- bigvgan/model/generator.py +143 -0
- bigvgan/model/nsf.py +394 -0
- bigvgan_pretrain/README.md +5 -0
- bigvgan_pretrain/nsf_bigvgan_pretrain_32K.pth +3 -0
- configs/base.yaml +41 -0
- grad/LICENSE +19 -0
- grad/__init__.py +0 -0
- grad/base.py +29 -0
- grad/diffusion.py +253 -0
- grad/encoder.py +327 -0
- grad/model.py +148 -0
- grad/reversal.py +62 -0
- grad/solver.py +190 -0
- grad/ssim.py +59 -0
- grad/utils.py +99 -0
- grad_extend/data.py +135 -0
- grad_extend/train.py +188 -0
- grad_extend/utils.py +77 -0
- grad_pretrain/README.md +3 -0
- hubert/__init__.py +0 -0
- hubert/hubert_model.py +229 -0
- hubert/inference.py +67 -0
- hubert_pretrain/README.md +3 -0
- hubert_pretrain/hubert-soft-0d54a1f4.pt +3 -0
- pitch/__init__.py +1 -0
- pitch/inference.py +86 -0
- requirements.txt +11 -0
- spec/inference.py +113 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 PlayVoice
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
bigvgan/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 PlayVoice
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
bigvgan/README.md
ADDED
@@ -0,0 +1,138 @@
+<div align="center">
+<h1> Neural Source-Filter BigVGAN </h1>
+Just For Fun
+</div>
+
+
+## Dataset preparation
+
+Put the dataset into the data_raw directory according to the following file structure
+```shell
+data_raw
+├───speaker0
+│   ├───000001.wav
+│   ├───...
+│   └───000xxx.wav
+└───speaker1
+    ├───000001.wav
+    ├───...
+    └───000xxx.wav
+```
+
+## Install dependencies
+
+- 1, install software dependencies
+
+> pip install -r requirements.txt
+
+- 2, download the [release](https://github.com/PlayVoice/NSF-BigVGAN/releases/tag/debug) model, and test it
+
+> python nsf_bigvgan_inference.py --config configs/nsf_bigvgan.yaml --model nsf_bigvgan_g.pth --wave test.wav
+
+## Data preprocessing
+
+- 1, re-sample to 32kHz
+
+> python prepare/preprocess_a.py -w ./data_raw -o ./data_bigvgan/waves-32k
+
+- 2, extract pitch
+
+> python prepare/preprocess_f0.py -w data_bigvgan/waves-32k/ -p data_bigvgan/pitch
+
+- 3, extract mel: [100, length]
+
+> python prepare/preprocess_spec.py -w data_bigvgan/waves-32k/ -s data_bigvgan/mel
+
+- 4, generate the training index
+
+> python prepare/preprocess_train.py
+
+```shell
+data_bigvgan/
+│
+└── waves-32k
+│    └── speaker0
+│    │      ├── 000001.wav
+│    │      └── 000xxx.wav
+│    └── speaker1
+│           ├── 000001.wav
+│           └── 000xxx.wav
+└── pitch
+│    └── speaker0
+│    │      ├── 000001.pit.npy
+│    │      └── 000xxx.pit.npy
+│    └── speaker1
+│           ├── 000001.pit.npy
+│           └── 000xxx.pit.npy
+└── mel
+     └── speaker0
+     │      ├── 000001.mel.pt
+     │      └── 000xxx.mel.pt
+     └── speaker1
+            ├── 000001.mel.pt
+            └── 000xxx.mel.pt
+
+```
+
+## Train
+
+- 1, start training
+
+> python nsf_bigvgan_trainer.py -c configs/nsf_bigvgan.yaml -n nsf_bigvgan
+
+- 2, resume training
+
+> python nsf_bigvgan_trainer.py -c configs/nsf_bigvgan.yaml -n nsf_bigvgan -p chkpt/nsf_bigvgan/***.pth
+
+- 3, view logs
+
+> tensorboard --logdir logs/
+
+
+## Inference
+
+- 1, export the inference model
+
+> python nsf_bigvgan_export.py --config configs/maxgan.yaml --checkpoint_path chkpt/nsf_bigvgan/***.pt
+
+- 2, extract mel
+
+> python spec/inference.py -w test.wav -m test.mel.pt
+
+- 3, extract F0
+
+> python pitch/inference.py -w test.wav -p test.csv
+
+- 4, infer
+
+> python nsf_bigvgan_inference.py --config configs/nsf_bigvgan.yaml --model nsf_bigvgan_g.pth --wave test.wav
+
+or
+
+> python nsf_bigvgan_inference.py --config configs/nsf_bigvgan.yaml --model nsf_bigvgan_g.pth --mel test.mel.pt --pit test.csv
+
+## Augmentation of mel
+To compensate for the over-smoothed output of the acoustic model, we apply Gaussian blur to the mel spectrogram when training the vocoder
+```
+# gaussian blur
+model_b = get_gaussian_kernel(kernel_size=5, sigma=2, channels=1).to(device)
+# mel blur
+mel_b = mel[:, None, :, :]
+mel_b = model_b(mel_b)
+mel_b = torch.squeeze(mel_b, 1)
+mel_r = torch.rand(1).to(device) * 0.5
+mel_b = (1 - mel_r) * mel_b + mel_r * mel
+# generator
+optim_g.zero_grad()
+fake_audio = model_g(mel_b, pit)
+```
+
+
+## Source of code and References
+
+https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts/tree/master/project/01-nsf
+
+https://github.com/mindslab-ai/univnet [[paper]](https://arxiv.org/abs/2106.07889)
+
+https://github.com/NVIDIA/BigVGAN [[paper]](https://arxiv.org/abs/2206.04658)
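The mel-blur snippet above calls `get_gaussian_kernel`, which is not among the uploaded files. A minimal sketch of such a helper, assuming a fixed-weight depthwise `Conv2d` is an acceptable implementation (the name and signature are taken from the snippet; the body is an assumption):

```python
import torch
import torch.nn as nn

def get_gaussian_kernel(kernel_size=5, sigma=2, channels=1):
    # 1-D Gaussian taps, normalized, expanded to a separable 2-D kernel
    ax = torch.arange(kernel_size, dtype=torch.float32) - (kernel_size - 1) / 2.0
    gauss = torch.exp(-(ax ** 2) / (2 * sigma ** 2))
    kernel_1d = gauss / gauss.sum()
    kernel_2d = torch.outer(kernel_1d, kernel_1d)
    # depthwise convolution so each channel is blurred independently,
    # with "same" padding so the mel shape is preserved
    conv = nn.Conv2d(channels, channels, kernel_size, padding=kernel_size // 2,
                     groups=channels, bias=False)
    conv.weight.data = kernel_2d.expand(channels, 1, -1, -1).clone()
    conv.weight.requires_grad = False
    return conv
```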
bigvgan/configs/nsf_bigvgan.yaml
ADDED
@@ -0,0 +1,60 @@
+data:
+  train_file: 'files/train.txt'
+  val_file: 'files/valid.txt'
+#############################
+train:
+  num_workers: 4
+  batch_size: 8
+  optimizer: 'adam'
+  seed: 1234
+  adam:
+    lr: 0.0002
+    beta1: 0.8
+    beta2: 0.99
+  mel_lamb: 5
+  stft_lamb: 2.5
+  pretrain: ''
+  lora: False
+#############################
+audio:
+  n_mel_channels: 100
+  segment_length: 12800   # Should be multiple of 320
+  filter_length: 1024
+  hop_length: 320         # WARNING: this can't be changed.
+  win_length: 1024
+  sampling_rate: 32000
+  mel_fmin: 40.0
+  mel_fmax: 16000.0
+#############################
+gen:
+  mel_channels: 100
+  upsample_rates: [5,4,2,2,2,2]
+  upsample_kernel_sizes: [15,8,4,4,4,4]
+  upsample_initial_channel: 320
+  resblock_kernel_sizes: [3,7,11]
+  resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
+#############################
+mpd:
+  periods: [2,3,5,7,11]
+  kernel_size: 5
+  stride: 3
+  use_spectral_norm: False
+  lReLU_slope: 0.2
+#############################
+mrd:
+  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]" # (filter_length, hop_length, win_length)
+  use_spectral_norm: False
+  lReLU_slope: 0.2
+#############################
+dist_config:
+  dist_backend: "nccl"
+  dist_url: "tcp://localhost:54321"
+  world_size: 1
+#############################
+log:
+  info_interval: 100
+  eval_interval: 1000
+  save_interval: 10000
+  num_audio: 6
+  pth_dir: 'chkpt'
+  log_dir: 'logs'
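A few of these values are coupled: `hop_length` must equal the product of `gen.upsample_rates` so that one mel frame maps to exactly one hop of audio, and `segment_length` must be a whole number of hops. A quick sanity check, assuming the config is loaded with OmegaConf as in bigvgan/inference.py:

```python
import numpy as np
from omegaconf import OmegaConf

hp = OmegaConf.load("bigvgan/configs/nsf_bigvgan.yaml")
# one mel frame maps to one hop of audio samples
assert int(np.prod(hp.gen.upsample_rates)) == hp.audio.hop_length  # 5*4*2*2*2*2 == 320
# training segments are cut in whole hops
assert hp.audio.segment_length % hp.audio.hop_length == 0          # 12800 == 40 hops
print(hp.audio.sampling_rate // hp.audio.hop_length)               # 100 mel frames per second
```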
bigvgan/inference.py
ADDED
@@ -0,0 +1,71 @@
+import sys, os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import torch
+import argparse
+
+from omegaconf import OmegaConf
+from scipy.io.wavfile import write
+from bigvgan.model.generator import Generator
+from pitch import load_csv_pitch
+
+
+def load_bigv_model(checkpoint_path, model):
+    assert os.path.isfile(checkpoint_path)
+    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
+    saved_state_dict = checkpoint_dict["model_g"]
+    state_dict = model.state_dict()
+    new_state_dict = {}
+    for k, v in state_dict.items():
+        try:
+            new_state_dict[k] = saved_state_dict[k]
+        except KeyError:
+            print("%s is not in the checkpoint" % k)
+            new_state_dict[k] = v
+    model.load_state_dict(new_state_dict)
+    return model
+
+
+def main(args):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    hp = OmegaConf.load(args.config)
+    model = Generator(hp)
+    load_bigv_model(args.model, model)
+    model.eval()
+    model.to(device)
+
+    mel = torch.load(args.mel)
+
+    pit = load_csv_pitch(args.pit)
+    pit = torch.FloatTensor(pit)
+
+    len_pit = pit.size()[0]
+    len_mel = mel.size()[1]
+    len_min = min(len_pit, len_mel)
+    pit = pit[:len_min]
+    mel = mel[:, :len_min]
+
+    with torch.no_grad():
+        mel = mel.unsqueeze(0).to(device)
+        pit = pit.unsqueeze(0).to(device)
+        audio = model.inference(mel, pit)
+        audio = audio.cpu().detach().numpy()
+
+        pitwav = model.pitch2wav(pit)
+        pitwav = pitwav.cpu().detach().numpy()
+
+    write("gvc_out.wav", hp.audio.sampling_rate, audio)
+    write("gvc_pitch.wav", hp.audio.sampling_rate, pitwav)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--mel', type=str,
+                        help="Path of mel feature file.")
+    parser.add_argument('--pit', type=str,
+                        help="Path of pitch csv file.")
+    args = parser.parse_args()
+
+    args.config = "./bigvgan/configs/nsf_bigvgan.yaml"
+    args.model = "./bigvgan_pretrain/nsf_bigvgan_pretrain_32K.pth"
+
+    main(args)
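Note that `--config` and `--model` are hard-coded at the bottom of the script, so a typical invocation (after the mel and F0 extraction steps from the README) only passes the feature files:

```shell
python bigvgan/inference.py --mel test.mel.pt --pit test.csv
```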
bigvgan/model/__init__.py
ADDED
@@ -0,0 +1 @@
+from .alias.act import SnakeAlias
bigvgan/model/alias/__init__.py
ADDED
@@ -0,0 +1,6 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+from .filter import *
+from .resample import *
+from .act import *
bigvgan/model/alias/act.py
ADDED
@@ -0,0 +1,129 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch import sin, pow
+from torch.nn import Parameter
+from .resample import UpSample1d, DownSample1d
+
+
+class Activation1d(nn.Module):
+    def __init__(self,
+                 activation,
+                 up_ratio: int = 2,
+                 down_ratio: int = 2,
+                 up_kernel_size: int = 12,
+                 down_kernel_size: int = 12):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
+
+
+class SnakeBeta(nn.Module):
+    '''
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = snakebeta(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    '''
+
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha - trainable parameter that controls frequency
+            - beta - trainable parameter that controls magnitude
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            beta is initialized to 1 by default, higher values = higher-magnitude.
+            alpha will be trained along with the rest of your model.
+        '''
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        '''
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta = x + 1/b * sin^2 (xa)
+        '''
+        alpha = self.alpha.unsqueeze(
+            0).unsqueeze(-1)  # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        return x
+
+
+class Mish(nn.Module):
+    """
+    Mish activation function is proposed in "Mish: A Self
+    Regularized Non-Monotonic Neural Activation Function"
+    paper, https://arxiv.org/abs/1908.08681.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return x * torch.tanh(F.softplus(x))
+
+
+class SnakeAlias(nn.Module):
+    def __init__(self,
+                 channels,
+                 up_ratio: int = 2,
+                 down_ratio: int = 2,
+                 up_kernel_size: int = 12,
+                 down_kernel_size: int = 12):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = SnakeBeta(channels, alpha_logscale=True)
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
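As the docstrings note, SnakeBeta is element-wise, and SnakeAlias evaluates it at twice the sample rate (upsample, activate, downsample) to suppress the aliasing the periodic nonlinearity would otherwise introduce; both preserve the input shape. A minimal shape check, assuming this upload's import paths:

```python
import torch
from bigvgan.model.alias.act import SnakeAlias, SnakeBeta

x = torch.randn(2, 64, 400)                 # [B, C, T]

# SnakeBeta is element-wise: x + 1/beta * sin^2(alpha * x)
act = SnakeBeta(64, alpha_logscale=True)
assert act(x).shape == x.shape

# SnakeAlias wraps SnakeBeta between a 2x upsample and a 2x downsample,
# so the output length still matches the input length
aa = SnakeAlias(64)
assert aa(x).shape == x.shape
```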
bigvgan/model/alias/filter.py
ADDED
@@ -0,0 +1,95 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+if 'sinc' in dir(torch):
+    sinc = torch.sinc
+else:
+    # This code is adopted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    #   LICENSE is in incl_licenses directory.
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(x == 0,
+                           torch.tensor(1., device=x.device, dtype=x.dtype),
+                           torch.sin(math.pi * x) / math.pi / x)
+
+
+# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+#   LICENSE is in incl_licenses directory.
+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):  # return filter [1,1,kernel_size]
+    even = (kernel_size % 2 == 0)
+    half_size = kernel_size // 2
+
+    # For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.:
+        beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
+    else:
+        beta = 0.
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = (torch.arange(-half_size, half_size) + 0.5)
+    else:
+        time = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter_ = torch.zeros_like(time)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        # Normalize filter to have sum = 1, otherwise we will have a small leakage
+        # of the constant component in the input signal.
+        filter_ /= filter_.sum()
+    filter = filter_.view(1, 1, kernel_size)
+
+    return filter
+
+
+class LowPassFilter1d(nn.Module):
+    def __init__(self,
+                 cutoff=0.5,
+                 half_width=0.6,
+                 stride: int = 1,
+                 padding: bool = True,
+                 padding_mode: str = 'replicate',
+                 kernel_size: int = 12):
+        # kernel_size should be even number for stylegan3 setup,
+        # in this implementation, odd number is also possible.
+        super().__init__()
+        if cutoff < -0.:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = (kernel_size % 2 == 0)
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", filter)
+
+    # input [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        if self.padding:
+            x = F.pad(x, (self.pad_left, self.pad_right),
+                      mode=self.padding_mode)
+        out = F.conv1d(x, self.filter.expand(C, -1, -1),
+                       stride=self.stride, groups=C)
+
+        return out
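`kaiser_sinc_filter1d` returns windowed-sinc taps normalized to unit sum (unit DC gain), and `LowPassFilter1d` pads so that a stride-1 filter preserves length. A small sketch, assuming this upload's import paths:

```python
import torch
from bigvgan.model.alias.filter import LowPassFilter1d, kaiser_sinc_filter1d

# the windowed-sinc taps are normalized to sum to 1 (unit DC gain)
taps = kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=12)
assert taps.shape == (1, 1, 12)
assert torch.allclose(taps.sum(), torch.tensor(1.0), atol=1e-6)

# with padding enabled and stride 1 the output keeps the input length
lpf = LowPassFilter1d(cutoff=0.25, half_width=0.3, kernel_size=12)
x = torch.randn(1, 4, 100)
assert lpf(x).shape == (1, 4, 100)
```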
bigvgan/model/alias/resample.py
ADDED
@@ -0,0 +1,49 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from torch.nn import functional as F
+from .filter import LowPassFilter1d
+from .filter import kaiser_sinc_filter1d
+
+
+class UpSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio,
+                                      half_width=0.6 / ratio,
+                                      kernel_size=self.kernel_size)
+        self.register_buffer("filter", filter)
+
+    # x: [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        x = F.pad(x, (self.pad, self.pad), mode='replicate')
+        x = self.ratio * F.conv_transpose1d(
+            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+        x = x[..., self.pad_left:-self.pad_right]
+
+        return x
+
+
+class DownSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
+                                       half_width=0.6 / ratio,
+                                       stride=ratio,
+                                       kernel_size=self.kernel_size)
+
+    def forward(self, x):
+        xx = self.lowpass(x)
+
+        return xx
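Both resamplers default to `kernel_size = 6 * ratio`; upsampling multiplies the length by `ratio` and downsampling divides it. A quick round-trip check, assuming this upload's import paths:

```python
import torch
from bigvgan.model.alias.resample import UpSample1d, DownSample1d

x = torch.randn(1, 8, 100)
up = UpSample1d(ratio=2)       # default kernel_size = 6 * ratio = 12
down = DownSample1d(ratio=2)
assert up(x).shape == (1, 8, 200)     # length * ratio
assert down(up(x)).shape == x.shape   # round trip restores the length
```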
bigvgan/model/bigv.py
ADDED
@@ -0,0 +1,64 @@
+import torch
+import torch.nn as nn
+
+from torch.nn import Conv1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+from .alias.act import SnakeAlias
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size*dilation - dilation)/2)
+
+
+class AMPBlock(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(AMPBlock, self).__init__()
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                               padding=get_padding(kernel_size, dilation[2])))
+        ])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1)))
+        ])
+        self.convs2.apply(init_weights)
+
+        # total number of conv layers
+        self.num_layers = len(self.convs1) + len(self.convs2)
+
+        # periodic nonlinearity with snakebeta function and anti-aliasing
+        self.activations = nn.ModuleList([
+            SnakeAlias(channels) for _ in range(self.num_layers)
+        ])
+
+    def forward(self, x):
+        acts1, acts2 = self.activations[::2], self.activations[1::2]
+        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
+            xt = a1(x)
+            xt = c1(xt)
+            xt = a2(xt)
+            xt = c2(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
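`get_padding` computes the "same" padding for a dilated convolution, which is what keeps AMPBlock length-preserving and lets it add its residual connection. A small check, assuming this upload's import paths:

```python
import torch
from bigvgan.model.bigv import AMPBlock, get_padding

# "same" padding for a dilated conv: (k*d - d) / 2
assert get_padding(3, dilation=5) == 5

# AMPBlock is residual and length-preserving
block = AMPBlock(channels=32, kernel_size=3, dilation=(1, 3, 5))
x = torch.randn(2, 32, 160)
assert block(x).shape == x.shape
```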
bigvgan/model/generator.py
ADDED
@@ -0,0 +1,143 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+
+from torch.nn import Conv1d
+from torch.nn import ConvTranspose1d
+from torch.nn.utils import weight_norm
+from torch.nn.utils import remove_weight_norm
+
+from .nsf import SourceModuleHnNSF
+from .bigv import init_weights, AMPBlock, SnakeAlias
+
+
+class Generator(torch.nn.Module):
+    # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
+    def __init__(self, hp):
+        super(Generator, self).__init__()
+        self.hp = hp
+        self.num_kernels = len(hp.gen.resblock_kernel_sizes)
+        self.num_upsamples = len(hp.gen.upsample_rates)
+        # pre conv
+        self.conv_pre = nn.utils.weight_norm(
+            Conv1d(hp.gen.mel_channels, hp.gen.upsample_initial_channel, 7, 1, padding=3))
+        # nsf
+        self.f0_upsamp = torch.nn.Upsample(
+            scale_factor=np.prod(hp.gen.upsample_rates))
+        self.m_source = SourceModuleHnNSF(sampling_rate=hp.audio.sampling_rate)
+        self.noise_convs = nn.ModuleList()
+        # transposed conv-based upsamplers. does not apply anti-aliasing
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(hp.gen.upsample_rates, hp.gen.upsample_kernel_sizes)):
+            # print(f'ups: {i} {k}, {u}, {(k - u) // 2}')
+            # base
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        hp.gen.upsample_initial_channel // (2 ** i),
+                        hp.gen.upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2)
+                )
+            )
+            # nsf
+            if i + 1 < len(hp.gen.upsample_rates):
+                stride_f0 = np.prod(hp.gen.upsample_rates[i + 1:])
+                stride_f0 = int(stride_f0)
+                self.noise_convs.append(
+                    Conv1d(
+                        1,
+                        hp.gen.upsample_initial_channel // (2 ** (i + 1)),
+                        kernel_size=stride_f0 * 2,
+                        stride=stride_f0,
+                        padding=stride_f0 // 2,
+                    )
+                )
+            else:
+                self.noise_convs.append(
+                    Conv1d(1, hp.gen.upsample_initial_channel //
+                           (2 ** (i + 1)), kernel_size=1)
+                )
+
+        # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = hp.gen.upsample_initial_channel // (2 ** (i + 1))
+            for k, d in zip(hp.gen.resblock_kernel_sizes, hp.gen.resblock_dilation_sizes):
+                self.resblocks.append(AMPBlock(ch, k, d))
+
+        # post conv
+        self.activation_post = SnakeAlias(ch)
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        # weight initialization
+        self.ups.apply(init_weights)
+
+    def forward(self, x, f0, train=True):
+        # nsf
+        f0 = f0[:, None]
+        f0 = self.f0_upsamp(f0).transpose(1, 2)
+        har_source = self.m_source(f0)
+        har_source = har_source.transpose(1, 2)
+        # pre conv
+        if train:
+            x = x + torch.randn_like(x) * 0.1  # Perturbation
+        x = self.conv_pre(x)
+        x = x * torch.tanh(F.softplus(x))
+
+        for i in range(self.num_upsamples):
+            # upsampling
+            x = self.ups[i](x)
+            # nsf
+            x_source = self.noise_convs[i](har_source)
+            x = x + x_source
+            # AMP blocks
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+
+        # post conv
+        x = self.activation_post(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+
+    def eval(self, inference=False):
+        super(Generator, self).eval()
+        # don't remove weight norm while validation in training loop
+        if inference:
+            self.remove_weight_norm()
+
+    def inference(self, mel, f0):
+        MAX_WAV_VALUE = 32768.0
+        audio = self.forward(mel, f0, False)
+        audio = audio.squeeze()  # collapse all dimension except time axis
+        audio = MAX_WAV_VALUE * audio
+        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1)
+        audio = audio.short()
+        return audio
+
+    def pitch2wav(self, f0):
+        MAX_WAV_VALUE = 32768.0
+        # nsf
+        f0 = f0[:, None]
+        f0 = self.f0_upsamp(f0).transpose(1, 2)
+        har_source = self.m_source(f0)
+        audio = har_source.transpose(1, 2)
+        audio = audio.squeeze()  # collapse all dimension except time axis
+        audio = MAX_WAV_VALUE * audio
+        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1)
+        audio = audio.short()
+        return audio
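Since the product of `upsample_rates` equals `hop_length` (320), the generator turns each mel frame into exactly one hop of waveform, with the NSF harmonic source injected at every upsampling stage. A shape sketch, assuming the uploaded config and import paths:

```python
import torch
from omegaconf import OmegaConf
from bigvgan.model.generator import Generator

hp = OmegaConf.load("bigvgan/configs/nsf_bigvgan.yaml")
model = Generator(hp)
model.eval()

B, T = 1, 50
mel = torch.randn(B, hp.gen.mel_channels, T)   # [B, 100, T] mel frames
f0 = 220.0 * torch.ones(B, T)                  # frame-level F0 in Hz
with torch.no_grad():
    wav = model(mel, f0, train=False)
# prod(upsample_rates) = 5*4*2*2*2*2 = 320 = hop_length,
# so each mel frame becomes exactly one hop of waveform
assert wav.shape == (B, 1, T * 320)
```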
bigvgan/model/nsf.py
ADDED
@@ -0,0 +1,394 @@
+import torch
+import numpy as np
+import sys
+import torch.nn.functional as torch_nn_func
+
+
+class PulseGen(torch.nn.Module):
+    """Definition of Pulse train generator
+
+    There are many ways to implement pulse generator.
+    Here, PulseGen is based on SineGen. For a perfect
+    """
+
+    def __init__(self, samp_rate, pulse_amp=0.1, noise_std=0.003, voiced_threshold=0):
+        super(PulseGen, self).__init__()
+        self.pulse_amp = pulse_amp
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+        self.noise_std = noise_std
+        self.l_sinegen = SineGen(
+            self.sampling_rate,
+            harmonic_num=0,
+            sine_amp=self.pulse_amp,
+            noise_std=0,
+            voiced_threshold=self.voiced_threshold,
+            flag_for_pulse=True,
+        )
+
+    def forward(self, f0):
+        """Pulse train generator
+        pulse_train, uv = forward(f0)
+        input F0: tensor(batchsize=1, length, dim=1)
+                  f0 for unvoiced steps should be 0
+        output pulse_train: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1)
+
+        Note: self.l_sine doesn't make sure that the initial phase of
+        a voiced segment is np.pi, the first pulse in a voiced segment
+        may not be at the first time step within a voiced segment
+        """
+        with torch.no_grad():
+            sine_wav, uv, noise = self.l_sinegen(f0)
+
+            # sine without additive noise
+            pure_sine = sine_wav - noise
+
+            # step t corresponds to a pulse if
+            # sine[t] > sine[t+1] & sine[t] > sine[t-1]
+            # & sine[t-1], sine[t+1], and sine[t] are voiced
+            # or
+            # sine[t] is voiced, sine[t-1] is unvoiced
+            # we use torch.roll to simulate sine[t+1] and sine[t-1]
+            sine_1 = torch.roll(pure_sine, shifts=1, dims=1)
+            uv_1 = torch.roll(uv, shifts=1, dims=1)
+            uv_1[:, 0, :] = 0
+            sine_2 = torch.roll(pure_sine, shifts=-1, dims=1)
+            uv_2 = torch.roll(uv, shifts=-1, dims=1)
+            uv_2[:, -1, :] = 0
+
+            loc = (pure_sine > sine_1) * (pure_sine > sine_2) \
+                * (uv_1 > 0) * (uv_2 > 0) * (uv > 0) \
+                + (uv_1 < 1) * (uv > 0)
+
+            # pulse train without noise
+            pulse_train = pure_sine * loc
+
+            # additive noise to pulse train
+            # note that noise from sinegen is zero in voiced regions
+            pulse_noise = torch.randn_like(pure_sine) * self.noise_std
+
+            # with additive noise on pulse, and unvoiced regions
+            pulse_train += pulse_noise * loc + pulse_noise * (1 - uv)
+        return pulse_train, sine_wav, uv, pulse_noise
+
+
+class SignalsConv1d(torch.nn.Module):
+    """Filtering input signal with time invariant filter
+    Note: FIRFilter conducted filtering given fixed FIR weight
+          SignalsConv1d convolves two signals
+    Note: this is based on torch.nn.functional.conv1d
+
+    """
+
+    def __init__(self):
+        super(SignalsConv1d, self).__init__()
+
+    def forward(self, signal, system_ir):
+        """output = forward(signal, system_ir)
+
+        signal:    (batchsize, length1, dim)
+        system_ir: (length2, dim)
+
+        output:    (batchsize, length1, dim)
+        """
+        if signal.shape[-1] != system_ir.shape[-1]:
+            print("Error: SignalsConv1d expects shape:")
+            print("signal    (batchsize, length1, dim)")
+            print("system_id (batchsize, length2, dim)")
+            print("But received signal: {:s}".format(str(signal.shape)))
+            print("     system_ir: {:s}".format(str(system_ir.shape)))
+            sys.exit(1)
+        padding_length = system_ir.shape[0] - 1
+        groups = signal.shape[-1]
+
+        # pad signal on the left
+        signal_pad = torch_nn_func.pad(signal.permute(0, 2, 1), (padding_length, 0))
+        # prepare system impulse response as (dim, 1, length2)
+        # also flip the impulse response
+        ir = torch.flip(system_ir.unsqueeze(1).permute(2, 1, 0), dims=[2])
+        # convolve
+        output = torch_nn_func.conv1d(signal_pad, ir, groups=groups)
+        return output.permute(0, 2, 1)
+
+
+class CyclicNoiseGen_v1(torch.nn.Module):
+    """CyclicNoiseGen_v1
+    Cyclic noise with a single parameter of beta.
+    Pytorch v1 implementation assumes f_t is also fixed
+    """
+
+    def __init__(self, samp_rate, noise_std=0.003, voiced_threshold=0):
+        super(CyclicNoiseGen_v1, self).__init__()
+        self.samp_rate = samp_rate
+        self.noise_std = noise_std
+        self.voiced_threshold = voiced_threshold
+
+        self.l_pulse = PulseGen(
+            samp_rate,
+            pulse_amp=1.0,
+            noise_std=noise_std,
+            voiced_threshold=voiced_threshold,
+        )
+        self.l_conv = SignalsConv1d()
+
+    def noise_decay(self, beta, f0mean):
+        """decayed_noise = noise_decay(beta, f0mean)
+        decayed_noise = n[t]exp(-t * f_mean / beta / samp_rate)
+
+        beta: (dim=1) or (batchsize=1, 1, dim=1)
+        f0mean (batchsize=1, 1, dim=1)
+
+        decayed_noise (batchsize=1, length, dim=1)
+        """
+        with torch.no_grad():
+            # exp(-1.0 n / T) < 0.01 => n > -log(0.01)*T = 4.60*T
+            # truncate the noise when decayed by -40 dB
+            length = 4.6 * self.samp_rate / f0mean
+            length = length.int()
+            time_idx = torch.arange(0, length, device=beta.device)
+            time_idx = time_idx.unsqueeze(0).unsqueeze(2)
+            time_idx = time_idx.repeat(beta.shape[0], 1, beta.shape[2])
+
+            noise = torch.randn(time_idx.shape, device=beta.device)
+
+            # due to Pytorch implementation, use f0_mean as the f0 factor
+            decay = torch.exp(-time_idx * f0mean / beta / self.samp_rate)
+            return noise * self.noise_std * decay
+
+    def forward(self, f0s, beta):
+        """Produce cyclic noise"""
+        # pulse train
+        pulse_train, sine_wav, uv, noise = self.l_pulse(f0s)
+        pure_pulse = pulse_train - noise
+
+        # decayed_noise (length, dim=1)
+        if (uv < 1).all():
+            # all unvoiced
+            cyc_noise = torch.zeros_like(sine_wav)
+        else:
+            f0mean = f0s[uv > 0].mean()
+
+            decayed_noise = self.noise_decay(beta, f0mean)[0, :, :]
+            # convolve
+            cyc_noise = self.l_conv(pure_pulse, decayed_noise)
+
+        # add noise in unvoiced segments
+        cyc_noise = cyc_noise + noise * (1.0 - uv)
+        return cyc_noise, pulse_train, sine_wav, uv, noise
+
+
+class SineGen(torch.nn.Module):
+    """Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SineGen is used inside PulseGen (default False)
+
+    Note: when flag_for_pulse is True, the first time step of a voiced
+    segment is always sin(np.pi) or cos(0)
+    """
+
+    def __init__(
+        self,
+        samp_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        noise_std=0.003,
+        voiced_threshold=0,
+        flag_for_pulse=False,
+    ):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+        self.flag_for_pulse = flag_for_pulse
+
+    def _f02uv(self, f0):
+        # generate uv signal
+        uv = torch.ones_like(f0)
+        uv = uv * (f0 > self.voiced_threshold)
+        return uv
+
+    def _f02sine(self, f0_values):
+        """f0_values: (batchsize, length, dim)
+        where dim indicates fundamental tone and overtones
+        """
+        # convert to F0 in rad. The integer part n can be ignored
+        # because 2 * np.pi * n doesn't affect phase
+        rad_values = (f0_values / self.sampling_rate) % 1
+
+        # initial phase noise (no noise for fundamental component)
+        rand_ini = torch.rand(
+            f0_values.shape[0], f0_values.shape[2], device=f0_values.device
+        )
+        rand_ini[:, 0] = 0
+        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+        # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
+        if not self.flag_for_pulse:
+            # for normal case
+
+            # To prevent torch.cumsum numerical overflow,
+            # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
+            # Buffer tmp_over_one_idx indicates the time step to add -1.
+            # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
+            tmp_over_one = torch.cumsum(rad_values, 1) % 1
+            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+            cumsum_shift = torch.zeros_like(rad_values)
+            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+            sines = torch.sin(
+                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+            )
+        else:
+            # If necessary, make sure that the first time step of every
+            # voiced segment is sin(pi) or cos(0)
+            # This is used for pulse-train generation
+
+            # identify the last time step in unvoiced segments
+            uv = self._f02uv(f0_values)
+            uv_1 = torch.roll(uv, shifts=-1, dims=1)
+            uv_1[:, -1, :] = 1
+            u_loc = (uv < 1) * (uv_1 > 0)
+
+            # get the instantaneous phase
+            tmp_cumsum = torch.cumsum(rad_values, dim=1)
+            # different batch needs to be processed differently
+            for idx in range(f0_values.shape[0]):
+                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                # stores the accumulation of i.phase within
+                # each voiced segment
+                tmp_cumsum[idx, :, :] = 0
+                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+
+            # rad_values - tmp_cumsum: remove the accumulation of i.phase
+            # within the previous voiced segment.
+            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+
+            # get the sines
+            sines = torch.cos(i_phase * 2 * np.pi)
+        return sines
+
+    def forward(self, f0):
+        """sine_tensor, uv = forward(f0)
+        input F0: tensor(batchsize=1, length, dim=1)
+                  f0 for unvoiced steps should be 0
+        output sine_tensor: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1)
+        """
+        with torch.no_grad():
+            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+            # fundamental component
+            f0_buf[:, :, 0] = f0[:, :, 0]
+            for idx in np.arange(self.harmonic_num):
+                # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
+
+            # generate sine waveforms
+            sine_waves = self._f02sine(f0_buf) * self.sine_amp
+
+            # generate uv signal
+            # uv = torch.ones(f0.shape)
+            # uv = uv * (f0 > self.voiced_threshold)
+            uv = self._f02uv(f0)
+
+            # noise: for unvoiced should be similar to sine_amp
+            #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+            #        for voiced regions is self.noise_std
+            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+            noise = noise_amp * torch.randn_like(sine_waves)
+
+            # first: set the unvoiced part to 0 by uv
+            # then: additive noise
+            sine_waves = sine_waves * uv + noise
+        return sine_waves
+
+
+class SourceModuleCycNoise_v1(torch.nn.Module):
+    """SourceModuleCycNoise_v1
+    SourceModule(sampling_rate, noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling rate in Hz
+
+    noise_std: std of Gaussian noise (default: 0.003)
+    voiced_threshold: threshold to set U/V given F0 (default: 0)
+
+    cyc, noise, uv = SourceModuleCycNoise_v1(F0_upsampled, beta)
+    F0_upsampled (batchsize, length, 1)
+    beta (1)
+    cyc (batchsize, length, 1)
+    noise (batchsize, length, 1)
+    uv (batchsize, length, 1)
+    """
+
+    def __init__(self, sampling_rate, noise_std=0.003, voiced_threshod=0):
+        super(SourceModuleCycNoise_v1, self).__init__()
+        self.sampling_rate = sampling_rate
+        self.noise_std = noise_std
+        self.l_cyc_gen = CyclicNoiseGen_v1(sampling_rate, noise_std, voiced_threshod)
+
+    def forward(self, f0_upsamped, beta):
+        """
+        cyc, noise, uv = SourceModuleCycNoise_v1(F0, beta)
+        F0_upsampled (batchsize, length, 1)
+        beta (1)
+        cyc (batchsize, length, 1)
+        noise (batchsize, length, 1)
+        uv (batchsize, length, 1)
+        """
+        # source for harmonic branch
+        cyc, pulse, sine, uv, add_noi = self.l_cyc_gen(f0_upsamped, beta)
+
+        # source for noise branch, in the same shape as uv
+        noise = torch.randn_like(uv) * self.noise_std / 3
+        return cyc, noise, uv
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+    def __init__(
+        self,
+        sampling_rate=32000,
+        sine_amp=0.1,
+        add_noise_std=0.003,
+        voiced_threshod=0,
+    ):
+        super(SourceModuleHnNSF, self).__init__()
+        harmonic_num = 10
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen(
+            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
+        )
+
+        # to merge source harmonics into a single excitation
+        self.l_tanh = torch.nn.Tanh()
+        self.register_buffer('merge_w', torch.FloatTensor([[
+            0.2942, -0.2243, 0.0033, -0.0056, -0.0020, -0.0046,
+            0.0221, -0.0083, -0.0241, -0.0036, -0.0581]]))
+        self.register_buffer('merge_b', torch.FloatTensor([0.0008]))
+
+    def forward(self, x):
+        """
+        Sine_source = SourceModuleHnNSF(F0_sampled)
+        F0_sampled (batchsize, length, 1)
+        Sine_source (batchsize, length, 1)
+        """
+        # source for harmonic branch
+        sine_wavs = self.l_sin_gen(x)
+        sine_wavs = torch_nn_func.linear(
+            sine_wavs, self.merge_w) + self.merge_b
+        sine_merge = self.l_tanh(sine_wavs)
+        return sine_merge
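SineGen emits the fundamental plus `harmonic_num` overtones (noise-only in unvoiced frames), and SourceModuleHnNSF collapses them into one excitation channel through the fixed `merge_w`/`merge_b` projection and a tanh. A shape sketch, assuming this upload's import paths:

```python
import torch
from bigvgan.model.nsf import SineGen, SourceModuleHnNSF

B, T = 1, 1000                      # sample-level length
f0 = torch.zeros(B, T, 1)
f0[:, :500, 0] = 220.0              # voiced first half, unvoiced second half

# fundamental + 10 overtones; unvoiced frames carry only noise
sine = SineGen(samp_rate=32000, harmonic_num=10)(f0)
assert sine.shape == (B, T, 11)

# fixed merge_w/merge_b projection + tanh collapse the harmonics
# into the single excitation channel fed to the generator
src = SourceModuleHnNSF(sampling_rate=32000)(f0)
assert src.shape == (B, T, 1)
```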
bigvgan_pretrain/README.md
ADDED
@@ -0,0 +1,5 @@
+Path for:
+
+nsf_bigvgan_pretrain_32K.pth
+
+Download link: https://github.com/PlayVoice/NSF-BigVGAN/releases/tag/augment
bigvgan_pretrain/nsf_bigvgan_pretrain_32K.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e32aaea5fd26bcba47c18d3b0a44f5371dfce25a099aa468420d9d605eda225
+size 116020827
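This entry is a Git LFS pointer rather than the raw weights; running `git lfs pull` after cloning (or using the release link in bigvgan_pretrain/README.md) fetches the actual ~116 MB checkpoint.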
configs/base.yaml
ADDED
@@ -0,0 +1,41 @@
+train:
+  seed: 37
+  train_files: "files/train.txt"
+  valid_files: "files/valid.txt"
+  log_dir: 'logs/grad_svc'
+  full_epochs: 500
+  fast_epochs: 100
+  learning_rate: 2e-4
+  batch_size: 8
+  test_size: 4
+  test_step: 5
+  save_step: 10
+  pretrain: "grad_pretrain/gvc.pretrain.pth"
+#############################
+data:
+  segment_size: 16000   # WARNING: based on hop_length
+  max_wav_value: 32768.0
+  sampling_rate: 32000
+  filter_length: 1024
+  hop_length: 320
+  win_length: 1024
+  mel_channels: 100
+  mel_fmin: 40.0
+  mel_fmax: 16000.0
+#############################
+grad:
+  n_mels: 100
+  n_vecs: 256
+  n_pits: 256
+  n_spks: 256
+  n_embs: 64
+
+  # encoder parameters
+  n_enc_channels: 192
+  filter_channels: 512
+
+  # decoder parameters
+  dec_dim: 96
+  beta_min: 0.05
+  beta_max: 20.0
+  pe_scale: 1000
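As with the vocoder config, `segment_size` is tied to `hop_length` (16000 / 320 = 50 mel frames per training segment). A quick check, assuming OmegaConf loading as elsewhere in the repo:

```python
from omegaconf import OmegaConf

hps = OmegaConf.load("configs/base.yaml")
# segment_size must stay a whole number of hops: 16000 / 320 = 50 frames
assert hps.data.segment_size % hps.data.hop_length == 0
```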
grad/LICENSE
ADDED
@@ -0,0 +1,19 @@
+Copyright (c) 2021 Huawei Technologies Co., Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
grad/__init__.py
ADDED
File without changes
grad/base.py
ADDED
@@ -0,0 +1,29 @@
+import numpy as np
+import torch
+
+
+class BaseModule(torch.nn.Module):
+    def __init__(self):
+        super(BaseModule, self).__init__()
+
+    @property
+    def nparams(self):
+        """
+        Returns number of trainable parameters of the module.
+        """
+        num_params = 0
+        for name, param in self.named_parameters():
+            if param.requires_grad:
+                num_params += np.prod(param.detach().cpu().numpy().shape)
+        return num_params
+
+    def relocate_input(self, x: list):
+        """
+        Relocates provided tensors to the same device set for the module.
+        """
+        device = next(self.parameters()).device
+        for i in range(len(x)):
+            if isinstance(x[i], torch.Tensor) and x[i].device != device:
+                x[i] = x[i].to(device)
+        return x
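`nparams` counts trainable parameters and `relocate_input` moves a list of tensors onto the module's device. A minimal usage sketch, assuming this upload's import path (the `Tiny` module is hypothetical, defined only for the check):

```python
import torch
from grad.base import BaseModule

class Tiny(BaseModule):
    def __init__(self):
        super(Tiny, self).__init__()
        self.lin = torch.nn.Linear(4, 2)

m = Tiny()
assert m.nparams == 4 * 2 + 2            # weight + bias elements
x, y = m.relocate_input([torch.zeros(1, 4), torch.zeros(1, 2)])
assert x.device == next(m.parameters()).device
```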
grad/diffusion.py
ADDED
@@ -0,0 +1,253 @@
+import math
+import torch
+from einops import rearrange
+from grad.base import BaseModule
+from grad.solver import NoiseScheduleVP, MaxLikelihood, GradRaw
+
+
+class Mish(BaseModule):
+    def forward(self, x):
+        return x * torch.tanh(torch.nn.functional.softplus(x))
+
+
+class Upsample(BaseModule):
+    def __init__(self, dim):
+        super(Upsample, self).__init__()
+        self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class Downsample(BaseModule):
+    def __init__(self, dim):
+        super(Downsample, self).__init__()
+        self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class Rezero(BaseModule):
+    def __init__(self, fn):
+        super(Rezero, self).__init__()
+        self.fn = fn
+        self.g = torch.nn.Parameter(torch.zeros(1))
+
+    def forward(self, x):
+        return self.fn(x) * self.g
+
+
+class Block(BaseModule):
+    def __init__(self, dim, dim_out, groups=8):
+        super(Block, self).__init__()
+        self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim_out, 3,
+                                         padding=1), torch.nn.GroupNorm(
+                                         groups, dim_out), Mish())
+
+    def forward(self, x, mask):
+        output = self.block(x * mask)
+        return output * mask
+
+
+class ResnetBlock(BaseModule):
+    def __init__(self, dim, dim_out, time_emb_dim, groups=8):
+        super(ResnetBlock, self).__init__()
+        self.mlp = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim,
+                                                               dim_out))
+
+        self.block1 = Block(dim, dim_out, groups=groups)
+        self.block2 = Block(dim_out, dim_out, groups=groups)
+        if dim != dim_out:
+            self.res_conv = torch.nn.Conv2d(dim, dim_out, 1)
+        else:
+            self.res_conv = torch.nn.Identity()
+
+    def forward(self, x, mask, time_emb):
+        h = self.block1(x, mask)
+        h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1)
+        h = self.block2(h, mask)
+        output = h + self.res_conv(x * mask)
+        return output
+
+
+class LinearAttention(BaseModule):
+    def __init__(self, dim, heads=4, dim_head=32):
+        super(LinearAttention, self).__init__()
+        self.heads = heads
+        hidden_dim = dim_head * heads
+        self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
+        self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1)
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        qkv = self.to_qkv(x)
+        q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)',
+                            heads=self.heads, qkv=3)
+        k = k.softmax(dim=-1)
+        context = torch.einsum('bhdn,bhen->bhde', k, v)
+        out = torch.einsum('bhde,bhdn->bhen', context, q)
+        out = rearrange(out, 'b heads c (h w) -> b (heads c) h w',
+                        heads=self.heads, h=h, w=w)
+        return self.to_out(out)
+
+
+class Residual(BaseModule):
+    def __init__(self, fn):
+        super(Residual, self).__init__()
+        self.fn = fn
+
+    def forward(self, x, *args, **kwargs):
+        output = self.fn(x, *args, **kwargs) + x
+        return output
+
+
+class SinusoidalPosEmb(BaseModule):
+    def __init__(self, dim):
+        super(SinusoidalPosEmb, self).__init__()
+        self.dim = dim
+
+    def forward(self, x, scale=1000):
+        device = x.device
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
+        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
+
+
+class GradLogPEstimator2d(BaseModule):
+    def __init__(self, dim, dim_mults=(1, 2, 4), emb_dim=64, n_mels=100,
+                 groups=8, pe_scale=1000):
+        super(GradLogPEstimator2d, self).__init__()
+        self.dim = dim
+        self.dim_mults = dim_mults
+        self.emb_dim = emb_dim
+        self.groups = groups
+        self.pe_scale = pe_scale
+
+        self.spk_mlp = torch.nn.Sequential(torch.nn.Linear(emb_dim, emb_dim * 4), Mish(),
+                                           torch.nn.Linear(emb_dim * 4, n_mels))
+        self.time_pos_emb = SinusoidalPosEmb(dim)
+        self.mlp = torch.nn.Sequential(torch.nn.Linear(dim, dim * 4), Mish(),
+                                       torch.nn.Linear(dim * 4, dim))
+
+        dims = [2 + 1, *map(lambda m: dim * m, dim_mults)]
+        in_out = list(zip(dims[:-1], dims[1:]))
+        self.downs = torch.nn.ModuleList([])
+        self.ups = torch.nn.ModuleList([])
+        num_resolutions = len(in_out)
+
+        for ind, (dim_in, dim_out) in enumerate(in_out):  # 2 downs
+            is_last = ind >= (num_resolutions - 1)
+            self.downs.append(torch.nn.ModuleList([
+                ResnetBlock(dim_in, dim_out, time_emb_dim=dim),
+                ResnetBlock(dim_out, dim_out, time_emb_dim=dim),
+                Residual(Rezero(LinearAttention(dim_out))),
+                Downsample(dim_out) if not is_last else torch.nn.Identity()]))
+
+        mid_dim = dims[-1]
+        self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim)
+        self.mid_attn = Residual(Rezero(LinearAttention(mid_dim)))
+        self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim)
+
+        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):  # 2 ups
+            self.ups.append(torch.nn.ModuleList([
+                ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim),
+                ResnetBlock(dim_in, dim_in, time_emb_dim=dim),
+                Residual(Rezero(LinearAttention(dim_in))),
+                Upsample(dim_in)]))
+        self.final_block = Block(dim, dim)
+        self.final_conv = torch.nn.Conv2d(dim, 1, 1)
+
+    def forward(self, spk, x, mask, mu, t):
+        s = self.spk_mlp(spk)
+
+        t = self.time_pos_emb(t, scale=self.pe_scale)
+        t = self.mlp(t)
+
+        s = s.unsqueeze(-1).repeat(1, 1, x.shape[-1])
+        x = torch.stack([mu, x, s], 1)
+        mask = mask.unsqueeze(1)
+
+        hiddens = []
+        masks = [mask]
+        for resnet1, resnet2, attn, downsample in self.downs:
+            mask_down = masks[-1]
+            x = resnet1(x, mask_down, t)
+            x = resnet2(x, mask_down, t)
+            x = attn(x)
+            hiddens.append(x)
+            x = downsample(x * mask_down)
+            masks.append(mask_down[:, :, :, ::2])
+
+        masks = masks[:-1]
+        mask_mid = masks[-1]
+        x = self.mid_block1(x, mask_mid, t)
+        x = self.mid_attn(x)
+        x = self.mid_block2(x, mask_mid, t)
+
+        for resnet1, resnet2, attn, upsample in self.ups:
+            mask_up = masks.pop()
+            x = torch.cat((x, hiddens.pop()), dim=1)
+            x = resnet1(x, mask_up, t)
+            x = resnet2(x, mask_up, t)
+            x = attn(x)
+            x = upsample(x * mask_up)
+
+        x = self.final_block(x, mask)
+        output = self.final_conv(x * mask)
+
+        return (output * mask).squeeze(1)
+
+
+def get_noise(t, beta_init, beta_term, cumulative=False):
+    if cumulative:
+        noise = beta_init*t + 0.5*(beta_term - beta_init)*(t**2)
+    else:
+        noise = beta_init + (beta_term - beta_init)*t
+    return noise
+
+
+class Diffusion(BaseModule):
+    def __init__(self, n_mels, dim, emb_dim=64,
+                 beta_min=0.05, beta_max=20, pe_scale=1000):
+        super(Diffusion, self).__init__()
+        self.n_mels = n_mels
+        self.beta_min = beta_min
+        self.beta_max = beta_max
+        # self.solver = NoiseScheduleVP()
+        self.solver = MaxLikelihood()
+        # self.solver = GradRaw()
+        self.estimator = GradLogPEstimator2d(dim,
+                                             n_mels=n_mels,
+                                             emb_dim=emb_dim,
+                                             pe_scale=pe_scale)
+
+    def forward_diffusion(self, mel, mask, mu, t):
+        time = t.unsqueeze(-1).unsqueeze(-1)
+        cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
+        mean = mel*torch.exp(-0.5*cum_noise) + mu*(1.0 - torch.exp(-0.5*cum_noise))
+        variance = 1.0 - torch.exp(-cum_noise)
+        z = torch.randn(mel.shape, dtype=mel.dtype, device=mel.device,
+                        requires_grad=False)
+        xt = mean + z * torch.sqrt(variance)
+        return xt * mask, z * mask
+
+    def forward(self, spk, z, mask, mu, n_timesteps, stoc=False):
+        return self.solver.reverse_diffusion(self.estimator, spk, z, mask, mu, n_timesteps, stoc)
+
+    def loss_t(self, spk, mel, mask, mu, t):
+        xt, z = self.forward_diffusion(mel, mask, mu, t)
+        time = t.unsqueeze(-1).unsqueeze(-1)
+        cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
+        noise_estimation = self.estimator(spk, xt, mask, mu, t)
+        noise_estimation *= torch.sqrt(1.0 - torch.exp(-cum_noise))
+        loss = torch.sum((noise_estimation + z)**2) / (torch.sum(mask)*self.n_mels)
+        return loss, xt
+
+    def compute_loss(self, spk, mel, mask, mu, offset=1e-5):
+        t = torch.rand(mel.shape[0], dtype=mel.dtype, device=mel.device, requires_grad=False)
+        t = torch.clamp(t, offset, 1.0 - offset)
+        return self.loss_t(spk, mel, mask, mu, t)
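Note (illustrative, not part of the upload): forward_diffusion above interpolates the clean mel toward the encoder prior mu with weight exp(-0.5 * cum_noise) and variance 1 - exp(-cum_noise). A tiny stdlib check of that closed form, assuming the beta_min=0.05 / beta_max=20.0 schedule from configs/base.yaml:

import math

beta_min, beta_max = 0.05, 20.0

def cum_noise(t):
    # integral of beta(s) ds on [0, t], matching get_noise(..., cumulative=True)
    return beta_min * t + 0.5 * (beta_max - beta_min) * t ** 2

for t in (0.01, 0.5, 1.0):
    mel_weight = math.exp(-0.5 * cum_noise(t))
    variance = 1.0 - math.exp(-cum_noise(t))
    print(f"t={t:.2f}  mel weight={mel_weight:.4f}  variance={variance:.4f}")

At t=0.01 the mel is almost intact; by t=1.0 the mel weight is about 0.007 and the variance is about 1, i.e. x_1 is nearly a unit-variance Gaussian centred on mu, which is exactly the distribution the reverse solvers start from.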
grad/encoder.py
ADDED
@@ -0,0 +1,327 @@
+import math
+import torch
+
+from grad.base import BaseModule
+from grad.reversal import SpeakerClassifier
+from grad.utils import sequence_mask, convert_pad_shape
+
+
+class LayerNorm(BaseModule):
+    def __init__(self, channels, eps=1e-4):
+        super(LayerNorm, self).__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = torch.nn.Parameter(torch.ones(channels))
+        self.beta = torch.nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        n_dims = len(x.shape)
+        mean = torch.mean(x, 1, keepdim=True)
+        variance = torch.mean((x - mean)**2, 1, keepdim=True)
+
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+        shape = [1, -1] + [1] * (n_dims - 2)
+        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+        return x
+
+
+class ConvReluNorm(BaseModule):
+    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size,
+                 n_layers, p_dropout, eps=1e-5):
+        super(ConvReluNorm, self).__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        self.eps = eps
+
+        self.conv_layers = torch.nn.ModuleList()
+        self.conv_layers.append(torch.nn.Conv1d(in_channels, hidden_channels,
+                                                kernel_size, padding=kernel_size//2))
+        self.relu_drop = torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Dropout(p_dropout))
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(torch.nn.Conv1d(hidden_channels, hidden_channels,
+                                                    kernel_size, padding=kernel_size//2))
+        self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.instance_norm(x, x_mask)
+            x = self.relu_drop(x)
+        x = self.proj(x)
+        return x * x_mask
+
+    def instance_norm(self, x, mask, return_mean_std=False):
+        mean, std = self.calc_mean_std(x, mask)
+        x = (x - mean) / std
+        if return_mean_std:
+            return x, mean, std
+        else:
+            return x
+
+    def calc_mean_std(self, x, mask=None):
+        x = x * mask
+        B, C = x.shape[:2]
+        mn = x.view(B, C, -1).mean(-1)
+        sd = (x.view(B, C, -1).var(-1) + self.eps).sqrt()
+        mn = mn.view(B, C, *((len(x.shape) - 2) * [1]))
+        sd = sd.view(B, C, *((len(x.shape) - 2) * [1]))
+        return mn, sd
+
+
+class MultiHeadAttention(BaseModule):
+    def __init__(self, channels, out_channels, n_heads, window_size=None,
+                 heads_share=True, p_dropout=0.0, proximal_bias=False,
+                 proximal_init=False):
+        super(MultiHeadAttention, self).__init__()
+        assert channels % n_heads == 0
+
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.proximal_bias = proximal_bias
+        self.p_dropout = p_dropout
+        self.attn = None
+
+        self.k_channels = channels // n_heads
+        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
+        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
+        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
+        if window_size is not None:
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels**-0.5
+            self.emb_rel_k = torch.nn.Parameter(torch.randn(n_heads_rel,
+                window_size * 2 + 1, self.k_channels) * rel_stddev)
+            self.emb_rel_v = torch.nn.Parameter(torch.randn(n_heads_rel,
+                window_size * 2 + 1, self.k_channels) * rel_stddev)
+        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
+        self.drop = torch.nn.Dropout(p_dropout)
+
+        torch.nn.init.xavier_uniform_(self.conv_q.weight)
+        torch.nn.init.xavier_uniform_(self.conv_k.weight)
+        if proximal_init:
+            self.conv_k.weight.data.copy_(self.conv_q.weight.data)
+            self.conv_k.bias.data.copy_(self.conv_q.bias.data)
+        torch.nn.init.xavier_uniform_(self.conv_v.weight)
+
+    def forward(self, x, c, attn_mask=None):
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+        x = self.conv_o(x)
+        return x
+
+    def attention(self, query, key, value, mask=None):
+        b, d, t_s, t_t = (*key.size(), query.size(2))
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
+        if self.window_size is not None:
+            assert t_s == t_t, "Relative attention is only available for self-attention."
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings)
+            rel_logits = self._relative_position_to_absolute_position(rel_logits)
+            scores_local = rel_logits / math.sqrt(self.k_channels)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias is only available for self-attention."
+            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device,
+                                                                    dtype=scores.dtype)
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, -1e4)
+        p_attn = torch.nn.functional.softmax(scores, dim=-1)
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
+            output = output + self._matmul_with_relative_values(relative_weights,
+                                                                value_relative_embeddings)
+        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
+        return output, p_attn
+
+    def _matmul_with_relative_values(self, x, y):
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+
+    def _matmul_with_relative_keys(self, x, y):
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = torch.nn.functional.pad(
+                relative_embeddings, convert_pad_shape([[0, 0],
+                [pad_length, pad_length], [0, 0]]))
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[:,
+            slice_start_position:slice_end_position]
+        return used_relative_embeddings
+
+    def _relative_position_to_absolute_position(self, x):
+        batch, heads, length, _ = x.size()
+        x = torch.nn.functional.pad(x, convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
+        x_flat = x.view([batch, heads, length * 2 * length])
+        x_flat = torch.nn.functional.pad(x_flat, convert_pad_shape([[0,0],[0,0],[0,length-1]]))
+        x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        batch, heads, length, _ = x.size()
+        x = torch.nn.functional.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
+        x_flat = x.view([batch, heads, length**2 + length*(length - 1)])
+        x_flat = torch.nn.functional.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length):
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class FFN(BaseModule):
+    def __init__(self, in_channels, out_channels, filter_channels, kernel_size,
+                 p_dropout=0.0):
+        super(FFN, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+
+        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size,
+                                      padding=kernel_size//2)
+        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size,
+                                      padding=kernel_size//2)
+        self.drop = torch.nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask):
+        x = self.conv_1(x * x_mask)
+        x = torch.relu(x)
+        x = self.drop(x)
+        x = self.conv_2(x * x_mask)
+        return x * x_mask
+
+
+class Encoder(BaseModule):
+    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers,
+                 kernel_size=1, p_dropout=0.0, window_size=None, **kwargs):
+        super(Encoder, self).__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+
+        self.drop = torch.nn.Dropout(p_dropout)
+        self.attn_layers = torch.nn.ModuleList()
+        self.norm_layers_1 = torch.nn.ModuleList()
+        self.ffn_layers = torch.nn.ModuleList()
+        self.norm_layers_2 = torch.nn.ModuleList()
+        for _ in range(self.n_layers):
+            self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels,
+                n_heads, window_size=window_size, p_dropout=p_dropout))
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(FFN(hidden_channels, hidden_channels,
+                filter_channels, kernel_size, p_dropout=p_dropout))
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask):
+        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        for i in range(self.n_layers):
+            x = x * x_mask
+            y = self.attn_layers[i](x, x, attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x
+
+
+class TextEncoder(BaseModule):
+    def __init__(self, n_vecs, n_mels, n_embs,
+                 n_channels,
+                 filter_channels,
+                 n_heads=2,
+                 n_layers=6,
+                 kernel_size=3,
+                 p_dropout=0.1,
+                 window_size=4):
+        super(TextEncoder, self).__init__()
+        self.n_vecs = n_vecs
+        self.n_mels = n_mels
+        self.n_embs = n_embs
+        self.n_channels = n_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+
+        self.prenet = ConvReluNorm(n_vecs,
+                                   n_channels,
+                                   n_channels,
+                                   kernel_size=5,
+                                   n_layers=5,
+                                   p_dropout=0.5)
+
+        self.speaker = SpeakerClassifier(
+            n_channels,
+            256,  # n_spks: 256
+        )
+
+        self.encoder = Encoder(n_channels + n_embs + n_embs,
+                               filter_channels,
+                               n_heads,
+                               n_layers,
+                               kernel_size,
+                               p_dropout,
+                               window_size=window_size)
+
+        self.proj_m = torch.nn.Conv1d(n_channels + n_embs + n_embs, n_mels, 1)
+
+    def forward(self, x_lengths, x, pit, spk, training=False):
+        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+        # IN
+        x = self.prenet(x, x_mask)
+        if training:
+            r = self.speaker(x)
+        else:
+            r = None
+        # pitch + speaker
+        spk = spk.unsqueeze(-1).repeat(1, 1, x.shape[-1])
+        x = torch.cat([x, pit], dim=1)
+        x = torch.cat([x, spk], dim=1)
+        x = self.encoder(x, x_mask)
+        mu = self.proj_m(x) * x_mask
+        return mu, x_mask, r
+
+    def fine_tune(self):
+        for p in self.prenet.parameters():
+            p.requires_grad = False
+        for p in self.speaker.parameters():
+            p.requires_grad = False
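Note (illustrative, not part of the upload): a shape walk-through of TextEncoder.forward. The hyper-parameter values below (n_vecs=256, n_embs=64, n_channels=192, filter_channels=768, n_mels=100) and the batch/length values are assumptions for the sketch, not the repo's config:

import torch
from grad.encoder import TextEncoder

enc = TextEncoder(n_vecs=256, n_mels=100, n_embs=64,
                  n_channels=192, filter_channels=768)
x = torch.randn(2, 256, 400)       # content vectors [B, n_vecs, T]
pit = torch.randn(2, 64, 400)      # pitch embedding, already [B, n_embs, T]
spk = torch.randn(2, 64)           # speaker embedding [B, n_embs]
lengths = torch.LongTensor([400, 360])

mu, mask, r = enc(lengths, x, pit, spk)   # r is None when training=False
print(mu.shape, mask.shape)               # [2, 100, 400], [2, 1, 400]

The prenet maps n_vecs to n_channels, then pitch and speaker embeddings are concatenated on the channel axis, so the transformer runs on n_channels + 2*n_embs = 320 channels before proj_m projects down to the mel dimension.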
grad/model.py
ADDED
@@ -0,0 +1,148 @@
+import math
+import torch
+
+from grad.ssim import SSIM
+from grad.base import BaseModule
+from grad.encoder import TextEncoder
+from grad.diffusion import Diffusion
+from grad.utils import f0_to_coarse, rand_ids_segments, slice_segments
+
+SpeakerLoss = torch.nn.CosineEmbeddingLoss()
+SsimLoss = SSIM()
+
+class GradTTS(BaseModule):
+    def __init__(self, n_mels, n_vecs, n_pits, n_spks, n_embs,
+                 n_enc_channels, filter_channels,
+                 dec_dim, beta_min, beta_max, pe_scale):
+        super(GradTTS, self).__init__()
+        # common
+        self.n_mels = n_mels
+        self.n_vecs = n_vecs
+        self.n_spks = n_spks
+        self.n_embs = n_embs
+        # encoder
+        self.n_enc_channels = n_enc_channels
+        self.filter_channels = filter_channels
+        # decoder
+        self.dec_dim = dec_dim
+        self.beta_min = beta_min
+        self.beta_max = beta_max
+        self.pe_scale = pe_scale
+
+        self.pit_emb = torch.nn.Embedding(n_pits, n_embs)
+        self.spk_emb = torch.nn.Linear(n_spks, n_embs)
+        self.encoder = TextEncoder(n_vecs,
+                                   n_mels,
+                                   n_embs,
+                                   n_enc_channels,
+                                   filter_channels)
+        self.decoder = Diffusion(n_mels, dec_dim, n_embs, beta_min, beta_max, pe_scale)
+
+    def fine_tune(self):
+        for p in self.pit_emb.parameters():
+            p.requires_grad = False
+        for p in self.spk_emb.parameters():
+            p.requires_grad = False
+        self.encoder.fine_tune()
+
+    @torch.no_grad()
+    def forward(self, lengths, vec, pit, spk, n_timesteps, temperature=1.0, stoc=False):
+        """
+        Generates mel-spectrogram from vec. Returns:
+            1. encoder outputs
+            2. decoder outputs
+
+        Args:
+            lengths (torch.Tensor): lengths of texts in batch.
+            vec (torch.Tensor): batch of speech vec
+            pit (torch.Tensor): batch of speech pit
+            spk (torch.Tensor): batch of speaker
+
+            n_timesteps (int): number of steps to use for reverse diffusion in decoder.
+            temperature (float, optional): controls variance of terminal distribution.
+            stoc (bool, optional): flag that adds stochastic term to the decoder sampler.
+                Usually, does not provide synthesis improvements.
+        """
+        lengths, vec, pit, spk = self.relocate_input([lengths, vec, pit, spk])
+
+        # Get pitch embedding
+        pit = self.pit_emb(f0_to_coarse(pit))
+
+        # Get speaker embedding
+        spk = self.spk_emb(spk)
+
+        # Transpose
+        vec = torch.transpose(vec, 1, -1)
+        pit = torch.transpose(pit, 1, -1)
+
+        # Get encoder_outputs `mu_x`
+        mu_x, mask_x, _ = self.encoder(lengths, vec, pit, spk)
+        encoder_outputs = mu_x
+
+        # Sample latent representation from terminal distribution N(mu_y, I)
+        z = mu_x + torch.randn_like(mu_x, device=mu_x.device) / temperature
+        # Generate sample by performing reverse dynamics
+        decoder_outputs = self.decoder(spk, z, mask_x, mu_x, n_timesteps, stoc)
+        encoder_outputs = encoder_outputs + torch.randn_like(encoder_outputs)
+        return encoder_outputs, decoder_outputs
+
+    def compute_loss(self, lengths, vec, pit, spk, mel, out_size, skip_diff=False):
+        """
+        Computes 2 losses:
+            1. prior loss: loss between mel-spectrogram and encoder outputs.
+            2. diffusion loss: loss between gaussian noise and its reconstruction by diffusion-based decoder.
+
+        Args:
+            lengths (torch.Tensor): lengths of texts in batch.
+            vec (torch.Tensor): batch of speech vec
+            pit (torch.Tensor): batch of speech pit
+            spk (torch.Tensor): batch of speaker
+            mel (torch.Tensor): batch of corresponding mel-spectrogram
+
+            out_size (int, optional): length (in mel's sampling rate) of segment to cut, on which decoder will be trained.
+                Should be divisible by 2^{num of UNet downsamplings}. Needed to increase batch size.
+        """
+        lengths, vec, pit, spk, mel = self.relocate_input([lengths, vec, pit, spk, mel])
+
+        # Get pitch embedding
+        pit = self.pit_emb(f0_to_coarse(pit))
+
+        # Get speaker embedding
+        spk_64 = self.spk_emb(spk)
+
+        # Transpose
+        vec = torch.transpose(vec, 1, -1)
+        pit = torch.transpose(pit, 1, -1)
+
+        # Get encoder_outputs `mu_x`
+        mu_x, mask_x, spk_preds = self.encoder(lengths, vec, pit, spk_64, training=True)
+
+        # Compute loss between aligned encoder outputs and mel-spectrogram
+        prior_loss = torch.sum(0.5 * ((mel - mu_x) ** 2 + math.log(2 * math.pi)) * mask_x)
+        prior_loss = prior_loss / (torch.sum(mask_x) * self.n_mels)
+
+        # Mel ssim
+        mel_loss = SsimLoss(mu_x, mel, mask_x)
+
+        # Compute loss of speaker for GRL
+        spk_loss = SpeakerLoss(spk, spk_preds, torch.Tensor(spk_preds.size(0))
+                               .to(spk.device).fill_(1.0))
+
+        # Compute loss of score-based decoder
+        if skip_diff:
+            diff_loss = prior_loss.clone()
+            diff_loss.fill_(0)
+        else:
+            # Cut a small segment of mel-spectrogram in order to increase batch size
+            if out_size is not None:
+                ids = rand_ids_segments(lengths, out_size)
+                mel = slice_segments(mel, ids, out_size)
+
+                mask_y = slice_segments(mask_x, ids, out_size)
+                mu_y = slice_segments(mu_x, ids, out_size)
+                mu_y = mu_y + torch.randn_like(mu_y)
+
+            diff_loss, xt = self.decoder.compute_loss(
+                spk_64, mel, mask_y, mu_y)
+
+        return prior_loss, diff_loss, mel_loss, spk_loss
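Note (illustrative, not part of the upload): a minimal inference sketch for GradTTS.forward. The constructor values below mirror the parameter names used in grad_extend/train.py, but the concrete numbers are assumptions for the sketch, not the repo's config file:

import torch
from grad.model import GradTTS

model = GradTTS(n_mels=100, n_vecs=256, n_pits=256, n_spks=256, n_embs=64,
                n_enc_channels=192, filter_channels=768,
                dec_dim=64, beta_min=0.05, beta_max=20.0, pe_scale=1000).eval()

vec = torch.randn(1, 400, 256)          # content vectors [B, T, n_vecs]
pit = torch.rand(1, 400) * 300 + 100    # F0 in Hz, quantized by f0_to_coarse
spk = torch.randn(1, 256)               # speaker embedding [B, n_spks]
lengths = torch.LongTensor([400])       # T must be divisible by 4 (2 U-Net downsamples)

y_enc, y_dec = model(lengths, vec, pit, spk, n_timesteps=10)
print(y_dec.shape)                      # [1, 100, 400] mel, ready for the vocoder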
grad/reversal.py
ADDED
@@ -0,0 +1,62 @@
+# Adapted from https://github.com/ubisoft/ubisoft-laforge-daft-exprt Apache License Version 2.0
+# Unsupervised Domain Adaptation by Backpropagation
+
+import torch
+import torch.nn as nn
+
+from torch.autograd import Function
+from torch.nn.utils import weight_norm
+
+
+class GradientReversalFunction(Function):
+    @staticmethod
+    def forward(ctx, x, lambda_):
+        ctx.lambda_ = lambda_
+        return x.clone()
+
+    @staticmethod
+    def backward(ctx, grads):
+        lambda_ = ctx.lambda_
+        lambda_ = grads.new_tensor(lambda_)
+        dx = -lambda_ * grads
+        return dx, None
+
+
+class GradientReversal(torch.nn.Module):
+    ''' Gradient Reversal Layer
+    Y. Ganin, V. Lempitsky,
+    "Unsupervised Domain Adaptation by Backpropagation",
+    in ICML, 2015.
+    Forward pass is the identity function.
+    In the backward pass, upstream gradients are multiplied by -lambda (i.e. gradients are reversed)
+    '''
+
+    def __init__(self, lambda_reversal=1):
+        super(GradientReversal, self).__init__()
+        self.lambda_ = lambda_reversal
+
+    def forward(self, x):
+        return GradientReversalFunction.apply(x, self.lambda_)
+
+
+class SpeakerClassifier(nn.Module):
+
+    def __init__(self, idim, odim):
+        super(SpeakerClassifier, self).__init__()
+        self.classifier = nn.Sequential(
+            GradientReversal(lambda_reversal=1),
+            weight_norm(nn.Conv1d(idim, 1024, kernel_size=5, padding=2)),
+            nn.ReLU(),
+            weight_norm(nn.Conv1d(1024, 1024, kernel_size=5, padding=2)),
+            nn.ReLU(),
+            weight_norm(nn.Conv1d(1024, odim, kernel_size=5, padding=2))
+        )
+
+    def forward(self, x):
+        ''' Forward function of Speaker Classifier:
+            x = (B, idim, len)
+        '''
+        # pass through classifier
+        outputs = self.classifier(x)  # (B, nb_speakers)
+        outputs = torch.mean(outputs, dim=-1)
+        return outputs
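Note (illustrative, not part of the upload): a quick check that GradientReversal behaves as documented, i.e. identity on the forward pass and gradient negation on the backward pass:

import torch
from grad.reversal import GradientReversal

grl = GradientReversal(lambda_reversal=1)
x = torch.ones(3, requires_grad=True)
grl(x).sum().backward()
print(x.grad)   # tensor([-1., -1., -1.]) instead of the usual all-ones

This sign flip is what turns the speaker classifier into an adversary: the classifier learns to predict the speaker, while the encoder feeding it is pushed to remove speaker information.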
grad/solver.py
ADDED
@@ -0,0 +1,190 @@
+import torch
+
+
+class NoiseScheduleVP:
+
+    def __init__(self, beta_min=0.05, beta_max=20):
+        self.beta_min = beta_min
+        self.beta_max = beta_max
+        self.T = 1.
+
+    def get_noise(self, t, beta_init, beta_term, cumulative=False):
+        if cumulative:
+            noise = beta_init*t + 0.5*(beta_term - beta_init)*(t**2)
+        else:
+            noise = beta_init + (beta_term - beta_init)*t
+        return noise
+
+    def marginal_log_mean_coeff(self, t):
+        return -0.25 * t**2 * (self.beta_max -
+                               self.beta_min) - 0.5 * t * self.beta_min
+
+    def marginal_std(self, t):
+        return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
+
+    def marginal_lambda(self, t):
+        log_mean_coeff = self.marginal_log_mean_coeff(t)
+        log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
+        return log_mean_coeff - log_std
+
+    def inverse_lambda(self, lamb):
+        tmp = 2. * (self.beta_max - self.beta_min) * torch.logaddexp(
+            -2. * lamb,
+            torch.zeros((1, )).to(lamb))
+        Delta = self.beta_min**2 + tmp
+        return tmp / (torch.sqrt(Delta) + self.beta_min) / (self.beta_max -
+                                                            self.beta_min)
+
+    def get_time_steps(self, t_T, t_0, N):
+        lambda_T = self.marginal_lambda(torch.tensor(t_T))
+        lambda_0 = self.marginal_lambda(torch.tensor(t_0))
+        logSNR_steps = torch.linspace(lambda_T, lambda_0, N + 1)
+        return self.inverse_lambda(logSNR_steps)
+
+    @torch.no_grad()
+    def reverse_diffusion(self, estimator, spk, z, mask, mu, n_timesteps, stoc):
+        print("use dpm-solver reverse")
+        xt = z * mask
+        yt = xt - mu
+        T = 1
+        eps = 1e-3
+        time = self.get_time_steps(T, eps, n_timesteps)
+        for i in range(n_timesteps):
+            s = torch.ones((xt.shape[0], )).to(xt.device) * time[i]
+            t = torch.ones((xt.shape[0], )).to(xt.device) * time[i + 1]
+
+            lambda_s = self.marginal_lambda(s)
+            lambda_t = self.marginal_lambda(t)
+            h = lambda_t - lambda_s
+
+            log_alpha_s = self.marginal_log_mean_coeff(s)
+            log_alpha_t = self.marginal_log_mean_coeff(t)
+
+            sigma_t = self.marginal_std(t)
+            phi_1 = torch.expm1(h)
+
+            noise_s = estimator(spk, yt + mu, mask, mu, s)
+            lt = 1 - torch.exp(-self.get_noise(s, self.beta_min, self.beta_max, cumulative=True))
+            a = torch.exp(log_alpha_t - log_alpha_s)
+            b = sigma_t * phi_1 * torch.sqrt(lt)
+            yt = a * yt + (b * noise_s)
+            xt = yt + mu
+        return xt
+
+
+class MaxLikelihood:
+
+    def __init__(self, beta_min=0.05, beta_max=20):
+        self.beta_min = beta_min
+        self.beta_max = beta_max
+
+    def get_noise(self, t, beta_init, beta_term, cumulative=False):
+        if cumulative:
+            noise = beta_init*t + 0.5*(beta_term - beta_init)*(t**2)
+        else:
+            noise = beta_init + (beta_term - beta_init)*t
+        return noise
+
+    def get_gamma(self, s, t, beta_init, beta_term):
+        gamma = beta_init*(t-s) + 0.5*(beta_term-beta_init)*(t**2-s**2)
+        gamma = torch.exp(-0.5*gamma)
+        return gamma
+
+    def get_mu(self, s, t):
+        gamma_0_s = self.get_gamma(0, s, self.beta_min, self.beta_max)
+        gamma_0_t = self.get_gamma(0, t, self.beta_min, self.beta_max)
+        gamma_s_t = self.get_gamma(s, t, self.beta_min, self.beta_max)
+        mu = gamma_s_t * ((1-gamma_0_s**2) / (1-gamma_0_t**2))
+        return mu
+
+    def get_nu(self, s, t):
+        gamma_0_s = self.get_gamma(0, s, self.beta_min, self.beta_max)
+        gamma_0_t = self.get_gamma(0, t, self.beta_min, self.beta_max)
+        gamma_s_t = self.get_gamma(s, t, self.beta_min, self.beta_max)
+        nu = gamma_0_s * ((1-gamma_s_t**2) / (1-gamma_0_t**2))
+        return nu
+
+    def get_sigma(self, s, t):
+        gamma_0_s = self.get_gamma(0, s, self.beta_min, self.beta_max)
+        gamma_0_t = self.get_gamma(0, t, self.beta_min, self.beta_max)
+        gamma_s_t = self.get_gamma(s, t, self.beta_min, self.beta_max)
+        sigma = torch.sqrt(((1 - gamma_0_s**2) * (1 - gamma_s_t**2)) / (1 - gamma_0_t**2))
+        return sigma
+
+    def get_kappa(self, t, h, noise):
+        nu = self.get_nu(t-h, t)
+        gamma_0_t = self.get_gamma(0, t, self.beta_min, self.beta_max)
+        kappa = (nu*(1-gamma_0_t**2)/(gamma_0_t*noise*h) - 1)
+        return kappa
+
+    def get_omega(self, t, h, noise):
+        mu = self.get_mu(t-h, t)
+        kappa = self.get_kappa(t, h, noise)
+        gamma_0_t = self.get_gamma(0, t, self.beta_min, self.beta_max)
+        omega = (mu-1)/(noise*h) + (1+kappa)/(1-gamma_0_t**2) - 0.5
+        return omega
+
+    @torch.no_grad()
+    def reverse_diffusion(self, estimator, spk, z, mask, mu, n_timesteps, stoc=False):
+        print("use MaxLikelihood reverse")
+        h = 1.0 / n_timesteps
+        xt = z * mask
+        for i in range(n_timesteps):
+            t = (1.0 - i*h) * torch.ones(z.shape[0], dtype=z.dtype,
+                                         device=z.device)
+            time = t.unsqueeze(-1).unsqueeze(-1)
+            noise_t = self.get_noise(time, self.beta_min, self.beta_max,
+                                     cumulative=False)
+
+            kappa_t_h = self.get_kappa(t, h, noise_t)
+            omega_t_h = self.get_omega(t, h, noise_t)
+            sigma_t_h = self.get_sigma(t-h, t)
+
+            es = estimator(spk, xt, mask, mu, t)
+
+            dxt = ((0.5+omega_t_h)*(xt - mu) + (1+kappa_t_h) * es)
+            dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
+                                   requires_grad=False)
+            dxt_stoc = dxt_stoc * sigma_t_h
+
+            dxt = dxt * noise_t * h + dxt_stoc
+            xt = (xt + dxt) * mask
+        return xt
+
+
+class GradRaw:
+
+    def __init__(self, beta_min=0.05, beta_max=20):
+        self.beta_min = beta_min
+        self.beta_max = beta_max
+
+    def get_noise(self, t, beta_init, beta_term, cumulative=False):
+        if cumulative:
+            noise = beta_init*t + 0.5*(beta_term - beta_init)*(t**2)
+        else:
+            noise = beta_init + (beta_term - beta_init)*t
+        return noise
+
+    @torch.no_grad()
+    def reverse_diffusion(self, estimator, spk, z, mask, mu, n_timesteps, stoc=False):
+        print("use grad-raw reverse")
+        h = 1.0 / n_timesteps
+        xt = z * mask
+        for i in range(n_timesteps):
+            t = (1.0 - (i + 0.5)*h) * \
+                torch.ones(z.shape[0], dtype=z.dtype, device=z.device)
+            time = t.unsqueeze(-1).unsqueeze(-1)
+            noise_t = self.get_noise(time, self.beta_min, self.beta_max,
+                                     cumulative=False)
+            if stoc:  # adds stochastic term
+                dxt_det = 0.5 * (mu - xt) - estimator(spk, xt, mask, mu, t)
+                dxt_det = dxt_det * noise_t * h
+                dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
+                                       requires_grad=False)
+                dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
+                dxt = dxt_det + dxt_stoc
+            else:
+                dxt = 0.5 * (mu - xt - estimator(spk, xt, mask, mu, t))
+                dxt = dxt * noise_t * h
+            xt = (xt - dxt) * mask
+        return xt
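Note (illustrative, not part of the upload): NoiseScheduleVP.get_time_steps places the DPM-solver steps uniformly in log-SNR, which only works if inverse_lambda really inverts marginal_lambda. A small numerical round-trip check, done in float64 to avoid precision loss near t=0:

import torch
from grad.solver import NoiseScheduleVP

ns = NoiseScheduleVP(beta_min=0.05, beta_max=20)
t = torch.linspace(1e-3, 1.0, 5, dtype=torch.float64)
t_back = ns.inverse_lambda(ns.marginal_lambda(t))
print(torch.allclose(t, t_back, atol=1e-6))   # True: the schedule is self-consistent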
grad/ssim.py
ADDED
@@ -0,0 +1,59 @@
+"""
+Adapted from https://github.com/Po-Hsun-Su/pytorch-ssim
+"""
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+from math import exp
+
+
+def gaussian(window_size, sigma):
+    gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
+    return gauss / gauss.sum()
+
+
+def create_window(window_size, channel):
+    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
+    _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
+    window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
+    return window
+
+
+def _ssim(img1, img2, window, window_size, channel, size_average=True):
+    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
+    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
+
+    mu1_sq = mu1.pow(2)
+    mu2_sq = mu2.pow(2)
+    mu1_mu2 = mu1 * mu2
+
+    sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
+    sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
+    sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2
+
+    C1 = 0.01 ** 2
+    C2 = 0.03 ** 2
+
+    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
+
+    if size_average:
+        return ssim_map.mean()
+    else:
+        return ssim_map.mean(1)
+
+
+class SSIM(torch.nn.Module):
+    def __init__(self, window_size=11, size_average=True):
+        super(SSIM, self).__init__()
+        self.window_size = window_size
+        self.size_average = size_average
+        self.channel = 1
+        self.window = create_window(window_size, self.channel)
+
+    def forward(self, fake, real, mask, bias=6.0):
+        fake = fake[:, None, :, :] + bias  # [B, 1, T, 80]
+        real = real[:, None, :, :] + bias  # [B, 1, T, 80]
+        self.window = self.window.to(dtype=fake.dtype, device=fake.device)
+        loss = 1 - _ssim(fake, real, self.window, self.window_size, self.channel, self.size_average)
+        loss = (loss * mask).sum() / mask.sum()
+        return loss
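Note (illustrative, not part of the upload): how the SSIM loss is invoked in grad/model.py, with tensors shaped like mu_x / mel there ([B, n_mels, T]) and a [B, 1, T] mask; the concrete sizes below are made up. Identical inputs should give a loss of (almost) exactly zero:

import torch
from grad.ssim import SSIM

ssim = SSIM()
mel = torch.rand(2, 100, 400)                  # stand-ins for mu_x and the target mel
mask = torch.ones(2, 1, 400)
print(ssim(mel, mel, mask))                    # ~0.0: perfect structural similarity
print(ssim(mel, torch.rand_like(mel), mask))   # noticeably larger for unrelated inputs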
grad/utils.py
ADDED
@@ -0,0 +1,99 @@
+import torch
+import numpy as np
+import inspect
+
+
+def sequence_mask(length, max_length=None):
+    if max_length is None:
+        max_length = length.max()
+    x = torch.arange(int(max_length), dtype=length.dtype, device=length.device)
+    return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def fix_len_compatibility(length, num_downsamplings_in_unet=2):
+    while True:
+        if length % (2**num_downsamplings_in_unet) == 0:
+            return length
+        length += 1
+
+
+def convert_pad_shape(pad_shape):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
+
+
+def generate_path(duration, mask):
+    device = duration.device
+
+    b, t_x, t_y = mask.shape
+    cum_duration = torch.cumsum(duration, 1)
+    path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)
+
+    cum_duration_flat = cum_duration.view(b * t_x)
+    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+    path = path.view(b, t_x, t_y)
+    path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0],
+                                          [1, 0], [0, 0]]))[:, :-1]
+    path = path * mask
+    return path
+
+
+def duration_loss(logw, logw_, lengths):
+    loss = torch.sum((logw - logw_)**2) / torch.sum(lengths)
+    return loss
+
+
+f0_bin = 256
+f0_max = 1100.0
+f0_min = 50.0
+f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+
+def f0_to_coarse(f0):
+    is_torch = isinstance(f0, torch.Tensor)
+    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * \
+        np.log(1 + f0 / 700)
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * \
+        (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
+
+    f0_mel[f0_mel <= 1] = 1
+    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
+    f0_coarse = (
+        f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)  # np.int was removed in NumPy >= 1.24
+    assert f0_coarse.max() <= 255 and f0_coarse.min(
+    ) >= 1, (f0_coarse.max(), f0_coarse.min())
+    return f0_coarse
+
+
+def rand_ids_segments(lengths, segment_size=200):
+    b = lengths.shape[0]
+    ids_str_max = lengths - segment_size
+    ids_str = (torch.rand([b]).to(device=lengths.device) * ids_str_max).to(dtype=torch.long)
+    return ids_str
+
+
+def slice_segments(x, ids_str, segment_size=200):
+    ret = torch.zeros_like(x[:, :, :segment_size])
+    for i in range(x.size(0)):
+        idx_str = ids_str[i]
+        idx_end = idx_str + segment_size
+        ret[i] = x[i, :, idx_str:idx_end]
+    return ret
+
+
+def retrieve_name(var):
+    for fi in reversed(inspect.stack()):
+        names = [var_name for var_name,
+                 var_val in fi.frame.f_locals.items() if var_val is var]
+        if len(names) > 0:
+            return names[0]
+
+
+Debug_Enable = True
+
+
+def debug_shapes(var):
+    if Debug_Enable:
+        print(retrieve_name(var), var.shape)
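Note (illustrative, not part of the upload): f0_to_coarse maps F0 in Hz to the mel scale and then into coarse bins 1..255 that index the pit_emb embedding table in grad/model.py. A small usage example:

import torch
from grad.utils import f0_to_coarse

f0 = torch.tensor([[0.0, 50.0, 220.0, 440.0, 1100.0]])  # 0 marks unvoiced frames
print(f0_to_coarse(f0))
# unvoiced and f0_min both map to bin 1; values rise monotonically to bin 255 at f0_max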
grad_extend/data.py
ADDED
@@ -0,0 +1,135 @@
+import os
+import random
+import numpy as np
+
+import torch
+
+from grad.utils import fix_len_compatibility
+from grad_extend.utils import parse_filelist
+
+
+class TextMelSpeakerDataset(torch.utils.data.Dataset):
+    def __init__(self, filelist_path):
+        super().__init__()
+        self.filelist = parse_filelist(filelist_path, split_char='|')
+        self._filter()
+        print(f'----------{len(self.filelist)}----------')
+
+    def _filter(self):
+        items_new = []
+        # segment = 200
+        items_min = 250  # 10ms * 250 = 2.5 S
+        items_max = 500  # 10ms * 500 = 5.0 S
+        for mel, vec, pit, spk in self.filelist:
+            if not os.path.isfile(mel):
+                continue
+            if not os.path.isfile(vec):
+                continue
+            if not os.path.isfile(pit):
+                continue
+            if not os.path.isfile(spk):
+                continue
+            temp = np.load(pit)
+            usel = int(temp.shape[0] - 1)  # useful length
+            if (usel < items_min):
+                continue
+            if (usel >= items_max):
+                usel = items_max
+            items_new.append([mel, vec, pit, spk, usel])
+        self.filelist = items_new
+
+    def get_triplet(self, item):
+        # print(item)
+        mel = item[0]
+        vec = item[1]
+        pit = item[2]
+        spk = item[3]
+        use = item[4]
+
+        mel = torch.load(mel)
+        vec = np.load(vec)
+        vec = np.repeat(vec, 2, 0)  # 320 VEC -> 160 * 2
+        pit = np.load(pit)
+        spk = np.load(spk)
+
+        vec = torch.FloatTensor(vec)
+        pit = torch.FloatTensor(pit)
+        spk = torch.FloatTensor(spk)
+
+        vec = vec + torch.randn_like(vec)  # Perturbation
+
+        len_vec = vec.size()[0] - 2  # for safety
+        len_pit = pit.size()[0]
+        len_min = min(len_pit, len_vec)
+
+        mel = mel[:, :len_min]
+        vec = vec[:len_min, :]
+        pit = pit[:len_min]
+
+        if len_min > use:
+            max_frame_start = vec.size(0) - use - 1
+            frame_start = random.randint(0, max_frame_start)
+            frame_end = frame_start + use
+
+            mel = mel[:, frame_start:frame_end]
+            vec = vec[frame_start:frame_end, :]
+            pit = pit[frame_start:frame_end]
+        # print(mel.shape)
+        # print(vec.shape)
+        # print(pit.shape)
+        # print(spk.shape)
+        return (mel, vec, pit, spk)
+
+    def __getitem__(self, index):
+        mel, vec, pit, spk = self.get_triplet(self.filelist[index])
+        item = {'mel': mel, 'vec': vec, 'pit': pit, 'spk': spk}
+        return item
+
+    def __len__(self):
+        return len(self.filelist)
+
+    def sample_test_batch(self, size):
+        idx = np.random.choice(range(len(self)), size=size, replace=False)
+        test_batch = []
+        for index in idx:
+            test_batch.append(self.__getitem__(index))
+        return test_batch
+
+
+class TextMelSpeakerBatchCollate(object):
+    # mel: [freq, length]
+    # vec: [len, 256]
+    # pit: [len]
+    # spk: [256]
+    def __call__(self, batch):
+        B = len(batch)
+        mel_max_length = max([item['mel'].shape[-1] for item in batch])
+        max_length = fix_len_compatibility(mel_max_length)
+
+        d_mel = batch[0]['mel'].shape[0]
+        d_vec = batch[0]['vec'].shape[1]
+        d_spk = batch[0]['spk'].shape[0]
+        # print("d_mel", d_mel)
+        # print("d_vec", d_vec)
+        # print("d_spk", d_spk)
+        mel = torch.zeros((B, d_mel, max_length), dtype=torch.float32)
+        vec = torch.zeros((B, max_length, d_vec), dtype=torch.float32)
+        pit = torch.zeros((B, max_length), dtype=torch.float32)
+        spk = torch.zeros((B, d_spk), dtype=torch.float32)
+        lengths = torch.LongTensor(B)
+
+        for i, item in enumerate(batch):
+            y_, x_, p_, s_ = item['mel'], item['vec'], item['pit'], item['spk']
+
+            mel[i, :, :y_.shape[1]] = y_
+            vec[i, :x_.shape[0], :] = x_
+            pit[i, :p_.shape[0]] = p_
+            spk[i] = s_
+
+            lengths[i] = y_.shape[1]
+        # print("lengths", lengths.shape)
+        # print("vec", vec.shape)
+        # print("pit", pit.shape)
+        # print("spk", spk.shape)
+        return {'lengths': lengths, 'vec': vec, 'pit': pit, 'spk': spk, 'mel': mel}
grad_extend/train.py
ADDED
@@ -0,0 +1,188 @@
import os
import torch
import numpy as np

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from tqdm import tqdm
from grad_extend.data import TextMelSpeakerDataset, TextMelSpeakerBatchCollate
from grad_extend.utils import plot_tensor, save_plot, load_model, print_error
from grad.utils import fix_len_compatibility
from grad.model import GradTTS


# 200 frames
out_size = fix_len_compatibility(200)


def train(hps, chkpt_path=None):

    print('Initializing logger...')
    logger = SummaryWriter(log_dir=hps.train.log_dir)

    print('Initializing data loaders...')
    train_dataset = TextMelSpeakerDataset(hps.train.train_files)
    batch_collate = TextMelSpeakerBatchCollate()
    loader = DataLoader(dataset=train_dataset,
                        batch_size=hps.train.batch_size,
                        collate_fn=batch_collate,
                        drop_last=True,
                        num_workers=8,
                        shuffle=True)
    test_dataset = TextMelSpeakerDataset(hps.train.valid_files)

    print('Initializing model...')
    model = GradTTS(hps.grad.n_mels, hps.grad.n_vecs, hps.grad.n_pits, hps.grad.n_spks, hps.grad.n_embs,
                    hps.grad.n_enc_channels, hps.grad.filter_channels,
                    hps.grad.dec_dim, hps.grad.beta_min, hps.grad.beta_max, hps.grad.pe_scale).cuda()
    print('Number of encoder parameters = %.2fm' % (model.encoder.nparams/1e6))
    print('Number of decoder parameters = %.2fm' % (model.decoder.nparams/1e6))

    # Load Pretrain
    if os.path.isfile(hps.train.pretrain):
        print("Start from Grad_SVC pretrain model: %s" % hps.train.pretrain)
        checkpoint = torch.load(hps.train.pretrain, map_location='cpu')
        load_model(model, checkpoint['model'])
        hps.train.learning_rate = 2e-5
        # fine_tune
        model.fine_tune()
    else:
        print_error(10 * '~' + "No Pretrain Model" + 10 * '~')

    print('Initializing optimizer...')
    optim = torch.optim.Adam(params=model.parameters(), lr=hps.train.learning_rate)

    initepoch = 1
    iteration = 0

    # Load Continue
    if chkpt_path is not None:
        print("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optim.load_state_dict(checkpoint['optim'])
        initepoch = checkpoint['epoch']
        iteration = checkpoint['steps']

    print('Logging test batch...')
    test_batch = test_dataset.sample_test_batch(size=hps.train.test_size)
    for i, item in enumerate(test_batch):
        mel = item['mel']
        logger.add_image(f'image_{i}/ground_truth', plot_tensor(mel.squeeze()),
                         global_step=0, dataformats='HWC')
        save_plot(mel.squeeze(), f'{hps.train.log_dir}/original_{i}.png')

    print('Start training...')
    skip_diff_train = True
    if initepoch >= hps.train.fast_epochs:
        skip_diff_train = False
    for epoch in range(initepoch, hps.train.full_epochs + 1):

        if epoch % hps.train.test_step == 0:
            model.eval()
            print('Synthesis...')

            with torch.no_grad():
                for i, item in enumerate(test_batch):
                    l_vec = item['vec'].shape[0]
                    d_vec = item['vec'].shape[1]

                    lengths_fix = fix_len_compatibility(l_vec)
                    lengths = torch.LongTensor([l_vec]).cuda()

                    vec = torch.zeros((1, lengths_fix, d_vec), dtype=torch.float32).cuda()
                    pit = torch.zeros((1, lengths_fix), dtype=torch.float32).cuda()
                    spk = item['spk'].to(torch.float32).unsqueeze(0).cuda()
                    vec[0, :l_vec, :] = item['vec']
                    pit[0, :l_vec] = item['pit']

                    y_enc, y_dec = model(lengths, vec, pit, spk, n_timesteps=50)

                    logger.add_image(f'image_{i}/generated_enc',
                                     plot_tensor(y_enc.squeeze().cpu()),
                                     global_step=iteration, dataformats='HWC')
                    logger.add_image(f'image_{i}/generated_dec',
                                     plot_tensor(y_dec.squeeze().cpu()),
                                     global_step=iteration, dataformats='HWC')
                    save_plot(y_enc.squeeze().cpu(),
                              f'{hps.train.log_dir}/generated_enc_{i}.png')
                    save_plot(y_dec.squeeze().cpu(),
                              f'{hps.train.log_dir}/generated_dec_{i}.png')

        model.train()

        prior_losses = []
        diff_losses = []
        mel_losses = []
        spk_losses = []
        with tqdm(loader, total=len(train_dataset)//hps.train.batch_size) as progress_bar:
            for batch in progress_bar:
                model.zero_grad()

                lengths = batch['lengths'].cuda()
                vec = batch['vec'].cuda()
                pit = batch['pit'].cuda()
                spk = batch['spk'].cuda()
                mel = batch['mel'].cuda()

                prior_loss, diff_loss, mel_loss, spk_loss = model.compute_loss(
                    lengths, vec, pit, spk,
                    mel, out_size=out_size,
                    skip_diff=skip_diff_train)
                loss = sum([prior_loss, diff_loss, mel_loss, spk_loss])
                loss.backward()

                enc_grad_norm = torch.nn.utils.clip_grad_norm_(model.encoder.parameters(),
                                                               max_norm=1)
                dec_grad_norm = torch.nn.utils.clip_grad_norm_(model.decoder.parameters(),
                                                               max_norm=1)
                optim.step()

                logger.add_scalar('training/mel_loss', mel_loss,
                                  global_step=iteration)
                logger.add_scalar('training/prior_loss', prior_loss,
                                  global_step=iteration)
                logger.add_scalar('training/diffusion_loss', diff_loss,
                                  global_step=iteration)
                logger.add_scalar('training/encoder_grad_norm', enc_grad_norm,
                                  global_step=iteration)
                logger.add_scalar('training/decoder_grad_norm', dec_grad_norm,
                                  global_step=iteration)

                msg = f'Epoch: {epoch}, iteration: {iteration} | '
                msg = msg + f'prior_loss: {prior_loss.item():.3f}, '
                msg = msg + f'diff_loss: {diff_loss.item():.3f}, '
                msg = msg + f'mel_loss: {mel_loss.item():.3f}, '
                msg = msg + f'spk_loss: {spk_loss.item():.3f}, '
                progress_bar.set_description(msg)

                prior_losses.append(prior_loss.item())
                diff_losses.append(diff_loss.item())
                mel_losses.append(mel_loss.item())
                spk_losses.append(spk_loss.item())
                iteration += 1

        msg = 'Epoch %d: ' % (epoch)
        msg += '| spk loss = %.3f ' % np.mean(spk_losses)
        msg += '| mel loss = %.3f ' % np.mean(mel_losses)
        msg += '| prior loss = %.3f ' % np.mean(prior_losses)
        msg += '| diffusion loss = %.3f\n' % np.mean(diff_losses)
        with open(f'{hps.train.log_dir}/train.log', 'a') as f:
            f.write(msg)
        # if (np.mean(prior_losses) < 1.05):
        #     skip_diff_train = False
        if epoch > hps.train.fast_epochs:
            skip_diff_train = False
        if epoch % hps.train.save_step > 0:
            continue

        save_path = f"{hps.train.log_dir}/grad_svc_{epoch}.pt"
        torch.save({
            'model': model.state_dict(),
            'optim': optim.state_dict(),
            'epoch': epoch,
            'steps': iteration,
        }, save_path)
        print("Saved checkpoint to: %s" % save_path)
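There is no driver script in this upload, so here is a hedged sketch of how `train()` could be invoked, assuming `configs/base.yaml` (loaded the same way in `spec/inference.py` below) defines the `hps.train.*` and `hps.grad.*` keys this function reads:

```python
# Hypothetical entry point, not a file in this commit.
from omegaconf import OmegaConf
from grad_extend.train import train

if __name__ == "__main__":
    hps = OmegaConf.load("./configs/base.yaml")
    train(hps)  # starts fresh, or fine-tunes when hps.train.pretrain exists
    # train(hps, chkpt_path="logs/grad_svc_100.pt")  # hypothetical resume path
```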
grad_extend/utils.py
ADDED
@@ -0,0 +1,77 @@
import os
import glob
import numpy as np
import matplotlib.pyplot as plt

import torch


def parse_filelist(filelist_path, split_char="|"):
    with open(filelist_path, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split_char) for line in f]
    return filepaths_and_text


def load_model(model, saved_state_dict):
    state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except KeyError:
            print("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    model.load_state_dict(new_state_dict)
    return model


def latest_checkpoint_path(dir_path, regex="grad_svc_*.pt"):
    f_list = glob.glob(os.path.join(dir_path, regex))
    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
    x = f_list[-1]
    return x


def load_checkpoint(logdir, model, num=None):
    if num is None:
        model_path = latest_checkpoint_path(logdir, regex="grad_svc_*.pt")
    else:
        model_path = os.path.join(logdir, f"grad_svc_{num}.pt")
    print(f'Loading checkpoint {model_path}...')
    model_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(model_dict, strict=False)
    return model


def save_figure_to_numpy(fig):
    # np.fromstring is deprecated for binary data; frombuffer reads the rendered
    # RGB canvas directly
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    return data


def plot_tensor(tensor):
    plt.style.use('default')
    fig, ax = plt.subplots(figsize=(12, 3))
    im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.tight_layout()
    fig.canvas.draw()
    data = save_figure_to_numpy(fig)
    plt.close()
    return data


def save_plot(tensor, savepath):
    plt.style.use('default')
    fig, ax = plt.subplots(figsize=(12, 3))
    im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.tight_layout()
    fig.canvas.draw()
    plt.savefig(savepath)
    plt.close()
    return


def print_error(info):
    print(f"\033[31m {info} \033[0m")
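Note that `latest_checkpoint_path` ranks files by the integer formed from all digits in the name, not lexicographically, which is what keeps `grad_svc_10.pt` ahead of `grad_svc_9.pt`. A quick check of that sort key:

```python
# Demonstrates the numeric sort key used by latest_checkpoint_path above.
names = ["grad_svc_9.pt", "grad_svc_10.pt", "grad_svc_2.pt"]
names.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
print(names[-1])  # grad_svc_10.pt
```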
grad_pretrain/README.md
ADDED
@@ -0,0 +1,3 @@
Path for:

gvc.pretrain.pth
hubert/__init__.py
ADDED
File without changes
hubert/hubert_model.py
ADDED
@@ -0,0 +1,229 @@
import copy
import random
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as t_func


class Hubert(nn.Module):
    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
        super().__init__()
        self._mask = mask
        self.feature_extractor = FeatureExtractor()
        self.feature_projection = FeatureProjection()
        self.positional_embedding = PositionalConvEmbedding()
        self.norm = nn.LayerNorm(768)
        self.dropout = nn.Dropout(0.1)
        self.encoder = TransformerEncoder(
            nn.TransformerEncoderLayer(
                768, 12, 3072, activation="gelu", batch_first=True
            ),
            12,
        )
        self.proj = nn.Linear(768, 256)

        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
        self.label_embedding = nn.Embedding(num_label_embeddings, 256)

    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        mask = None
        if self.training and self._mask:
            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
            x[mask] = self.masked_spec_embed.to(x.dtype)
        return x, mask

    def encode(
        self, x: torch.Tensor, layer: Optional[int] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        x = self.feature_extractor(x)
        x = self.feature_projection(x.transpose(1, 2))
        x, mask = self.mask(x)
        x = x + self.positional_embedding(x)
        x = self.dropout(self.norm(x))
        x = self.encoder(x, output_layer=layer)
        return x, mask

    def logits(self, x: torch.Tensor) -> torch.Tensor:
        logits = torch.cosine_similarity(
            x.unsqueeze(2),
            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
            dim=-1,
        )
        return logits / 0.1

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        x, mask = self.encode(x)
        x = self.proj(x)
        logits = self.logits(x)
        return logits, mask


class HubertSoft(Hubert):
    def __init__(self):
        super().__init__()

    @torch.inference_mode()
    def units(self, wav: torch.Tensor) -> torch.Tensor:
        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
        x, _ = self.encode(wav)
        return self.proj(x)


class FeatureExtractor(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
        self.norm0 = nn.GroupNorm(512, 512)
        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = t_func.gelu(self.norm0(self.conv0(x)))
        x = t_func.gelu(self.conv1(x))
        x = t_func.gelu(self.conv2(x))
        x = t_func.gelu(self.conv3(x))
        x = t_func.gelu(self.conv4(x))
        x = t_func.gelu(self.conv5(x))
        x = t_func.gelu(self.conv6(x))
        return x


class FeatureProjection(nn.Module):
    def __init__(self):
        super().__init__()
        self.norm = nn.LayerNorm(512)
        self.projection = nn.Linear(512, 768)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.norm(x)
        x = self.projection(x)
        x = self.dropout(x)
        return x


class PositionalConvEmbedding(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv1d(
            768,
            768,
            kernel_size=128,
            padding=128 // 2,
            groups=16,
        )
        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.conv(x.transpose(1, 2))
        x = t_func.gelu(x[:, :, :-1])
        return x.transpose(1, 2)


class TransformerEncoder(nn.Module):
    def __init__(
        self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
    ) -> None:
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
        )
        self.num_layers = num_layers

    def forward(
        self,
        src: torch.Tensor,
        mask: torch.Tensor = None,
        src_key_padding_mask: torch.Tensor = None,
        output_layer: Optional[int] = None,
    ) -> torch.Tensor:
        output = src
        for layer in self.layers[:output_layer]:
            output = layer(
                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
            )
        return output


def _compute_mask(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    device: torch.device,
    min_masks: int = 0,
) -> torch.Tensor:
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}"
        )

    # compute number of masked spans in batch
    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
    num_masked_spans = max(num_masked_spans, min_masks)

    # make sure num masked indices <= sequence_length
    if num_masked_spans * mask_length > sequence_length:
        num_masked_spans = sequence_length // mask_length

    # SpecAugment mask to fill
    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)

    # uniform distribution to sample from, make sure that offset samples are < sequence_length
    uniform_dist = torch.ones(
        (batch_size, sequence_length - (mask_length - 1)), device=device
    )

    # get random indices to mask
    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)

    # expand masked indices to masked spans
    mask_indices = (
        mask_indices.unsqueeze(dim=-1)
        .expand((batch_size, num_masked_spans, mask_length))
        .reshape(batch_size, num_masked_spans * mask_length)
    )
    offsets = (
        torch.arange(mask_length, device=device)[None, None, :]
        .expand((batch_size, num_masked_spans, mask_length))
        .reshape(batch_size, num_masked_spans * mask_length)
    )
    mask_idxs = mask_indices + offsets

    # scatter indices to mask
    mask = mask.scatter(1, mask_idxs, True)

    return mask


def consume_prefix(state_dict, prefix: str) -> None:
    keys = sorted(state_dict.keys())
    for key in keys:
        if key.startswith(prefix):
            newkey = key[len(prefix):]
            state_dict[newkey] = state_dict.pop(key)


def hubert_soft(
    path: str,
) -> HubertSoft:
    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
    Args:
        path (str): path of a pretrained model
    """
    hubert = HubertSoft()
    checkpoint = torch.load(path)
    consume_prefix(checkpoint, "module.")
    hubert.load_state_dict(checkpoint)
    hubert.eval()
    return hubert
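The seven convolution strides in `FeatureExtractor` (5, then six 2s) multiply to the 320-sample hop noted in `hubert/inference.py`, i.e. one 256-dim unit per 20 ms of 16 kHz audio. A one-liner confirming the arithmetic:

```python
# 5 * 2 * 2 * 2 * 2 * 2 * 2 = 320 samples per unit = 20 ms at 16 kHz
strides = [5, 2, 2, 2, 2, 2, 2]
hop = 1
for s in strides:
    hop *= s
print(hop, hop / 16000 * 1000)  # 320 20.0
```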
hubert/inference.py
ADDED
@@ -0,0 +1,67 @@
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import numpy as np
import argparse
import torch
import librosa

from hubert import hubert_model


def load_audio(file: str, sr: int = 16000):
    x, sr = librosa.load(file, sr=sr)
    return x


def load_model(path, device):
    model = hubert_model.hubert_soft(path)
    model.eval()
    if not (device == "cpu"):
        model.half()
    model.to(device)
    return model


def pred_vec(model, wavPath, vecPath, device):
    audio = load_audio(wavPath)
    audln = audio.shape[0]
    vec_a = []
    idx_s = 0
    while (idx_s + 20 * 16000 < audln):
        feats = audio[idx_s:idx_s + 20 * 16000]
        feats = torch.from_numpy(feats).to(device)
        feats = feats[None, None, :]
        if not (device == "cpu"):
            feats = feats.half()
        with torch.no_grad():
            vec = model.units(feats).squeeze().data.cpu().float().numpy()
            vec_a.extend(vec)
        idx_s = idx_s + 20 * 16000
    if (idx_s < audln):
        feats = audio[idx_s:audln]
        feats = torch.from_numpy(feats).to(device)
        feats = feats[None, None, :]
        if not (device == "cpu"):
            feats = feats.half()
        with torch.no_grad():
            vec = model.units(feats).squeeze().data.cpu().float().numpy()
            # print(vec.shape)   # [length, dim=256] hop=320
            vec_a.extend(vec)
    np.save(vecPath, vec_a, allow_pickle=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-w", "--wav", help="wav", dest="wav")
    parser.add_argument("-v", "--vec", help="vec", dest="vec")
    args = parser.parse_args()
    print(args.wav)
    print(args.vec)

    wavPath = args.wav
    vecPath = args.vec

    device = "cuda" if torch.cuda.is_available() else "cpu"
    hubert = load_model(os.path.join(
        "hubert_pretrain", "hubert-soft-0d54a1f4.pt"), device)
    pred_vec(hubert, wavPath, vecPath, device)
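Besides the CLI in `__main__`, the same extraction can be driven programmatically; a sketch with hypothetical input/output paths:

```python
# Sketch; "some_song.wav" / "some_song.vec.npy" are placeholder paths.
import torch
from hubert.inference import load_model, pred_vec

device = "cuda" if torch.cuda.is_available() else "cpu"
hubert = load_model("hubert_pretrain/hubert-soft-0d54a1f4.pt", device)
pred_vec(hubert, "some_song.wav", "some_song.vec.npy", device)
```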
hubert_pretrain/README.md
ADDED
@@ -0,0 +1,3 @@
Path for:

hubert-soft-0d54a1f4.pt
hubert_pretrain/hubert-soft-0d54a1f4.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e82e7d079df05fe3aa535f6f7d42d309bdae1d2a53324e2b2386c56721f4f649
size 378435957
pitch/__init__.py
ADDED
@@ -0,0 +1 @@
from .inference import load_csv_pitch
pitch/inference.py
ADDED
@@ -0,0 +1,86 @@
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import librosa
import argparse
import numpy as np
import parselmouth
# pip install praat-parselmouth


def compute_f0_mouth(path):
    x, sr = librosa.load(path, sr=16000)
    assert sr == 16000
    lpad = 1024 // 160
    rpad = lpad
    f0 = parselmouth.Sound(x, sr).to_pitch_ac(
        time_step=160 / sr,
        voicing_threshold=0.5,
        pitch_floor=30,
        pitch_ceiling=1000).selected_array['frequency']
    f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
    return f0


def compute_f0_crepe(filename):
    import torch
    import torchcrepe

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    audio, sr = librosa.load(filename, sr=16000)
    assert sr == 16000
    audio = torch.tensor(np.copy(audio))[None]
    audio = audio + torch.randn_like(audio) * 0.001
    # Here we'll use a 20 millisecond hop length
    hop_length = 320
    fmin = 50
    fmax = 1000
    model = "full"
    batch_size = 512
    pitch = torchcrepe.predict(
        audio,
        sr,
        hop_length,
        fmin,
        fmax,
        model,
        batch_size=batch_size,
        device=device,
        return_periodicity=False,
    )
    pitch = np.repeat(pitch, 2, -1)  # 320 -> 160 * 2
    pitch = torchcrepe.filter.mean(pitch, 5)
    pitch = pitch.squeeze(0)
    return pitch


def save_csv_pitch(pitch, path):
    with open(path, "w", encoding='utf-8') as pitch_file:
        for i in range(len(pitch)):
            t = i * 10
            minute = t // 60000
            seconds = (t - minute * 60000) // 1000
            millisecond = t % 1000
            print(
                f"{minute}m {seconds}s {millisecond:3d},{int(pitch[i])}", file=pitch_file)


def load_csv_pitch(path):
    pitch = []
    with open(path, "r", encoding='utf-8') as pitch_file:
        for line in pitch_file.readlines():
            pit = line.strip().split(",")[-1]
            pitch.append(int(pit))
    return pitch


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-w", "--wav", help="wav", dest="wav")
    parser.add_argument("-p", "--pit", help="pit", dest="pit")  # csv for excel
    args = parser.parse_args()
    print(args.wav)
    print(args.pit)

    pitch = compute_f0_mouth(args.wav)
    save_csv_pitch(pitch, args.pit)
    # tmp = load_csv_pitch(args.pit)
    # save_csv_pitch(tmp, "tmp.csv")
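The timestamp bookkeeping in `save_csv_pitch` follows from the analysis settings: `time_step=160/sr` at 16 kHz gives one F0 value every 10 ms, so row `i` is stamped at `i * 10` ms:

```python
# Why save_csv_pitch uses t = i * 10: one frame per 160 samples at 16 kHz.
sr, hop = 16000, 160
frame_ms = hop / sr * 1000
print(frame_ms)  # 10.0 ms per F0 frame
```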
requirements.txt
ADDED
@@ -0,0 +1,11 @@
librosa
soundfile
matplotlib
tensorboard
transformers
tqdm
einops
fsspec
omegaconf
pyworld
praat-parselmouth
spec/inference.py
ADDED
@@ -0,0 +1,113 @@
import argparse
import torch
import torch.utils.data
import numpy as np
import librosa
from omegaconf import OmegaConf
from librosa.filters import mel as librosa_mel_fn


MAX_WAV_VALUE = 32768.0


def load_wav_to_torch(full_path, sample_rate):
    wav, _ = librosa.load(full_path, sr=sample_rate)
    wav = wav / np.abs(wav).max() * 0.6
    return torch.FloatTensor(wav)


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    # cache the mel filter bank and window per (fmax, device), matching the lookup key below
    if str(fmax) + '_' + str(y.device) not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    # complex tensor as default, then use view_as_real for future pytorch compatibility
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)
    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))

    spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)
    spec = spectral_normalize_torch(spec)

    return spec


def mel_spectrogram_file(path, hps):
    audio = load_wav_to_torch(path, hps.data.sampling_rate)
    audio = audio.unsqueeze(0)

    # match audio length to self.hop_length * n for evaluation
    if (audio.size(1) % hps.data.hop_length) != 0:
        audio = audio[:, :-(audio.size(1) % hps.data.hop_length)]
    mel = mel_spectrogram(audio, hps.data.filter_length, hps.data.mel_channels, hps.data.sampling_rate,
                          hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, hps.data.mel_fmax, center=False)
    return mel


def print_mel(mel, path="mel.png"):
    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(12, 4))
    if isinstance(mel, torch.Tensor):
        mel = mel.cpu().numpy()
    plt.pcolor(mel)
    plt.savefig(path, format="png")
    plt.close(fig)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-w", "--wav", help="wav", dest="wav")
    parser.add_argument("-m", "--mel", help="mel", dest="mel")
    args = parser.parse_args()
    print(args.wav)
    print(args.mel)

    hps = OmegaConf.load("./configs/base.yaml")

    mel = mel_spectrogram_file(args.wav, hps)
    # TODO
    mel = torch.squeeze(mel, 0)
    # [100, length]
    torch.save(mel, args.mel)
    print_mel(mel, "debug.mel.png")
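A mel saved this way can be read back with `torch.load`; a sketch with a placeholder path, assuming the 100-channel setup from `configs/base.yaml` noted in the comment above:

```python
# Sketch; "some_song.mel.pt" is a placeholder path.
import torch

mel = torch.load("some_song.mel.pt")
print(mel.shape)  # expected torch.Size([100, n_frames])
```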