File size: 10,768 Bytes

c7362aa
4e4c64c
 
 
 
 
c7362aa
 
d353343
c7362aa
d353343
c7362aa
4e4c64c
c7362aa
4e4c64c
c7362aa
 
 
 
 
 
 
 
4e4c64c
c7362aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac6157a
4e4c64c
c7362aa
 
 
 
 
 
 
 
 
 
 
 
4e4c64c
ac6157a
c7362aa
 
4e4c64c
c7362aa
 
 
4e4c64c
c7362aa
4e4c64c
 
 
c7362aa
 
4e4c64c
c7362aa
 
4e4c64c
 
 
 
c7362aa
 
4e4c64c
 
 
 
c7362aa
 
 
4e4c64c
 
 
 
 
 
 
 
 
 
 
 
 
ac6157a
4e4c64c
bb2cd38
62ef231
bb2cd38
c7362aa
 
 
 
4e4c64c
c7362aa
 
 
 
 
4e4c64c
c7362aa
 
 
 
4e4c64c
 
 
 
c7362aa
 
 
 
4e4c64c
 
ac6157a
4e4c64c
c7362aa
 
4e4c64c
 
 
 
 
ac6157a
4e4c64c
 
 
 
 
 
 
 
 
 
 
 
 
ac6157a
c7362aa
4e4c64c
c7362aa
4e4c64c
 
 
 
 
 
 
 
c7362aa
 
 
 
4e4c64c
c7362aa
4e4c64c
c7362aa
4e4c64c
c7362aa
 
4e4c64c
 
c7362aa
 
4e4c64c
c7362aa
62ef231
4e4c64c
 
62ef231
4e4c64c
 
c7362aa
62ef231
 
c7362aa
4e4c64c
62ef231
4e4c64c
c7362aa
4e4c64c
 
c7362aa
 
4e4c64c
c7362aa
 
 
 
 
4e4c64c
c7362aa
 
 
 
 
 
 
 
 
 
 
 
4e4c64c
 
c7362aa
4e4c64c
c2687b7
c7362aa
d353343
 
c7362aa
d353343
c7362aa
4e4c64c
c2687b7
62ef231
c7362aa
 
4e4c64c
c7362aa
d353343
c7362aa
4e4c64c
 
 
c7362aa
d353343
c7362aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d353343
4e4c64c
 
c7362aa
4e4c64c
 
 
c7362aa
 
 
 
 
 
4e4c64c
 
d353343
c7362aa
64ccdd0
c7362aa
 
 
 
d353343
4e4c64c
 
c7362aa
 
64ccdd0
4e4c64c
c7362aa
 
4e4c64c
c7362aa

# coding:utf-8

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import spectral_norm
from torch.nn.utils.parametrizations import weight_norm
# from Utils.ASR.models import ASRCNN
# from Utils.JDC.model import JDCNet
from Modules.hifigan import _tile, AdainResBlk1d
import math

class MelSpec(torch.nn.Module):
    """Power mel-spectrogram front end that avoids a dependency on torchaudio.

    A triangular mel filterbank and a Hann window are precomputed once in the
    constructor and stored as buffers (``fb`` and ``window``); ``forward``
    runs ``torch.stft`` and projects the power spectrum onto the filterbank.
    """

    def __init__(self,
                 sample_rate=17402,  # https://github.com/fakerybakery/styletts2-cli/blob/main/msinference.py = Default 16000. However 17400 vocalises better also "en_US/vctk_p274"
                 n_fft=2048,
                 win_length=1200,
                 hop_length=300,
                 n_mels=80
                 ):
        """Build the filterbank and analysis window.

        Args:
            sample_rate: Sampling rate in Hz; only ``sample_rate // 2`` is
                used, as the upper edge of the filterbank.
            n_fft: FFT size; the filterbank has ``n_fft // 2 + 1`` rows.
            win_length: Window length; falls back to ``n_fft`` when ``None``.
            hop_length: Hop size; falls back to ``win_length // 2`` when
                ``None``.
            n_mels: Number of mel bands (filterbank columns).
        """
        super().__init__()
        self.n_fft = n_fft
        self.win_length = n_fft if win_length is None else win_length
        self.hop_length = self.win_length // 2 if hop_length is None else hop_length

        def hz_to_mel(hz):
            return 2595.0 * math.log10(1.0 + (hz / 700.0))

        nyquist = sample_rate // 2
        # Centre frequency of every STFT bin, in Hz.
        bin_hz = torch.linspace(0, nyquist, n_fft // 2 + 1)
        # n_mels + 2 band edges, equally spaced on the mel axis, mapped back to Hz.
        mel_edges = torch.linspace(hz_to_mel(0.0), hz_to_mel(float(nyquist)), n_mels + 2)
        edge_hz = 700.0 * (10 ** (mel_edges / 2595.0) - 1.0)
        edge_gap = edge_hz[1:] - edge_hz[:-1]  # (n_mels + 1)
        # Signed distance from each bin to each band edge: (n_freqs, n_mels + 2).
        offset = edge_hz.unsqueeze(0) - bin_hz.unsqueeze(1)
        rising = -offset[:, :-2] / edge_gap[:-1]  # (n_freqs, n_mels)
        falling = offset[:, 2:] / edge_gap[1:]  # (n_freqs, n_mels)
        # Triangle = lower of the two ramps, floored at zero.
        triangles = torch.maximum(torch.zeros(1), torch.minimum(rising, falling))

        self.register_buffer('fb', triangles)
        self.register_buffer('window', torch.hann_window(self.win_length))

    def forward(self, x):
        """Return the power mel-spectrogram of ``x``.

        ``x`` must be 2-D, since the STFT result is indexed with three axes
        below. The result has a singleton axis inserted at position 1:
        ``[bs, 1, n_mels, time]``.
        """
        stft = torch.stft(x,
                          n_fft=self.n_fft,
                          hop_length=self.hop_length,
                          win_length=self.win_length,
                          window=self.window,
                          center=True,
                          pad_mode="reflect",
                          normalized=False,
                          onesided=True,
                          return_complex=True)  # [bs, n_fft // 2 + 1, time]
        power = stft.abs().pow(2)
        # Put frequency last so the matmul contracts it against the filterbank.
        mel = torch.matmul(power.transpose(1, 2), self.fb).transpose(1, 2)
        return mel.unsqueeze(1)  # [bs, 1, n_mels, time]


class LearnedDownSample(nn.Module):
    """Learned spatial downsampling with a spectrally normalised depthwise conv.

    The convolution uses one group per channel (``groups=dim_in``), a 3x3
    kernel, stride 2 on both spatial axes and padding 1, so the channel count
    is preserved while each spatial axis is reduced by the stride.
    """

    def __init__(self, dim_in):
        super().__init__()
        depthwise = nn.Conv2d(in_channels=dim_in,
                              out_channels=dim_in,
                              kernel_size=(3, 3),
                              stride=(2, 2),
                              padding=1,
                              groups=dim_in)
        self.conv = spectral_norm(depthwise)

    def forward(self, x):
        """Apply the strided depthwise convolution to ``x``."""
        downsampled = self.conv(x)
        return downsampled


class ResBlk(nn.Module):
    """Downsampling residual block over a 4-D feature map.

    The residual path is activation -> 3x3 conv -> learned stride-2
    downsample -> activation -> 3x3 conv. The shortcut path optionally
    remaps channels with a 1x1 conv and downsamples by nearest-exact
    interpolation. The two paths are summed and scaled by ``1 / sqrt(2)``.

    Args:
        dim_in: Number of input channels.
        dim_out: Number of output channels; when it differs from ``dim_in``
            the shortcut gets a bias-free 1x1 convolution.
    """

    def __init__(self,
                 dim_in, dim_out):
        super().__init__()
        self.actv = nn.LeakyReLU(0.2)   # .07 also nice
        self.downsample_res = LearnedDownSample(dim_in)
        self.learned_sc = dim_in != dim_out
        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
        if self.learned_sc:
            self.conv1x1 = spectral_norm(
                nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))

    def _shortcut(self, x):
        """Return the channel-matched, spatially halved skip branch.

        The residual branch downsamples with a stride-2, padding-1, 3x3
        convolution, which yields ``ceil(n / 2)`` along each spatial axis.
        Interpolating by 0.5 yields ``floor(n / 2)``, so any odd axis is
        first extended by repeating its last slice; otherwise the two
        branches could not be added.
        """
        if self.learned_sc:
            x = self.conv1x1(x)
        # [bs, 128, Freq, Time] -- even out both spatial axes, not only Time.
        if x.shape[2] % 2 != 0:
            x = torch.cat([x, x[:, :, -1:, :]], dim=2)
        if x.shape[3] % 2 != 0:
            x = torch.cat([x, x[:, :, :, -1:]], dim=3)
        return F.interpolate(x, scale_factor=.5, mode='nearest-exact')  # F.avg_pool2d(x, 2)

    def _residual(self, x):
        """Return the learned branch: act, conv, downsample, act, conv."""
        x = self.actv(x)
        x = self.conv1(x)
        x = self.downsample_res(x)
        x = self.actv(x)
        x = self.conv2(x)
        return x

    def forward(self, x):
        """Sum shortcut and residual branches and rescale the result."""
        x = self._shortcut(x) + self._residual(x)
        return x / math.sqrt(2)  # unit variance


class StyleEncoder(nn.Module):
    """Convolutional encoder that maps a 1-channel 2-D input to style vectors.

    Used for both acoustic & prosodic ref_s/p.

    The shared trunk is a 3x3 stem conv, four ``ResBlk`` stages whose width
    doubles each time (capped at ``max_conv_dim``), and a 5x5 unpadded conv
    wrapped in LeakyReLU activations. A linear head then projects the channel
    axis to ``style_dim``.
    """

    def __init__(self,
                 dim_in=64,
                 style_dim=128,
                 max_conv_dim=512):
        super().__init__()
        layers = [spectral_norm(nn.Conv2d(1, dim_in, 3, stride=1, padding=1))]
        width = dim_in
        for _ in range(4):
            next_width = min(width * 2, max_conv_dim)
            layers.append(ResBlk(width, next_width))
            width = next_width
        layers.extend([
            nn.LeakyReLU(0.24),  # w/o this activation - produces no speech
            spectral_norm(nn.Conv2d(width, width, 5, stride=1, padding=0)),
            nn.LeakyReLU(0.2),  # 0.3 sounds nice
        ])
        self.shared = nn.Sequential(*layers)
        self.unshared = nn.Linear(width, style_dim)

    def forward(self, x):
        """Encode ``x`` and return the projected style tensor.

        The trunk output is averaged over its last axis (kept as size 1),
        axes 1 and 3 are swapped so channels come last, and the linear head
        is applied to that channel axis.
        """
        feats = self.shared(x)
        # Drop this pooling step for a time-varying style vector.
        pooled = feats.mean(3, keepdim=True)
        channels_last = pooled.transpose(1, 3)
        return self.unshared(channels_last)


class LinearNorm(torch.nn.Module):
    """Wrapper module holding a single ``torch.nn.Linear`` as ``linear_layer``.

    Args:
        in_dim: Size of the input feature axis.
        out_dim: Size of the output feature axis.
        bias: Whether the linear layer carries a bias term.
    """

    def __init__(self, in_dim, out_dim, bias=True):
        super().__init__()
        self.linear_layer = torch.nn.Linear(in_features=in_dim,
                                            out_features=out_dim,
                                            bias=bias)

    def forward(self, x):
        """Apply the wrapped linear layer to ``x``."""
        projected = self.linear_layer(x)
        return projected


class LayerNorm(nn.Module):
    """Layer normalisation applied along axis 1 of the input.

    ``F.layer_norm`` normalises the trailing axis, so the input has axis 1
    swapped to the end, is normalised with the learnable ``gamma`` (scale)
    and ``beta`` (shift), and is swapped back afterwards.

    Args:
        channels: Size of axis 1 of the input.
        eps: Value added to the variance for numerical stability.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels, self.eps = channels, eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Return ``x`` normalised over axis 1, in the original layout."""
        channels_last = torch.transpose(x, 1, -1)
        normed = F.layer_norm(channels_last,
                              normalized_shape=(self.channels,),
                              weight=self.gamma,
                              bias=self.beta,
                              eps=self.eps)
        return torch.transpose(normed, 1, -1)


class TextEncoder(nn.Module):
    """Symbol encoder: embedding -> stack of 1-D conv blocks -> bidirectional LSTM.

    Args:
        channels: Embedding width, conv channel count and LSTM output width
            (two directions of ``channels // 2``).
        kernel_size: Conv kernel size; padding is ``(kernel_size - 1) // 2``.
        depth: Number of conv blocks.
        n_symbols: Size of the embedding table.
    """

    def __init__(self, channels, kernel_size, depth, n_symbols):
        super().__init__()
        self.embedding = nn.Embedding(n_symbols, channels)
        same_pad = (kernel_size - 1) // 2

        def make_block():
            conv = weight_norm(nn.Conv1d(channels, channels,
                                         kernel_size=kernel_size,
                                         padding=same_pad))
            return nn.Sequential(conv, LayerNorm(channels), nn.LeakyReLU(0.24))

        self.cnn = nn.ModuleList()
        for _ in range(depth):
            self.cnn.append(make_block())
        self.lstm = nn.LSTM(input_size=channels,
                            hidden_size=channels // 2,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, x):
        """Encode symbol ids ``x`` and return the LSTM output sequence."""
        hidden = self.embedding(x)  # [B, T, emb]
        # Conv1d wants channels on axis 1.
        hidden = hidden.transpose(1, 2)
        for conv_block in self.cnn:
            hidden = conv_block(hidden)
        # Back to batch-first sequence layout for the LSTM.
        encoded, _ = self.lstm(hidden.transpose(1, 2))
        return encoded


class AdaLayerNorm(nn.Module):
    """Layer norm whose scale and shift are predicted from a style tensor.

    A linear layer maps the style input to ``2 * channels`` values. The first
    half is used as ``gamma`` and the second half as ``beta``; the output is
    ``(1 + gamma) * layer_norm(x) + beta``, with normalisation over the last
    axis of ``x``.

    Args:
        style_dim: Size of the last axis of the style input ``s``.
        channels: Size of the last axis of ``x``. Defaults to 512 when
            ``None``, which reproduces the previously hard-coded widths
            (a 1024-wide projection split at 512).
        eps: Value added to the variance for numerical stability.
    """

    def __init__(self, style_dim, channels=None, eps=1e-5):
        super().__init__()
        # Previously the width was fixed at 512 and `channels` was ignored.
        self.channels = 512 if channels is None else channels
        self.eps = eps
        self.fc = nn.Linear(style_dim, 2 * self.channels)

    def forward(self, x, s):
        """Normalise ``x`` over its last axis and modulate it with ``s``.

        ``s`` must broadcast against ``x`` once projected, i.e. share its
        leading axes; the last axis of ``x`` must equal ``channels``.
        """
        h = self.fc(s)
        gamma = h[..., :self.channels]
        beta = h[..., self.channels:]
        x = F.layer_norm(x, (self.channels,), eps=self.eps)
        x = (1 + gamma) * x + beta
        return x  # same shape as the input x


class ProsodyPredictor(nn.Module):
    """Predicts per-token durations plus two frame-level curves (``F0``, ``N``).

    ``forward`` encodes the text features with the style, turns the duration
    logits into integer frame counts, builds a hard token-to-frame alignment
    from them, expands the encoded features to frame rate and runs the two
    ``AdainResBlk1d`` stacks on the result.

    Args:
        style_dim: Width of the style vector.
        d_hid: Hidden width of the text features.
        nlayers: Number of LSTM/AdaLayerNorm pairs in the duration encoder.
        max_dur: Number of duration logits per token.
    """

    def __init__(self, style_dim, d_hid, nlayers, max_dur=50):
        super().__init__()
        half = d_hid // 2
        joint = d_hid + style_dim

        self.text_encoder = DurationEncoder(sty_dim=style_dim,
                                            d_model=d_hid,
                                            nlayers=nlayers)  # called outside forward
        self.lstm = nn.LSTM(joint, half, 1,
                            batch_first=True, bidirectional=True)
        self.duration_proj = LinearNorm(d_hid, max_dur)
        self.shared = nn.LSTM(joint, half, 1,
                              batch_first=True, bidirectional=True)
        self.F0 = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim),
            AdainResBlk1d(d_hid, half, style_dim, upsample=True),
            AdainResBlk1d(half, half, style_dim),
        ])
        self.N = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim),
            AdainResBlk1d(d_hid, half, style_dim, upsample=True),
            AdainResBlk1d(half, half, style_dim),
        ])
        self.F0_proj = nn.Conv1d(half, 1, 1, 1, 0)
        self.N_proj = nn.Conv1d(half, 1, 1, 1, 0)

    def F0Ntrain(self, x, s):
        """Run the shared LSTM and both style-conditioned stacks.

        Args:
            x: Frame-rate features, [bs, time, ch] (batch-first LSTM input).
            s: Style tensor handed unchanged to every ``AdainResBlk1d``.

        Returns:
            ``(F0, N)``: the outputs of the two 1x1 projection convs.
        """
        hidden, _ = self.shared(x)  # [bs, time, ch] LSTM
        hidden = hidden.transpose(1, 2)  # [bs, ch, time]

        # AdainResBlk1d expects conv1d layout, hence the transpose above.
        pitch = hidden
        for layer in self.F0:
            pitch = layer(pitch, s)
        pitch = self.F0_proj(pitch)

        energy = hidden
        for layer in self.N:
            energy = layer(energy, s)
        energy = self.N_proj(energy)

        return pitch, energy

    def forward(self, d_en=None, s=None):
        """Predict the alignment and both frame-level curves.

        Only the first batch item's durations are used and the alignment is
        allocated with a batch axis of 1, so this effectively expects a
        single-item batch.

        Returns:
            ``(aln_trg, F0_pred, N_pred)`` where ``aln_trg`` is a 0/1 tensor
            of shape [1, total_frames, n_tokens].
        """
        blend = self.text_encoder(d_en, s)
        lstm_out, _ = self.lstm(blend)
        logits = self.duration_proj(lstm_out)  # [bs, 150, 50]

        _, input_length, _ = logits.shape

        # Frames per token: sum of sigmoids over the duration bins, rounded,
        # with at least one frame for every token.
        frames = torch.sigmoid(logits[0]).sum(1).round().clamp(min=1).to(torch.int64)
        aln_trg = torch.zeros(1,
                              int(frames.sum()),
                              input_length,
                              device=s.device)
        # Token i owns the half-open frame range [starts[i], stops[i]).
        stops = torch.cumsum(frames, dim=0)
        starts = stops - frames
        for i, (lo, hi) in enumerate(zip(starts.tolist(), stops.tolist())):
            aln_trg[:, lo:hi, i] = 1
        # Repeat each token's features over the frames assigned to it.
        en = torch.bmm(aln_trg, blend)
        F0_pred, N_pred = self.F0Ntrain(en, s)
        return aln_trg, F0_pred, N_pred


class DurationEncoder(nn.Module):
    """Alternating bidirectional-LSTM / ``AdaLayerNorm`` stack conditioned on style.

    ``self.lstms`` holds ``nlayers`` pairs of (LSTM, AdaLayerNorm). Before
    every LSTM the style is concatenated onto the feature axis; the final
    output also has the style concatenated.

    Args:
        sty_dim: Width of the style vector.
        d_model: Feature width going into and coming out of each pair.
        nlayers: Number of (LSTM, AdaLayerNorm) pairs.
    """

    def __init__(self, sty_dim=128, d_model=512, nlayers=3):
        super().__init__()
        stages = []
        for _ in range(nlayers):
            stages.append(nn.LSTM(input_size=d_model + sty_dim,
                                  hidden_size=d_model // 2,
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=True))
            stages.append(AdaLayerNorm(sty_dim, d_model))
        self.lstms = nn.ModuleList(stages)

    def forward(self, x, style):
        """Encode ``x`` ([bs, 512, time]) together with ``style``.

        Returns the features with the tiled style appended on the last axis,
        in [bs, time, chan] layout.
        """
        _, _, n_steps = x.shape  # [bs, 512, time]

        # NOTE(review): _tile is defined elsewhere; presumably it repeats the
        # style along time so it can be concatenated per step -- confirm.
        style = _tile(style, length=n_steps).transpose(1, 2)
        x = x.transpose(1, 2)

        for block in self.lstms:
            if isinstance(block, AdaLayerNorm):
                x = block(x, style)  # LSTM has transposed x
                continue
            # LSTM: expects [bs, time, chan], outputs [bs, time, 2*chan]
            # (2x from being bidirectional).
            x, _ = block(torch.cat([x, style], dim=2))

        return torch.cat([x, style], dim=2)  # predictor.lstm()