time varying speaker style
- Modules/hifigan.py +13 -5
- models.py +53 -80
- msinference.py +178 -75
- requirements.txt +1 -1
Modules/hifigan.py
CHANGED
@@ -12,16 +12,24 @@ import numpy as np
 LRELU_SLOPE = 0.1
 
 class AdaIN1d(nn.Module):
+
+    # used by HiFiGan & ProsodyPredictor
+
     def __init__(self, style_dim, num_features):
         super().__init__()
         self.norm = nn.InstanceNorm1d(num_features, affine=False)
         self.fc = nn.Linear(style_dim, num_features*2)
 
     def forward(self, x, s):
-
-
-
-
+
+        s = self.fc(s)  # [bs, 1024, 130]
+        s = F.interpolate(s[:, :, 0, :].transpose(1, 2), x.shape[2], mode='linear')  # different time-resolution than Dur
+
+        gamma, beta = torch.chunk(s, chunks=2, dim=1)  # channels vary in for loop
+
+        # affine (1 + lin(x)) * inst(x) + lin(x) is this a skip connection where the weight is a lin of itself
+
+        return (1 + gamma) * self.norm(x) + beta  # norm(x) = PLBERT has norm / beta&gamma = style has no norm()
 
 class AdaINResBlock1(torch.nn.Module):
     def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
@@ -443,7 +451,7 @@ class Decoder(nn.Module):
         self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
 
 
-    def forward(self, asr, F0_curve, N, s):
+    def forward(self, asr=None, F0_curve=None, N=None, s=None):
         if self.training:
             downlist = [0, 3, 7]
             F0_down = downlist[random.randint(0, 2)]
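The reworked AdaIN1d above consumes a sequence of style frames rather than a single style vector, interpolating them to the time axis of the feature it modulates. A minimal, self-contained sketch of that idea (shapes assumed from the comments in the diff, not the repo's exact class):

# Sketch only: a time-varying AdaIN with assumed shapes, mirroring the diff above.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TimeVaryingAdaIN1d(nn.Module):
    def __init__(self, style_dim, num_features):
        super().__init__()
        self.norm = nn.InstanceNorm1d(num_features, affine=False)
        self.fc = nn.Linear(style_dim, num_features * 2)

    def forward(self, x, s):                        # x: [bs, C, T_x], s: [bs, T_s, 1, style_dim]
        s = self.fc(s)                              # [bs, T_s, 1, 2C]
        s = F.interpolate(s[:, :, 0, :].transpose(1, 2), x.shape[2], mode='linear')  # [bs, 2C, T_x]
        gamma, beta = torch.chunk(s, chunks=2, dim=1)
        return (1 + gamma) * self.norm(x) + beta    # per-frame affine modulation

x = torch.randn(1, 512, 300)      # acoustic features
s = torch.randn(1, 11, 1, 128)    # 11 style frames from the reference audio
print(TimeVaryingAdaIN1d(128, 512)(x, s).shape)    # torch.Size([1, 512, 300])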
models.py
CHANGED
@@ -8,7 +8,7 @@ import torch.nn.functional as F
 from torch.nn.utils import weight_norm, spectral_norm
 from Utils.ASR.models import ASRCNN
 from Utils.JDC.model import JDCNet
-from
+from Modules.hifigan import AdaIN1d
 import yaml
 
 
@@ -110,7 +110,7 @@ class ResBlk(nn.Module):
 
 class StyleEncoder(nn.Module):
 
-    #
+    # for both acoustic & prosodic ref_s/p
 
     def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
         super().__init__()
@@ -125,15 +125,20 @@ class StyleEncoder(nn.Module):
 
         blocks += [nn.LeakyReLU(0.2)]
         blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
-
+
+        # blocks += [nn.AdaptiveAvgPool2d(1)]  # THIS AVERAGES THE TIME-FRAMES OF SPEAKER STYLE
+
         blocks += [nn.LeakyReLU(0.2)]
         self.shared = nn.Sequential(*blocks)
 
         self.unshared = nn.Linear(dim_out, style_dim)
 
     def forward(self, x):
-        h = self.shared(x)
-
+        h = self.shared(x)  # [bs, 512, 1, 11]
+
+        h = h.mean(3, keepdims=True)  # UN COMMENT FOR TIME INVARIANT GLOBAL SPEAKER STYLE
+
+        h = h.transpose(1, 3)
         s = self.unshared(h)
         return s
 
@@ -289,21 +294,6 @@ class TextEncoder(nn.Module):
         mask = torch.gt(mask+1, lengths.unsqueeze(1))
         return mask
 
-
-
-class AdaIN1d(nn.Module):
-    def __init__(self, style_dim, num_features):
-        super().__init__()
-        self.norm = nn.InstanceNorm1d(num_features, affine=False)
-        self.fc = nn.Linear(style_dim, num_features*2)
-
-    def forward(self, x, s):
-        h = self.fc(s)
-        h = h.view(h.size(0), h.size(1), 1)
-        gamma, beta = torch.chunk(h, chunks=2, dim=1)
-        # affine (1 + lin(x)) * inst(x) + lin(x) is this a skip connection where the weight is a lin of itself
-        return (1 + gamma) * self.norm(x) + beta  # norm(x) = PLBERT has norm / beta&gamma = style has no norm()
-
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
@@ -316,8 +306,15 @@ class UpSample1d(nn.Module):
             return F.interpolate(x, scale_factor=2, mode='nearest')
 
 class AdainResBlk1d(nn.Module):
-
-
+
+    # only instantiated in ProsodyPredictor
+
+    def __init__(self, dim_in,
+                 dim_out,
+                 style_dim=64,
+                 actv=nn.LeakyReLU(0.2),
+                 upsample='none',
+                 dropout_p=0.0):
         super().__init__()
         self.actv = actv
         self.upsample_type = upsample
@@ -362,26 +359,22 @@ class AdainResBlk1d(nn.Module):
         return out
 
 class AdaLayerNorm(nn.Module):
-
+
+    # only instantiated in DurationPredictor()
+
+    def __init__(self, style_dim, channels=None, eps=1e-5):
         super().__init__()
-        self.channels = channels
         self.eps = eps
-
-        self.fc = nn.Linear(style_dim, channels*2)
+        self.fc = nn.Linear(style_dim, 1024)
 
     def forward(self, x, s):
-
-
-
-        h = self.fc(s)
-        h = h.view(h.size(0), h.size(1), 1)
-        gamma, beta = torch.chunk(h, chunks=2, dim=1)
-        gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
+        h = self.fc(s.transpose(1, 2))  # has to be transposed due to interpolate needing the last dim to be frames
+        gamma = h[:, :, :512]
+        beta = h[:, :, 512:1024]
 
-
-        x = F.layer_norm(x, (self.channels,), eps=self.eps)
+        x = F.layer_norm(x.transpose(1, 2), (512, ), eps=self.eps)
         x = (1 + gamma) * x + beta
-        return x
+        return x  # [1, 75, 512]
 
 class ProsodyPredictor(nn.Module):
 
@@ -414,7 +407,12 @@ class ProsodyPredictor(nn.Module):
         x, _ = self.shared(x.transpose(-1, -2))
 
         F0 = x.transpose(-1, -2)
+
+
         for block in self.F0:
+            print(f'F)N {F0.shape=} {s.shape=}\n')
+            # )N F0.shape=torch.Size([1, 512, 147]) s.shape=torch.Size([1, 128])
+
             F0 = block(F0, s)
         F0 = self.F0_proj(F0)
 
@@ -452,21 +450,30 @@ class DurationEncoder(nn.Module):
     def forward(self, x, style, text_lengths, m):
         masks = m.to(text_lengths.device)
 
-
-
-        x
-        x.
+
+
+        # x : [bs, 512, 987]
+        # print('DURATION ENCODER', x.shape, style.shape, masks.shape)
+        # s = style.expand(x.shape[0], x.shape[1], -1)
+        style = style[:, :, 0, :].transpose(2, 1)  # [bs, 128, 11]
+        # print("S IN DURATION ENC", style.shape, x.shape)
+        style = F.interpolate(style, x.shape[2])
+        print(f'L468 IN DURATION ENC {x.shape=}, {style.shape=} {masks.shape=}')  # mask = [1,75]
+        x = torch.cat([x, style], axis=1)  # [bs, 640, 75]
+        x.masked_fill_(masks[:, None, :], 0.0)
 
-
+
         input_lengths = text_lengths.cpu().numpy()
-        x = x.transpose(-1, -2)
 
         for block in self.lstms:
             if isinstance(block, AdaLayerNorm):
-
-
-                x
+
+                print(f'\n=========ENTER ADALAYNORM L479 models.py {x.shape=}, {style.shape=}')
+                x = block(x, style)  # [bs, 75, 512]
+                x = torch.cat([x.transpose(1, 2), style], axis=1)  # [bs, 512, 75]
+                x.masked_fill_(masks[:, None, :], 0.0)
             else:
+                # print(f'{x.shape=} ENTER LSTM')  # [bs, 640, 75] LSTM reduce ch 640 -> 512
                 x = x.transpose(-1, -2)
                 x = nn.utils.rnn.pack_padded_sequence(
                     x, input_lengths, batch_first=True, enforce_sorted=False)
@@ -481,6 +488,7 @@ class DurationEncoder(nn.Module):
 
                 x_pad[:, :, :x.shape[-1]] = x
                 x = x_pad.to(x.device)
+                # print(f'{x.shape=} EXIR LSTM')  # [bs, 512, 75]
         # print('Calling Duration Encoder\n\n\n\n',x.shape, x.min(), x.max())
         # Calling Duration Encoder
         # torch.Size([1, 640, 107]) tensor(-3.0903, device='cuda:0') tensor(2.3089, device='cuda:0')
@@ -493,7 +501,6 @@ def load_F0_models(path):
     # load F0 model
 
     F0_model = JDCNet(num_class=1, seq_len=192)
-    print(path, 'WHAT ARE YOU TRYING TO LOAD F0 L520')
    path = path.replace('.t7', '.pth')
    params = torch.load(path, map_location='cpu')['net']
    F0_model.load_state_dict(params)
@@ -524,37 +531,3 @@ def load_ASR_models(ASR_MODEL_PATH, ASR_MODEL_CONFIG):
     _ = asr_model.train()
 
     return asr_model
-
-def build_model(args, text_aligner, pitch_extractor, bert):
-    print(f'\n==============\n {args.decoder.type=}\n==============L584 models.py @ build_model()\n')
-
-    from Modules.hifigan import Decoder
-    decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
-            resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
-            upsample_rates = args.decoder.upsample_rates,
-            upsample_initial_channel=args.decoder.upsample_initial_channel,
-            resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
-            upsample_kernel_sizes=args.decoder.upsample_kernel_sizes)
-
-    text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
-
-    predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
-
-    style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)  # acoustic style encoder
-    predictor_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)  # prosodic style encoder
-    nets = Munch(
-        bert=bert,
-        bert_encoder=nn.Linear(bert.config.hidden_size, args.hidden_dim),
-
-        predictor=predictor,
-        decoder=decoder,
-        text_encoder=text_encoder,
-
-        predictor_encoder=predictor_encoder,
-        style_encoder=style_encoder,
-
-        text_aligner=text_aligner,
-        pitch_extractor=pitch_extractor
-    )
-
-    return nets
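The StyleEncoder change above removes the AdaptiveAvgPool2d(1) that used to collapse all mel frames into one style vector, so the shared conv stack now yields one style frame per remaining time step; h.mean(3, keepdims=True) averages them back into a single global style when time-invariant behaviour is wanted. A small sketch of the two options (shapes assumed from the diff's comments, not the repo class):

# Sketch only: the two pooling options in StyleEncoder.forward, with assumed shapes.
import torch
import torch.nn as nn

h = torch.randn(1, 512, 1, 11)        # output of the shared conv stack: [bs, C, 1, T_frames]

h_global = h.mean(3, keepdim=True)     # time-invariant: average the frames -> [1, 512, 1, 1]
h_frames = h                           # time-varying: keep one style frame per time step

unshared = nn.Linear(512, 128)
s_global = unshared(h_global.transpose(1, 3))   # [1, 1, 1, 128]
s_frames = unshared(h_frames.transpose(1, 3))   # [1, 11, 1, 128]
print(s_global.shape, s_frames.shape)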
msinference.py
CHANGED
@@ -7,8 +7,7 @@ import numpy as np
 import yaml
 import torchaudio
 import librosa
-from models import
-from munch import Munch
+from models import ProsodyPredictor, TextEncoder, StyleEncoder, load_ASR_models, load_F0_models
 from nltk.tokenize import word_tokenize
 
 torch.manual_seed(0)
@@ -62,17 +61,6 @@ def alpha_num(f):
     return f
 
 
-
-def recursive_munch(d):
-    if isinstance(d, dict):
-        return Munch((k, recursive_munch(v)) for k, v in d.items())
-    elif isinstance(d, list):
-        return [recursive_munch(v) for v in d]
-    else:
-        return d
-
-
-
 # ======== UTILS ABOVE
 
 def length_to_mask(lengths):
@@ -94,10 +82,10 @@ def compute_style(path):
     mel_tensor = preprocess(audio).to(device)
 
     with torch.no_grad():
-        ref_s =
-        ref_p =
-
-        return torch.cat([ref_s, ref_p], dim=1)
+        ref_s = style_encoder(mel_tensor.unsqueeze(1))
+        ref_p = predictor_encoder(mel_tensor.unsqueeze(1))  # [bs, 11, 1, 128]
+        print(f'\n\n\n\nCOMPUTE STYLe {ref_s.shape=} {ref_p.shape=}')
+        return torch.cat([ref_s, ref_p], dim=3)  # [bs, 11, 1, 256]
 
 device = 'cpu'
 if torch.cuda.is_available():
@@ -112,50 +100,151 @@ global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_
 # phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
 
 
-
+args = yaml.safe_load(open(str('Utils/config.yml')))
+ASR_config = args['ASR_config']
 
-
-
-ASR_path = config.get('ASR_path', False)
-text_aligner = load_ASR_models(ASR_path, ASR_config)
+ASR_path = args['ASR_path']
+text_aligner = load_ASR_models(ASR_path, ASR_config).eval().to(device)
 
-
-
-pitch_extractor = load_F0_models(F0_path)
+F0_path = args['F0_path']
+pitch_extractor = load_F0_models(F0_path).eval().to(device)
 
-# load BERT model
 from Utils.PLBERT.util import load_plbert
-
-
-
-
-
-
-
+bert = load_plbert(args['PLBERT_dir']).eval().to(device)
+# model_params = recursive_munch(config['model_params'])
+# --
+# def build_model(args, text_aligner, pitch_extractor, bert):
+#     print(f'\n==============\n {args.decoder.type=}\n==============L584 models.py @ build_model()\n')
+# # ======================================
+# In [4]: args['model_params']
+# Out[4]:
+# {'decoder': {'resblock_dilation_sizes': [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+#   'resblock_kernel_sizes': [3, 7, 11],
+#   'type': 'hifigan',
+#   'upsample_initial_channel': 512,
+#   'upsample_kernel_sizes': [20, 10, 6, 4],
+#   'upsample_rates': [10, 5, 3, 2]},
+#  'diffusion': {'dist': {'estimate_sigma_data': True,
+#    'mean': -3.0,
+#    'sigma_data': 0.19926648961191362,
+#    'std': 1.0},
+#   'embedding_mask_proba': 0.1,
+#   'transformer': {'head_features': 64,
+#    'multiplier': 2,
+#    'num_heads': 8,
+#    'num_layers': 3}},
+#  'dim_in': 64,
+#  'dropout': 0.2,
+#  'hidden_dim': 512,
+#  'max_conv_dim': 512,
+#  'max_dur': 50,
+#  'multispeaker': True,
+#  'n_layer': 3,
+#  'n_mels': 80,
+#  'n_token': 178,
+#  'slm': {'hidden': 768,
+#   'initial_channel': 64,
+#   'model': 'microsoft/wavlm-base-plus',
+#   'nlayers': 13,
+#   'sr': 16000},
+#  'style_dim': 128}
+# # ===============================================
+from Modules.hifigan import Decoder
+decoder = Decoder(dim_in=512,
+                  style_dim=128,
+                  dim_out=80,  # n_mels
+                  resblock_kernel_sizes=[3, 7, 11],
+                  upsample_rates=[10, 5, 3, 2],
+                  upsample_initial_channel=512,
+                  resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                  upsample_kernel_sizes=[20, 10, 6, 4]).eval().to(device)
+
+text_encoder = TextEncoder(channels=512,
+                           kernel_size=5,
+                           depth=3,        # args['model_params']['n_layer']
+                           n_symbols=178,  # args['model_params']['n_token']
+                           ).eval().to(device)
+
+predictor = ProsodyPredictor(style_dim=128,
+                             d_hid=512,
+                             nlayers=3,  # OFFICIAL config.nlayers=5
+                             max_dur=50,
+                             dropout=.2).eval().to(device)
+
+style_encoder = StyleEncoder(dim_in=64,
+                             style_dim=128,
+                             max_conv_dim=512).eval().to(device)      # acoustic style encoder
+predictor_encoder = StyleEncoder(dim_in=64,
+                                 style_dim=128,
+                                 max_conv_dim=512).eval().to(device)  # prosodic style encoder
+bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)
+# --
+# model = build_model(model_params, text_aligner, pitch_extractor, plbert)
+# _ = [model[key].eval() for key in model]
+# _ = [model[key].to(device) for key in model]
 
 # params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
 # params_whole = torch.load('freevc2/yl4579_styletts2.pth' map_location='cpu')
 params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
 params = params_whole['net']
 
-for key in model:
-    if key in params:
-        print('%s loaded' % key)
-        try:
-            model[key].load_state_dict(params[key])
-        except:
-            from collections import OrderedDict
-            state_dict = params[key]
-            new_state_dict = OrderedDict()
-            for k, v in state_dict.items():
-                name = k[7:]  # remove `module.`
-                new_state_dict[name] = v
-            # load params
-            model[key].load_state_dict(new_state_dict, strict=False)
-        # except:
-        #     _load(params[key], model[key])
-_ = [model[key].eval() for key in model]
 
+# 'bert',
+# 'bert_encoder',
+# 'predictor',
+# 'decoder',
+# 'text_encoder',
+# 'predictor_encoder',
+# 'style_encoder',
+# 'text_aligner',
+# 'pitch_extractor'
+# --
+from collections import OrderedDict
+
+new_state_dict = OrderedDict()
+for k, v in params['bert'].items():
+    new_state_dict[k[7:]] = v  # del 'module.'
+bert.load_state_dict(new_state_dict, strict=True)
+# --
+new_state_dict = OrderedDict()
+for k, v in params['bert_encoder'].items():
+    new_state_dict[k[7:]] = v  # del 'module.'
+bert_encoder.load_state_dict(new_state_dict, strict=True)
+# --
+new_state_dict = OrderedDict()
+for k, v in params['predictor'].items():
+    new_state_dict[k[7:]] = v  # del 'module.'
+predictor.load_state_dict(new_state_dict, strict=True)  # XTRA non-ckpt LSTMs nlayers add slowiness to voice
+# --
+new_state_dict = OrderedDict()
+for k, v in params['decoder'].items():
+    new_state_dict[k[7:]] = v
+decoder.load_state_dict(new_state_dict, strict=True)
+# --
+new_state_dict = OrderedDict()
+for k, v in params['text_encoder'].items():
+    new_state_dict[k[7:]] = v
+text_encoder.load_state_dict(new_state_dict, strict=True)
+# --
+new_state_dict = OrderedDict()
+for k, v in params['predictor_encoder'].items():
+    new_state_dict[k[7:]] = v
+predictor_encoder.load_state_dict(new_state_dict, strict=True)
+# --
+new_state_dict = OrderedDict()
+for k, v in params['style_encoder'].items():
+    new_state_dict[k[7:]] = v
+style_encoder.load_state_dict(new_state_dict, strict=True)
+# --
+new_state_dict = OrderedDict()
+for k, v in params['text_aligner'].items():
+    new_state_dict[k[7:]] = v  # del 'module.'
+text_aligner.load_state_dict(new_state_dict, strict=True)
+# --
+new_state_dict = OrderedDict()
+for k, v in params['pitch_extractor'].items():
+    new_state_dict[k[7:]] = v
+pitch_extractor.load_state_dict(new_state_dict, strict=True)
 
 
 def inference(text,
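The nine load_state_dict blocks above all repeat the same "strip the DataParallel 'module.' prefix" pattern. A compact, hypothetical equivalent (the helper name and loop below are not part of the commit) would be:

# Hypothetical refactor of the repeated loading blocks above; not part of the commit.
from collections import OrderedDict

def load_submodule(module, state_dict, strict=True):
    cleaned = OrderedDict((k[len('module.'):], v) for k, v in state_dict.items())  # del 'module.'
    module.load_state_dict(cleaned, strict=strict)

# for name, module in [('bert', bert), ('bert_encoder', bert_encoder), ('predictor', predictor),
#                      ('decoder', decoder), ('text_encoder', text_encoder),
#                      ('predictor_encoder', predictor_encoder), ('style_encoder', style_encoder),
#                      ('text_aligner', text_aligner), ('pitch_extractor', pitch_extractor)]:
#     load_submodule(module, params[name])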
@@ -193,24 +282,31 @@ def inference(text,
     # 54, 156, 63, 158, 147, 83, 56, 16, 4]], device='cuda:0')
 
 
-    t_en =
-    bert_dur =
-    d_en =
+    t_en = text_encoder(tokens, input_lengths, text_mask)
+    bert_dur = bert(tokens, attention_mask=(~text_mask).int())
+    d_en = bert_encoder(bert_dur).transpose(-1, -2)
     # print('BERTdu', bert_dur.shape, tokens.shape, '\n')  # bert what is the 768 per token -> IS USED in sampler
     # BERTdu torch.Size([1, 11, 768]) torch.Size([1, 11])
 
 
 
-    ref = ref_s[:, :128]
-    s = ref_s[:, 128:]
+    ref = ref_s[:, :, :, :128]  # [bs, 11, 1, 128]
+    s = ref_s[:, :, :, 128:]    # have channels as last dim so it can go through nn.Linear layers
 
-    # s = .74 * s  # prosody / arousal & fading unvoiced syllabes [x0.7 - x1.2]
 
-
-
+    # ON compute style we dont know yet the size to interpolate
+    # Perhaps we can interpolate ref_s here as now we know how many bert time-frames the text needs
+    # s = .74 * s  # prosody / arousal & fading unvoiced syllabes [x0.7 - x1.2]
+
+
+    print(f'{d_en.shape=} {s.shape=} {input_lengths.shape=} {text_mask.shape=}')
+    d = predictor.text_encoder(d_en,
+                               s,
+                               input_lengths,
+                               text_mask)
 
-    x, _ =
-    duration =
+    x, _ = predictor.lstm(d)
+    duration = predictor.duration_proj(x)
 
     duration = torch.sigmoid(duration).sum(axis=-1)
     pred_dur = torch.round(duration.squeeze()).clamp(min=1)
@@ -224,23 +320,25 @@ def inference(text,
 
     # encode prosody
     en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
-
-
-
-
-
-
-    F0_pred, N_pred =
+
+    asr_new = torch.zeros_like(en)
+    asr_new[:, :, 0] = en[:, :, 0]
+    asr_new[:, :, 1:] = en[:, :, 0:-1]
+    en = asr_new
+    print('_________________________________________F0_____________________________')
+    F0_pred, N_pred = predictor.F0Ntrain(en, s)
 
     asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
-
-
-
-
-
-
-    x =
-
+
+    asr_new = torch.zeros_like(asr)
+    asr_new[:, :, 0] = asr[:, :, 0]
+    asr_new[:, :, 1:] = asr[:, :, 0:-1]
+    asr = asr_new
+    print('_________________________________________HiFI_____________________________')
+    x = decoder(asr=asr,
+                F0_curve=F0_pred,
+                N=N_pred,
+                s=ref)
 
     x = x.cpu().numpy()[0, 0, :-400]  # weird pulse at the end of sentences
 
@@ -299,6 +397,11 @@ import re
 from num2words import num2words
 
 PHONEME_MAP = {
+    'služ': 'sloooozz',     # 'službeno'
+    'suver': 'siuveeerra',  # 'suverena'
+    'država': 'dirrezav',   # 'država'
+    'iči': 'ici',           # 'Graniči'
+    's ': 'se',             # an 's' followed by a space
     'q': 'ku',
     'w': 'aou',
     'z': 's',
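For orientation, a shape walkthrough of the new time-varying reference style as it moves through inference (a sketch only, with an assumed frame count of 11 and text length of 75; the dimensions follow the comments in the diff):

# Sketch only: random tensors standing in for the style frames from compute_style().
import torch
import torch.nn.functional as F

ref_s = torch.randn(1, 11, 1, 256)      # compute_style() output: [bs, frames, 1, 256]

ref = ref_s[:, :, :, :128]              # acoustic half -> HiFi-GAN decoder (AdaIN1d)
s = ref_s[:, :, :, 128:]                # prosodic half -> ProsodyPredictor / DurationEncoder

# DurationEncoder stretches the prosodic frames to the text length
style = s[:, :, 0, :].transpose(2, 1)   # [1, 128, 11]
style = F.interpolate(style, 75)        # [1, 128, 75]
print(ref.shape, style.shape)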
requirements.txt
CHANGED
@@ -13,7 +13,7 @@ omegaconf
 opencv-python
 soundfile
 transformers
-
+audresample
 srt
 nltk
 phonemizer