HiFiGAN v2.0
- Modules/hifigan.py +1 -1
- Modules/vits/attentions.py +32 -34
- Modules/vits/commons.py +0 -14
- Modules/vits/models.py +17 -9
- Modules/vits/modules.py +7 -44
- Modules/vits/transforms.py +6 -18
- Modules/vits/utils.py +0 -117
- msinference.py +11 -23
- requirements.txt +1 -0
Modules/hifigan.py
CHANGED
@@ -142,7 +142,7 @@ class SineGen(torch.nn.Module):
 
         fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))  # [1, 145200, 9]
 
-        sine_waves = self._f02sine(fn) * .007  # very important effect DEFAULT=0.1 very sensitive to speaker
+        sine_waves = self._f02sine(fn) * .01  # .007 # very important effect DEFAULT=0.1 very sensitive to speaker; CHECK CONTINUITY FROM SEGMENTS IN AUDIOBOOK
 
         uv = (f0 > self.voiced_threshold).type(torch.float32)
 
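A note on the line above: SineGen builds its harmonic stack by multiplying f0 with the integer harmonics 1..harmonic_num+1 (hence the [1, 145200, 9] shape), and the scalar applied to _f02sine sets the amplitude of the source excitation driving the rest of the vocoder, which is why the comment calls it speaker-sensitive. A minimal stand-alone sketch, assuming _f02sine integrates frequency into phase (the oscillator body here is an assumption; only the scale factor mirrors the diff):

    import torch

    def sine_source(f0, harmonic_num=8, scale=.01, sr=24000):
        # f0: [B, T, 1] per-sample fundamental frequency, 0 where unvoiced
        k = torch.arange(1, harmonic_num + 2, dtype=f0.dtype, device=f0.device)
        fn = f0 * k                                          # [B, T, harmonic_num + 1]
        phase = 2 * torch.pi * torch.cumsum(fn / sr, dim=1)  # integrate frequency -> phase
        return torch.sin(phase) * scale                      # was .007, now .01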
Modules/vits/attentions.py
CHANGED
@@ -18,10 +18,10 @@ class Encoder(nn.Module):
         self.n_heads = n_heads
         self.n_layers = n_layers
         self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
+
         self.window_size = window_size
 
-        self.drop = nn.Dropout(p_dropout)
+
         self.attn_layers = nn.ModuleList()
         self.norm_layers_1 = nn.ModuleList()
         self.ffn_layers = nn.ModuleList()
@@ -37,11 +37,11 @@ class Encoder(nn.Module):
         x = x * x_mask
         for i in range(self.n_layers):
             y = self.attn_layers[i](x, x, attn_mask)
-            y = self.drop(y)
+
             x = self.norm_layers_1[i](x + y)
 
             y = self.ffn_layers[i](x, x_mask)
-            y = self.drop(y)
+
             x = self.norm_layers_2[i](x + y)
         x = x * x_mask
         return x
@@ -58,7 +58,7 @@ class MultiHeadAttention(nn.Module):
         self.channels = channels
         self.out_channels = out_channels
         self.n_heads = n_heads
-        self.p_dropout = p_dropout
+
         self.window_size = window_size
         self.heads_share = heads_share
         self.block_length = block_length
@@ -71,7 +71,7 @@ class MultiHeadAttention(nn.Module):
         self.conv_k = nn.Conv1d(channels, channels, 1)
         self.conv_v = nn.Conv1d(channels, channels, 1)
         self.conv_o = nn.Conv1d(channels, out_channels, 1)
-        self.drop = nn.Dropout(p_dropout)
+
 
         if window_size is not None:
             n_heads_rel = 1 if heads_share else n_heads
@@ -83,17 +83,16 @@ class MultiHeadAttention(nn.Module):
         nn.init.xavier_uniform_(self.conv_k.weight)
         nn.init.xavier_uniform_(self.conv_v.weight)
         if proximal_init:
-            with torch.no_grad():
-                self.conv_k.weight.copy_(self.conv_q.weight)
-                self.conv_k.bias.copy_(self.conv_q.bias)
+            raise ValueError
+
 
     def forward(self, x, c, attn_mask=None):
         q = self.conv_q(x)
         k = self.conv_k(c)
         v = self.conv_v(c)
 
-        x, self.attn = self.attention(q, k, v, mask=attn_mask)
-
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)  # x.shape=torch.Size([1, 192, 1499])
+
         x = self.conv_o(x)
         return x
@@ -112,18 +111,21 @@ class MultiHeadAttention(nn.Module):
         scores_local = self._relative_position_to_absolute_position(rel_logits)
         scores = scores + scores_local
         if self.proximal_bias:
-            assert t_s == t_t, "Proximal bias is only available for self-attention."
-            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+            raise ValueError
+            # assert t_s == t_t, "Proximal bias is only available for self-attention."
+            # scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
         if mask is not None:
+            # mask is ALL ONES !!!!
             scores = scores.masked_fill(mask == 0, -1e4)
         if self.block_length is not None:
-            assert t_s == t_t, "Local attention is only available for self-attention."
-            block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
-            scores = scores.masked_fill(block_mask == 0, -1e4)
+            raise ValueError
+            # assert t_s == t_t, "Local attention is only available for self-attention."
+            # block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
+            # scores = scores.masked_fill(block_mask == 0, -1e4)
         p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
-        p_attn = self.drop(p_attn)
+
         output = torch.matmul(p_attn, value)
-        if self.window_size is not None:
+        if self.window_size is not None:  # self.window_size=4
             relative_weights = self._absolute_position_to_relative_position(p_attn)
             value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
             output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
@@ -155,11 +157,19 @@ class MultiHeadAttention(nn.Module):
         slice_start_position = max((self.window_size + 1) - length, 0)
         slice_end_position = slice_start_position + 2 * length - 1
         if pad_length > 0:
+            # --
+            # AFTER: torch.Size([1, 2997, 96]); relative_embeddings.shape = torch.Size([1, 9, 96]); pad_length = 1494
+            # --
             padded_relative_embeddings = F.pad(
                 relative_embeddings,
                 commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
         else:
-            padded_relative_embeddings = relative_embeddings
+            raise ValueError
+            # padded_relative_embeddings = relative_embeddings
+            # --
+            # print(f'{slice_start_position=} {slice_end_position=} {padded_relative_embeddings.shape=}')
+            # slice_start_position=0 slice_end_position=2997 padded_relative_embeddings.shape=torch.Size([1, 2997, 96])
+            # --
         used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
         return used_relative_embeddings
 
@@ -194,18 +204,6 @@ class MultiHeadAttention(nn.Module):
         x_final = x_flat.view([batch, heads, length, 2*length])[:, :, :, 1:]
         return x_final
 
-    def _attention_bias_proximal(self, length):
-        """Bias for self-attention to encourage attention to close positions.
-        Args:
-            length: an integer scalar.
-        Returns:
-            a Tensor with shape [1, 1, length, length]
-        """
-        r = torch.arange(length, dtype=torch.float32)
-        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
-        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
-
-
 class FFN(nn.Module):
     def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
         super().__init__()
@@ -213,7 +211,7 @@ class FFN(nn.Module):
         self.out_channels = out_channels
         self.filter_channels = filter_channels
         self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
+
         self.activation = activation
         self.causal = causal
 
@@ -224,7 +222,7 @@ class FFN(nn.Module):
 
         self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
         self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
-        self.drop = nn.Dropout(p_dropout)
+
 
     def forward(self, x, x_mask):
         x = self.conv_1(self.padding(x * x_mask))
@@ -232,7 +230,7 @@ class FFN(nn.Module):
             x = x * torch.sigmoid(1.702 * x)
         else:
             x = torch.relu(x)
-        x = self.drop(x)
+
         x = self.conv_2(self.padding(x * x_mask))
         return x * x_mask
 
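Two patterns recur in this file: branches that can never execute at inference time (proximal_init, proximal bias, block-local attention, the unpadded path in _get_relative_embeddings) are hardened with raise ValueError while their bodies are kept as comments, and every Dropout is stripped, which changes nothing in eval mode anyway. The shapes printed in the comments also check out against the window arithmetic in _get_relative_embeddings; a quick verification using the diff's own numbers (window_size=4, 1499 frames, 96 channels per head):

    window_size, length, head_dim = 4, 1499, 96
    rel_positions = 2 * window_size + 1               # 9 relative embeddings
    pad_length = max(length - (window_size + 1), 0)   # 1494
    slice_start = max((window_size + 1) - length, 0)  # 0
    slice_end = slice_start + 2 * length - 1          # 2997
    print(rel_positions + 2 * pad_length)             # 2997 -> padded shape [1, 2997, 96]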
Modules/vits/commons.py
CHANGED
@@ -19,20 +19,6 @@ def intersperse(lst, item):
     result[1::2] = lst
     return result
 
-
-def kl_divergence(m_p, logs_p, m_q, logs_q):
-    """KL(P||Q)"""
-    kl = (logs_q - logs_p) - 0.5
-    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
-    return kl
-
-
-def rand_gumbel(shape):
-    """Sample from the Gumbel distribution, protect from overflows."""
-    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
-    return -torch.log(-torch.log(uniform_samples))
-
-
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
     n_channels_int = n_channels[0]
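Both deleted helpers are training-time utilities, so removing them is harmless for inference-only use. For reference, kl_divergence implemented the per-element closed form for diagonal Gaussians P = N(m_p, e^{2 logs_p}) and Q = N(m_q, e^{2 logs_q}):

    KL(P || Q) = (logs_q - logs_p) - 1/2 + (e^{2 logs_p} + (m_p - m_q)^2) / (2 e^{2 logs_q})

and rand_gumbel drew Gumbel samples as -log(-log(u)) with u kept strictly inside (0, 1) to avoid log(0) overflow.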
Modules/vits/models.py
CHANGED
@@ -1,4 +1,3 @@
-import copy
 import math
 import torch
 from torch import nn
@@ -24,7 +23,6 @@ class StochasticDurationPredictor(nn.Module):
         self.n_flows = n_flows
         self.gin_channels = gin_channels
 
-        self.log_flow = modules.Log()
         self.flows = nn.ModuleList()
         self.flows.append(modules.ElementwiseAffine(2))
         for i in range(n_flows):
@@ -46,7 +44,12 @@ class StochasticDurationPredictor(nn.Module):
         if gin_channels != 0:
             self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
 
-    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
+    def forward(self,
+                x,
+                x_mask,
+                g=None,
+                reverse=False,
+                noise_scale=1.0):
         x = torch.detach(x)
         x = self.pre(x)
         if g is not None:
@@ -60,10 +63,13 @@ class StochasticDurationPredictor(nn.Module):
         else:
             flows = list(reversed(self.flows))
             flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
-            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
+
+            # noise_scale = 0.0 => Fast
+            # noise_scale = 1.0 => Slow
+            z = torch.rand(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * .44  # * noise_scale  # [1, 2, 2604=letters]
             for flow in flows:
                 z = flow(z, x_mask, g=x, reverse=reverse)
-            z0, z1 = torch.split(z, [1, 1], 1)
+            z0, _ = torch.split(z, [1, 1], 1)
             logw = z0
             return logw
 
@@ -89,7 +95,7 @@ class TextEncoder(nn.Module):
 
         self.emb = nn.Embedding(n_vocab, hidden_channels)
         nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
-
+
         self.encoder = attentions.Encoder(
             hidden_channels,
             filter_channels,
@@ -98,6 +104,7 @@ class TextEncoder(nn.Module):
             kernel_size,
             p_dropout)
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
 
     def forward(self, x, x_lengths):
         x = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
@@ -150,7 +157,7 @@ class Generator(torch.nn.Module):
         self.num_upsamples = len(upsample_rates)
         self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
         resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
-
+        print(f'_____________________________________{resblock=}_________')
         self.ups = nn.ModuleList()
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
             self.ups.append(weight_norm(
@@ -279,7 +286,8 @@ class SynthesizerTrn(nn.Module):
         if self.use_sdp:
             logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
         else:
-            logw = self.dp(x, x_mask, g=g)
+            raise ValueError
+
         w = torch.exp(logw) * x_mask * length_scale
         w_ceil = torch.ceil(w)
         y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
@@ -290,7 +298,7 @@ class SynthesizerTrn(nn.Module):
         m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
         logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
 
-        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+        z_p = m_p + torch.rand_like(m_p) * torch.exp(logs_p)  # * noise_scale
         z = self.flow(z_p, y_mask, g=g, reverse=True)
         o = self.dec((z * y_mask)[:, :, :max_len], g=g)
         return o, attn, y_mask, (z, z_p, m_p, logs_p)
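On the noise comments in StochasticDurationPredictor: the reversed flows turn the sampled z into per-token log-durations, and SynthesizerTrn.infer() converts logw into integer frame counts, so the magnitude of the latent directly sets speaking pace (hence "0.0 => Fast, 1.0 => Slow"; this commit pins it to a uniform draw scaled by .44). Note that both sampling sites now use torch.rand (uniform on [0, 1)) where the usual VITS code samples torch.randn. A toy sketch of the downstream duration step (values invented; the exp/ceil lines mirror the model code above):

    import torch

    logw = torch.tensor([[[-0.2, 0.1, 0.4]]])    # toy per-token log-durations
    x_mask, length_scale = torch.ones_like(logw), 1.0
    w = torch.exp(logw) * x_mask * length_scale  # continuous frames per token
    w_ceil = torch.ceil(w)                       # integer durations for the aligner
    print(w_ceil.squeeze().tolist())             # [1.0, 2.0, 2.0]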
Modules/vits/modules.py
CHANGED
@@ -229,42 +229,7 @@ class ResBlock1(torch.nn.Module):
             remove_weight_norm(l)
 
 
-class ResBlock2(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
-        super(ResBlock2, self).__init__()
-        self.convs = nn.ModuleList([
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
-                               padding=get_padding(kernel_size, dilation[0]))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
-                               padding=get_padding(kernel_size, dilation[1])))
-        ])
-        self.convs.apply(init_weights)
-
-    def forward(self, x, x_mask=None):
-        for c in self.convs:
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c(xt)
-            x = xt + x
-        if x_mask is not None:
-            x = x * x_mask
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs:
-            remove_weight_norm(l)
-
-
-class Log(nn.Module):
-    def forward(self, x, x_mask, reverse=False, **kwargs):
-        if not reverse:
-            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
-            logdet = torch.sum(-y, [1, 2])
-            return y, logdet
-        else:
-            x = torch.exp(x) * x_mask
-            return x
 
 
 class Flip(nn.Module):
@@ -373,18 +338,16 @@ class ConvFlow(nn.Module):
         unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
         unnormalized_derivatives = h[..., 2 * self.num_bins:]
 
-        x1, logabsdet = piecewise_rational_quadratic_transform(x1,
+        x1, _ = piecewise_rational_quadratic_transform(x1,
             unnormalized_widths,
             unnormalized_heights,
             unnormalized_derivatives,
             inverse=reverse,
             tails='linear',
             tail_bound=self.tail_bound
-        )
-
-        x = torch.cat([x0, x1], 1) * x_mask
-        if not reverse:
-            logdet = torch.sum(logabsdet * x_mask, [1, 2])
-            return x, logdet
-        else:
-            return x
+        )  # if x1=x0 it sounds fast and syllables have no time to finish via rand on duration; what if duration is set to ones?
+        # x1 = x0
+        # x0.shape = x1.shape = torch.Size([1, 1, 1499])
+
+        x = torch.cat([x0, x1], 1) * x_mask
+        return x
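Dropping logabsdet in ConvFlow (and deleting the Log flow outright) fits reverse-only use: a flow returns its log-determinant so training can evaluate log p(x) = log p(z) + sum(log|det J|), while the reverse pass needs only the transformed sample. The deleted Log flow makes this concrete in two lines:

    import torch

    x = torch.rand(1, 2, 5).clamp_min(1e-5)
    y = torch.log(x)        # forward pass also returned logdet = sum(-y) for the loss
    x_back = torch.exp(y)   # reverse pass: just the sample, no logdet needed
    assert torch.allclose(x, x_back)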
Modules/vits/transforms.py
CHANGED
@@ -21,8 +21,9 @@ def piecewise_rational_quadratic_transform(inputs,
                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
 
     if tails is None:
-        spline_fn = rational_quadratic_spline
-        spline_kwargs = {}
+        raise ValueError
+        # spline_fn = rational_quadratic_spline
+        # spline_kwargs = {}
     else:
         spline_fn = unconstrained_rational_quadratic_spline
         spline_kwargs = {
@@ -135,7 +136,8 @@ def rational_quadratic_spline(inputs,
     if inverse:
         bin_idx = searchsorted(cumheights, inputs)[..., None]
     else:
-        bin_idx = searchsorted(cumwidths, inputs)[..., None]
+        raise ValueError
+        # bin_idx = searchsorted(cumwidths, inputs)[..., None]
 
     input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
     input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
@@ -176,18 +178,4 @@ def rational_quadratic_spline(inputs,
 
         return outputs, -logabsdet
     else:
-        theta = (inputs - input_cumwidths) / input_bin_widths
-        theta_one_minus_theta = theta * (1 - theta)
-
-        numerator = input_heights * (input_delta * theta.pow(2)
-                                     + input_derivatives * theta_one_minus_theta)
-        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
-                                     * theta_one_minus_theta)
-        outputs = input_cumheights + numerator / denominator
-
-        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
-                                                     + 2 * input_delta * theta_one_minus_theta
-                                                     + input_derivatives * (1 - theta).pow(2))
-        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
-
-        return outputs, logabsdet
+        raise ValueError
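Only the inverse branch of rational_quadratic_spline survives, consistent with the reverse-only flow usage above. The deleted else branch was the forward map of the monotonic rational-quadratic spline (as in Durkan et al.'s neural spline flows, from which this file appears to derive): with theta = (x - x_k) / w_k, bin slope s_k = input_delta, and knot derivatives d_k,

    g(theta) = y_k + h_k * (s_k * theta^2 + d_k * theta * (1 - theta))
                     / (s_k + (d_{k} + d_{k+1} - 2 * s_k) * theta * (1 - theta))

and logabsdet is the log of its derivative, log(derivative_numerator) - 2 * log(denominator).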
Modules/vits/utils.py
CHANGED
@@ -43,125 +43,8 @@ def load_checkpoint(checkpoint_path, model, optimizer=None):
     return model, optimizer, learning_rate, iteration
 
 
-def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
-    logger.info("Saving model and optimizer state at iteration {} to {}".format(
-        iteration, checkpoint_path))
-    if hasattr(model, 'module'):
-        state_dict = model.module.state_dict()
-    else:
-        state_dict = model.state_dict()
-    torch.save({'model': state_dict,
-                'iteration': iteration,
-                'optimizer': optimizer.state_dict(),
-                'learning_rate': learning_rate}, checkpoint_path)
-
-
-def plot_spectrogram_to_numpy(spectrogram):
-    global MATPLOTLIB_FLAG
-    if not MATPLOTLIB_FLAG:
-        import matplotlib
-        matplotlib.use("Agg")
-        MATPLOTLIB_FLAG = True
-        mpl_logger = logging.getLogger('matplotlib')
-        mpl_logger.setLevel(logging.WARNING)
-    import matplotlib.pylab as plt
-    import numpy as np
-
-    fig, ax = plt.subplots(figsize=(10, 2))
-    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
-                   interpolation='none')
-    plt.colorbar(im, ax=ax)
-    plt.xlabel("Frames")
-    plt.ylabel("Channels")
-    plt.tight_layout()
-
-    fig.canvas.draw()
-    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
-    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-    plt.close()
-    return data
-
-
-def plot_alignment_to_numpy(alignment, info=None):
-    global MATPLOTLIB_FLAG
-    if not MATPLOTLIB_FLAG:
-        import matplotlib
-        matplotlib.use("Agg")
-        MATPLOTLIB_FLAG = True
-        mpl_logger = logging.getLogger('matplotlib')
-        mpl_logger.setLevel(logging.WARNING)
-    import matplotlib.pylab as plt
-    import numpy as np
-
-    fig, ax = plt.subplots(figsize=(6, 4))
-    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
-                   interpolation='none')
-    fig.colorbar(im, ax=ax)
-    xlabel = 'Decoder timestep'
-    if info is not None:
-        xlabel += '\n\n' + info
-    plt.xlabel(xlabel)
-    plt.ylabel('Encoder timestep')
-    plt.tight_layout()
-
-    fig.canvas.draw()
-    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
-    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-    plt.close()
-    return data
-
-
-def load_wav_to_torch(full_path):
-    sampling_rate, data = read(full_path)
-    return torch.FloatTensor(data.astype(np.float32)), sampling_rate
-
-
-def load_filepaths_and_text(filename, split="|"):
-    with open(filename, encoding='utf-8') as f:
-        filepaths_and_text = [line.strip().split(split) for line in f]
-    return filepaths_and_text
-
-
-def get_hparams(init=True):
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
-                        help='JSON file for configuration')
-    parser.add_argument('-m', '--model', type=str, required=True,
-                        help='Model name')
-
-    args = parser.parse_args()
-    model_dir = os.path.join("./logs", args.model)
-
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-
-    config_path = args.config
-    config_save_path = os.path.join(model_dir, "config.json")
-    if init:
-        with open(config_path, "r") as f:
-            data = f.read()
-        with open(config_save_path, "w") as f:
-            f.write(data)
-    else:
-        with open(config_save_path, "r") as f:
-            data = f.read()
-    config = json.loads(data)
-
-    hparams = HParams(**config)
-    hparams.model_dir = model_dir
-    return hparams
 
 
-def get_hparams_from_dir(model_dir):
-    config_save_path = os.path.join(model_dir, "config.json")
-    with open(config_save_path, "r") as f:
-        data = f.read()
-    config = json.loads(data)
-
-    hparams = HParams(**config)
-    hparams.model_dir = model_dir
-    return hparams
-
 
 def get_hparams_from_file(config_path):
     with open(config_path, "r") as f:
msinference.py
CHANGED
@@ -130,17 +130,6 @@ bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)
 params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
 params = params_whole['net']
 
-
-# 'bert',
-# 'bert_encoder',
-# 'predictor',
-# 'decoder',
-# 'text_encoder',
-# 'predictor_encoder',
-# 'style_encoder',
-# 'text_aligner',
-# 'pitch_extractor'
-# --
 from collections import OrderedDict
 
 def _del_prefix(d):
@@ -149,7 +138,6 @@ def _del_prefix(d):
     for k, v in d.items():
         out[k[7:]] = v
     return out
-
 
 bert.load_state_dict(_del_prefix(params['bert']), strict=True)
 bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
@@ -216,23 +204,23 @@ def inference(text,
     for i in range(pred_aln_trg.size(0)):
         pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
         c_frame += int(pred_dur[i].data)
+
     en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
+    asr_new = torch.zeros_like(en)
+    asr_new[:, :, 0] = en[:, :, 0]
+    asr_new[:, :, 1:] = en[:, :, 0:-1]
+    en = asr_new
 
     F0_pred, N_pred = predictor.F0Ntrain(en, s)
 
     asr = (hidden_states @ pred_aln_trg.unsqueeze(0).to(device))
 
-
-
-
-
-    #
-
-    # every Hubert frame can be cloned from 1 to ~12 times and appended to the final array
-
-
-    F0_pred, N_pred = predictor.F0Ntrain(en, s)
-
+    asr_new = torch.zeros_like(asr)
+    asr_new[:, :, 0] = asr[:, :, 0]
+    asr_new[:, :, 1:] = asr[:, :, 0:-1]
+    asr = asr_new
+    # -
+
     x = decoder(asr=asr,
                 F0_curve=F0_pred,
                 N=N_pred,
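The asr_new blocks above delay both feature streams by one frame along time, duplicating frame 0 at the start. A self-contained check, plus an equivalent one-liner (the F.pad form is an assumption, not what the commit uses):

    import torch
    import torch.nn.functional as F

    x = torch.arange(12.).reshape(1, 2, 6)  # [batch, channels, frames]
    shifted = torch.zeros_like(x)
    shifted[:, :, 0] = x[:, :, 0]           # keep the first frame
    shifted[:, :, 1:] = x[:, :, :-1]        # shift the rest right by one
    assert torch.equal(shifted, F.pad(x, (1, 0), mode='replicate')[:, :, :-1])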
requirements.txt
CHANGED
@@ -17,3 +17,4 @@ audresample
 srt
 nltk
 phonemizer
+docx