Dionyssos committed
Commit a93bf0d · 1 Parent(s): d353343

HiFiGAN v2.0

Modules/hifigan.py CHANGED
@@ -142,7 +142,7 @@ class SineGen(torch.nn.Module):
 
         fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))  # [1, 145200, 9]
 
-        sine_waves = self._f02sine(fn) * .007  # very important effect DEFAULT=0.1 very sensitive to speaker
+        sine_waves = self._f02sine(fn) * .01  # .007 # very important effect DEFAULT=0.1 very sensitive to speaker CHECK CONTINUITY FROM SEGMENTS IN AUDIOBOOK
 
         uv = (f0 > self.voiced_threshold).type(torch.float32)
 
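
The only functional change in hifigan.py is the amplitude applied to the harmonic sine stack. A minimal standalone sketch of that interaction (toy values: a constant 220 Hz pitch track, an assumed 24 kHz sample rate, and plain torch.sin standing in for the repo's _f02sine):

import math
import torch

f0 = torch.full((1, 400, 1), 220.0)                 # toy constant pitch track [b, T, 1]
harmonic_num = 8
mult = torch.arange(1, harmonic_num + 2).float()    # harmonic multiples 1..9
fn = f0 * mult                                      # [1, 400, 9] harmonic frequencies

sr = 24000                                          # assumed sample rate
phase = 2 * math.pi * torch.cumsum(fn / sr, dim=1)  # integrate frequency -> phase
for scale in (0.1, 0.01, 0.007):                    # DEFAULT=0.1 vs the values tried in this commit
    sine_waves = torch.sin(phase) * scale           # smaller scale => quieter harmonic excitation
    print(scale, round(sine_waves.abs().max().item(), 3))

The scalar only rescales the voiced excitation fed to the HiFiGAN decoder, which is presumably why the comment flags it as speaker-sensitive.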
 
Modules/vits/attentions.py CHANGED
@@ -18,10 +18,10 @@ class Encoder(nn.Module):
         self.n_heads = n_heads
         self.n_layers = n_layers
         self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
+
         self.window_size = window_size
 
-        self.drop = nn.Dropout(p_dropout)
+
         self.attn_layers = nn.ModuleList()
         self.norm_layers_1 = nn.ModuleList()
         self.ffn_layers = nn.ModuleList()
@@ -37,11 +37,11 @@ class Encoder(nn.Module):
         x = x * x_mask
         for i in range(self.n_layers):
             y = self.attn_layers[i](x, x, attn_mask)
-            y = self.drop(y)
+
             x = self.norm_layers_1[i](x + y)
 
             y = self.ffn_layers[i](x, x_mask)
-            y = self.drop(y)
+
             x = self.norm_layers_2[i](x + y)
         x = x * x_mask
         return x
@@ -58,7 +58,7 @@ class MultiHeadAttention(nn.Module):
         self.channels = channels
         self.out_channels = out_channels
         self.n_heads = n_heads
-        self.p_dropout = p_dropout
+
         self.window_size = window_size
         self.heads_share = heads_share
         self.block_length = block_length
@@ -71,7 +71,7 @@ class MultiHeadAttention(nn.Module):
         self.conv_k = nn.Conv1d(channels, channels, 1)
         self.conv_v = nn.Conv1d(channels, channels, 1)
         self.conv_o = nn.Conv1d(channels, out_channels, 1)
-        self.drop = nn.Dropout(p_dropout)
+
 
         if window_size is not None:
             n_heads_rel = 1 if heads_share else n_heads
@@ -83,17 +83,16 @@ class MultiHeadAttention(nn.Module):
         nn.init.xavier_uniform_(self.conv_k.weight)
         nn.init.xavier_uniform_(self.conv_v.weight)
         if proximal_init:
-            with torch.no_grad():
-                self.conv_k.weight.copy_(self.conv_q.weight)
-                self.conv_k.bias.copy_(self.conv_q.bias)
+            raise ValueError
+
 
     def forward(self, x, c, attn_mask=None):
         q = self.conv_q(x)
         k = self.conv_k(c)
         v = self.conv_v(c)
 
-        x, self.attn = self.attention(q, k, v, mask=attn_mask)
-
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)  # x.shape=torch.Size([1, 192, 1499])
+
         x = self.conv_o(x)
         return x
 
@@ -112,18 +111,21 @@ class MultiHeadAttention(nn.Module):
             scores_local = self._relative_position_to_absolute_position(rel_logits)
             scores = scores + scores_local
         if self.proximal_bias:
-            assert t_s == t_t, "Proximal bias is only available for self-attention."
-            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+            raise ValueError
+            # assert t_s == t_t, "Proximal bias is only available for self-attention."
+            # scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
         if mask is not None:
+            # mask is ALL ONES !!!!
             scores = scores.masked_fill(mask == 0, -1e4)
             if self.block_length is not None:
-                assert t_s == t_t, "Local attention is only available for self-attention."
-                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
-                scores = scores.masked_fill(block_mask == 0, -1e4)
+                raise ValueError
+                # assert t_s == t_t, "Local attention is only available for self-attention."
+                # block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
+                # scores = scores.masked_fill(block_mask == 0, -1e4)
         p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
-        p_attn = self.drop(p_attn)
+
         output = torch.matmul(p_attn, value)
-        if self.window_size is not None:
+        if self.window_size is not None:  # self.window_size=4
             relative_weights = self._absolute_position_to_relative_position(p_attn)
             value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
             output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
@@ -155,11 +157,19 @@ class MultiHeadAttention(nn.Module):
         slice_start_position = max((self.window_size + 1) - length, 0)
         slice_end_position = slice_start_position + 2 * length - 1
         if pad_length > 0:
+            # --
+            # AFTER = torch.Size([1, 2997, 96]) relative_embeddings.shape = torch.Size([1, 9, 96]) pad_length = 1494
+            # --
             padded_relative_embeddings = F.pad(
                 relative_embeddings,
                 commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
         else:
-            padded_relative_embeddings = relative_embeddings
+            raise ValueError
+            # padded_relative_embeddings = relative_embeddings
+            # --
+            # print(f'{slice_start_position=} {slice_end_position=} {padded_relative_embeddings.shape=}')
+            # slice_start_position=0 slice_end_position=2997 padded_relative_embeddings.shape=torch.Size([1, 2997, 96])
+            # --
         used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
         return used_relative_embeddings
 
@@ -194,18 +204,6 @@ class MultiHeadAttention(nn.Module):
         x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
         return x_final
 
-    def _attention_bias_proximal(self, length):
-        """Bias for self-attention to encourage attention to close positions.
-        Args:
-          length: an integer scalar.
-        Returns:
-          a Tensor with shape [1, 1, length, length]
-        """
-        r = torch.arange(length, dtype=torch.float32)
-        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
-        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
-
-
 class FFN(nn.Module):
     def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
         super().__init__()
@@ -213,7 +211,7 @@ class FFN(nn.Module):
         self.out_channels = out_channels
         self.filter_channels = filter_channels
         self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
+
         self.activation = activation
         self.causal = causal
 
@@ -224,7 +222,7 @@ class FFN(nn.Module):
 
         self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
         self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
-        self.drop = nn.Dropout(p_dropout)
+
 
     def forward(self, x, x_mask):
         x = self.conv_1(self.padding(x * x_mask))
@@ -232,7 +230,7 @@ class FFN(nn.Module):
             x = x * torch.sigmoid(1.702 * x)
         else:
             x = torch.relu(x)
-            x = self.drop(x)
+
         x = self.conv_2(self.padding(x * x_mask))
         return x * x_mask
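
Everything dropped in attentions.py is p_dropout bookkeeping and self.drop calls (the raise ValueError lines guard branches the inference path is not expected to reach). For an inference-only checkout that removal should be behaviour-preserving, since nn.Dropout is already an identity once the module is in eval mode; a quick check of that assumption:

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.1)
drop.eval()                        # inference mode
x = torch.randn(2, 192, 10)
assert torch.equal(drop(x), x)     # eval-mode dropout passes the tensor through unchanged
print('dropout is a no-op at eval time')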
 
Modules/vits/commons.py CHANGED
@@ -19,20 +19,6 @@ def intersperse(lst, item):
     result[1::2] = lst
     return result
 
-
-def kl_divergence(m_p, logs_p, m_q, logs_q):
-    """KL(P||Q)"""
-    kl = (logs_q - logs_p) - 0.5
-    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
-    return kl
-
-
-def rand_gumbel(shape):
-    """Sample from the Gumbel distribution, protect from overflows."""
-    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
-    return -torch.log(-torch.log(uniform_samples))
-
-
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
     n_channels_int = n_channels[0]
Modules/vits/models.py CHANGED
@@ -1,4 +1,3 @@
-import copy
 import math
 import torch
 from torch import nn
@@ -24,7 +23,6 @@ class StochasticDurationPredictor(nn.Module):
         self.n_flows = n_flows
         self.gin_channels = gin_channels
 
-        self.log_flow = modules.Log()
         self.flows = nn.ModuleList()
         self.flows.append(modules.ElementwiseAffine(2))
         for i in range(n_flows):
@@ -46,7 +44,12 @@ class StochasticDurationPredictor(nn.Module):
         if gin_channels != 0:
             self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
 
-    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
+    def forward(self,
+                x,
+                x_mask,
+                g=None,
+                reverse=False,
+                noise_scale=1.0):
         x = torch.detach(x)
         x = self.pre(x)
         if g is not None:
@@ -60,10 +63,13 @@ class StochasticDurationPredictor(nn.Module):
         else:
             flows = list(reversed(self.flows))
             flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
-            z = torch.zeros(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)  #* noise_scale
+
+            # noise_scale = 0.0 => Fast
+            # noise_scale = 1.0 => Slow
+            z = torch.rand(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * .44  #* noise_scale  # [1, 2, 2604=letters]
             for flow in flows:
                 z = flow(z, x_mask, g=x, reverse=reverse)
-            z0, z1 = torch.split(z, [1, 1], 1)
+            z0, _ = torch.split(z, [1, 1], 1)
             logw = z0
             return logw
 
@@ -89,7 +95,7 @@ class TextEncoder(nn.Module):
 
         self.emb = nn.Embedding(n_vocab, hidden_channels)
         nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
-
+
         self.encoder = attentions.Encoder(
             hidden_channels,
             filter_channels,
@@ -98,6 +104,7 @@ class TextEncoder(nn.Module):
             kernel_size,
             p_dropout)
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
 
     def forward(self, x, x_lengths):
         x = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
@@ -150,7 +157,7 @@ class Generator(torch.nn.Module):
         self.num_upsamples = len(upsample_rates)
         self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
         resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
-
+        print(f'_____________________________________{resblock=}_________')
         self.ups = nn.ModuleList()
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
             self.ups.append(weight_norm(
@@ -279,7 +286,8 @@ class SynthesizerTrn(nn.Module):
         if self.use_sdp:
             logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
         else:
-            logw = self.dp(x, x_mask, g=g)
+            raise ValueError
+
         w = torch.exp(logw) * x_mask * length_scale
         w_ceil = torch.ceil(w)
         y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
@@ -290,7 +298,7 @@ class SynthesizerTrn(nn.Module):
         m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
         logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
 
-        z_p = m_p + torch.zeros_like(m_p) * torch.exp(logs_p)  #* noise_scale
+        z_p = m_p + torch.rand_like(m_p) * torch.exp(logs_p)  #* noise_scale
         z = self.flow(z_p, y_mask, g=g, reverse=True)
         o = self.dec((z * y_mask)[:,:,:max_len], g=g)
         return o, attn, y_mask, (z, z_p, m_p, logs_p)
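
The duration predictor now seeds the reversed flow with torch.rand(...) * .44 instead of zeros, and the inline comments note that this latent magnitude trades speaking rate for pacing. A toy illustration of the direction of that effect (assumption: the flow is collapsed into a constant shift of logw here; in the real model the latent passes through the ConvFlow stack first):

import torch

logw = torch.tensor([[0.2, 0.9, 1.4, 0.5]])   # hypothetical per-token log-durations
length_scale = 1.0
for z in (0.0, 0.44, 1.0):                    # stands in for the rand()*scale latent
    w = torch.exp(logw + z) * length_scale    # larger latent -> larger durations
    w_ceil = torch.ceil(w)                    # same rounding as SynthesizerTrn.infer
    print(z, int(w_ceil.sum().item()), 'frames')
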
Modules/vits/modules.py CHANGED
@@ -229,42 +229,7 @@ class ResBlock1(torch.nn.Module):
             remove_weight_norm(l)
 
 
-class ResBlock2(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
-        super(ResBlock2, self).__init__()
-        self.convs = nn.ModuleList([
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
-                               padding=get_padding(kernel_size, dilation[0]))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
-                               padding=get_padding(kernel_size, dilation[1])))
-        ])
-        self.convs.apply(init_weights)
-
-    def forward(self, x, x_mask=None):
-        for c in self.convs:
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c(xt)
-            x = xt + x
-        if x_mask is not None:
-            x = x * x_mask
-        return x
 
-    def remove_weight_norm(self):
-        for l in self.convs:
-            remove_weight_norm(l)
-
-
-class Log(nn.Module):
-    def forward(self, x, x_mask, reverse=False, **kwargs):
-        if not reverse:
-            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
-            logdet = torch.sum(-y, [1, 2])
-            return y, logdet
-        else:
-            x = torch.exp(x) * x_mask
-            return x
 
 
 class Flip(nn.Module):
@@ -373,18 +338,16 @@ class ConvFlow(nn.Module):
         unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
         unnormalized_derivatives = h[..., 2 * self.num_bins:]
 
-        x1, logabsdet = piecewise_rational_quadratic_transform(x1,
+        x1, _ = piecewise_rational_quadratic_transform(x1,
             unnormalized_widths,
             unnormalized_heights,
            unnormalized_derivatives,
            inverse=reverse,
            tails='linear',
            tail_bound=self.tail_bound
-        )
-
-        x = torch.cat([x0, x1], 1) * x_mask
-        logdet = torch.sum(logabsdet * x_mask, [1,2])
-        if not reverse:
-            return x, logdet
-        else:
-            return x
+        )  # if x1=x0 sounds like fast and syllables have no time to finish via rand on duration however what if duration is set ones?
+        # x1 = x0
+        # x0.shape = x1.shape = torch.Size([1, 1, 1499])
+
+        x = torch.cat([x0, x1], 1) * x_mask
+        return x
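
ConvFlow now returns only the transformed sample and discards logabsdet/logdet, which is reasonable for inference-only use since the Jacobian term only enters the training likelihood. A compressed sketch of the coupling-layer pattern the forward keeps (torch.tanh stands in for the rational-quadratic spline; shapes are toy values):

import torch

x = torch.randn(1, 2, 8)                 # [b, channels=2, t]
x_mask = torch.ones(1, 1, 8)
x0, x1 = torch.split(x, [1, 1], dim=1)   # one half conditions, the other is transformed
x1 = torch.tanh(x1)                      # stand-in for piecewise_rational_quadratic_transform
x = torch.cat([x0, x1], dim=1) * x_mask  # reassemble; no logdet returned
print(x.shape)                           # torch.Size([1, 2, 8])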
 
 
Modules/vits/transforms.py CHANGED
@@ -21,8 +21,9 @@ def piecewise_rational_quadratic_transform(inputs,
                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
 
     if tails is None:
-        spline_fn = rational_quadratic_spline
-        spline_kwargs = {}
+        raise ValueError
+        # spline_fn = rational_quadratic_spline
+        # spline_kwargs = {}
     else:
         spline_fn = unconstrained_rational_quadratic_spline
         spline_kwargs = {
@@ -135,7 +136,8 @@ def rational_quadratic_spline(inputs,
     if inverse:
         bin_idx = searchsorted(cumheights, inputs)[..., None]
     else:
-        bin_idx = searchsorted(cumwidths, inputs)[..., None]
+        raise ValueError
+        # bin_idx = searchsorted(cumwidths, inputs)[..., None]
 
     input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
     input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
@@ -176,18 +178,4 @@ def rational_quadratic_spline(inputs,
 
         return outputs, -logabsdet
     else:
-        theta = (inputs - input_cumwidths) / input_bin_widths
-        theta_one_minus_theta = theta * (1 - theta)
-
-        numerator = input_heights * (input_delta * theta.pow(2)
-                                     + input_derivatives * theta_one_minus_theta)
-        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
-                                     * theta_one_minus_theta)
-        outputs = input_cumheights + numerator / denominator
-
-        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
-                                                     + 2 * input_delta * theta_one_minus_theta
-                                                     + input_derivatives * (1 - theta).pow(2))
-        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
-
-        return outputs, logabsdet
+        raise ValueError
Modules/vits/utils.py CHANGED
@@ -43,125 +43,8 @@ def load_checkpoint(checkpoint_path, model, optimizer=None):
     return model, optimizer, learning_rate, iteration
 
 
-def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
-    logger.info("Saving model and optimizer state at iteration {} to {}".format(
-        iteration, checkpoint_path))
-    if hasattr(model, 'module'):
-        state_dict = model.module.state_dict()
-    else:
-        state_dict = model.state_dict()
-    torch.save({'model': state_dict,
-                'iteration': iteration,
-                'optimizer': optimizer.state_dict(),
-                'learning_rate': learning_rate}, checkpoint_path)
-
-
-def plot_spectrogram_to_numpy(spectrogram):
-    global MATPLOTLIB_FLAG
-    if not MATPLOTLIB_FLAG:
-        import matplotlib
-        matplotlib.use("Agg")
-        MATPLOTLIB_FLAG = True
-        mpl_logger = logging.getLogger('matplotlib')
-        mpl_logger.setLevel(logging.WARNING)
-    import matplotlib.pylab as plt
-    import numpy as np
-
-    fig, ax = plt.subplots(figsize=(10,2))
-    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
-                   interpolation='none')
-    plt.colorbar(im, ax=ax)
-    plt.xlabel("Frames")
-    plt.ylabel("Channels")
-    plt.tight_layout()
-
-    fig.canvas.draw()
-    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
-    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-    plt.close()
-    return data
-
-
-def plot_alignment_to_numpy(alignment, info=None):
-    global MATPLOTLIB_FLAG
-    if not MATPLOTLIB_FLAG:
-        import matplotlib
-        matplotlib.use("Agg")
-        MATPLOTLIB_FLAG = True
-        mpl_logger = logging.getLogger('matplotlib')
-        mpl_logger.setLevel(logging.WARNING)
-    import matplotlib.pylab as plt
-    import numpy as np
-
-    fig, ax = plt.subplots(figsize=(6, 4))
-    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
-                   interpolation='none')
-    fig.colorbar(im, ax=ax)
-    xlabel = 'Decoder timestep'
-    if info is not None:
-        xlabel += '\n\n' + info
-    plt.xlabel(xlabel)
-    plt.ylabel('Encoder timestep')
-    plt.tight_layout()
-
-    fig.canvas.draw()
-    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
-    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
-    plt.close()
-    return data
-
-
-def load_wav_to_torch(full_path):
-    sampling_rate, data = read(full_path)
-    return torch.FloatTensor(data.astype(np.float32)), sampling_rate
-
-
-def load_filepaths_and_text(filename, split="|"):
-    with open(filename, encoding='utf-8') as f:
-        filepaths_and_text = [line.strip().split(split) for line in f]
-    return filepaths_and_text
-
-
-def get_hparams(init=True):
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
-                        help='JSON file for configuration')
-    parser.add_argument('-m', '--model', type=str, required=True,
-                        help='Model name')
-
-    args = parser.parse_args()
-    model_dir = os.path.join("./logs", args.model)
-
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-
-    config_path = args.config
-    config_save_path = os.path.join(model_dir, "config.json")
-    if init:
-        with open(config_path, "r") as f:
-            data = f.read()
-        with open(config_save_path, "w") as f:
-            f.write(data)
-    else:
-        with open(config_save_path, "r") as f:
-            data = f.read()
-    config = json.loads(data)
-
-    hparams = HParams(**config)
-    hparams.model_dir = model_dir
-    return hparams
 
 
-def get_hparams_from_dir(model_dir):
-    config_save_path = os.path.join(model_dir, "config.json")
-    with open(config_save_path, "r") as f:
-        data = f.read()
-    config = json.loads(data)
-
-    hparams = HParams(**config)
-    hparams.model_dir = model_dir
-    return hparams
-
 
 def get_hparams_from_file(config_path):
     with open(config_path, "r") as f:
msinference.py CHANGED
@@ -130,17 +130,6 @@ bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)
 params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
 params = params_whole['net']
 
-
-# 'bert',
-# 'bert_encoder',
-# 'predictor',
-# 'decoder',
-# 'text_encoder',
-# 'predictor_encoder',
-# 'style_encoder',
-# 'text_aligner',
-# 'pitch_extractor'
-# --
 from collections import OrderedDict
 
 def _del_prefix(d):
@@ -149,7 +138,6 @@ def _del_prefix(d):
     for k, v in d.items():
         out[k[7:]] = v
     return out
-
 
 bert.load_state_dict( _del_prefix(params['bert']), strict=True)
 bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
@@ -216,23 +204,23 @@ def inference(text,
     for i in range(pred_aln_trg.size(0)):
         pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
         c_frame += int(pred_dur[i].data)
+
     en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
+    asr_new = torch.zeros_like(en)
+    asr_new[:, :, 0] = en[:, :, 0]
+    asr_new[:, :, 1:] = en[:, :, 0:-1]
+    en = asr_new
 
     F0_pred, N_pred = predictor.F0Ntrain(en, s)
 
     asr = (hidden_states @ pred_aln_trg.unsqueeze(0).to(device))
 
-    # -- END DURATION
-
-    # [bs, 640, 198]
-
-    # replicated Hubert frames for duration-of-each-frame to elast [bs, 640, 130] -> [bs, 640, 198]
-
-    # every Hubert frame can be cloned from 1 to ~12 times and appended to the final array
-
-
-    F0_pred, N_pred = predictor.F0Ntrain(en, s)
-
+    asr_new = torch.zeros_like(asr)
+    asr_new[:, :, 0] = asr[:, :, 0]
+    asr_new[:, :, 1:] = asr[:, :, 0:-1]
+    asr = asr_new
+    # -
+
     x = decoder(asr=asr,
                 F0_curve=F0_pred,
                 N=N_pred,
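
The new asr_new blocks delay the duration-aligned features by one frame before F0Ntrain and the decoder: frame 0 is duplicated and the final frame is dropped. On a toy tensor the shift looks like this:

import torch

asr = torch.arange(12.).reshape(1, 2, 6)   # toy [bs=1, channels=2, frames=6] features
asr_new = torch.zeros_like(asr)
asr_new[:, :, 0] = asr[:, :, 0]            # keep the first frame
asr_new[:, :, 1:] = asr[:, :, 0:-1]        # everything else shifted right by one frame
print(asr[0, 0])                           # tensor([0., 1., 2., 3., 4., 5.])
print(asr_new[0, 0])                       # tensor([0., 0., 1., 2., 3., 4.])
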
requirements.txt CHANGED
@@ -17,3 +17,4 @@ audresample
 srt
 nltk
 phonemizer
+docx