2 voice deu demo VITS

- Modules/vits/models.py +47 -129
- demo.py +16 -4
- msinference.py +4 -1
Modules/vits/models.py CHANGED

@@ -1,22 +1,14 @@
 import math
 from dataclasses import dataclass
 from typing import Any, Optional, Tuple, Union
-
 import numpy as np
 import torch
 import torch.utils.checkpoint
 from torch import nn
-
 from transformers.activations import ACT2FN
-from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
-from transformers.integrations.fsdp import is_fsdp_managed_module
 from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
-from transformers.modeling_outputs import (
-    BaseModelOutput,
-    ModelOutput,
-)
+from transformers.modeling_outputs import BaseModelOutput, ModelOutput
 from transformers.modeling_utils import PreTrainedModel
-# ============================================== configuration
 from transformers.configuration_utils import PretrainedConfig
 
 class VitsConfig(PretrainedConfig):

@@ -234,7 +226,7 @@ class VitsConfig(PretrainedConfig):
         self.wavenet_kernel_size = wavenet_kernel_size
         self.wavenet_dilation_rate = wavenet_dilation_rate
         self.wavenet_dropout = wavenet_dropout
-        self.speaking_rate = speaking_rate
+        self.speaking_rate = speaking_rate  # reset during long-text inference for natural variation
         self.noise_scale = noise_scale
         self.noise_scale_duration = noise_scale_duration
         self.sampling_rate = sampling_rate

@@ -252,40 +244,6 @@ class VitsConfig(PretrainedConfig):
 
 # ============================ modeling
 
-
-@dataclass
-class VitsModelOutput(ModelOutput):
-    """
-    Describes the outputs for the VITS model, with potential hidden states and attentions.
-
-    Args:
-        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
-            The final audio waveform predicted by the model.
-        sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
-            The length in samples of each element in the `waveform` batch.
-        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
-            The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
-            GAN decoder model to obtain the final audio waveform.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    waveform: torch.FloatTensor = None
-    sequence_lengths: torch.FloatTensor = None
-    spectrogram: Optional[Tuple[torch.FloatTensor]] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-
-
 @dataclass
 class VitsTextEncoderOutput(ModelOutput):
     """

@@ -907,7 +865,7 @@ class VitsConvFlow(nn.Module):
 
 
 class VitsElementwiseAffine(nn.Module):
-    def __init__(self, config: VitsConfig):
+    def __init__(self, config):
         super().__init__()
         self.channels = config.depth_separable_channels
         self.translate = nn.Parameter(torch.zeros(self.channels, 1))

@@ -1094,12 +1052,12 @@ class VitsAttention(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.Tensor,
+        hidden_states,
         key_value_states: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         layer_head_mask: Optional[torch.Tensor] = None,
         output_attentions: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ):
         """Input shape: Batch x Time x Channel"""
 
         # if key_value_states are provided this layer is used as a cross-attention layer

@@ -1129,6 +1087,7 @@ class VitsAttention(nn.Module):
         )
 
         if self.window_size is not None:
+            # window_size == 4
             key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, src_len)
             relative_logits = torch.matmul(query_states, key_relative_embeddings.transpose(-2, -1))
             rel_pos_bias = self._relative_position_to_absolute_position(relative_logits)

@@ -1141,28 +1100,21 @@ class VitsAttention(nn.Module):
             )
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
+        # It is possible that the starting frames of this attention hold the choice of voice that places the generation into the male or female speaker for German.
+        # 1. It is plausible to have some pre-appended or post-appended frames (whose TTS is always male or female).
+
+        #
+        # --
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])  # this appears to always use the full length of the BERT hidden states
+        # --
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
-        if layer_head_mask is not None:
-            if layer_head_mask.size() != (self.num_heads,):
-                raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
-                    f" {layer_head_mask.size()}"
-                )
-            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-
-        if output_attentions:
-            # this operation is a bit awkward, but it's required to
-            # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to be reshaped
-            # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
-        else:
-            attn_weights_reshaped = None
-
         attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
 
         attn_output = torch.bmm(attn_probs, value_states)

@@ -1174,6 +1126,7 @@ class VitsAttention(nn.Module):
         )
 
         if self.window_size is not None:
+            # Entering here with self.window_size = 4
             value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, src_len)
             relative_weights = self._absolute_position_to_relative_position(attn_probs)
             rel_pos_bias = torch.matmul(relative_weights, value_relative_embeddings)

@@ -1188,7 +1141,7 @@ class VitsAttention(nn.Module):
 
         attn_output = self.out_proj(attn_output)
 
-        return attn_output, attn_weights_reshaped
+        return attn_output, None  # attn_weights_reshaped
 
     def _get_relative_embeddings(self, relative_embeddings, length):
         pad_length = max(length - (self.window_size + 1), 0)

@@ -1335,7 +1288,7 @@ class VitsEncoder(nn.Module):
 
         hidden_states = hidden_states * padding_mask
 
-        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
 
         for encoder_layer in self.layers:
             if output_hidden_states:

@@ -1345,25 +1298,14 @@ class VitsEncoder(nn.Module):
             dropout_probability = np.random.uniform(0, 1)
 
             skip_the_layer = self.training and (dropout_probability < self.layerdrop)
-
-
-
-
-
-
-
-
-                        # attention_mask,
-                        # output_attentions,
-                        # )
-            else:
-                layer_outputs = encoder_layer(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    padding_mask=padding_mask,
-                    output_attentions=output_attentions,
-                )
-            hidden_states = layer_outputs[0]
+
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                padding_mask=padding_mask,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
 
             if skip_the_layer:
                 layer_outputs = (None, None)

@@ -1395,7 +1337,7 @@ class VitsTextEncoder(nn.Module):
         super().__init__()
         self.config = config
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
-        self.encoder = VitsEncoder(config)
+        self.encoder = VitsEncoder(config)  # 6 layers of VitsAttention
        self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
 
     def get_input_embeddings(self):

@@ -1477,7 +1419,7 @@ class VitsModel(VitsPreTrainedModel):
     def __init__(self, config: VitsConfig):
         super().__init__(config)
         self.config = config
-        self.text_encoder = VitsTextEncoder(config)
+        self.text_encoder = VitsTextEncoder(config)  # has a VitsEncoder that includes 6 layers of VitsAttention
         self.flow = VitsResidualCouplingBlock(config)
         self.decoder = VitsHifiGan(config)
 

@@ -1502,14 +1444,14 @@ class VitsModel(VitsPreTrainedModel):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        speaker_id: Optional[int] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        labels: Optional[torch.FloatTensor] = None,
-    ) -> Union[Tuple[Any], VitsModelOutput]:
+        input_ids=None,
+        attention_mask=None,
+        speaker_id=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        labels=None,
+    ):
         r"""
         labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
             Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss

@@ -1583,7 +1525,8 @@ class VitsModel(VitsPreTrainedModel):
                 noise_scale=self.noise_scale_duration,
             )
         else:
-            log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
+            raise ValueError
+            # log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
 
         length_scale = 1.0 / self.speaking_rate
         duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)

@@ -1620,13 +1563,7 @@ class VitsModel(VitsPreTrainedModel):
             outputs = (waveform, sequence_lengths, spectrogram) + text_encoder_output[3:]
             return outputs
 
-        return VitsModelOutput(
-            waveform=waveform,
-            sequence_lengths=sequence_lengths,
-            spectrogram=spectrogram,
-            hidden_states=text_encoder_output.hidden_states,
-            attentions=text_encoder_output.attentions,
-        )
+        return waveform
 
 

@@ -1784,29 +1721,10 @@ class VitsTokenizer(PreTrainedTokenizer):
     def prepare_for_tokenization(
         self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
     ) -> Tuple[str, Dict[str, Any]]:
-        """
-        Performs any necessary transformations before tokenization.
-
-        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
-        `kwargs` at the end of the encoding process to be sure all the arguments have been used.
-
-        Args:
-            text (`str`):
-                The text to prepare.
-            is_split_into_words (`bool`, *optional*, defaults to `False`):
-                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
-                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
-                which it will tokenize.
-            normalize (`bool`, *optional*, defaults to `None`):
-                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
-                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
-                text consists only of lower-case characters.
-            kwargs (`Dict[str, Any]`, *optional*):
-                Keyword arguments to use for the tokenization.
-
-        Returns:
-            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
-        """
+        '''
+        Performs any necessary transformations before tokenization.
+        '''
         normalize = normalize if normalize is not None else self.normalize
 
         if normalize:
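The comments added in VitsAttention suggest that the earliest key positions of the text-encoder attention may carry the male/female voice choice of the German MMS checkpoint, and the TODO in demo.py proposes amplifying the attention weights of the first hidden states. The snippet below is a hypothetical sketch of that experiment, not part of the commit: boost_first_frames, n_frames and gain are made-up names and knobs, and the function would be applied to attn_weights right after the softmax in VitsAttention.forward.

import torch

def boost_first_frames(attn_weights: torch.Tensor, n_frames: int = 8, gain: float = 1.5) -> torch.Tensor:
    # attn_weights: (batch * num_heads, tgt_len, src_len); rows already sum to 1 after softmax
    boosted = attn_weights.clone()
    boosted[..., :n_frames] *= gain                      # emphasise the earliest (possibly voice-carrying) frames
    return boosted / boosted.sum(dim=-1, keepdim=True)   # renormalise each row back to a probability distribution

# toy check with the shape seen in the ___IN attn___ prints above, torch.Size([2, 927, 927])
w = torch.softmax(torch.randn(2, 927, 927), dim=-1)
print(boost_first_frames(w).sum(-1))  # ~1.0 everywhere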
demo.py CHANGED

@@ -2,10 +2,22 @@ import numpy as np
 import soundfile
 import msinference
 
-
-
-
-
+# Prepending the »Vom Prof. Friedrich ist noch eine ..« string at the beginning brings out the male voice in the deu MMS TTS
+# (if the rest of the string is much longer, the female voice sometimes still pronounces some words, e.g. <dass>).
+# TODO: amplify the attn weights of the first hidden states / of a certain voice.
+
+def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
+                   'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
+                   'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
+                   'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
+                   '»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
+                   'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
+                   'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
+                   'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
+                   'A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.'
+                   'DESCIPTION Bronzezeitlicher Zeremonialhut („Berliner Goldhut“), gefertigt aus einem Stück nahtlos getriebenem Goldblech und mit Kreisornamenten in Repousse-Technik verziert. Kalotte kegelförmig überhöht und mit breiter umlaufender Krempe. Krempe und Kalotte durch flaches Bronzeband verstärkt. An der Krempe außen tordierter Bronzedraht. Die Anordnung der Ornamentik auf Kalotte und Krempe des Zeremonialhutes wird als Darstellung eines Kalendersystems gedeutet, mit dem sich die Verschiebungen zwischen Sonnen- und Mondjahr berechnen und Mondfinsternisse voraussagen lassen.'
+                   'Vorderseite: L IVL AVR SVLP ANTONINVS [LP ligiert]. Panzerbüste des Uranius Antoninus mit Lorbeerkranz in der Brustansicht nach l., Pteryges des r. Armansatzes sind zur Angabe eines erhobenen Armes waagerecht dargestellt. Rückseite: CONSERVATO-R AVG. Der Stein des Baal von Emesa auf einem Viergespann (quadriga) nach l. Auf dem Stein, der von zwei Schirmen gerahmt ist, ist das Relief eines Adlers zu sehen. Kommentar: Baldus (1971) 84 ff. 87 zur Frage der Münzstätte, ebd. 128 ff. zur Pterygesanhebung (Andeutung eines erhobenen Armes), die als alexanderhafter Gestus gilt. - Uranius Antoninus wurde im Sommer 253 n. Chr. im syrischen Emesa zum Kaiser erhoben und bewährte sich bald darauf bei der erfolgreichen Abwehr eines Einfalls der Sasaniden. Uranius Antoninus stammte möglicherweise aus der Familie der Iulia Domna, war Priester des Baals von Emesa, und ist mit dem literarisch überlieferten Sampsigeramus identisch, der als Organisator des Widerstandes gegen die Sasaniden in der Region belegt ist. Nach 254 n. Chr. fehlen Informationen über Uranius Antoninus, möglicherweise trat er nach Bereinigung der Notsituation hinter den Kaiser Valerianus zurück. Zu diesem Stück wurden 2017 im Zuge der Ausstellung Syria Antiqua zwei vergrößerte Reproduktionen (3D-Ausdrucke) erstellt, die bei den Galvanos in Schrank 81/121 liegen. Literatur: A. von Sallet, ZfN 17, 1890, 241 f. Taf. 4,9 (dieses Stück); H. R. Baldus, Uranius Antoninus (1971) 198 Nr. 85 Taf. 7,85; 12,85 (dieses Stück, mit Lit., 253/254 n. Chr. bzw. Stempelgruppe VIII ca. Dez. 253-Anfang 254 n. Chr.); RIC IV-3 Nr. 2 c; RPC IX Nr. 1940,2 Taf. 131 (dieses Stück).',
+              voice='deu',  # 'af_ZA_google-nwu_1919', 'serbian', 'en_US/vctk_low#p276', 'isl',
+              speed=1.14,  # only for MMS TTS
               affect = True # False = higher clarity sound for partially sight
               ):
     '''returns 24kHZ np.array TTS
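A minimal usage sketch of the voice-priming trick noted above, assuming demo.py is importable and tts_entry keeps the signature shown in the diff; the primer sentence and output filename are arbitrary examples.

import soundfile
from demo import tts_entry  # demo.py as modified in this commit

# short primer that the commit notes pull the deu MMS checkpoint towards the male voice
primer = ('»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, '
          'schrieb das Literarische Conversations-Blatt. ')
target = 'Der Berliner Goldhut ist ein bronzezeitlicher Zeremonialhut.'

wav = tts_entry(text=primer + target, voice='deu', speed=1.14)  # 24 kHz mono np.array
soundfile.write('deu_male_demo.wav', wav, 24000)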
msinference.py CHANGED

@@ -379,9 +379,12 @@ def foreign(text=None,  # list of text
         inputs = tokenizer(_t, return_tensors="pt")  # input_ids / attention_mask
 
         with torch.no_grad():
+            # -- reset speed
+            net_g.speaking_rate = speed
+            # --
             x.append(
                 net_g(input_ids=inputs.input_ids.to(device),
-                      attention_mask=inputs.attention_mask.to(device))
+                      attention_mask=inputs.attention_mask.to(device))
             )
             print(x[-1].shape)
             print(f'{speed=}\n\n\n\n_______________________________ {_t}')
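The net_g.speaking_rate = speed reset above is what the new comment in VitsConfig refers to: over long texts the rate can be re-set before every chunk for more natural variation. The sketch below illustrates that idea under stated assumptions; it is not code from this commit, net_g and tokenizer stand for the already-loaded MMS VITS model and tokenizer from msinference.py, and synth_long, base_speed and jitter are hypothetical names.

import numpy as np
import torch

def synth_long(chunks, net_g, tokenizer, device='cpu', base_speed=1.14, jitter=0.05):
    """Synthesize a list of text chunks, nudging the speaking rate per chunk."""
    pieces = []
    for i, chunk in enumerate(chunks):
        inputs = tokenizer(chunk, return_tensors='pt')
        with torch.no_grad():
            # reset the rate before every forward pass, as foreign() does above
            net_g.speaking_rate = base_speed + jitter * np.sin(i)
            wav = net_g(input_ids=inputs.input_ids.to(device),
                        attention_mask=inputs.attention_mask.to(device))  # modified forward returns the raw waveform
        pieces.append(wav.squeeze().cpu().numpy())
    return np.concatenate(pieces)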