Dionyssos committed on
Commit d8edfa5 · 1 Parent(s): 477195e

voice deu demo VITS

Files changed (3)
  1. Modules/vits/models.py +47 -129
  2. demo.py +16 -4
  3. msinference.py +4 -1
Modules/vits/models.py CHANGED
@@ -1,22 +1,14 @@
1
  import math
2
  from dataclasses import dataclass
3
  from typing import Any, Optional, Tuple, Union
4
-
5
  import numpy as np
6
  import torch
7
  import torch.utils.checkpoint
8
  from torch import nn
9
-
10
  from transformers.activations import ACT2FN
11
- from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
12
- from transformers.integrations.fsdp import is_fsdp_managed_module
13
  from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
14
- from transformers.modeling_outputs import (
15
- BaseModelOutput,
16
- ModelOutput,
17
- )
18
  from transformers.modeling_utils import PreTrainedModel
19
- # ============================================== configuration
20
  from transformers.configuration_utils import PretrainedConfig
21
 
22
  class VitsConfig(PretrainedConfig):
@@ -234,7 +226,7 @@ class VitsConfig(PretrainedConfig):
234
  self.wavenet_kernel_size = wavenet_kernel_size
235
  self.wavenet_dilation_rate = wavenet_dilation_rate
236
  self.wavenet_dropout = wavenet_dropout
237
- self.speaking_rate = speaking_rate
238
  self.noise_scale = noise_scale
239
  self.noise_scale_duration = noise_scale_duration
240
  self.sampling_rate = sampling_rate
@@ -252,40 +244,6 @@ class VitsConfig(PretrainedConfig):
252
 
253
  # ============================ modeling
254
 
255
-
256
- @dataclass
257
- class VitsModelOutput(ModelOutput):
258
- """
259
- Describes the outputs for the VITS model, with potential hidden states and attentions.
260
-
261
- Args:
262
- waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
263
- The final audio waveform predicted by the model.
264
- sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
265
- The length in samples of each element in the `waveform` batch.
266
- spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
267
- The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
268
- GAN decoder model to obtain the final audio waveform.
269
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
270
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
271
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
272
-
273
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
274
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
275
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
276
- sequence_length)`.
277
-
278
- Attention weights after the attention softmax, used to compute the weighted average in the self-attention
279
- heads.
280
- """
281
-
282
- waveform: torch.FloatTensor = None
283
- sequence_lengths: torch.FloatTensor = None
284
- spectrogram: Optional[Tuple[torch.FloatTensor]] = None
285
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
286
- attentions: Optional[Tuple[torch.FloatTensor]] = None
287
-
288
-
289
  @dataclass
290
  class VitsTextEncoderOutput(ModelOutput):
291
  """
@@ -907,7 +865,7 @@ class VitsConvFlow(nn.Module):
907
 
908
 
909
  class VitsElementwiseAffine(nn.Module):
910
- def __init__(self, config: VitsConfig):
911
  super().__init__()
912
  self.channels = config.depth_separable_channels
913
  self.translate = nn.Parameter(torch.zeros(self.channels, 1))
@@ -1094,12 +1052,12 @@ class VitsAttention(nn.Module):
1094
 
1095
  def forward(
1096
  self,
1097
- hidden_states: torch.Tensor,
1098
  key_value_states: Optional[torch.Tensor] = None,
1099
  attention_mask: Optional[torch.Tensor] = None,
1100
  layer_head_mask: Optional[torch.Tensor] = None,
1101
  output_attentions: bool = False,
1102
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
1103
  """Input shape: Batch x Time x Channel"""
1104
 
1105
  # if key_value_states are provided this layer is used as a cross-attention layer
@@ -1129,6 +1087,7 @@ class VitsAttention(nn.Module):
1129
  )
1130
 
1131
  if self.window_size is not None:
 
1132
  key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, src_len)
1133
  relative_logits = torch.matmul(query_states, key_relative_embeddings.transpose(-2, -1))
1134
  rel_pos_bias = self._relative_position_to_absolute_position(relative_logits)
@@ -1141,28 +1100,21 @@ class VitsAttention(nn.Module):
1141
  )
1142
  attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
1143
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
1144
-
1145
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)
1146
 
1147
- if layer_head_mask is not None:
1148
- if layer_head_mask.size() != (self.num_heads,):
1149
- raise ValueError(
1150
- f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
1151
- f" {layer_head_mask.size()}"
1152
- )
1153
- attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
1154
- attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
1155
-
1156
- if output_attentions:
1157
- # this operation is a bit awkward, but it's required to
1158
- # make sure that attn_weights keeps its gradient.
1159
- # In order to do so, attn_weights have to be reshaped
1160
- # twice and have to be reused in the following
1161
- attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
1162
- attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
1163
- else:
1164
- attn_weights_reshaped = None
1165
-
1166
  attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
1167
 
1168
  attn_output = torch.bmm(attn_probs, value_states)
@@ -1174,6 +1126,7 @@ class VitsAttention(nn.Module):
1174
  )
1175
 
1176
  if self.window_size is not None:
 
1177
  value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, src_len)
1178
  relative_weights = self._absolute_position_to_relative_position(attn_probs)
1179
  rel_pos_bias = torch.matmul(relative_weights, value_relative_embeddings)
@@ -1188,7 +1141,7 @@ class VitsAttention(nn.Module):
1188
 
1189
  attn_output = self.out_proj(attn_output)
1190
 
1191
- return attn_output, attn_weights_reshaped
1192
 
1193
  def _get_relative_embeddings(self, relative_embeddings, length):
1194
  pad_length = max(length - (self.window_size + 1), 0)
@@ -1335,7 +1288,7 @@ class VitsEncoder(nn.Module):
1335
 
1336
  hidden_states = hidden_states * padding_mask
1337
 
1338
- synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
1339
 
1340
  for encoder_layer in self.layers:
1341
  if output_hidden_states:
@@ -1345,25 +1298,14 @@ class VitsEncoder(nn.Module):
1345
  dropout_probability = np.random.uniform(0, 1)
1346
 
1347
  skip_the_layer = self.training and (dropout_probability < self.layerdrop)
1348
- if not skip_the_layer or synced_gpus:
1349
- # under fsdp or deepspeed zero3 all gpus must run in sync
1350
- if self.gradient_checkpointing and self.training:
1351
- raise ValueError
1352
- # layer_outputs = self._gradient_checkpointing_func(
1353
- # encoder_layer.__call__,
1354
- # hidden_states,
1355
- # padding_mask,
1356
- # attention_mask,
1357
- # output_attentions,
1358
- # )
1359
- else:
1360
- layer_outputs = encoder_layer(
1361
- hidden_states,
1362
- attention_mask=attention_mask,
1363
- padding_mask=padding_mask,
1364
- output_attentions=output_attentions,
1365
- )
1366
- hidden_states = layer_outputs[0]
1367
 
1368
  if skip_the_layer:
1369
  layer_outputs = (None, None)
@@ -1395,7 +1337,7 @@ class VitsTextEncoder(nn.Module):
1395
  super().__init__()
1396
  self.config = config
1397
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
1398
- self.encoder = VitsEncoder(config)
1399
  self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
1400
 
1401
  def get_input_embeddings(self):
@@ -1477,7 +1419,7 @@ class VitsModel(VitsPreTrainedModel):
1477
  def __init__(self, config: VitsConfig):
1478
  super().__init__(config)
1479
  self.config = config
1480
- self.text_encoder = VitsTextEncoder(config)
1481
  self.flow = VitsResidualCouplingBlock(config)
1482
  self.decoder = VitsHifiGan(config)
1483
 
@@ -1502,14 +1444,14 @@ class VitsModel(VitsPreTrainedModel):
1502
 
1503
  def forward(
1504
  self,
1505
- input_ids: Optional[torch.Tensor] = None,
1506
- attention_mask: Optional[torch.Tensor] = None,
1507
- speaker_id: Optional[int] = None,
1508
- output_attentions: Optional[bool] = None,
1509
- output_hidden_states: Optional[bool] = None,
1510
- return_dict: Optional[bool] = None,
1511
- labels: Optional[torch.FloatTensor] = None,
1512
- ) -> Union[Tuple[Any], VitsModelOutput]:
1513
  r"""
1514
  labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
1515
  Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
@@ -1583,7 +1525,8 @@ class VitsModel(VitsPreTrainedModel):
1583
  noise_scale=self.noise_scale_duration,
1584
  )
1585
  else:
1586
- log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
 
1587
 
1588
  length_scale = 1.0 / self.speaking_rate
1589
  duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
@@ -1620,13 +1563,7 @@ class VitsModel(VitsPreTrainedModel):
1620
  outputs = (waveform, sequence_lengths, spectrogram) + text_encoder_output[3:]
1621
  return outputs
1622
 
1623
- return VitsModelOutput(
1624
- waveform=waveform,
1625
- sequence_lengths=sequence_lengths,
1626
- spectrogram=spectrogram,
1627
- hidden_states=text_encoder_output.hidden_states,
1628
- attentions=text_encoder_output.attentions,
1629
- )
1630
 
1631
 
1632
 
@@ -1784,29 +1721,10 @@ class VitsTokenizer(PreTrainedTokenizer):
1784
  def prepare_for_tokenization(
1785
  self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
1786
  ) -> Tuple[str, Dict[str, Any]]:
1787
- """
1788
- Performs any necessary transformations before tokenization.
1789
-
1790
- This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
1791
- `kwargs` at the end of the encoding process to be sure all the arguments have been used.
1792
-
1793
- Args:
1794
- text (`str`):
1795
- The text to prepare.
1796
- is_split_into_words (`bool`, *optional*, defaults to `False`):
1797
- Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
1798
- tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
1799
- which it will tokenize.
1800
- normalize (`bool`, *optional*, defaults to `None`):
1801
- Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
1802
- trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
1803
- text consists only of lower-case characters.
1804
- kwargs (`Dict[str, Any]`, *optional*):
1805
- Keyword arguments to use for the tokenization.
1806
-
1807
- Returns:
1808
- `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
1809
- """
1810
  normalize = normalize if normalize is not None else self.normalize
1811
 
1812
  if normalize:
 
1
  import math
2
  from dataclasses import dataclass
3
  from typing import Any, Optional, Tuple, Union
 
4
  import numpy as np
5
  import torch
6
  import torch.utils.checkpoint
7
  from torch import nn
 
8
  from transformers.activations import ACT2FN
 
 
9
  from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
10
+ from transformers.modeling_outputs import BaseModelOutput, ModelOutput
11
  from transformers.modeling_utils import PreTrainedModel
 
12
  from transformers.configuration_utils import PretrainedConfig
13
 
14
  class VitsConfig(PretrainedConfig):
 
226
  self.wavenet_kernel_size = wavenet_kernel_size
227
  self.wavenet_dilation_rate = wavenet_dilation_rate
228
  self.wavenet_dropout = wavenet_dropout
229
+ self.speaking_rate = speaking_rate # reset during long-text inference for natural variation
230
  self.noise_scale = noise_scale
231
  self.noise_scale_duration = noise_scale_duration
232
  self.sampling_rate = sampling_rate
 
244
 
245
  # ============================ modeling
246
 
247
  @dataclass
248
  class VitsTextEncoderOutput(ModelOutput):
249
  """
 
865
 
866
 
867
  class VitsElementwiseAffine(nn.Module):
868
+ def __init__(self, config):
869
  super().__init__()
870
  self.channels = config.depth_separable_channels
871
  self.translate = nn.Parameter(torch.zeros(self.channels, 1))
 
1052
 
1053
  def forward(
1054
  self,
1055
+ hidden_states,
1056
  key_value_states: Optional[torch.Tensor] = None,
1057
  attention_mask: Optional[torch.Tensor] = None,
1058
  layer_head_mask: Optional[torch.Tensor] = None,
1059
  output_attentions: bool = False,
1060
+ ):
1061
  """Input shape: Batch x Time x Channel"""
1062
 
1063
  # if key_value_states are provided this layer is used as a cross-attention layer
 
1087
  )
1088
 
1089
  if self.window_size is not None:
1090
+ # relative-position attention; self.window_size == 4 here
1091
  key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, src_len)
1092
  relative_logits = torch.matmul(query_states, key_relative_embeddings.transpose(-2, -1))
1093
  rel_pos_bias = self._relative_position_to_absolute_position(relative_logits)
 
1100
  )
1101
  attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
1102
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
1103
+
1104
+ # It is possible that the starting frames of this attention hold the choice of voice, placing the generation as male or female for German.
1105
+ # 1. It is plausible to pre-append or post-append frames (whose TTS is always male or female).
1106
+
1107
+ #
1108
+ # --
1109
+ # ___IN attn 1110__ torch.Size([2, 927, 927])
1110
+ # ___IN attn 1110__ torch.Size([2, 927, 927])
1111
+ # ___IN attn 1110__ torch.Size([2, 927, 927])
1112
+ # ___IN attn 1110__ torch.Size([2, 927, 927])
1113
+ # ___IN attn 1110__ torch.Size([2, 927, 927])
1114
+ # ___IN attn 1110__ torch.Size([2, 927, 927]) # this appears to always use the full length of the bert hidden states
1115
+ # --
1116
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)
1117
 
1118
  attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
1119
 
1120
  attn_output = torch.bmm(attn_probs, value_states)
 
1126
  )
1127
 
1128
  if self.window_size is not None:
1129
+ # Entering here with self.window_size = 4
1130
  value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, src_len)
1131
  relative_weights = self._absolute_position_to_relative_position(attn_probs)
1132
  rel_pos_bias = torch.matmul(relative_weights, value_relative_embeddings)
 
1141
 
1142
  attn_output = self.out_proj(attn_output)
1143
 
1144
+ return attn_output, None  # attn_weights_reshaped is no longer computed
1145
 
1146
  def _get_relative_embeddings(self, relative_embeddings, length):
1147
  pad_length = max(length - (self.window_size + 1), 0)
 
1288
 
1289
  hidden_states = hidden_states * padding_mask
1290
 
1291
+
1292
 
1293
  for encoder_layer in self.layers:
1294
  if output_hidden_states:
 
1298
  dropout_probability = np.random.uniform(0, 1)
1299
 
1300
  skip_the_layer = self.training and (dropout_probability < self.layerdrop)
1301
+
1302
+ layer_outputs = encoder_layer(
1303
+ hidden_states,
1304
+ attention_mask=attention_mask,
1305
+ padding_mask=padding_mask,
1306
+ output_attentions=output_attentions,
1307
+ )
1308
+ hidden_states = layer_outputs[0]
1309
 
1310
  if skip_the_layer:
1311
  layer_outputs = (None, None)
 
1337
  super().__init__()
1338
  self.config = config
1339
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
1340
+ self.encoder = VitsEncoder(config) # 6 Layers of VitsAttention
1341
  self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
1342
 
1343
  def get_input_embeddings(self):
 
1419
  def __init__(self, config: VitsConfig):
1420
  super().__init__(config)
1421
  self.config = config
1422
+ self.text_encoder = VitsTextEncoder(config) # has VitsEncoder that includes 6 layers of VitsAttention
1423
  self.flow = VitsResidualCouplingBlock(config)
1424
  self.decoder = VitsHifiGan(config)
1425
 
 
1444
 
1445
  def forward(
1446
  self,
1447
+ input_ids=None,
1448
+ attention_mask=None,
1449
+ speaker_id=None,
1450
+ output_attentions=None,
1451
+ output_hidden_states=None,
1452
+ return_dict=None,
1453
+ labels=None,
1454
+ ):
1455
  r"""
1456
  labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
1457
  Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
 
1525
  noise_scale=self.noise_scale_duration,
1526
  )
1527
  else:
1528
+ raise ValueError('non-stochastic duration_predictor path is disabled in this trimmed VITS')
1529
+ # log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
1530
 
1531
  length_scale = 1.0 / self.speaking_rate
1532
  duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
 
1563
  outputs = (waveform, sequence_lengths, spectrogram) + text_encoder_output[3:]
1564
  return outputs
1565
 
1566
+ return waveform
1567
 
1568
 
1569
 
 
1721
  def prepare_for_tokenization(
1722
  self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
1723
  ) -> Tuple[str, Dict[str, Any]]:
1724
+ '''
1725
+ Performs any necessary transformations before tokenization.
1726
+
1727
+ '''
1728
  normalize = normalize if normalize is not None else self.normalize
1729
 
1730
  if normalize:
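
Note on usage (not part of the diff): after this change the trimmed VitsModel.forward returns the raw waveform tensor instead of a VitsModelOutput. A minimal sketch of a call, assuming the modified VitsModel in Modules/vits/models.py is importable and still loads the facebook/mms-tts-deu checkpoint, and that the stock transformers tokenizer remains compatible; the checkpoint name and tokenizer choice are assumptions, not taken from this commit:

import torch
from transformers import AutoTokenizer
from Modules.vits.models import VitsModel  # trimmed model from this repo (import path assumed)

tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-deu")  # assumed-compatible tokenizer
net_g = VitsModel.from_pretrained("facebook/mms-tts-deu").eval()

inputs = tokenizer("Ein kurzer deutscher Testsatz.", return_tensors="pt")
with torch.no_grad():
    # forward now returns a torch.FloatTensor waveform directly, so no `.waveform` access is needed
    waveform = net_g(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)
print(waveform.shape)  # (batch, num_samples) at config.sampling_rate (16 kHz for MMS-TTS checkpoints)
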
demo.py CHANGED
@@ -2,10 +2,22 @@ import numpy as np
2
  import soundfile
3
  import msinference
4
 
5
-
6
- def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
7
- voice='af_ZA_google-nwu_1919', # 'serbian', 'en_US/vctk_low#p276', 'isl',
8
- speed=1.4, # only for MMS TTS
9
  affect = True # False = higher clarity sound for the partially sighted
10
  ):
11
  '''returns 24kHZ np.array TTS
 
2
  import soundfile
3
  import msinference
4
 
5
+ # Prepending the »Vom Prof. Friedrich ist noch eine .. string at the beginning brings out the male voice in deu MMS TTS (if the later string is much longer,
6
+ # the female voice sometimes pronounces words such as <dass>). TODO: amplify attn weights of the first hidden states / a certain voice
7
+
8
+ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
9
+ 'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
10
+ 'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
11
+ 'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
12
+ '»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
13
+ 'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
14
+ 'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
15
+ 'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
16
+ 'A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.'
17
+ 'DESCIPTION Bronzezeitlicher Zeremonialhut („Berliner Goldhut“), gefertigt aus einem Stück nahtlos getriebenem Goldblech und mit Kreisornamenten in Repousse-Technik verziert. Kalotte kegelförmig überhöht und mit breiter umlaufender Krempe. Krempe und Kalotte durch flaches Bronzeband verstärkt. An der Krempe außen tordierter Bronzedraht. Die Anordnung der Ornamentik auf Kalotte und Krempe des Zeremonialhutes wird als Darstellung eines Kalendersystems gedeutet, mit dem sich die Verschiebungen zwischen Sonnen- und Mondjahr berechnen und Mondfinsternisse voraussagen lassen.'
18
+ 'Vorderseite: L IVL AVR SVLP ANTONINVS [LP ligiert]. Panzerbüste des Uranius Antoninus mit Lorbeerkranz in der Brustansicht nach l., Pteryges des r. Armansatzes sind zur Angabe eines erhobenen Armes waagerecht dargestellt. Rückseite: CONSERVATO-R AVG. Der Stein des Baal von Emesa auf einem Viergespann (quadriga) nach l. Auf dem Stein, der von zwei Schirmen gerahmt ist, ist das Relief eines Adlers zu sehen. Kommentar: Baldus (1971) 84 ff. 87 zur Frage der Münzstätte, ebd. 128 ff. zur Pterygesanhebung (Andeutung eines erhobenen Armes), die als alexanderhafter Gestus gilt. - Uranius Antoninus wurde im Sommer 253 n. Chr. im syrischen Emesa zum Kaiser erhoben und bewährte sich bald darauf bei der erfolgreichen Abwehr eines Einfalls der Sasaniden. Uranius Antoninus stammte möglicherweise aus der Familie der Iulia Domna, war Priester des Baals von Emesa, und ist mit dem literarisch überlieferten Sampsigeramus identisch, der als Organisator des Widerstandes gegen die Sasaniden in der Region belegt ist. Nach 254 n. Chr. fehlen Informationen über Uranius Antoninus, möglicherweise trat er nach Bereinigung der Notsituation hinter den Kaiser Valerianus zurück. Zu diesem Stück wurden 2017 im Zuge der Ausstellung Syria Antiqua zwei vergrößerte Reproduktionen (3D-Ausdrucke) erstellt, die bei den Galvanos in Schrank 81/121 liegen. Literatur: A. von Sallet, ZfN 17, 1890, 241 f. Taf. 4,9 (dieses Stück); H. R. Baldus, Uranius Antoninus (1971) 198 Nr. 85 Taf. 7,85; 12,85 (dieses Stück, mit Lit., 253/254 n. Chr. bzw. Stempelgruppe VIII ca. Dez. 253-Anfang 254 n. Chr.); RIC IV-3 Nr. 2 c; RPC IX Nr. 1940,2 Taf. 131 (dieses Stück).',
19
+ voice='deu', #'af_ZA_google-nwu_1919', # 'serbian', 'en_US/vctk_low#p276', 'isl',
20
+ speed=1.14, # only for MMS TTS
21
  affect = True # False = higher clarity sound for the partially sighted
22
  ):
23
  '''returns 24kHZ np.array TTS
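
The TODO in the comment above (amplify the attention weights of the first hidden states to pin the voice) could be prototyped roughly as below. This is a hypothetical sketch only, not part of the commit; bias_first_frames, n_frames and boost are made-up names, and the natural place to apply it would be inside VitsAttention.forward just before the softmax.

import torch

def bias_first_frames(attn_logits: torch.Tensor, n_frames: int = 8, boost: float = 1.5) -> torch.Tensor:
    # attn_logits: (batch * num_heads, tgt_len, src_len) pre-softmax attention scores.
    # Adding log(boost) to the first n_frames key positions scales their softmax mass
    # by roughly `boost` relative to the other positions (before renormalization).
    biased = attn_logits.clone()
    biased[..., :n_frames] += torch.log(torch.tensor(float(boost)))
    return biased

# Hypothetical use inside VitsAttention.forward, just before
# attn_weights = nn.functional.softmax(attn_weights, dim=-1):
#     attn_weights = bias_first_frames(attn_weights, n_frames=8, boost=1.5)
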
msinference.py CHANGED
@@ -379,9 +379,12 @@ def foreign(text=None, # list of text
379
  inputs = tokenizer(_t, return_tensors="pt") # input_ids / attention_mask
380
 
381
  with torch.no_grad():
382
  x.append(
383
  net_g(input_ids=inputs.input_ids.to(device),
384
- attention_mask=inputs.attention_mask.to(device)).waveform
385
  )
386
  print(x[-1].shape)
387
  print(f'{speed=}\n\n\n\n_______________________________ {_t}')
 
379
  inputs = tokenizer(_t, return_tensors="pt") # input_ids / attention_mask
380
 
381
  with torch.no_grad():
382
+ # -- reset speed
383
+ net_g.speaking_rate = speed
384
+ # --
385
  x.append(
386
  net_g(input_ids=inputs.input_ids.to(device),
387
+ attention_mask=inputs.attention_mask.to(device))
388
  )
389
  print(x[-1].shape)
390
  print(f'{speed=}\n\n\n\n_______________________________ {_t}')
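
For longer texts, the per-utterance speaking_rate reset shown above could also be jittered slightly per sentence to get the "natural variation" hinted at in the VitsConfig comment. A hedged sketch, assuming net_g and tokenizer are the objects already set up in msinference.py and that sentences is a pre-split list; synth_long_text, base_speed and jitter are illustrative names, not part of the commit:

import numpy as np
import torch

def synth_long_text(sentences, net_g, tokenizer, base_speed=1.14, jitter=0.05, device="cpu"):
    # Synthesize sentence by sentence, nudging the speaking rate each time so long
    # passages do not sound metronomic; mirrors the reset done in msinference.foreign().
    rng = np.random.default_rng(0)
    pieces = []
    for sent in sentences:
        net_g.speaking_rate = base_speed + rng.uniform(-jitter, jitter)
        inputs = tokenizer(sent, return_tensors="pt")
        with torch.no_grad():
            waveform = net_g(input_ids=inputs.input_ids.to(device),
                             attention_mask=inputs.attention_mask.to(device))
        pieces.append(waveform.cpu().numpy()[0])
    return np.concatenate(pieces)
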