2 voice deu demo VITS

- Modules/vits/models.py +47 -129
- demo.py +16 -4
- msinference.py +4 -1
Modules/vits/models.py CHANGED

@@ -1,22 +1,14 @@
 import math
 from dataclasses import dataclass
 from typing import Any, Optional, Tuple, Union
-
 import numpy as np
 import torch
 import torch.utils.checkpoint
 from torch import nn
-
 from transformers.activations import ACT2FN
-from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
-from transformers.integrations.fsdp import is_fsdp_managed_module
 from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
-from transformers.modeling_outputs import (
-    BaseModelOutput,
-    ModelOutput,
-)
+from transformers.modeling_outputs import BaseModelOutput, ModelOutput
 from transformers.modeling_utils import PreTrainedModel
-# ============================================== configuration
 from transformers.configuration_utils import PretrainedConfig
 
 class VitsConfig(PretrainedConfig):

@@ -234,7 +226,7 @@ class VitsConfig(PretrainedConfig):
         self.wavenet_kernel_size = wavenet_kernel_size
         self.wavenet_dilation_rate = wavenet_dilation_rate
         self.wavenet_dropout = wavenet_dropout
-        self.speaking_rate = speaking_rate
+        self.speaking_rate = speaking_rate  # reset during long-text inference for natural variation
         self.noise_scale = noise_scale
         self.noise_scale_duration = noise_scale_duration
         self.sampling_rate = sampling_rate

@@ -252,40 +244,6 @@ class VitsConfig(PretrainedConfig):
 
 # ============================ modeling
 
-
-@dataclass
-class VitsModelOutput(ModelOutput):
-    """
-    Describes the outputs for the VITS model, with potential hidden states and attentions.
-
-    Args:
-        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
-            The final audio waveform predicted by the model.
-        sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
-            The length in samples of each element in the `waveform` batch.
-        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
-            The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
-            GAN decoder model to obtain the final audio waveform.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-
-            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    waveform: torch.FloatTensor = None
-    sequence_lengths: torch.FloatTensor = None
-    spectrogram: Optional[Tuple[torch.FloatTensor]] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-
-
 @dataclass
 class VitsTextEncoderOutput(ModelOutput):
     """

@@ -907,7 +865,7 @@ class VitsConvFlow(nn.Module):
 
 
 class VitsElementwiseAffine(nn.Module):
-    def __init__(self, config: VitsConfig):
+    def __init__(self, config):
         super().__init__()
         self.channels = config.depth_separable_channels
         self.translate = nn.Parameter(torch.zeros(self.channels, 1))

@@ -1094,12 +1052,12 @@ class VitsAttention(nn.Module):
 
     def forward(
         self,
-        hidden_states: torch.Tensor,
+        hidden_states,
         key_value_states: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         layer_head_mask: Optional[torch.Tensor] = None,
         output_attentions: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ):
         """Input shape: Batch x Time x Channel"""
 
         # if key_value_states are provided this layer is used as a cross-attention layer

@@ -1129,6 +1087,7 @@ class VitsAttention(nn.Module):
         )
 
         if self.window_size is not None:
+            # window_size == 4
             key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, src_len)
             relative_logits = torch.matmul(query_states, key_relative_embeddings.transpose(-2, -1))
             rel_pos_bias = self._relative_position_to_absolute_position(relative_logits)

@@ -1141,28 +1100,21 @@ class VitsAttention(nn.Module):
             )
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
+        # It is possible that the starting frames of this attention hold the choice of voice that places the generation into the male or female speaker for German.
+        # 1. It is plausible to have some pre-appended or post-appended frames (whose TTS is always male or female).
+
+        #
+        # --
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])
+        # ___IN attn 1110__ torch.Size([2, 927, 927])  # this appears to always use the full length of the BERT hidden states
+        # --
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
-        if layer_head_mask is not None:
-            if layer_head_mask.size() != (self.num_heads,):
-                raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
-                    f" {layer_head_mask.size()}"
-                )
-            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-
-        if output_attentions:
-            # this operation is a bit awkward, but it's required to
-            # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to be reshaped
-            # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
-        else:
-            attn_weights_reshaped = None
-
         attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
 
         attn_output = torch.bmm(attn_probs, value_states)

@@ -1174,6 +1126,7 @@ class VitsAttention(nn.Module):
         )
 
         if self.window_size is not None:
+            # Entering here with self.window_size = 4
             value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, src_len)
             relative_weights = self._absolute_position_to_relative_position(attn_probs)
             rel_pos_bias = torch.matmul(relative_weights, value_relative_embeddings)

@@ -1188,7 +1141,7 @@ class VitsAttention(nn.Module):
 
         attn_output = self.out_proj(attn_output)
 
-        return attn_output, attn_weights_reshaped
+        return attn_output, None  # attn_weights_reshaped
 
     def _get_relative_embeddings(self, relative_embeddings, length):
         pad_length = max(length - (self.window_size + 1), 0)

@@ -1335,7 +1288,7 @@ class VitsEncoder(nn.Module):
 
         hidden_states = hidden_states * padding_mask
 
-        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
 
         for encoder_layer in self.layers:
             if output_hidden_states:

@@ -1345,25 +1298,14 @@ class VitsEncoder(nn.Module):
             dropout_probability = np.random.uniform(0, 1)
 
             skip_the_layer = self.training and (dropout_probability < self.layerdrop)
-
-
-
-
-
-
-
-
-                        # attention_mask,
-                        # output_attentions,
-                        # )
-            else:
-                layer_outputs = encoder_layer(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    padding_mask=padding_mask,
-                    output_attentions=output_attentions,
-                )
-            hidden_states = layer_outputs[0]
+
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                padding_mask=padding_mask,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
 
             if skip_the_layer:
                 layer_outputs = (None, None)

@@ -1395,7 +1337,7 @@ class VitsTextEncoder(nn.Module):
         super().__init__()
         self.config = config
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
-        self.encoder = VitsEncoder(config)
+        self.encoder = VitsEncoder(config)  # 6 layers of VitsAttention
        self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
 
     def get_input_embeddings(self):

@@ -1477,7 +1419,7 @@ class VitsModel(VitsPreTrainedModel):
     def __init__(self, config: VitsConfig):
         super().__init__(config)
         self.config = config
-        self.text_encoder = VitsTextEncoder(config)
+        self.text_encoder = VitsTextEncoder(config)  # has a VitsEncoder that includes 6 layers of VitsAttention
         self.flow = VitsResidualCouplingBlock(config)
         self.decoder = VitsHifiGan(config)
 

@@ -1502,14 +1444,14 @@ class VitsModel(VitsPreTrainedModel):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        speaker_id: Optional[int] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        labels: Optional[torch.FloatTensor] = None,
-    ) -> Union[Tuple[Any], VitsModelOutput]:
+        input_ids=None,
+        attention_mask=None,
+        speaker_id=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        labels=None,
+    ):
         r"""
         labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
             Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss

@@ -1583,7 +1525,8 @@ class VitsModel(VitsPreTrainedModel):
                 noise_scale=self.noise_scale_duration,
             )
         else:
-            log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
+            raise ValueError
+            # log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
 
         length_scale = 1.0 / self.speaking_rate
         duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)

@@ -1620,13 +1563,7 @@ class VitsModel(VitsPreTrainedModel):
             outputs = (waveform, sequence_lengths, spectrogram) + text_encoder_output[3:]
             return outputs
 
-        return VitsModelOutput(
-            waveform=waveform,
-            sequence_lengths=sequence_lengths,
-            spectrogram=spectrogram,
-            hidden_states=text_encoder_output.hidden_states,
-            attentions=text_encoder_output.attentions,
-        )
+        return waveform
 
 

@@ -1784,29 +1721,10 @@ class VitsTokenizer(PreTrainedTokenizer):
     def prepare_for_tokenization(
         self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
     ) -> Tuple[str, Dict[str, Any]]:
-        """
-        Performs any necessary transformations before tokenization.
-
-        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
-        `kwargs` at the end of the encoding process to be sure all the arguments have been used.
-
-        Args:
-            text (`str`):
-                The text to prepare.
-            is_split_into_words (`bool`, *optional*, defaults to `False`):
-                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
-                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
-                which it will tokenize.
-            normalize (`bool`, *optional*, defaults to `None`):
-                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
-                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
-                text consists only of lower-case characters.
-            kwargs (`Dict[str, Any]`, *optional*):
-                Keyword arguments to use for the tokenization.
-
-        Returns:
-            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
-        """
+        '''
+        Performs any necessary transformations before tokenization.
+        '''
         normalize = normalize if normalize is not None else self.normalize
 
         if normalize:
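The comments added in VitsAttention suggest that the earliest key positions of the text-encoder attention may carry the male/female voice choice of the German MMS checkpoint, and the TODO in demo.py proposes amplifying the attention weights of the first hidden states. The snippet below is a hypothetical sketch of that experiment, not part of the commit: boost_first_frames, n_frames and gain are made-up names and knobs, and the function would be applied to attn_weights right after the softmax in VitsAttention.forward.

import torch

def boost_first_frames(attn_weights: torch.Tensor, n_frames: int = 8, gain: float = 1.5) -> torch.Tensor:
    # attn_weights: (batch * num_heads, tgt_len, src_len); rows already sum to 1 after softmax
    boosted = attn_weights.clone()
    boosted[..., :n_frames] *= gain                      # emphasise the earliest (possibly voice-carrying) frames
    return boosted / boosted.sum(dim=-1, keepdim=True)   # renormalise each row back to a probability distribution

# toy check with the shape seen in the ___IN attn___ prints above, torch.Size([2, 927, 927])
w = torch.softmax(torch.randn(2, 927, 927), dim=-1)
print(boost_first_frames(w).sum(-1))  # ~1.0 everywhere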
demo.py CHANGED

@@ -2,10 +2,22 @@ import numpy as np
 import soundfile
 import msinference
 
-
-
-
-
+# Prepending the »Vom Prof. Friedrich ist noch eine ..« string at the beginning brings out the male voice in the deu MMS TTS
+# (if the rest of the string is much longer, the female voice sometimes still pronounces some words, e.g. <dass>).
+# TODO: amplify the attn weights of the first hidden states / of a certain voice.
+
+def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
+                   'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
+                   'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
+                   'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
+                   '»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
+                   'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
+                   'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
+                   'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
+                   'A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.'
+                   'DESCIPTION Bronzezeitlicher Zeremonialhut („Berliner Goldhut“), gefertigt aus einem Stück nahtlos getriebenem Goldblech und mit Kreisornamenten in Repousse-Technik verziert. Kalotte kegelförmig überhöht und mit breiter umlaufender Krempe. Krempe und Kalotte durch flaches Bronzeband verstärkt. An der Krempe außen tordierter Bronzedraht. Die Anordnung der Ornamentik auf Kalotte und Krempe des Zeremonialhutes wird als Darstellung eines Kalendersystems gedeutet, mit dem sich die Verschiebungen zwischen Sonnen- und Mondjahr berechnen und Mondfinsternisse voraussagen lassen.'
+                   'Vorderseite: L IVL AVR SVLP ANTONINVS [LP ligiert]. Panzerbüste des Uranius Antoninus mit Lorbeerkranz in der Brustansicht nach l., Pteryges des r. Armansatzes sind zur Angabe eines erhobenen Armes waagerecht dargestellt. Rückseite: CONSERVATO-R AVG. Der Stein des Baal von Emesa auf einem Viergespann (quadriga) nach l. Auf dem Stein, der von zwei Schirmen gerahmt ist, ist das Relief eines Adlers zu sehen. Kommentar: Baldus (1971) 84 ff. 87 zur Frage der Münzstätte, ebd. 128 ff. zur Pterygesanhebung (Andeutung eines erhobenen Armes), die als alexanderhafter Gestus gilt. - Uranius Antoninus wurde im Sommer 253 n. Chr. im syrischen Emesa zum Kaiser erhoben und bewährte sich bald darauf bei der erfolgreichen Abwehr eines Einfalls der Sasaniden. Uranius Antoninus stammte möglicherweise aus der Familie der Iulia Domna, war Priester des Baals von Emesa, und ist mit dem literarisch überlieferten Sampsigeramus identisch, der als Organisator des Widerstandes gegen die Sasaniden in der Region belegt ist. Nach 254 n. Chr. fehlen Informationen über Uranius Antoninus, möglicherweise trat er nach Bereinigung der Notsituation hinter den Kaiser Valerianus zurück. Zu diesem Stück wurden 2017 im Zuge der Ausstellung Syria Antiqua zwei vergrößerte Reproduktionen (3D-Ausdrucke) erstellt, die bei den Galvanos in Schrank 81/121 liegen. Literatur: A. von Sallet, ZfN 17, 1890, 241 f. Taf. 4,9 (dieses Stück); H. R. Baldus, Uranius Antoninus (1971) 198 Nr. 85 Taf. 7,85; 12,85 (dieses Stück, mit Lit., 253/254 n. Chr. bzw. Stempelgruppe VIII ca. Dez. 253-Anfang 254 n. Chr.); RIC IV-3 Nr. 2 c; RPC IX Nr. 1940,2 Taf. 131 (dieses Stück).',
+              voice='deu',  # 'af_ZA_google-nwu_1919', 'serbian', 'en_US/vctk_low#p276', 'isl',
+              speed=1.14,  # only for MMS TTS
               affect = True # False = higher clarity sound for partially sight
               ):
     '''returns 24kHZ np.array TTS
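A minimal usage sketch of the voice-priming trick noted above, assuming demo.py is importable and tts_entry keeps the signature shown in the diff; the primer sentence and output filename are arbitrary examples.

import soundfile
from demo import tts_entry  # demo.py as modified in this commit

# short primer that the commit notes pull the deu MMS checkpoint towards the male voice
primer = ('»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, '
          'schrieb das Literarische Conversations-Blatt. ')
target = 'Der Berliner Goldhut ist ein bronzezeitlicher Zeremonialhut.'

wav = tts_entry(text=primer + target, voice='deu', speed=1.14)  # 24 kHz mono np.array
soundfile.write('deu_male_demo.wav', wav, 24000)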
msinference.py CHANGED

@@ -379,9 +379,12 @@ def foreign(text=None,  # list of text
         inputs = tokenizer(_t, return_tensors="pt")  # input_ids / attention_mask
 
         with torch.no_grad():
+            # -- reset speed
+            net_g.speaking_rate = speed
+            # --
             x.append(
                 net_g(input_ids=inputs.input_ids.to(device),
-                      attention_mask=inputs.attention_mask.to(device))
+                      attention_mask=inputs.attention_mask.to(device))
             )
             print(x[-1].shape)
             print(f'{speed=}\n\n\n\n_______________________________ {_t}')
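The net_g.speaking_rate = speed reset above is what the new comment in VitsConfig refers to: over long texts the rate can be re-set before every chunk for more natural variation. The sketch below illustrates that idea under stated assumptions; it is not code from this commit, net_g and tokenizer stand for the already-loaded MMS VITS model and tokenizer from msinference.py, and synth_long, base_speed and jitter are hypothetical names.

import numpy as np
import torch

def synth_long(chunks, net_g, tokenizer, device='cpu', base_speed=1.14, jitter=0.05):
    """Synthesize a list of text chunks, nudging the speaking rate per chunk."""
    pieces = []
    for i, chunk in enumerate(chunks):
        inputs = tokenizer(chunk, return_tensors='pt')
        with torch.no_grad():
            # reset the rate before every forward pass, as foreign() does above
            net_g.speaking_rate = base_speed + jitter * np.sin(i)
            wav = net_g(input_ids=inputs.input_ids.to(device),
                        attention_mask=inputs.attention_mask.to(device))  # modified forward returns the raw waveform
        pieces.append(wav.squeeze().cpu().numpy())
    return np.concatenate(pieces)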