dkounadis
/

artificial-styletts2

@@ -1,6 +1,5 @@
 import omegaconf
 import torchaudio
-from torch import nn
 import torch
 import numpy as np
 from huggingface_hub import hf_hub_download
@@ -11,7 +10,7 @@ from .lm import LMModel
 from .seanet import SEANetDecoder
 from .vq import ResidualVectorQuantizer
-N_REPEAT = 7  # num (virtual batch_size) clones of audio sounds
 def _shift(x):
     n = x.shape[0]
@@ -38,7 +37,7 @@ def dict_from_config(cfg):
     return dct
-class AudioGen(nn.Module):
     # https://huggingface.co/facebook/audiogen-medium
@@ -57,11 +56,9 @@ class AudioGen(nn.Module):
                  ):
         with torch.no_grad():
-            print('\nCUSTOM\n',int(duration / N_REPEAT * self.compression_model.frame_rate), 'DURATION TOKENS AudioGen')
             gen_tokens = self.lm.generate(
                 descriptions=[descriptions] * N_REPEAT,
-                max_tokens=int(duration / N_REPEAT * self.compression_model.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
             x = self.compression_model.decode(gen_tokens, None)   #[bs, 1, 11840]
             x = x[:, 0, :]  # last samples have splash sounds DISCARD 25000 last samples
@@ -70,16 +67,14 @@ class AudioGen(nn.Module):
             x = self.resample_fn(x)  # [N_REPEAT, duration]
-            x = x.repeat(1, N_REPEAT).reshape(-1)
-            for _ in range(7):
-                x = _shift(x)
             print(x.abs().max(), 'MAX')
             return x / (x.abs().max() + 1e-7)
-    # == BUILD Fn
     def get_quantizer(self, quantizer, cfg, dimension):
         klass = {
             'no_quant': None,
@@ -90,7 +85,6 @@ class AudioGen(nn.Module):
             kwargs['dimension'] = dimension
         return klass(**kwargs)
     def get_encodec_autoencoder(self, cfg):
         kwargs = dict_from_config(getattr(cfg, 'seanet'))
         _ = kwargs.pop('encoder')
@@ -98,8 +92,6 @@ class AudioGen(nn.Module):
         decoder_kwargs = {**kwargs, **decoder_override_kwargs}
         decoder = SEANetDecoder(**decoder_kwargs)
         return decoder
     def get_compression_model(self, cfg):
         """Instantiate a compression model."""

 import omegaconf
 import torchaudio
 import torch
 import numpy as np
 from huggingface_hub import hf_hub_download
 from .seanet import SEANetDecoder
 from .vq import ResidualVectorQuantizer
+N_REPEAT = 2  # num (virtual batch_size) clones of audio sounds
 def _shift(x):
     n = x.shape[0]
     return dct
+class AudioGen(torch.nn.Module):
     # https://huggingface.co/facebook/audiogen-medium
                  ):
         with torch.no_grad():
             gen_tokens = self.lm.generate(
                 descriptions=[descriptions] * N_REPEAT,
+                max_tokens=int(duration / (N_REPEAT * self.lm.n_draw) * self.compression_model.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
             x = self.compression_model.decode(gen_tokens, None)   #[bs, 1, 11840]
             x = x[:, 0, :]  # last samples have splash sounds DISCARD 25000 last samples
             x = self.resample_fn(x)  # [N_REPEAT, duration]
+            x = x.reshape(-1)
+            # for _ in range(7):
+            #     x = _shift(x)
             print(x.abs().max(), 'MAX')
             return x / (x.abs().max() + 1e-7)
     def get_quantizer(self, quantizer, cfg, dimension):
         klass = {
             'no_quant': None,
             kwargs['dimension'] = dimension
         return klass(**kwargs)
     def get_encodec_autoencoder(self, cfg):
         kwargs = dict_from_config(getattr(cfg, 'seanet'))
         _ = kwargs.pop('encoder')
         decoder_kwargs = {**kwargs, **decoder_override_kwargs}
         decoder = SEANetDecoder(**decoder_kwargs)
         return decoder
     def get_compression_model(self, cfg):
         """Instantiate a compression model."""

audiocraft/lm.py CHANGED Viewed

@@ -19,7 +19,7 @@ class LMModel(nn.Module):
         self.condition_provider = T5Conditioner(name='t5-large',
                                                 output_dim=dim)
         self.card = card  # 2048 ?
-        self.n_draw = 6  # replicate so many times the generation of each text in batch
         # the batch is more expensive than n_draw as it re-runs the model bs times
         # n_draw just draws more phonemes from the multinomial - after running the lm
         embed_dim = self.card + 1
@@ -111,11 +111,11 @@ class LMModel(nn.Module):
             # NO OVerWriting
             if offset == 0:
-                next_token[:, :, 1:4] = 2048  # self.card
             elif offset == 1:
-                next_token[:, :, 2:4] = 2048
             elif offset == 2:
@@ -123,15 +123,15 @@ class LMModel(nn.Module):
             elif offset == max_tokens:
-                next_token[:, :, 0:1] = 2048
             elif offset == (max_tokens + 1):
-                next_token[:, :, 0:1] = 2048
             elif offset == (max_tokens + 2):
-                next_token[:, :, 0:2] = 2048
             else:  # offset 3,4,5,6,7...... max_tokens-1   # FILL Complete n_q = 4 ANTIDIAGONAL ENTRIES
@@ -139,15 +139,8 @@ class LMModel(nn.Module):
             out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
-            # print(out_codes.shape, f'{offset=} \n', out_codes[0:1, 0:1, :, :],'\n______________L_____________________\n')
-        # align 4-rows (shift by 1)
-        # print(out_codes[0, 0, :, :])  # do we pass 2048 to Seanet - There wil result in AtenIndexingError as it has no 2048
-        # out_codes = torch.cat([out_codes[:, :, 0:1, 4:max_tokens+4],  # first row starts to be filled at offset = 4
-        #                        out_codes[:, :, 1:2, 3:max_tokens+3],
-        #                        out_codes[:, :, 2:3, 2:max_tokens+2],
-        #                        out_codes[:, :, 3:4, 1:max_tokens+1]], 2)
-        print('\n_____ALIGN____\n',  out_codes[0, 0, :, 4:max_tokens+4])  # do we pass 2048 to Seanet - There wil result in AtenIndexingError as it has no 2048
         out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)   # [bs, 4, duration*n_draw] DISCARD FILL 2048
         for lay in self.transformer.layers:

         self.condition_provider = T5Conditioner(name='t5-large',
                                                 output_dim=dim)
         self.card = card  # 2048 ?
+        self.n_draw = 3  # replicate so many times the generation of each text in batch
         # the batch is more expensive than n_draw as it re-runs the model bs times
         # n_draw just draws more phonemes from the multinomial - after running the lm
         embed_dim = self.card + 1
             # NO OVerWriting
             if offset == 0:
+                next_token[:, :, 1:4] = 2048  # self.card - bottom 3 entries of the antidiagonal should remain 2048
             elif offset == 1:
+                next_token[:, :, 2:4] = 2048  # bottom 2 entries of the antidiagonal should remain 2048
             elif offset == 2:
             elif offset == max_tokens:
+                next_token[:, :, 0:1] = 2048  # top 1 entry of the antidiagonal should stay to 2048
             elif offset == (max_tokens + 1):
+                next_token[:, :, 0:2] = 2048
             elif offset == (max_tokens + 2):
+                next_token[:, :, 0:3] = 2048
             else:  # offset 3,4,5,6,7...... max_tokens-1   # FILL Complete n_q = 4 ANTIDIAGONAL ENTRIES
             out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
+        print('\n_____ALIGN____\n',  out_codes[1, 2, :, 4:max_tokens+4])  # do we pass 2048 to Seanet - There wil result in AtenIndexingError as it has no 2048
+        # EXTRACT COLUMNS AS ALIGN IS ALREADY DONE by FILLING DIAGONALLY
         out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)   # [bs, 4, duration*n_draw] DISCARD FILL 2048
         for lay in self.transformer.layers:

demo.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import numpy as np
 import soundfile
 import msinference
 # Prepending the string »Vom Prof. Friedrich ist noch eine ..« to the text brings out the male voice in the 'deu' MMS TTS (if the text that follows is much longer,
 #                                      the female voice sometimes pronounces words such as <dass>). TODO: amplify attn weights of first hidden states / certain voice
@@ -16,26 +16,25 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
                    'A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.'
                    'DESCIPTION Bronzezeitlicher Zeremonialhut („Berliner Goldhut“), gefertigt aus einem Stück nahtlos getriebenem Goldblech und mit Kreisornamenten in Repousse-Technik verziert. Kalotte kegelförmig überhöht und mit breiter umlaufender Krempe. Krempe und Kalotte durch flaches Bronzeband verstärkt. An der Krempe außen tordierter Bronzedraht. Die Anordnung der Ornamentik auf Kalotte und Krempe des Zeremonialhutes wird als Darstellung eines Kalendersystems gedeutet, mit dem sich die Verschiebungen zwischen Sonnen- und Mondjahr berechnen und Mondfinsternisse voraussagen lassen.'
                    'Vorderseite: L IVL AVR SVLP ANTONINVS [LP ligiert]. Panzerbüste des Uranius Antoninus mit Lorbeerkranz in der Brustansicht nach l., Pteryges des r. Armansatzes sind zur Angabe eines erhobenen Armes waagerecht dargestellt. Rückseite: CONSERVATO-R AVG. Der Stein des Baal von Emesa auf einem Viergespann (quadriga) nach l. Auf dem Stein, der von zwei Schirmen gerahmt ist, ist das Relief eines Adlers zu sehen. Kommentar: Baldus (1971) 84 ff. 87 zur Frage der Münzstätte, ebd. 128 ff. zur Pterygesanhebung (Andeutung eines erhobenen Armes), die als alexanderhafter Gestus gilt. - Uranius Antoninus wurde im Sommer 253 n. Chr. im syrischen Emesa zum Kaiser erhoben und bewährte sich bald darauf bei der erfolgreichen Abwehr eines Einfalls der Sasaniden. Uranius Antoninus stammte möglicherweise aus der Familie der Iulia Domna, war Priester des Baals von Emesa, und ist mit dem literarisch überlieferten Sampsigeramus identisch, der als Organisator des Widerstandes gegen die Sasaniden in der Region belegt ist. Nach 254 n. Chr. fehlen Informationen über Uranius Antoninus, möglicherweise trat er nach Bereinigung der Notsituation hinter den Kaiser Valerianus zurück. Zu diesem Stück wurden 2017 im Zuge der Ausstellung Syria Antiqua zwei vergrößerte Reproduktionen (3D-Ausdrucke) erstellt, die bei den Galvanos in Schrank 81/121 liegen. Literatur: A. von Sallet, ZfN 17, 1890, 241 f. Taf. 4,9 (dieses Stück); H. R. Baldus, Uranius Antoninus (1971) 198 Nr. 85 Taf. 7,85; 12,85 (dieses Stück, mit Lit., 253/254 n. Chr. bzw. Stempelgruppe VIII ca. Dez. 253-Anfang 254 n. Chr.); RIC IV-3 Nr. 2 c; RPC IX Nr. 1940,2 Taf. 131 (dieses Stück).',
-              voice='deu', #'af_ZA_google-nwu_1919',  # 'serbian', 'en_US/vctk_low#p276', 'isl',
-              speed=1.14,  # only for MMS TTS
-              affect = True  # False = higher clarity sound for partially sight
               ):
-    '''returns 24kHZ np.array TTS
-       voice : 'en_US/vctk_low#p276'  # from English voices -> https://audeering.github.io/shift/
           or
-       voice : 'af_ZA_google-nwu_1919' # from english non-native accents -> https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
           or
-       voice : 'deu'  # foreign langs -> https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
        '''
-    # StyleTTS2 - En
-    # mimic-3 format of voice (English txt - English accent)
     if ('en_US/' in voice) or ('en_UK/' in voice):
         a = '' if affect else 'v2/'
@@ -47,7 +46,7 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
         x = msinference.inference(text,
                                     style_vector)
-    # mimic-3 format of voice (English text - Foreign accent)
     elif '_' in  voice:
         style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + voice.replace(
@@ -59,19 +58,23 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
                                     style_vector)
-    # Fallback - MMS TTS - Non-English
     else:
-        # MMS TTS - list of sentences
         x = msinference.foreign(text=text,
-                                lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
-                                speed=speed)  # normalisation externally
     # volume
     x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
-    print(x.shape, 'TTS OK')
     return x
-soundfile.write(f'de00i.wav', tts_entry(), 24000)

 import numpy as np
 import soundfile
 import msinference
+from audiocraft.builders import AudioGen
 # Prepending the string »Vom Prof. Friedrich ist noch eine ..« to the text brings out the male voice in the 'deu' MMS TTS (if the text that follows is much longer,
 #                                      the female voice sometimes pronounces words such as <dass>). TODO: amplify attn weights of first hidden states / certain voice
                    'A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.'
                    'DESCIPTION Bronzezeitlicher Zeremonialhut („Berliner Goldhut“), gefertigt aus einem Stück nahtlos getriebenem Goldblech und mit Kreisornamenten in Repousse-Technik verziert. Kalotte kegelförmig überhöht und mit breiter umlaufender Krempe. Krempe und Kalotte durch flaches Bronzeband verstärkt. An der Krempe außen tordierter Bronzedraht. Die Anordnung der Ornamentik auf Kalotte und Krempe des Zeremonialhutes wird als Darstellung eines Kalendersystems gedeutet, mit dem sich die Verschiebungen zwischen Sonnen- und Mondjahr berechnen und Mondfinsternisse voraussagen lassen.'
                    'Vorderseite: L IVL AVR SVLP ANTONINVS [LP ligiert]. Panzerbüste des Uranius Antoninus mit Lorbeerkranz in der Brustansicht nach l., Pteryges des r. Armansatzes sind zur Angabe eines erhobenen Armes waagerecht dargestellt. Rückseite: CONSERVATO-R AVG. Der Stein des Baal von Emesa auf einem Viergespann (quadriga) nach l. Auf dem Stein, der von zwei Schirmen gerahmt ist, ist das Relief eines Adlers zu sehen. Kommentar: Baldus (1971) 84 ff. 87 zur Frage der Münzstätte, ebd. 128 ff. zur Pterygesanhebung (Andeutung eines erhobenen Armes), die als alexanderhafter Gestus gilt. - Uranius Antoninus wurde im Sommer 253 n. Chr. im syrischen Emesa zum Kaiser erhoben und bewährte sich bald darauf bei der erfolgreichen Abwehr eines Einfalls der Sasaniden. Uranius Antoninus stammte möglicherweise aus der Familie der Iulia Domna, war Priester des Baals von Emesa, und ist mit dem literarisch überlieferten Sampsigeramus identisch, der als Organisator des Widerstandes gegen die Sasaniden in der Region belegt ist. Nach 254 n. Chr. fehlen Informationen über Uranius Antoninus, möglicherweise trat er nach Bereinigung der Notsituation hinter den Kaiser Valerianus zurück. Zu diesem Stück wurden 2017 im Zuge der Ausstellung Syria Antiqua zwei vergrößerte Reproduktionen (3D-Ausdrucke) erstellt, die bei den Galvanos in Schrank 81/121 liegen. Literatur: A. von Sallet, ZfN 17, 1890, 241 f. Taf. 4,9 (dieses Stück); H. R. Baldus, Uranius Antoninus (1971) 198 Nr. 85 Taf. 7,85; 12,85 (dieses Stück, mit Lit., 253/254 n. Chr. bzw. Stempelgruppe VIII ca. Dez. 253-Anfang 254 n. Chr.); RIC IV-3 Nr. 2 c; RPC IX Nr. 1940,2 Taf. 131 (dieses Stück).',
+              voice='deu',  #'af_ZA_google-nwu_1919',  # 'serbian', 'en_US/vctk_low#p276', 'isl',
+              speed=1.14,
+              affect = True,  # False = higher clarity
+              soundscape = 'dogs barg in dungeons n dragons'
               ):
+    '''24kHz
+       voice : 'en_US/vctk_low#p276'  # Native English voices -> https://audeering.github.io/shift/
           or
+       voice : 'af_ZA_google-nwu_1919' # Non-Native English voices -> https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
           or
+       voice : 'deu'  # Foreign languages -> https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
        '''
+    # StyleTTS2 - find voice from folder
     if ('en_US/' in voice) or ('en_UK/' in voice):
         a = '' if affect else 'v2/'
         x = msinference.inference(text,
                                     style_vector)
+    # find voice from mimic-3 folder with styles
     elif '_' in  voice:
         style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + voice.replace(
                                     style_vector)
+    # Fallback - MMS TTS - Non-English voice/language
     else:
         x = msinference.foreign(text=text,
+                                lang=voice,
+                                speed=speed)  # volume normalis.
     # volume
     x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
+    if soundscape is not None:
+        sound_gen = AudioGen().to('cuda:0').eval()
+        background = sound_gen.generate(soundscape,
+                                              duration=len(x)/24000 + .74,  # sound duration in seconds
+                                              ).detach().cpu().numpy() # bs, 11400 @.74s
+        x = .5 * x + .5 * background[:len(x)]
     return x
+soundfile.write(f'demo.wav', tts_entry(), 24000)