AudioGen dtypes
Files changed:
- audiocraft/builders.py +64 -108
- audiocraft/conditioners.py +0 -71
- audiocraft/lm.py +76 -23
- audiocraft/seanet.py +29 -14
- audiocraft/transformer.py +25 -23
- audiocraft/vq.py +16 -35
audiocraft/builders.py
CHANGED
@@ -1,15 +1,17 @@
 import omegaconf
 import torchaudio
 import torch
+from torch import nn
 import numpy as np
 from huggingface_hub import hf_hub_download
 import os
 from omegaconf import OmegaConf
 from .encodec import EncodecModel
-from .lm import LMModel
+from .lm import LMModel, TorchAutocast
 from .seanet import SEANetDecoder
 from .vq import ResidualVectorQuantizer
 
+
 N_REPEAT = 4  # num (virtual batch_size) clones of audio sounds
 
 def _shift(x):
@@ -29,13 +31,7 @@ def _delete_param(cfg, full_name):
     if parts[-1] in cfg:
         del cfg[parts[-1]]
     OmegaConf.set_struct(cfg, True)
-
-
-
-def dict_from_config(cfg):
-    dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
-    return dct
-
+
 
 class AudioGen(torch.nn.Module):
 
@@ -44,21 +40,72 @@ class AudioGen(torch.nn.Module):
     def __init__(self):
 
        super().__init__()
-        self.
-
+        self.autocast = TorchAutocast(
+            enabled=True, device_type='cuda', dtype=torch.float16)
+        # Vocoder
+        _file_1 = hf_hub_download(
+            repo_id='facebook/audiogen-medium',
+            filename="compression_state_dict.bin",
+            cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
+            library_name="audiocraft",
+            library_version='1.3.0a1')  # found at __init__.py  # audiocraft.__version__
+        pkg = torch.load(_file_1, map_location='cpu')
+        # kwargs = OmegaConf.create(pkg['xp.cfg'])
+        # kwargs.device = 'cpu'
+        decoder = SEANetDecoder()
+        quantizer = ResidualVectorQuantizer()
+        self.compression_model = EncodecModel(decoder=decoder,
+                                              quantizer=quantizer,
+                                              frame_rate=50,
+                                              renormalize=False,
+                                              sample_rate=16000,
+                                              channels=1,
+                                              causal=False)  # .to(cfg.device)
+        # self.compression_model = self.get_compression_model(cfg)
+        self.compression_model.load_state_dict(pkg['best_state'], strict=False)  # ckpt also has unused encoder weights
+        self.resample_fn = torchaudio.transforms.Resample(16000, 24000)  # AudioGen = 16 kHz, StyleTTS2 = 24 kHz, MMS-TTS = 24 kHz
 
-
-
-
+        # T5 & LM
+        _file_2 = hf_hub_download(
+            repo_id='facebook/audiogen-medium',
+            filename="state_dict.bin",
+            cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
+            library_name="audiocraft",
+            library_version='1.3.0a1')  # found at __init__.py  # audiocraft.__version__
+        pkg = torch.load(_file_2, map_location='cpu')
+        cfg = OmegaConf.create(pkg['xp.cfg'])  # cfg is stored inside the torch .bin
+        _delete_param(cfg, 'conditioners.self_wav.chroma_stem.cache_path')
+        _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
+        _delete_param(cfg, 'conditioners.args.drop_desc_p')
+        _best = pkg['best_state']
+        _best['t5.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')
+        _best['t5.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')
+        self.lm = LMModel()  # .to(torch.float16)
+        self.lm.load_state_dict(pkg['best_state'],
+                                strict=True)
+
+        self.lm.eval()
+        self.compression_model.eval()
+
     def generate(self,
-
+                 prompt='dogs mewo',
                  duration=2.24,  # seconds of audio
                  ):
 
         with torch.no_grad():
-
-
-
+
+            with self.autocast:
+                # LM
+                gen_tokens = self.lm.generate(
+                    text_condition=[prompt] * N_REPEAT + [''] * N_REPEAT,  # '' is the null condition, e.g. ['trance', 'dogs meow', '', '']
+                    max_tokens=int(duration / (N_REPEAT * self.lm.n_draw) * self.compression_model.frame_rate))  # [bs, 4, 37 * self.lm.n_draw]
+
             x = self.compression_model.decode(gen_tokens, None)  # [bs, 1, 11840]
 
             x = x[:, 0, :]  # last samples have splash sounds; DISCARD 25000 last samples
@@ -74,94 +121,3 @@ class AudioGen(torch.nn.Module):
 
             print(x.abs().max(), 'MAX')
             return x / (x.abs().max() + 1e-7)
-
-    def get_quantizer(self, quantizer, cfg, dimension):
-        klass = {
-            'no_quant': None,
-            'rvq': ResidualVectorQuantizer
-        }[quantizer]
-        kwargs = dict_from_config(getattr(cfg, quantizer))
-        if quantizer != 'no_quant':
-            kwargs['dimension'] = dimension
-        return klass(**kwargs)
-
-    def get_encodec_autoencoder(self, cfg):
-        kwargs = dict_from_config(getattr(cfg, 'seanet'))
-        _ = kwargs.pop('encoder')
-        decoder_override_kwargs = kwargs.pop('decoder')
-        decoder_kwargs = {**kwargs, **decoder_override_kwargs}
-        decoder = SEANetDecoder(**decoder_kwargs)
-        return decoder
-
-    def get_compression_model(self, cfg):
-        """Instantiate a compression model."""
-        if cfg.compression_model == 'encodec':
-            kwargs = dict_from_config(getattr(cfg, 'encodec'))
-            quantizer_name = kwargs.pop('quantizer')
-            decoder = self.get_encodec_autoencoder(cfg)
-            quantizer = self.get_quantizer(quantizer_name, cfg, 128)
-            renormalize = kwargs.pop('renormalize', False)
-            # deprecated params
-            # print(f'{frame_rate=} {encoder.dimension=}')  -> frame_rate=50 encoder.dimension=128
-            kwargs.pop('renorm', None)
-            # print(kwargs)  -> {'autoencoder': 'seanet', 'sample_rate': 16000, 'channels': 1, 'causal': False}
-
-            return EncodecModel(decoder=decoder,
-                                quantizer=quantizer,
-                                frame_rate=50,
-                                renormalize=renormalize,
-                                sample_rate=16000,
-                                channels=1,
-                                causal=False
-                                ).to(cfg.device)
-        else:
-            raise KeyError(f"Unexpected compression model {cfg.compression_model}")
-
-    def load_compression_model(self):
-        file = hf_hub_download(
-            repo_id='facebook/audiogen-medium',
-            filename="compression_state_dict.bin",
-            cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
-            library_name="audiocraft",
-            library_version='1.3.0a1')  # found at __init__.py  # audiocraft.__version__
-        pkg = torch.load(file, map_location='cpu')
-        # if 'pretrained' in pkg:
-        #     return EncodecModel.get_pretrained(pkg['pretrained'], device='cpu')
-        cfg = OmegaConf.create(pkg['xp.cfg'])
-        cfg.device = 'cpu'
-        model = self.get_compression_model(cfg)
-        model.load_state_dict(pkg['best_state'], strict=False)  # ckpt also has unused encoder weights
-        # return model
-        self.compression_model = model
-
-    def load_lm_model(self):
-        file = hf_hub_download(
-            repo_id='facebook/audiogen-medium',
-            filename="state_dict.bin",
-            cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
-            library_name="audiocraft",
-            library_version='1.3.0a1')  # found at __init__.py  # audiocraft.__version__
-        pkg = torch.load(file,
-                         map_location='cpu')
-        cfg = OmegaConf.create(pkg['xp.cfg'])  # cfg is stored inside the torch .bin
-        _delete_param(cfg, 'conditioners.self_wav.chroma_stem.cache_path')
-        _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
-        _delete_param(cfg, 'conditioners.args.drop_desc_p')
-        print('CFG', cfg)
-        kwargs = dict_from_config(getattr(cfg, 'transformer_lm'))
-        print('kwargs', kwargs)
-        model = LMModel().to(getattr(torch, cfg.dtype))  # .to(cfg.device)
-        _best = pkg['best_state']
-        _best['condition_provider.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')
-        _best['condition_provider.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')
-        model.load_state_dict(pkg['best_state'], strict=True)
-        # model.cfg = cfg
-        self.lm = model.to(torch.float)
-
-        # def _flush(self):
-        #     self.lm._flush()  # already done in lm generate at end
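For orientation, a minimal usage sketch of the refactored wrapper, assuming the module layout in this commit (everything is now assembled inside AudioGen.__init__, including the weight downloads). The soundfile writer and the 'dogs barking' prompt are arbitrary choices for illustration, not part of the diff, and a CUDA device is assumed since the T5 is pinned to 'cuda:0'.

import soundfile
import torch
from audiocraft.builders import AudioGen

model = AudioGen()  # __init__ downloads facebook/audiogen-medium and builds both stages
with torch.no_grad():
    wav = model.generate(prompt='dogs barking', duration=2.24)  # [batch, samples] at 16 kHz per the diff's comments
wav = model.resample_fn(wav.cpu())  # 16 kHz -> 24 kHz, matching the Resample set up in __init__
soundfile.write('audiogen_out.wav', wav[0].numpy(), 24000)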
audiocraft/conditioners.py
DELETED
@@ -1,71 +0,0 @@
-import warnings
-from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
-import torch
-from torch import nn
-
-
-class T5Conditioner(nn.Module):
-
-    MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
-              "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
-              "google/flan-t5-xl", "google/flan-t5-xxl"]
-    MODELS_DIMS = {
-        "t5-small": 512,
-        "t5-base": 768,
-        "t5-large": 1024,
-        "t5-3b": 1024,
-        "t5-11b": 1024,
-        "google/flan-t5-small": 512,
-        "google/flan-t5-base": 768,
-        "google/flan-t5-large": 1024,
-        "google/flan-t5-3b": 1024,
-        "google/flan-t5-11b": 1024,
-    }
-
-    def __init__(self,
-                 name,
-                 output_dim,
-                 device='cuda:0',
-                 finetune=False):
-        print(f'{finetune=}')
-        assert name in self.MODELS, f"Unrecognized t5 model name (should be in {self.MODELS})"
-        super().__init__()
-        self.dim = self.MODELS_DIMS[name]
-        self.output_dim = output_dim
-        self.output_proj = nn.Linear(self.dim, output_dim)
-        self.device = device
-        self.name = name
-
-        self.t5_tokenizer = T5Tokenizer.from_pretrained(name, legacy=True)
-        t5 = T5EncoderModel.from_pretrained(name).eval()  # .train(mode=finetune)
-        if finetune:
-            self.t5 = t5
-        else:
-            # this makes sure that the t5 model is not part
-            # of the saved checkpoint
-            self.__dict__['t5'] = t5.to(device)
-
-    def tokenize(self, x):
-
-        entries = [xi if xi is not None else "" for xi in x]
-
-        inputs = self.t5_tokenizer(entries,
-                                   return_tensors='pt',
-                                   padding=True).to(self.device)
-
-        return inputs  # 'input_ids', 'attention_mask'
-
-    def forward(self, descriptions):
-
-        d = self.tokenize(descriptions)
-
-        with torch.no_grad():
-            embeds = self.t5(input_ids=d['input_ids'],
-                             attention_mask=d['attention_mask']
-                             ).last_hidden_state  # no kv-cache for txt conditioning
-        embeds = self.output_proj(embeds.to(self.output_proj.weight))
-        embeds = (embeds * d['attention_mask'].unsqueeze(-1))
-
-        return embeds  # , d['attention_mask']
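The deleted conditioner's final mask multiply is the detail worth keeping in mind, since the same zeroing reappears in the new T5 module inside lm.py: padded token positions still come out of output_proj with non-zero values, so they are zeroed explicitly before being used as cross-attention keys/values. A toy illustration with made-up shapes:

import torch

bs, T, dim = 2, 5, 8
embeds = torch.randn(bs, T, dim)                 # projected T5 hidden states
attention_mask = torch.tensor([[1, 1, 1, 0, 0],  # first entry padded to length 5
                               [1, 1, 1, 1, 1]])
embeds = embeds * attention_mask.unsqueeze(-1)   # [bs, T, 1] broadcasts over dim
assert embeds[0, 3:].abs().sum() == 0            # padded rows are exactly zero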
audiocraft/lm.py
CHANGED
@@ -1,10 +1,69 @@
 import torch
-import torch.nn.functional as F
 from audiocraft.transformer import StreamingTransformer
 from torch import nn
-from
-
+from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
+
+
+class TorchAutocast:
+
+    def __init__(self, enabled: bool, *args, **kwargs):
+        self.autocast = torch.autocast(*args, **kwargs) if enabled else None
+
+    def __enter__(self):
+        if self.autocast is None:
+            return
+        try:
+            self.autocast.__enter__()
+        except RuntimeError:
+            device = self.autocast.device
+            dtype = self.autocast.fast_dtype
+            raise RuntimeError(
+                f"There was an error autocasting with dtype={dtype} device={device}\n"
+                "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16"
+            )
+
+    def __exit__(self, *args, **kwargs):
+        if self.autocast is None:
+            return
+        self.autocast.__exit__(*args, **kwargs)
+
+
+class T5(nn.Module):
+
+    def __init__(self):
+        # run this from within the lm so it autocasts, matching the exact values of the T5 in official audiogen
+        super().__init__()
+
+        self.dim = 1024
+        self.output_dim = 1536
+        self.output_proj = nn.Linear(self.dim, self.output_dim)
+        self.autocast = TorchAutocast(enabled=True, device_type='cuda', dtype=torch.float)
+        self.t5_tokenizer = T5Tokenizer.from_pretrained('t5-large', legacy=True)
+        t5 = T5EncoderModel.from_pretrained('t5-large').train(mode=False)
+
+        # this makes sure that the t5 model is not part
+        # of the saved checkpoint
+        self.__dict__['t5'] = t5.to('cuda:0')
+
+    def forward(self, prompt):
+        with torch.set_grad_enabled(False), self.autocast:
+
+            bs = len(prompt) // 2
+
+            # txt 2 hidden
+            d = self.t5_tokenizer(prompt,
+                                  return_tensors='pt',
+                                  padding=True).to(self.output_proj.bias.device)
+            d['attention_mask'][bs:, :] = 0  # null condition: the t5 attn_mask should be zero
+
+            x = self.t5(input_ids=d['input_ids'],
+                        attention_mask=d['attention_mask']).last_hidden_state  # no kv-cache
+
+            # output_proj as float32
+            print('BEF PROJ', x[0, :, :].sum(), x[1, :, :].sum(), self.output_proj.weight.sum(), self.output_proj.weight.dtype, self.output_proj.bias.sum(), 'GEN\n\n143')
+            x = self.output_proj(x)  # nn.Linear() - produces a different result if there is no duplicate txt condition here
+            x[bs:, :, :] = 0  # venv/../site-packages/audiocraft/modules/conditioners.py -> tokenize()
+            return x
 
 class LMModel(nn.Module):
 
@@ -16,8 +75,7 @@ class LMModel(nn.Module):
                  hidden_scale=4,  # FFN of Transformer
                  ):
         super().__init__()
-        self.
-                 output_dim=dim)
+        self.t5 = T5()
         self.card = card  # 2048 ?
         self.n_draw = 6  # replicate the generation of each text in the batch this many times
         # a larger batch is more expensive than n_draw, as it re-runs the model bs times
@@ -49,14 +107,14 @@ class LMModel(nn.Module):
             cross_attention_src=condition_tensors,
             token_count=token_count
         )
-
+        print(out.sum(), out.dtype, 'TRSF out cust')
         logits = torch.stack([self.linears[k](self.out_norm(out)) for k in range(self.n_q)], dim=1)  # [2*bs, 4, 1, 2048]
-
+
         logits = 3 * logits[:bs, :, :, :] - 2 * logits[bs:, :, :, :]  # [3, 4, 1, 2048]
-
+        print(logits.sum(0).sum(2), logits.shape, 'SAMPL custom')
 
         # SAMPLE TOP K
-        k = 400  # 450 is nice sound, still train honk is clear!
+        k = 1  # 400  # 450 is nice sound, still train honk is clear!
         p = torch.softmax(logits, dim=3)
         top_k_value, _ = torch.topk(p, k, dim=3)  # [3, 4, 1, k]
         min_value_top_k = top_k_value[:, :, :, -1:]
@@ -67,36 +125,31 @@ class LMModel(nn.Module):
         p = p.reshape(bs * self.n_q, 2048)
         out = torch.multinomial(p,  # p=[bs, 2048], out=[bs, num_samples]
                                 num_samples=self.n_draw,
-                                replacement=
+                                replacement=True)  # [bs*4, self.n_draw]
         # print('DRAW', 'c', out)
         return out.reshape(bs, self.n_q, self.n_draw).transpose(1, 2)  # [bs=3 not 6, self.n_draw, 4]
 
     @torch.no_grad()
     def generate(self,
-
-
-
-
-        bs
-        text_condition = torch.cat(
-            [
-                text_condition,
-                torch.zeros_like(text_condition)
-            ], 0)
+                 max_tokens=None,
+                 text_condition=None
+                 ):
+        x = self.t5(text_condition)
+        bs = x.shape[0] // 2  # has null conditions - bs*2*N_REPEAT applies in builders.py
         out_codes = torch.full((bs,
                                 self.n_draw,
                                 4,
                                 4 + 3 + max_tokens),  # 4 + max_tokens + 4-1: sufficient to index the 1st antidiagonal of the 4x4 + 4 extra tokens
                                self.card,
                                dtype=torch.long,
-                               device=
+                               device=x.device)  # [bs, n_draw, 4, dur]
         # =========================================
         for offset in range(0, max_tokens + 4 - 1):  # max_tokens + n_q - 1
 
             # extract diagonal via indexing out_codes[[0, 1, 2, 3], [0, 1, 2, 3]]
             next_token = self.forward(out_codes[:, 0, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset][:, :, None],  # index diagonal & expand to [bs, n_q, dur=1]
                                       # gen_sequence[:, 0, :, offset-1:offset]  # DIAG-INDEXING for setting the lm prediction into gen_sequence. gen_sequence has to be un-delayed in the end (de-delayed for the vocoder), so only the lm input needs to see the delay; we can therefore feed by diag-gather, e.g. a[[0, 1, 2, 3], torch.tensor([0, 1, 2, 3]) + 5]. gen_sequence is indexed by vertical column and fed to the lm, while the lm's prediction is placed diagonally, with delay, into gen_sequence.
-                                      condition_tensors=
+                                      condition_tensors=x,  # utilisation of the attention mask of the txt condition?
                                       token_count=offset)  # [bs, n_draw, 4]
 
             # The fill of next_token should also be placed on the antidiagonal [not a column]
@@ -138,7 +191,7 @@ class LMModel(nn.Module):
                 pass  # print('No delete anti-diag')
 
             out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
-
+        print('\nFULL FINAL TOKENS UNFILT\n', out_codes[:, 0, :, 4:max_tokens+4], out_codes[0, 0, :, 4:max_tokens+4].shape)
         # EXTRACT COLUMNS, AS ALIGNMENT IS ALREADY DONE BY FILLING DIAGONALLY
         out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)  # [bs, 4, duration*n_draw] DISCARD FILL 2048
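The sampling path above combines conditional and null-condition logits (3*cond - 2*uncond, a classifier-free-guidance-style extrapolation with scale 3), keeps only the top-k probability mass, and draws n_draw samples per codebook per step. A self-contained sketch with toy tensors, shapes following the diff; note it uses the commented-out k=400 setting, since the k=1 left in the committed hunk looks like a temporary debug value:

import torch

bs, n_q, card, n_draw, k = 3, 4, 2048, 6, 400
logits = torch.randn(2 * bs, n_q, 1, card)          # first half cond, second half null-cond
logits = 3 * logits[:bs] - 2 * logits[bs:]          # guidance-style extrapolation

p = torch.softmax(logits, dim=3)
top_k_value, _ = torch.topk(p, k, dim=3)            # [bs, n_q, 1, k]
min_value_top_k = top_k_value[..., -1:]
p = torch.where(p < min_value_top_k, 0.0, p)        # zero everything below the k-th prob

p = p.reshape(bs * n_q, card)                       # multinomial renormalizes the weights internally
out = torch.multinomial(p, num_samples=n_draw, replacement=True)
out = out.reshape(bs, n_q, n_draw).transpose(1, 2)  # [bs, n_draw, n_q], as in the diff
print(out.shape)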
audiocraft/seanet.py
CHANGED
@@ -82,21 +82,36 @@ class SEANetResnetBlock(nn.Module):
 
 
 class SEANetDecoder(nn.Module):
-
-
-
-
+    # channels=1 dimension=128 n_filters=64 n_residual_layers=1 ratios=[8, 5, 4, 2]
+    # activation='ELU' activation_params={'alpha': 1.0} final_activation=None
+    # final_activation_params=None norm='weight_norm'
+    # norm_params={} kernel_size=7 last_kernel_size=7 residual_kernel_size=3 dilation_base=2
+    # causal=False pad_mode='constant'
+    # true_skip=True compress=2 lstm=2 disable_norm_outer_blocks=0 trim_right_ratio=1.0
+
+    def __init__(self,
+                 channels=1,
+                 dimension=128,
+                 n_filters=64,
+                 n_residual_layers=1,
+                 ratios=[8, 5, 4, 2],
+                 activation='ELU',
                  activation_params: dict = {'alpha': 1.0},
-                 final_activation
-                 final_activation_params
-                 norm
-
-
-
-
-
-
-
+                 final_activation=None,
+                 final_activation_params=None,
+                 norm='weight_norm',
+                 norm_params={},
+                 kernel_size=7,
+                 last_kernel_size=7,
+                 residual_kernel_size=3,
+                 dilation_base=2,
+                 causal=False,
+                 pad_mode='constant',
+                 true_skip=True,
+                 compress=2,
+                 lstm=2,
+                 disable_norm_outer_blocks=0,
+                 trim_right_ratio=1.0):
         super().__init__()
         self.dimension = dimension
         self.channels = channels
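The inlined defaults make the vocoder's geometry explicit: the decoder upsamples by the product of ratios, which is what pins the frame_rate=50 hard-coded in builders.py. A quick arithmetic check:

ratios = [8, 5, 4, 2]
hop = 1
for r in ratios:
    hop *= r
print(hop)          # 320 output samples per latent frame
print(16000 / hop)  # 50.0 -> frame_rate=50 at sample_rate=16000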
audiocraft/transformer.py
CHANGED
@@ -53,16 +53,13 @@ class StreamingMultiheadAttention(nn.Module):
         # 1st projection makes k, v (instantaneous)
         # Here, else is self-attention of audio with itself (above is cross-attention to txt)
 
-
         # HISTORY - DIFFERENT FOR EACH TRANSFORMER LAYER
 
-        projected = nn.functional.linear(query, self.in_proj_weight)
+        projected = nn.functional.linear(query, self.in_proj_weight, None)  # here we get different floating-point values from the official implementation
 
         bound_layout = "b h p t d"
         packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
         q, k, v = packed.unbind(dim=2)
-
-
         if self.k_history is not None:
             # flush
             if self.k_history.shape[2] > 71:
@@ -84,7 +81,7 @@ class StreamingMultiheadAttention(nn.Module):
 
 
         # KV COMPLETION ONLY ON SELF-ATTENTION
-
+
         x = torch.nn.functional.scaled_dot_product_attention(
             q, k, v, is_causal=False, dropout_p=0
         )
@@ -93,15 +90,24 @@ class StreamingMultiheadAttention(nn.Module):
         return x
 
 
-class StreamingTransformerLayer(nn.Module):
+class StreamingTransformerLayer(nn.TransformerEncoderLayer):
+
     def __init__(self,
                  d_model,
                  num_heads,
                  dim_feedforward):
-
-
-
+
+        super().__init__(d_model,
+                         num_heads,
+                         dim_feedforward=dim_feedforward,
+                         dropout=0.0,
+                         device='cuda',
+                         dtype=torch.float32,
+                         batch_first=True,
+                         norm_first=True,
+                         activation='gelu')
+        # super().__init__()
 
         self.self_attn = StreamingMultiheadAttention(embed_dim=d_model,
                                                      num_heads=num_heads)
@@ -116,20 +122,13 @@ class StreamingTransformerLayer(nn.Module):
 
 
     def forward(self,
-
+                x,
                 cross_attention_src=None):  # txt cond
-
-
-        x = src
-
+        # x = src
         x = x + self.self_attn(self.norm1(x))
-
-
-
-            query=self.norm_cross(x),
-            key=cross_attention_src,
-            value=cross_attention_src)  # txt condition
-
+        x = x + self.cross_attention(query=self.norm_cross(x),
+                                     key=cross_attention_src,
+                                     value=cross_attention_src)  # txt condition
         x = x + self.linear2(F.gelu(self.linear1(self.norm2(x))))
         return x
 
@@ -164,9 +163,12 @@ class StreamingTransformer(nn.Module):
                 x,
                 token_count=None,
                 cross_attention_src=None):
-
+        # cross_attention_src = (torch.arange(1536)[None, None, :] * torch.arange(6).reshape(2, 3, 1)).to(torch.float).to(x.device)
+        print(x.sum(), cross_attention_src[0, :, :].sum(), cross_attention_src[1, :, :].sum(), x.dtype, cross_attention_src.dtype, 'Xattnc')
         if self.positional_embedding in ['sin', 'sin_rope']:
-            pos_emb = create_sin_embedding(torch.
+            pos_emb = create_sin_embedding(torch.zeros(x.shape[0], 1, 1, device=x.device) + token_count,
+                                           1536,
+                                           max_period=self.max_period)
 
         x = x + pos_emb
         for j, lay in enumerate(self.layers):
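A hedged sketch of what the create_sin_embedding call above computes during incremental decoding: only one new token is fed per step, so the position is the running token_count rather than an arange over a full sequence. This standalone version follows the usual audiocraft-style formula (cosines in the first half of the channels, sines in the second); treat it as an illustration under that assumption, not as the library function itself.

import torch

def sin_embedding(positions, dim, max_period=10000.0):
    # positions: [bs, T, 1]; returns [bs, T, dim]
    half = dim // 2
    adim = torch.arange(half, device=positions.device).view(1, 1, -1)
    phase = positions / (max_period ** (adim / (half - 1)))
    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)

token_count = 7                                    # current decoding step
x = torch.zeros(2, 1, 1536)                        # [bs, 1 new token, d_model]
pos = torch.zeros(x.shape[0], 1, 1) + token_count  # same offset for the whole batch
x = x + sin_embedding(pos, 1536)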
audiocraft/vq.py
CHANGED
@@ -145,46 +145,27 @@ class ResidualVectorQuantization(nn.Module):
             layer = self.layers[i]
             quantized = layer.decode(indices)
             quantized_out = quantized_out + quantized
-            return quantized_out
-
-
-
-
-# ------------------------------------- END core_vq.py
+        return quantized_out
 
 
 class ResidualVectorQuantizer(nn.Module):
-
-
-
-
-        n_q (int): Number of residual vector quantizers used.
-        q_dropout (bool): Random quantizer dropout at train time.
-        bins (int): Codebook size.
-        decay (float): Decay for the exponential moving average over the codebooks.
-        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
-        kmeans_iters (int): Number of iterations used for k-means initialization.
-        threshold_ema_dead_code (int): Threshold for dead-code expiration. Replace any code
-            that has an exponential moving average cluster size less than the specified threshold
-            with a randomly selected vector from the current batch.
-        orthogonal_reg_weight (float): Orthogonal regularization weight.
-        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
-        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
-            for orthogonal regularization.
-    """
+
+    # dimension=128 n_q=4 q_dropout=False bins=2048 decay=0.99 kmeans_init=True kmeans_iters=50 threshold_ema_dead_code=2
+    # orthogonal_reg_weight=0.0 orthogonal_reg_active_codes_only=False orthogonal_reg_max_codes=None
+
     def __init__(
         self,
-        dimension
-        n_q
-        q_dropout
-        bins
-        decay
-        kmeans_init
-        kmeans_iters
-        threshold_ema_dead_code
-        orthogonal_reg_weight
-        orthogonal_reg_active_codes_only
-        orthogonal_reg_max_codes
+        dimension=128,
+        n_q=4,
+        q_dropout=False,
+        bins=2048,
+        decay=0.99,
+        kmeans_init=True,
+        kmeans_iters=50,
+        threshold_ema_dead_code=2,
+        orthogonal_reg_weight=0.0,
+        orthogonal_reg_active_codes_only=False,
+        orthogonal_reg_max_codes=None,
    ):
         super().__init__()
         self.max_n_q = n_q
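A toy illustration of the residual decode loop kept in the hunk above: each quantizer layer contributes a refinement embedding, and decoding simply sums the per-layer lookups (which is why the hunk's indentation fix matters: returning inside the loop would use only the first codebook). The random codebooks here are stand-ins for the trained ones.

import torch

n_q, bins, dim, T = 4, 2048, 128, 10
codebooks = [torch.randn(bins, dim) for _ in range(n_q)]
indices = torch.randint(0, bins, (n_q, T))  # one code per layer per frame

quantized_out = torch.zeros(T, dim)
for i in range(n_q):
    quantized_out = quantized_out + codebooks[i][indices[i]]  # residual sum across layers
print(quantized_out.shape)  # torch.Size([10, 128])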