pattern no overwrite 2048

Browse files

Files changed (4) hide show

audiocraft/builders.py +3 -3
audiocraft/lm.py +79 -84
audiocraft/transformer.py +5 -12
models.py +1 -2

audiocraft/builders.py CHANGED Viewed

@@ -11,7 +11,7 @@ from .lm import LMModel
 from .seanet import SEANetDecoder
 from .vq import ResidualVectorQuantizer
-N_REPEAT = 3  # num (virtual batch_size) clones of audio sounds
 def _shift(x):
     n = x.shape[0]
@@ -57,7 +57,7 @@ class AudioGen(nn.Module):
                  ):
         with torch.no_grad():
-            print(duration / N_REPEAT * self.compression_model.frame_rate, 'DURATION TOKENS AudioGen')
             gen_tokens = self.lm.generate(
                 descriptions=[descriptions] * N_REPEAT,
@@ -166,7 +166,7 @@ class AudioGen(nn.Module):
         _best = pkg['best_state']
         _best['condition_provider.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')
         _best['condition_provider.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')
-        model.load_state_dict(pkg['best_state'])
         # model.cfg = cfg
         self.lm = model.to(torch.float)

 from .seanet import SEANetDecoder
 from .vq import ResidualVectorQuantizer
+N_REPEAT = 7  # num (virtual batch_size) clones of audio sounds
 def _shift(x):
     n = x.shape[0]
                  ):
         with torch.no_grad():
+            print('\nCUSTOM\n',int(duration / N_REPEAT * self.compression_model.frame_rate), 'DURATION TOKENS AudioGen')
             gen_tokens = self.lm.generate(
                 descriptions=[descriptions] * N_REPEAT,
         _best = pkg['best_state']
         _best['condition_provider.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')
         _best['condition_provider.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')
+        model.load_state_dict(pkg['best_state'], strict=True)
         # model.cfg = cfg
         self.lm = model.to(torch.float)

audiocraft/lm.py CHANGED Viewed

@@ -7,7 +7,7 @@ import numpy as np
 class LMModel(nn.Module):
     def __init__(self,
                  n_q = 4,
                  card = 2048,
@@ -16,28 +16,25 @@ class LMModel(nn.Module):
                  hidden_scale = 4,  # FFN of Transformer
                  ):
         super().__init__()
-        self.condition_provider = T5Conditioner(name='t5-large',
                                                 output_dim=dim)
         self.card = card  # 2048 ?
-        self.n_draw = 1  # replicate so many times the generation of each text in batch
         # the batch is more expensive than n_draw as it re-runs the model bs times
         # n_draw just draws more phonemes from the multinomial - after running the lm
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
-        self.emb = nn.ModuleList([nn.Embedding(embed_dim, dim) for _ in range(n_q)])  # EMBEDDING HAS 2049
         self.transformer = StreamingTransformer(
-            d_model=dim,
-            num_heads=num_heads,
             dim_feedforward=int(hidden_scale * dim),
             num_layers=48,
             positional_embedding='sin',
             )
         self.out_norm = nn.LayerNorm(dim, eps=1e-5)
         self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=False) for _ in range(n_q)])  # LINEAR DOESNT HAVE 2049
-        # self._init_weights(weight_init, depthwise_init, zero_bias_init)
-        # self.__dict__['_fsdp'] = None
     def forward(self,
                 sequence,
@@ -47,116 +44,114 @@ class LMModel(nn.Module):
         bs, n_q, time_frames = sequence.shape # [bs, 4, time]
         input_ = sum([self.emb[k](sequence[:, k]) for k in range(self.n_q)])
-        # duplicate null condition (bs x 2)
-        out = self.transformer(torch.cat([input_, input_], 0),
                                cross_attention_src=condition_tensors,
                                token_count=token_count
                                )
-        if self.out_norm:
-            out = self.out_norm(out)
-        logits = torch.stack([self.linears[k](out) for k in range(self.n_q)], dim=1)#[2*bs,4,1,2048]
         logits = 3 * logits[:bs, :, :, :] - 2 * logits[bs:, :, :, :]  # [3, 4, 1, 2048]
         # SAMPLE TOP K
         k = 400  # 450 is nice sound still train honk is clear!
         p = torch.softmax(logits, dim=3)
         top_k_value, _ = torch.topk(p, k, dim=3)  # [3, 4, 1, k]
-        min_value_top_k = top_k_value[:, :, :, -1:]
         p *= (p >= min_value_top_k).float()   # zero low probs
         p.div_(p.sum(dim=-1, keepdim=True))   # renormalise on non-zero probs
         # BRING THE nq = 4 IN BATCH
         p = p.reshape(bs * self.n_q, 2048)
         out = torch.multinomial(p,  # p=[bs,2048], out=[bs, num_samples]
                                 num_samples=self.n_draw,
                                 replacement=True)  # [bs*4, self.n_draw]
         return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2)  # [bs=3not6, self.n_draw, 4]
     @torch.no_grad()
     def generate(self,
                  descriptions = ['windy day', 'rain storm'],
-                 max_tokens = 256):
         text_condition = self.condition_provider(descriptions)
-        # NULL CONDITION
-        # text_condition = cfg_conditions['description'][0]
         bs, _, _ = text_condition.shape
         text_condition = torch.cat(
             [
                 text_condition,
                 torch.zeros_like(text_condition)
             ], 0)
-        out_codes = torch.full((bs, self.n_draw, 4, 4 + max_tokens),  # 4 + max_tokens to have sufficient to index the 1st antidiagonal of 4x4
                                self.card,
-                               dtype=torch.long,device=text_condition.device) # bs,n_draw,4,dur
-        for offset in range(0, max_tokens):
-            # GEN_SEQUENCE has fillers start & end = 2048
-            # [6,4,74] = gen_sequence = torch.tensor([[[
-            #   [2048,    0,    1,    2,    3,    4,    5,    6, 2048, 2048, 2048],
-            #   [2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048, 2048],
-            #   [2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048],
-            #   [2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6]],
-            #
-            # out codes = un-delayed
-            #
-            # tensor([[0, 1, 2, 3, 4, 5, 6],
-            #         [0, 1, 2, 3, 4, 5, 6],
-            #         [0, 1, 2, 3, 4, 5, 6],
-            #         [0, 1, 2, 3, 4, 5, 6]])
-            #
-            # LM "sees" 4 delayed tokens (diagonal extract)
-            #
-            # SO THE FIRST pack of 4 tokens fed TO LM is [2048, 2048, 2048, 2048]
-            #
-            # IF WE START WITH
-            # 2048 2048 2048 2048
-            # 2048 2048 2048 2048
-            # 2048 2048 2048 2048
-            # 2048 2048 2048 2048
-            #
-            # THE 2nd token pack of 4 fed to LM is [10, 20, 50, 7]
-            #
-            # 2048 2048 2048 2048 10
-            # 2048 2048 2048 2048 20
-            # 2048 2048 2048 2048 50
-            # 2048 2048 2048 2048 7
-            #
             # extract diagonal via indexing out_codes[ [0, 1, 2, 3], [0, 1, 2, 3] ]
-            #
-            # forward duplicates the query to nullcond - then cfg & returns deduplicate token
-            # only 0 (1st token of n_draw is continued by LM call - rest is supersampled in torch.multinomial)
-            # feeds the antidiagonal to LM
             next_token = self.forward(out_codes[:, 0, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset][:, :, None],  # index diagonal & exapnd to [bs, n_q, dur=1]
                                       #gen_sequence[:, 0, :, offset-1:offset],  # DIAGINDEXING for setting prediction of lm into gen_sequence THE GENSEQUENCE has to be un-delayed in the end [Because it has to be de-delayed for the vocoder then is actually only the lm input that requires to see the delay thus we could just feed by diaggather] so it matches gen_codes -1 a[[0, 1, 2, 3], torch.tensor([0, 1, 2, 3]) + 5]  the gen_sequence is indexed by vertical column and fed to lm however the prediction of lm is place diagonally with delay to the gen_sequence
                                       condition_tensors=text_condition,  # utilisation of the attention mask of txt condition ?
-                                      token_count=offset)  # [bs, 4, 1, 2048]
-            out_codes[:, :, :, offset + 4] = next_token  # [bs, n_draw, 4, duration]
-        # DISCARD FILL
-        out_codes = out_codes[:, :, :, 4:].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)   # [bs, 4, duration*n_draw] DISCARD FILL 2048
-        # Clear k/v cache (Different kv is saved by every 48x selfattn)
         for lay in self.transformer.layers:
              lay.self_attn.k_history = None
-             lay.self_attn.v_history = None
         return out_codes  # SKIP THE 4 fill 2048 bs*n_draw, duration -> repeat/shift in api.py

 class LMModel(nn.Module):
     def __init__(self,
                  n_q = 4,
                  card = 2048,
                  hidden_scale = 4,  # FFN of Transformer
                  ):
         super().__init__()
+        self.condition_provider = T5Conditioner(name='t5-large',
                                                 output_dim=dim)
         self.card = card  # 2048 ?
+        self.n_draw = 6  # replicate so many times the generation of each text in batch
         # the batch is more expensive than n_draw as it re-runs the model bs times
         # n_draw just draws more phonemes from the multinomial - after running the lm
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
+        self.emb = nn.ModuleList([nn.Embedding(embed_dim, dim) for _ in range(n_q)])  # EMBEDDING HAS 2049
         self.transformer = StreamingTransformer(
+            d_model=dim,
+            num_heads=num_heads,
             dim_feedforward=int(hidden_scale * dim),
             num_layers=48,
             positional_embedding='sin',
             )
         self.out_norm = nn.LayerNorm(dim, eps=1e-5)
         self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=False) for _ in range(n_q)])  # LINEAR DOESNT HAVE 2049
     def forward(self,
                 sequence,
         bs, n_q, time_frames = sequence.shape # [bs, 4, time]
         input_ = sum([self.emb[k](sequence[:, k]) for k in range(self.n_q)])
+        out = self.transformer(torch.cat([input_, input_], 0),  # duplicate null condition (bs x 2) for ClassifierFreeGuidance
                                cross_attention_src=condition_tensors,
                                token_count=token_count
                                )
+        logits = torch.stack([self.linears[k](self.out_norm(out)) for k in range(self.n_q)], dim=1)#[2*bs,4,1,2048]
         logits = 3 * logits[:bs, :, :, :] - 2 * logits[bs:, :, :, :]  # [3, 4, 1, 2048]
         # SAMPLE TOP K
         k = 400  # 450 is nice sound still train honk is clear!
         p = torch.softmax(logits, dim=3)
         top_k_value, _ = torch.topk(p, k, dim=3)  # [3, 4, 1, k]
+        min_value_top_k = top_k_value[:, :, :, -1:]
         p *= (p >= min_value_top_k).float()   # zero low probs
         p.div_(p.sum(dim=-1, keepdim=True))   # renormalise on non-zero probs
         # BRING THE nq = 4 IN BATCH
         p = p.reshape(bs * self.n_q, 2048)
         out = torch.multinomial(p,  # p=[bs,2048], out=[bs, num_samples]
                                 num_samples=self.n_draw,
                                 replacement=True)  # [bs*4, self.n_draw]
+        # print('DRAW','c', out)
         return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2)  # [bs=3not6, self.n_draw, 4]
     @torch.no_grad()
     def generate(self,
                  descriptions = ['windy day', 'rain storm'],
+                 max_tokens = None):
         text_condition = self.condition_provider(descriptions)
         bs, _, _ = text_condition.shape
         text_condition = torch.cat(
             [
                 text_condition,
                 torch.zeros_like(text_condition)
             ], 0)
+        out_codes = torch.full((bs,
+                                self.n_draw,
+                                4,
+                                4 + 3 + max_tokens),  # 4 + max_tokens + 4-1 to have sufficient to index the 1st antidiagonal of 4x4 + 4 xtra tokens
                                self.card,
+                               dtype=torch.long,
+                               device=text_condition.device) # [bs, n_draw, 4, dur]
+        # =========================================
+        for offset in range(0, max_tokens + 4 - 1):  # max_tokens + n_q - 1
             # extract diagonal via indexing out_codes[ [0, 1, 2, 3], [0, 1, 2, 3] ]
             next_token = self.forward(out_codes[:, 0, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset][:, :, None],  # index diagonal & exapnd to [bs, n_q, dur=1]
                                       #gen_sequence[:, 0, :, offset-1:offset],  # DIAGINDEXING for setting prediction of lm into gen_sequence THE GENSEQUENCE has to be un-delayed in the end [Because it has to be de-delayed for the vocoder then is actually only the lm input that requires to see the delay thus we could just feed by diaggather] so it matches gen_codes -1 a[[0, 1, 2, 3], torch.tensor([0, 1, 2, 3]) + 5]  the gen_sequence is indexed by vertical column and fed to lm however the prediction of lm is place diagonally with delay to the gen_sequence
                                       condition_tensors=text_condition,  # utilisation of the attention mask of txt condition ?
+                                      token_count=offset)  # [bs, n_draw, 4]
+            # The predicted next_token must also be placed on the antidiagonal [not on a column]
+            #   Do not overwrite the 2048 fill of the upper/lower triangles (TRIU/TRIL = START/END) => do not fill them with predicted tokens
+            # The 0-th antidiagonal should stay full of card = [2048, 2048, 2048, 2048]
+            #
+            #   [2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048, 2048, 2048],
+            #   [2048, 2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048, 2048],
+            #   [2048, 2048, 2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048],
+            #   [2048, 2048, 2048, 2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6]]
+            # No overwriting: reset to 2048 the next_token entries that would land on START/END fill positions
+            if offset == 0:
+                next_token[:, :, 1:4] = 2048  # self.card
+            elif offset == 1:
+                next_token[:, :, 2:4] = 2048
+            elif offset == 2:
+                next_token[:, :, 3:4] = 2048
+            elif offset == max_tokens:
+                next_token[:, :, 0:1] = 2048
+            elif offset == (max_tokens + 1):
+                next_token[:, :, 0:1] = 2048
+            elif offset == (max_tokens + 2):
+                next_token[:, :, 0:2] = 2048
+            else:  # offset 3,4,5,6,7...... max_tokens-1   # FILL Complete n_q = 4 ANTIDIAGONAL ENTRIES
+                pass #print('No delete anti-diag')
+            out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
+            # print(out_codes.shape, f'{offset=} \n', out_codes[0:1, 0:1, :, :],'\n______________L_____________________\n')
+        # align 4-rows (shift by 1)
+        # print(out_codes[0, 0, :, :])  # do we pass 2048 to Seanet? That will result in AtenIndexingError as it has no 2048
+        # out_codes = torch.cat([out_codes[:, :, 0:1, 4:max_tokens+4],  # first row starts to be filled at offset = 4
+        #                        out_codes[:, :, 1:2, 3:max_tokens+3],
+        #                        out_codes[:, :, 2:3, 2:max_tokens+2],
+        #                        out_codes[:, :, 3:4, 1:max_tokens+1]], 2)
+        print('\n_____ALIGN____\n',  out_codes[0, 0, :, 4:max_tokens+4])  # do we pass 2048 to Seanet - There wil result in AtenIndexingError as it has no 2048
+        out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)   # [bs, 4, duration*n_draw] DISCARD FILL 2048
         for lay in self.transformer.layers:
              lay.self_attn.k_history = None
+             lay.self_attn.v_history = None
         return out_codes  # SKIP THE 4 fill 2048 bs*n_draw, duration -> repeat/shift in api.py

audiocraft/transformer.py CHANGED Viewed

@@ -180,21 +180,14 @@ class StreamingTransformer(nn.Module):
                 x,
                 token_count=None,
                 cross_attention_src=None):
         B, T, C = x.shape
         if self.positional_embedding in ['sin', 'sin_rope']:
             positions = torch.arange(T, device=x.device).view(1, -1, 1)
-            positions = positions + token_count  #offsets.view(-1, 1, 1)
-            pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
             x = x + pos_emb
         for j, lay in enumerate(self.layers):
-            # print(f'Transf Layer{j}      {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
-            x = lay(x, cross_attention_src=cross_attention_src)  # cross_attention_src = txt-cond
-            # each layer (mha) keeps history of its own k,v for all tokens
         return x

                 x,
                 token_count=None,
                 cross_attention_src=None):
         B, T, C = x.shape
         if self.positional_embedding in ['sin', 'sin_rope']:
             positions = torch.arange(T, device=x.device).view(1, -1, 1)
+            pos_emb = create_sin_embedding(positions + token_count, C, max_period=self.max_period, dtype=x.dtype)
             x = x + pos_emb
         for j, lay in enumerate(self.layers):
+            # print(f'Transf Layer c{j}      {pos_emb.sum()=} {pos_emb.shape=}{x.sum()=}___________________')
+            x = lay(x, cross_attention_src=cross_attention_src)  # cross_attention_src = txt-cond x audio
+                                                                 # self attn = audio x audio
+                                                                 # Every layer (mha) keeps its own kv cache
         return x

models.py CHANGED Viewed

@@ -357,8 +357,7 @@ class DurationEncoder(nn.Module):
         for block in self.lstms:
             if isinstance(block, AdaLayerNorm):
-                print(f'\n=========ENTER ADALAYNORM L479 models.py {x.shape=}, {style.shape=}')
                 x = block(x, style)   # [bs, 75, 512]
                 x = torch.cat([x.transpose(1, 2), style], axis=1) # [bs, 512, 75]

         for block in self.lstms:
             if isinstance(block, AdaLayerNorm):
+                # non-LSTM blocks (AdaLayerNorm) enter here
                 x = block(x, style)   # [bs, 75, 512]
                 x = torch.cat([x.transpose(1, 2), style], axis=1) # [bs, 512, 75]