antidiagonal fast delay pattern

Browse files

Files changed (3) hide show

audiocraft/builders.py +3 -3
audiocraft/codebooks_patterns.py +0 -285
audiocraft/lm.py +57 -53

audiocraft/builders.py CHANGED Viewed

@@ -64,14 +64,14 @@ class AudioGen(nn.Module):
         with torch.no_grad():
             gen_tokens = self.lm.generate(
                 descriptions=[descriptions]*3,
-                max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
             x = self.compression_model.decode(gen_tokens, None)   #[bs, 1, 11840]
-            x = x[:, 0, :-250]  # last samples have splash sounds DISCARD 25000 last samples
             # AudioGen 16KHZ / StyleTTS2 24 KHz / MMSTTS 24 KHz
-            # x = self.resample_fn(x)
             # batch size = different sounds for same txt

         with torch.no_grad():
             gen_tokens = self.lm.generate(
                 descriptions=[descriptions]*3,
+                max_tokens=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
             x = self.compression_model.decode(gen_tokens, None)   #[bs, 1, 11840]
+            x = x[:, 0, :]  # keep all samples: the earlier [:-250] trim of trailing splash sounds is no longer applied
             # AudioGen 16KHZ / StyleTTS2 24 KHz / MMSTTS 24 KHz
+            x = self.resample_fn(x)
             # batch size = different sounds for same txt

audiocraft/codebooks_patterns.py DELETED Viewed

@@ -1,285 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from collections import namedtuple
-from dataclasses import dataclass
-import logging
-import typing as tp
-import torch
-LayoutCoord = namedtuple('LayoutCoord', ['t', 'q'])  # (timestep, codebook index)
-PatternLayout = tp.List[tp.List[LayoutCoord]]  # Sequence of coordinates
-logger = logging.getLogger(__name__)
-@dataclass
-class Pattern:
-    """Base implementation of a pattern over a sequence with multiple codebooks.
-    The codebook pattern consists in a layout, defining for each sequence step
-    the list of coordinates of each codebook timestep in the resulting interleaved sequence.
-    The first item of the pattern is always an empty list in order to properly insert a special token
-    to start with. For convenience, we also keep track of ``n_q`` the number of codebooks used for the pattern
-    and ``timesteps`` the number of timesteps corresponding to the original sequence.
-    The pattern provides convenient methods to build and revert interleaved sequences from it:
-    ``build_pattern_sequence`` maps a given a dense input tensor of multi-codebook sequence from [B, K, T]
-        to the interleaved sequence of shape [B, K, S] applying the pattern, with B being the batch size,
-        K being the number of codebooks, T the number of original timesteps and S the number of sequence steps
-        for the output sequence. The unfilled positions are replaced with a special token and the built sequence
-        is returned along with a mask indicating valid tokens.
-    ``revert_pattern_sequence`` maps back an interleaved sequence of shape [B, K, S] to the original alignment
-        of codebooks across timesteps to an output tensor of shape [B, K, T], using again a special token and a mask
-        to fill and specify invalid positions if needed.
-    See the dedicated methods for more details.
-    """
-    # Pattern layout, for each sequence step, we have a list of coordinates
-    # corresponding to the original codebook timestep and position.
-    # The first list is always an empty list in order to properly insert
-    # a special token to start with.
-    layout: PatternLayout
-    timesteps: int
-    n_q: int
-    def __post_init__(self):
-        # assert len(self.layout) > 0
-        # self._validate_layout()   #
-        self._build_reverted_sequence_scatter_indexes = self._build_reverted_sequence_scatter_indexes
-        self._build_pattern_sequence_scatter_indexes = self._build_pattern_sequence_scatter_indexes
-        print("New pattern, time steps: %d, sequence steps: %d", self.timesteps, len(self.layout))
-    @property
-    def max_delay(self):
-        max_t_in_seq_coords = 0
-        for seq_coords in self.layout[1:]:
-            for coords in seq_coords:
-                max_t_in_seq_coords = max(max_t_in_seq_coords, coords.t + 1)
-        return max_t_in_seq_coords - self.timesteps
-    @property
-    def valid_layout(self):
-        valid_step = len(self.layout) - self.max_delay
-        return self.layout[:valid_step]
-    def starts_with_special_token(self):
-        return self.layout[0] == []
-    def get_sequence_coords_with_timestep(self, t: int, q: tp.Optional[int] = None):
-        """Get codebook coordinates in the layout that corresponds to the specified timestep t
-        and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step
-        and the actual codebook coordinates.
-        """
-        assert t <= self.timesteps, "provided timesteps is greater than the pattern's number of timesteps"
-        if q is not None:
-            assert q <= self.n_q, "provided number of codebooks is greater than the pattern's number of codebooks"
-        coords = []
-        for s, seq_codes in enumerate(self.layout):
-            for code in seq_codes:
-                if code.t == t and (q is None or code.q == q):
-                    coords.append((s, code))
-        return coords
-    def get_steps_with_timestep(self, t: int, q: tp.Optional[int] = None) -> tp.List[int]:
-        return [step for step, coords in self.get_sequence_coords_with_timestep(t, q)]
-    def get_first_step_with_timesteps(self, t: int, q: tp.Optional[int] = None) -> tp.Optional[int]:
-        steps_with_timesteps = self.get_steps_with_timestep(t, q)
-        return steps_with_timesteps[0] if len(steps_with_timesteps) > 0 else None
-    def _build_pattern_sequence_scatter_indexes(self, timesteps: int, n_q: int, keep_only_valid_steps: bool,
-                                                device: tp.Union[torch.device, str] = 'cpu'):
-        """Build scatter indexes corresponding to the pattern, up to the provided sequence_steps.
-        Args:
-            timesteps (int): Maximum number of timesteps steps to consider.
-            keep_only_valid_steps (bool): Restrict the pattern layout to match only valid steps.
-            device (torch.device or str): Device for created tensors.
-        Returns:
-            indexes (torch.Tensor): Indexes corresponding to the sequence, of shape [K, S].
-            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes, of shape [K, S].
-        """
-        assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
-        assert timesteps <= self.timesteps, "invalid number of timesteps used to build the sequence from the pattern"
-        # use the proper layout based on whether we limit ourselves to valid steps only or not,
-        # note that using the valid_layout will result in a truncated sequence up to the valid steps
-        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
-        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
-        indexes = torch.zeros(n_q, len(ref_layout), dtype=torch.long).numpy()
-        mask = torch.zeros(n_q, len(ref_layout), dtype=torch.bool).numpy()
-        # fill indexes with last sequence step value that will correspond to our special token
-        # the last value is n_q * timesteps as we have flattened z and append special token as the last token
-        # which will correspond to the index: n_q * timesteps
-        indexes[:] = n_q * timesteps
-        # iterate over the pattern and fill scattered indexes and mask
-        for s, sequence_coords in enumerate(ref_layout):
-            for coords in sequence_coords:
-                if coords.t < timesteps:
-                    indexes[coords.q, s] = coords.t + coords.q * timesteps
-                    mask[coords.q, s] = 1
-        indexes = torch.from_numpy(indexes).to(device)
-        mask = torch.from_numpy(mask).to(device)
-        return indexes, mask
-    def build_pattern_sequence(self,
-                               z,
-                               special_token,
-                               keep_only_valid_steps=False):
-        B, K, T = z.shape
-        indexes, mask = self._build_pattern_sequence_scatter_indexes(
-            T, K, keep_only_valid_steps=keep_only_valid_steps, device=str(z.device)
-        )
-        z = z.view(B, -1)
-        # we append the special token as the last index of our flattened z tensor
-        z = torch.cat([z, torch.zeros_like(z[:, :1]) + special_token], dim=1)
-        values = z[:, indexes.view(-1)]
-        values = values.view(B, K, indexes.shape[-1])
-        # print(values.shape, indexes.shape, mask.shape, 'BUILD PATTERN')
-        # --
-        # torch.Size([1, 4, 39]) torch.Size([4, 39]) torch.Size([4, 39]) BUILD PATTERN
-        return values, indexes, mask
-    def _build_reverted_sequence_scatter_indexes(self, sequence_steps: int, n_q: int,
-                                                 keep_only_valid_steps: bool = False,
-                                                 is_model_output: bool = False,
-                                                 device: tp.Union[torch.device, str] = 'cpu'):
-        """Builds scatter indexes required to retrieve the original multi-codebook sequence
-        from interleaving pattern.
-        Args:
-            sequence_steps (int): Sequence steps.
-            n_q (int): Number of codebooks.
-            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
-                Steps that are beyond valid steps will be replaced by the special_token in that case.
-            is_model_output (bool): Whether to keep the sequence item corresponding to initial special token or not.
-            device (torch.device or str): Device for created tensors.
-        Returns:
-            indexes (torch.Tensor): Indexes for reconstructing the output, of shape [K, T].
-            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
-        """
-        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
-        # TODO(jade): Do we want to further truncate to only valid timesteps here as well?
-        timesteps = self.timesteps
-        assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
-        assert sequence_steps <= len(ref_layout), \
-            f"sequence to revert is longer than the defined pattern: {sequence_steps} > {len(ref_layout)}"
-        # ensure we take the appropriate indexes to keep the model output from the first special token as well
-        if is_model_output and self.starts_with_special_token():
-            ref_layout = ref_layout[1:]
-        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
-        indexes = torch.zeros(n_q, timesteps, dtype=torch.long).numpy()
-        mask = torch.zeros(n_q, timesteps, dtype=torch.bool).numpy()
-        # fill indexes with last sequence step value that will correspond to our special token
-        indexes[:] = n_q * sequence_steps
-        for s, sequence_codes in enumerate(ref_layout):
-            if s < sequence_steps:
-                for code in sequence_codes:
-                    if code.t < timesteps:
-                        indexes[code.q, code.t] = s + code.q * sequence_steps  # oh the jump - so are the codes linearised
-                        mask[code.q, code.t] = 1
-        indexes = torch.from_numpy(indexes).to(device)
-        mask = torch.from_numpy(mask).to(device)
-        return indexes, mask
-    def revert_pattern_sequence(self,
-                                s,
-                                special_token,
-                                keep_only_valid_steps=False):
-        """SPECIAL TOKEN NOT DELETED HERE !!!!
-        Args:
-            s (torch.Tensor): Interleaved sequence tensor obtained from the pattern, of shape [B, K, S].
-            special_token (int or float): Special token used to fill non-pattern coordinates in the new sequence.
-        Returns:
-            values (torch.Tensor) : Interleaved sequence matching the pattern, of shape [B, K, T] with T
-            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, T].
-            mask (torch.Tensor)   : Mask corresponding to indexes that matches valid indexes of shape [K, T].
-                                    shall this mask delete special token id;
-        """
-        B, K, S = s.shape
-        indexes, mask = self._build_reverted_sequence_scatter_indexes(
-            S, K, keep_only_valid_steps, is_model_output=False, device=str(s.device)
-        )
-        s = s.view(B, -1)
-        # we append the special token as the last index of our flattened z tensor
-        s = torch.cat([s, torch.zeros_like(s[:, :1]) + special_token], dim=1)
-        values = s[:, indexes.view(-1)]
-        values = values.view(B, K, indexes.shape[-1])
-        return values, indexes, mask
-class DelayedPatternProvider():
-    """Provider for delayed pattern across delayed codebooks.
-    Codebooks are delayed in the sequence and sequence steps will contain codebooks
-    from different timesteps.
-    Example:
-        Taking timesteps=4 and n_q=3, delays=None, the multi-codebook sequence:
-        [[1, 2, 3, 4],
-        [1, 2, 3, 4],
-        [1, 2, 3, 4]]
-        The resulting sequence obtained from the returned pattern is:
-        [[S, 1, 2, 3, 4],
-        [S, S, 1, 2, 3],
-        [S, S, S, 1, 2]]
-        (with S being a special token)
-    Args:
-        n_q (int): Number of codebooks.
-        delays (list of int, optional): Delay for each of the codebooks.
-            If delays not defined, each codebook is delayed by 1 compared to the previous one.
-        flatten_first (int): Flatten the first N timesteps.
-        empty_initial (int): Prepend with N empty list of coordinates.
-    """
-    def __init__(self,
-                 n_q,
-                 delays,
-                 flatten_first=0,
-                 empty_initial=0):
-        self.n_q = n_q
-        if delays is None:
-            delays = list(range(n_q))
-        print(f'{delays=}  PATTERN __ini')
-        self.delays = delays
-        self.flatten_first = flatten_first
-        self.empty_initial = empty_initial
-        assert len(self.delays) == self.n_q
-        assert sorted(self.delays) == self.delays
-    def get_pattern(self, timesteps):
-        # get_pattern for desired length?
-        # print(f'{timesteps=} GET_PATTERn')   # 35
-        # print(f'{self.empty_initial=}')
-        omit_special_token = self.empty_initial < 0   # False as initial = 0 unset
-        out: PatternLayout = [] if omit_special_token else [[]]
-        max_delay = max(self.delays)
-        if self.empty_initial:
-            out += [[] for _ in range(self.empty_initial)]
-        if self.flatten_first:
-            for t in range(min(timesteps, self.flatten_first)):
-                for q in range(self.n_q):
-                    out.append([LayoutCoord(t, q)])
-        for t in range(self.flatten_first, timesteps + max_delay):
-            v = []
-            for q, delay in enumerate(self.delays):
-                t_for_q = t - delay
-                if t_for_q >= self.flatten_first:
-                    v.append(LayoutCoord(t_for_q, q))
-            out.append(v)
-        # print(self.n_q, 'N_Q in PATTERN')  # 4 N_Q in PATTERN
-        return Pattern(out, n_q=self.n_q, timesteps=timesteps)

audiocraft/lm.py CHANGED Viewed

@@ -2,7 +2,6 @@ import torch
 import torch.nn.functional as F
 from audiocraft.transformer import StreamingTransformer
 from torch import nn
-from audiocraft.codebooks_patterns import DelayedPatternProvider
 from audiocraft.conditioners import T5Conditioner
 import numpy as np
@@ -26,7 +25,6 @@ class LMModel(nn.Module):
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
-        self.pattern_provider = DelayedPatternProvider()
         self.emb = nn.ModuleList([nn.Embedding(embed_dim, dim) for _ in range(n_q)])  # EMBEDDING HAS 2049
         self.transformer = StreamingTransformer(
             d_model=dim,
@@ -45,14 +43,18 @@ class LMModel(nn.Module):
                 sequence,
                 condition_tensors=None,
                 token_count=None):
-        # takes bs=3 duplicates null condition to bs=6 splits logits to cfg returns bs=3
-        bs, _, _ = sequence.shape         # sequence [bs, n_draw,4]
         input_ = sum([self.emb[k](sequence[:, k]) for k in range(self.n_q)])
         out = self.transformer(torch.cat([input_, input_], 0),
                                cross_attention_src=condition_tensors,
-                               token_count=token_count)
         if self.out_norm:
             out = self.out_norm(out)
@@ -79,7 +81,7 @@ class LMModel(nn.Module):
     @torch.no_grad()
     def generate(self,
                  descriptions = ['windy day', 'rain storm'],
-                 max_gen_len = 256):
         text_condition = self.condition_provider(descriptions)
@@ -95,64 +97,66 @@ class LMModel(nn.Module):
-        pattern = self.pattern_provider.get_pattern(max_gen_len)
-        gen_codes = torch.full((bs,
-                                self.n_q,
-                                max_gen_len), -1, dtype=torch.long,
-                                device=text_condition.device)
-        gen_sequence, _, mask = pattern.build_pattern_sequence(gen_codes, self.card)
-        _, _, audiodur = gen_sequence.shape  # bs, 4, 7=audiodur
-        # print(gen_sequence.shape, mask.shape, 'F')  # mask has no batch = [4,audio_duration]
-        # print(f'{mask=}')
-        #
-        # torch.Size([3, 4, 7]) torch.Size([4, 7]) F
-        # mask=tensor([[False,  True,  True,  True, False, False, False],
-        #              [False, False,  True,  True,  True, False, False],
-        #              [False, False, False,  True,  True,  True, False],
-        #              [False, False, False, False,  True,  True,  True]], device='cuda:0')
-        mask = mask[None, None, :, :].repeat(bs, self.n_draw, 1, 1)  # [bs, n_draw, 4, audio duration]
-        gen_sequence = gen_sequence[:, None, :, :].repeat(1, self.n_draw, 1, 1)  # bs,n_draw,4,dur
-        for offset in range(1, audiodur):
-            # forward duplicates the query to nullcond - then cfg & returns deduplicate token
-            next_token = self.forward(gen_sequence[:, 0, :, offset-1:offset],  # DIAGINDEXING for setting prediction of lm into gen_sequence THE GENSEQUENCE has to be un-delayed in the end [Because it has to be de-delayed for the vocoder then is actually only the lm input that requires to see the delay thus we could just feed by diaggather] so it matches gen_codes -1 a[[0, 1, 2, 3], torch.tensor([0, 1, 2, 3]) + 5]  the gen_sequence is indexed by vertical column and fed to lm however the prediction of lm is place diagonally with delay to the gen_sequence
                                       condition_tensors=text_condition,  # utilisation of the attention mask of txt condition ?
                                       token_count=offset-1)  # [bs, 4, 1, 2048]
-            # MASK is not full 1---- HAS 4 x audioduration PATTERN
-            m = mask[:, :, :, offset]
-            next_token[~m] = self.card
-            gen_sequence[:, :, :, offset] = torch.where(
-                gen_sequence[:, :, :, offset] == -1, #unknown_token,
-                next_token,
-                gen_sequence[:, :, :, offset]
-            )
-        # 1. reshape n_draw as bs * n_draw
-        # 2. invert all short-sequences
-        # 3. reshape bs * n_draw -> bs, n_draw * audiodur ELONGATION
-        out_codes = pattern.revert_pattern_sequence(
-            gen_sequence.reshape(bs * self.n_draw, 4, audiodur),  # [3,8,4,7]
-            special_token=-1)
-        # print(f'{gen_sequence.shape=} {out_codes.shape=} Ha')  # REVERT PATTERN REDUCES DURATION?
-        _, _, new_len = out_codes.shape                        # 4 IS PRESERVED AFTER REVERT!
-        out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
-        out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
-        print(out_codes.shape, 'o')
         # Clear k/v cache (Different kv is saved by every 48x selfattn)
         for lay in self.transformer.layers:
              lay.self_attn.k_history = None
              lay.self_attn.v_history = None
-        return out_codes  # bs*n_draw, duration -> repeat/shift in api.py

 import torch.nn.functional as F
 from audiocraft.transformer import StreamingTransformer
 from torch import nn
 from audiocraft.conditioners import T5Conditioner
 import numpy as np
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
         self.emb = nn.ModuleList([nn.Embedding(embed_dim, dim) for _ in range(n_q)])  # EMBEDDING HAS 2049
         self.transformer = StreamingTransformer(
             d_model=dim,
                 sequence,
                 condition_tensors=None,
                 token_count=None):
+        bs, n_q, time_frames = sequence.shape # [bs, 4, time]
         input_ = sum([self.emb[k](sequence[:, k]) for k in range(self.n_q)])
+        # duplicate null condition (bs x 2)
         out = self.transformer(torch.cat([input_, input_], 0),
                                cross_attention_src=condition_tensors,
+                               token_count=token_count
+                               )
         if self.out_norm:
             out = self.out_norm(out)
     @torch.no_grad()
     def generate(self,
                  descriptions = ['windy day', 'rain storm'],
+                 max_tokens = 256):
         text_condition = self.condition_provider(descriptions)
+        out_codes = torch.full((bs, self.n_draw, 4, 4 + max_tokens),  # 4 filler columns + max_tokens, so that the 1st antidiagonal of the leading 4x4 block can be indexed
+                               self.card,
+                               dtype=torch.long,device=text_condition.device) # bs,n_draw,4,dur
+        for offset in range(0, max_tokens):
+            # GEN_SEQUENCE has fillers start & end = 2048
+            # [6,4,74] = gen_sequence = torch.tensor([[[
+            #   [2048,    0,    1,    2,    3,    4,    5,    6, 2048, 2048, 2048],
+            #   [2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048, 2048],
+            #   [2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048],
+            #   [2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6]],
+            #
+            # out codes = un-delayed
+            #
+            # tensor([[0, 1, 2, 3, 4, 5, 6],
+            #         [0, 1, 2, 3, 4, 5, 6],
+            #         [0, 1, 2, 3, 4, 5, 6],
+            #         [0, 1, 2, 3, 4, 5, 6]])
+            #
+            # LM "sees" 4 delayed tokens (diagonal extract)
+            #
+            # SO THE FIRST pack of 4 tokens fed TO LM is [2048, 2048, 2048, 2048]
+            #
+            # IF WE START WITH
+            # 2048 2048 2048 2048
+            # 2048 2048 2048 2048
+            # 2048 2048 2048 2048
+            # 2048 2048 2048 2048
+            #
+            # AFTER THE 1st STEP WRITES [10, 20, 50, 7] INTO THE NEXT COLUMN, THE 2nd pack of 4 fed to LM is the antidiagonal [10, 2048, 2048, 2048]
+            #
+            # 2048 2048 2048 2048 10
+            # 2048 2048 2048 2048 20
+            # 2048 2048 2048 2048 50
+            # 2048 2048 2048 2048 7
+            #
+            # extract the antidiagonal via indexing out_codes[[0, 1, 2, 3], [3, 2, 1, 0] + offset]
+            #
+            # forward duplicates the query for the null condition - then cfg & returns de-duplicated tokens
+            # only index 0 (the 1st of the n_draw draws) is continued by the LM call - the rest is supersampled in torch.multinomial
+            # feeds the antidiagonal to the LM
+            next_token = self.forward(out_codes[:, 0, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset][:, :, None],  # index antidiagonal & expand to [bs, n_q, dur=1]
+                                      #gen_sequence[:, 0, :, offset-1:offset],  # DIAGINDEXING for setting prediction of lm into gen_sequence THE GENSEQUENCE has to be un-delayed in the end [Because it has to be de-delayed for the vocoder then is actually only the lm input that requires to see the delay thus we could just feed by diaggather] so it matches gen_codes -1 a[[0, 1, 2, 3], torch.tensor([0, 1, 2, 3]) + 5]  the gen_sequence is indexed by vertical column and fed to lm however the prediction of lm is place diagonally with delay to the gen_sequence
                                       condition_tensors=text_condition,  # utilisation of the attention mask of txt condition ?
                                       token_count=offset-1)  # [bs, 4, 1, 2048]
+            out_codes[:, :, :, offset + 4] = next_token  # [bs, n_draw, 4, duration]
+        # DISCARD FILL
+        out_codes = out_codes[:, :, :, 4:].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)   # [bs, 4, duration*n_draw] DISCARD FILL 2048
         # Clear k/v cache (Different kv is saved by every 48x selfattn)
         for lay in self.transformer.layers:
              lay.self_attn.k_history = None
              lay.self_attn.v_history = None
+        return out_codes  # SKIP THE 4 fill 2048 bs*n_draw, duration -> repeat/shift in api.py