Commit 99122c4 · Hugo Flores Garcia committed · 1 parent: 5a343f4

basic readme stuff

Files changed:
- README.md +31 -53
- conf/lora/gas-station.yml +10 -0
- demo.py +3 -2
- scripts/exp/train.py +9 -87
- scripts/utils/vamp_folder.py +5 -5
- vampnet/interface.py +13 -5
- vampnet/modules/base.py +8 -119
- vampnet/modules/layers.py +14 -0
- vampnet/signal.py +5 -0
- vampnet/util.py +3 -34
README.md
CHANGED
@@ -1,80 +1,58 @@

Removed (old README; the old title, several code blocks, and a few lines were elided in the page capture and are left blank):

#

This repository contains recipes for training generative music models on top of the Lyrebird Audio Codec.

##

### Setting everything up

```bash
```

Once run, follow the instructions it prints out to create your environment file, which will be at `env/env.sh`.

Note that if this is a new machine, and the data is not downloaded somewhere on it already, it will ask you for a directory to download the data to.

For Github setup, if you don't have a .netrc token, create one by going to your Github profile -> Developer settings -> Personal access tokens -> Generate new token. Copy the token and [keep it secret, keep it safe](https://www.youtube.com/watch?v=iThtELZvfPs).

When complete, run:

```bash
```

```bash
docker compose run dev
```

To tear down your development environment, just do

```bash
docker compose down
```

`stage` creates a directory with a copy of all of the Git-tracked files in the root repository. `stage` launches a shell into said directory, so all commands are run on the copy of the original repository code. This is useful for rewinding to an old experiment and resuming it, for example. Even if the repository code changes, the snapshot in the experiment directory is unchanged from the original run, so it can be re-used.

Then, the experiment can be run via:

```bash
scripts/exp/train.py \
    --args.load=conf/args.yml \
```

#### Cleaning up after a run

Sometimes DDP runs fail to clear themselves out of the machine. To fix this, run

```bash
```
Added (new README):

# VampNet

This repository contains recipes for training generative music models on top of the Lyrebird Audio Codec.

# Setting up

## Install LAC

Install AudioTools:

```bash
git clone https://github.com/hugofloresgarcia/audiotools.git
pip install -e ./audiotools
```

Install the LAC library:

```bash
git clone https://github.com/hugofloresgarcia/lac.git
pip install -e ./lac
```

Install VampNet:

```bash
git clone https://github.com/hugofloresgarcia/vampnet2.git
pip install -e ./vampnet2
```

## A note on Argbind

This repository relies on [argbind](https://github.com/pseeth/argbind) to manage CLIs and config files. Config files are stored in the `conf/` folder.

# Usage

## Staging a Run

Staging a run makes a copy of all the git-tracked files in the codebase and saves them to a folder for reproducibility. You can then run the training script from the staged folder. The staging tooling itself is coming soon; a sketch of the idea follows.
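
Until then, a minimal, runnable sketch of what staging could look like, assuming a git checkout; the `stage` helper and its destination folder here are hypothetical, not the repo's actual tooling:

```python
import shutil
import subprocess
from pathlib import Path

def stage(dst: str = "runs/my-experiment"):
    """Copy every git-tracked file into a snapshot folder."""
    out = Path(dst)
    files = subprocess.check_output(["git", "ls-files"], encoding="utf-8").splitlines()
    for f in files:
        target = out / f
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(f, target)  # preserve timestamps/permissions in the snapshot

if __name__ == "__main__":
    stage()
```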

## Training a model

```bash
python scripts/exp/train.py --args.load conf/vampnet.yml --save_path /path/to/checkpoints
```

## Fine-tuning

To fine-tune a model, see the configuration files under `conf/lora/`. You just need to provide a list of audio files or folders to fine-tune on, then launch the training job as usual.

```bash
python scripts/exp/train.py --args.load conf/lora/birds.yml --save_path /path/to/checkpoints
```

## Launching the Gradio Interface

```bash
python demo.py --args.load conf/interface/spotdl.yml --Interface.device cuda
```
conf/lora/gas-station.yml
ADDED
@@ -0,0 +1,10 @@
+$include:
+  - conf/lora/lora.yml
+
+fine_tune: True
+
+train/AudioLoader.sources:
+  - /media/CHONK/hugo/spotdl/subsets/gas-station-sushi.mp3
+
+val/AudioLoader.sources:
+  - /media/CHONK/hugo/spotdl/subsets/gas-station-sushi.mp3
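
This new config is a concrete instance of the fine-tuning recipe described in the README above: it includes the shared LoRA settings, enables `fine_tune`, and points both the train and validation `AudioLoader.sources` at the audio to imitate. Following the pattern the README shows for `conf/lora/birds.yml`, it would presumably be launched with `python scripts/exp/train.py --args.load conf/lora/gas-station.yml --save_path /path/to/checkpoints`.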
demo.py
CHANGED

@@ -48,6 +48,7 @@ def load_audio(file):
     sig.write(out_dir / "input.wav")
     return sig.path_to_file
 
+
 def load_random_audio():
     index = np.random.randint(0, len(dataset))
     sig = dataset[index]["signal"]

@@ -68,7 +69,7 @@ def ez_vamp(
     sig = at.AudioSignal(input_audio)
 
     print(f"running standard vampnet with {num_vamps} vamps")
-    zv = interface.
+    zv = interface.coarse_vamp(
         sig,
         sampling_steps=num_steps,
         temperature=(init_temp, final_temp),

@@ -140,7 +141,7 @@ def vamp(
 
     if mode == "standard":
         print(f"running standard vampnet with {num_vamps} vamps")
-        zv, mask_z = interface.
+        zv, mask_z = interface.coarse_vamp(
             sig,
             sampling_steps=num_steps,
             temperature=(init_temp, final_temp),
scripts/exp/train.py
CHANGED

@@ -115,6 +115,10 @@ def load(
     }
     if (Path(kwargs["folder"]) / "vampnet").exists():
        model, v_extra = VampNet.load_from_folder(**kwargs)
+    else:
+        raise ValueError(
+            f"Could not find a VampNet checkpoint in {kwargs['folder']}"
+        )
 
     codec = LAC.load(args["codec_ckpt"], map_location="cpu")
     codec.eval()

@@ -149,25 +153,6 @@ def load(
     }
 
 
-def get_gpu_memory_map():
-    """Get the current gpu usage.
-
-    Returns
-    -------
-    usage: dict
-        Keys are device ids as integers.
-        Values are memory usage as integers in MB.
-    """
-    result = subprocess.check_output(
-        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"],
-        encoding="utf-8",
-    )
-    # Convert lines into a dictionary
-    gpu_memory = [int(x) for x in result.strip().split("\n")]
-    gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory))
-    gpu_memory_map = {f"gpu/{k}": v / 1024 for k, v in gpu_memory_map.items()}
-    return gpu_memory_map
-
 
 def num_params_hook(o, p):
     return o + f" {p/1e6:<.3f}M params."

@@ -189,7 +174,6 @@ def accuracy(
     target: torch.Tensor,
     top_k: int = 1,
     ignore_index: Optional[int] = None,
-    **kwargs,
 ) -> torch.Tensor:
     # Flatten the predictions and targets to be of shape (batch_size * sequence_length, n_class)
     preds = rearrange(preds, "b p s -> (b s) p")

@@ -214,30 +198,6 @@ def accuracy(
 
     return accuracy
 
-def sample_prefix_suffix_amt(
-    z,
-    n_batch,
-    prefix_amt,
-    suffix_amt,
-    prefix_dropout,
-    suffix_dropout,
-    rng
-):
-    """
-    Sample the number of prefix and suffix tokens to drop.
-    """
-    if prefix_amt > 0.0:
-        prefix_mask = flip_coin(n_batch, 1 - prefix_dropout, rng)
-        n_prefix = int(prefix_amt * z.shape[-1]) * prefix_mask
-    else:
-        n_prefix = None
-    if suffix_amt > 0.0:
-        suffix_mask = flip_coin(n_batch, 1 - suffix_dropout, rng)
-        n_suffix = int(suffix_amt * z.shape[-1]) * suffix_mask
-    else:
-        n_suffix = None
-    return n_prefix, n_suffix
-
 
 @argbind.bind(without_prefix=True)
 def train(

@@ -256,10 +216,6 @@ def train(
     num_workers: int = 10,
     detect_anomaly: bool = False,
     grad_clip_val: float = 5.0,
-    prefix_amt: float = 0.0,
-    suffix_amt: float = 0.0,
-    prefix_dropout: float = 0.1,
-    suffix_dropout: float = 0.1,
     fine_tune: bool = False,
     quiet: bool = False,
 ):

@@ -342,16 +298,12 @@ def train(
         target=r_unmasked_target,
         ignore_index=IGNORE_INDEX,
         top_k=topk,
-        task="multiclass",
-        num_classes=vn.vocab_size,
     )
     output[f"{tag}/masked"] = accuracy(
         preds=r_z_hat,
         target=r_masked_target,
         ignore_index=IGNORE_INDEX,
         top_k=topk,
-        task="multiclass",
-        num_classes=vn.vocab_size,
     )
 
     def train_loop(self, engine, batch):

@@ -370,15 +322,7 @@ def train(
         n_batch = z.shape[0]
         r = rng.draw(n_batch)[:, 0].to(accel.device)
 
-        n_prefix, n_suffix = sample_prefix_suffix_amt(
-            n_batch=n_batch, prefix_amt=prefix_amt, suffix_amt=suffix_amt,
-            prefix_dropout=prefix_dropout, suffix_dropout=suffix_dropout,
-            rng=rng
-        )
-
-        z_mask, mask = vn.add_noise(
-            z, r, n_prefix=n_prefix, n_suffix=n_suffix
-        )
+        z_mask, mask = vn.add_noise(z, r)
         z_mask_latent = vn.embedding.from_codes(z_mask, codec)
 
         dtype = torch.bfloat16 if accel.amp else None

@@ -454,13 +398,7 @@ def train(
         n_batch = z.shape[0]
         r = rng.draw(n_batch)[:, 0].to(accel.device)
 
-        n_prefix, n_suffix = sample_prefix_suffix_amt(
-            n_batch=n_batch, prefix_amt=prefix_amt, suffix_amt=suffix_amt,
-            prefix_dropout=prefix_dropout, suffix_dropout=suffix_dropout,
-            rng=rng
-        )
-
-        z_mask, mask = vn.add_noise(z, r, n_prefix=n_prefix, n_suffix=n_suffix)
+        z_mask, mask = vn.add_noise(z, r)
         z_mask_latent = vn.embedding.from_codes(z_mask, codec)
 
         z_hat = model(z_mask_latent, r)

@@ -574,17 +512,8 @@ def train(
         )
 
     def save_imputation(self, z: torch.Tensor):
-        _prefix_amt = prefix_amt
-        _suffix_amt = suffix_amt
-
-        if _prefix_amt == 0:
-            _prefix_amt = 0.25
-        if _suffix_amt == 0:
-            _suffix_amt = 0.25
-
-        n_prefix = int(z.shape[-1] * _prefix_amt)
-        n_suffix = int(z.shape[-1] * _suffix_amt)
+        n_prefix = int(z.shape[-1] * 0.25)
+        n_suffix = int(z.shape[-1] * 0.25)
         downsample_factor = None
 
         vn = accel.unwrap(model)

@@ -647,13 +576,7 @@ def train(
 
         n_batch = z.shape[0]
 
-        n_prefix, n_suffix = sample_prefix_suffix_amt(
-            n_batch=n_batch, prefix_amt=prefix_amt, suffix_amt=suffix_amt,
-            prefix_dropout=prefix_dropout, suffix_dropout=suffix_dropout,
-            rng=rng
-        )
-
-        z_mask, mask = vn.add_noise(z, r, n_prefix=n_prefix, n_suffix=n_suffix)
+        z_mask, mask = vn.add_noise(z, r)
         z_mask_latent = vn.embedding.from_codes(z_mask, codec)
 
         z_hat = model(z_mask_latent, r)

@@ -664,7 +587,6 @@ def train(
     z_pred = vn.embedding.unflatten(z_pred, n_codebooks=vn.n_predict_codebooks)
     z_pred = torch.cat([z[:, : vn.n_conditioning_codebooks, :], z_pred], dim=1)
 
-    print("z_mask", z_mask.shape)
     generated = vn.to_signal(z_pred, codec)
     reconstructed = vn.to_signal(z, codec)
     masked = vn.to_signal(z_mask.squeeze(1), codec)
scripts/utils/vamp_folder.py
CHANGED

@@ -56,7 +56,7 @@ class CoarseCond:
 
     def __call__(self, sig, interface):
         n_conditioning_codebooks = interface.coarse.n_codebooks - self.num_codebooks
-        zv = interface.
+        zv = interface.coarse_vamp(sig,
             n_conditioning_codebooks=n_conditioning_codebooks,
             downsample_factor=self.downsample_factor,
         )

@@ -113,7 +113,7 @@ def mask_ratio_1_step(ratio=1.0):
     r = interface.coarse.invgamma(ratio).to(interface.device)
     intensity = 1-r
 
-    zv = interface.
+    zv = interface.coarse_vamp(
         sig,
         sample='argmax',
         sampling_steps=1,

@@ -125,7 +125,7 @@ def mask_ratio_1_step(ratio=1.0):
 
 def num_sampling_steps(num_steps=1):
     def wrapper(sig, interface):
-        zv = interface.
+        zv = interface.coarse_vamp(
             sig,
             downsample_factor=16,
             sampling_steps=num_steps,

@@ -143,7 +143,7 @@ def beat_mask(ctx_time):
         after_beat_s=ctx_time,
         invert=True
     )
-    zv = interface.
+    zv = interface.coarse_vamp(
         sig,
         ext_mask=beat_mask,
     )

@@ -154,7 +154,7 @@ def beat_mask(ctx_time):
 
 def inpaint(ctx_time):
     def wrapper(sig, interface):
-        zv = interface.
+        zv = interface.coarse_vamp(
             sig,
             prefix_dur_s=ctx_time,
             suffix_dur_s=ctx_time,
vampnet/interface.py
CHANGED

@@ -20,6 +20,14 @@ def signal_concat(
     return AudioSignal(audio_data, sample_rate=audio_signals[0].sample_rate)
 
 
+class SignalPrompt:
+
+    def __init__(self, signal: AudioSignal):
+        self.sig = signal
+
+
+
+
 class Interface(torch.nn.Module):
     def __init__(
         self,

@@ -28,7 +36,7 @@ class Interface(torch.nn.Module):
         codec_ckpt: str = None,
         wavebeat_ckpt: str = None,
         device: str = "cpu",
-        coarse_chunk_size_s: int =
+        coarse_chunk_size_s: int = 10,
         coarse2fine_chunk_size_s: int = 3,
     ):
         super().__init__()

@@ -141,7 +149,7 @@ class Interface(torch.nn.Module):
         """make a beat synced mask. that is, make a mask that
         places 1s at and around the beat, and 0s everywhere else.
         """
-        assert
+        assert self.beat_tracker is not None, "No beat tracker loaded"
 
         # get the beat times
         beats, downbeats = self.beat_tracker.extract_beats(signal)

@@ -242,7 +250,7 @@ class Interface(torch.nn.Module):
         return fine_z[:, :, :length].clone()
 
 
-    def 
+    def coarse_vamp(
         self,
         signal,
         prefix_dur_s: float = 0.0,

@@ -471,7 +479,7 @@ class Interface(torch.nn.Module):
         else:
             ext_mask = None
 
-        out_z = self.
+        out_z = self.coarse_vamp(
             sig,
             num_vamps=1,
             swap_prefix_suffix=False,

@@ -520,7 +528,7 @@ class Interface(torch.nn.Module):
         range_fn = range if not verbose else tqdm.trange
         for i in range_fn(num_loops):
             is_flipped = i % 2 == 0
-            vamped = self.
+            vamped = self.coarse_vamp(
                 signal,
                 prefix_dur_s=prefix_dur_s,
                 suffix_dur_s=suffix_dur_s,
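
To make the rename concrete, here is a hypothetical sketch of calling the method through the interface. Only keyword arguments visible in this diff are used; the checkpoint path is a placeholder, and the constructor arguments for the coarse and coarse2fine models (elided in the hunk above) would also be required in practice:

```python
# hypothetical usage sketch: the path is a placeholder, and the real
# Interface constructor also needs the (elided) model checkpoint arguments
import audiotools as at
from vampnet.interface import Interface

interface = Interface(
    codec_ckpt="path/to/codec.pth",  # placeholder
    device="cuda",
)

sig = at.AudioSignal("input.wav")

# resynthesize ("vamp") the coarse token stream of the signal
zv = interface.coarse_vamp(
    sig,
    prefix_dur_s=1.0,        # seconds of context kept at the start
    suffix_dur_s=1.0,        # seconds of context kept at the end
    sampling_steps=36,
    temperature=(0.8, 1.0),  # (initial, final) schedule, as in demo.py
)
```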
vampnet/modules/base.py
CHANGED

@@ -10,6 +10,8 @@ import torch.nn.functional as F
 from einops import rearrange
 from tqdm import tqdm
 
+from ..util import scalar_to_batch_tensor
+
 
 def log(t, eps=1e-20):
     return torch.log(t + eps)

@@ -24,9 +26,6 @@ def gumbel_sample(t, temperature=1.0, dim=-1):
     return ((t / max(temperature, 1e-10)) + gumbel_noise(t)).argmax(dim=dim)
 
 
-def scalar_to_batch_tensor(x, batch_size):
-    return torch.tensor(x).repeat(batch_size)
-
 class VampBase(at.ml.BaseModel):
     def forward(self, x: torch.Tensor, r: torch.Tensor):
         raise NotImplementedError

@@ -150,6 +149,8 @@ class VampBase(at.ml.BaseModel):
             z_hat = z_hat * mask + truth * (1 - mask)
 
             z_hat = rearrange(z_hat, "b c t p -> b p (t c)")
+        else:
+            raise ValueError(f"invalid noise mode for adding truth to logits {self.noise_mode}")
 
         return z_hat
 

@@ -186,6 +187,9 @@ class VampBase(at.ml.BaseModel):
 
     @torch.no_grad()
     def to_signal(self, z, codec):
+        """
+        convert a sequence of latents to a signal.
+        """
         if z.ndim == 2:
             z = self.embedding.unflatten(z)
         assert z.ndim == 3

@@ -207,122 +211,7 @@ class VampBase(at.ml.BaseModel):
         return signal
 
     @torch.no_grad()
-    def sample(self, **kwargs):
-        if self.noise_mode == "mask":
-            return self.maskgit_sample(**kwargs)
-        else:
-            return self.paella_sample(**kwargs)
-
-    def paella_sample(
-        self,
-        codec,
-        time_steps: int = 400,
-        sampling_steps: int = 36,
-        start_tokens: Optional[torch.Tensor] = None,
-        mask: Optional[torch.Tensor] = None,
-        temperature: Union[float, Tuple[float, float]] = 0.8,
-        top_k: int = None,
-        sample: str = "gumbel",
-        renoise_mode: str = "start",
-        renoise_steps=None,
-        typical_filtering=True,
-        typical_mass=0.2,
-        typical_min_tokens=1,
-        return_signal=True,
-    ):
-
-        r = torch.linspace(0, 1, sampling_steps + 1)[:-1][:, None].to(self.device)
-        if renoise_steps == None:
-            renoise_steps = sampling_steps - 1
-
-        if isinstance(temperature, float):
-            temperature = torch.tensor(temperature).repeat(sampling_steps)
-        elif isinstance(temperature, tuple):
-            assert len(temperature) == 2
-            l, h = temperature
-            temperature = torch.linspace(l, h, sampling_steps)
-        else:
-            raise TypeError(f"invalid type for temperature")
-
-        if self.n_conditioning_codebooks > 0:
-            assert (
-                start_tokens is not None
-            ), "must provide start_tokens if n_conditioning_codebooks > 0"
-
-        if start_tokens is None:
-            if self.noise_mode == "noise":
-                z = torch.randint(
-                    0, self.vocab_size, size=(1, self.n_codebooks, time_steps)
-                ).to(self.device)
-            elif self.noise_mode == "mask":
-                z = torch.full((1, self.n_codebooks, time_steps), self.mask_token)
-        else:
-            z = start_tokens
-            assert (
-                z.ndim == 3
-            ), f"start_tokens must be shape (batch, n_codebooks, seq_len), got {z.shape}"
-            assert z.shape[0] == 1, f"batch size must be 1"
-
-        if mask is None:
-            mask = torch.ones(z.shape[0], z.shape[-1]).to(self.device).int()
-            mask = mask[:, None, :]
-            mask = mask.repeat(1, z.shape[1], 1)
-
-            mask[:, : self.n_conditioning_codebooks, :] = 0.0
-
-        z_true = z.clone()
-
-        z, mask = self.add_noise(z, r=r[0], random_x=None, mask=mask)
-        z_init = z.clone()
-        for i, tmpt in enumerate(temperature):
-            if renoise_mode == "prev":
-                z_prev = z.clone()
-
-            latents = self.embedding.from_codes(z, codec)
-            logits = self.forward(latents, r[i])
-
-            # for mask mode
-            logits = self.add_truth_to_logits(z_true, logits, mask)
-
-            # Apply topk sampling
-            logits = logits.permute(0, 2, 1)
-
-            z = self.sample_from_logits(
-                logits,
-                top_k=top_k,
-                temperature=tmpt,
-                sample=sample,
-                typical_filtering=typical_filtering,
-                typical_mass=typical_mass,
-                typical_min_tokens=typical_min_tokens,
-            )
-
-            # add back in conditioning codebooks
-            z = self.embedding.unflatten(z, n_codebooks=self.n_predict_codebooks)
-            z = torch.cat(
-                [z_init[:, : self.n_conditioning_codebooks, :], z], dim=1
-            ).int()
-
-            if i < renoise_steps:
-                if renoise_mode == "prev":
-                    z, _ = self.add_noise(z, r[i + 1], random_x=z_prev)
-                elif renoise_mode == "start":
-                    z, _ = self.add_noise(z, r[i + 1], random_x=z_init)
-                elif renoise_mode == "rand":
-                    z, _ = self.add_noise(z, r[i + 1])
-                else:
-                    raise ValueError(f"Invalid renoise_mode: {renoise_mode}")
-
-            if mask is not None:
-                z = start_tokens * (1 - mask) + z * mask
-
-        if return_signal:
-            return self.to_signal(z, codec)
-        else:
-            return z
-
-    def maskgit_sample(
+    def sample(
         self,
         codec,
         time_steps: int = 300,
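
The helper imported above now lives in `vampnet/util.py` (see that file's diff below). A quick sketch of what it does, assuming the package is importable as `vampnet`:

```python
import torch
from vampnet.util import scalar_to_batch_tensor

# broadcast a single mask ratio to every item in a batch of 4
r = scalar_to_batch_tensor(0.8, 4)
print(r)        # tensor([0.8000, 0.8000, 0.8000, 0.8000])
print(r.shape)  # torch.Size([4])
```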
vampnet/modules/layers.py
CHANGED

@@ -132,6 +132,11 @@ class CodebookEmbedding(nn.Module):
         self.out_proj = nn.Conv1d(n_codebooks * self.latent_dim, self.emb_dim, 1)
 
     def from_codes(self, codes: torch.Tensor, codec):
+        """
+        get a sequence of continuous embeddings from a sequence of discrete codes.
+        unlike its counterpart in the original VQ-VAE, this function adds any special
+        tokens necessary for the language model, like <MASK>.
+        """
         n_codebooks = codes.shape[1]
         latent = []
         for i in range(n_codebooks):

@@ -151,14 +156,23 @@ class CodebookEmbedding(nn.Module):
         return latent
 
     def forward(self, latents: torch.Tensor):
+        """
+        project a sequence of latents to a sequence of embeddings
+        """
         x = self.out_proj(latents)
         return x
 
     def flatten(self, tokens: torch.Tensor, n_codebooks: int = None):
+        """
+        flatten a sequence of tokens from (batch, codebook, time) to (batch, codebook * time)
+        """
         n_c = n_codebooks if n_codebooks is not None else self.n_codebooks
         return rearrange(tokens, "b c t -> b (t c)", c=n_c)
 
     def unflatten(self, flat_tokens: torch.Tensor, n_codebooks: int = None):
+        """
+        unflatten a sequence of tokens from (batch, codebook * time) to (batch, codebook, time)
+        """
         nb, nt = flat_tokens.shape
 
         n_c = n_codebooks if n_codebooks is not None else self.n_codebooks
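
The new flatten/unflatten docstrings are easy to verify with a toy tensor. A minimal, runnable sketch of the round trip, using the flatten pattern from the code and its natural einops inverse (an assumption, since the unflatten body is truncated above):

```python
import torch
from einops import rearrange

# toy codes: batch=1, n_codebooks=4, time=3
tokens = torch.arange(12).reshape(1, 4, 3)

flat = rearrange(tokens, "b c t -> b (t c)", c=4)  # (1, 12), time-major interleave
back = rearrange(flat, "b (t c) -> b c t", c=4)    # (1, 4, 3)

assert torch.equal(tokens, back)  # the round trip is lossless
```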
vampnet/signal.py
ADDED
@@ -0,0 +1,5 @@
+import torch
+from typing import Optional, Tuple
+
+from .util import scalar_to_batch_tensor
+
vampnet/util.py
CHANGED

@@ -1,40 +1,9 @@
 import tqdm
-# import pathos
 
-def process_map(fn, *iterables, **tqdm_kwargs):
-    """
-    Equivalent of `list(map(fn, *iterables))`
-    driven by `concurrent.futures.ProcessPoolExecutor`.
 
-    Parameters
-    ----------
-    tqdm_class : optional
-        `tqdm` class to use for bars [default: tqdm.auto.tqdm].
-    max_workers : int, optional
-        Maximum number of workers to spawn; passed to
-        `concurrent.futures.ProcessPoolExecutor.__init__`.
-        [default: min(32, cpu_count() + 4)].
-    chunksize : int, optional
-        Size of chunks sent to worker processes; passed to
-        `concurrent.futures.ProcessPoolExecutor.map`. [default: 1].
-    lock_name : str, optional
-        Member of `tqdm_class.get_lock()` to use [default: mp_lock].
-    """
-    from concurrent.futures import ProcessPoolExecutor
-    if iterables and "chunksize" not in tqdm_kwargs:
-        # default `chunksize=1` has poor performance for large iterables
-        # (most time spent dispatching items to workers).
-        longest_iterable_len = max(map(length_hint, iterables))
-        if longest_iterable_len > 1000:
-            from warnings import warn
-            warn("Iterable length %d > 1000 but `chunksize` is not set."
-                 " This may seriously degrade multiprocess performance."
-                 " Set `chunksize=1` or more." % longest_iterable_len,
-                 TqdmWarning, stacklevel=2)
-    if "lock_name" not in tqdm_kwargs:
-        tqdm_kwargs = tqdm_kwargs.copy()
-        tqdm_kwargs["lock_name"] = "mp_lock"
-    return _executor_map(ProcessPoolExecutor, fn, *iterables, **tqdm_kwargs)
 
+import torch
 
+def scalar_to_batch_tensor(x, batch_size):
+    return torch.tensor(x).repeat(batch_size)
 
 
 def parallelize(