michellemoorre committed · Commit 6c4dee3 · 0 parents

Initial commit
__pycache__/app.cpython-39.pyc ADDED
Binary file (2.75 kB)

__pycache__/dist.cpython-39.pyc ADDED
Binary file (5.93 kB)

app.py ADDED
@@ -0,0 +1,146 @@
import gradio as gr
import numpy as np
import random

import spaces
from models import TVARPipeline
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "michellemoorre/var-test"


pipe = TVARPipeline.from_pretrained(model_repo_id, device=device)

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024


@spaces.GPU(duration=65)
def infer(
    prompt,
    negative_prompt="",
    seed=42,
    randomize_seed=False,
    guidance_scale=4.0,
    top_k=450,
    top_p=0.95,
    re=False,
    re_max_depth=10,
    progress=gr.Progress(track_tqdm=True),
):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    image = pipe(
        prompt=prompt,
        null_prompt=negative_prompt,
        cfg=guidance_scale,
        top_p=top_p,
        top_k=top_k,
        re=re,
        re_max_depth=re_max_depth,  # forward the rejection-sampling depth selected in the UI
        g_seed=seed,
    )[0]

    return image, seed


# TODO: add examples from preview
examples = [
    "A capybara wearing a suit holding a sign that reads Hello World",
]

css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# [OpenTVAR](https://huggingface.co/stabilityai/stable-diffusion-3.5-large)")
        gr.Markdown("[Learn more](https://stability.ai/news/introducing-stable-diffusion-3-5) about OpenTVAR.")
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )

            run_button = gr.Button("Run", scale=0, variant="primary")

        result = gr.Image(label="Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):
            negative_prompt = gr.Text(
                label="Negative prompt",
                max_lines=1,
                placeholder="Enter a negative prompt",
                visible=True,
            )

            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )

            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=0.0,
                    maximum=7.5,
                    step=0.1,
                    value=4.5,
                )
            with gr.Row():
                top_k = gr.Slider(
                    label="Sampling top k",
                    minimum=1,
                    maximum=1000,
                    step=10,
                    value=450,
                )
                top_p = gr.Slider(
                    label="Sampling top p",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.95,
                )
            with gr.Row():
                re = gr.Checkbox(label="Rejection Sampling", value=False)
                re_max_depth = gr.Slider(
                    label="Rejection Sampling Depth",
                    minimum=0,
                    maximum=20,
                    step=1,
                    value=10,
                )

        gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=True)  # cache_mode="lazy"

    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            negative_prompt,
            seed,
            randomize_seed,
            guidance_scale,
            top_k,
            top_p,
            re,
            re_max_depth,
        ],
        outputs=[result, seed],
    )

if __name__ == "__main__":
    demo.launch()
dist.py ADDED
@@ -0,0 +1,203 @@
"""
Helpers for distributed training.
"""
import datetime
import os
import socket
from contextlib import closing

import torch as th
import torch.distributed as dist
from torch.distributed import barrier, is_initialized, broadcast

# Change this to reflect your cluster layout.
# The GPU for a given rank is (rank % GPUS_PER_NODE).
GPUS_PER_NODE = 8

SETUP_RETRY_COUNT = 3


def find_free_port() -> int:
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(("", 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]


def check_if_port_open(port: int) -> bool:
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        try:
            s.bind(("", port))
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            return True
        except OSError:
            return False


def initialized():
    return dist.is_initialized()


def finalize():
    if dist.is_initialized():
        dist.destroy_process_group()


def initialize():
    is_mpirun = not (
        "RANK" in os.environ
        and "WORLD_SIZE" in os.environ
        and "MASTER_ADDR" in os.environ
        and "MASTER_PORT" in os.environ
    )

    if is_mpirun:
        from mpi4py import MPI
        import subprocess

        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        world_size = comm.Get_size()

        master_addr = None
        master_port = None
        if rank == 0:
            hostname_cmd = ["hostname -I"]
            result = subprocess.check_output(hostname_cmd, shell=True)
            master_addr = result.decode("utf-8").split()[0]

            base_port = os.environ.get(
                "MASTER_PORT", "29500"
            )  # TORCH_DISTRIBUTED_DEFAULT_PORT
            if check_if_port_open(int(base_port)):
                master_port = base_port
            else:
                master_port = find_free_port()

        master_addr = comm.bcast(master_addr, root=0)
        master_port = comm.bcast(master_port, root=0)
        # Determine local rank by assuming hostnames are unique
        proc_name = MPI.Get_processor_name()
        all_procs = comm.allgather(proc_name)
        local_rank = sum([i == proc_name for i in all_procs[:rank]])
        uniq_proc_names = set(all_procs)
        host_rank = sorted(uniq_proc_names).index(proc_name)

        os.environ["LOCAL_RANK"] = str(local_rank)
        os.environ["HOST_RANK"] = str(host_rank)
        os.environ["NUM_HOSTS"] = str(len(uniq_proc_names))

        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)
        os.environ["MASTER_ADDR"] = master_addr
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["OMP_NUM_THREADS"] = "1"

    # Initialize torch distributed
    backend = "gloo" if not th.cuda.is_available() else "nccl"
    dist.init_process_group(backend=backend, timeout=datetime.timedelta(0, 3600))
    if th.cuda.is_available():
        th.cuda.set_device(int(os.environ.get("LOCAL_RANK", "0")))

    if is_mpirun and dist.get_rank() == 0:
        print("Distributed setup")
        print("LOCAL_RANK", os.environ["LOCAL_RANK"])
        print("HOST_RANK", os.environ["HOST_RANK"])
        print("NUM_HOSTS", os.environ["NUM_HOSTS"])
        print("WORLD_SIZE", os.environ["WORLD_SIZE"])


def local_host_gather(data):
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    host_rank = os.environ["HOST_RANK"]
    all_data = comm.allgather((host_rank, data))
    return [d[1] for d in all_data if d[0] == host_rank]


def in_distributed_mode():
    return dist is not None


def is_master():
    return get_rank() == 0


def is_local_master():
    return get_local_rank() == 0


def get_rank():
    return dist.get_rank() if in_distributed_mode() else 0


def get_local_rank():
    return int(os.environ["LOCAL_RANK"])


def worker_host_idx():
    return int(os.environ["HOST_RANK"])


def num_hosts():
    return int(os.environ["NUM_HOSTS"])


def get_world_size():
    return dist.get_world_size() if in_distributed_mode() else 1


def gpu_visible_device_list():
    return str(dist.get_rank()) if in_distributed_mode() else None


def get_device():
    """
    Get the device to use for torch.distributed.
    """
    if th.cuda.is_available():
        return th.device("cuda")
    return th.device("cpu")


def sync_params(params):
    """
    Synchronize a sequence of Tensors across ranks from rank 0.
    """
    for p in params:
        with th.no_grad():
            dist.broadcast(p, 0)


def print0(*args, **kwargs):
    if get_rank() == 0:
        print(*args, **kwargs)


def allreduce(t: th.Tensor, async_op=False):
    if dist.is_initialized():
        if not t.is_cuda:
            cu = t.detach().cuda()
            ret = dist.all_reduce(cu, async_op=async_op)
            t.copy_(cu.cpu())
        else:
            ret = dist.all_reduce(t, async_op=async_op)
        return ret
    return None


def allgather(t: th.Tensor, cat=True):
    if dist.is_initialized():
        if not t.is_cuda:
            t = t.cuda()
        ls = [th.empty_like(t) for _ in range(get_world_size())]
        dist.all_gather(ls, t)
    else:
        ls = [t]
    if cat:
        ls = th.cat(ls, dim=0)
    return ls
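For reference, a minimal sketch of how these helpers would be wired into a training entry point (hypothetical usage, not part of this commit; assumes a CUDA machine and a torchrun or mpirun launch):

# Hypothetical training entry point using dist.py.
import dist
import torch as th

def main():
    dist.initialize()                       # set up torch.distributed from env vars or MPI
    dist.print0("world size:", dist.get_world_size())

    t = th.ones(1, device=dist.get_device())
    dist.allreduce(t)                       # sum the tensor across all ranks
    dist.print0("allreduce result:", t.item())

    dist.finalize()

if __name__ == "__main__":
    main()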
gradio_cached_examples/24/Result/408bda05bb7418a064b8/image.webp ADDED
gradio_cached_examples/24/log.csv ADDED
@@ -0,0 +1,2 @@
Result,Seed,flag,username,timestamp
"{""path"": ""gradio_cached_examples/24/Result/408bda05bb7418a064b8/image.webp"", ""url"": ""/file=/place/vartmp/gradio/162d5443a1136187c8b25737fbbcdc8392d218d21024c2458112af1cc6d17d66/image.webp"", ""size"": null, ""orig_name"": ""image.webp"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",42,,,2024-10-26 02:08:31.037928
models/__init__.py ADDED
@@ -0,0 +1,90 @@
from typing import Tuple

import torch.nn as nn

from .clip import FrozenCLIPEmbedder
from .quant import VectorQuantizer2
from .var import VAR
from .vqvae import VQVAE
from .pipeline import TVARPipeline


def build_vae_var(
    # Shared args
    device,
    patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),  # 10 steps by default
    # VQVAE args
    V=4096,
    Cvae=32,
    ch=160,
    share_quant_resi=4,
    # VAR args
    depth=16,
    shared_aln=False,
    attn_l2_norm=True,
    init_adaln=0.5,
    init_adaln_gamma=1e-5,
    init_head=0.02,
    init_std=-1,  # init_std < 0: automated
    text_encoder_path=None,
    text_encoder_2_path=None,
    rope=False,
    rope_theta=100,
    rope_size=None,
    dpr=0,
    use_swiglu_ffn=False,
) -> Tuple[VQVAE, VAR, TVARPipeline]:
    heads = depth
    width = depth * 64
    if dpr > 0:
        dpr = dpr * depth / 24

    # disable built-in initialization for speed
    for clz in (
        nn.Linear,
        nn.LayerNorm,
        nn.BatchNorm2d,
        nn.SyncBatchNorm,
        nn.Conv1d,
        nn.Conv2d,
        nn.ConvTranspose1d,
        nn.ConvTranspose2d,
    ):
        setattr(clz, "reset_parameters", lambda self: None)

    # build models
    vae_local = VQVAE(
        vocab_size=V,
        z_channels=Cvae,
        ch=ch,
        test_mode=True,
        share_quant_resi=share_quant_resi,
        v_patch_nums=patch_nums,
    ).to(device)
    var_wo_ddp = VAR(
        depth=depth,
        embed_dim=width,
        num_heads=heads,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=dpr,
        norm_eps=1e-6,
        shared_aln=shared_aln,
        attn_l2_norm=attn_l2_norm,
        patch_nums=patch_nums,
        rope=rope,
        rope_theta=rope_theta,
        rope_size=rope_size,
        use_swiglu_ffn=use_swiglu_ffn,
    ).to(device)
    var_wo_ddp.init_weights(
        init_adaln=init_adaln,
        init_adaln_gamma=init_adaln_gamma,
        init_head=init_head,
        init_std=init_std,
    )
    text_encoder = FrozenCLIPEmbedder(text_encoder_path)
    text_encoder_2 = FrozenCLIPEmbedder(text_encoder_2_path)
    pipe = TVARPipeline(var_wo_ddp, vae_local, text_encoder, text_encoder_2, device)

    return vae_local, var_wo_ddp, pipe
models/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (1.9 kB)

models/__pycache__/basic_vae.cpython-39.pyc ADDED
Binary file (6.99 kB)

models/__pycache__/basic_var.cpython-39.pyc ADDED
Binary file (12.1 kB)

models/__pycache__/clip.cpython-39.pyc ADDED
Binary file (1.92 kB)

models/__pycache__/helpers.cpython-39.pyc ADDED
Binary file (2.88 kB)

models/__pycache__/pipeline.cpython-39.pyc ADDED
Binary file (6.19 kB)

models/__pycache__/quant.cpython-39.pyc ADDED
Binary file (11 kB)

models/__pycache__/rope.cpython-39.pyc ADDED
Binary file (2.13 kB)

models/__pycache__/var.cpython-39.pyc ADDED
Binary file (11.1 kB)

models/__pycache__/vqvae.cpython-39.pyc ADDED
Binary file (5.76 kB)

models/basic_vae.py ADDED
@@ -0,0 +1,289 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

# this file only provides the 2 modules used in VQVAE
__all__ = ["Encoder", "Decoder"]


"""
References: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/modules/diffusionmodules/model.py
"""


# swish
def nonlinearity(x):
    return x * torch.sigmoid(x)


def Normalize(in_channels, num_groups=32):
    return torch.nn.GroupNorm(
        num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
    )


class Upsample2x(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.conv = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, x):
        return self.conv(F.interpolate(x, scale_factor=2, mode="nearest"))


class Downsample2x(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.conv = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=2, padding=0
        )

    def forward(self, x):
        return self.conv(F.pad(x, pad=(0, 1, 0, 1), mode="constant", value=0))


class ResnetBlock(nn.Module):
    def __init__(
        self, *, in_channels, out_channels=None, dropout
    ):  # conv_shortcut=False, # conv_shortcut: always False in VAE
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout) if dropout > 1e-6 else nn.Identity()
        self.conv2 = torch.nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        if self.in_channels != self.out_channels:
            self.nin_shortcut = torch.nn.Conv2d(
                in_channels, out_channels, kernel_size=1, stride=1, padding=0
            )
        else:
            self.nin_shortcut = nn.Identity()

    def forward(self, x):
        h = self.conv1(F.silu(self.norm1(x), inplace=True))
        h = self.conv2(self.dropout(F.silu(self.norm2(h), inplace=True)))
        return self.nin_shortcut(x) + h


class AttnBlock(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.C = in_channels

        self.norm = Normalize(in_channels)
        self.qkv = torch.nn.Conv2d(
            in_channels, 3 * in_channels, kernel_size=1, stride=1, padding=0
        )
        self.w_ratio = int(in_channels) ** (-0.5)
        self.proj_out = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )

    def forward(self, x):
        qkv = self.qkv(self.norm(x))
        B, _, H, W = qkv.shape  # should be B,3C,H,W
        C = self.C
        q, k, v = qkv.reshape(B, 3, C, H, W).unbind(1)

        # compute attention
        q = q.view(B, C, H * W).contiguous()
        q = q.permute(0, 2, 1).contiguous()  # B,HW,C
        k = k.view(B, C, H * W).contiguous()  # B,C,HW
        w = torch.bmm(q, k).mul_(self.w_ratio)  # B,HW,HW
        # w[B,i,j]=sum_c q[B,i,C]k[B,C,j]
        w = F.softmax(w, dim=2)

        # attend to values
        v = v.view(B, C, H * W).contiguous()
        w = w.permute(0, 2, 1).contiguous()  # B,HW,HW (first HW of k, second of q)
        h = torch.bmm(v, w)  # B, C,HW (HW of q) h[B,C,j] = sum_i v[B,C,i] w[B,i,j]
        h = h.view(B, C, H, W).contiguous()

        return x + self.proj_out(h)


def make_attn(in_channels, using_sa=True):
    return AttnBlock(in_channels) if using_sa else nn.Identity()


class Encoder(nn.Module):
    def __init__(
        self,
        *,
        ch=128,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks=2,
        dropout=0.0,
        in_channels=3,
        z_channels,
        double_z=False,
        using_sa=True,
        using_mid_sa=True,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.downsample_ratio = 2 ** (self.num_resolutions - 1)
        self.num_res_blocks = num_res_blocks
        self.in_channels = in_channels

        # downsampling
        self.conv_in = torch.nn.Conv2d(
            in_channels, self.ch, kernel_size=3, stride=1, padding=1
        )

        in_ch_mult = (1,) + tuple(ch_mult)
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(
                    ResnetBlock(
                        in_channels=block_in, out_channels=block_out, dropout=dropout
                    )
                )
                block_in = block_out
                if i_level == self.num_resolutions - 1 and using_sa:
                    attn.append(make_attn(block_in, using_sa=True))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample2x(block_in)
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in, out_channels=block_in, dropout=dropout
        )
        self.mid.attn_1 = make_attn(block_in, using_sa=using_mid_sa)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in, out_channels=block_in, dropout=dropout
        )

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in,
            (2 * z_channels if double_z else z_channels),
            kernel_size=3,
            stride=1,
            padding=1,
        )

    def forward(self, x):
        # downsampling
        h = self.conv_in(x)
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](h)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
            if i_level != self.num_resolutions - 1:
                h = self.down[i_level].downsample(h)

        # middle
        h = self.mid.block_2(self.mid.attn_1(self.mid.block_1(h)))

        # end
        h = self.conv_out(F.silu(self.norm_out(h), inplace=True))
        return h


class Decoder(nn.Module):
    def __init__(
        self,
        *,
        ch=128,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks=2,
        dropout=0.0,
        in_channels=3,  # in_channels: raw img channels
        z_channels,
        using_sa=True,
        using_mid_sa=True,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.in_channels = in_channels

        # compute in_ch_mult, block_in and curr_res at lowest res
        in_ch_mult = (1,) + tuple(ch_mult)
        block_in = ch * ch_mult[self.num_resolutions - 1]

        # z to block_in
        self.conv_in = torch.nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1
        )

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in, out_channels=block_in, dropout=dropout
        )
        self.mid.attn_1 = make_attn(block_in, using_sa=using_mid_sa)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in, out_channels=block_in, dropout=dropout
        )

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                block.append(
                    ResnetBlock(
                        in_channels=block_in, out_channels=block_out, dropout=dropout
                    )
                )
                block_in = block_out
                if i_level == self.num_resolutions - 1 and using_sa:
                    attn.append(make_attn(block_in, using_sa=True))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample2x(block_in)
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, in_channels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, z):
        # z to block_in
        # middle
        h = self.mid.block_2(self.mid.attn_1(self.mid.block_1(self.conv_in(z))))

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        h = self.conv_out(F.silu(self.norm_out(h), inplace=True))
        return h
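A small shape check for the two modules above (a sketch, not part of the commit; the ch/ch_mult/z_channels values are assumptions chosen for illustration):

# Hypothetical shape check for Encoder/Decoder.
import torch
from models.basic_vae import Encoder, Decoder

enc = Encoder(ch=160, ch_mult=(1, 1, 2, 2, 4), z_channels=32, double_z=False)
dec = Decoder(ch=160, ch_mult=(1, 1, 2, 2, 4), z_channels=32)

x = torch.randn(1, 3, 256, 256)
z = enc(x)        # (1, 32, 16, 16): spatial size divided by 2**(len(ch_mult)-1)
x_rec = dec(z)    # (1, 3, 256, 256)
print(z.shape, x_rec.shape)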
models/basic_var.py ADDED
@@ -0,0 +1,466 @@
import math
import warnings

import torch
import torch.nn.functional as F
from einops import rearrange
from torch import nn
from torch.nn.functional import scaled_dot_product_attention  # q, k, v: BHLc

from models.helpers import DropPath
from models.rope import apply_rotary_emb

try:
    from flash_attn.ops.fused_dense import fused_mlp_func
except ImportError:
    fused_mlp_func = None

# this file only provides the 4 blocks used in VAR transformer
__all__ = ["FFN", "AdaLNSelfCrossAttn", "AdaLNBeforeHead"]


try:
    from apex.normalization import FusedRMSNorm as RMSNorm
except ImportError:
    warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation")

    class RMSNorm(torch.nn.Module):
        def __init__(self, dim: int, eps: float = 1e-6):
            """
            Initialize the RMSNorm normalization layer.

            Args:
                dim (int): The dimension of the input tensor.
                eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.

            Attributes:
                eps (float): A small value added to the denominator for numerical stability.
                weight (nn.Parameter): Learnable scaling parameter.

            """
            super().__init__()
            self.eps = eps
            self.weight = nn.Parameter(torch.ones(dim))

        def _norm(self, x):
            """
            Apply the RMSNorm normalization to the input tensor.

            Args:
                x (torch.Tensor): The input tensor.

            Returns:
                torch.Tensor: The normalized tensor.

            """
            return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

        def forward(self, x):
            """
            Forward pass through the RMSNorm layer.

            Args:
                x (torch.Tensor): The input tensor.

            Returns:
                torch.Tensor: The output tensor after applying RMSNorm.

            """
            output = self._norm(x.float()).type_as(x)
            return output * self.weight


class FFN(nn.Module):
    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        drop=0.0,
        fused_if_available=True,
    ):
        super().__init__()
        self.fused_mlp_func = fused_mlp_func if fused_if_available else None
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU(approximate="tanh")
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop, inplace=True) if drop > 0 else nn.Identity()

    def forward(self, x):
        if self.fused_mlp_func is not None:
            return self.drop(
                self.fused_mlp_func(
                    x=x,
                    weight1=self.fc1.weight,
                    weight2=self.fc2.weight,
                    bias1=self.fc1.bias,
                    bias2=self.fc2.bias,
                    activation="gelu_approx",
                    save_pre_act=self.training,
                    return_residual=False,
                    checkpoint_lvl=0,
                    heuristic=0,
                    process_group=None,
                )
            )
        else:
            return self.drop(self.fc2(self.act(self.fc1(x))))

    def extra_repr(self) -> str:
        return f"fused_mlp_func={self.fused_mlp_func is not None}"


class SwiGLUFFN(nn.Module):
    def __init__(
        self,
        dim: int,
        ff_mult: float = 8 / 3,
    ):
        """
        Initialize the FeedForward module.

        Args:
            dim (int): Input dimension.
            ff_mult (float, optional): Multiplier for the hidden dimension. Defaults to 8/3.
        """
        super().__init__()
        hidden_dim = int(dim * ff_mult)

        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
        self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.fused_mlp_func = None
        self._init()

    def _init(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    # @torch.compile
    def _forward_silu_gating(self, x_gate: torch.Tensor, x_up: torch.Tensor):
        return F.silu(x_gate) * x_up

    def forward(self, x: torch.Tensor):
        return self.down_proj(
            self._forward_silu_gating(self.gate_proj(x), self.up_proj(x))
        )

    def extra_repr(self) -> str:
        return f"fused_mlp_func={self.fused_mlp_func is not None}"


class CrossAttention(nn.Module):
    def __init__(
        self,
        embed_dim: int = 768,
        context_dim: int = 2048,
        num_heads: int = 12,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        qk_norm: bool = False,
    ):
        super().__init__()
        assert embed_dim % num_heads == 0
        assert attn_drop == 0.0

        self.num_heads, self.head_dim = (
            num_heads,
            embed_dim // num_heads,
        )
        self.qk_norm = qk_norm
        self.scale = 1 / math.sqrt(self.head_dim)

        self.q_norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.k_norm = nn.LayerNorm(embed_dim, eps=1e-6)

        self.to_q = nn.Linear(embed_dim, embed_dim, bias=True)
        self.to_kv = nn.Linear(context_dim, embed_dim * 2, bias=True)

        self.proj = nn.Linear(embed_dim, embed_dim)
        self.proj_drop = (
            nn.Dropout(proj_drop, inplace=True) if proj_drop > 0 else nn.Identity()
        )
        self.attn_drop = attn_drop

        # only used during inference
        self.caching, self.cached_k, self.cached_v = False, None, None

    def kv_caching(self, enable: bool):
        self.caching, self.cached_k, self.cached_v = enable, None, None

    def forward(self, x, context, context_attn_bias=None, freqs_cis=None):
        B, L, C = x.shape
        context_B, context_L, context_C = context.shape
        assert B == context_B

        q = self.to_q(x).view(B, L, -1)  # BLD
        if self.qk_norm:
            q = self.q_norm(q)

        q = q.view(B, L, self.num_heads, self.head_dim)
        q = q.permute(0, 2, 1, 3)  # BHLc

        if self.cached_k is None:
            # not using caches or first scale inference
            kv = self.to_kv(context).view(B, context_L, 2, -1)  # kv: BL2D
            k, v = kv.permute(2, 0, 1, 3).unbind(dim=0)  # k or v: BLD

            if self.qk_norm:
                k = self.k_norm(k)

            k = k.view(B, context_L, self.num_heads, self.head_dim)
            k = k.permute(0, 2, 1, 3)  # BHLc

            v = v.view(B, context_L, self.num_heads, self.head_dim)
            v = v.permute(0, 2, 1, 3)  # BHLc

            if self.caching:
                self.cached_k = k
                self.cached_v = v
        else:
            k = self.cached_k
            v = self.cached_v

        if context_attn_bias is not None:
            context_attn_bias = rearrange(context_attn_bias, "b j -> b 1 1 j")

        dropout_p = self.attn_drop if self.training else 0.0
        out = (
            scaled_dot_product_attention(
                query=q,
                key=k,
                value=v,
                scale=self.scale,
                attn_mask=context_attn_bias,
                dropout_p=dropout_p,
            )
            .transpose(1, 2)
            .reshape(B, L, C)
        )

        return self.proj_drop(self.proj(out))


class SelfAttention(nn.Module):
    def __init__(
        self,
        block_idx: int,
        embed_dim: int = 768,
        num_heads: int = 12,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        qk_norm: bool = False,
    ):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.block_idx, self.num_heads, self.head_dim = (
            block_idx,
            num_heads,
            embed_dim // num_heads,
        )
        self.qk_norm = qk_norm
        self.scale = 1 / math.sqrt(self.head_dim)

        self.q_norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.k_norm = nn.LayerNorm(embed_dim, eps=1e-6)

        self.to_qkv = nn.Linear(embed_dim, embed_dim * 3, bias=True)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.proj_drop = (
            nn.Dropout(proj_drop, inplace=True) if proj_drop > 0 else nn.Identity()
        )
        self.attn_drop = attn_drop

        # only used during inference
        self.caching, self.cached_k, self.cached_v = False, None, None

    def kv_caching(self, enable: bool):
        self.caching, self.cached_k, self.cached_v = enable, None, None

    # NOTE: attn_bias is None during inference because kv cache is enabled
    def forward(self, x, attn_bias, freqs_cis: torch.Tensor = None):
        B, L, C = x.shape

        qkv = self.to_qkv(x).view(B, L, 3, -1)
        q, k, v = qkv.permute(2, 0, 1, 3).unbind(dim=0)  # q or k or v: BLD

        if self.qk_norm:
            q = self.q_norm(q)
            k = self.k_norm(k)

        q = q.view(B, L, self.num_heads, self.head_dim)
        q = q.permute(0, 2, 1, 3)  # BHLc
        k = k.view(B, L, self.num_heads, self.head_dim)
        k = k.permute(0, 2, 1, 3)  # BHLc
        v = v.view(B, L, self.num_heads, self.head_dim)
        v = v.permute(0, 2, 1, 3)  # BHLc
        dim_cat = 2

        if freqs_cis is not None:
            q = apply_rotary_emb(q, freqs_cis=freqs_cis)
            k = apply_rotary_emb(k, freqs_cis=freqs_cis)

        if self.caching:
            if self.cached_k is None:
                self.cached_k = k
                self.cached_v = v
            else:
                k = self.cached_k = torch.cat((self.cached_k, k), dim=dim_cat)
                v = self.cached_v = torch.cat((self.cached_v, v), dim=dim_cat)

        dropout_p = self.attn_drop if self.training else 0.0
        out = (
            scaled_dot_product_attention(
                query=q,
                key=k,
                value=v,
                scale=self.scale,
                attn_mask=attn_bias,
                dropout_p=dropout_p,
            )
            .transpose(1, 2)
            .reshape(B, L, C)
        )

        return self.proj_drop(self.proj(out))

    def extra_repr(self) -> str:
        return f"attn_l2_norm={self.qk_norm}"


class AdaLNSelfCrossAttn(nn.Module):
    def __init__(
        self,
        block_idx,
        last_drop_p,
        embed_dim,
        cond_dim,
        shared_aln: bool,
        num_heads,
        mlp_ratio=4.0,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        qk_norm=False,
        context_dim=None,
        use_swiglu_ffn=False,
        norm_eps=1e-6,
    ):
        super().__init__()
        assert attn_drop == 0.0
        assert qk_norm

        self.block_idx, self.last_drop_p, self.C = block_idx, last_drop_p, embed_dim
        self.C, self.D = embed_dim, cond_dim
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.attn = SelfAttention(
            block_idx=block_idx,
            embed_dim=embed_dim,
            num_heads=num_heads,
            attn_drop=attn_drop,
            proj_drop=drop,
            qk_norm=qk_norm,
        )

        if context_dim:
            self.cross_attn = CrossAttention(
                embed_dim=embed_dim,
                context_dim=context_dim,
                num_heads=num_heads,
                attn_drop=attn_drop,
                proj_drop=drop,
                qk_norm=qk_norm,
            )
        else:
            self.cross_attn = None

        if use_swiglu_ffn:
            self.ffn = SwiGLUFFN(dim=embed_dim)
        else:
            self.ffn = FFN(
                in_features=embed_dim,
                hidden_features=round(embed_dim * mlp_ratio),
                drop=drop,
            )

        self.self_attention_norm1 = RMSNorm(embed_dim, eps=norm_eps)
        self.self_attention_norm2 = RMSNorm(embed_dim, eps=norm_eps)
        self.cross_attention_norm1 = RMSNorm(embed_dim, eps=norm_eps)
        self.cross_attention_norm2 = RMSNorm(embed_dim, eps=norm_eps)

        self.ffn_norm1 = RMSNorm(embed_dim, eps=norm_eps)
        self.ffn_norm2 = RMSNorm(embed_dim, eps=norm_eps)

        self.attention_y_norm = RMSNorm(context_dim, eps=norm_eps)

        self.shared_aln = shared_aln
        if self.shared_aln:
            self.ada_gss = nn.Parameter(
                torch.randn(1, 1, 6, embed_dim) / embed_dim**0.5
            )
        else:
            lin = nn.Linear(cond_dim, 6 * embed_dim)
            self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), lin)

        self.fused_add_norm_fn = None

    # NOTE: attn_bias is None during inference because kv cache is enabled
    def forward(
        self,
        x,
        cond_BD,
        attn_bias,
        context=None,
        context_attn_bias=None,
        freqs_cis=None,
    ):  # C: embed_dim, D: cond_dim
        if self.shared_aln:
            gamma1, gamma2, scale1, scale2, shift1, shift2 = (
                self.ada_gss + cond_BD
            ).unbind(
                2
            )  # 116C + B16C =unbind(2)=> 6 B1C
        else:
            gamma1, gamma2, scale1, scale2, shift1, shift2 = (
                self.ada_lin(cond_BD).view(-1, 1, 6, self.C).unbind(2)
            )
        x = x + self.self_attention_norm2(
            self.attn(
                self.self_attention_norm1(x).mul(scale1.add(1)).add(shift1),
                attn_bias=attn_bias,
                freqs_cis=freqs_cis,
            ).mul(gamma1)
        )
        if context is not None:
            x = x + self.cross_attention_norm2(
                self.cross_attn(
                    self.cross_attention_norm1(x),
                    self.attention_y_norm(context),
                    context_attn_bias=context_attn_bias,
                    freqs_cis=freqs_cis,
                )
            )
        x = x + self.ffn_norm2(
            self.ffn(self.ffn_norm1(x).mul(scale2.add(1)).add(shift2)).mul(gamma2)
        )
        return x

    def extra_repr(self) -> str:
        return f"shared_aln={self.shared_aln}"


class AdaLNBeforeHead(nn.Module):
    def __init__(self, C, D, norm_layer):  # C: embed_dim, D: cond_dim
        super().__init__()
        self.C, self.D = C, D
        self.ln_wo_grad = norm_layer(C, elementwise_affine=False)
        self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), nn.Linear(D, 2 * C))

    def forward(self, x_BLC: torch.Tensor, cond_BD: torch.Tensor):
        scale, shift = self.ada_lin(cond_BD).view(-1, 1, 2, self.C).unbind(2)
        return self.ln_wo_grad(x_BLC).mul(scale.add(1)).add_(shift)
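A quick shape check for one AdaLN self/cross-attention block (a sketch, not part of the commit; the dimensions, head count, and context length are assumptions chosen only to illustrate the interface):

# Hypothetical shape check for AdaLNSelfCrossAttn.
import torch
from models.basic_var import AdaLNSelfCrossAttn

blk = AdaLNSelfCrossAttn(
    block_idx=0, last_drop_p=0.0, embed_dim=1024, cond_dim=1024,
    shared_aln=False, num_heads=16, qk_norm=True, context_dim=2048,
)
x = torch.randn(2, 16, 1024)    # (B, L, C) token map
cond = torch.randn(2, 1024)     # pooled text condition (AdaLN input)
ctx = torch.randn(2, 77, 2048)  # cross-attention context (text tokens)
out = blk(x, cond, attn_bias=None, context=ctx)
print(out.shape)                # torch.Size([2, 16, 1024])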
models/clip.py ADDED
@@ -0,0 +1,53 @@
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
from transformers import CLIPTextModel, CLIPTokenizer


class FrozenCLIPEmbedder(nn.Module):
    """Uses the CLIP transformer encoder for text (from huggingface)"""

    def __init__(
        self,
        version="openai/clip-vit-large-patch14",
        device="cuda",
        max_length=77,
        freeze=True,
    ):  # clip-vit-base-patch32
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        self.transformer = CLIPTextModel.from_pretrained(version).to(device)
        self.device = device
        self.hidden_size = self.transformer.config.hidden_size
        self.max_length = max_length
        if freeze:
            self.freeze()

    def freeze(self):
        self.transformer = self.transformer.eval()
        # self.train = disabled_train
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, text):
        batch_encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        ).to(self.device)

        with torch.cuda.amp.autocast():
            outputs = self.transformer(**batch_encoding)

        attn_bias = batch_encoding["attention_mask"].float()
        attn_bias[attn_bias == 0] = -float("inf")
        attn_bias[attn_bias == 1] = 0.0
        outputs["attn_bias"] = attn_bias
        return outputs

    def encode(self, text):
        return self(text)
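A minimal usage sketch of the frozen text encoder (not part of the commit; it downloads the CLIP checkpoint on first run, and the printed hidden size assumes the default clip-vit-large-patch14 model):

# Hypothetical usage of FrozenCLIPEmbedder.
import torch
from models.clip import FrozenCLIPEmbedder

enc = FrozenCLIPEmbedder(device="cuda" if torch.cuda.is_available() else "cpu")
out = enc.encode(["A capybara wearing a suit"])
print(out.last_hidden_state.shape)  # (1, 77, 768) for clip-vit-large-patch14
print(out.attn_bias.shape)          # (1, 77): 0.0 for real tokens, -inf for padding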
models/helpers.py ADDED
@@ -0,0 +1,93 @@
import torch
from torch import nn as nn
from torch.nn import functional as F


def sample_with_top_k_top_p_(
    logits_BlV: torch.Tensor,
    top_k: int = 0,
    top_p: float = 0.0,
    rng=None,
    num_samples=1,
) -> torch.Tensor:  # return idx, shaped (B, l)
    B, l, V = logits_BlV.shape
    if top_k > 0:
        idx_to_remove = logits_BlV < logits_BlV.topk(
            top_k, largest=True, sorted=False, dim=-1
        )[0].amin(dim=-1, keepdim=True)
        logits_BlV.masked_fill_(idx_to_remove, -torch.inf)
    if top_p > 0:
        sorted_logits, sorted_idx = logits_BlV.sort(dim=-1, descending=False)
        sorted_idx_to_remove = sorted_logits.softmax(dim=-1).cumsum_(dim=-1) <= (1 - top_p)
        sorted_idx_to_remove[..., -1:] = False
        logits_BlV.masked_fill_(
            sorted_idx_to_remove.scatter(
                sorted_idx.ndim - 1, sorted_idx, sorted_idx_to_remove
            ),
            -torch.inf,
        )
    # sample (have to squeeze cuz torch.multinomial can only be used for 2D tensor)
    replacement = num_samples >= 0
    num_samples = abs(num_samples)
    return torch.multinomial(
        logits_BlV.softmax(dim=-1).view(-1, V),
        num_samples=num_samples,
        replacement=replacement,
        generator=rng,
    ).view(B, l, num_samples)


def gumbel_softmax_with_rng(
    logits: torch.Tensor,
    tau: float = 1,
    hard: bool = False,
    eps: float = 1e-10,
    dim: int = -1,
    rng: torch.Generator = None,
) -> torch.Tensor:
    if rng is None:
        return F.gumbel_softmax(logits=logits, tau=tau, hard=hard, eps=eps, dim=dim)

    gumbels = (
        -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format)
        .exponential_(generator=rng)
        .log()
    )
    gumbels = (logits + gumbels) / tau
    y_soft = gumbels.softmax(dim)

    if hard:
        index = y_soft.max(dim, keepdim=True)[1]
        y_hard = torch.zeros_like(
            logits, memory_format=torch.legacy_contiguous_format
        ).scatter_(dim, index, 1.0)
        ret = y_hard - y_soft.detach() + y_soft
    else:
        ret = y_soft
    return ret


def drop_path(
    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):  # taken from timm
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor


class DropPath(nn.Module):  # taken from timm
    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        return f"(drop_prob=...)"
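A small sanity check for the sampling helper (a sketch, not part of the commit). Note that the trailing underscore in the name reflects that the logits are masked in place, hence the .clone() below:

# Hypothetical check of top-k / top-p sampling on random logits.
import torch
from models.helpers import sample_with_top_k_top_p_

logits = torch.randn(2, 5, 4096)                 # (B, l, V) token logits
rng = torch.Generator().manual_seed(42)
idx = sample_with_top_k_top_p_(logits.clone(), top_k=450, top_p=0.95, rng=rng)
print(idx.shape)                                 # torch.Size([2, 5, 1])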
models/pipeline.py ADDED
@@ -0,0 +1,226 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
from torchvision.transforms import ToPILImage

from models.vqvae import VQVAEHF
from models.clip import FrozenCLIPEmbedder
from models.var import TVARHF, sample_with_top_k_top_p_, gumbel_softmax_with_rng


class TVARPipeline:
    vae_path = "michellemoorre/vae-test"
    text_encoder_path = "openai/clip-vit-large-patch14"
    text_encoder_2_path = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"

    def __init__(self, var, vae, text_encoder, text_encoder_2, device):
        self.var = var
        self.vae = vae
        self.text_encoder = text_encoder
        self.text_encoder_2 = text_encoder_2

        self.var.eval()
        self.vae.eval()

        self.device = device

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, device="cuda"):
        var = TVARHF.from_pretrained(pretrained_model_name_or_path).to(device)
        vae = VQVAEHF.from_pretrained(cls.vae_path).to(device)
        text_encoder = FrozenCLIPEmbedder(cls.text_encoder_path, device=device)
        text_encoder_2 = FrozenCLIPEmbedder(cls.text_encoder_2_path, device=device)

        return cls(var, vae, text_encoder, text_encoder_2, device)

    @staticmethod
    def to_image(tensor):
        return [
            ToPILImage()((255 * img.cpu().detach()).to(torch.uint8))
            for img in tensor
        ]

    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        null_prompt: str = "",
        encode_null: bool = True,
    ):
        prompt = [prompt] if isinstance(prompt, str) else prompt
        encodings = [
            self.text_encoder.encode(prompt),
            self.text_encoder_2.encode(prompt),
        ]
        prompt_embeds = torch.concat(
            [encoding.last_hidden_state for encoding in encodings], dim=-1
        )
        pooled_prompt_embeds = encodings[-1].pooler_output
        attn_bias = encodings[-1].attn_bias

        if encode_null:
            null_prompt = [null_prompt] if isinstance(null_prompt, str) else null_prompt
            null_encodings = [
                self.text_encoder.encode(null_prompt),
                self.text_encoder_2.encode(null_prompt),
            ]
            null_prompt_embeds = torch.concat(
                [encoding.last_hidden_state for encoding in null_encodings], dim=-1
            )
            null_pooled_prompt_embeds = null_encodings[-1].pooler_output
            null_attn_bias = null_encodings[-1].attn_bias

            B, L, hidden_dim = prompt_embeds.shape
            pooled_dim = pooled_prompt_embeds.shape[1]

            null_prompt_embeds = null_prompt_embeds[:, :L].expand(B, L, hidden_dim).to(prompt_embeds.device)
            null_pooled_prompt_embeds = null_pooled_prompt_embeds.expand(B, pooled_dim).to(pooled_prompt_embeds.device)
            null_attn_bias = null_attn_bias[:, :L].expand(B, L).to(attn_bias.device)

            prompt_embeds = torch.cat([prompt_embeds, null_prompt_embeds], dim=0)
            pooled_prompt_embeds = torch.cat([pooled_prompt_embeds, null_pooled_prompt_embeds], dim=0)
            attn_bias = torch.cat([attn_bias, null_attn_bias], dim=0)

        return prompt_embeds, pooled_prompt_embeds, attn_bias

    @torch.inference_mode()
    def __call__(
        self,
        prompt=None,
        null_prompt="",
        g_seed: Optional[int] = None,
        cfg=4.0,
        top_k=450,
        top_p=0.95,
        more_smooth=False,
        re=False,
        re_max_depth=10,
        return_pil=True,
        encoded_prompt=None,
        encoded_null_prompt=None,
    ) -> torch.Tensor:  # returns reconstructed image (B, 3, H, W) in [0, 1]
        """
        Only used for inference, in autoregressive mode.
        :param prompt: text prompt(s) to condition on
        :param null_prompt: unconditional prompt used for classifier-free guidance
        :param g_seed: random seed
        :param cfg: classifier-free guidance ratio
        :param top_k: top-k sampling
        :param top_p: top-p sampling
        :param more_smooth: smoothing the pred using gumbel softmax; only used for visualization, not for FID/IS benchmarking
        :return: PIL images if return_pil, else an image tensor in [0, 1]
        """
        assert not self.var.training
        var = self.var
        vae = self.vae
        vae_quant = self.vae.quantize
        if g_seed is None:
            rng = None
        else:
            var.rng.manual_seed(g_seed)
            rng = var.rng

        if encoded_prompt is not None:
            assert encoded_null_prompt is not None
            context, cond_vector, context_attn_bias = self.var.parse_batch(
                encoded_prompt,
                encoded_null_prompt,
            )
        else:
            context, cond_vector, context_attn_bias = self.encode_prompt(prompt, null_prompt)

        B = context.shape[0] // 2

        cond_vector = var.text_pooler(cond_vector)

        sos = cond_BD = cond_vector

        lvl_pos = var.lvl_embed(var.lvl_1L)
        if not var.rope:
            lvl_pos += var.pos_1LC
        next_token_map = (
            sos.unsqueeze(1)
            + var.pos_start.expand(2 * B, var.first_l, -1)
            + lvl_pos[:, : var.first_l]
        )
        cur_L = 0
        f_hat = sos.new_zeros(B, var.Cvae, var.patch_nums[-1], var.patch_nums[-1])

        for b in var.blocks:
            b.attn.kv_caching(True)
            b.cross_attn.kv_caching(True)

        for si, pn in enumerate(var.patch_nums):  # si: i-th segment
            ratio = si / var.num_stages_minus_1
            cond_BD_or_gss = var.shared_ada_lin(cond_BD)
            x_BLC = next_token_map

            if var.rope:
                freqs_cis = var.freqs_cis[:, cur_L : cur_L + pn * pn]
            else:
                freqs_cis = var.freqs_cis

            for block in var.blocks:
                x_BLC = block(
                    x=x_BLC,
                    cond_BD=cond_BD_or_gss,
                    attn_bias=None,
                    context=context,
                    context_attn_bias=context_attn_bias,
                    freqs_cis=freqs_cis,
                )
            cur_L += pn * pn

            logits_BlV = var.get_logits(x_BLC, cond_BD)

            t = cfg * ratio
            logits_BlV = (1 + t) * logits_BlV[:B] - t * logits_BlV[B:]

            idx_Bl = sample_with_top_k_top_p_(
                logits_BlV, rng=rng, top_k=top_k, top_p=top_p, num_samples=1
            )[:, :, 0]
            if re:
                selected_logits = torch.gather(logits_BlV, -1, idx_Bl.unsqueeze(-1))[:, :, 0]
                mx = selected_logits.sum(dim=-1)[:, None]
                for _ in range(re_max_depth):
                    new_idx_Bl = sample_with_top_k_top_p_(
                        logits_BlV, rng=rng, top_k=top_k, top_p=top_p, num_samples=1
                    )[:, :, 0]
                    selected_logits = torch.gather(logits_BlV, -1, new_idx_Bl.unsqueeze(-1))[:, :, 0]

                    new_mx = selected_logits.sum(dim=-1)[:, None]
                    idx_Bl = idx_Bl * (mx >= new_mx) + new_idx_Bl * (mx < new_mx)
                    mx = mx * (mx >= new_mx) + new_mx * (mx < new_mx)
            if not more_smooth:  # this is the default case
                h_BChw = vae_quant.embedding(idx_Bl)  # B, l, Cvae
            else:  # not used when evaluating FID/IS/Precision/Recall
                gum_t = max(0.27 * (1 - ratio * 0.95), 0.005)  # refer to mask-git
                h_BChw = gumbel_softmax_with_rng(
                    logits_BlV.mul(1 + ratio), tau=gum_t, hard=False, dim=-1, rng=rng
                ) @ vae_quant.embedding.weight.unsqueeze(0)

            h_BChw = h_BChw.transpose_(1, 2).reshape(B, var.Cvae, pn, pn)
            f_hat, next_token_map = vae_quant.get_next_autoregressive_input(
                si, len(var.patch_nums), f_hat, h_BChw
            )
            if si != var.num_stages_minus_1:  # prepare for next stage
                next_token_map = next_token_map.view(B, var.Cvae, -1).transpose(1, 2)
                next_token_map = (
                    var.word_embed(next_token_map)
                    + lvl_pos[:, cur_L : cur_L + var.patch_nums[si + 1] ** 2]
                )
                next_token_map = next_token_map.repeat(
                    2, 1, 1
                )  # double the batch size for CFG

        for b in var.blocks:
            b.attn.kv_caching(False)
            b.cross_attn.kv_caching(False)

        # de-normalize, from [-1, 1] to [0, 1]
        img = vae.fhat_to_img(f_hat).add(1).mul(0.5)
        if return_pil:
            img = self.to_image(img)
        return img
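For reference, a minimal sketch of calling the pipeline directly, mirroring what app.py does (not part of the commit; assumes a GPU and network access to the model repos listed above):

# Hypothetical direct use of TVARPipeline outside the Gradio demo.
import torch
from models import TVARPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = TVARPipeline.from_pretrained("michellemoorre/var-test", device=device)

images = pipe(
    prompt="A capybara wearing a suit holding a sign that reads Hello World",
    cfg=4.0,
    top_k=450,
    top_p=0.95,
    g_seed=42,
)
images[0].save("capybara.png")  # PIL images are returned when return_pil=True (the default)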
models/quant.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import List, Optional, Sequence, Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import distributed as tdist
7
+ from torch import nn as nn
8
+ from torch.nn import functional as F
9
+
10
+ import dist
11
+
12
+ # this file only provides the VectorQuantizer2 used in VQVAE
13
+ __all__ = ["VectorQuantizer2"]
14
+
15
+
16
+ class VectorQuantizer2(nn.Module):
17
+ # VQGAN originally use beta=1.0, never tried 0.25; SD seems using 0.25
18
+ def __init__(
19
+ self,
20
+ vocab_size,
21
+ Cvae,
22
+ using_znorm,
23
+ beta: float = 0.25,
24
+ default_qresi_counts=0,
25
+ v_patch_nums=None,
26
+ quant_resi=0.5,
27
+ share_quant_resi=4, # share_quant_resi: args.qsr
28
+ ):
29
+ super().__init__()
30
+ self.vocab_size: int = vocab_size
31
+ self.Cvae: int = Cvae
32
+ self.using_znorm: bool = using_znorm
33
+ self.v_patch_nums: Tuple[int] = v_patch_nums
34
+
35
+ self.quant_resi_ratio = quant_resi
36
+ if share_quant_resi == 0: # non-shared: \phi_{1 to K} for K scales
37
+ self.quant_resi = PhiNonShared(
38
+ [
39
+ (Phi(Cvae, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity())
40
+ for _ in range(default_qresi_counts or len(self.v_patch_nums))
41
+ ]
42
+ )
43
+ elif share_quant_resi == 1: # fully shared: only a single \phi for K scales
44
+ self.quant_resi = PhiShared(
45
+ Phi(Cvae, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity()
46
+ )
47
+ else: # partially shared: \phi_{1 to share_quant_resi} for K scales
48
+ self.quant_resi = PhiPartiallyShared(
49
+ nn.ModuleList([(
50
+ Phi(Cvae, quant_resi)
51
+ if abs(quant_resi) > 1e-6
52
+ else nn.Identity()
53
+ ) for _ in range(share_quant_resi)])
54
+ )
55
+
56
+ self.register_buffer(
57
+ "ema_vocab_hit_SV",
58
+ torch.full((len(self.v_patch_nums), self.vocab_size), fill_value=0.0),
59
+ )
60
+ self.record_hit = 0
61
+
62
+ self.beta: float = beta
63
+ self.embedding = nn.Embedding(self.vocab_size, self.Cvae)
64
+
65
+ def eini(self, eini):
66
+ if eini > 0:
67
+ nn.init.trunc_normal_(self.embedding.weight.data, std=eini)
68
+ elif eini < 0:
69
+ self.embedding.weight.data.uniform_(
70
+ -abs(eini) / self.vocab_size, abs(eini) / self.vocab_size
71
+ )
72
+
73
+ def extra_repr(self) -> str:
74
+ return f"{self.v_patch_nums}, znorm={self.using_znorm}, beta={self.beta} | S={len(self.v_patch_nums)}, quant_resi={self.quant_resi_ratio}"
75
+
76
+ # ===================== `forward` is only used in VAE training =====================
77
+ def forward(
78
+ self, f_BChw: torch.Tensor, ret_usages=False
79
+ ) -> Tuple[torch.Tensor, List[float], torch.Tensor]:
80
+ dtype = f_BChw.dtype
81
+ if dtype != torch.float32:
82
+ f_BChw = f_BChw.float()
83
+ B, C, H, W = f_BChw.shape
84
+ f_no_grad = f_BChw.detach()
85
+
86
+ f_rest = f_no_grad.clone()
87
+ f_hat = torch.zeros_like(f_rest)
88
+
89
+ with torch.cuda.amp.autocast(enabled=False):
90
+ mean_vq_loss: torch.Tensor = 0.0
91
+ vocab_hit_V = torch.zeros(
92
+ self.vocab_size, dtype=torch.float, device=f_BChw.device
93
+ )
94
+ SN = len(self.v_patch_nums)
95
+ for si, pn in enumerate(self.v_patch_nums): # from small to large
96
+ # find the nearest embedding
97
+ if self.using_znorm:
98
+ rest_NC = (
99
+ F.interpolate(f_rest, size=(pn, pn), mode="area")
100
+ .permute(0, 2, 3, 1)
101
+ .reshape(-1, C)
102
+ if (si != SN - 1)
103
+ else f_rest.permute(0, 2, 3, 1).reshape(-1, C)
104
+ )
105
+ rest_NC = F.normalize(rest_NC, dim=-1)
106
+ idx_N = torch.argmax(
107
+ rest_NC @ F.normalize(self.embedding.weight.data.T, dim=0),
108
+ dim=1,
109
+ )
110
+ else:
111
+ rest_NC = (
112
+ F.interpolate(f_rest, size=(pn, pn), mode="area")
113
+ .permute(0, 2, 3, 1)
114
+ .reshape(-1, C)
115
+ if (si != SN - 1)
116
+ else f_rest.permute(0, 2, 3, 1).reshape(-1, C)
117
+ )
118
+ d_no_grad = torch.sum(
119
+ rest_NC.square(), dim=1, keepdim=True
120
+ ) + torch.sum(
121
+ self.embedding.weight.data.square(), dim=1, keepdim=False
122
+ )
123
+ d_no_grad.addmm_(
124
+ rest_NC, self.embedding.weight.data.T, alpha=-2, beta=1
125
+ ) # (B*h*w, vocab_size)
126
+ idx_N = torch.argmin(d_no_grad, dim=1)
127
+
128
+ hit_V = idx_N.bincount(minlength=self.vocab_size).float()
129
+ if self.training:
130
+ if dist.initialized():
131
+ handler = tdist.all_reduce(hit_V, async_op=True)
132
+
133
+ # calc loss
134
+ idx_Bhw = idx_N.view(B, pn, pn)
135
+ h_BChw = (
136
+ F.interpolate(
137
+ self.embedding(idx_Bhw).permute(0, 3, 1, 2),
138
+ size=(H, W),
139
+ mode="bicubic",
140
+ ).contiguous()
141
+ if (si != SN - 1)
142
+ else self.embedding(idx_Bhw).permute(0, 3, 1, 2).contiguous()
143
+ )
144
+ h_BChw = self.quant_resi[si / (SN - 1)](h_BChw)
145
+ f_hat = f_hat + h_BChw
146
+ f_rest -= h_BChw
147
+
148
+ if self.training and dist.initialized():
149
+ handler.wait()
150
+ if self.record_hit == 0:
151
+ self.ema_vocab_hit_SV[si].copy_(hit_V)
152
+ elif self.record_hit < 100:
153
+ self.ema_vocab_hit_SV[si].mul_(0.9).add_(hit_V.mul(0.1))
154
+ else:
155
+ self.ema_vocab_hit_SV[si].mul_(0.99).add_(hit_V.mul(0.01))
156
+ self.record_hit += 1
157
+ vocab_hit_V.add_(hit_V)
158
+ mean_vq_loss += F.mse_loss(f_hat.data, f_BChw).mul_(self.beta) + F.mse_loss(f_hat, f_no_grad)
159
+
160
+ mean_vq_loss *= 1.0 / SN
161
+ f_hat = (f_hat.data - f_no_grad).add_(f_BChw)
162
+
163
+ margin = (
164
+ (tdist.get_world_size() if dist.initialized() else 1)
165
+ * (f_BChw.numel() / f_BChw.shape[1])
166
+ / self.vocab_size
167
+ * 0.08
168
+ )
169
+ # margin = pn*pn / 100
170
+ if ret_usages:
171
+ usages = [
172
+ (self.ema_vocab_hit_SV[si] >= margin).float().mean().item() * 100
173
+ for si, pn in enumerate(self.v_patch_nums)
174
+ ]
175
+ else:
176
+ usages = None
177
+ return f_hat, usages, mean_vq_loss
178
+
179
+ # ===================== `forward` is only used in VAE training =====================
180
+
181
+ def embed_to_fhat(
182
+ self, ms_h_BChw: List[torch.Tensor], all_to_max_scale=True, last_one=False
183
+ ) -> Union[List[torch.Tensor], torch.Tensor]:
184
+ ls_f_hat_BChw = []
185
+ B = ms_h_BChw[0].shape[0]
186
+ H = W = self.v_patch_nums[-1]
187
+ SN = len(self.v_patch_nums)
188
+ if all_to_max_scale:
189
+ f_hat = ms_h_BChw[0].new_zeros(B, self.Cvae, H, W, dtype=torch.float32)
190
+ for si, pn in enumerate(self.v_patch_nums): # from small to large
191
+ h_BChw = ms_h_BChw[si]
192
+ if si < len(self.v_patch_nums) - 1:
193
+ h_BChw = F.interpolate(h_BChw, size=(H, W), mode="bicubic")
194
+ h_BChw = self.quant_resi[si / (SN - 1)](h_BChw)
195
+ f_hat.add_(h_BChw)
196
+ if last_one:
197
+ ls_f_hat_BChw = f_hat
198
+ else:
199
+ ls_f_hat_BChw.append(f_hat.clone())
200
+ else:
201
+ # WARNING: this is not the case in VQ-VAE training or inference (we'll interpolate every token map to the max H W, like above)
202
+ # WARNING: this should only be used for experimental purposes
203
+ f_hat = ms_h_BChw[0].new_zeros(
204
+ B,
205
+ self.Cvae,
206
+ self.v_patch_nums[0],
207
+ self.v_patch_nums[0],
208
+ dtype=torch.float32,
209
+ )
210
+ for si, pn in enumerate(self.v_patch_nums): # from small to large
211
+ f_hat = F.interpolate(f_hat, size=(pn, pn), mode="bicubic")
212
+ h_BChw = self.quant_resi[si / (SN - 1)](ms_h_BChw[si])
213
+ f_hat.add_(h_BChw)
214
+ if last_one:
215
+ ls_f_hat_BChw = f_hat
216
+ else:
217
+ ls_f_hat_BChw.append(f_hat)
218
+
219
+ return ls_f_hat_BChw
220
+
221
+ def f_to_idxBl_or_fhat(
222
+ self,
223
+ f_BChw: torch.Tensor,
224
+ to_fhat: bool,
225
+ v_patch_nums: Optional[Sequence[Union[int, Tuple[int, int]]]] = None,
226
+ noise_std: Optional[float] = None,
227
+ ) -> List[Union[torch.Tensor, torch.LongTensor]]: # f_BChw is the feature from inp_img_no_grad
228
+ B, C, H, W = f_BChw.shape
229
+ f_no_grad = f_BChw.detach()
230
+ f_rest = f_no_grad.clone()
231
+ f_hat = torch.zeros_like(f_rest)
232
+
233
+ f_hat_or_idx_Bl: List[torch.Tensor] = []
234
+
235
+ patch_hws = [
236
+ (pn, pn) if isinstance(pn, int) else (pn[0], pn[1])
237
+ for pn in (v_patch_nums or self.v_patch_nums)
238
+ ] # from small to large
239
+ assert (
240
+ patch_hws[-1][0] == H and patch_hws[-1][1] == W
241
+ ), f"{patch_hws[-1]=} != ({H=}, {W=})"
242
+
243
+ SN = len(patch_hws)
244
+ for si, (ph, pw) in enumerate(patch_hws): # from small to large
245
+ # find the nearest embedding
246
+ z_NC = (
247
+ F.interpolate(f_rest, size=(ph, pw), mode="area")
248
+ .permute(0, 2, 3, 1)
249
+ .reshape(-1, C)
250
+ if (si != SN - 1)
251
+ else f_rest.permute(0, 2, 3, 1).reshape(-1, C)
252
+ )
253
+ if noise_std is not None:
254
+ z_NC = math.sqrt(1 - noise_std ** 2) * z_NC + torch.randn_like(z_NC) * noise_std
255
+
256
+ if self.using_znorm:
257
+ z_NC = F.normalize(z_NC, dim=-1)
258
+ idx_N = torch.argmax(
259
+ z_NC @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1
260
+ )
261
+ else:
262
+ d_no_grad = torch.sum(z_NC.square(), dim=1, keepdim=True) + torch.sum(
263
+ self.embedding.weight.data.square(), dim=1, keepdim=False
264
+ )
265
+ d_no_grad.addmm_(
266
+ z_NC, self.embedding.weight.data.T, alpha=-2, beta=1
267
+ ) # (B*h*w, vocab_size)
268
+ idx_N = torch.argmin(d_no_grad, dim=1)
269
+
270
+ idx_Bhw = idx_N.view(B, ph, pw)
271
+ h_BChw = (
272
+ F.interpolate(
273
+ self.embedding(idx_Bhw).permute(0, 3, 1, 2),
274
+ size=(H, W),
275
+ mode="bicubic",
276
+ ).contiguous()
277
+ if (si != SN - 1)
278
+ else self.embedding(idx_Bhw).permute(0, 3, 1, 2).contiguous()
279
+ )
280
+ h_BChw = self.quant_resi[si / (SN - 1)](h_BChw)
281
+ f_hat.add_(h_BChw)
282
+ f_rest.sub_(h_BChw)
283
+ f_hat_or_idx_Bl.append(
284
+ f_hat.clone() if to_fhat else idx_N.reshape(B, ph * pw)
285
+ )
286
+
287
+ return f_hat_or_idx_Bl
288
+
289
+ # ===================== idxBl_to_var_input: only used in VAR training, for getting teacher-forcing input =====================
290
+ def idxBl_to_var_input(self, gt_ms_idx_Bl: List[torch.Tensor]) -> torch.Tensor:
291
+ next_scales = []
292
+ B = gt_ms_idx_Bl[0].shape[0]
293
+ C = self.Cvae
294
+ H = W = self.v_patch_nums[-1]
295
+ SN = len(self.v_patch_nums)
296
+
297
+ f_hat = gt_ms_idx_Bl[0].new_zeros(B, C, H, W, dtype=torch.float32)
298
+ pn_next: int = self.v_patch_nums[0]
299
+ for si in range(SN - 1):
300
+ h_BChw = F.interpolate(
301
+ self.embedding(gt_ms_idx_Bl[si])
302
+ .transpose_(1, 2)
303
+ .view(B, C, pn_next, pn_next),
304
+ size=(H, W),
305
+ mode="bicubic",
306
+ )
307
+ f_hat.add_(self.quant_resi[si / (SN - 1)](h_BChw))
308
+ pn_next = self.v_patch_nums[si + 1]
309
+ next_scales.append(
310
+ F.interpolate(f_hat, size=(pn_next, pn_next), mode="area")
311
+ .view(B, C, -1)
312
+ .transpose(1, 2)
313
+ )
314
+ # cat BlCs to BLC, this should be float32
315
+ return torch.cat(next_scales, dim=1) if len(next_scales) else None
316
+
317
+ # ===================== get_next_autoregressive_input: only used in VAR inference, for getting next step's input =====================
318
+ def get_next_autoregressive_input(
319
+ self, si: int, SN: int, f_hat: torch.Tensor, h_BChw: torch.Tensor
320
+ ) -> Tuple[Optional[torch.Tensor], torch.Tensor]: # only used in VAR inference
321
+ HW = self.v_patch_nums[-1]
322
+ if si != SN - 1:
323
+ h = self.quant_resi[si / (SN - 1)](
324
+ F.interpolate(h_BChw, size=(HW, HW), mode="bicubic")
325
+ ) # conv after upsample
326
+ f_hat.add_(h)
327
+ return f_hat, F.interpolate(
328
+ f_hat,
329
+ size=(self.v_patch_nums[si + 1], self.v_patch_nums[si + 1]),
330
+ mode="area",
331
+ )
332
+ else:
333
+ h = self.quant_resi[si / (SN - 1)](h_BChw)
334
+ f_hat.add_(h)
335
+ return f_hat, f_hat
336
+
337
+
338
+ class Phi(nn.Conv2d):
339
+ def __init__(self, embed_dim, quant_resi):
340
+ ks = 3
341
+ super().__init__(
342
+ in_channels=embed_dim,
343
+ out_channels=embed_dim,
344
+ kernel_size=ks,
345
+ stride=1,
346
+ padding=ks // 2,
347
+ )
348
+ self.resi_ratio = abs(quant_resi)
349
+
350
+ def forward(self, h_BChw):
351
+ return h_BChw.mul(1 - self.resi_ratio) + super().forward(h_BChw).mul_(
352
+ self.resi_ratio
353
+ )
354
+
355
+
356
+ class PhiShared(nn.Module):
357
+ def __init__(self, qresi: Phi):
358
+ super().__init__()
359
+ self.qresi: Phi = qresi
360
+
361
+ def __getitem__(self, _) -> Phi:
362
+ return self.qresi
363
+
364
+
365
+ class PhiPartiallyShared(nn.Module):
366
+ def __init__(self, qresi_ls: nn.ModuleList):
367
+ super().__init__()
368
+ self.qresi_ls = qresi_ls
369
+ K = len(qresi_ls)
370
+ self.ticks = (
371
+ np.linspace(1 / 3 / K, 1 - 1 / 3 / K, K)
372
+ if K == 4
373
+ else np.linspace(1 / 2 / K, 1 - 1 / 2 / K, K)
374
+ )
375
+
376
+ def __getitem__(self, at_from_0_to_1: float) -> Phi:
377
+ return self.qresi_ls[np.argmin(np.abs(self.ticks - at_from_0_to_1)).item()]
378
+
379
+ def extra_repr(self) -> str:
380
+ return f"ticks={self.ticks}"
381
+
382
+
383
+ class PhiNonShared(nn.ModuleList):
384
+ def __init__(self, qresi: List):
385
+ super().__init__(qresi)
386
+ # self.qresi = qresi
387
+ K = len(qresi)
388
+ self.ticks = (
389
+ np.linspace(1 / 3 / K, 1 - 1 / 3 / K, K)
390
+ if K == 4
391
+ else np.linspace(1 / 2 / K, 1 - 1 / 2 / K, K)
392
+ )
393
+
394
+ def __getitem__(self, at_from_0_to_1: float) -> Phi:
395
+ return super().__getitem__(
396
+ np.argmin(np.abs(self.ticks - at_from_0_to_1)).item()
397
+ )
398
+
399
+ def extra_repr(self) -> str:
400
+ return f"ticks={self.ticks}"
models/rope.py ADDED
@@ -0,0 +1,48 @@
1
+ import torch
2
+
3
+
4
+ def init_t_xy(end_x: int, end_y: int):
5
+ t = torch.arange(end_x * end_y, dtype=torch.float32)
6
+ t_x = (t % end_x).float()
7
+ t_y = torch.div(t, end_x, rounding_mode="floor").float()
8
+ return t_x, t_y
9
+
10
+
11
+ def compute_axial_cis(
12
+ dim: int, end_x: int, end_y: int, theta: float = 100.0, norm_coeff: int = 1
13
+ ):
14
+ freqs_x = (
15
+ 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
16
+ * norm_coeff
17
+ )
18
+ freqs_y = (
19
+ 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
20
+ * norm_coeff
21
+ )
22
+
23
+ t_x, t_y = init_t_xy(end_x, end_y)
24
+ freqs_x = torch.outer(t_x, freqs_x)
25
+ freqs_y = torch.outer(t_y, freqs_y)
26
+ freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
27
+ freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)
28
+ return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)
29
+
30
+
31
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
32
+ ndim = x.ndim
33
+ assert 0 <= 1 < ndim
34
+ freqs_cis = freqs_cis[:, x.shape[1], ...]
35
+ if freqs_cis.shape == (x.shape[-2], x.shape[-1]):
36
+ shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
37
+ elif freqs_cis.shape == (x.shape[-3], x.shape[-2], x.shape[-1]):
38
+ shape = [d if i >= ndim - 3 else 1 for i, d in enumerate(x.shape)]
39
+ return freqs_cis.view(*shape)
40
+
41
+
42
+ def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor):
43
+ with torch.cuda.amp.autocast(enabled=False):
44
+ x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2))
45
+ # freqs_cis = reshape_for_broadcast(freqs_cis, x).to(x_in.device)
46
+ freqs_cis = freqs_cis[None, :, : x.shape[2], ...].to(x_in.device)
47
+ x_out = torch.view_as_real(x * freqs_cis).flatten(3)
48
+ return x_out.type_as(x_in)
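For reference, the RoPE helpers above are used roughly as follows — a shape-level sketch with made-up sizes (8 heads, head_dim=64, a 4x4 token grid); the leading [None] mirrors how the VAR constructor stores freqs_cis:

import torch
from models.rope import compute_axial_cis, apply_rotary_emb

B, heads, head_dim, pn = 2, 8, 64, 4
# (1, pn*pn, head_dim//2) complex tensor of axial rotary frequencies
freqs_cis = compute_axial_cis(dim=head_dim, end_x=pn, end_y=pn, theta=100.0)[None]
q = torch.randn(B, heads, pn * pn, head_dim)   # e.g. query states of one attention layer
q_rot = apply_rotary_emb(q, freqs_cis)         # same shape, with rotary position encoding applied
print(q_rot.shape)                             # torch.Size([2, 8, 16, 64])
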
models/var.py ADDED
@@ -0,0 +1,412 @@
1
+ import math
2
+ from functools import partial
3
+ from typing import Dict, List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from huggingface_hub import PyTorchModelHubMixin
8
+
9
+ import dist
10
+ from models.basic_var import AdaLNBeforeHead, AdaLNSelfCrossAttn
11
+ from models.clip import FrozenCLIPEmbedder
12
+ from models.helpers import gumbel_softmax_with_rng, sample_with_top_k_top_p_
13
+ from models.rope import compute_axial_cis
14
+ from models.vqvae import VQVAE, VectorQuantizer2
15
+
16
+
17
+ class SharedAdaLin(nn.Linear):
18
+ def forward(self, cond_BD):
19
+ C = self.weight.shape[0] // 6
20
+ return super().forward(cond_BD).view(-1, 1, 6, C) # B16C
21
+
22
+
23
+ class VAR(nn.Module):
24
+ def __init__(
25
+ self,
26
+ rope=False,
27
+ rope_theta=100,
28
+ rope_size=None,
29
+ depth=16,
30
+ embed_dim=1024,
31
+ num_heads=16,
32
+ mlp_ratio=4.0,
33
+ drop_rate=0.0,
34
+ attn_drop_rate=0.0,
35
+ drop_path_rate=0.0,
36
+ norm_eps=1e-6,
37
+ shared_aln=False,
38
+ attn_l2_norm=False,
39
+ patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16), # 10 steps by default
40
+ fused_if_available=True,
41
+ use_swiglu_ffn=False,
42
+ Cvae=32,
43
+ V=4096
44
+ ):
45
+ super().__init__()
46
+ # 0. hyperparameters
47
+ assert embed_dim % num_heads == 0
48
+ self.depth, self.C, self.D, self.num_heads = (
49
+ depth,
50
+ embed_dim,
51
+ embed_dim,
52
+ num_heads,
53
+ )
54
+ self.Cvae, self.V = Cvae, V
55
+
56
+ self.prog_si = -1 # progressive training
57
+
58
+ self.patch_nums: Tuple[int] = patch_nums
59
+ self.L = sum(pn**2 for pn in self.patch_nums)
60
+ self.first_l = self.patch_nums[0] ** 2
61
+ self.rope = rope
62
+
63
+ self.num_stages_minus_1 = len(self.patch_nums) - 1
64
+ self.rng = torch.Generator(device=dist.get_device())
65
+
66
+ # 1. input (word) embedding
67
+ self.word_embed = nn.Linear(self.Cvae, self.C)
68
+
69
+ # 2. text embedding
70
+ self.pooled_embed_size = 1280
71
+ context_dim = 1280 + 768
72
+
73
+ self.text_pooler = nn.Linear(self.pooled_embed_size, self.D)
74
+
75
+ init_std = math.sqrt(1 / self.C / 3)
76
+ self.pos_start = nn.Parameter(torch.empty(1, self.first_l, self.C))
77
+ nn.init.trunc_normal_(self.pos_start.data, mean=0, std=init_std)
78
+
79
+ # 3. position embedding
80
+ if not self.rope:
81
+ # absolute position embedding
82
+ pos_1LC = []
83
+ for i, pn in enumerate(self.patch_nums):
84
+ pe = torch.empty(1, pn * pn, self.C)
85
+ nn.init.trunc_normal_(pe, mean=0, std=init_std)
86
+ pos_1LC.append(pe)
87
+ pos_1LC = torch.cat(pos_1LC, dim=1) # 1, L, C
88
+ assert tuple(pos_1LC.shape) == (1, self.L, self.C)
89
+ self.pos_1LC = nn.Parameter(pos_1LC)
90
+ self.freqs_cis = None
91
+
92
+ else:
93
+ # RoPE position embedding
94
+ assert (
95
+ self.C // self.num_heads
96
+ ) % 4 == 0, "2d rope needs head dim to be divisible by 4"
97
+ patch_nums_m1 = tuple(pn - 1 if pn > 1 else 1 for pn in self.patch_nums)
98
+ self.compute_cis = partial(compute_axial_cis, dim=self.C // self.num_heads)
99
+ freqs_cis = []
100
+ for i, pn in enumerate(self.patch_nums):
101
+ norm_coeff = rope_size / patch_nums_m1[i]
102
+ cur_freqs = self.compute_cis(
103
+ end_x=pn, end_y=pn, theta=rope_theta, norm_coeff=norm_coeff
104
+ )
105
+ freqs_cis.append(cur_freqs[None, ...])
106
+ self.freqs_cis = torch.cat(freqs_cis, dim=1) # 1, L, C // 2 -- complex
107
+
108
+ # level embedding (similar to GPT's segment embedding, used to distinguish different levels of token pyramid)
109
+ self.lvl_embed = nn.Embedding(len(self.patch_nums), self.C)
110
+ nn.init.trunc_normal_(self.lvl_embed.weight.data, mean=0, std=init_std)
111
+
112
+ # 4. backbone blocks
113
+ self.shared_ada_lin = (
114
+ nn.Sequential(nn.SiLU(inplace=False), SharedAdaLin(self.D, 6 * self.C))
115
+ if shared_aln
116
+ else nn.Identity()
117
+ )
118
+
119
+ norm_layer = partial(nn.LayerNorm, eps=norm_eps)
120
+ self.drop_path_rate = drop_path_rate
121
+ # stochastic depth decay rule (linearly increasing)
122
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
123
+ self.blocks = nn.ModuleList([])
124
+ for block_idx in range(depth):
125
+ self.blocks.append(
126
+ AdaLNSelfCrossAttn(
127
+ cond_dim=self.D,
128
+ shared_aln=shared_aln,
129
+ block_idx=block_idx,
130
+ embed_dim=self.C,
131
+ num_heads=num_heads,
132
+ mlp_ratio=mlp_ratio,
133
+ drop=drop_rate,
134
+ attn_drop=attn_drop_rate,
135
+ drop_path=dpr[block_idx],
136
+ last_drop_p=0 if block_idx == 0 else dpr[block_idx - 1],
137
+ qk_norm=attn_l2_norm,
138
+ context_dim=context_dim,
139
+ use_swiglu_ffn=use_swiglu_ffn,
140
+ norm_eps=norm_eps,
141
+ )
142
+ )
143
+
144
+ fused_add_norm_fns = [b.fused_add_norm_fn is not None for b in self.blocks]
145
+ self.using_fused_add_norm_fn = any(fused_add_norm_fns)
146
+ print(
147
+ f"\n[constructor] ==== fused_if_available={fused_if_available} (fusing_add_ln={sum(fused_add_norm_fns)}/{self.depth}, fusing_mlp={sum(b.ffn.fused_mlp_func is not None for b in self.blocks)}/{self.depth}) ==== \n"
148
+ f" [VAR config ] embed_dim={embed_dim}, num_heads={num_heads}, depth={depth}, mlp_ratio={mlp_ratio}\n"
149
+ f" [drop ratios ] drop_rate={drop_rate}, attn_drop_rate={attn_drop_rate}, drop_path_rate={drop_path_rate:g} ({torch.linspace(0, drop_path_rate, depth)})",
150
+ end="\n\n",
151
+ flush=True,
152
+ )
153
+
154
+ # 5. attention mask used in training (for masking out the future)
155
+ # it won't be used in inference, since kv cache is enabled
156
+ d: torch.Tensor = torch.cat(
157
+ [torch.full((pn * pn,), i) for i, pn in enumerate(self.patch_nums)]
158
+ ).view(1, self.L, 1)
159
+ dT = d.transpose(1, 2) # dT: 11L
160
+ lvl_1L = dT[:, 0].contiguous()
161
+ self.register_buffer("lvl_1L", lvl_1L)
162
+ attn_bias_for_masking = torch.where(d >= dT, 0.0, -torch.inf).reshape(
163
+ 1, 1, self.L, self.L
164
+ )
165
+ self.register_buffer(
166
+ "attn_bias_for_masking", attn_bias_for_masking.contiguous()
167
+ )
168
+
169
+ # 6. classifier head
170
+ self.head_nm = AdaLNBeforeHead(self.C, self.D, norm_layer=norm_layer)
171
+ self.head = nn.Linear(self.C, self.V)
172
+
173
+ # By defailt disable gradient checkpointing
174
+ self.use_gradient_checkpointing = False
175
+
176
+ def enable_gradient_checkpointing(self):
177
+ self.use_gradient_checkpointing = True
178
+
179
+ def disable_gradient_checkpointing(self):
180
+ self.use_gradient_checkpointing = False
181
+
182
+ def get_logits(
183
+ self,
184
+ h_or_h_and_residual: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
185
+ cond_BD: Optional[torch.Tensor],
186
+ ):
187
+ if not isinstance(h_or_h_and_residual, torch.Tensor):
188
+ h, resi = h_or_h_and_residual # fused_add_norm must be used
189
+ h = resi + self.blocks[-1].drop_path(h)
190
+ else: # fused_add_norm is not used
191
+ h = h_or_h_and_residual
192
+ return self.head(self.head_nm(h.float(), cond_BD).float()).float()
193
+
194
+ def parse_batch(self, batch, null_batch=None):
195
+ embedding_1 = batch["vit_l_14_text_embeddings"]
196
+ embedding_2 = batch["vit_bigg_14_text_embeddings"]
197
+ attention_mask = batch["vit_bigg_14_text_mask"]
198
+
199
+ batch_size = embedding_1.size(0)
200
+ prompt_embed = torch.concat([embedding_1, embedding_2], dim=-1)
201
+ prompt_lens = attention_mask.sum(dim=-1).to(int)
202
+ pooled_output = embedding_2[
203
+ torch.arange(batch_size, device=embedding_2.device), prompt_lens - 1
204
+ ]
205
+
206
+ attention_bias = attention_mask.clone()
207
+ attention_bias[attention_mask == 0] = -float("inf")
208
+ attention_bias[attention_mask == 1] = 0.0
209
+
210
+ if null_batch is not None:
211
+ B, L, hidden_dim = prompt_embed.shape
212
+ pooled_dim = pooled_output.shape[1]
213
+
214
+ null_context = null_batch['prompt_embed']
215
+ null_pooled_embed = null_batch['pooled_embed']
216
+ null_attn_bias = null_batch['attn_bias']
217
+
218
+ null_context = null_context[:, :L].expand(B, L, hidden_dim).to(prompt_embed.device)
219
+ null_pooled_embed = null_pooled_embed.expand(B, pooled_dim).to(pooled_output.device)
220
+ null_attn_bias = null_attn_bias[:, :L].expand(B, L).to(attention_bias.device)
221
+
222
+ prompt_embed = torch.cat([prompt_embed, null_context], dim=0)
223
+ pooled_output = torch.cat([pooled_output, null_pooled_embed], dim=0)
224
+ attention_bias = torch.cat([attention_bias, null_attn_bias], dim=0)
225
+
226
+ return (
227
+ prompt_embed.to(dist.get_device()),
228
+ pooled_output.to(dist.get_device()),
229
+ attention_bias.to(dist.get_device()),
230
+ )
231
+
232
+ def forward(
233
+ self,
234
+ x_BLCv_wo_first_l: torch.Tensor,
235
+ prompt_embeds: torch.Tensor,
236
+ pooled_prompt_embeds: torch.Tensor,
237
+ prompt_attn_bias: torch.Tensor,
238
+ ) -> torch.Tensor: # returns logits_BLV
239
+ """
240
+ :param x_BLCv_wo_first_l: teacher-forcing input (B, self.L - self.first_l, self.Cvae)
241
+ :param prompt_embeds: token-level text embeddings, the concatenation of
242
+ CLIP-ViT-L-14 and CLIP-ViT-bigG-14 features
243
+ :param pooled_prompt_embeds: pooled CLIP-ViT-bigG-14 embedding, used as the start token and AdaLN condition
244
+ :param prompt_attn_bias: additive attention bias from the text attention mask
245
+ (0 for valid tokens, -inf for padding)
246
+ :return: logits BLV, V is vocab_size
247
+ """
248
+ bg, ed = 0, self.L
249
+ B = x_BLCv_wo_first_l.shape[0]
250
+ with torch.amp.autocast('cuda', enabled=False):
251
+ pooled_prompt_embeds = self.text_pooler(pooled_prompt_embeds)
252
+
253
+ sos = cond_BD = pooled_prompt_embeds
254
+ sos = sos.unsqueeze(1).expand(B, self.first_l, -1) + self.pos_start.expand(
255
+ B, self.first_l, -1
256
+ )
257
+
258
+ x_BLC = torch.cat(
259
+ (sos, self.word_embed(x_BLCv_wo_first_l.float())), dim=1
260
+ )
261
+ x_BLC += self.lvl_embed(
262
+ self.lvl_1L[:, :ed].expand(B, -1)
263
+ ) # lvl: BLC; pos: 1LC
264
+ if not self.rope:
265
+ x_BLC += self.pos_1LC[:, :ed]
266
+ attn_bias = self.attn_bias_for_masking[:, :, :ed, :ed]
267
+ cond_BD_or_gss = self.shared_ada_lin(cond_BD)
268
+
269
+ # hack: get the dtype if mixed precision is used
270
+ temp = x_BLC.new_ones(8, 8)
271
+ main_type = torch.matmul(temp, temp).dtype
272
+
273
+ x_BLC = x_BLC.to(dtype=main_type)
274
+ cond_BD_or_gss = cond_BD_or_gss.to(dtype=main_type)
275
+ attn_bias = attn_bias.to(dtype=main_type)
276
+
277
+ for block in self.blocks:
278
+ if self.use_gradient_checkpointing:
279
+ x_BLC = torch.utils.checkpoint.checkpoint(
280
+ block,
281
+ x=x_BLC,
282
+ cond_BD=cond_BD_or_gss,
283
+ attn_bias=attn_bias,
284
+ context=prompt_embeds,
285
+ freqs_cis=self.freqs_cis,
286
+ context_attn_bias=prompt_attn_bias,
287
+ use_reentrant=False,
288
+ )
289
+ else:
290
+ x_BLC = block(
291
+ x=x_BLC,
292
+ cond_BD=cond_BD_or_gss,
293
+ attn_bias=attn_bias,
294
+ context=prompt_embeds,
295
+ freqs_cis=self.freqs_cis,
296
+ context_attn_bias=prompt_attn_bias,
297
+ )
298
+
299
+ with torch.amp.autocast('cuda', enabled=not self.training):
300
+ x_BLC = self.get_logits(x_BLC.float(), cond_BD)
301
+
302
+ return x_BLC # logits BLV, V is vocab_size
303
+
304
+ def init_weights(
305
+ self,
306
+ init_adaln=0.5,
307
+ init_adaln_gamma=1e-5,
308
+ init_head=0.02,
309
+ init_std=0.02,
310
+ ):
311
+ if init_std < 0:
312
+ init_std = (1 / self.C / 3) ** 0.5 # init_std < 0: automated
313
+
314
+ print(f"[init_weights] {type(self).__name__} with {init_std=:g}")
315
+ for m in self.modules():
316
+ with_weight = hasattr(m, "weight") and m.weight is not None
317
+ with_bias = hasattr(m, "bias") and m.bias is not None
318
+ if isinstance(m, nn.Linear):
319
+ nn.init.trunc_normal_(m.weight.data, std=init_std)
320
+ if with_bias:
321
+ m.bias.data.zero_()
322
+ elif isinstance(m, nn.Embedding):
323
+ nn.init.trunc_normal_(m.weight.data, std=init_std)
324
+ if m.padding_idx is not None:
325
+ m.weight.data[m.padding_idx].zero_()
326
+ elif isinstance(
327
+ m,
328
+ (
329
+ nn.LayerNorm,
330
+ nn.BatchNorm1d,
331
+ nn.BatchNorm2d,
332
+ nn.BatchNorm3d,
333
+ nn.SyncBatchNorm,
334
+ nn.GroupNorm,
335
+ nn.InstanceNorm1d,
336
+ nn.InstanceNorm2d,
337
+ nn.InstanceNorm3d,
338
+ ),
339
+ ):
340
+ if with_weight:
341
+ m.weight.data.fill_(1.0)
342
+ if with_bias:
343
+ m.bias.data.zero_()
344
+
345
+ if init_head >= 0:
346
+ if isinstance(self.head, nn.Linear):
347
+ self.head.weight.data.mul_(init_head)
348
+ self.head.bias.data.zero_()
349
+ elif isinstance(self.head, nn.Sequential):
350
+ self.head[-1].weight.data.mul_(init_head)
351
+ self.head[-1].bias.data.zero_()
352
+
353
+ if isinstance(self.head_nm, AdaLNBeforeHead):
354
+ self.head_nm.ada_lin[-1].weight.data.mul_(init_adaln)
355
+ if (
356
+ hasattr(self.head_nm.ada_lin[-1], "bias")
357
+ and self.head_nm.ada_lin[-1].bias is not None
358
+ ):
359
+ self.head_nm.ada_lin[-1].bias.data.zero_()
360
+
361
+ depth = len(self.blocks)
362
+ for block in self.blocks:
363
+ block.attn.proj.weight.data.div_(math.sqrt(2 * depth))
364
+ block.cross_attn.proj.weight.data.div_(math.sqrt(2 * depth))
365
+ if hasattr(block.ffn, "fc2"):
366
+ block.ffn.fc2.weight.data.div_(math.sqrt(2 * depth))
367
+
368
+ if hasattr(block, "ada_lin"):
369
+ block.ada_lin[-1].weight.data[2 * self.C :].mul_(init_adaln)
370
+ block.ada_lin[-1].weight.data[: 2 * self.C].mul_(init_adaln_gamma)
371
+ if (
372
+ hasattr(block.ada_lin[-1], "bias")
373
+ and block.ada_lin[-1].bias is not None
374
+ ):
375
+ block.ada_lin[-1].bias.data.zero_()
376
+ elif hasattr(block, "ada_gss"):
377
+ block.ada_gss.data[:, :, 2:].mul_(init_adaln)
378
+ block.ada_gss.data[:, :, :2].mul_(init_adaln_gamma)
379
+
380
+ def extra_repr(self):
381
+ return f"drop_path_rate={self.drop_path_rate:g}"
382
+
383
+
384
+ class TVARHF(VAR, PyTorchModelHubMixin):
385
+ # tags=["image-generation"]
386
+ def __init__(
387
+ self,
388
+ depth=30,
389
+ shared_aln=False,
390
+ attn_l2_norm=True,
391
+ rope=True,
392
+ rope_theta=10000,
393
+ rope_size=128,
394
+ use_swiglu_ffn=True,
395
+ ):
396
+ heads = depth
397
+ width = depth * 64
398
+ super().__init__(
399
+ depth=depth,
400
+ embed_dim=width,
401
+ num_heads=heads,
402
+ drop_rate=0.0,
403
+ attn_drop_rate=0.0,
404
+ norm_eps=1e-6,
405
+ shared_aln=shared_aln,
406
+ attn_l2_norm=attn_l2_norm,
407
+ patch_nums=(1, 2, 3, 4, 6, 9, 13, 18, 24, 32),
408
+ rope=rope,
409
+ rope_theta=rope_theta,
410
+ rope_size=rope_size,
411
+ use_swiglu_ffn=use_swiglu_ffn,
412
+ )
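One detail worth highlighting from the constructor above: the attention bias registered in step 5 is block-causal over scales, so every token may attend to all tokens at the same or coarser scales but never to finer ones. A standalone sketch of the same construction on a tiny pyramid:

import torch

patch_nums = (1, 2, 3)                      # tiny pyramid, just for illustration
L = sum(pn * pn for pn in patch_nums)       # 1 + 4 + 9 = 14 tokens
d = torch.cat([torch.full((pn * pn,), i) for i, pn in enumerate(patch_nums)]).view(1, L, 1)
dT = d.transpose(1, 2)
attn_bias = torch.where(d >= dT, 0.0, -torch.inf).reshape(1, 1, L, L)
print(attn_bias[0, 0])  # 0 where attention is allowed, -inf on tokens of finer (future) scales
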
models/vqvae.py ADDED
@@ -0,0 +1,184 @@
1
+ """
2
+ References:
3
+ - VectorQuantizer2: https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/modules/vqvae/quantize.py#L110
4
+ - GumbelQuantize: https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/modules/vqvae/quantize.py#L213
5
+ - VQVAE (VQModel): https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/models/autoencoder.py#L14
6
+ """
7
+
8
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from huggingface_hub import PyTorchModelHubMixin
13
+
14
+ from .basic_vae import Decoder, Encoder
15
+ from .quant import VectorQuantizer2
16
+
17
+
18
+
19
+ class VQVAE(nn.Module):
20
+ def __init__(
21
+ self,
22
+ vocab_size=4096,
23
+ z_channels=32,
24
+ ch=128,
25
+ dropout=0.0,
26
+ beta=0.25, # commitment loss weight
27
+ using_znorm=False, # whether to normalize when computing the nearest neighbors
28
+ quant_conv_ks=3, # quant conv kernel size
29
+ quant_resi=0.5, # 0.5 means \phi(x) = 0.5conv(x) + (1-0.5)x
30
+ share_quant_resi=4, # use 4 \phi layers for K scales: partially-shared \phi
31
+ default_qresi_counts=0, # if is 0: automatically set to len(v_patch_nums)
32
+ # number of patches for each scale, h_{1 to K} = w_{1 to K} = v_patch_nums[k]
33
+ v_patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),
34
+ test_mode=True,
35
+ ):
36
+ super().__init__()
37
+ self.test_mode = test_mode
38
+ self.V, self.Cvae = vocab_size, z_channels
39
+ # ddconfig is copied from https://github.com/CompVis/latent-diffusion/blob/e66308c7f2e64cb581c6d27ab6fbeb846828253b/models/first_stage_models/vq-f16/config.yaml
40
+ ddconfig = dict(
41
+ dropout=dropout,
42
+ ch=ch,
43
+ z_channels=z_channels,
44
+ in_channels=3,
45
+ ch_mult=(1, 1, 2, 2, 4),
46
+ num_res_blocks=2, # from vq-f16/config.yaml above
47
+ using_sa=True,
48
+ using_mid_sa=True, # from vq-f16/config.yaml above
49
+ # resamp_with_conv=True, # always True, removed.
50
+ )
51
+ ddconfig.pop("double_z", None) # only KL-VAE should use double_z=True
52
+ self.encoder = Encoder(double_z=False, **ddconfig)
53
+ self.decoder = Decoder(**ddconfig)
54
+
55
+ self.vocab_size = vocab_size
56
+ self.downsample = 2 ** (len(ddconfig["ch_mult"]) - 1)
57
+ self.quantize: VectorQuantizer2 = VectorQuantizer2(
58
+ vocab_size=vocab_size,
59
+ Cvae=self.Cvae,
60
+ using_znorm=using_znorm,
61
+ beta=beta,
62
+ default_qresi_counts=default_qresi_counts,
63
+ v_patch_nums=v_patch_nums,
64
+ quant_resi=quant_resi,
65
+ share_quant_resi=share_quant_resi,
66
+ )
67
+ self.quant_conv = torch.nn.Conv2d(
68
+ self.Cvae, self.Cvae, quant_conv_ks, stride=1, padding=quant_conv_ks // 2
69
+ )
70
+ self.post_quant_conv = torch.nn.Conv2d(
71
+ self.Cvae, self.Cvae, quant_conv_ks, stride=1, padding=quant_conv_ks // 2
72
+ )
73
+
74
+ if self.test_mode:
75
+ self.eval()
76
+ [p.requires_grad_(False) for p in self.parameters()]
77
+
78
+ # ===================== `forward` is only used in VAE training =====================
79
+ def forward(self, inp, ret_usages=False): # -> rec_B3HW, idx_N, loss
80
+ VectorQuantizer2.forward  # no-op reference, kept only as a jump-to-definition hint for the call below
81
+ f_hat, usages, vq_loss = self.quantize(
82
+ self.quant_conv(self.encoder(inp)), ret_usages=ret_usages
83
+ )
84
+ return self.decoder(self.post_quant_conv(f_hat)), usages, vq_loss
85
+
86
+ # ===================== `forward` is only used in VAE training =====================
87
+
88
+ def fhat_to_img(self, f_hat: torch.Tensor):
89
+ return self.decoder(self.post_quant_conv(f_hat)).clamp_(-1, 1)
90
+
91
+ def img_to_idxBl(
92
+ self,
93
+ inp_img_no_grad: torch.Tensor,
94
+ v_patch_nums: Optional[Sequence[Union[int, Tuple[int, int]]]] = None,
95
+ noise_std: Optional[float] = None,
96
+ ) -> List[torch.LongTensor]: # return List[Bl]
97
+ f = self.quant_conv(self.encoder(inp_img_no_grad))
98
+ return self.quantize.f_to_idxBl_or_fhat(
99
+ f, to_fhat=False, v_patch_nums=v_patch_nums, noise_std=noise_std,
100
+ )
101
+
102
+ def idxBl_to_img(
103
+ self, ms_idx_Bl: List[torch.Tensor], same_shape: bool, last_one=False
104
+ ) -> Union[List[torch.Tensor], torch.Tensor]:
105
+ B = ms_idx_Bl[0].shape[0]
106
+ ms_h_BChw = []
107
+ for idx_Bl in ms_idx_Bl:
108
+ l = idx_Bl.shape[1]
109
+ pn = round(l**0.5)
110
+ ms_h_BChw.append(
111
+ self.quantize.embedding(idx_Bl)
112
+ .transpose(1, 2)
113
+ .view(B, self.Cvae, pn, pn)
114
+ )
115
+ return self.embed_to_img(
116
+ ms_h_BChw=ms_h_BChw, all_to_max_scale=same_shape, last_one=last_one
117
+ )
118
+
119
+ def embed_to_img(
120
+ self, ms_h_BChw: List[torch.Tensor], all_to_max_scale: bool, last_one=False
121
+ ) -> Union[List[torch.Tensor], torch.Tensor]:
122
+ if last_one:
123
+ return self.decoder(
124
+ self.post_quant_conv(
125
+ self.quantize.embed_to_fhat(
126
+ ms_h_BChw, all_to_max_scale=all_to_max_scale, last_one=True
127
+ )
128
+ )
129
+ ).clamp_(-1, 1)
130
+ else:
131
+ return [
132
+ self.decoder(self.post_quant_conv(f_hat)).clamp_(-1, 1)
133
+ for f_hat in self.quantize.embed_to_fhat(
134
+ ms_h_BChw, all_to_max_scale=all_to_max_scale, last_one=False
135
+ )
136
+ ]
137
+
138
+ def img_to_reconstructed_img(
139
+ self,
140
+ x,
141
+ v_patch_nums: Optional[Sequence[Union[int, Tuple[int, int]]]] = None,
142
+ last_one=False,
143
+ ) -> List[torch.Tensor]:
144
+ f = self.quant_conv(self.encoder(x))
145
+ ls_f_hat_BChw = self.quantize.f_to_idxBl_or_fhat(
146
+ f, to_fhat=True, v_patch_nums=v_patch_nums
147
+ )
148
+ if last_one:
149
+ return self.decoder(self.post_quant_conv(ls_f_hat_BChw[-1])).clamp_(-1, 1)
150
+ else:
151
+ return [
152
+ self.decoder(self.post_quant_conv(f_hat)).clamp_(-1, 1)
153
+ for f_hat in ls_f_hat_BChw
154
+ ]
155
+
156
+ def load_state_dict(self, state_dict: Dict[str, Any], strict=True, assign=False):
157
+ if (
158
+ "quantize.ema_vocab_hit_SV" in state_dict
159
+ and state_dict["quantize.ema_vocab_hit_SV"].shape[0]
160
+ != self.quantize.ema_vocab_hit_SV.shape[0]
161
+ ):
162
+ state_dict["quantize.ema_vocab_hit_SV"] = self.quantize.ema_vocab_hit_SV
163
+ return super().load_state_dict(
164
+ state_dict=state_dict, strict=strict, assign=assign
165
+ )
166
+
167
+ class VQVAEHF(VQVAE, PyTorchModelHubMixin):
168
+ def __init__(
169
+ self,
170
+ vocab_size=4096,
171
+ z_channels=32,
172
+ ch=160,
173
+ test_mode=True,
174
+ share_quant_resi=4,
175
+ v_patch_nums=(1, 2, 3, 4, 6, 9, 13, 18, 24, 32),
176
+ ):
177
+ super().__init__(
178
+ vocab_size=vocab_size,
179
+ z_channels=z_channels,
180
+ ch=ch,
181
+ test_mode=test_mode,
182
+ share_quant_resi=share_quant_resi,
183
+ v_patch_nums=v_patch_nums,
184
+ )
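Finally, a shape-level sketch of the tokenizer roundtrip implemented by VQVAE above — randomly initialized weights, no checkpoint, assuming the repository root is on PYTHONPATH and that models/basic_vae.py provides the Encoder/Decoder imported above:

import torch
from models.vqvae import VQVAE

vae = VQVAE(test_mode=True)                    # default config: 16x downsampling, 10 scales up to 16x16
img = torch.rand(1, 3, 256, 256) * 2 - 1       # images are expected in [-1, 1]
idx_Bl = vae.img_to_idxBl(img)                 # list of (B, pn*pn) LongTensors, coarse to fine
rec = vae.idxBl_to_img(idx_Bl, same_shape=True, last_one=True)  # (1, 3, 256, 256), clamped to [-1, 1]
print([t.shape for t in idx_Bl], rec.shape)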