Spaces:

xinjjj
/

ImgRoboAssetGen

Running on Zero

App Files Files Community

xinjie.wang commited on Apr 11

Commit

9b53de6

1 Parent(s): 7dfbfa6

update

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +2 -2
asset3d_gen/models/segment.py +1 -1
asset3d_gen/utils/gpt_clients.py +3 -1
common.py +5 -3
requirements.txt +1 -0
thirdparty/TRELLIS/trellis/__init__.py +6 -0
thirdparty/TRELLIS/trellis/models/__init__.py +70 -0
thirdparty/TRELLIS/trellis/models/sparse_structure_flow.py +200 -0
thirdparty/TRELLIS/trellis/models/sparse_structure_vae.py +306 -0
thirdparty/TRELLIS/trellis/models/structured_latent_flow.py +262 -0
thirdparty/TRELLIS/trellis/models/structured_latent_vae/__init__.py +4 -0
thirdparty/TRELLIS/trellis/models/structured_latent_vae/base.py +117 -0
thirdparty/TRELLIS/trellis/models/structured_latent_vae/decoder_gs.py +122 -0
thirdparty/TRELLIS/trellis/models/structured_latent_vae/decoder_mesh.py +167 -0
thirdparty/TRELLIS/trellis/models/structured_latent_vae/decoder_rf.py +104 -0
thirdparty/TRELLIS/trellis/models/structured_latent_vae/encoder.py +72 -0
thirdparty/TRELLIS/trellis/modules/attention/__init__.py +36 -0
thirdparty/TRELLIS/trellis/modules/attention/full_attn.py +140 -0
thirdparty/TRELLIS/trellis/modules/attention/modules.py +146 -0
thirdparty/TRELLIS/trellis/modules/norm.py +25 -0
thirdparty/TRELLIS/trellis/modules/sparse/__init__.py +102 -0
thirdparty/TRELLIS/trellis/modules/sparse/attention/__init__.py +4 -0
thirdparty/TRELLIS/trellis/modules/sparse/attention/full_attn.py +215 -0
thirdparty/TRELLIS/trellis/modules/sparse/attention/modules.py +139 -0
thirdparty/TRELLIS/trellis/modules/sparse/attention/serialized_attn.py +193 -0
thirdparty/TRELLIS/trellis/modules/sparse/attention/windowed_attn.py +135 -0
thirdparty/TRELLIS/trellis/modules/sparse/basic.py +459 -0
thirdparty/TRELLIS/trellis/modules/sparse/conv/__init__.py +21 -0
thirdparty/TRELLIS/trellis/modules/sparse/conv/conv_spconv.py +80 -0
thirdparty/TRELLIS/trellis/modules/sparse/conv/conv_torchsparse.py +38 -0
thirdparty/TRELLIS/trellis/modules/sparse/linear.py +15 -0
thirdparty/TRELLIS/trellis/modules/sparse/nonlinearity.py +35 -0
thirdparty/TRELLIS/trellis/modules/sparse/norm.py +58 -0
thirdparty/TRELLIS/trellis/modules/sparse/spatial.py +110 -0
thirdparty/TRELLIS/trellis/modules/sparse/transformer/__init__.py +2 -0
thirdparty/TRELLIS/trellis/modules/sparse/transformer/blocks.py +151 -0
thirdparty/TRELLIS/trellis/modules/sparse/transformer/modulated.py +166 -0
thirdparty/TRELLIS/trellis/modules/spatial.py +48 -0
thirdparty/TRELLIS/trellis/modules/transformer/__init__.py +2 -0
thirdparty/TRELLIS/trellis/modules/transformer/blocks.py +182 -0
thirdparty/TRELLIS/trellis/modules/transformer/modulated.py +157 -0
thirdparty/TRELLIS/trellis/modules/utils.py +54 -0
thirdparty/TRELLIS/trellis/pipelines/__init__.py +24 -0
thirdparty/TRELLIS/trellis/pipelines/base.py +66 -0
thirdparty/TRELLIS/trellis/pipelines/samplers/__init__.py +2 -0
thirdparty/TRELLIS/trellis/pipelines/samplers/base.py +20 -0
thirdparty/TRELLIS/trellis/pipelines/samplers/classifier_free_guidance_mixin.py +12 -0
thirdparty/TRELLIS/trellis/pipelines/samplers/flow_euler.py +199 -0
thirdparty/TRELLIS/trellis/pipelines/samplers/guidance_interval_mixin.py +15 -0
thirdparty/TRELLIS/trellis/pipelines/trellis_image_to_3d.py +376 -0

app.py CHANGED Viewed

@@ -17,7 +17,7 @@ from common import (
     select_point,
 )
 from gradio.themes import Default
-from gradio.themes.utils.colors import slate, sky, blue, indigo
 from gradio_litmodel3d import LitModel3D
 from asset3d_gen.models.delight import DelightingModel
 from asset3d_gen.models.segment import RembgRemover, SAMPredictor
@@ -64,7 +64,7 @@ def end_session(req: gr.Request) -> None:
 with gr.Blocks(
-    delete_cache=(43200, 43200), theme=Default(primary_hue=indigo)
 ) as demo:
     gr.Markdown(
         f"""

     select_point,
 )
 from gradio.themes import Default
+from gradio.themes.utils.colors import slate
 from gradio_litmodel3d import LitModel3D
 from asset3d_gen.models.delight import DelightingModel
 from asset3d_gen.models.segment import RembgRemover, SAMPredictor
 with gr.Blocks(
+    delete_cache=(43200, 43200), theme=Default(primary_hue=slate)
 ) as demo:
     gr.Markdown(
         f"""

asset3d_gen/models/segment.py CHANGED Viewed

@@ -162,7 +162,7 @@ class SAMPredictor(object):
         checkpoint: str = None,
         model_type: str = "vit_h",
         binary_thresh: float = 0.1,
-        device: str = "cuda"
     ):
         self.device = device
         self.model_type = model_type

         checkpoint: str = None,
         model_type: str = "vit_h",
         binary_thresh: float = 0.1,
+        device: str = "cuda",
     ):
         self.device = device
         self.model_type = model_type

asset3d_gen/utils/gpt_clients.py CHANGED Viewed

@@ -186,5 +186,7 @@ if __name__ == "__main__":
         print(response)
         # test2: text prompt
-        response = GPT_CLIENT.query(text_prompt="What is the capital of China?")
         print(response)

         print(response)
         # test2: text prompt
+        response = GPT_CLIENT.query(
+            text_prompt="What is the capital of China?"
+        )
         print(response)

common.py CHANGED Viewed

@@ -1,13 +1,14 @@
-import spaces
 import gc
 import logging
 import os
 import sys
 from glob import glob
 from typing import Union
 import cv2
 import gradio as gr
 import numpy as np
 import torch
 import trimesh
 from easydict import EasyDict as edict
@@ -44,8 +45,9 @@ from asset3d_gen.validators.quality_checkers import (
 )
 from asset3d_gen.validators.urdf_convertor import URDFGenerator, zip_files
-current_directory = os.getcwd()
-sys.path.insert(0, current_directory)
 from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
 from thirdparty.TRELLIS.trellis.renderers.mesh_renderer import MeshRenderer
 from thirdparty.TRELLIS.trellis.representations import (

 import gc
 import logging
 import os
 import sys
 from glob import glob
 from typing import Union
 import cv2
 import gradio as gr
 import numpy as np
+import spaces
 import torch
 import trimesh
 from easydict import EasyDict as edict
 )
 from asset3d_gen.validators.urdf_convertor import URDFGenerator, zip_files
+current_file_path = os.path.abspath(__file__)
+current_dir = os.path.dirname(current_file_path)
+sys.path.append(os.path.join(current_dir, "../.."))
 from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
 from thirdparty.TRELLIS.trellis.renderers.mesh_renderer import MeshRenderer
 from thirdparty.TRELLIS.trellis.representations import (

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.1.0
 torchaudio==2.1.0
 torchvision==0.16.0

 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.1.0
 torchaudio==2.1.0
 torchvision==0.16.0

thirdparty/TRELLIS/trellis/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from . import models
+from . import modules
+from . import pipelines
+from . import renderers
+from . import representations
+from . import utils

thirdparty/TRELLIS/trellis/models/__init__.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import importlib
+__attributes = {
+    'SparseStructureEncoder': 'sparse_structure_vae',
+    'SparseStructureDecoder': 'sparse_structure_vae',
+    'SparseStructureFlowModel': 'sparse_structure_flow',
+    'SLatEncoder': 'structured_latent_vae',
+    'SLatGaussianDecoder': 'structured_latent_vae',
+    'SLatRadianceFieldDecoder': 'structured_latent_vae',
+    'SLatMeshDecoder': 'structured_latent_vae',
+    'SLatFlowModel': 'structured_latent_flow',
+}
+__submodules = []
+__all__ = list(__attributes.keys()) + __submodules
+def __getattr__(name):
+    if name not in globals():
+        if name in __attributes:
+            module_name = __attributes[name]
+            module = importlib.import_module(f".{module_name}", __name__)
+            globals()[name] = getattr(module, name)
+        elif name in __submodules:
+            module = importlib.import_module(f".{name}", __name__)
+            globals()[name] = module
+        else:
+            raise AttributeError(f"module {__name__} has no attribute {name}")
+    return globals()[name]
+def from_pretrained(path: str, **kwargs):
+    """
+    Load a model from a pretrained checkpoint.
+    Args:
+        path: The path to the checkpoint. Can be either local path or a Hugging Face model name.
+              NOTE: config file and model file should take the name f'{path}.json' and f'{path}.safetensors' respectively.
+        **kwargs: Additional arguments for the model constructor.
+    """
+    import os
+    import json
+    from safetensors.torch import load_file
+    is_local = os.path.exists(f"{path}.json") and os.path.exists(f"{path}.safetensors")
+    if is_local:
+        config_file = f"{path}.json"
+        model_file = f"{path}.safetensors"
+    else:
+        from huggingface_hub import hf_hub_download
+        path_parts = path.split('/')
+        repo_id = f'{path_parts[0]}/{path_parts[1]}'
+        model_name = '/'.join(path_parts[2:])
+        config_file = hf_hub_download(repo_id, f"{model_name}.json")
+        model_file = hf_hub_download(repo_id, f"{model_name}.safetensors")
+    with open(config_file, 'r') as f:
+        config = json.load(f)
+    model = __getattr__(config['name'])(**config['args'], **kwargs)
+    model.load_state_dict(load_file(model_file))
+    return model
+# For Pylance
+if __name__ == '__main__':
+    from .sparse_structure_vae import SparseStructureEncoder, SparseStructureDecoder
+    from .sparse_structure_flow import SparseStructureFlowModel
+    from .structured_latent_vae import SLatEncoder, SLatGaussianDecoder, SLatRadianceFieldDecoder, SLatMeshDecoder
+    from .structured_latent_flow import SLatFlowModel

thirdparty/TRELLIS/trellis/models/sparse_structure_flow.py ADDED Viewed

	@@ -0,0 +1,200 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from ..modules.utils import convert_module_to_f16, convert_module_to_f32
+from ..modules.transformer import AbsolutePositionEmbedder, ModulatedTransformerCrossBlock
+from ..modules.spatial import patchify, unpatchify
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+        Args:
+            t: a 1-D Tensor of N indices, one per batch element.
+                These may be fractional.
+            dim: the dimension of the output.
+            max_period: controls the minimum frequency of the embeddings.
+        Returns:
+            an (N, D) Tensor of positional embeddings.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        freqs = torch.exp(
+            -np.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        ).to(device=t.device)
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class SparseStructureFlowModel(nn.Module):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        model_channels: int,
+        cond_channels: int,
+        out_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        patch_size: int = 2,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        share_mod: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+    ):
+        super().__init__()
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.cond_channels = cond_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.num_heads = num_heads or model_channels // num_head_channels
+        self.mlp_ratio = mlp_ratio
+        self.patch_size = patch_size
+        self.pe_mode = pe_mode
+        self.use_fp16 = use_fp16
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.qk_rms_norm = qk_rms_norm
+        self.qk_rms_norm_cross = qk_rms_norm_cross
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.t_embedder = TimestepEmbedder(model_channels)
+        if share_mod:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(model_channels, 6 * model_channels, bias=True)
+            )
+        if pe_mode == "ape":
+            pos_embedder = AbsolutePositionEmbedder(model_channels, 3)
+            coords = torch.meshgrid(*[torch.arange(res, device=self.device) for res in [resolution // patch_size] * 3], indexing='ij')
+            coords = torch.stack(coords, dim=-1).reshape(-1, 3)
+            pos_emb = pos_embedder(coords)
+            self.register_buffer("pos_emb", pos_emb)
+        self.input_layer = nn.Linear(in_channels * patch_size**3, model_channels)
+        self.blocks = nn.ModuleList([
+            ModulatedTransformerCrossBlock(
+                model_channels,
+                cond_channels,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                attn_mode='full',
+                use_checkpoint=self.use_checkpoint,
+                use_rope=(pe_mode == "rope"),
+                share_mod=share_mod,
+                qk_rms_norm=self.qk_rms_norm,
+                qk_rms_norm_cross=self.qk_rms_norm_cross,
+            )
+            for _ in range(num_blocks)
+        ])
+        self.out_layer = nn.Linear(model_channels, out_channels * patch_size**3)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        self.blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.blocks.apply(convert_module_to_f32)
+    def initialize_weights(self) -> None:
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers in DiT blocks:
+        if self.share_mod:
+            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
+        else:
+            for block in self.blocks:
+                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def forward(self, x: torch.Tensor, t: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
+        assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
+                f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"
+        h = patchify(x, self.patch_size)
+        h = h.view(*h.shape[:2], -1).permute(0, 2, 1).contiguous()
+        h = self.input_layer(h)
+        h = h + self.pos_emb[None]
+        t_emb = self.t_embedder(t)
+        if self.share_mod:
+            t_emb = self.adaLN_modulation(t_emb)
+        t_emb = t_emb.type(self.dtype)
+        h = h.type(self.dtype)
+        cond = cond.type(self.dtype)
+        for block in self.blocks:
+            h = block(h, t_emb, cond)
+        h = h.type(x.dtype)
+        h = F.layer_norm(h, h.shape[-1:])
+        h = self.out_layer(h)
+        h = h.permute(0, 2, 1).view(h.shape[0], h.shape[2], *[self.resolution // self.patch_size] * 3)
+        h = unpatchify(h, self.patch_size).contiguous()
+        return h

thirdparty/TRELLIS/trellis/models/sparse_structure_vae.py ADDED Viewed

	@@ -0,0 +1,306 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ..modules.norm import GroupNorm32, ChannelLayerNorm32
+from ..modules.spatial import pixel_shuffle_3d
+from ..modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
+def norm_layer(norm_type: str, *args, **kwargs) -> nn.Module:
+    """
+    Return a normalization layer.
+    """
+    if norm_type == "group":
+        return GroupNorm32(32, *args, **kwargs)
+    elif norm_type == "layer":
+        return ChannelLayerNorm32(*args, **kwargs)
+    else:
+        raise ValueError(f"Invalid norm type {norm_type}")
+class ResBlock3d(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        out_channels: Optional[int] = None,
+        norm_type: Literal["group", "layer"] = "layer",
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.norm1 = norm_layer(norm_type, channels)
+        self.norm2 = norm_layer(norm_type, self.out_channels)
+        self.conv1 = nn.Conv3d(channels, self.out_channels, 3, padding=1)
+        self.conv2 = zero_module(nn.Conv3d(self.out_channels, self.out_channels, 3, padding=1))
+        self.skip_connection = nn.Conv3d(channels, self.out_channels, 1) if channels != self.out_channels else nn.Identity()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = self.norm1(x)
+        h = F.silu(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = F.silu(h)
+        h = self.conv2(h)
+        h = h + self.skip_connection(x)
+        return h
+class DownsampleBlock3d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        mode: Literal["conv", "avgpool"] = "conv",
+    ):
+        assert mode in ["conv", "avgpool"], f"Invalid mode {mode}"
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if mode == "conv":
+            self.conv = nn.Conv3d(in_channels, out_channels, 2, stride=2)
+        elif mode == "avgpool":
+            assert in_channels == out_channels, "Pooling mode requires in_channels to be equal to out_channels"
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, "conv"):
+            return self.conv(x)
+        else:
+            return F.avg_pool3d(x, 2)
+class UpsampleBlock3d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        mode: Literal["conv", "nearest"] = "conv",
+    ):
+        assert mode in ["conv", "nearest"], f"Invalid mode {mode}"
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if mode == "conv":
+            self.conv = nn.Conv3d(in_channels, out_channels*8, 3, padding=1)
+        elif mode == "nearest":
+            assert in_channels == out_channels, "Nearest mode requires in_channels to be equal to out_channels"
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, "conv"):
+            x = self.conv(x)
+            return pixel_shuffle_3d(x, 2)
+        else:
+            return F.interpolate(x, scale_factor=2, mode="nearest")
+class SparseStructureEncoder(nn.Module):
+    """
+    Encoder for Sparse Structure (\mathcal{E}_S in the paper Sec. 3.3).
+    Args:
+        in_channels (int): Channels of the input.
+        latent_channels (int): Channels of the latent representation.
+        num_res_blocks (int): Number of residual blocks at each resolution.
+        channels (List[int]): Channels of the encoder blocks.
+        num_res_blocks_middle (int): Number of residual blocks in the middle.
+        norm_type (Literal["group", "layer"]): Type of normalization layer.
+        use_fp16 (bool): Whether to use FP16.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        latent_channels: int,
+        num_res_blocks: int,
+        channels: List[int],
+        num_res_blocks_middle: int = 2,
+        norm_type: Literal["group", "layer"] = "layer",
+        use_fp16: bool = False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.latent_channels = latent_channels
+        self.num_res_blocks = num_res_blocks
+        self.channels = channels
+        self.num_res_blocks_middle = num_res_blocks_middle
+        self.norm_type = norm_type
+        self.use_fp16 = use_fp16
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.input_layer = nn.Conv3d(in_channels, channels[0], 3, padding=1)
+        self.blocks = nn.ModuleList([])
+        for i, ch in enumerate(channels):
+            self.blocks.extend([
+                ResBlock3d(ch, ch)
+                for _ in range(num_res_blocks)
+            ])
+            if i < len(channels) - 1:
+                self.blocks.append(
+                    DownsampleBlock3d(ch, channels[i+1])
+                )
+        self.middle_block = nn.Sequential(*[
+            ResBlock3d(channels[-1], channels[-1])
+            for _ in range(num_res_blocks_middle)
+        ])
+        self.out_layer = nn.Sequential(
+            norm_layer(norm_type, channels[-1]),
+            nn.SiLU(),
+            nn.Conv3d(channels[-1], latent_channels*2, 3, padding=1)
+        )
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        self.use_fp16 = True
+        self.dtype = torch.float16
+        self.blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.use_fp16 = False
+        self.dtype = torch.float32
+        self.blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x: torch.Tensor, sample_posterior: bool = False, return_raw: bool = False) -> torch.Tensor:
+        h = self.input_layer(x)
+        h = h.type(self.dtype)
+        for block in self.blocks:
+            h = block(h)
+        h = self.middle_block(h)
+        h = h.type(x.dtype)
+        h = self.out_layer(h)
+        mean, logvar = h.chunk(2, dim=1)
+        if sample_posterior:
+            std = torch.exp(0.5 * logvar)
+            z = mean + std * torch.randn_like(std)
+        else:
+            z = mean
+        if return_raw:
+            return z, mean, logvar
+        return z
+class SparseStructureDecoder(nn.Module):
+    """
+    Decoder for Sparse Structure (\mathcal{D}_S in the paper Sec. 3.3).
+    Args:
+        out_channels (int): Channels of the output.
+        latent_channels (int): Channels of the latent representation.
+        num_res_blocks (int): Number of residual blocks at each resolution.
+        channels (List[int]): Channels of the decoder blocks.
+        num_res_blocks_middle (int): Number of residual blocks in the middle.
+        norm_type (Literal["group", "layer"]): Type of normalization layer.
+        use_fp16 (bool): Whether to use FP16.
+    """
+    def __init__(
+        self,
+        out_channels: int,
+        latent_channels: int,
+        num_res_blocks: int,
+        channels: List[int],
+        num_res_blocks_middle: int = 2,
+        norm_type: Literal["group", "layer"] = "layer",
+        use_fp16: bool = False,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.latent_channels = latent_channels
+        self.num_res_blocks = num_res_blocks
+        self.channels = channels
+        self.num_res_blocks_middle = num_res_blocks_middle
+        self.norm_type = norm_type
+        self.use_fp16 = use_fp16
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.input_layer = nn.Conv3d(latent_channels, channels[0], 3, padding=1)
+        self.middle_block = nn.Sequential(*[
+            ResBlock3d(channels[0], channels[0])
+            for _ in range(num_res_blocks_middle)
+        ])
+        self.blocks = nn.ModuleList([])
+        for i, ch in enumerate(channels):
+            self.blocks.extend([
+                ResBlock3d(ch, ch)
+                for _ in range(num_res_blocks)
+            ])
+            if i < len(channels) - 1:
+                self.blocks.append(
+                    UpsampleBlock3d(ch, channels[i+1])
+                )
+        self.out_layer = nn.Sequential(
+            norm_layer(norm_type, channels[-1]),
+            nn.SiLU(),
+            nn.Conv3d(channels[-1], out_channels, 3, padding=1)
+        )
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        self.use_fp16 = True
+        self.dtype = torch.float16
+        self.blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.use_fp16 = False
+        self.dtype = torch.float32
+        self.blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = self.input_layer(x)
+        h = h.type(self.dtype)
+        h = self.middle_block(h)
+        for block in self.blocks:
+            h = block(h)
+        h = h.type(x.dtype)
+        h = self.out_layer(h)
+        return h

thirdparty/TRELLIS/trellis/models/structured_latent_flow.py ADDED Viewed

	@@ -0,0 +1,262 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from ..modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
+from ..modules.transformer import AbsolutePositionEmbedder
+from ..modules.norm import LayerNorm32
+from ..modules import sparse as sp
+from ..modules.sparse.transformer import ModulatedSparseTransformerCrossBlock
+from .sparse_structure_flow import TimestepEmbedder
+class SparseResBlock3d(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        emb_channels: int,
+        out_channels: Optional[int] = None,
+        downsample: bool = False,
+        upsample: bool = False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.emb_channels = emb_channels
+        self.out_channels = out_channels or channels
+        self.downsample = downsample
+        self.upsample = upsample
+        assert not (downsample and upsample), "Cannot downsample and upsample at the same time"
+        self.norm1 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
+        self.norm2 = LayerNorm32(self.out_channels, elementwise_affine=False, eps=1e-6)
+        self.conv1 = sp.SparseConv3d(channels, self.out_channels, 3)
+        self.conv2 = zero_module(sp.SparseConv3d(self.out_channels, self.out_channels, 3))
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(emb_channels, 2 * self.out_channels, bias=True),
+        )
+        self.skip_connection = sp.SparseLinear(channels, self.out_channels) if channels != self.out_channels else nn.Identity()
+        self.updown = None
+        if self.downsample:
+            self.updown = sp.SparseDownsample(2)
+        elif self.upsample:
+            self.updown = sp.SparseUpsample(2)
+    def _updown(self, x: sp.SparseTensor) -> sp.SparseTensor:
+        if self.updown is not None:
+            x = self.updown(x)
+        return x
+    def forward(self, x: sp.SparseTensor, emb: torch.Tensor) -> sp.SparseTensor:
+        emb_out = self.emb_layers(emb).type(x.dtype)
+        scale, shift = torch.chunk(emb_out, 2, dim=1)
+        x = self._updown(x)
+        h = x.replace(self.norm1(x.feats))
+        h = h.replace(F.silu(h.feats))
+        h = self.conv1(h)
+        h = h.replace(self.norm2(h.feats)) * (1 + scale) + shift
+        h = h.replace(F.silu(h.feats))
+        h = self.conv2(h)
+        h = h + self.skip_connection(x)
+        return h
+class SLatFlowModel(nn.Module):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        model_channels: int,
+        cond_channels: int,
+        out_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        patch_size: int = 2,
+        num_io_res_blocks: int = 2,
+        io_block_channels: List[int] = None,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        use_skip_connection: bool = True,
+        share_mod: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+    ):
+        super().__init__()
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.cond_channels = cond_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.num_heads = num_heads or model_channels // num_head_channels
+        self.mlp_ratio = mlp_ratio
+        self.patch_size = patch_size
+        self.num_io_res_blocks = num_io_res_blocks
+        self.io_block_channels = io_block_channels
+        self.pe_mode = pe_mode
+        self.use_fp16 = use_fp16
+        self.use_checkpoint = use_checkpoint
+        self.use_skip_connection = use_skip_connection
+        self.share_mod = share_mod
+        self.qk_rms_norm = qk_rms_norm
+        self.qk_rms_norm_cross = qk_rms_norm_cross
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        assert int(np.log2(patch_size)) == np.log2(patch_size), "Patch size must be a power of 2"
+        assert np.log2(patch_size) == len(io_block_channels), "Number of IO ResBlocks must match the number of stages"
+        self.t_embedder = TimestepEmbedder(model_channels)
+        if share_mod:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(model_channels, 6 * model_channels, bias=True)
+            )
+        if pe_mode == "ape":
+            self.pos_embedder = AbsolutePositionEmbedder(model_channels)
+        self.input_layer = sp.SparseLinear(in_channels, io_block_channels[0])
+        self.input_blocks = nn.ModuleList([])
+        for chs, next_chs in zip(io_block_channels, io_block_channels[1:] + [model_channels]):
+            self.input_blocks.extend([
+                SparseResBlock3d(
+                    chs,
+                    model_channels,
+                    out_channels=chs,
+                )
+                for _ in range(num_io_res_blocks-1)
+            ])
+            self.input_blocks.append(
+                SparseResBlock3d(
+                    chs,
+                    model_channels,
+                    out_channels=next_chs,
+                    downsample=True,
+                )
+            )
+        self.blocks = nn.ModuleList([
+            ModulatedSparseTransformerCrossBlock(
+                model_channels,
+                cond_channels,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                attn_mode='full',
+                use_checkpoint=self.use_checkpoint,
+                use_rope=(pe_mode == "rope"),
+                share_mod=self.share_mod,
+                qk_rms_norm=self.qk_rms_norm,
+                qk_rms_norm_cross=self.qk_rms_norm_cross,
+            )
+            for _ in range(num_blocks)
+        ])
+        self.out_blocks = nn.ModuleList([])
+        for chs, prev_chs in zip(reversed(io_block_channels), [model_channels] + list(reversed(io_block_channels[1:]))):
+            self.out_blocks.append(
+                SparseResBlock3d(
+                    prev_chs * 2 if self.use_skip_connection else prev_chs,
+                    model_channels,
+                    out_channels=chs,
+                    upsample=True,
+                )
+            )
+            self.out_blocks.extend([
+                SparseResBlock3d(
+                    chs * 2 if self.use_skip_connection else chs,
+                    model_channels,
+                    out_channels=chs,
+                )
+                for _ in range(num_io_res_blocks-1)
+            ])
+        self.out_layer = sp.SparseLinear(io_block_channels[0], out_channels)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        self.input_blocks.apply(convert_module_to_f16)
+        self.blocks.apply(convert_module_to_f16)
+        self.out_blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.input_blocks.apply(convert_module_to_f32)
+        self.blocks.apply(convert_module_to_f32)
+        self.out_blocks.apply(convert_module_to_f32)
+    def initialize_weights(self) -> None:
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers in DiT blocks:
+        if self.share_mod:
+            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
+        else:
+            for block in self.blocks:
+                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def forward(self, x: sp.SparseTensor, t: torch.Tensor, cond: torch.Tensor) -> sp.SparseTensor:
+        h = self.input_layer(x).type(self.dtype)
+        t_emb = self.t_embedder(t)
+        if self.share_mod:
+            t_emb = self.adaLN_modulation(t_emb)
+        t_emb = t_emb.type(self.dtype)
+        cond = cond.type(self.dtype)
+        skips = []
+        # pack with input blocks
+        for block in self.input_blocks:
+            h = block(h, t_emb)
+            skips.append(h.feats)
+        if self.pe_mode == "ape":
+            h = h + self.pos_embedder(h.coords[:, 1:]).type(self.dtype)
+        for block in self.blocks:
+            h = block(h, t_emb, cond)
+        # unpack with output blocks
+        for block, skip in zip(self.out_blocks, reversed(skips)):
+            if self.use_skip_connection:
+                h = block(h.replace(torch.cat([h.feats, skip], dim=1)), t_emb)
+            else:
+                h = block(h, t_emb)
+        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
+        h = self.out_layer(h.type(x.dtype))
+        return h

thirdparty/TRELLIS/trellis/models/structured_latent_vae/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .encoder import SLatEncoder
+from .decoder_gs import SLatGaussianDecoder
+from .decoder_rf import SLatRadianceFieldDecoder
+from .decoder_mesh import SLatMeshDecoder

thirdparty/TRELLIS/trellis/models/structured_latent_vae/base.py ADDED Viewed

	@@ -0,0 +1,117 @@

+from typing import *
+import torch
+import torch.nn as nn
+from ...modules.utils import convert_module_to_f16, convert_module_to_f32
+from ...modules import sparse as sp
+from ...modules.transformer import AbsolutePositionEmbedder
+from ...modules.sparse.transformer import SparseTransformerBlock
+def block_attn_config(self):
+    """
+    Return the attention configuration of the model.
+    """
+    for i in range(self.num_blocks):
+        if self.attn_mode == "shift_window":
+            yield "serialized", self.window_size, 0, (16 * (i % 2),) * 3, sp.SerializeMode.Z_ORDER
+        elif self.attn_mode == "shift_sequence":
+            yield "serialized", self.window_size, self.window_size // 2 * (i % 2), (0, 0, 0), sp.SerializeMode.Z_ORDER
+        elif self.attn_mode == "shift_order":
+            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[i % 4]
+        elif self.attn_mode == "full":
+            yield "full", None, None, None, None
+        elif self.attn_mode == "swin":
+            yield "windowed", self.window_size, None, self.window_size // 2 * (i % 2), None
+class SparseTransformerBase(nn.Module):
+    """
+    Sparse Transformer without output layers.
+    Serve as the base class for encoder and decoder.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        model_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
+        window_size: Optional[int] = None,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.num_blocks = num_blocks
+        self.window_size = window_size
+        self.num_heads = num_heads or model_channels // num_head_channels
+        self.mlp_ratio = mlp_ratio
+        self.attn_mode = attn_mode
+        self.pe_mode = pe_mode
+        self.use_fp16 = use_fp16
+        self.use_checkpoint = use_checkpoint
+        self.qk_rms_norm = qk_rms_norm
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        if pe_mode == "ape":
+            self.pos_embedder = AbsolutePositionEmbedder(model_channels)
+        self.input_layer = sp.SparseLinear(in_channels, model_channels)
+        self.blocks = nn.ModuleList([
+            SparseTransformerBlock(
+                model_channels,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                attn_mode=attn_mode,
+                window_size=window_size,
+                shift_sequence=shift_sequence,
+                shift_window=shift_window,
+                serialize_mode=serialize_mode,
+                use_checkpoint=self.use_checkpoint,
+                use_rope=(pe_mode == "rope"),
+                qk_rms_norm=self.qk_rms_norm,
+            )
+            for attn_mode, window_size, shift_sequence, shift_window, serialize_mode in block_attn_config(self)
+        ])
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        self.blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.blocks.apply(convert_module_to_f32)
+    def initialize_weights(self) -> None:
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
+        h = self.input_layer(x)
+        if self.pe_mode == "ape":
+            h = h + self.pos_embedder(x.coords[:, 1:])
+        h = h.type(self.dtype)
+        for block in self.blocks:
+            h = block(h)
+        return h

thirdparty/TRELLIS/trellis/models/structured_latent_vae/decoder_gs.py ADDED Viewed

	@@ -0,0 +1,122 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ...modules import sparse as sp
+from ...utils.random_utils import hammersley_sequence
+from .base import SparseTransformerBase
+from ...representations import Gaussian
+class SLatGaussianDecoder(SparseTransformerBase):
+    def __init__(
+        self,
+        resolution: int,
+        model_channels: int,
+        latent_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
+        window_size: int = 8,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+        representation_config: dict = None,
+    ):
+        super().__init__(
+            in_channels=latent_channels,
+            model_channels=model_channels,
+            num_blocks=num_blocks,
+            num_heads=num_heads,
+            num_head_channels=num_head_channels,
+            mlp_ratio=mlp_ratio,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            pe_mode=pe_mode,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.resolution = resolution
+        self.rep_config = representation_config
+        self._calc_layout()
+        self.out_layer = sp.SparseLinear(model_channels, self.out_channels)
+        self._build_perturbation()
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    def initialize_weights(self) -> None:
+        super().initialize_weights()
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def _build_perturbation(self) -> None:
+        perturbation = [hammersley_sequence(3, i, self.rep_config['num_gaussians']) for i in range(self.rep_config['num_gaussians'])]
+        perturbation = torch.tensor(perturbation).float() * 2 - 1
+        perturbation = perturbation / self.rep_config['voxel_size']
+        perturbation = torch.atanh(perturbation).to(self.device)
+        self.register_buffer('offset_perturbation', perturbation)
+    def _calc_layout(self) -> None:
+        self.layout = {
+            '_xyz' : {'shape': (self.rep_config['num_gaussians'], 3), 'size': self.rep_config['num_gaussians'] * 3},
+            '_features_dc' : {'shape': (self.rep_config['num_gaussians'], 1, 3), 'size': self.rep_config['num_gaussians'] * 3},
+            '_scaling' : {'shape': (self.rep_config['num_gaussians'], 3), 'size': self.rep_config['num_gaussians'] * 3},
+            '_rotation' : {'shape': (self.rep_config['num_gaussians'], 4), 'size': self.rep_config['num_gaussians'] * 4},
+            '_opacity' : {'shape': (self.rep_config['num_gaussians'], 1), 'size': self.rep_config['num_gaussians']},
+        }
+        start = 0
+        for k, v in self.layout.items():
+            v['range'] = (start, start + v['size'])
+            start += v['size']
+        self.out_channels = start
+    def to_representation(self, x: sp.SparseTensor) -> List[Gaussian]:
+        """
+        Convert a batch of network outputs to 3D representations.
+        Args:
+            x: The [N x * x C] sparse tensor output by the network.
+        Returns:
+            list of representations
+        """
+        ret = []
+        for i in range(x.shape[0]):
+            representation = Gaussian(
+                sh_degree=0,
+                aabb=[-0.5, -0.5, -0.5, 1.0, 1.0, 1.0],
+                mininum_kernel_size = self.rep_config['3d_filter_kernel_size'],
+                scaling_bias = self.rep_config['scaling_bias'],
+                opacity_bias = self.rep_config['opacity_bias'],
+                scaling_activation = self.rep_config['scaling_activation']
+            )
+            xyz = (x.coords[x.layout[i]][:, 1:].float() + 0.5) / self.resolution
+            for k, v in self.layout.items():
+                if k == '_xyz':
+                    offset = x.feats[x.layout[i]][:, v['range'][0]:v['range'][1]].reshape(-1, *v['shape'])
+                    offset = offset * self.rep_config['lr'][k]
+                    if self.rep_config['perturb_offset']:
+                        offset = offset + self.offset_perturbation
+                    offset = torch.tanh(offset) / self.resolution * 0.5 * self.rep_config['voxel_size']
+                    _xyz = xyz.unsqueeze(1) + offset
+                    setattr(representation, k, _xyz.flatten(0, 1))
+                else:
+                    feats = x.feats[x.layout[i]][:, v['range'][0]:v['range'][1]].reshape(-1, *v['shape']).flatten(0, 1)
+                    feats = feats * self.rep_config['lr'][k]
+                    setattr(representation, k, feats)
+            ret.append(representation)
+        return ret
+    def forward(self, x: sp.SparseTensor) -> List[Gaussian]:
+        h = super().forward(x)
+        h = h.type(x.dtype)
+        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
+        h = self.out_layer(h)
+        return self.to_representation(h)

thirdparty/TRELLIS/trellis/models/structured_latent_vae/decoder_mesh.py ADDED Viewed

	@@ -0,0 +1,167 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from ...modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
+from ...modules import sparse as sp
+from .base import SparseTransformerBase
+from ...representations import MeshExtractResult
+from ...representations.mesh import SparseFeatures2Mesh
+class SparseSubdivideBlock3d(nn.Module):
+    """
+    A 3D subdivide block that can subdivide the sparse tensor.
+    Args:
+        channels: channels in the inputs and outputs.
+        out_channels: if specified, the number of output channels.
+        num_groups: the number of groups for the group norm.
+    """
+    def __init__(
+        self,
+        channels: int,
+        resolution: int,
+        out_channels: Optional[int] = None,
+        num_groups: int = 32
+    ):
+        super().__init__()
+        self.channels = channels
+        self.resolution = resolution
+        self.out_resolution = resolution * 2
+        self.out_channels = out_channels or channels
+        self.act_layers = nn.Sequential(
+            sp.SparseGroupNorm32(num_groups, channels),
+            sp.SparseSiLU()
+        )
+        self.sub = sp.SparseSubdivide()
+        self.out_layers = nn.Sequential(
+            sp.SparseConv3d(channels, self.out_channels, 3, indice_key=f"res_{self.out_resolution}"),
+            sp.SparseGroupNorm32(num_groups, self.out_channels),
+            sp.SparseSiLU(),
+            zero_module(sp.SparseConv3d(self.out_channels, self.out_channels, 3, indice_key=f"res_{self.out_resolution}")),
+        )
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        else:
+            self.skip_connection = sp.SparseConv3d(channels, self.out_channels, 1, indice_key=f"res_{self.out_resolution}")
+    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
+        """
+        Apply the block to a Tensor, conditioned on a timestep embedding.
+        Args:
+            x: an [N x C x ...] Tensor of features.
+        Returns:
+            an [N x C x ...] Tensor of outputs.
+        """
+        h = self.act_layers(x)
+        h = self.sub(h)
+        x = self.sub(x)
+        h = self.out_layers(h)
+        h = h + self.skip_connection(x)
+        return h
+class SLatMeshDecoder(SparseTransformerBase):
+    def __init__(
+        self,
+        resolution: int,
+        model_channels: int,
+        latent_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
+        window_size: int = 8,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+        representation_config: dict = None,
+    ):
+        super().__init__(
+            in_channels=latent_channels,
+            model_channels=model_channels,
+            num_blocks=num_blocks,
+            num_heads=num_heads,
+            num_head_channels=num_head_channels,
+            mlp_ratio=mlp_ratio,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            pe_mode=pe_mode,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.resolution = resolution
+        self.rep_config = representation_config
+        self.mesh_extractor = SparseFeatures2Mesh(res=self.resolution*4, use_color=self.rep_config.get('use_color', False))
+        self.out_channels = self.mesh_extractor.feats_channels
+        self.upsample = nn.ModuleList([
+            SparseSubdivideBlock3d(
+                channels=model_channels,
+                resolution=resolution,
+                out_channels=model_channels // 4
+            ),
+            SparseSubdivideBlock3d(
+                channels=model_channels // 4,
+                resolution=resolution * 2,
+                out_channels=model_channels // 8
+            )
+        ])
+        self.out_layer = sp.SparseLinear(model_channels // 8, self.out_channels)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    def initialize_weights(self) -> None:
+        super().initialize_weights()
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        super().convert_to_fp16()
+        self.upsample.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        super().convert_to_fp32()
+        self.upsample.apply(convert_module_to_f32)
+    def to_representation(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
+        """
+        Convert a batch of network outputs to 3D representations.
+        Args:
+            x: The [N x * x C] sparse tensor output by the network.
+        Returns:
+            list of representations
+        """
+        ret = []
+        for i in range(x.shape[0]):
+            mesh = self.mesh_extractor(x[i], training=self.training)
+            ret.append(mesh)
+        return ret
+    def forward(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
+        h = super().forward(x)
+        for block in self.upsample:
+            h = block(h)
+        h = h.type(x.dtype)
+        h = self.out_layer(h)
+        return self.to_representation(h)

thirdparty/TRELLIS/trellis/models/structured_latent_vae/decoder_rf.py ADDED Viewed

	@@ -0,0 +1,104 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from ...modules import sparse as sp
+from .base import SparseTransformerBase
+from ...representations import Strivec
+class SLatRadianceFieldDecoder(SparseTransformerBase):
+    def __init__(
+        self,
+        resolution: int,
+        model_channels: int,
+        latent_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
+        window_size: int = 8,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+        representation_config: dict = None,
+    ):
+        super().__init__(
+            in_channels=latent_channels,
+            model_channels=model_channels,
+            num_blocks=num_blocks,
+            num_heads=num_heads,
+            num_head_channels=num_head_channels,
+            mlp_ratio=mlp_ratio,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            pe_mode=pe_mode,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.resolution = resolution
+        self.rep_config = representation_config
+        self._calc_layout()
+        self.out_layer = sp.SparseLinear(model_channels, self.out_channels)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    def initialize_weights(self) -> None:
+        super().initialize_weights()
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def _calc_layout(self) -> None:
+        self.layout = {
+            'trivec': {'shape': (self.rep_config['rank'], 3, self.rep_config['dim']), 'size': self.rep_config['rank'] * 3 * self.rep_config['dim']},
+            'density': {'shape': (self.rep_config['rank'],), 'size': self.rep_config['rank']},
+            'features_dc': {'shape': (self.rep_config['rank'], 1, 3), 'size': self.rep_config['rank'] * 3},
+        }
+        start = 0
+        for k, v in self.layout.items():
+            v['range'] = (start, start + v['size'])
+            start += v['size']
+        self.out_channels = start
+    def to_representation(self, x: sp.SparseTensor) -> List[Strivec]:
+        """
+        Convert a batch of network outputs to 3D representations.
+        Args:
+            x: The [N x * x C] sparse tensor output by the network.
+        Returns:
+            list of representations
+        """
+        ret = []
+        for i in range(x.shape[0]):
+            representation = Strivec(
+                sh_degree=0,
+                resolution=self.resolution,
+                aabb=[-0.5, -0.5, -0.5, 1, 1, 1],
+                rank=self.rep_config['rank'],
+                dim=self.rep_config['dim'],
+                device='cuda',
+            )
+            representation.density_shift = 0.0
+            representation.position = (x.coords[x.layout[i]][:, 1:].float() + 0.5) / self.resolution
+            representation.depth = torch.full((representation.position.shape[0], 1), int(np.log2(self.resolution)), dtype=torch.uint8, device='cuda')
+            for k, v in self.layout.items():
+                setattr(representation, k, x.feats[x.layout[i]][:, v['range'][0]:v['range'][1]].reshape(-1, *v['shape']))
+            representation.trivec = representation.trivec + 1
+            ret.append(representation)
+        return ret
+    def forward(self, x: sp.SparseTensor) -> List[Strivec]:
+        h = super().forward(x)
+        h = h.type(x.dtype)
+        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
+        h = self.out_layer(h)
+        return self.to_representation(h)

thirdparty/TRELLIS/trellis/models/structured_latent_vae/encoder.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ...modules import sparse as sp
+from .base import SparseTransformerBase
+class SLatEncoder(SparseTransformerBase):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        model_channels: int,
+        latent_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
+        window_size: int = 8,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__(
+            in_channels=in_channels,
+            model_channels=model_channels,
+            num_blocks=num_blocks,
+            num_heads=num_heads,
+            num_head_channels=num_head_channels,
+            mlp_ratio=mlp_ratio,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            pe_mode=pe_mode,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.resolution = resolution
+        self.out_layer = sp.SparseLinear(model_channels, 2 * latent_channels)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    def initialize_weights(self) -> None:
+        super().initialize_weights()
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def forward(self, x: sp.SparseTensor, sample_posterior=True, return_raw=False):
+        h = super().forward(x)
+        h = h.type(x.dtype)
+        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
+        h = self.out_layer(h)
+        # Sample from the posterior distribution
+        mean, logvar = h.feats.chunk(2, dim=-1)
+        if sample_posterior:
+            std = torch.exp(0.5 * logvar)
+            z = mean + std * torch.randn_like(std)
+        else:
+            z = mean
+        z = h.replace(z)
+        if return_raw:
+            return z, mean, logvar
+        else:
+            return z

thirdparty/TRELLIS/trellis/modules/attention/__init__.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from typing import *
+BACKEND = 'flash_attn'
+DEBUG = False
+def __from_env():
+    import os
+    global BACKEND
+    global DEBUG
+    env_attn_backend = os.environ.get('ATTN_BACKEND')
+    env_sttn_debug = os.environ.get('ATTN_DEBUG')
+    if env_attn_backend is not None and env_attn_backend in ['xformers', 'flash_attn', 'sdpa', 'naive']:
+        BACKEND = env_attn_backend
+    if env_sttn_debug is not None:
+        DEBUG = env_sttn_debug == '1'
+    print(f"[ATTENTION] Using backend: {BACKEND}")
+__from_env()
+def set_backend(backend: Literal['xformers', 'flash_attn']):
+    global BACKEND
+    BACKEND = backend
+def set_debug(debug: bool):
+    global DEBUG
+    DEBUG = debug
+from .full_attn import *
+from .modules import *

thirdparty/TRELLIS/trellis/modules/attention/full_attn.py ADDED Viewed

	@@ -0,0 +1,140 @@

+from typing import *
+import torch
+import math
+from . import DEBUG, BACKEND
+if BACKEND == 'xformers':
+    import xformers.ops as xops
+elif BACKEND == 'flash_attn':
+    import flash_attn
+elif BACKEND == 'sdpa':
+    from torch.nn.functional import scaled_dot_product_attention as sdpa
+elif BACKEND == 'naive':
+    pass
+else:
+    raise ValueError(f"Unknown attention backend: {BACKEND}")
+__all__ = [
+    'scaled_dot_product_attention',
+]
+def _naive_sdpa(q, k, v):
+    """
+    Naive implementation of scaled dot product attention.
+    """
+    q = q.permute(0, 2, 1, 3)   # [N, H, L, C]
+    k = k.permute(0, 2, 1, 3)   # [N, H, L, C]
+    v = v.permute(0, 2, 1, 3)   # [N, H, L, C]
+    scale_factor = 1 / math.sqrt(q.size(-1))
+    attn_weight = q @ k.transpose(-2, -1) * scale_factor
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    out = attn_weight @ v
+    out = out.permute(0, 2, 1, 3)   # [N, L, H, C]
+    return out
+@overload
+def scaled_dot_product_attention(qkv: torch.Tensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention.
+    Args:
+        qkv (torch.Tensor): A [N, L, 3, H, C] tensor containing Qs, Ks, and Vs.
+    """
+    ...
+@overload
+def scaled_dot_product_attention(q: torch.Tensor, kv: torch.Tensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention.
+    Args:
+        q (torch.Tensor): A [N, L, H, C] tensor containing Qs.
+        kv (torch.Tensor): A [N, L, 2, H, C] tensor containing Ks and Vs.
+    """
+    ...
+@overload
+def scaled_dot_product_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention.
+    Args:
+        q (torch.Tensor): A [N, L, H, Ci] tensor containing Qs.
+        k (torch.Tensor): A [N, L, H, Ci] tensor containing Ks.
+        v (torch.Tensor): A [N, L, H, Co] tensor containing Vs.
+    Note:
+        k and v are assumed to have the same coordinate map.
+    """
+    ...
+def scaled_dot_product_attention(*args, **kwargs):
+    arg_names_dict = {
+        1: ['qkv'],
+        2: ['q', 'kv'],
+        3: ['q', 'k', 'v']
+    }
+    num_all_args = len(args) + len(kwargs)
+    assert num_all_args in arg_names_dict, f"Invalid number of arguments, got {num_all_args}, expected 1, 2, or 3"
+    for key in arg_names_dict[num_all_args][len(args):]:
+        assert key in kwargs, f"Missing argument {key}"
+    if num_all_args == 1:
+        qkv = args[0] if len(args) > 0 else kwargs['qkv']
+        assert len(qkv.shape) == 5 and qkv.shape[2] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, L, 3, H, C]"
+        device = qkv.device
+    elif num_all_args == 2:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        kv = args[1] if len(args) > 1 else kwargs['kv']
+        assert q.shape[0] == kv.shape[0], f"Batch size mismatch, got {q.shape[0]} and {kv.shape[0]}"
+        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, C]"
+        assert len(kv.shape) == 5, f"Invalid shape for kv, got {kv.shape}, expected [N, L, 2, H, C]"
+        device = q.device
+    elif num_all_args == 3:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        k = args[1] if len(args) > 1 else kwargs['k']
+        v = args[2] if len(args) > 2 else kwargs['v']
+        assert q.shape[0] == k.shape[0] == v.shape[0], f"Batch size mismatch, got {q.shape[0]}, {k.shape[0]}, and {v.shape[0]}"
+        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, Ci]"
+        assert len(k.shape) == 4, f"Invalid shape for k, got {k.shape}, expected [N, L, H, Ci]"
+        assert len(v.shape) == 4, f"Invalid shape for v, got {v.shape}, expected [N, L, H, Co]"
+        device = q.device
+    if BACKEND == 'xformers':
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=2)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=2)
+        out = xops.memory_efficient_attention(q, k, v)
+    elif BACKEND == 'flash_attn':
+        if num_all_args == 1:
+            out = flash_attn.flash_attn_qkvpacked_func(qkv)
+        elif num_all_args == 2:
+            out = flash_attn.flash_attn_kvpacked_func(q, kv)
+        elif num_all_args == 3:
+            out = flash_attn.flash_attn_func(q, k, v)
+    elif BACKEND == 'sdpa':
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=2)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=2)
+        q = q.permute(0, 2, 1, 3)   # [N, H, L, C]
+        k = k.permute(0, 2, 1, 3)   # [N, H, L, C]
+        v = v.permute(0, 2, 1, 3)   # [N, H, L, C]
+        out = sdpa(q, k, v)         # [N, H, L, C]
+        out = out.permute(0, 2, 1, 3)   # [N, L, H, C]
+    elif BACKEND == 'naive':
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=2)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=2)
+        out = _naive_sdpa(q, k, v)
+    else:
+        raise ValueError(f"Unknown attention module: {BACKEND}")
+    return out

thirdparty/TRELLIS/trellis/modules/attention/modules.py ADDED Viewed

	@@ -0,0 +1,146 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .full_attn import scaled_dot_product_attention
+class MultiHeadRMSNorm(nn.Module):
+    def __init__(self, dim: int, heads: int):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.gamma = nn.Parameter(torch.ones(heads, dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return (F.normalize(x.float(), dim = -1) * self.gamma * self.scale).to(x.dtype)
+class RotaryPositionEmbedder(nn.Module):
+    def __init__(self, hidden_size: int, in_channels: int = 3):
+        super().__init__()
+        assert hidden_size % 2 == 0, "Hidden size must be divisible by 2"
+        self.hidden_size = hidden_size
+        self.in_channels = in_channels
+        self.freq_dim = hidden_size // in_channels // 2
+        self.freqs = torch.arange(self.freq_dim, dtype=torch.float32) / self.freq_dim
+        self.freqs = 1.0 / (10000 ** self.freqs)
+    def _get_phases(self, indices: torch.Tensor) -> torch.Tensor:
+        self.freqs = self.freqs.to(indices.device)
+        phases = torch.outer(indices, self.freqs)
+        phases = torch.polar(torch.ones_like(phases), phases)
+        return phases
+    def _rotary_embedding(self, x: torch.Tensor, phases: torch.Tensor) -> torch.Tensor:
+        x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        x_rotated = x_complex * phases
+        x_embed = torch.view_as_real(x_rotated).reshape(*x_rotated.shape[:-1], -1).to(x.dtype)
+        return x_embed
+    def forward(self, q: torch.Tensor, k: torch.Tensor, indices: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            q (sp.SparseTensor): [..., N, D] tensor of queries
+            k (sp.SparseTensor): [..., N, D] tensor of keys
+            indices (torch.Tensor): [..., N, C] tensor of spatial positions
+        """
+        if indices is None:
+            indices = torch.arange(q.shape[-2], device=q.device)
+            if len(q.shape) > 2:
+                indices = indices.unsqueeze(0).expand(q.shape[:-2] + (-1,))
+        phases = self._get_phases(indices.reshape(-1)).reshape(*indices.shape[:-1], -1)
+        if phases.shape[1] < self.hidden_size // 2:
+            phases = torch.cat([phases, torch.polar(
+                torch.ones(*phases.shape[:-1], self.hidden_size // 2 - phases.shape[1], device=phases.device),
+                torch.zeros(*phases.shape[:-1], self.hidden_size // 2 - phases.shape[1], device=phases.device)
+            )], dim=-1)
+        q_embed = self._rotary_embedding(q, phases)
+        k_embed = self._rotary_embedding(k, phases)
+        return q_embed, k_embed
+class MultiHeadAttention(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        ctx_channels: Optional[int]=None,
+        type: Literal["self", "cross"] = "self",
+        attn_mode: Literal["full", "windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        qkv_bias: bool = True,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__()
+        assert channels % num_heads == 0
+        assert type in ["self", "cross"], f"Invalid attention type: {type}"
+        assert attn_mode in ["full", "windowed"], f"Invalid attention mode: {attn_mode}"
+        assert type == "self" or attn_mode == "full", "Cross-attention only supports full attention"
+        if attn_mode == "windowed":
+            raise NotImplementedError("Windowed attention is not yet implemented")
+        self.channels = channels
+        self.head_dim = channels // num_heads
+        self.ctx_channels = ctx_channels if ctx_channels is not None else channels
+        self.num_heads = num_heads
+        self._type = type
+        self.attn_mode = attn_mode
+        self.window_size = window_size
+        self.shift_window = shift_window
+        self.use_rope = use_rope
+        self.qk_rms_norm = qk_rms_norm
+        if self._type == "self":
+            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias)
+        else:
+            self.to_q = nn.Linear(channels, channels, bias=qkv_bias)
+            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias)
+        if self.qk_rms_norm:
+            self.q_rms_norm = MultiHeadRMSNorm(self.head_dim, num_heads)
+            self.k_rms_norm = MultiHeadRMSNorm(self.head_dim, num_heads)
+        self.to_out = nn.Linear(channels, channels)
+        if use_rope:
+            self.rope = RotaryPositionEmbedder(channels)
+    def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None, indices: Optional[torch.Tensor] = None) -> torch.Tensor:
+        B, L, C = x.shape
+        if self._type == "self":
+            qkv = self.to_qkv(x)
+            qkv = qkv.reshape(B, L, 3, self.num_heads, -1)
+            if self.use_rope:
+                q, k, v = qkv.unbind(dim=2)
+                q, k = self.rope(q, k, indices)
+                qkv = torch.stack([q, k, v], dim=2)
+            if self.attn_mode == "full":
+                if self.qk_rms_norm:
+                    q, k, v = qkv.unbind(dim=2)
+                    q = self.q_rms_norm(q)
+                    k = self.k_rms_norm(k)
+                    h = scaled_dot_product_attention(q, k, v)
+                else:
+                    h = scaled_dot_product_attention(qkv)
+            elif self.attn_mode == "windowed":
+                raise NotImplementedError("Windowed attention is not yet implemented")
+        else:
+            Lkv = context.shape[1]
+            q = self.to_q(x)
+            kv = self.to_kv(context)
+            q = q.reshape(B, L, self.num_heads, -1)
+            kv = kv.reshape(B, Lkv, 2, self.num_heads, -1)
+            if self.qk_rms_norm:
+                q = self.q_rms_norm(q)
+                k, v = kv.unbind(dim=2)
+                k = self.k_rms_norm(k)
+                h = scaled_dot_product_attention(q, k, v)
+            else:
+                h = scaled_dot_product_attention(q, kv)
+        h = h.reshape(B, L, -1)
+        h = self.to_out(h)
+        return h

thirdparty/TRELLIS/trellis/modules/norm.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import torch
+import torch.nn as nn
+class LayerNorm32(nn.LayerNorm):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return super().forward(x.float()).type(x.dtype)
+class GroupNorm32(nn.GroupNorm):
+    """
+    A GroupNorm layer that converts to float32 before the forward pass.
+    """
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return super().forward(x.float()).type(x.dtype)
+class ChannelLayerNorm32(LayerNorm32):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        DIM = x.dim()
+        x = x.permute(0, *range(2, DIM), 1).contiguous()
+        x = super().forward(x)
+        x = x.permute(0, DIM-1, *range(1, DIM-1)).contiguous()
+        return x

thirdparty/TRELLIS/trellis/modules/sparse/__init__.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from typing import *
+BACKEND = 'spconv'
+DEBUG = False
+ATTN = 'flash_attn'
+def __from_env():
+    import os
+    global BACKEND
+    global DEBUG
+    global ATTN
+    env_sparse_backend = os.environ.get('SPARSE_BACKEND')
+    env_sparse_debug = os.environ.get('SPARSE_DEBUG')
+    env_sparse_attn = os.environ.get('SPARSE_ATTN_BACKEND')
+    if env_sparse_attn is None:
+        env_sparse_attn = os.environ.get('ATTN_BACKEND')
+    if env_sparse_backend is not None and env_sparse_backend in ['spconv', 'torchsparse']:
+        BACKEND = env_sparse_backend
+    if env_sparse_debug is not None:
+        DEBUG = env_sparse_debug == '1'
+    if env_sparse_attn is not None and env_sparse_attn in ['xformers', 'flash_attn']:
+        ATTN = env_sparse_attn
+    print(f"[SPARSE] Backend: {BACKEND}, Attention: {ATTN}")
+__from_env()
+def set_backend(backend: Literal['spconv', 'torchsparse']):
+    global BACKEND
+    BACKEND = backend
+def set_debug(debug: bool):
+    global DEBUG
+    DEBUG = debug
+def set_attn(attn: Literal['xformers', 'flash_attn']):
+    global ATTN
+    ATTN = attn
+import importlib
+__attributes = {
+    'SparseTensor': 'basic',
+    'sparse_batch_broadcast': 'basic',
+    'sparse_batch_op': 'basic',
+    'sparse_cat': 'basic',
+    'sparse_unbind': 'basic',
+    'SparseGroupNorm': 'norm',
+    'SparseLayerNorm': 'norm',
+    'SparseGroupNorm32': 'norm',
+    'SparseLayerNorm32': 'norm',
+    'SparseReLU': 'nonlinearity',
+    'SparseSiLU': 'nonlinearity',
+    'SparseGELU': 'nonlinearity',
+    'SparseActivation': 'nonlinearity',
+    'SparseLinear': 'linear',
+    'sparse_scaled_dot_product_attention': 'attention',
+    'SerializeMode': 'attention',
+    'sparse_serialized_scaled_dot_product_self_attention': 'attention',
+    'sparse_windowed_scaled_dot_product_self_attention': 'attention',
+    'SparseMultiHeadAttention': 'attention',
+    'SparseConv3d': 'conv',
+    'SparseInverseConv3d': 'conv',
+    'SparseDownsample': 'spatial',
+    'SparseUpsample': 'spatial',
+    'SparseSubdivide' : 'spatial'
+}
+__submodules = ['transformer']
+__all__ = list(__attributes.keys()) + __submodules
+def __getattr__(name):
+    if name not in globals():
+        if name in __attributes:
+            module_name = __attributes[name]
+            module = importlib.import_module(f".{module_name}", __name__)
+            globals()[name] = getattr(module, name)
+        elif name in __submodules:
+            module = importlib.import_module(f".{name}", __name__)
+            globals()[name] = module
+        else:
+            raise AttributeError(f"module {__name__} has no attribute {name}")
+    return globals()[name]
+# For Pylance
+if __name__ == '__main__':
+    from .basic import *
+    from .norm import *
+    from .nonlinearity import *
+    from .linear import *
+    from .attention import *
+    from .conv import *
+    from .spatial import *
+    import transformer

thirdparty/TRELLIS/trellis/modules/sparse/attention/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .full_attn import *
+from .serialized_attn import *
+from .windowed_attn import *
+from .modules import *

thirdparty/TRELLIS/trellis/modules/sparse/attention/full_attn.py ADDED Viewed

	@@ -0,0 +1,215 @@

+from typing import *
+import torch
+from .. import SparseTensor
+from .. import DEBUG, ATTN
+if ATTN == 'xformers':
+    import xformers.ops as xops
+elif ATTN == 'flash_attn':
+    import flash_attn
+else:
+    raise ValueError(f"Unknown attention module: {ATTN}")
+__all__ = [
+    'sparse_scaled_dot_product_attention',
+]
+@overload
+def sparse_scaled_dot_product_attention(qkv: SparseTensor) -> SparseTensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        qkv (SparseTensor): A [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: SparseTensor, kv: Union[SparseTensor, torch.Tensor]) -> SparseTensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        q (SparseTensor): A [N, *, H, C] sparse tensor containing Qs.
+        kv (SparseTensor or torch.Tensor): A [N, *, 2, H, C] sparse tensor or a [N, L, 2, H, C] dense tensor containing Ks and Vs.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: torch.Tensor, kv: SparseTensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        q (SparseTensor): A [N, L, H, C] dense tensor containing Qs.
+        kv (SparseTensor or torch.Tensor): A [N, *, 2, H, C] sparse tensor containing Ks and Vs.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: SparseTensor, k: SparseTensor, v: SparseTensor) -> SparseTensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        q (SparseTensor): A [N, *, H, Ci] sparse tensor containing Qs.
+        k (SparseTensor): A [N, *, H, Ci] sparse tensor containing Ks.
+        v (SparseTensor): A [N, *, H, Co] sparse tensor containing Vs.
+    Note:
+        k and v are assumed to have the same coordinate map.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: SparseTensor, k: torch.Tensor, v: torch.Tensor) -> SparseTensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        q (SparseTensor): A [N, *, H, Ci] sparse tensor containing Qs.
+        k (torch.Tensor): A [N, L, H, Ci] dense tensor containing Ks.
+        v (torch.Tensor): A [N, L, H, Co] dense tensor containing Vs.
+    """
+    ...
+@overload
+def sparse_scaled_dot_product_attention(q: torch.Tensor, k: SparseTensor, v: SparseTensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention to a sparse tensor.
+    Args:
+        q (torch.Tensor): A [N, L, H, Ci] dense tensor containing Qs.
+        k (SparseTensor): A [N, *, H, Ci] sparse tensor containing Ks.
+        v (SparseTensor): A [N, *, H, Co] sparse tensor containing Vs.
+    """
+    ...
+def sparse_scaled_dot_product_attention(*args, **kwargs):
+    arg_names_dict = {
+        1: ['qkv'],
+        2: ['q', 'kv'],
+        3: ['q', 'k', 'v']
+    }
+    num_all_args = len(args) + len(kwargs)
+    assert num_all_args in arg_names_dict, f"Invalid number of arguments, got {num_all_args}, expected 1, 2, or 3"
+    for key in arg_names_dict[num_all_args][len(args):]:
+        assert key in kwargs, f"Missing argument {key}"
+    if num_all_args == 1:
+        qkv = args[0] if len(args) > 0 else kwargs['qkv']
+        assert isinstance(qkv, SparseTensor), f"qkv must be a SparseTensor, got {type(qkv)}"
+        assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"
+        device = qkv.device
+        s = qkv
+        q_seqlen = [qkv.layout[i].stop - qkv.layout[i].start for i in range(qkv.shape[0])]
+        kv_seqlen = q_seqlen
+        qkv = qkv.feats     # [T, 3, H, C]
+    elif num_all_args == 2:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        kv = args[1] if len(args) > 1 else kwargs['kv']
+        assert isinstance(q, SparseTensor) and isinstance(kv, (SparseTensor, torch.Tensor)) or \
+               isinstance(q, torch.Tensor) and isinstance(kv, SparseTensor), \
+               f"Invalid types, got {type(q)} and {type(kv)}"
+        assert q.shape[0] == kv.shape[0], f"Batch size mismatch, got {q.shape[0]} and {kv.shape[0]}"
+        device = q.device
+        if isinstance(q, SparseTensor):
+            assert len(q.shape) == 3, f"Invalid shape for q, got {q.shape}, expected [N, *, H, C]"
+            s = q
+            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
+            q = q.feats     # [T_Q, H, C]
+        else:
+            assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, C]"
+            s = None
+            N, L, H, C = q.shape
+            q_seqlen = [L] * N
+            q = q.reshape(N * L, H, C)   # [T_Q, H, C]
+        if isinstance(kv, SparseTensor):
+            assert len(kv.shape) == 4 and kv.shape[1] == 2, f"Invalid shape for kv, got {kv.shape}, expected [N, *, 2, H, C]"
+            kv_seqlen = [kv.layout[i].stop - kv.layout[i].start for i in range(kv.shape[0])]
+            kv = kv.feats     # [T_KV, 2, H, C]
+        else:
+            assert len(kv.shape) == 5, f"Invalid shape for kv, got {kv.shape}, expected [N, L, 2, H, C]"
+            N, L, _, H, C = kv.shape
+            kv_seqlen = [L] * N
+            kv = kv.reshape(N * L, 2, H, C)   # [T_KV, 2, H, C]
+    elif num_all_args == 3:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        k = args[1] if len(args) > 1 else kwargs['k']
+        v = args[2] if len(args) > 2 else kwargs['v']
+        assert isinstance(q, SparseTensor) and isinstance(k, (SparseTensor, torch.Tensor)) and type(k) == type(v) or \
+               isinstance(q, torch.Tensor) and isinstance(k, SparseTensor) and isinstance(v, SparseTensor), \
+               f"Invalid types, got {type(q)}, {type(k)}, and {type(v)}"
+        assert q.shape[0] == k.shape[0] == v.shape[0], f"Batch size mismatch, got {q.shape[0]}, {k.shape[0]}, and {v.shape[0]}"
+        device = q.device
+        if isinstance(q, SparseTensor):
+            assert len(q.shape) == 3, f"Invalid shape for q, got {q.shape}, expected [N, *, H, Ci]"
+            s = q
+            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
+            q = q.feats     # [T_Q, H, Ci]
+        else:
+            assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, Ci]"
+            s = None
+            N, L, H, CI = q.shape
+            q_seqlen = [L] * N
+            q = q.reshape(N * L, H, CI)  # [T_Q, H, Ci]
+        if isinstance(k, SparseTensor):
+            assert len(k.shape) == 3, f"Invalid shape for k, got {k.shape}, expected [N, *, H, Ci]"
+            assert len(v.shape) == 3, f"Invalid shape for v, got {v.shape}, expected [N, *, H, Co]"
+            kv_seqlen = [k.layout[i].stop - k.layout[i].start for i in range(k.shape[0])]
+            k = k.feats     # [T_KV, H, Ci]
+            v = v.feats     # [T_KV, H, Co]
+        else:
+            assert len(k.shape) == 4, f"Invalid shape for k, got {k.shape}, expected [N, L, H, Ci]"
+            assert len(v.shape) == 4, f"Invalid shape for v, got {v.shape}, expected [N, L, H, Co]"
+            N, L, H, CI, CO = *k.shape, v.shape[-1]
+            kv_seqlen = [L] * N
+            k = k.reshape(N * L, H, CI)     # [T_KV, H, Ci]
+            v = v.reshape(N * L, H, CO)     # [T_KV, H, Co]
+    if DEBUG:
+        if s is not None:
+            for i in range(s.shape[0]):
+                assert (s.coords[s.layout[i]] == i).all(), f"SparseScaledDotProductSelfAttention: batch index mismatch"
+        if num_all_args in [2, 3]:
+            assert q.shape[:2] == [1, sum(q_seqlen)], f"SparseScaledDotProductSelfAttention: q shape mismatch"
+        if num_all_args == 3:
+            assert k.shape[:2] == [1, sum(kv_seqlen)], f"SparseScaledDotProductSelfAttention: k shape mismatch"
+            assert v.shape[:2] == [1, sum(kv_seqlen)], f"SparseScaledDotProductSelfAttention: v shape mismatch"
+    if ATTN == 'xformers':
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=1)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=1)
+        q = q.unsqueeze(0)
+        k = k.unsqueeze(0)
+        v = v.unsqueeze(0)
+        mask = xops.fmha.BlockDiagonalMask.from_seqlens(q_seqlen, kv_seqlen)
+        out = xops.memory_efficient_attention(q, k, v, mask)[0]
+    elif ATTN == 'flash_attn':
+        cu_seqlens_q = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(q_seqlen), dim=0)]).int().to(device)
+        if num_all_args in [2, 3]:
+            cu_seqlens_kv = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(kv_seqlen), dim=0)]).int().to(device)
+        if num_all_args == 1:
+            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens_q, max(q_seqlen))
+        elif num_all_args == 2:
+            out = flash_attn.flash_attn_varlen_kvpacked_func(q, kv, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
+        elif num_all_args == 3:
+            out = flash_attn.flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
+    else:
+        raise ValueError(f"Unknown attention module: {ATTN}")
+    if s is not None:
+        return s.replace(out)
+    else:
+        return out.reshape(N, L, H, -1)

thirdparty/TRELLIS/trellis/modules/sparse/attention/modules.py ADDED Viewed

	@@ -0,0 +1,139 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .. import SparseTensor
+from .full_attn import sparse_scaled_dot_product_attention
+from .serialized_attn import SerializeMode, sparse_serialized_scaled_dot_product_self_attention
+from .windowed_attn import sparse_windowed_scaled_dot_product_self_attention
+from ...attention import RotaryPositionEmbedder
+class SparseMultiHeadRMSNorm(nn.Module):
+    def __init__(self, dim: int, heads: int):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.gamma = nn.Parameter(torch.ones(heads, dim))
+    def forward(self, x: Union[SparseTensor, torch.Tensor]) -> Union[SparseTensor, torch.Tensor]:
+        x_type = x.dtype
+        x = x.float()
+        if isinstance(x, SparseTensor):
+            x = x.replace(F.normalize(x.feats, dim=-1))
+        else:
+            x = F.normalize(x, dim=-1)
+        return (x * self.gamma * self.scale).to(x_type)
+class SparseMultiHeadAttention(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        ctx_channels: Optional[int] = None,
+        type: Literal["self", "cross"] = "self",
+        attn_mode: Literal["full", "serialized", "windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_sequence: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        serialize_mode: Optional[SerializeMode] = None,
+        qkv_bias: bool = True,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__()
+        assert channels % num_heads == 0
+        assert type in ["self", "cross"], f"Invalid attention type: {type}"
+        assert attn_mode in ["full", "serialized", "windowed"], f"Invalid attention mode: {attn_mode}"
+        assert type == "self" or attn_mode == "full", "Cross-attention only supports full attention"
+        assert type == "self" or use_rope is False, "Rotary position embeddings only supported for self-attention"
+        self.channels = channels
+        self.ctx_channels = ctx_channels if ctx_channels is not None else channels
+        self.num_heads = num_heads
+        self._type = type
+        self.attn_mode = attn_mode
+        self.window_size = window_size
+        self.shift_sequence = shift_sequence
+        self.shift_window = shift_window
+        self.serialize_mode = serialize_mode
+        self.use_rope = use_rope
+        self.qk_rms_norm = qk_rms_norm
+        if self._type == "self":
+            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias)
+        else:
+            self.to_q = nn.Linear(channels, channels, bias=qkv_bias)
+            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias)
+        if self.qk_rms_norm:
+            self.q_rms_norm = SparseMultiHeadRMSNorm(channels // num_heads, num_heads)
+            self.k_rms_norm = SparseMultiHeadRMSNorm(channels // num_heads, num_heads)
+        self.to_out = nn.Linear(channels, channels)
+        if use_rope:
+            self.rope = RotaryPositionEmbedder(channels)
+    @staticmethod
+    def _linear(module: nn.Linear, x: Union[SparseTensor, torch.Tensor]) -> Union[SparseTensor, torch.Tensor]:
+        if isinstance(x, SparseTensor):
+            return x.replace(module(x.feats))
+        else:
+            return module(x)
+    @staticmethod
+    def _reshape_chs(x: Union[SparseTensor, torch.Tensor], shape: Tuple[int, ...]) -> Union[SparseTensor, torch.Tensor]:
+        if isinstance(x, SparseTensor):
+            return x.reshape(*shape)
+        else:
+            return x.reshape(*x.shape[:2], *shape)
+    def _fused_pre(self, x: Union[SparseTensor, torch.Tensor], num_fused: int) -> Union[SparseTensor, torch.Tensor]:
+        if isinstance(x, SparseTensor):
+            x_feats = x.feats.unsqueeze(0)
+        else:
+            x_feats = x
+        x_feats = x_feats.reshape(*x_feats.shape[:2], num_fused, self.num_heads, -1)
+        return x.replace(x_feats.squeeze(0)) if isinstance(x, SparseTensor) else x_feats
+    def _rope(self, qkv: SparseTensor) -> SparseTensor:
+        q, k, v = qkv.feats.unbind(dim=1)   # [T, H, C]
+        q, k = self.rope(q, k, qkv.coords[:, 1:])
+        qkv = qkv.replace(torch.stack([q, k, v], dim=1))
+        return qkv
+    def forward(self, x: Union[SparseTensor, torch.Tensor], context: Optional[Union[SparseTensor, torch.Tensor]] = None) -> Union[SparseTensor, torch.Tensor]:
+        if self._type == "self":
+            qkv = self._linear(self.to_qkv, x)
+            qkv = self._fused_pre(qkv, num_fused=3)
+            if self.use_rope:
+                qkv = self._rope(qkv)
+            if self.qk_rms_norm:
+                q, k, v = qkv.unbind(dim=1)
+                q = self.q_rms_norm(q)
+                k = self.k_rms_norm(k)
+                qkv = qkv.replace(torch.stack([q.feats, k.feats, v.feats], dim=1))
+            if self.attn_mode == "full":
+                h = sparse_scaled_dot_product_attention(qkv)
+            elif self.attn_mode == "serialized":
+                h = sparse_serialized_scaled_dot_product_self_attention(
+                    qkv, self.window_size, serialize_mode=self.serialize_mode, shift_sequence=self.shift_sequence, shift_window=self.shift_window
+                )
+            elif self.attn_mode == "windowed":
+                h = sparse_windowed_scaled_dot_product_self_attention(
+                    qkv, self.window_size, shift_window=self.shift_window
+                )
+        else:
+            q = self._linear(self.to_q, x)
+            q = self._reshape_chs(q, (self.num_heads, -1))
+            kv = self._linear(self.to_kv, context)
+            kv = self._fused_pre(kv, num_fused=2)
+            if self.qk_rms_norm:
+                q = self.q_rms_norm(q)
+                k, v = kv.unbind(dim=1)
+                k = self.k_rms_norm(k)
+                kv = kv.replace(torch.stack([k.feats, v.feats], dim=1))
+            h = sparse_scaled_dot_product_attention(q, kv)
+        h = self._reshape_chs(h, (-1,))
+        h = self._linear(self.to_out, h)
+        return h

thirdparty/TRELLIS/trellis/modules/sparse/attention/serialized_attn.py ADDED Viewed

	@@ -0,0 +1,193 @@

+from typing import *
+from enum import Enum
+import torch
+import math
+from .. import SparseTensor
+from .. import DEBUG, ATTN
+if ATTN == 'xformers':
+    import xformers.ops as xops
+elif ATTN == 'flash_attn':
+    import flash_attn
+else:
+    raise ValueError(f"Unknown attention module: {ATTN}")
+__all__ = [
+    'sparse_serialized_scaled_dot_product_self_attention',
+]
+class SerializeMode(Enum):
+    Z_ORDER = 0
+    Z_ORDER_TRANSPOSED = 1
+    HILBERT = 2
+    HILBERT_TRANSPOSED = 3
+SerializeModes = [
+    SerializeMode.Z_ORDER,
+    SerializeMode.Z_ORDER_TRANSPOSED,
+    SerializeMode.HILBERT,
+    SerializeMode.HILBERT_TRANSPOSED
+]
+def calc_serialization(
+    tensor: SparseTensor,
+    window_size: int,
+    serialize_mode: SerializeMode = SerializeMode.Z_ORDER,
+    shift_sequence: int = 0,
+    shift_window: Tuple[int, int, int] = (0, 0, 0)
+) -> Tuple[torch.Tensor, torch.Tensor, List[int]]:
+    """
+    Calculate serialization and partitioning for a set of coordinates.
+    Args:
+        tensor (SparseTensor): The input tensor.
+        window_size (int): The window size to use.
+        serialize_mode (SerializeMode): The serialization mode to use.
+        shift_sequence (int): The shift of serialized sequence.
+        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.
+    Returns:
+        (torch.Tensor, torch.Tensor): Forwards and backwards indices.
+    """
+    fwd_indices = []
+    bwd_indices = []
+    seq_lens = []
+    seq_batch_indices = []
+    offsets = [0]
+    if 'vox2seq' not in globals():
+        import vox2seq
+    # Serialize the input
+    serialize_coords = tensor.coords[:, 1:].clone()
+    serialize_coords += torch.tensor(shift_window, dtype=torch.int32, device=tensor.device).reshape(1, 3)
+    if serialize_mode == SerializeMode.Z_ORDER:
+        code = vox2seq.encode(serialize_coords, mode='z_order', permute=[0, 1, 2])
+    elif serialize_mode == SerializeMode.Z_ORDER_TRANSPOSED:
+        code = vox2seq.encode(serialize_coords, mode='z_order', permute=[1, 0, 2])
+    elif serialize_mode == SerializeMode.HILBERT:
+        code = vox2seq.encode(serialize_coords, mode='hilbert', permute=[0, 1, 2])
+    elif serialize_mode == SerializeMode.HILBERT_TRANSPOSED:
+        code = vox2seq.encode(serialize_coords, mode='hilbert', permute=[1, 0, 2])
+    else:
+        raise ValueError(f"Unknown serialize mode: {serialize_mode}")
+    for bi, s in enumerate(tensor.layout):
+        num_points = s.stop - s.start
+        num_windows = (num_points + window_size - 1) // window_size
+        valid_window_size = num_points / num_windows
+        to_ordered = torch.argsort(code[s.start:s.stop])
+        if num_windows == 1:
+            fwd_indices.append(to_ordered)
+            bwd_indices.append(torch.zeros_like(to_ordered).scatter_(0, to_ordered, torch.arange(num_points, device=tensor.device)))
+            fwd_indices[-1] += s.start
+            bwd_indices[-1] += offsets[-1]
+            seq_lens.append(num_points)
+            seq_batch_indices.append(bi)
+            offsets.append(offsets[-1] + seq_lens[-1])
+        else:
+            # Partition the input
+            offset = 0
+            mids = [(i + 0.5) * valid_window_size + shift_sequence for i in range(num_windows)]
+            split = [math.floor(i * valid_window_size + shift_sequence) for i in range(num_windows + 1)]
+            bwd_index = torch.zeros((num_points,), dtype=torch.int64, device=tensor.device)
+            for i in range(num_windows):
+                mid = mids[i]
+                valid_start = split[i]
+                valid_end = split[i + 1]
+                padded_start = math.floor(mid - 0.5 * window_size)
+                padded_end = padded_start + window_size
+                fwd_indices.append(to_ordered[torch.arange(padded_start, padded_end, device=tensor.device) % num_points])
+                offset += valid_start - padded_start
+                bwd_index.scatter_(0, fwd_indices[-1][valid_start-padded_start:valid_end-padded_start], torch.arange(offset, offset + valid_end - valid_start, device=tensor.device))
+                offset += padded_end - valid_start
+                fwd_indices[-1] += s.start
+            seq_lens.extend([window_size] * num_windows)
+            seq_batch_indices.extend([bi] * num_windows)
+            bwd_indices.append(bwd_index + offsets[-1])
+            offsets.append(offsets[-1] + num_windows * window_size)
+    fwd_indices = torch.cat(fwd_indices)
+    bwd_indices = torch.cat(bwd_indices)
+    return fwd_indices, bwd_indices, seq_lens, seq_batch_indices
+def sparse_serialized_scaled_dot_product_self_attention(
+    qkv: SparseTensor,
+    window_size: int,
+    serialize_mode: SerializeMode = SerializeMode.Z_ORDER,
+    shift_sequence: int = 0,
+    shift_window: Tuple[int, int, int] = (0, 0, 0)
+) -> SparseTensor:
+    """
+    Apply serialized scaled dot product self attention to a sparse tensor.
+    Args:
+        qkv (SparseTensor): [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
+        window_size (int): The window size to use.
+        serialize_mode (SerializeMode): The serialization mode to use.
+        shift_sequence (int): The shift of serialized sequence.
+        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.
+        shift (int): The shift to use.
+    """
+    assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"
+    serialization_spatial_cache_name = f'serialization_{serialize_mode}_{window_size}_{shift_sequence}_{shift_window}'
+    serialization_spatial_cache = qkv.get_spatial_cache(serialization_spatial_cache_name)
+    if serialization_spatial_cache is None:
+        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = calc_serialization(qkv, window_size, serialize_mode, shift_sequence, shift_window)
+        qkv.register_spatial_cache(serialization_spatial_cache_name, (fwd_indices, bwd_indices, seq_lens, seq_batch_indices))
+    else:
+        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = serialization_spatial_cache
+    M = fwd_indices.shape[0]
+    T = qkv.feats.shape[0]
+    H = qkv.feats.shape[2]
+    C = qkv.feats.shape[3]
+    qkv_feats = qkv.feats[fwd_indices]      # [M, 3, H, C]
+    if DEBUG:
+        start = 0
+        qkv_coords = qkv.coords[fwd_indices]
+        for i in range(len(seq_lens)):
+            assert (qkv_coords[start:start+seq_lens[i], 0] == seq_batch_indices[i]).all(), f"SparseWindowedScaledDotProductSelfAttention: batch index mismatch"
+            start += seq_lens[i]
+    if all([seq_len == window_size for seq_len in seq_lens]):
+        B = len(seq_lens)
+        N = window_size
+        qkv_feats = qkv_feats.reshape(B, N, 3, H, C)
+        if ATTN == 'xformers':
+            q, k, v = qkv_feats.unbind(dim=2)                       # [B, N, H, C]
+            out = xops.memory_efficient_attention(q, k, v)          # [B, N, H, C]
+        elif ATTN == 'flash_attn':
+            out = flash_attn.flash_attn_qkvpacked_func(qkv_feats)   # [B, N, H, C]
+        else:
+            raise ValueError(f"Unknown attention module: {ATTN}")
+        out = out.reshape(B * N, H, C)                              # [M, H, C]
+    else:
+        if ATTN == 'xformers':
+            q, k, v = qkv_feats.unbind(dim=1)                       # [M, H, C]
+            q = q.unsqueeze(0)                                      # [1, M, H, C]
+            k = k.unsqueeze(0)                                      # [1, M, H, C]
+            v = v.unsqueeze(0)                                      # [1, M, H, C]
+            mask = xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
+            out = xops.memory_efficient_attention(q, k, v, mask)[0] # [M, H, C]
+        elif ATTN == 'flash_attn':
+            cu_seqlens = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)], dim=0) \
+                        .to(qkv.device).int()
+            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv_feats, cu_seqlens, max(seq_lens)) # [M, H, C]
+    out = out[bwd_indices]      # [T, H, C]
+    if DEBUG:
+        qkv_coords = qkv_coords[bwd_indices]
+        assert torch.equal(qkv_coords, qkv.coords), "SparseWindowedScaledDotProductSelfAttention: coordinate mismatch"
+    return qkv.replace(out)

thirdparty/TRELLIS/trellis/modules/sparse/attention/windowed_attn.py ADDED Viewed

	@@ -0,0 +1,135 @@

+from typing import *
+import torch
+import math
+from .. import SparseTensor
+from .. import DEBUG, ATTN
+if ATTN == 'xformers':
+    import xformers.ops as xops
+elif ATTN == 'flash_attn':
+    import flash_attn
+else:
+    raise ValueError(f"Unknown attention module: {ATTN}")
+__all__ = [
+    'sparse_windowed_scaled_dot_product_self_attention',
+]
+def calc_window_partition(
+    tensor: SparseTensor,
+    window_size: Union[int, Tuple[int, ...]],
+    shift_window: Union[int, Tuple[int, ...]] = 0
+) -> Tuple[torch.Tensor, torch.Tensor, List[int], List[int]]:
+    """
+    Calculate serialization and partitioning for a set of coordinates.
+    Args:
+        tensor (SparseTensor): The input tensor.
+        window_size (int): The window size to use.
+        shift_window (Tuple[int, ...]): The shift of serialized coordinates.
+    Returns:
+        (torch.Tensor): Forwards indices.
+        (torch.Tensor): Backwards indices.
+        (List[int]): Sequence lengths.
+        (List[int]): Sequence batch indices.
+    """
+    DIM = tensor.coords.shape[1] - 1
+    shift_window = (shift_window,) * DIM if isinstance(shift_window, int) else shift_window
+    window_size = (window_size,) * DIM if isinstance(window_size, int) else window_size
+    shifted_coords = tensor.coords.clone().detach()
+    shifted_coords[:, 1:] += torch.tensor(shift_window, device=tensor.device, dtype=torch.int32).unsqueeze(0)
+    MAX_COORDS = shifted_coords[:, 1:].max(dim=0).values.tolist()
+    NUM_WINDOWS = [math.ceil((mc + 1) / ws) for mc, ws in zip(MAX_COORDS, window_size)]
+    OFFSET = torch.cumprod(torch.tensor([1] + NUM_WINDOWS[::-1]), dim=0).tolist()[::-1]
+    shifted_coords[:, 1:] //= torch.tensor(window_size, device=tensor.device, dtype=torch.int32).unsqueeze(0)
+    shifted_indices = (shifted_coords * torch.tensor(OFFSET, device=tensor.device, dtype=torch.int32).unsqueeze(0)).sum(dim=1)
+    fwd_indices = torch.argsort(shifted_indices)
+    bwd_indices = torch.empty_like(fwd_indices)
+    bwd_indices[fwd_indices] = torch.arange(fwd_indices.shape[0], device=tensor.device)
+    seq_lens = torch.bincount(shifted_indices)
+    seq_batch_indices = torch.arange(seq_lens.shape[0], device=tensor.device, dtype=torch.int32) // OFFSET[0]
+    mask = seq_lens != 0
+    seq_lens = seq_lens[mask].tolist()
+    seq_batch_indices = seq_batch_indices[mask].tolist()
+    return fwd_indices, bwd_indices, seq_lens, seq_batch_indices
+def sparse_windowed_scaled_dot_product_self_attention(
+    qkv: SparseTensor,
+    window_size: int,
+    shift_window: Tuple[int, int, int] = (0, 0, 0)
+) -> SparseTensor:
+    """
+    Apply windowed scaled dot product self attention to a sparse tensor.
+    Args:
+        qkv (SparseTensor): [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
+        window_size (int): The window size to use.
+        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.
+        shift (int): The shift to use.
+    """
+    assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"
+    serialization_spatial_cache_name = f'window_partition_{window_size}_{shift_window}'
+    serialization_spatial_cache = qkv.get_spatial_cache(serialization_spatial_cache_name)
+    if serialization_spatial_cache is None:
+        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = calc_window_partition(qkv, window_size, shift_window)
+        qkv.register_spatial_cache(serialization_spatial_cache_name, (fwd_indices, bwd_indices, seq_lens, seq_batch_indices))
+    else:
+        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = serialization_spatial_cache
+    M = fwd_indices.shape[0]
+    T = qkv.feats.shape[0]
+    H = qkv.feats.shape[2]
+    C = qkv.feats.shape[3]
+    qkv_feats = qkv.feats[fwd_indices]      # [M, 3, H, C]
+    if DEBUG:
+        start = 0
+        qkv_coords = qkv.coords[fwd_indices]
+        for i in range(len(seq_lens)):
+            seq_coords = qkv_coords[start:start+seq_lens[i]]
+            assert (seq_coords[:, 0] == seq_batch_indices[i]).all(), f"SparseWindowedScaledDotProductSelfAttention: batch index mismatch"
+            assert (seq_coords[:, 1:].max(dim=0).values - seq_coords[:, 1:].min(dim=0).values < window_size).all(), \
+                    f"SparseWindowedScaledDotProductSelfAttention: window size exceeded"
+            start += seq_lens[i]
+    if all([seq_len == window_size for seq_len in seq_lens]):
+        B = len(seq_lens)
+        N = window_size
+        qkv_feats = qkv_feats.reshape(B, N, 3, H, C)
+        if ATTN == 'xformers':
+            q, k, v = qkv_feats.unbind(dim=2)                       # [B, N, H, C]
+            out = xops.memory_efficient_attention(q, k, v)          # [B, N, H, C]
+        elif ATTN == 'flash_attn':
+            out = flash_attn.flash_attn_qkvpacked_func(qkv_feats)   # [B, N, H, C]
+        else:
+            raise ValueError(f"Unknown attention module: {ATTN}")
+        out = out.reshape(B * N, H, C)                              # [M, H, C]
+    else:
+        if ATTN == 'xformers':
+            q, k, v = qkv_feats.unbind(dim=1)                       # [M, H, C]
+            q = q.unsqueeze(0)                                      # [1, M, H, C]
+            k = k.unsqueeze(0)                                      # [1, M, H, C]
+            v = v.unsqueeze(0)                                      # [1, M, H, C]
+            mask = xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
+            out = xops.memory_efficient_attention(q, k, v, mask)[0] # [M, H, C]
+        elif ATTN == 'flash_attn':
+            cu_seqlens = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)], dim=0) \
+                        .to(qkv.device).int()
+            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv_feats, cu_seqlens, max(seq_lens)) # [M, H, C]
+    out = out[bwd_indices]      # [T, H, C]
+    if DEBUG:
+        qkv_coords = qkv_coords[bwd_indices]
+        assert torch.equal(qkv_coords, qkv.coords), "SparseWindowedScaledDotProductSelfAttention: coordinate mismatch"
+    return qkv.replace(out)

thirdparty/TRELLIS/trellis/modules/sparse/basic.py ADDED Viewed

	@@ -0,0 +1,459 @@

+from typing import *
+import torch
+import torch.nn as nn
+from . import BACKEND, DEBUG
+SparseTensorData = None # Lazy import
+__all__ = [
+    'SparseTensor',
+    'sparse_batch_broadcast',
+    'sparse_batch_op',
+    'sparse_cat',
+    'sparse_unbind',
+]
+class SparseTensor:
+    """
+    Sparse tensor with support for both torchsparse and spconv backends.
+    Parameters:
+    - feats (torch.Tensor): Features of the sparse tensor.
+    - coords (torch.Tensor): Coordinates of the sparse tensor.
+    - shape (torch.Size): Shape of the sparse tensor.
+    - layout (List[slice]): Layout of the sparse tensor for each batch
+    - data (SparseTensorData): Sparse tensor data used for convolusion
+    NOTE:
+    - Data corresponding to a same batch should be contiguous.
+    - Coords should be in [0, 1023]
+    """
+    @overload
+    def __init__(self, feats: torch.Tensor, coords: torch.Tensor, shape: Optional[torch.Size] = None, layout: Optional[List[slice]] = None, **kwargs): ...
+    @overload
+    def __init__(self, data, shape: Optional[torch.Size] = None, layout: Optional[List[slice]] = None, **kwargs): ...
+    def __init__(self, *args, **kwargs):
+        # Lazy import of sparse tensor backend
+        global SparseTensorData
+        if SparseTensorData is None:
+            import importlib
+            if BACKEND == 'torchsparse':
+                SparseTensorData = importlib.import_module('torchsparse').SparseTensor
+            elif BACKEND == 'spconv':
+                SparseTensorData = importlib.import_module('spconv.pytorch').SparseConvTensor
+        method_id = 0
+        if len(args) != 0:
+            method_id = 0 if isinstance(args[0], torch.Tensor) else 1
+        else:
+            method_id = 1 if 'data' in kwargs else 0
+        if method_id == 0:
+            feats, coords, shape, layout = args + (None,) * (4 - len(args))
+            if 'feats' in kwargs:
+                feats = kwargs['feats']
+                del kwargs['feats']
+            if 'coords' in kwargs:
+                coords = kwargs['coords']
+                del kwargs['coords']
+            if 'shape' in kwargs:
+                shape = kwargs['shape']
+                del kwargs['shape']
+            if 'layout' in kwargs:
+                layout = kwargs['layout']
+                del kwargs['layout']
+            if shape is None:
+                shape = self.__cal_shape(feats, coords)
+            if layout is None:
+                layout = self.__cal_layout(coords, shape[0])
+            if BACKEND == 'torchsparse':
+                self.data = SparseTensorData(feats, coords, **kwargs)
+            elif BACKEND == 'spconv':
+                spatial_shape = list(coords.max(0)[0] + 1)[1:]
+                self.data = SparseTensorData(feats.reshape(feats.shape[0], -1), coords, spatial_shape, shape[0], **kwargs)
+                self.data._features = feats
+        elif method_id == 1:
+            data, shape, layout = args + (None,) * (3 - len(args))
+            if 'data' in kwargs:
+                data = kwargs['data']
+                del kwargs['data']
+            if 'shape' in kwargs:
+                shape = kwargs['shape']
+                del kwargs['shape']
+            if 'layout' in kwargs:
+                layout = kwargs['layout']
+                del kwargs['layout']
+            self.data = data
+            if shape is None:
+                shape = self.__cal_shape(self.feats, self.coords)
+            if layout is None:
+                layout = self.__cal_layout(self.coords, shape[0])
+        self._shape = shape
+        self._layout = layout
+        self._scale = kwargs.get('scale', (1, 1, 1))
+        self._spatial_cache = kwargs.get('spatial_cache', {})
+        if DEBUG:
+            try:
+                assert self.feats.shape[0] == self.coords.shape[0], f"Invalid feats shape: {self.feats.shape}, coords shape: {self.coords.shape}"
+                assert self.shape == self.__cal_shape(self.feats, self.coords), f"Invalid shape: {self.shape}"
+                assert self.layout == self.__cal_layout(self.coords, self.shape[0]), f"Invalid layout: {self.layout}"
+                for i in range(self.shape[0]):
+                    assert torch.all(self.coords[self.layout[i], 0] == i), f"The data of batch {i} is not contiguous"
+            except Exception as e:
+                print('Debugging information:')
+                print(f"- Shape: {self.shape}")
+                print(f"- Layout: {self.layout}")
+                print(f"- Scale: {self._scale}")
+                print(f"- Coords: {self.coords}")
+                raise e
+    def __cal_shape(self, feats, coords):
+        shape = []
+        shape.append(coords[:, 0].max().item() + 1)
+        shape.extend([*feats.shape[1:]])
+        return torch.Size(shape)
+    def __cal_layout(self, coords, batch_size):
+        seq_len = torch.bincount(coords[:, 0], minlength=batch_size)
+        offset = torch.cumsum(seq_len, dim=0)
+        layout = [slice((offset[i] - seq_len[i]).item(), offset[i].item()) for i in range(batch_size)]
+        return layout
+    @property
+    def shape(self) -> torch.Size:
+        return self._shape
+    def dim(self) -> int:
+        return len(self.shape)
+    @property
+    def layout(self) -> List[slice]:
+        return self._layout
+    @property
+    def feats(self) -> torch.Tensor:
+        if BACKEND == 'torchsparse':
+            return self.data.F
+        elif BACKEND == 'spconv':
+            return self.data.features
+    @feats.setter
+    def feats(self, value: torch.Tensor):
+        if BACKEND == 'torchsparse':
+            self.data.F = value
+        elif BACKEND == 'spconv':
+            self.data.features = value
+    @property
+    def coords(self) -> torch.Tensor:
+        if BACKEND == 'torchsparse':
+            return self.data.C
+        elif BACKEND == 'spconv':
+            return self.data.indices
+    @coords.setter
+    def coords(self, value: torch.Tensor):
+        if BACKEND == 'torchsparse':
+            self.data.C = value
+        elif BACKEND == 'spconv':
+            self.data.indices = value
+    @property
+    def dtype(self):
+        return self.feats.dtype
+    @property
+    def device(self):
+        return self.feats.device
+    @overload
+    def to(self, dtype: torch.dtype) -> 'SparseTensor': ...
+    @overload
+    def to(self, device: Optional[Union[str, torch.device]] = None, dtype: Optional[torch.dtype] = None) -> 'SparseTensor': ...
+    def to(self, *args, **kwargs) -> 'SparseTensor':
+        device = None
+        dtype = None
+        if len(args) == 2:
+            device, dtype = args
+        elif len(args) == 1:
+            if isinstance(args[0], torch.dtype):
+                dtype = args[0]
+            else:
+                device = args[0]
+        if 'dtype' in kwargs:
+            assert dtype is None, "to() received multiple values for argument 'dtype'"
+            dtype = kwargs['dtype']
+        if 'device' in kwargs:
+            assert device is None, "to() received multiple values for argument 'device'"
+            device = kwargs['device']
+        new_feats = self.feats.to(device=device, dtype=dtype)
+        new_coords = self.coords.to(device=device)
+        return self.replace(new_feats, new_coords)
+    def type(self, dtype):
+        new_feats = self.feats.type(dtype)
+        return self.replace(new_feats)
+    def cpu(self) -> 'SparseTensor':
+        new_feats = self.feats.cpu()
+        new_coords = self.coords.cpu()
+        return self.replace(new_feats, new_coords)
+    def cuda(self) -> 'SparseTensor':
+        new_feats = self.feats.cuda()
+        new_coords = self.coords.cuda()
+        return self.replace(new_feats, new_coords)
+    def half(self) -> 'SparseTensor':
+        new_feats = self.feats.half()
+        return self.replace(new_feats)
+    def float(self) -> 'SparseTensor':
+        new_feats = self.feats.float()
+        return self.replace(new_feats)
+    def detach(self) -> 'SparseTensor':
+        new_coords = self.coords.detach()
+        new_feats = self.feats.detach()
+        return self.replace(new_feats, new_coords)
+    def dense(self) -> torch.Tensor:
+        if BACKEND == 'torchsparse':
+            return self.data.dense()
+        elif BACKEND == 'spconv':
+            return self.data.dense()
+    def reshape(self, *shape) -> 'SparseTensor':
+        new_feats = self.feats.reshape(self.feats.shape[0], *shape)
+        return self.replace(new_feats)
+    def unbind(self, dim: int) -> List['SparseTensor']:
+        return sparse_unbind(self, dim)
+    def replace(self, feats: torch.Tensor, coords: Optional[torch.Tensor] = None) -> 'SparseTensor':
+        new_shape = [self.shape[0]]
+        new_shape.extend(feats.shape[1:])
+        if BACKEND == 'torchsparse':
+            new_data = SparseTensorData(
+                feats=feats,
+                coords=self.data.coords if coords is None else coords,
+                stride=self.data.stride,
+                spatial_range=self.data.spatial_range,
+            )
+            new_data._caches = self.data._caches
+        elif BACKEND == 'spconv':
+            new_data = SparseTensorData(
+                self.data.features.reshape(self.data.features.shape[0], -1),
+                self.data.indices,
+                self.data.spatial_shape,
+                self.data.batch_size,
+                self.data.grid,
+                self.data.voxel_num,
+                self.data.indice_dict
+            )
+            new_data._features = feats
+            new_data.benchmark = self.data.benchmark
+            new_data.benchmark_record = self.data.benchmark_record
+            new_data.thrust_allocator = self.data.thrust_allocator
+            new_data._timer = self.data._timer
+            new_data.force_algo = self.data.force_algo
+            new_data.int8_scale = self.data.int8_scale
+            if coords is not None:
+                new_data.indices = coords
+        new_tensor = SparseTensor(new_data, shape=torch.Size(new_shape), layout=self.layout, scale=self._scale, spatial_cache=self._spatial_cache)
+        return new_tensor
+    @staticmethod
+    def full(aabb, dim, value, dtype=torch.float32, device=None) -> 'SparseTensor':
+        N, C = dim
+        x = torch.arange(aabb[0], aabb[3] + 1)
+        y = torch.arange(aabb[1], aabb[4] + 1)
+        z = torch.arange(aabb[2], aabb[5] + 1)
+        coords = torch.stack(torch.meshgrid(x, y, z, indexing='ij'), dim=-1).reshape(-1, 3)
+        coords = torch.cat([
+            torch.arange(N).view(-1, 1).repeat(1, coords.shape[0]).view(-1, 1),
+            coords.repeat(N, 1),
+        ], dim=1).to(dtype=torch.int32, device=device)
+        feats = torch.full((coords.shape[0], C), value, dtype=dtype, device=device)
+        return SparseTensor(feats=feats, coords=coords)
+    def __merge_sparse_cache(self, other: 'SparseTensor') -> dict:
+        new_cache = {}
+        for k in set(list(self._spatial_cache.keys()) + list(other._spatial_cache.keys())):
+            if k in self._spatial_cache:
+                new_cache[k] = self._spatial_cache[k]
+            if k in other._spatial_cache:
+                if k not in new_cache:
+                    new_cache[k] = other._spatial_cache[k]
+                else:
+                    new_cache[k].update(other._spatial_cache[k])
+        return new_cache
+    def __neg__(self) -> 'SparseTensor':
+        return self.replace(-self.feats)
+    def __elemwise__(self, other: Union[torch.Tensor, 'SparseTensor'], op: callable) -> 'SparseTensor':
+        if isinstance(other, torch.Tensor):
+            try:
+                other = torch.broadcast_to(other, self.shape)
+                other = sparse_batch_broadcast(self, other)
+            except:
+                pass
+        if isinstance(other, SparseTensor):
+            other = other.feats
+        new_feats = op(self.feats, other)
+        new_tensor = self.replace(new_feats)
+        if isinstance(other, SparseTensor):
+            new_tensor._spatial_cache = self.__merge_sparse_cache(other)
+        return new_tensor
+    def __add__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
+        return self.__elemwise__(other, torch.add)
+    def __radd__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
+        return self.__elemwise__(other, torch.add)
+    def __sub__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
+        return self.__elemwise__(other, torch.sub)
+    def __rsub__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
+        return self.__elemwise__(other, lambda x, y: torch.sub(y, x))
+    def __mul__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
+        return self.__elemwise__(other, torch.mul)
+    def __rmul__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
+        return self.__elemwise__(other, torch.mul)
+    def __truediv__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
+        return self.__elemwise__(other, torch.div)
+    def __rtruediv__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
+        return self.__elemwise__(other, lambda x, y: torch.div(y, x))
+    def __getitem__(self, idx):
+        if isinstance(idx, int):
+            idx = [idx]
+        elif isinstance(idx, slice):
+            idx = range(*idx.indices(self.shape[0]))
+        elif isinstance(idx, torch.Tensor):
+            if idx.dtype == torch.bool:
+                assert idx.shape == (self.shape[0],), f"Invalid index shape: {idx.shape}"
+                idx = idx.nonzero().squeeze(1)
+            elif idx.dtype in [torch.int32, torch.int64]:
+                assert len(idx.shape) == 1, f"Invalid index shape: {idx.shape}"
+            else:
+                raise ValueError(f"Unknown index type: {idx.dtype}")
+        else:
+            raise ValueError(f"Unknown index type: {type(idx)}")
+        coords = []
+        feats = []
+        for new_idx, old_idx in enumerate(idx):
+            coords.append(self.coords[self.layout[old_idx]].clone())
+            coords[-1][:, 0] = new_idx
+            feats.append(self.feats[self.layout[old_idx]])
+        coords = torch.cat(coords, dim=0).contiguous()
+        feats = torch.cat(feats, dim=0).contiguous()
+        return SparseTensor(feats=feats, coords=coords)
+    def register_spatial_cache(self, key, value) -> None:
+        """
+        Register a spatial cache.
+        The spatial cache can be any thing you want to cache.
+        The registery and retrieval of the cache is based on current scale.
+        """
+        scale_key = str(self._scale)
+        if scale_key not in self._spatial_cache:
+            self._spatial_cache[scale_key] = {}
+        self._spatial_cache[scale_key][key] = value
+    def get_spatial_cache(self, key=None):
+        """
+        Get a spatial cache.
+        """
+        scale_key = str(self._scale)
+        cur_scale_cache = self._spatial_cache.get(scale_key, {})
+        if key is None:
+            return cur_scale_cache
+        return cur_scale_cache.get(key, None)
+def sparse_batch_broadcast(input: SparseTensor, other: torch.Tensor) -> torch.Tensor:
+    """
+    Broadcast a 1D tensor to a sparse tensor along the batch dimension then perform an operation.
+    Args:
+        input (torch.Tensor): 1D tensor to broadcast.
+        target (SparseTensor): Sparse tensor to broadcast to.
+        op (callable): Operation to perform after broadcasting. Defaults to torch.add.
+    """
+    coords, feats = input.coords, input.feats
+    broadcasted = torch.zeros_like(feats)
+    for k in range(input.shape[0]):
+        broadcasted[input.layout[k]] = other[k]
+    return broadcasted
+def sparse_batch_op(input: SparseTensor, other: torch.Tensor, op: callable = torch.add) -> SparseTensor:
+    """
+    Broadcast a 1D tensor to a sparse tensor along the batch dimension then perform an operation.
+    Args:
+        input (torch.Tensor): 1D tensor to broadcast.
+        target (SparseTensor): Sparse tensor to broadcast to.
+        op (callable): Operation to perform after broadcasting. Defaults to torch.add.
+    """
+    return input.replace(op(input.feats, sparse_batch_broadcast(input, other)))
+def sparse_cat(inputs: List[SparseTensor], dim: int = 0) -> SparseTensor:
+    """
+    Concatenate a list of sparse tensors.
+    Args:
+        inputs (List[SparseTensor]): List of sparse tensors to concatenate.
+    """
+    if dim == 0:
+        start = 0
+        coords = []
+        for input in inputs:
+            coords.append(input.coords.clone())
+            coords[-1][:, 0] += start
+            start += input.shape[0]
+        coords = torch.cat(coords, dim=0)
+        feats = torch.cat([input.feats for input in inputs], dim=0)
+        output = SparseTensor(
+            coords=coords,
+            feats=feats,
+        )
+    else:
+        feats = torch.cat([input.feats for input in inputs], dim=dim)
+        output = inputs[0].replace(feats)
+    return output
+def sparse_unbind(input: SparseTensor, dim: int) -> List[SparseTensor]:
+    """
+    Unbind a sparse tensor along a dimension.
+    Args:
+        input (SparseTensor): Sparse tensor to unbind.
+        dim (int): Dimension to unbind.
+    """
+    if dim == 0:
+        return [input[i] for i in range(input.shape[0])]
+    else:
+        feats = input.feats.unbind(dim)
+        return [input.replace(f) for f in feats]

thirdparty/TRELLIS/trellis/modules/sparse/conv/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from .. import BACKEND
+SPCONV_ALGO = 'auto'    # 'auto', 'implicit_gemm', 'native'
+def __from_env():
+    import os
+    global SPCONV_ALGO
+    env_spconv_algo = os.environ.get('SPCONV_ALGO')
+    if env_spconv_algo is not None and env_spconv_algo in ['auto', 'implicit_gemm', 'native']:
+        SPCONV_ALGO = env_spconv_algo
+    print(f"[SPARSE][CONV] spconv algo: {SPCONV_ALGO}")
+__from_env()
+if BACKEND == 'torchsparse':
+    from .conv_torchsparse import *
+elif BACKEND == 'spconv':
+    from .conv_spconv import *

thirdparty/TRELLIS/trellis/modules/sparse/conv/conv_spconv.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import torch
+import torch.nn as nn
+from .. import SparseTensor
+from .. import DEBUG
+from . import SPCONV_ALGO
+class SparseConv3d(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, padding=None, bias=True, indice_key=None):
+        super(SparseConv3d, self).__init__()
+        if 'spconv' not in globals():
+            import spconv.pytorch as spconv
+        algo = None
+        if SPCONV_ALGO == 'native':
+            algo = spconv.ConvAlgo.Native
+        elif SPCONV_ALGO == 'implicit_gemm':
+            algo = spconv.ConvAlgo.MaskImplicitGemm
+        if stride == 1 and (padding is None):
+            self.conv = spconv.SubMConv3d(in_channels, out_channels, kernel_size, dilation=dilation, bias=bias, indice_key=indice_key, algo=algo)
+        else:
+            self.conv = spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=stride, dilation=dilation, padding=padding, bias=bias, indice_key=indice_key, algo=algo)
+        self.stride = tuple(stride) if isinstance(stride, (list, tuple)) else (stride, stride, stride)
+        self.padding = padding
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        spatial_changed = any(s != 1 for s in self.stride) or (self.padding is not None)
+        new_data = self.conv(x.data)
+        new_shape = [x.shape[0], self.conv.out_channels]
+        new_layout = None if spatial_changed else x.layout
+        if spatial_changed and (x.shape[0] != 1):
+            # spconv was non-1 stride will break the contiguous of the output tensor, sort by the coords
+            fwd = new_data.indices[:, 0].argsort()
+            bwd = torch.zeros_like(fwd).scatter_(0, fwd, torch.arange(fwd.shape[0], device=fwd.device))
+            sorted_feats = new_data.features[fwd]
+            sorted_coords = new_data.indices[fwd]
+            unsorted_data = new_data
+            new_data = spconv.SparseConvTensor(sorted_feats, sorted_coords, unsorted_data.spatial_shape, unsorted_data.batch_size)  # type: ignore
+        out = SparseTensor(
+            new_data, shape=torch.Size(new_shape), layout=new_layout,
+            scale=tuple([s * stride for s, stride in zip(x._scale, self.stride)]),
+            spatial_cache=x._spatial_cache,
+        )
+        if spatial_changed and (x.shape[0] != 1):
+            out.register_spatial_cache(f'conv_{self.stride}_unsorted_data', unsorted_data)
+            out.register_spatial_cache(f'conv_{self.stride}_sort_bwd', bwd)
+        return out
+class SparseInverseConv3d(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=True, indice_key=None):
+        super(SparseInverseConv3d, self).__init__()
+        if 'spconv' not in globals():
+            import spconv.pytorch as spconv
+        self.conv = spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, bias=bias, indice_key=indice_key)
+        self.stride = tuple(stride) if isinstance(stride, (list, tuple)) else (stride, stride, stride)
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        spatial_changed = any(s != 1 for s in self.stride)
+        if spatial_changed:
+            # recover the original spconv order
+            data = x.get_spatial_cache(f'conv_{self.stride}_unsorted_data')
+            bwd = x.get_spatial_cache(f'conv_{self.stride}_sort_bwd')
+            data = data.replace_feature(x.feats[bwd])
+            if DEBUG:
+                assert torch.equal(data.indices, x.coords[bwd]), 'Recover the original order failed'
+        else:
+            data = x.data
+        new_data = self.conv(data)
+        new_shape = [x.shape[0], self.conv.out_channels]
+        new_layout = None if spatial_changed else x.layout
+        out = SparseTensor(
+            new_data, shape=torch.Size(new_shape), layout=new_layout,
+            scale=tuple([s // stride for s, stride in zip(x._scale, self.stride)]),
+            spatial_cache=x._spatial_cache,
+        )
+        return out

thirdparty/TRELLIS/trellis/modules/sparse/conv/conv_torchsparse.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import torch
+import torch.nn as nn
+from .. import SparseTensor
+class SparseConv3d(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=True, indice_key=None):
+        super(SparseConv3d, self).__init__()
+        if 'torchsparse' not in globals():
+            import torchsparse
+        self.conv = torchsparse.nn.Conv3d(in_channels, out_channels, kernel_size, stride, 0, dilation, bias)
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        out = self.conv(x.data)
+        new_shape = [x.shape[0], self.conv.out_channels]
+        out = SparseTensor(out, shape=torch.Size(new_shape), layout=x.layout if all(s == 1 for s in self.conv.stride) else None)
+        out._spatial_cache = x._spatial_cache
+        out._scale = tuple([s * stride for s, stride in zip(x._scale, self.conv.stride)])
+        return out
+class SparseInverseConv3d(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=True, indice_key=None):
+        super(SparseInverseConv3d, self).__init__()
+        if 'torchsparse' not in globals():
+            import torchsparse
+        self.conv = torchsparse.nn.Conv3d(in_channels, out_channels, kernel_size, stride, 0, dilation, bias, transposed=True)
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        out = self.conv(x.data)
+        new_shape = [x.shape[0], self.conv.out_channels]
+        out = SparseTensor(out, shape=torch.Size(new_shape), layout=x.layout if all(s == 1 for s in self.conv.stride) else None)
+        out._spatial_cache = x._spatial_cache
+        out._scale = tuple([s // stride for s, stride in zip(x._scale, self.conv.stride)])
+        return out

thirdparty/TRELLIS/trellis/modules/sparse/linear.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import torch
+import torch.nn as nn
+from . import SparseTensor
+__all__ = [
+    'SparseLinear'
+]
+class SparseLinear(nn.Linear):
+    def __init__(self, in_features, out_features, bias=True):
+        super(SparseLinear, self).__init__(in_features, out_features, bias)
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        return input.replace(super().forward(input.feats))

thirdparty/TRELLIS/trellis/modules/sparse/nonlinearity.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import torch
+import torch.nn as nn
+from . import SparseTensor
+__all__ = [
+    'SparseReLU',
+    'SparseSiLU',
+    'SparseGELU',
+    'SparseActivation'
+]
+class SparseReLU(nn.ReLU):
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        return input.replace(super().forward(input.feats))
+class SparseSiLU(nn.SiLU):
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        return input.replace(super().forward(input.feats))
+class SparseGELU(nn.GELU):
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        return input.replace(super().forward(input.feats))
+class SparseActivation(nn.Module):
+    def __init__(self, activation: nn.Module):
+        super().__init__()
+        self.activation = activation
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        return input.replace(self.activation(input.feats))

thirdparty/TRELLIS/trellis/modules/sparse/norm.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import torch
+import torch.nn as nn
+from . import SparseTensor
+from . import DEBUG
+__all__ = [
+    'SparseGroupNorm',
+    'SparseLayerNorm',
+    'SparseGroupNorm32',
+    'SparseLayerNorm32',
+]
+class SparseGroupNorm(nn.GroupNorm):
+    def __init__(self, num_groups, num_channels, eps=1e-5, affine=True):
+        super(SparseGroupNorm, self).__init__(num_groups, num_channels, eps, affine)
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        nfeats = torch.zeros_like(input.feats)
+        for k in range(input.shape[0]):
+            if DEBUG:
+                assert (input.coords[input.layout[k], 0] == k).all(), f"SparseGroupNorm: batch index mismatch"
+            bfeats = input.feats[input.layout[k]]
+            bfeats = bfeats.permute(1, 0).reshape(1, input.shape[1], -1)
+            bfeats = super().forward(bfeats)
+            bfeats = bfeats.reshape(input.shape[1], -1).permute(1, 0)
+            nfeats[input.layout[k]] = bfeats
+        return input.replace(nfeats)
+class SparseLayerNorm(nn.LayerNorm):
+    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
+        super(SparseLayerNorm, self).__init__(normalized_shape, eps, elementwise_affine)
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        nfeats = torch.zeros_like(input.feats)
+        for k in range(input.shape[0]):
+            bfeats = input.feats[input.layout[k]]
+            bfeats = bfeats.permute(1, 0).reshape(1, input.shape[1], -1)
+            bfeats = super().forward(bfeats)
+            bfeats = bfeats.reshape(input.shape[1], -1).permute(1, 0)
+            nfeats[input.layout[k]] = bfeats
+        return input.replace(nfeats)
+class SparseGroupNorm32(SparseGroupNorm):
+    """
+    A GroupNorm layer that converts to float32 before the forward pass.
+    """
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        return super().forward(x.float()).type(x.dtype)
+class SparseLayerNorm32(SparseLayerNorm):
+    """
+    A LayerNorm layer that converts to float32 before the forward pass.
+    """
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        return super().forward(x.float()).type(x.dtype)

thirdparty/TRELLIS/trellis/modules/sparse/spatial.py ADDED Viewed

	@@ -0,0 +1,110 @@

+from typing import *
+import torch
+import torch.nn as nn
+from . import SparseTensor
+__all__ = [
+    'SparseDownsample',
+    'SparseUpsample',
+    'SparseSubdivide'
+]
+class SparseDownsample(nn.Module):
+    """
+    Downsample a sparse tensor by a factor of `factor`.
+    Implemented as average pooling.
+    """
+    def __init__(self, factor: Union[int, Tuple[int, ...], List[int]]):
+        super(SparseDownsample, self).__init__()
+        self.factor = tuple(factor) if isinstance(factor, (list, tuple)) else factor
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        DIM = input.coords.shape[-1] - 1
+        factor = self.factor if isinstance(self.factor, tuple) else (self.factor,) * DIM
+        assert DIM == len(factor), 'Input coordinates must have the same dimension as the downsample factor.'
+        coord = list(input.coords.unbind(dim=-1))
+        for i, f in enumerate(factor):
+            coord[i+1] = coord[i+1] // f
+        MAX = [coord[i+1].max().item() + 1 for i in range(DIM)]
+        OFFSET = torch.cumprod(torch.tensor(MAX[::-1]), 0).tolist()[::-1] + [1]
+        code = sum([c * o for c, o in zip(coord, OFFSET)])
+        code, idx = code.unique(return_inverse=True)
+        new_feats = torch.scatter_reduce(
+            torch.zeros(code.shape[0], input.feats.shape[1], device=input.feats.device, dtype=input.feats.dtype),
+            dim=0,
+            index=idx.unsqueeze(1).expand(-1, input.feats.shape[1]),
+            src=input.feats,
+            reduce='mean'
+        )
+        new_coords = torch.stack(
+            [code // OFFSET[0]] +
+            [(code // OFFSET[i+1]) % MAX[i] for i in range(DIM)],
+            dim=-1
+        )
+        out = SparseTensor(new_feats, new_coords, input.shape,)
+        out._scale = tuple([s // f for s, f in zip(input._scale, factor)])
+        out._spatial_cache = input._spatial_cache
+        out.register_spatial_cache(f'upsample_{factor}_coords', input.coords)
+        out.register_spatial_cache(f'upsample_{factor}_layout', input.layout)
+        out.register_spatial_cache(f'upsample_{factor}_idx', idx)
+        return out
+class SparseUpsample(nn.Module):
+    """
+    Upsample a sparse tensor by a factor of `factor`.
+    Implemented as nearest neighbor interpolation.
+    """
+    def __init__(self, factor: Union[int, Tuple[int, int, int], List[int]]):
+        super(SparseUpsample, self).__init__()
+        self.factor = tuple(factor) if isinstance(factor, (list, tuple)) else factor
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        DIM = input.coords.shape[-1] - 1
+        factor = self.factor if isinstance(self.factor, tuple) else (self.factor,) * DIM
+        assert DIM == len(factor), 'Input coordinates must have the same dimension as the upsample factor.'
+        new_coords = input.get_spatial_cache(f'upsample_{factor}_coords')
+        new_layout = input.get_spatial_cache(f'upsample_{factor}_layout')
+        idx = input.get_spatial_cache(f'upsample_{factor}_idx')
+        if any([x is None for x in [new_coords, new_layout, idx]]):
+            raise ValueError('Upsample cache not found. SparseUpsample must be paired with SparseDownsample.')
+        new_feats = input.feats[idx]
+        out = SparseTensor(new_feats, new_coords, input.shape, new_layout)
+        out._scale = tuple([s * f for s, f in zip(input._scale, factor)])
+        out._spatial_cache = input._spatial_cache
+        return out
+class SparseSubdivide(nn.Module):
+    """
+    Upsample a sparse tensor by a factor of `factor`.
+    Implemented as nearest neighbor interpolation.
+    """
+    def __init__(self):
+        super(SparseSubdivide, self).__init__()
+    def forward(self, input: SparseTensor) -> SparseTensor:
+        DIM = input.coords.shape[-1] - 1
+        # upsample scale=2^DIM
+        n_cube = torch.ones([2] * DIM, device=input.device, dtype=torch.int)
+        n_coords = torch.nonzero(n_cube)
+        n_coords = torch.cat([torch.zeros_like(n_coords[:, :1]), n_coords], dim=-1)
+        factor = n_coords.shape[0]
+        assert factor == 2 ** DIM
+        # print(n_coords.shape)
+        new_coords = input.coords.clone()
+        new_coords[:, 1:] *= 2
+        new_coords = new_coords.unsqueeze(1) + n_coords.unsqueeze(0).to(new_coords.dtype)
+        new_feats = input.feats.unsqueeze(1).expand(input.feats.shape[0], factor, *input.feats.shape[1:])
+        out = SparseTensor(new_feats.flatten(0, 1), new_coords.flatten(0, 1), input.shape)
+        out._scale = input._scale * 2
+        out._spatial_cache = input._spatial_cache
+        return out

thirdparty/TRELLIS/trellis/modules/sparse/transformer/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .blocks import *
2	+ from .modulated import *

thirdparty/TRELLIS/trellis/modules/sparse/transformer/blocks.py ADDED Viewed

	@@ -0,0 +1,151 @@

+from typing import *
+import torch
+import torch.nn as nn
+from ..basic import SparseTensor
+from ..linear import SparseLinear
+from ..nonlinearity import SparseGELU
+from ..attention import SparseMultiHeadAttention, SerializeMode
+from ...norm import LayerNorm32
+class SparseFeedForwardNet(nn.Module):
+    def __init__(self, channels: int, mlp_ratio: float = 4.0):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            SparseLinear(channels, int(channels * mlp_ratio)),
+            SparseGELU(approximate="tanh"),
+            SparseLinear(int(channels * mlp_ratio), channels),
+        )
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        return self.mlp(x)
+class SparseTransformerBlock(nn.Module):
+    """
+    Sparse Transformer block (MSA + FFN).
+    """
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
+        window_size: Optional[int] = None,
+        shift_sequence: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        serialize_mode: Optional[SerializeMode] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+        qkv_bias: bool = True,
+        ln_affine: bool = False,
+    ):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.attn = SparseMultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_sequence=shift_sequence,
+            shift_window=shift_window,
+            serialize_mode=serialize_mode,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.mlp = SparseFeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+    def _forward(self, x: SparseTensor) -> SparseTensor:
+        h = x.replace(self.norm1(x.feats))
+        h = self.attn(h)
+        x = x + h
+        h = x.replace(self.norm2(x.feats))
+        h = self.mlp(h)
+        x = x + h
+        return x
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, use_reentrant=False)
+        else:
+            return self._forward(x)
+class SparseTransformerCrossBlock(nn.Module):
+    """
+    Sparse Transformer cross-attention block (MSA + MCA + FFN).
+    """
+    def __init__(
+        self,
+        channels: int,
+        ctx_channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
+        window_size: Optional[int] = None,
+        shift_sequence: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        serialize_mode: Optional[SerializeMode] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+        qkv_bias: bool = True,
+        ln_affine: bool = False,
+    ):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.norm3 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.self_attn = SparseMultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            type="self",
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_sequence=shift_sequence,
+            shift_window=shift_window,
+            serialize_mode=serialize_mode,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.cross_attn = SparseMultiHeadAttention(
+            channels,
+            ctx_channels=ctx_channels,
+            num_heads=num_heads,
+            type="cross",
+            attn_mode="full",
+            qkv_bias=qkv_bias,
+            qk_rms_norm=qk_rms_norm_cross,
+        )
+        self.mlp = SparseFeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+    def _forward(self, x: SparseTensor, mod: torch.Tensor, context: torch.Tensor):
+        h = x.replace(self.norm1(x.feats))
+        h = self.self_attn(h)
+        x = x + h
+        h = x.replace(self.norm2(x.feats))
+        h = self.cross_attn(h, context)
+        x = x + h
+        h = x.replace(self.norm3(x.feats))
+        h = self.mlp(h)
+        x = x + h
+        return x
+    def forward(self, x: SparseTensor, context: torch.Tensor):
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, context, use_reentrant=False)
+        else:
+            return self._forward(x, context)

thirdparty/TRELLIS/trellis/modules/sparse/transformer/modulated.py ADDED Viewed

	@@ -0,0 +1,166 @@

+from typing import *
+import torch
+import torch.nn as nn
+from ..basic import SparseTensor
+from ..attention import SparseMultiHeadAttention, SerializeMode
+from ...norm import LayerNorm32
+from .blocks import SparseFeedForwardNet
+class ModulatedSparseTransformerBlock(nn.Module):
+    """
+    Sparse Transformer block (MSA + FFN) with adaptive layer norm conditioning.
+    """
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
+        window_size: Optional[int] = None,
+        shift_sequence: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        serialize_mode: Optional[SerializeMode] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+        qkv_bias: bool = True,
+        share_mod: bool = False,
+    ):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.norm2 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.attn = SparseMultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_sequence=shift_sequence,
+            shift_window=shift_window,
+            serialize_mode=serialize_mode,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.mlp = SparseFeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+        if not share_mod:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(channels, 6 * channels, bias=True)
+            )
+    def _forward(self, x: SparseTensor, mod: torch.Tensor) -> SparseTensor:
+        if self.share_mod:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
+        else:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
+        h = x.replace(self.norm1(x.feats))
+        h = h * (1 + scale_msa) + shift_msa
+        h = self.attn(h)
+        h = h * gate_msa
+        x = x + h
+        h = x.replace(self.norm2(x.feats))
+        h = h * (1 + scale_mlp) + shift_mlp
+        h = self.mlp(h)
+        h = h * gate_mlp
+        x = x + h
+        return x
+    def forward(self, x: SparseTensor, mod: torch.Tensor) -> SparseTensor:
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, use_reentrant=False)
+        else:
+            return self._forward(x, mod)
+class ModulatedSparseTransformerCrossBlock(nn.Module):
+    """
+    Sparse Transformer cross-attention block (MSA + MCA + FFN) with adaptive layer norm conditioning.
+    """
+    def __init__(
+        self,
+        channels: int,
+        ctx_channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
+        window_size: Optional[int] = None,
+        shift_sequence: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        serialize_mode: Optional[SerializeMode] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+        qkv_bias: bool = True,
+        share_mod: bool = False,
+    ):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
+        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.self_attn = SparseMultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            type="self",
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_sequence=shift_sequence,
+            shift_window=shift_window,
+            serialize_mode=serialize_mode,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.cross_attn = SparseMultiHeadAttention(
+            channels,
+            ctx_channels=ctx_channels,
+            num_heads=num_heads,
+            type="cross",
+            attn_mode="full",
+            qkv_bias=qkv_bias,
+            qk_rms_norm=qk_rms_norm_cross,
+        )
+        self.mlp = SparseFeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+        if not share_mod:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(channels, 6 * channels, bias=True)
+            )
+    def _forward(self, x: SparseTensor, mod: torch.Tensor, context: torch.Tensor) -> SparseTensor:
+        if self.share_mod:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
+        else:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
+        h = x.replace(self.norm1(x.feats))
+        h = h * (1 + scale_msa) + shift_msa
+        h = self.self_attn(h)
+        h = h * gate_msa
+        x = x + h
+        h = x.replace(self.norm2(x.feats))
+        h = self.cross_attn(h, context)
+        x = x + h
+        h = x.replace(self.norm3(x.feats))
+        h = h * (1 + scale_mlp) + shift_mlp
+        h = self.mlp(h)
+        h = h * gate_mlp
+        x = x + h
+        return x
+    def forward(self, x: SparseTensor, mod: torch.Tensor, context: torch.Tensor) -> SparseTensor:
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, context, use_reentrant=False)
+        else:
+            return self._forward(x, mod, context)

thirdparty/TRELLIS/trellis/modules/spatial.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import torch
+def pixel_shuffle_3d(x: torch.Tensor, scale_factor: int) -> torch.Tensor:
+    """
+    3D pixel shuffle.
+    """
+    B, C, H, W, D = x.shape
+    C_ = C // scale_factor**3
+    x = x.reshape(B, C_, scale_factor, scale_factor, scale_factor, H, W, D)
+    x = x.permute(0, 1, 5, 2, 6, 3, 7, 4)
+    x = x.reshape(B, C_, H*scale_factor, W*scale_factor, D*scale_factor)
+    return x
+def patchify(x: torch.Tensor, patch_size: int):
+    """
+    Patchify a tensor.
+    Args:
+        x (torch.Tensor): (N, C, *spatial) tensor
+        patch_size (int): Patch size
+    """
+    DIM = x.dim() - 2
+    for d in range(2, DIM + 2):
+        assert x.shape[d] % patch_size == 0, f"Dimension {d} of input tensor must be divisible by patch size, got {x.shape[d]} and {patch_size}"
+    x = x.reshape(*x.shape[:2], *sum([[x.shape[d] // patch_size, patch_size] for d in range(2, DIM + 2)], []))
+    x = x.permute(0, 1, *([2 * i + 3 for i in range(DIM)] + [2 * i + 2 for i in range(DIM)]))
+    x = x.reshape(x.shape[0], x.shape[1] * (patch_size ** DIM), *(x.shape[-DIM:]))
+    return x
+def unpatchify(x: torch.Tensor, patch_size: int):
+    """
+    Unpatchify a tensor.
+    Args:
+        x (torch.Tensor): (N, C, *spatial) tensor
+        patch_size (int): Patch size
+    """
+    DIM = x.dim() - 2
+    assert x.shape[1] % (patch_size ** DIM) == 0, f"Second dimension of input tensor must be divisible by patch size to unpatchify, got {x.shape[1]} and {patch_size ** DIM}"
+    x = x.reshape(x.shape[0], x.shape[1] // (patch_size ** DIM), *([patch_size] * DIM), *(x.shape[-DIM:]))
+    x = x.permute(0, 1, *(sum([[2 + DIM + i, 2 + i] for i in range(DIM)], [])))
+    x = x.reshape(x.shape[0], x.shape[1], *[x.shape[2 + 2 * i] * patch_size for i in range(DIM)])
+    return x

thirdparty/TRELLIS/trellis/modules/transformer/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .blocks import *
2	+ from .modulated import *

thirdparty/TRELLIS/trellis/modules/transformer/blocks.py ADDED Viewed

	@@ -0,0 +1,182 @@

+from typing import *
+import torch
+import torch.nn as nn
+from ..attention import MultiHeadAttention
+from ..norm import LayerNorm32
+class AbsolutePositionEmbedder(nn.Module):
+    """
+    Embeds spatial positions into vector representations.
+    """
+    def __init__(self, channels: int, in_channels: int = 3):
+        super().__init__()
+        self.channels = channels
+        self.in_channels = in_channels
+        self.freq_dim = channels // in_channels // 2
+        self.freqs = torch.arange(self.freq_dim, dtype=torch.float32) / self.freq_dim
+        self.freqs = 1.0 / (10000 ** self.freqs)
+    def _sin_cos_embedding(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Create sinusoidal position embeddings.
+        Args:
+            x: a 1-D Tensor of N indices
+        Returns:
+            an (N, D) Tensor of positional embeddings.
+        """
+        self.freqs = self.freqs.to(x.device)
+        out = torch.outer(x, self.freqs)
+        out = torch.cat([torch.sin(out), torch.cos(out)], dim=-1)
+        return out
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x (torch.Tensor): (N, D) tensor of spatial positions
+        """
+        N, D = x.shape
+        assert D == self.in_channels, "Input dimension must match number of input channels"
+        embed = self._sin_cos_embedding(x.reshape(-1))
+        embed = embed.reshape(N, -1)
+        if embed.shape[1] < self.channels:
+            embed = torch.cat([embed, torch.zeros(N, self.channels - embed.shape[1], device=embed.device)], dim=-1)
+        return embed
+class FeedForwardNet(nn.Module):
+    def __init__(self, channels: int, mlp_ratio: float = 4.0):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(channels, int(channels * mlp_ratio)),
+            nn.GELU(approximate="tanh"),
+            nn.Linear(int(channels * mlp_ratio), channels),
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.mlp(x)
+class TransformerBlock(nn.Module):
+    """
+    Transformer block (MSA + FFN).
+    """
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[int] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+        qkv_bias: bool = True,
+        ln_affine: bool = False,
+    ):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.attn = MultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_window=shift_window,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.mlp = FeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = self.norm1(x)
+        h = self.attn(h)
+        x = x + h
+        h = self.norm2(x)
+        h = self.mlp(h)
+        x = x + h
+        return x
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, use_reentrant=False)
+        else:
+            return self._forward(x)
+class TransformerCrossBlock(nn.Module):
+    """
+    Transformer cross-attention block (MSA + MCA + FFN).
+    """
+    def __init__(
+        self,
+        channels: int,
+        ctx_channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+        qkv_bias: bool = True,
+        ln_affine: bool = False,
+    ):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.norm3 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
+        self.self_attn = MultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            type="self",
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_window=shift_window,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.cross_attn = MultiHeadAttention(
+            channels,
+            ctx_channels=ctx_channels,
+            num_heads=num_heads,
+            type="cross",
+            attn_mode="full",
+            qkv_bias=qkv_bias,
+            qk_rms_norm=qk_rms_norm_cross,
+        )
+        self.mlp = FeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+    def _forward(self, x: torch.Tensor, context: torch.Tensor):
+        h = self.norm1(x)
+        h = self.self_attn(h)
+        x = x + h
+        h = self.norm2(x)
+        h = self.cross_attn(h, context)
+        x = x + h
+        h = self.norm3(x)
+        h = self.mlp(h)
+        x = x + h
+        return x
+    def forward(self, x: torch.Tensor, context: torch.Tensor):
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, context, use_reentrant=False)
+        else:
+            return self._forward(x, context)

thirdparty/TRELLIS/trellis/modules/transformer/modulated.py ADDED Viewed

	@@ -0,0 +1,157 @@

+from typing import *
+import torch
+import torch.nn as nn
+from ..attention import MultiHeadAttention
+from ..norm import LayerNorm32
+from .blocks import FeedForwardNet
+class ModulatedTransformerBlock(nn.Module):
+    """
+    Transformer block (MSA + FFN) with adaptive layer norm conditioning.
+    """
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+        qkv_bias: bool = True,
+        share_mod: bool = False,
+    ):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.norm2 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.attn = MultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_window=shift_window,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.mlp = FeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+        if not share_mod:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(channels, 6 * channels, bias=True)
+            )
+    def _forward(self, x: torch.Tensor, mod: torch.Tensor) -> torch.Tensor:
+        if self.share_mod:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
+        else:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
+        h = self.norm1(x)
+        h = h * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
+        h = self.attn(h)
+        h = h * gate_msa.unsqueeze(1)
+        x = x + h
+        h = self.norm2(x)
+        h = h * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
+        h = self.mlp(h)
+        h = h * gate_mlp.unsqueeze(1)
+        x = x + h
+        return x
+    def forward(self, x: torch.Tensor, mod: torch.Tensor) -> torch.Tensor:
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, use_reentrant=False)
+        else:
+            return self._forward(x, mod)
+class ModulatedTransformerCrossBlock(nn.Module):
+    """
+    Transformer cross-attention block (MSA + MCA + FFN) with adaptive layer norm conditioning.
+    """
+    def __init__(
+        self,
+        channels: int,
+        ctx_channels: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        use_checkpoint: bool = False,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+        qkv_bias: bool = True,
+        share_mod: bool = False,
+    ):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
+        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
+        self.self_attn = MultiHeadAttention(
+            channels,
+            num_heads=num_heads,
+            type="self",
+            attn_mode=attn_mode,
+            window_size=window_size,
+            shift_window=shift_window,
+            qkv_bias=qkv_bias,
+            use_rope=use_rope,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.cross_attn = MultiHeadAttention(
+            channels,
+            ctx_channels=ctx_channels,
+            num_heads=num_heads,
+            type="cross",
+            attn_mode="full",
+            qkv_bias=qkv_bias,
+            qk_rms_norm=qk_rms_norm_cross,
+        )
+        self.mlp = FeedForwardNet(
+            channels,
+            mlp_ratio=mlp_ratio,
+        )
+        if not share_mod:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(channels, 6 * channels, bias=True)
+            )
+    def _forward(self, x: torch.Tensor, mod: torch.Tensor, context: torch.Tensor):
+        if self.share_mod:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
+        else:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
+        h = self.norm1(x)
+        h = h * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
+        h = self.self_attn(h)
+        h = h * gate_msa.unsqueeze(1)
+        x = x + h
+        h = self.norm2(x)
+        h = self.cross_attn(h, context)
+        x = x + h
+        h = self.norm3(x)
+        h = h * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
+        h = self.mlp(h)
+        h = h * gate_mlp.unsqueeze(1)
+        x = x + h
+        return x
+    def forward(self, x: torch.Tensor, mod: torch.Tensor, context: torch.Tensor):
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, context, use_reentrant=False)
+        else:
+            return self._forward(x, mod, context)

thirdparty/TRELLIS/trellis/modules/utils.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import torch.nn as nn
+from ..modules import sparse as sp
+FP16_MODULES = (
+    nn.Conv1d,
+    nn.Conv2d,
+    nn.Conv3d,
+    nn.ConvTranspose1d,
+    nn.ConvTranspose2d,
+    nn.ConvTranspose3d,
+    nn.Linear,
+    sp.SparseConv3d,
+    sp.SparseInverseConv3d,
+    sp.SparseLinear,
+)
+def convert_module_to_f16(l):
+    """
+    Convert primitive modules to float16.
+    """
+    if isinstance(l, FP16_MODULES):
+        for p in l.parameters():
+            p.data = p.data.half()
+def convert_module_to_f32(l):
+    """
+    Convert primitive modules to float32, undoing convert_module_to_f16().
+    """
+    if isinstance(l, FP16_MODULES):
+        for p in l.parameters():
+            p.data = p.data.float()
+def zero_module(module):
+    """
+    Zero out the parameters of a module and return it.
+    """
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+def scale_module(module, scale):
+    """
+    Scale the parameters of a module and return it.
+    """
+    for p in module.parameters():
+        p.detach().mul_(scale)
+    return module
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)

thirdparty/TRELLIS/trellis/pipelines/__init__.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from . import samplers
+from .trellis_image_to_3d import TrellisImageTo3DPipeline
+def from_pretrained(path: str):
+    """
+    Load a pipeline from a model folder or a Hugging Face model hub.
+    Args:
+        path: The path to the model. Can be either local path or a Hugging Face model name.
+    """
+    import os
+    import json
+    is_local = os.path.exists(f"{path}/pipeline.json")
+    if is_local:
+        config_file = f"{path}/pipeline.json"
+    else:
+        from huggingface_hub import hf_hub_download
+        config_file = hf_hub_download(path, "pipeline.json")
+    with open(config_file, 'r') as f:
+        config = json.load(f)
+    return globals()[config['name']].from_pretrained(path)

thirdparty/TRELLIS/trellis/pipelines/base.py ADDED Viewed

	@@ -0,0 +1,66 @@

+from typing import *
+import torch
+import torch.nn as nn
+from .. import models
+class Pipeline:
+    """
+    A base class for pipelines.
+    """
+    def __init__(
+        self,
+        models: dict[str, nn.Module] = None,
+    ):
+        if models is None:
+            return
+        self.models = models
+        for model in self.models.values():
+            model.eval()
+    @staticmethod
+    def from_pretrained(path: str) -> "Pipeline":
+        """
+        Load a pretrained model.
+        """
+        import os
+        import json
+        is_local = os.path.exists(f"{path}/pipeline.json")
+        if is_local:
+            config_file = f"{path}/pipeline.json"
+        else:
+            from huggingface_hub import hf_hub_download
+            config_file = hf_hub_download(path, "pipeline.json")
+        with open(config_file, 'r') as f:
+            args = json.load(f)['args']
+        _models = {
+            k: models.from_pretrained(f"{path}/{v}")
+            for k, v in args['models'].items()
+        }
+        new_pipeline = Pipeline(_models)
+        new_pipeline._pretrained_args = args
+        return new_pipeline
+    @property
+    def device(self) -> torch.device:
+        for model in self.models.values():
+            if hasattr(model, 'device'):
+                return model.device
+        for model in self.models.values():
+            if hasattr(model, 'parameters'):
+                return next(model.parameters()).device
+        raise RuntimeError("No device found.")
+    def to(self, device: torch.device) -> None:
+        for model in self.models.values():
+            model.to(device)
+    def cuda(self) -> None:
+        self.to(torch.device("cuda"))
+    def cpu(self) -> None:
+        self.to(torch.device("cpu"))

thirdparty/TRELLIS/trellis/pipelines/samplers/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .base import Sampler
2	+ from .flow_euler import FlowEulerSampler, FlowEulerCfgSampler, FlowEulerGuidanceIntervalSampler

thirdparty/TRELLIS/trellis/pipelines/samplers/base.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from typing import *
+from abc import ABC, abstractmethod
+class Sampler(ABC):
+    """
+    A base class for samplers.
+    """
+    @abstractmethod
+    def sample(
+        self,
+        model,
+        **kwargs
+    ):
+        """
+        Sample from a model.
+        """
+        pass

thirdparty/TRELLIS/trellis/pipelines/samplers/classifier_free_guidance_mixin.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from typing import *
+class ClassifierFreeGuidanceSamplerMixin:
+    """
+    A mixin class for samplers that apply classifier-free guidance.
+    """
+    def _inference_model(self, model, x_t, t, cond, neg_cond, cfg_strength, **kwargs):
+        pred = super()._inference_model(model, x_t, t, cond, **kwargs)
+        neg_pred = super()._inference_model(model, x_t, t, neg_cond, **kwargs)
+        return (1 + cfg_strength) * pred - cfg_strength * neg_pred

thirdparty/TRELLIS/trellis/pipelines/samplers/flow_euler.py ADDED Viewed

	@@ -0,0 +1,199 @@

+from typing import *
+import torch
+import numpy as np
+from tqdm import tqdm
+from easydict import EasyDict as edict
+from .base import Sampler
+from .classifier_free_guidance_mixin import ClassifierFreeGuidanceSamplerMixin
+from .guidance_interval_mixin import GuidanceIntervalSamplerMixin
+class FlowEulerSampler(Sampler):
+    """
+    Generate samples from a flow-matching model using Euler sampling.
+    Args:
+        sigma_min: The minimum scale of noise in flow.
+    """
+    def __init__(
+        self,
+        sigma_min: float,
+    ):
+        self.sigma_min = sigma_min
+    def _eps_to_xstart(self, x_t, t, eps):
+        assert x_t.shape == eps.shape
+        return (x_t - (self.sigma_min + (1 - self.sigma_min) * t) * eps) / (1 - t)
+    def _xstart_to_eps(self, x_t, t, x_0):
+        assert x_t.shape == x_0.shape
+        return (x_t - (1 - t) * x_0) / (self.sigma_min + (1 - self.sigma_min) * t)
+    def _v_to_xstart_eps(self, x_t, t, v):
+        assert x_t.shape == v.shape
+        eps = (1 - t) * v + x_t
+        x_0 = (1 - self.sigma_min) * x_t - (self.sigma_min + (1 - self.sigma_min) * t) * v
+        return x_0, eps
+    def _inference_model(self, model, x_t, t, cond=None, **kwargs):
+        t = torch.tensor([1000 * t] * x_t.shape[0], device=x_t.device, dtype=torch.float32)
+        return model(x_t, t, cond, **kwargs)
+    def _get_model_prediction(self, model, x_t, t, cond=None, **kwargs):
+        pred_v = self._inference_model(model, x_t, t, cond, **kwargs)
+        pred_x_0, pred_eps = self._v_to_xstart_eps(x_t=x_t, t=t, v=pred_v)
+        return pred_x_0, pred_eps, pred_v
+    @torch.no_grad()
+    def sample_once(
+        self,
+        model,
+        x_t,
+        t: float,
+        t_prev: float,
+        cond: Optional[Any] = None,
+        **kwargs
+    ):
+        """
+        Sample x_{t-1} from the model using Euler method.
+        Args:
+            model: The model to sample from.
+            x_t: The [N x C x ...] tensor of noisy inputs at time t.
+            t: The current timestep.
+            t_prev: The previous timestep.
+            cond: conditional information.
+            **kwargs: Additional arguments for model inference.
+        Returns:
+            a dict containing the following
+            - 'pred_x_prev': x_{t-1}.
+            - 'pred_x_0': a prediction of x_0.
+        """
+        pred_x_0, pred_eps, pred_v = self._get_model_prediction(model, x_t, t, cond, **kwargs)
+        pred_x_prev = x_t - (t - t_prev) * pred_v
+        return edict({"pred_x_prev": pred_x_prev, "pred_x_0": pred_x_0})
+    @torch.no_grad()
+    def sample(
+        self,
+        model,
+        noise,
+        cond: Optional[Any] = None,
+        steps: int = 50,
+        rescale_t: float = 1.0,
+        verbose: bool = True,
+        **kwargs
+    ):
+        """
+        Generate samples from the model using Euler method.
+        Args:
+            model: The model to sample from.
+            noise: The initial noise tensor.
+            cond: conditional information.
+            steps: The number of steps to sample.
+            rescale_t: The rescale factor for t.
+            verbose: If True, show a progress bar.
+            **kwargs: Additional arguments for model_inference.
+        Returns:
+            a dict containing the following
+            - 'samples': the model samples.
+            - 'pred_x_t': a list of prediction of x_t.
+            - 'pred_x_0': a list of prediction of x_0.
+        """
+        sample = noise
+        t_seq = np.linspace(1, 0, steps + 1)
+        t_seq = rescale_t * t_seq / (1 + (rescale_t - 1) * t_seq)
+        t_pairs = list((t_seq[i], t_seq[i + 1]) for i in range(steps))
+        ret = edict({"samples": None, "pred_x_t": [], "pred_x_0": []})
+        for t, t_prev in tqdm(t_pairs, desc="Sampling", disable=not verbose):
+            out = self.sample_once(model, sample, t, t_prev, cond, **kwargs)
+            sample = out.pred_x_prev
+            ret.pred_x_t.append(out.pred_x_prev)
+            ret.pred_x_0.append(out.pred_x_0)
+        ret.samples = sample
+        return ret
+class FlowEulerCfgSampler(ClassifierFreeGuidanceSamplerMixin, FlowEulerSampler):
+    """
+    Generate samples from a flow-matching model using Euler sampling with classifier-free guidance.
+    """
+    @torch.no_grad()
+    def sample(
+        self,
+        model,
+        noise,
+        cond,
+        neg_cond,
+        steps: int = 50,
+        rescale_t: float = 1.0,
+        cfg_strength: float = 3.0,
+        verbose: bool = True,
+        **kwargs
+    ):
+        """
+        Generate samples from the model using Euler method.
+        Args:
+            model: The model to sample from.
+            noise: The initial noise tensor.
+            cond: conditional information.
+            neg_cond: negative conditional information.
+            steps: The number of steps to sample.
+            rescale_t: The rescale factor for t.
+            cfg_strength: The strength of classifier-free guidance.
+            verbose: If True, show a progress bar.
+            **kwargs: Additional arguments for model_inference.
+        Returns:
+            a dict containing the following
+            - 'samples': the model samples.
+            - 'pred_x_t': a list of prediction of x_t.
+            - 'pred_x_0': a list of prediction of x_0.
+        """
+        return super().sample(model, noise, cond, steps, rescale_t, verbose, neg_cond=neg_cond, cfg_strength=cfg_strength, **kwargs)
+class FlowEulerGuidanceIntervalSampler(GuidanceIntervalSamplerMixin, FlowEulerSampler):
+    """
+    Generate samples from a flow-matching model using Euler sampling with classifier-free guidance and interval.
+    """
+    @torch.no_grad()
+    def sample(
+        self,
+        model,
+        noise,
+        cond,
+        neg_cond,
+        steps: int = 50,
+        rescale_t: float = 1.0,
+        cfg_strength: float = 3.0,
+        cfg_interval: Tuple[float, float] = (0.0, 1.0),
+        verbose: bool = True,
+        **kwargs
+    ):
+        """
+        Generate samples from the model using Euler method.
+        Args:
+            model: The model to sample from.
+            noise: The initial noise tensor.
+            cond: conditional information.
+            neg_cond: negative conditional information.
+            steps: The number of steps to sample.
+            rescale_t: The rescale factor for t.
+            cfg_strength: The strength of classifier-free guidance.
+            cfg_interval: The interval for classifier-free guidance.
+            verbose: If True, show a progress bar.
+            **kwargs: Additional arguments for model_inference.
+        Returns:
+            a dict containing the following
+            - 'samples': the model samples.
+            - 'pred_x_t': a list of prediction of x_t.
+            - 'pred_x_0': a list of prediction of x_0.
+        """
+        return super().sample(model, noise, cond, steps, rescale_t, verbose, neg_cond=neg_cond, cfg_strength=cfg_strength, cfg_interval=cfg_interval, **kwargs)

thirdparty/TRELLIS/trellis/pipelines/samplers/guidance_interval_mixin.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from typing import *
+class GuidanceIntervalSamplerMixin:
+    """
+    A mixin class for samplers that apply classifier-free guidance with interval.
+    """
+    def _inference_model(self, model, x_t, t, cond, neg_cond, cfg_strength, cfg_interval, **kwargs):
+        if cfg_interval[0] <= t <= cfg_interval[1]:
+            pred = super()._inference_model(model, x_t, t, cond, **kwargs)
+            neg_pred = super()._inference_model(model, x_t, t, neg_cond, **kwargs)
+            return (1 + cfg_strength) * pred - cfg_strength * neg_pred
+        else:
+            return super()._inference_model(model, x_t, t, cond, **kwargs)

thirdparty/TRELLIS/trellis/pipelines/trellis_image_to_3d.py ADDED Viewed

	@@ -0,0 +1,376 @@

+from typing import *
+from contextlib import contextmanager
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from tqdm import tqdm
+from easydict import EasyDict as edict
+from torchvision import transforms
+from PIL import Image
+import rembg
+from .base import Pipeline
+from . import samplers
+from ..modules import sparse as sp
+from ..representations import Gaussian, Strivec, MeshExtractResult
+class TrellisImageTo3DPipeline(Pipeline):
+    """
+    Pipeline for inferring Trellis image-to-3D models.
+    Args:
+        models (dict[str, nn.Module]): The models to use in the pipeline.
+        sparse_structure_sampler (samplers.Sampler): The sampler for the sparse structure.
+        slat_sampler (samplers.Sampler): The sampler for the structured latent.
+        slat_normalization (dict): The normalization parameters for the structured latent.
+        image_cond_model (str): The name of the image conditioning model.
+    """
+    def __init__(
+        self,
+        models: dict[str, nn.Module] = None,
+        sparse_structure_sampler: samplers.Sampler = None,
+        slat_sampler: samplers.Sampler = None,
+        slat_normalization: dict = None,
+        image_cond_model: str = None,
+    ):
+        if models is None:
+            return
+        super().__init__(models)
+        self.sparse_structure_sampler = sparse_structure_sampler
+        self.slat_sampler = slat_sampler
+        self.sparse_structure_sampler_params = {}
+        self.slat_sampler_params = {}
+        self.slat_normalization = slat_normalization
+        self.rembg_session = None
+        self._init_image_cond_model(image_cond_model)
+    @staticmethod
+    def from_pretrained(path: str) -> "TrellisImageTo3DPipeline":
+        """
+        Load a pretrained model.
+        Args:
+            path (str): The path to the model. Can be either local path or a Hugging Face repository.
+        """
+        pipeline = super(TrellisImageTo3DPipeline, TrellisImageTo3DPipeline).from_pretrained(path)
+        new_pipeline = TrellisImageTo3DPipeline()
+        new_pipeline.__dict__ = pipeline.__dict__
+        args = pipeline._pretrained_args
+        new_pipeline.sparse_structure_sampler = getattr(samplers, args['sparse_structure_sampler']['name'])(**args['sparse_structure_sampler']['args'])
+        new_pipeline.sparse_structure_sampler_params = args['sparse_structure_sampler']['params']
+        new_pipeline.slat_sampler = getattr(samplers, args['slat_sampler']['name'])(**args['slat_sampler']['args'])
+        new_pipeline.slat_sampler_params = args['slat_sampler']['params']
+        new_pipeline.slat_normalization = args['slat_normalization']
+        new_pipeline._init_image_cond_model(args['image_cond_model'])
+        return new_pipeline
+    def _init_image_cond_model(self, name: str):
+        """
+        Initialize the image conditioning model.
+        """
+        dinov2_model = torch.hub.load('facebookresearch/dinov2', name, pretrained=True)
+        dinov2_model.eval()
+        self.models['image_cond_model'] = dinov2_model
+        transform = transforms.Compose([
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+        ])
+        self.image_cond_model_transform = transform
+    def preprocess_image(self, input: Image.Image) -> Image.Image:
+        """
+        Preprocess the input image.
+        """
+        # if has alpha channel, use it directly; otherwise, remove background
+        has_alpha = False
+        if input.mode == 'RGBA':
+            alpha = np.array(input)[:, :, 3]
+            if not np.all(alpha == 255):
+                has_alpha = True
+        if has_alpha:
+            output = input
+        else:
+            input = input.convert('RGB')
+            max_size = max(input.size)
+            scale = min(1, 1024 / max_size)
+            if scale < 1:
+                input = input.resize((int(input.width * scale), int(input.height * scale)), Image.Resampling.LANCZOS)
+            if getattr(self, 'rembg_session', None) is None:
+                self.rembg_session = rembg.new_session('u2net')
+            output = rembg.remove(input, session=self.rembg_session)
+        output_np = np.array(output)
+        alpha = output_np[:, :, 3]
+        bbox = np.argwhere(alpha > 0.8 * 255)
+        bbox = np.min(bbox[:, 1]), np.min(bbox[:, 0]), np.max(bbox[:, 1]), np.max(bbox[:, 0])
+        center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
+        size = max(bbox[2] - bbox[0], bbox[3] - bbox[1])
+        size = int(size * 1.2)
+        bbox = center[0] - size // 2, center[1] - size // 2, center[0] + size // 2, center[1] + size // 2
+        output = output.crop(bbox)  # type: ignore
+        output = output.resize((518, 518), Image.Resampling.LANCZOS)
+        output = np.array(output).astype(np.float32) / 255
+        output = output[:, :, :3] * output[:, :, 3:4]
+        output = Image.fromarray((output * 255).astype(np.uint8))
+        return output
+    @torch.no_grad()
+    def encode_image(self, image: Union[torch.Tensor, list[Image.Image]]) -> torch.Tensor:
+        """
+        Encode the image.
+        Args:
+            image (Union[torch.Tensor, list[Image.Image]]): The image to encode
+        Returns:
+            torch.Tensor: The encoded features.
+        """
+        if isinstance(image, torch.Tensor):
+            assert image.ndim == 4, "Image tensor should be batched (B, C, H, W)"
+        elif isinstance(image, list):
+            assert all(isinstance(i, Image.Image) for i in image), "Image list should be list of PIL images"
+            image = [i.resize((518, 518), Image.LANCZOS) for i in image]
+            image = [np.array(i.convert('RGB')).astype(np.float32) / 255 for i in image]
+            image = [torch.from_numpy(i).permute(2, 0, 1).float() for i in image]
+            image = torch.stack(image).to(self.device)
+        else:
+            raise ValueError(f"Unsupported type of image: {type(image)}")
+        image = self.image_cond_model_transform(image).to(self.device)
+        features = self.models['image_cond_model'](image, is_training=True)['x_prenorm']
+        patchtokens = F.layer_norm(features, features.shape[-1:])
+        return patchtokens
+    def get_cond(self, image: Union[torch.Tensor, list[Image.Image]]) -> dict:
+        """
+        Get the conditioning information for the model.
+        Args:
+            image (Union[torch.Tensor, list[Image.Image]]): The image prompts.
+        Returns:
+            dict: The conditioning information
+        """
+        cond = self.encode_image(image)
+        neg_cond = torch.zeros_like(cond)
+        return {
+            'cond': cond,
+            'neg_cond': neg_cond,
+        }
+    def sample_sparse_structure(
+        self,
+        cond: dict,
+        num_samples: int = 1,
+        sampler_params: dict = {},
+    ) -> torch.Tensor:
+        """
+        Sample sparse structures with the given conditioning.
+        Args:
+            cond (dict): The conditioning information.
+            num_samples (int): The number of samples to generate.
+            sampler_params (dict): Additional parameters for the sampler.
+        """
+        # Sample occupancy latent
+        flow_model = self.models['sparse_structure_flow_model']
+        reso = flow_model.resolution
+        noise = torch.randn(num_samples, flow_model.in_channels, reso, reso, reso).to(self.device)
+        sampler_params = {**self.sparse_structure_sampler_params, **sampler_params}
+        z_s = self.sparse_structure_sampler.sample(
+            flow_model,
+            noise,
+            **cond,
+            **sampler_params,
+            verbose=True
+        ).samples
+        # Decode occupancy latent
+        decoder = self.models['sparse_structure_decoder']
+        coords = torch.argwhere(decoder(z_s)>0)[:, [0, 2, 3, 4]].int()
+        return coords
+    def decode_slat(
+        self,
+        slat: sp.SparseTensor,
+        formats: List[str] = ['mesh', 'gaussian', 'radiance_field'],
+    ) -> dict:
+        """
+        Decode the structured latent.
+        Args:
+            slat (sp.SparseTensor): The structured latent.
+            formats (List[str]): The formats to decode the structured latent to.
+        Returns:
+            dict: The decoded structured latent.
+        """
+        ret = {}
+        if 'mesh' in formats:
+            ret['mesh'] = self.models['slat_decoder_mesh'](slat)
+        if 'gaussian' in formats:
+            ret['gaussian'] = self.models['slat_decoder_gs'](slat)
+        if 'radiance_field' in formats:
+            ret['radiance_field'] = self.models['slat_decoder_rf'](slat)
+        return ret
+    def sample_slat(
+        self,
+        cond: dict,
+        coords: torch.Tensor,
+        sampler_params: dict = {},
+    ) -> sp.SparseTensor:
+        """
+        Sample structured latent with the given conditioning.
+        Args:
+            cond (dict): The conditioning information.
+            coords (torch.Tensor): The coordinates of the sparse structure.
+            sampler_params (dict): Additional parameters for the sampler.
+        """
+        # Sample structured latent
+        flow_model = self.models['slat_flow_model']
+        noise = sp.SparseTensor(
+            feats=torch.randn(coords.shape[0], flow_model.in_channels).to(self.device),
+            coords=coords,
+        )
+        sampler_params = {**self.slat_sampler_params, **sampler_params}
+        slat = self.slat_sampler.sample(
+            flow_model,
+            noise,
+            **cond,
+            **sampler_params,
+            verbose=True
+        ).samples
+        std = torch.tensor(self.slat_normalization['std'])[None].to(slat.device)
+        mean = torch.tensor(self.slat_normalization['mean'])[None].to(slat.device)
+        slat = slat * std + mean
+        return slat
+    @torch.no_grad()
+    def run(
+        self,
+        image: Image.Image,
+        num_samples: int = 1,
+        seed: int = 42,
+        sparse_structure_sampler_params: dict = {},
+        slat_sampler_params: dict = {},
+        formats: List[str] = ['mesh', 'gaussian', 'radiance_field'],
+        preprocess_image: bool = True,
+    ) -> dict:
+        """
+        Run the pipeline.
+        Args:
+            image (Image.Image): The image prompt.
+            num_samples (int): The number of samples to generate.
+            sparse_structure_sampler_params (dict): Additional parameters for the sparse structure sampler.
+            slat_sampler_params (dict): Additional parameters for the structured latent sampler.
+            preprocess_image (bool): Whether to preprocess the image.
+        """
+        if preprocess_image:
+            image = self.preprocess_image(image)
+        cond = self.get_cond([image])
+        torch.manual_seed(seed)
+        coords = self.sample_sparse_structure(cond, num_samples, sparse_structure_sampler_params)
+        slat = self.sample_slat(cond, coords, slat_sampler_params)
+        return self.decode_slat(slat, formats)
+    @contextmanager
+    def inject_sampler_multi_image(
+        self,
+        sampler_name: str,
+        num_images: int,
+        num_steps: int,
+        mode: Literal['stochastic', 'multidiffusion'] = 'stochastic',
+    ):
+        """
+        Inject a sampler with multiple images as condition.
+        Args:
+            sampler_name (str): The name of the sampler to inject.
+            num_images (int): The number of images to condition on.
+            num_steps (int): The number of steps to run the sampler for.
+        """
+        sampler = getattr(self, sampler_name)
+        setattr(sampler, f'_old_inference_model', sampler._inference_model)
+        if mode == 'stochastic':
+            if num_images > num_steps:
+                print(f"\033[93mWarning: number of conditioning images is greater than number of steps for {sampler_name}. "
+                    "This may lead to performance degradation.\033[0m")
+            cond_indices = (np.arange(num_steps) % num_images).tolist()
+            def _new_inference_model(self, model, x_t, t, cond, **kwargs):
+                cond_idx = cond_indices.pop(0)
+                cond_i = cond[cond_idx:cond_idx+1]
+                return self._old_inference_model(model, x_t, t, cond=cond_i, **kwargs)
+        elif mode =='multidiffusion':
+            from .samplers import FlowEulerSampler
+            def _new_inference_model(self, model, x_t, t, cond, neg_cond, cfg_strength, cfg_interval, **kwargs):
+                if cfg_interval[0] <= t <= cfg_interval[1]:
+                    preds = []
+                    for i in range(len(cond)):
+                        preds.append(FlowEulerSampler._inference_model(self, model, x_t, t, cond[i:i+1], **kwargs))
+                    pred = sum(preds) / len(preds)
+                    neg_pred = FlowEulerSampler._inference_model(self, model, x_t, t, neg_cond, **kwargs)
+                    return (1 + cfg_strength) * pred - cfg_strength * neg_pred
+                else:
+                    preds = []
+                    for i in range(len(cond)):
+                        preds.append(FlowEulerSampler._inference_model(self, model, x_t, t, cond[i:i+1], **kwargs))
+                    pred = sum(preds) / len(preds)
+                    return pred
+        else:
+            raise ValueError(f"Unsupported mode: {mode}")
+        sampler._inference_model = _new_inference_model.__get__(sampler, type(sampler))
+        yield
+        sampler._inference_model = sampler._old_inference_model
+        delattr(sampler, f'_old_inference_model')
+    @torch.no_grad()
+    def run_multi_image(
+        self,
+        images: List[Image.Image],
+        num_samples: int = 1,
+        seed: int = 42,
+        sparse_structure_sampler_params: dict = {},
+        slat_sampler_params: dict = {},
+        formats: List[str] = ['mesh', 'gaussian', 'radiance_field'],
+        preprocess_image: bool = True,
+        mode: Literal['stochastic', 'multidiffusion'] = 'stochastic',
+    ) -> dict:
+        """
+        Run the pipeline with multiple images as condition
+        Args:
+            images (List[Image.Image]): The multi-view images of the assets
+            num_samples (int): The number of samples to generate.
+            sparse_structure_sampler_params (dict): Additional parameters for the sparse structure sampler.
+            slat_sampler_params (dict): Additional parameters for the structured latent sampler.
+            preprocess_image (bool): Whether to preprocess the image.
+        """
+        if preprocess_image:
+            images = [self.preprocess_image(image) for image in images]
+        cond = self.get_cond(images)
+        cond['neg_cond'] = cond['neg_cond'][:1]
+        torch.manual_seed(seed)
+        ss_steps = {**self.sparse_structure_sampler_params, **sparse_structure_sampler_params}.get('steps')
+        with self.inject_sampler_multi_image('sparse_structure_sampler', len(images), ss_steps, mode=mode):
+            coords = self.sample_sparse_structure(cond, num_samples, sparse_structure_sampler_params)
+        slat_steps = {**self.slat_sampler_params, **slat_sampler_params}.get('steps')
+        with self.inject_sampler_multi_image('slat_sampler', len(images), slat_steps, mode=mode):
+            slat = self.sample_slat(cond, coords, slat_sampler_params)
+        return self.decode_slat(slat, formats)