gokaygokay committed
Commit: 0038320
Parent: 0b32f48
Files changed (39)
  1. app.py +0 -162
  2. demo_files/comp.gif +0 -3
  3. demo_files/examples/animal_character.png +0 -3
  4. demo_files/examples/animal_character_2.png +0 -3
  5. demo_files/examples/axe.png +0 -0
  6. demo_files/examples/chair1.png +0 -0
  7. demo_files/examples/character1.png +0 -0
  8. demo_files/examples/otter_samurai.png +0 -0
  9. demo_files/examples/raccoon_wizard.png +0 -0
  10. demo_files/examples/stylized-rocks.png +0 -0
  11. demo_files/examples/tree.png +0 -0
  12. demo_files/hdri/abandoned_tiled_room_1k.hdr +0 -0
  13. demo_files/hdri/metro_noord_1k.hdr +0 -0
  14. demo_files/hdri/neon_photostudio_1k.hdr +0 -0
  15. demo_files/hdri/peppermint_powerplant_1k.hdr +0 -0
  16. demo_files/hdri/rainforest_trail_1k.hdr +0 -0
  17. demo_files/hdri/studio_small_08_1k.hdr +0 -0
  18. demo_files/hdri/urban_alley_01_1k.hdr +0 -0
  19. demo_files/scatterplot.jpg +0 -0
  20. demo_files/teaser.gif +0 -3
  21. load/tets/160_tets.npz +0 -3
  22. sf3d/box_uv_unwrap.py +0 -610
  23. sf3d/models/camera.py +0 -32
  24. sf3d/models/global_estimator/multi_head_estimator.py +0 -118
  25. sf3d/models/image_estimator/clip_based_estimator.py +0 -168
  26. sf3d/models/isosurface.py +0 -229
  27. sf3d/models/mesh.py +0 -172
  28. sf3d/models/network.py +0 -195
  29. sf3d/models/tokenizers/dinov2.py +0 -1196
  30. sf3d/models/tokenizers/image.py +0 -99
  31. sf3d/models/tokenizers/triplane.py +0 -49
  32. sf3d/models/transformers/attention.py +0 -31
  33. sf3d/models/transformers/backbone.py +0 -515
  34. sf3d/models/utils.py +0 -292
  35. sf3d/system.py +0 -482
  36. sf3d/texture_baker.py +0 -87
  37. sf3d/texture_baker.slang +0 -93
  38. sf3d/utils.py +0 -91
  39. stable_fast.py +0 -355
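Every file in the Space is removed by this commit. The deleted code and assets remain reachable at the parent revision (0b32f48), so a checkout of that revision recovers them. Below is a minimal recovery sketch using huggingface_hub's snapshot_download; the repo_id is a placeholder, since the Space id itself is not shown in this diff.

# Recovery sketch (assumptions: huggingface_hub is installed; repo_id is hypothetical).
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="gokaygokay/your-space-name",  # placeholder -- substitute the real Space id
    repo_type="space",
    revision="0b32f48",  # parent commit; use the full SHA if the short form is rejected
)
print(local_dir)  # local copy containing app.py, sf3d/, demo_files/, load/, stable_fast.py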
app.py DELETED
@@ -1,162 +0,0 @@
1
- import spaces
2
- import os
3
- import tempfile
4
- import time
5
- import gradio as gr
6
- import torch
7
- from PIL import Image
8
- from diffusers import DiffusionPipeline
9
- from huggingface_hub import hf_hub_download
10
- from sf3d.system import SF3D
11
- import sf3d.utils as sf3d_utils
12
- from gradio_litmodel3d import LitModel3D
13
- from huggingface_hub import login
14
- import subprocess
15
-
16
- dtype = torch.bfloat16
17
-
18
- torch.backends.cuda.matmul.allow_tf32 = True
19
- huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
20
-
21
- device = torch.device('cuda')
22
-
23
- import shutil
24
-
25
- def find_cuda():
26
- # Check if CUDA_HOME or CUDA_PATH environment variables are set
27
- cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
28
-
29
- if cuda_home and os.path.exists(cuda_home):
30
- return cuda_home
31
-
32
- # Search for the nvcc executable in the system's PATH
33
- nvcc_path = shutil.which('nvcc')
34
-
35
- if nvcc_path:
36
- # Remove the 'bin/nvcc' part to get the CUDA installation path
37
- cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
38
- return cuda_path
39
-
40
- return None
41
-
42
- cuda_path = find_cuda()
43
-
44
- if cuda_path:
45
- print(f"CUDA installation found at: {cuda_path}")
46
- else:
47
- print("CUDA installation not found")
48
-
49
- login(token=huggingface_token)
50
- # Set up environment and cache
51
- cache_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
52
- os.environ["TRANSFORMERS_CACHE"] = cache_path
53
- os.environ["HF_HUB_CACHE"] = cache_path
54
- os.environ["HF_HOME"] = cache_path
55
-
56
- if not os.path.exists(cache_path):
57
- os.makedirs(cache_path, exist_ok=True)
58
-
59
- # Initialize Flux pipeline
60
- pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=dtype, token = huggingface_token).to(device)
61
- pipe.load_lora_weights(hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors"))
62
- pipe.fuse_lora(lora_scale=0.125)
63
- pipe.to(device="cuda", dtype=torch.bfloat16)
64
-
65
- # Initialize SF3D model
66
- sf3d_model = SF3D.from_pretrained(
67
- "stabilityai/stable-fast-3d",
68
- config_name="config.yaml",
69
- weight_name="model.safetensors",
70
-
71
- ).eval().to(device)
72
-
73
- # Constants for SF3D
74
- COND_WIDTH, COND_HEIGHT = 512, 512
75
- COND_DISTANCE, COND_FOVY_DEG = 1.6, 40
76
- BACKGROUND_COLOR = [0.5, 0.5, 0.5]
77
-
78
- c2w_cond = sf3d_utils.default_cond_c2w(COND_DISTANCE)
79
- intrinsic, intrinsic_normed_cond = sf3d_utils.create_intrinsic_from_fov_deg(
80
- COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
81
- )
82
-
83
- def generate_image(prompt, height, width, steps, scales, seed):
84
- with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
85
- return pipe(
86
- prompt=[prompt],
87
- generator=torch.Generator().manual_seed(int(seed)),
88
- num_inference_steps=int(steps),
89
- guidance_scale=float(scales),
90
- height=int(height),
91
- width=int(width),
92
- max_sequence_length=256
93
- ).images[0]
94
-
95
- def create_batch(input_image: Image.Image) -> dict:
96
- img_cond = torch.from_numpy(
97
- np.asarray(input_image.resize((COND_WIDTH, COND_HEIGHT))).astype(np.float32) / 255.0
98
- ).float().clip(0, 1)
99
- mask_cond = img_cond[:, :, -1:]
100
- rgb_cond = torch.lerp(
101
- torch.tensor(BACKGROUND_COLOR)[None, None, :], img_cond[:, :, :3], mask_cond
102
- )
103
-
104
- batch_elem = {
105
- "rgb_cond": rgb_cond,
106
- "mask_cond": mask_cond,
107
- "c2w_cond": c2w_cond.unsqueeze(0),
108
- "intrinsic_cond": intrinsic.unsqueeze(0),
109
- "intrinsic_normed_cond": intrinsic_normed_cond.unsqueeze(0),
110
- }
111
- return {k: v.unsqueeze(0) for k, v in batch_elem.items()}
112
-
113
- def generate_3d_model(input_image):
114
- with torch.no_grad():
115
- with torch.autocast(device_type="cuda", dtype=torch.float16):
116
- model_batch = create_batch(input_image)
117
- model_batch = {k: v.cuda() for k, v in model_batch.items()}
118
- trimesh_mesh, _ = sf3d_model.generate_mesh(model_batch, 1024)
119
- trimesh_mesh = trimesh_mesh[0]
120
-
121
- tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".glb")
122
- trimesh_mesh.export(tmp_file.name, file_type="glb", include_normals=True)
123
- return tmp_file.name
124
-
125
- @spaces.GPU
126
- def process_and_generate(prompt, height, width, steps, scales, seed):
127
- # Generate image from prompt
128
- generated_image = generate_image(prompt, height, width, steps, scales, seed)
129
-
130
- # Generate 3D model from the image
131
- glb_file = generate_3d_model(generated_image)
132
-
133
- return generated_image, glb_file
134
-
135
- # Gradio interface
136
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
137
- gr.Markdown("# Text-to-3D Model Generator")
138
-
139
- with gr.Row():
140
- with gr.Column(scale=3):
141
- prompt = gr.Textbox(label="Your Image Description", lines=3)
142
- with gr.Accordion("Advanced Settings", open=False):
143
- height = gr.Slider(label="Height", minimum=256, maximum=1152, step=64, value=1024)
144
- width = gr.Slider(label="Width", minimum=256, maximum=1152, step=64, value=1024)
145
- steps = gr.Slider(label="Inference Steps", minimum=6, maximum=25, step=1, value=8)
146
- scales = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=5.0, step=0.1, value=3.5)
147
- seed = gr.Number(label="Seed", value=3413, precision=0)
148
-
149
- generate_btn = gr.Button("Generate 3D Model", variant="primary")
150
-
151
- with gr.Column(scale=4):
152
- output_image = gr.Image(label="Generated Image")
153
- output_3d = LitModel3D(label="3D Model", clear_color=[0.0, 0.0, 0.0, 0.0])
154
-
155
- generate_btn.click(
156
- process_and_generate,
157
- inputs=[prompt, height, width, steps, scales, seed],
158
- outputs=[output_image, output_3d]
159
- )
160
-
161
- if __name__ == "__main__":
162
- demo.launch()
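Note on the deleted app.py above: create_batch() calls np.asarray, but the file never imports NumPy, so the image-to-batch step would raise a NameError at runtime. Had the file been kept, the fix would have been a one-line import alongside the others at the top:

# Missing import in the deleted app.py (create_batch uses np.asarray)
import numpy as np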
 
demo_files/comp.gif DELETED

Git LFS Details

  • SHA256: 1d5e060d90f29889c55c1c5681dbeb4b4c2408709d18f7451bb0a6f02c6e9bc5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.93 MB
demo_files/examples/animal_character.png DELETED

Git LFS Details

  • SHA256: 5949f60c651e71a41b7291197f91bb8be2c8861472765fc884e604e18b7806a0
  • Pointer size: 132 Bytes
  • Size of remote file: 1.39 MB
demo_files/examples/animal_character_2.png DELETED

Git LFS Details

  • SHA256: ffc3f10c629afd64798d38dad2cc419eb343c7106149426f78634a91367bf031
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB
demo_files/examples/axe.png DELETED
Binary file (277 kB)
 
demo_files/examples/chair1.png DELETED
Binary file (115 kB)
 
demo_files/examples/character1.png DELETED
Binary file (120 kB)
 
demo_files/examples/otter_samurai.png DELETED
Binary file (980 kB)
 
demo_files/examples/raccoon_wizard.png DELETED
Binary file (774 kB)
 
demo_files/examples/stylized-rocks.png DELETED
Binary file (439 kB)
 
demo_files/examples/tree.png DELETED
Binary file (693 kB)
 
demo_files/hdri/abandoned_tiled_room_1k.hdr DELETED
Binary file (478 kB)
 
demo_files/hdri/metro_noord_1k.hdr DELETED
Binary file (467 kB)
 
demo_files/hdri/neon_photostudio_1k.hdr DELETED
Binary file (438 kB)
 
demo_files/hdri/peppermint_powerplant_1k.hdr DELETED
Binary file (473 kB)
 
demo_files/hdri/rainforest_trail_1k.hdr DELETED
Binary file (512 kB)
 
demo_files/hdri/studio_small_08_1k.hdr DELETED
Binary file (412 kB)
 
demo_files/hdri/urban_alley_01_1k.hdr DELETED
Binary file (458 kB)
 
demo_files/scatterplot.jpg DELETED
Binary file (879 kB)
 
demo_files/teaser.gif DELETED

Git LFS Details

  • SHA256: 1d5dcb4fbe710e94c0fa70cc2c783d66e327222cb5e74839cfd003e619bc2e1d
  • Pointer size: 132 Bytes
  • Size of remote file: 2.81 MB
load/tets/160_tets.npz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f4be37efc604d28d55a1a78c2aabefeeab7e63149f541aa45f9dd858ee35bb9
3
- size 15408790
 
sf3d/box_uv_unwrap.py DELETED
@@ -1,610 +0,0 @@
1
- import math
2
- from typing import Tuple
3
-
4
- import torch
5
- import torch.nn.functional as F
6
- from jaxtyping import Float, Integer
7
- from torch import Tensor
8
-
9
- from sf3d.models.utils import dot, triangle_intersection_2d
10
-
11
-
12
- def _box_assign_vertex_to_cube_face(
13
- vertex_positions: Float[Tensor, "Nv 3"],
14
- vertex_normals: Float[Tensor, "Nv 3"],
15
- triangle_idxs: Integer[Tensor, "Nf 3"],
16
- bbox: Float[Tensor, "2 3"],
17
- ) -> Tuple[Float[Tensor, "Nf 3 2"], Integer[Tensor, "Nf 3"]]:
18
- # Test to not have a scaled model to fit the space better
19
- # bbox_min = bbox[:1].mean(-1, keepdim=True)
20
- # bbox_max = bbox[1:].mean(-1, keepdim=True)
21
- # v_pos_normalized = (vertex_positions - bbox_min) / (bbox_max - bbox_min)
22
-
23
- # Create a [0, 1] normalized vertex position
24
- v_pos_normalized = (vertex_positions - bbox[:1]) / (bbox[1:] - bbox[:1])
25
- # And to [-1, 1]
26
- v_pos_normalized = 2.0 * v_pos_normalized - 1.0
27
-
28
- # Get all vertex positions for each triangle
29
- # Now how do we define to which face the triangle belongs? Mean face pos? Max vertex pos?
30
- v0 = v_pos_normalized[triangle_idxs[:, 0]]
31
- v1 = v_pos_normalized[triangle_idxs[:, 1]]
32
- v2 = v_pos_normalized[triangle_idxs[:, 2]]
33
- tri_stack = torch.stack([v0, v1, v2], dim=1)
34
-
35
- vn0 = vertex_normals[triangle_idxs[:, 0]]
36
- vn1 = vertex_normals[triangle_idxs[:, 1]]
37
- vn2 = vertex_normals[triangle_idxs[:, 2]]
38
- tri_stack_nrm = torch.stack([vn0, vn1, vn2], dim=1)
39
-
40
- # Just average the normals per face
41
- face_normal = F.normalize(torch.sum(tri_stack_nrm, 1), eps=1e-6, dim=-1)
42
-
43
- # Now decide based on the face normal in which box map we project
44
- # abs_x, abs_y, abs_z = tri_stack_nrm.abs().unbind(-1)
45
- abs_x, abs_y, abs_z = tri_stack.abs().unbind(-1)
46
-
47
- axis = torch.tensor(
48
- [
49
- [1, 0, 0], # 0
50
- [-1, 0, 0], # 1
51
- [0, 1, 0], # 2
52
- [0, -1, 0], # 3
53
- [0, 0, 1], # 4
54
- [0, 0, -1], # 5
55
- ],
56
- device=face_normal.device,
57
- dtype=face_normal.dtype,
58
- )
59
- face_normal_axis = (face_normal[:, None] * axis[None]).sum(-1)
60
- index = face_normal_axis.argmax(-1)
61
-
62
- max_axis, uc, vc = (
63
- torch.ones_like(abs_x),
64
- torch.zeros_like(tri_stack[..., :1]),
65
- torch.zeros_like(tri_stack[..., :1]),
66
- )
67
- mask_pos_x = index == 0
68
- max_axis[mask_pos_x] = abs_x[mask_pos_x]
69
- uc[mask_pos_x] = tri_stack[mask_pos_x][..., 1:2]
70
- vc[mask_pos_x] = -tri_stack[mask_pos_x][..., -1:]
71
-
72
- mask_neg_x = index == 1
73
- max_axis[mask_neg_x] = abs_x[mask_neg_x]
74
- uc[mask_neg_x] = tri_stack[mask_neg_x][..., 1:2]
75
- vc[mask_neg_x] = -tri_stack[mask_neg_x][..., -1:]
76
-
77
- mask_pos_y = index == 2
78
- max_axis[mask_pos_y] = abs_y[mask_pos_y]
79
- uc[mask_pos_y] = tri_stack[mask_pos_y][..., 0:1]
80
- vc[mask_pos_y] = -tri_stack[mask_pos_y][..., -1:]
81
-
82
- mask_neg_y = index == 3
83
- max_axis[mask_neg_y] = abs_y[mask_neg_y]
84
- uc[mask_neg_y] = tri_stack[mask_neg_y][..., 0:1]
85
- vc[mask_neg_y] = -tri_stack[mask_neg_y][..., -1:]
86
-
87
- mask_pos_z = index == 4
88
- max_axis[mask_pos_z] = abs_z[mask_pos_z]
89
- uc[mask_pos_z] = tri_stack[mask_pos_z][..., 0:1]
90
- vc[mask_pos_z] = tri_stack[mask_pos_z][..., 1:2]
91
-
92
- mask_neg_z = index == 5
93
- max_axis[mask_neg_z] = abs_z[mask_neg_z]
94
- uc[mask_neg_z] = tri_stack[mask_neg_z][..., 0:1]
95
- vc[mask_neg_z] = -tri_stack[mask_neg_z][..., 1:2]
96
-
97
- # UC from [-1, 1] to [0, 1]
98
- max_dim_div = max_axis.max(dim=0, keepdims=True).values
99
- uc = ((uc[..., 0] / max_dim_div + 1.0) * 0.5).clip(0, 1)
100
- vc = ((vc[..., 0] / max_dim_div + 1.0) * 0.5).clip(0, 1)
101
-
102
- uv = torch.stack([uc, vc], dim=-1)
103
-
104
- return uv, index
105
-
106
-
107
- def _assign_faces_uv_to_atlas_index(
108
- vertex_positions: Float[Tensor, "Nv 3"],
109
- triangle_idxs: Integer[Tensor, "Nf 3"],
110
- face_uv: Float[Tensor, "Nf 3 2"],
111
- face_index: Integer[Tensor, "Nf 3"],
112
- ) -> Integer[Tensor, "Nf"]: # noqa: F821
113
- triangle_pos = vertex_positions[triangle_idxs]
114
- # We need to do perform 3 overlap checks.
115
- # The first set is placed in the upper two thirds of the UV atlas.
116
- # Conceptually, this is the direct visible surfaces from the each cube side
117
- # The second set is placed in the lower thirds and the left half of the UV atlas.
118
- # This is the first set of occluded surfaces. They will also be saved in the projected fashion
119
- # The third pass finds all non assigned faces. They will be placed in the bottom right half of
120
- # the UV atlas in scattered fashion.
121
- assign_idx = face_index.clone()
122
- for overlap_step in range(3):
123
- overlapping_indicator = torch.zeros_like(assign_idx, dtype=torch.bool)
124
- for i in range(overlap_step * 6, (overlap_step + 1) * 6):
125
- mask = assign_idx == i
126
- if not mask.any():
127
- continue
128
- # Get all elements belonging to the projection face
129
- uv_triangle = face_uv[mask]
130
- cur_triangle_pos = triangle_pos[mask]
131
- # Find the center of the uv coordinates
132
- center_uv = uv_triangle.mean(dim=1, keepdim=True)
133
- # And also the radius of the triangle
134
- uv_triangle_radius = (uv_triangle - center_uv).norm(dim=-1).max(-1).values
135
-
136
- potentially_overlapping_mask = (
137
- # Find all close triangles
138
- (center_uv[None, ...] - center_uv[:, None]).norm(dim=-1)
139
- # Do not select the same element by offseting with an large valued identity matrix
140
- + torch.eye(
141
- uv_triangle.shape[0],
142
- device=uv_triangle.device,
143
- dtype=uv_triangle.dtype,
144
- ).unsqueeze(-1)
145
- * 1000
146
- )
147
- # Mark all potentially overlapping triangles to reduce the number of triangle intersection tests
148
- potentially_overlapping_mask = (
149
- potentially_overlapping_mask
150
- <= (uv_triangle_radius.view(-1, 1, 1) * 3.0)
151
- ).squeeze(-1)
152
- overlap_coords = torch.stack(torch.where(potentially_overlapping_mask), -1)
153
-
154
- # Only unique triangles (A|B and B|A should be the same)
155
- f = torch.min(overlap_coords, dim=-1).values
156
- s = torch.max(overlap_coords, dim=-1).values
157
- overlap_coords = torch.unique(torch.stack([f, s], dim=1), dim=0)
158
- first, second = overlap_coords.unbind(-1)
159
-
160
- # Get the triangles
161
- tri_1 = uv_triangle[first]
162
- tri_2 = uv_triangle[second]
163
-
164
- # Perform the actual set with the reduced number of potentially overlapping triangles
165
- its = triangle_intersection_2d(tri_1, tri_2, eps=1e-6)
166
-
167
- # So we now need to detect which triangles are the occluded ones.
168
- # We always assume the first to be the visible one (the others should move)
169
- # In the previous step we use a lexigraphical sort to get the unique pairs
170
- # In this we use a sort based on the orthographic projection
171
- ax = 0 if i < 2 else 1 if i < 4 else 2
172
- use_max = i % 2 == 1
173
-
174
- tri1_c = cur_triangle_pos[first].mean(dim=1)
175
- tri2_c = cur_triangle_pos[second].mean(dim=1)
176
-
177
- mark_first = (
178
- (tri1_c[..., ax] > tri2_c[..., ax])
179
- if use_max
180
- else (tri1_c[..., ax] < tri2_c[..., ax])
181
- )
182
- first[mark_first] = second[mark_first]
183
-
184
- # Lastly the same index can be tested multiple times.
185
- # If one marks it as overlapping we keep it marked as such.
186
- # We do this by testing if it has been marked at least once.
187
- unique_idx, rev_idx = torch.unique(first, return_inverse=True)
188
-
189
- add = torch.zeros_like(unique_idx, dtype=torch.float32)
190
- add.index_add_(0, rev_idx, its.float())
191
- its_mask = add > 0
192
-
193
- # And fill it in the overlapping indicator
194
- idx = torch.where(mask)[0][unique_idx]
195
- overlapping_indicator[idx] = its_mask
196
-
197
- # Move the index to the overlap regions (shift by 6)
198
- assign_idx[overlapping_indicator] += 6
199
-
200
- # We do not care about the correct face placement after the first 2 slices
201
- max_idx = 6 * 2
202
- return assign_idx.clamp(0, max_idx)
203
-
204
-
205
- def _find_slice_offset_and_scale(
206
- index: Integer[Tensor, "Nf"], # noqa: F821
207
- ) -> Tuple[
208
- Float[Tensor, "Nf"], Float[Tensor, "Nf"], Float[Tensor, "Nf"], Float[Tensor, "Nf"] # noqa: F821
209
- ]: # noqa: F821
210
- # 6 due to the 6 cube faces
211
- off = 1 / 3
212
- dupl_off = 1 / 6
213
-
214
- # Here, we need to decide how to pack the textures in the case of overlap
215
- def x_offset_calc(x, i):
216
- offset_calc = i // 6
217
- # Initial coordinates - just 3x2 grid
218
- if offset_calc == 0:
219
- return off * x
220
- else:
221
- # Smaller 3x2 grid plus eventual shift to right for
222
- # second overlap
223
- return dupl_off * x + min(offset_calc - 1, 1) * 0.5
224
-
225
- def y_offset_calc(x, i):
226
- offset_calc = i // 6
227
- # Initial coordinates - just a 3x2 grid
228
- if offset_calc == 0:
229
- return off * x
230
- else:
231
- # Smaller coordinates in the lowest row
232
- return dupl_off * x + off * 2
233
-
234
- offset_x = torch.zeros_like(index, dtype=torch.float32)
235
- offset_y = torch.zeros_like(index, dtype=torch.float32)
236
- offset_x_vals = [0, 1, 2, 0, 1, 2]
237
- offset_y_vals = [0, 0, 0, 1, 1, 1]
238
- for i in range(index.max().item() + 1):
239
- mask = index == i
240
- if not mask.any():
241
- continue
242
- offset_x[mask] = x_offset_calc(offset_x_vals[i % 6], i)
243
- offset_y[mask] = y_offset_calc(offset_y_vals[i % 6], i)
244
-
245
- div_x = torch.full_like(index, 6 // 2, dtype=torch.float32)
246
- # All overlap elements are saved in half scale
247
- div_x[index >= 6] = 6
248
- div_y = div_x.clone() # Same for y
249
- # Except for the random overlaps
250
- div_x[index >= 12] = 2
251
- # But the random overlaps are saved in a large block in the lower thirds
252
- div_y[index >= 12] = 3
253
-
254
- return offset_x, offset_y, div_x, div_y
255
-
256
-
257
- def rotation_flip_matrix_2d(
258
- rad: float, flip_x: bool = False, flip_y: bool = False
259
- ) -> Float[Tensor, "2 2"]:
260
- cos = math.cos(rad)
261
- sin = math.sin(rad)
262
- rot_mat = torch.tensor([[cos, -sin], [sin, cos]], dtype=torch.float32)
263
- flip_mat = torch.tensor(
264
- [
265
- [-1 if flip_x else 1, 0],
266
- [0, -1 if flip_y else 1],
267
- ],
268
- dtype=torch.float32,
269
- )
270
-
271
- return flip_mat @ rot_mat
272
-
273
-
274
- def calculate_tangents(
275
- vertex_positions: Float[Tensor, "Nv 3"],
276
- vertex_normals: Float[Tensor, "Nv 3"],
277
- triangle_idxs: Integer[Tensor, "Nf 3"],
278
- face_uv: Float[Tensor, "Nf 3 2"],
279
- ) -> Float[Tensor, "Nf 3 4"]: # noqa: F821
280
- vn_idx = [None] * 3
281
- pos = [None] * 3
282
- tex = face_uv.unbind(1)
283
- for i in range(0, 3):
284
- pos[i] = vertex_positions[triangle_idxs[:, i]]
285
- # t_nrm_idx is always the same as t_pos_idx
286
- vn_idx[i] = triangle_idxs[:, i]
287
-
288
- tangents = torch.zeros_like(vertex_normals)
289
- tansum = torch.zeros_like(vertex_normals)
290
-
291
- # Compute tangent space for each triangle
292
- duv1 = tex[1] - tex[0]
293
- duv2 = tex[2] - tex[0]
294
- dpos1 = pos[1] - pos[0]
295
- dpos2 = pos[2] - pos[0]
296
-
297
- tng_nom = dpos1 * duv2[..., 1:2] - dpos2 * duv1[..., 1:2]
298
-
299
- denom = duv1[..., 0:1] * duv2[..., 1:2] - duv1[..., 1:2] * duv2[..., 0:1]
300
-
301
- # Avoid division by zero for degenerated texture coordinates
302
- denom_safe = denom.clip(1e-6)
303
- tang = tng_nom / denom_safe
304
-
305
- # Update all 3 vertices
306
- for i in range(0, 3):
307
- idx = vn_idx[i][:, None].repeat(1, 3)
308
- tangents.scatter_add_(0, idx, tang) # tangents[n_i] = tangents[n_i] + tang
309
- tansum.scatter_add_(
310
- 0, idx, torch.ones_like(tang)
311
- ) # tansum[n_i] = tansum[n_i] + 1
312
- # Also normalize it. Here we do not normalize the individual triangles first so larger area
313
- # triangles influence the tangent space more
314
- tangents = tangents / tansum
315
-
316
- # Normalize and make sure tangent is perpendicular to normal
317
- tangents = F.normalize(tangents, dim=1)
318
- tangents = F.normalize(tangents - dot(tangents, vertex_normals) * vertex_normals)
319
-
320
- return tangents
321
-
322
-
323
- def _rotate_uv_slices_consistent_space(
324
- vertex_positions: Float[Tensor, "Nv 3"],
325
- vertex_normals: Float[Tensor, "Nv 3"],
326
- triangle_idxs: Integer[Tensor, "Nf 3"],
327
- uv: Float[Tensor, "Nf 3 2"],
328
- index: Integer[Tensor, "Nf"], # noqa: F821
329
- ):
330
- tangents = calculate_tangents(vertex_positions, vertex_normals, triangle_idxs, uv)
331
- pos_stack = torch.stack(
332
- [
333
- -vertex_positions[..., 1],
334
- vertex_positions[..., 0],
335
- torch.zeros_like(vertex_positions[..., 0]),
336
- ],
337
- dim=-1,
338
- )
339
- expected_tangents = F.normalize(
340
- torch.linalg.cross(
341
- vertex_normals, torch.linalg.cross(pos_stack, vertex_normals)
342
- ),
343
- -1,
344
- )
345
-
346
- actual_tangents = tangents[triangle_idxs]
347
- expected_tangents = expected_tangents[triangle_idxs]
348
-
349
- def rotation_matrix_2d(theta):
350
- c, s = torch.cos(theta), torch.sin(theta)
351
- return torch.tensor([[c, -s], [s, c]])
352
-
353
- # Now find the rotation
354
- index_mod = index % 6 # Shouldn't happen. Just for safety
355
- for i in range(6):
356
- mask = index_mod == i
357
- if not mask.any():
358
- continue
359
-
360
- actual_mean_tangent = actual_tangents[mask].mean(dim=(0, 1))
361
- expected_mean_tangent = expected_tangents[mask].mean(dim=(0, 1))
362
-
363
- dot_product = torch.dot(actual_mean_tangent, expected_mean_tangent)
364
- cross_product = (
365
- actual_mean_tangent[0] * expected_mean_tangent[1]
366
- - actual_mean_tangent[1] * expected_mean_tangent[0]
367
- )
368
- angle = torch.atan2(cross_product, dot_product)
369
-
370
- rot_matrix = rotation_matrix_2d(angle).to(mask.device)
371
- # Center the uv coordinate to be in the range of -1 to 1 and 0 centered
372
- uv_cur = uv[mask] * 2 - 1 # Center it first
373
- # Rotate it
374
- uv[mask] = torch.einsum("ij,nfj->nfi", rot_matrix, uv_cur)
375
-
376
- # Rescale uv[mask] to be within the 0-1 range
377
- uv[mask] = (uv[mask] - uv[mask].min()) / (uv[mask].max() - uv[mask].min())
378
-
379
- return uv
380
-
381
-
382
- def _handle_slice_uvs(
383
- uv: Float[Tensor, "Nf 3 2"],
384
- index: Integer[Tensor, "Nf"], # noqa: F821
385
- island_padding: float,
386
- max_index: int = 6 * 2,
387
- ) -> Float[Tensor, "Nf 3 2"]: # noqa: F821
388
- uc, vc = uv.unbind(-1)
389
-
390
- # Get the second slice (The first overlap)
391
- index_filter = [index == i for i in range(6, max_index)]
392
-
393
- # Normalize them to always fully fill the atlas patch
394
- for i, fi in enumerate(index_filter):
395
- if fi.sum() > 0:
396
- # Scale the slice but only up to a factor of 2
397
- # This keeps the texture resolution with the first slice in line (Half space in UV)
398
- uc[fi] = (uc[fi] - uc[fi].min()) / (uc[fi].max() - uc[fi].min()).clip(0.5)
399
- vc[fi] = (vc[fi] - vc[fi].min()) / (vc[fi].max() - vc[fi].min()).clip(0.5)
400
-
401
- uc_padded = (uc * (1 - 2 * island_padding) + island_padding).clip(0, 1)
402
- vc_padded = (vc * (1 - 2 * island_padding) + island_padding).clip(0, 1)
403
-
404
- return torch.stack([uc_padded, vc_padded], dim=-1)
405
-
406
-
407
- def _handle_remaining_uvs(
408
- uv: Float[Tensor, "Nf 3 2"],
409
- index: Integer[Tensor, "Nf"], # noqa: F821
410
- island_padding: float,
411
- ) -> Float[Tensor, "Nf 3 2"]:
412
- uc, vc = uv.unbind(-1)
413
- # Get all remaining elements
414
- remaining_filter = index >= 6 * 2
415
- squares_left = remaining_filter.sum()
416
-
417
- if squares_left == 0:
418
- return uv
419
-
420
- uc = uc[remaining_filter]
421
- vc = vc[remaining_filter]
422
-
423
- # Or remaining triangles are distributed in a rectangle
424
- # The rectangle takes 0.5 of the entire uv space in width and 1/3 in height
425
- ratio = 0.5 * (1 / 3) # 1.5
426
- # sqrt(744/(0.5*(1/3)))
427
-
428
- mult = math.sqrt(squares_left / ratio)
429
- num_square_width = int(math.ceil(0.5 * mult))
430
- num_square_height = int(math.ceil(squares_left / num_square_width))
431
-
432
- width = 1 / num_square_width
433
- height = 1 / num_square_height
434
-
435
- # The idea is again to keep the texture resolution consistent with the first slice
436
- # This only occupys half the region in the texture chart but the scaling on the squares
437
- # assumes full coverage.
438
- clip_val = min(width, height) * 1.5
439
- # Now normalize the UVs with taking into account the maximum scaling
440
- uc = (uc - uc.min(dim=1, keepdim=True).values) / (
441
- uc.amax(dim=1, keepdim=True) - uc.amin(dim=1, keepdim=True)
442
- ).clip(clip_val)
443
- vc = (vc - vc.min(dim=1, keepdim=True).values) / (
444
- vc.amax(dim=1, keepdim=True) - vc.amin(dim=1, keepdim=True)
445
- ).clip(clip_val)
446
- # Add a small padding
447
- uc = (
448
- uc * (1 - island_padding * num_square_width * 0.5)
449
- + island_padding * num_square_width * 0.25
450
- ).clip(0, 1)
451
- vc = (
452
- vc * (1 - island_padding * num_square_height * 0.5)
453
- + island_padding * num_square_height * 0.25
454
- ).clip(0, 1)
455
-
456
- uc = uc * width
457
- vc = vc * height
458
-
459
- # And calculate offsets for each element
460
- idx = torch.arange(uc.shape[0], device=uc.device, dtype=torch.int32)
461
- x_idx = idx % num_square_width
462
- y_idx = idx // num_square_width
463
- # And move each triangle to its own spot
464
- uc = uc + x_idx[:, None] * width
465
- vc = vc + y_idx[:, None] * height
466
-
467
- uc = (uc * (1 - 2 * island_padding * 0.5) + island_padding * 0.5).clip(0, 1)
468
- vc = (vc * (1 - 2 * island_padding * 0.5) + island_padding * 0.5).clip(0, 1)
469
-
470
- uv[remaining_filter] = torch.stack([uc, vc], dim=-1)
471
-
472
- return uv
473
-
474
-
475
- def _distribute_individual_uvs_in_atlas(
476
- face_uv: Float[Tensor, "Nf 3 2"],
477
- assigned_faces: Integer[Tensor, "Nf"], # noqa: F821
478
- offset_x: Float[Tensor, "Nf"], # noqa: F821
479
- offset_y: Float[Tensor, "Nf"], # noqa: F821
480
- div_x: Float[Tensor, "Nf"], # noqa: F821
481
- div_y: Float[Tensor, "Nf"], # noqa: F821
482
- island_padding: float,
483
- ):
484
- # Place the slice first
485
- placed_uv = _handle_slice_uvs(face_uv, assigned_faces, island_padding)
486
- # Then handle the remaining overlap elements
487
- placed_uv = _handle_remaining_uvs(placed_uv, assigned_faces, island_padding)
488
-
489
- uc, vc = placed_uv.unbind(-1)
490
- uc = uc / div_x[:, None] + offset_x[:, None]
491
- vc = vc / div_y[:, None] + offset_y[:, None]
492
-
493
- uv = torch.stack([uc, vc], dim=-1).view(-1, 2)
494
-
495
- return uv
496
-
497
-
498
- def _get_unique_face_uv(
499
- uv: Float[Tensor, "Nf 3 2"],
500
- ) -> Tuple[Float[Tensor, "Utex 3"], Integer[Tensor, "Nf"]]: # noqa: F821
501
- unique_uv, unique_idx = torch.unique(uv, return_inverse=True, dim=0)
502
- # And add the face to uv index mapping
503
- vtex_idx = unique_idx.view(-1, 3)
504
-
505
- return unique_uv, vtex_idx
506
-
507
-
508
- def _align_mesh_with_main_axis(
509
- vertex_positions: Float[Tensor, "Nv 3"], vertex_normals: Float[Tensor, "Nv 3"]
510
- ) -> Tuple[Float[Tensor, "Nv 3"], Float[Tensor, "Nv 3"]]:
511
- # Use pca to find the 2 main axis (third is derived by cross product)
512
- # Set the random seed so it's repeatable
513
- torch.manual_seed(0)
514
- _, _, v = torch.pca_lowrank(vertex_positions, q=2)
515
- main_axis, seconday_axis = v[:, 0], v[:, 1]
516
-
517
- main_axis: Float[Tensor, "3"] = F.normalize(main_axis, eps=1e-6, dim=-1)
518
- # Orthogonalize the second axis
519
- seconday_axis: Float[Tensor, "3"] = F.normalize(
520
- seconday_axis - dot(seconday_axis, main_axis) * main_axis, eps=1e-6, dim=-1
521
- )
522
- # Create perpendicular third axis
523
- third_axis: Float[Tensor, "3"] = F.normalize(
524
- torch.cross(main_axis, seconday_axis), dim=-1, eps=1e-6
525
- )
526
-
527
- # Check to which canonical axis each aligns
528
- main_axis_max_idx = main_axis.abs().argmax().item()
529
- seconday_axis_max_idx = seconday_axis.abs().argmax().item()
530
- third_axis_max_idx = third_axis.abs().argmax().item()
531
-
532
- # Now sort the axes based on the argmax so they align with thecanonoical axes
533
- # If two axes have the same argmax move one of them
534
- all_possible_axis = {0, 1, 2}
535
- cur_index = 1
536
- while len(set([main_axis_max_idx, seconday_axis_max_idx, third_axis_max_idx])) != 3:
537
- # Find missing axis
538
- missing_axis = all_possible_axis - set(
539
- [main_axis_max_idx, seconday_axis_max_idx, third_axis_max_idx]
540
- )
541
- missing_axis = missing_axis.pop()
542
- # Just assign it to third axis as it had the smallest contribution to the
543
- # overall shape
544
- if cur_index == 1:
545
- third_axis_max_idx = missing_axis
546
- elif cur_index == 2:
547
- seconday_axis_max_idx = missing_axis
548
- else:
549
- raise ValueError("Could not find 3 unique axis")
550
- cur_index += 1
551
-
552
- if len({main_axis_max_idx, seconday_axis_max_idx, third_axis_max_idx}) != 3:
553
- raise ValueError("Could not find 3 unique axis")
554
-
555
- axes = [None] * 3
556
- axes[main_axis_max_idx] = main_axis
557
- axes[seconday_axis_max_idx] = seconday_axis
558
- axes[third_axis_max_idx] = third_axis
559
- # Create rotation matrix from the individual axes
560
- rot_mat = torch.stack(axes, dim=1).T
561
-
562
- # Now rotate the vertex positions and vertex normals so the mesh aligns with the main axis
563
- vertex_positions = torch.einsum("ij,nj->ni", rot_mat, vertex_positions)
564
- vertex_normals = torch.einsum("ij,nj->ni", rot_mat, vertex_normals)
565
-
566
- return vertex_positions, vertex_normals
567
-
568
-
569
- def box_projection_uv_unwrap(
570
- vertex_positions: Float[Tensor, "Nv 3"],
571
- vertex_normals: Float[Tensor, "Nv 3"],
572
- triangle_idxs: Integer[Tensor, "Nf 3"],
573
- island_padding: float,
574
- ) -> Tuple[Float[Tensor, "Utex 3"], Integer[Tensor, "Nf"]]: # noqa: F821
575
- # Align the mesh with main axis directions first
576
- vertex_positions, vertex_normals = _align_mesh_with_main_axis(
577
- vertex_positions, vertex_normals
578
- )
579
-
580
- bbox: Float[Tensor, "2 3"] = torch.stack(
581
- [vertex_positions.min(dim=0).values, vertex_positions.max(dim=0).values], dim=0
582
- )
583
- # First decide in which cube face the triangle is placed
584
- face_uv, face_index = _box_assign_vertex_to_cube_face(
585
- vertex_positions, vertex_normals, triangle_idxs, bbox
586
- )
587
-
588
- # Rotate the UV islands in a way that they align with the radial z tangent space
589
- face_uv = _rotate_uv_slices_consistent_space(
590
- vertex_positions, vertex_normals, triangle_idxs, face_uv, face_index
591
- )
592
-
593
- # Then find where where the face is placed in the atlas.
594
- # This has to detect potential overlaps
595
- assigned_atlas_index = _assign_faces_uv_to_atlas_index(
596
- vertex_positions, triangle_idxs, face_uv, face_index
597
- )
598
-
599
- # Then figure out the final place in the atlas based on the assignment
600
- offset_x, offset_y, div_x, div_y = _find_slice_offset_and_scale(
601
- assigned_atlas_index
602
- )
603
-
604
- # Next distribute the faces in the uv atlas
605
- placed_uv = _distribute_individual_uvs_in_atlas(
606
- face_uv, assigned_atlas_index, offset_x, offset_y, div_x, div_y, island_padding
607
- )
608
-
609
- # And get the unique per-triangle UV coordinates
610
- return _get_unique_face_uv(placed_uv)
 
sf3d/models/camera.py DELETED
@@ -1,32 +0,0 @@
1
- from dataclasses import dataclass, field
2
- from typing import List
3
-
4
- import torch
5
- import torch.nn as nn
6
-
7
- from sf3d.models.utils import BaseModule
8
-
9
-
10
- class LinearCameraEmbedder(BaseModule):
11
- @dataclass
12
- class Config(BaseModule.Config):
13
- in_channels: int = 25
14
- out_channels: int = 768
15
- conditions: List[str] = field(default_factory=list)
16
-
17
- cfg: Config
18
-
19
- def configure(self) -> None:
20
- self.linear = nn.Linear(self.cfg.in_channels, self.cfg.out_channels)
21
-
22
- def forward(self, **kwargs):
23
- cond_tensors = []
24
- for cond_name in self.cfg.conditions:
25
- assert cond_name in kwargs
26
- cond = kwargs[cond_name]
27
- # cond in shape (B, Nv, ...)
28
- cond_tensors.append(cond.view(*cond.shape[:2], -1))
29
- cond_tensor = torch.cat(cond_tensors, dim=-1)
30
- assert cond_tensor.shape[-1] == self.cfg.in_channels
31
- embedding = self.linear(cond_tensor)
32
- return embedding
 
sf3d/models/global_estimator/multi_head_estimator.py DELETED
@@ -1,118 +0,0 @@
1
- from dataclasses import dataclass, field
2
- from typing import Any, List, Optional
3
-
4
- import torch.nn as nn
5
- from jaxtyping import Float
6
- from torch import Tensor
7
-
8
- from sf3d.models.network import get_activation
9
- from sf3d.models.utils import BaseModule
10
-
11
-
12
- @dataclass
13
- class HeadSpec:
14
- name: str
15
- out_channels: int
16
- n_hidden_layers: int
17
- output_activation: Optional[str] = None
18
- output_bias: float = 0.0
19
- add_to_decoder_features: bool = False
20
- shape: Optional[list[int]] = None
21
-
22
-
23
- class MultiHeadEstimator(BaseModule):
24
- @dataclass
25
- class Config(BaseModule.Config):
26
- triplane_features: int = 1024
27
-
28
- n_layers: int = 2
29
- hidden_features: int = 512
30
- activation: str = "relu"
31
-
32
- pool: str = "max"
33
- # Literal["mean", "max"] = "mean" # noqa: F821
34
-
35
- heads: List[HeadSpec] = field(default_factory=lambda: [])
36
-
37
- cfg: Config
38
-
39
- def configure(self):
40
- layers = []
41
- cur_features = self.cfg.triplane_features * 3
42
- for _ in range(self.cfg.n_layers):
43
- layers.append(
44
- nn.Conv2d(
45
- cur_features,
46
- self.cfg.hidden_features,
47
- kernel_size=3,
48
- padding=0,
49
- stride=2,
50
- )
51
- )
52
- layers.append(self.make_activation(self.cfg.activation))
53
-
54
- cur_features = self.cfg.hidden_features
55
-
56
- self.layers = nn.Sequential(*layers)
57
-
58
- assert len(self.cfg.heads) > 0
59
- heads = {}
60
- for head in self.cfg.heads:
61
- head_layers = []
62
- for i in range(head.n_hidden_layers):
63
- head_layers += [
64
- nn.Linear(
65
- self.cfg.hidden_features,
66
- self.cfg.hidden_features,
67
- ),
68
- self.make_activation(self.cfg.activation),
69
- ]
70
- head_layers += [
71
- nn.Linear(
72
- self.cfg.hidden_features,
73
- head.out_channels,
74
- ),
75
- ]
76
- heads[head.name] = nn.Sequential(*head_layers)
77
- self.heads = nn.ModuleDict(heads)
78
-
79
- def make_activation(self, activation):
80
- if activation == "relu":
81
- return nn.ReLU(inplace=True)
82
- elif activation == "silu":
83
- return nn.SiLU(inplace=True)
84
- else:
85
- raise NotImplementedError
86
-
87
- def forward(
88
- self,
89
- triplane: Float[Tensor, "B 3 F Ht Wt"],
90
- ) -> dict[str, Any]:
91
- x = self.layers(
92
- triplane.reshape(
93
- triplane.shape[0], -1, triplane.shape[-2], triplane.shape[-1]
94
- )
95
- )
96
-
97
- if self.cfg.pool == "max":
98
- x = x.amax(dim=[-2, -1])
99
- elif self.cfg.pool == "mean":
100
- x = x.mean(dim=[-2, -1])
101
- else:
102
- raise NotImplementedError
103
-
104
- out = {
105
- ("decoder_" if head.add_to_decoder_features else "")
106
- + head.name: get_activation(head.output_activation)(
107
- self.heads[head.name](x) + head.output_bias
108
- )
109
- for head in self.cfg.heads
110
- }
111
- for head in self.cfg.heads:
112
- if head.shape:
113
- head_name = (
114
- "decoder_" if head.add_to_decoder_features else ""
115
- ) + head.name
116
- out[head_name] = out[head_name].reshape(*head.shape)
117
-
118
- return out
 
sf3d/models/image_estimator/clip_based_estimator.py DELETED
@@ -1,168 +0,0 @@
1
- from dataclasses import dataclass, field
2
- from typing import Any, List, Optional
3
-
4
- import open_clip
5
- import torch
6
- import torch.nn as nn
7
- from jaxtyping import Float
8
- from torch import Tensor
9
- from torchvision.transforms import Normalize
10
-
11
- from sf3d.models.network import get_activation
12
- from sf3d.models.utils import BaseModule
13
-
14
-
15
- @dataclass
16
- class HeadSpec:
17
- name: str
18
- out_channels: int
19
- n_hidden_layers: int
20
- output_activation: Optional[str] = None
21
- output_bias: float = 0.0
22
- add_to_decoder_features: bool = False
23
- shape: Optional[list[int]] = None
24
-
25
-
26
- class ClipBasedHeadEstimator(BaseModule):
27
- @dataclass
28
- class Config(BaseModule.Config):
29
- model: str = "ViT-B-32"
30
- pretrain: str = "laion2b_s34b_b79k"
31
-
32
- distribution: str = "beta"
33
-
34
- # ["mean", "mode", "sample", "sample_mean"]
35
- distribution_eval: str = "mode"
36
-
37
- activation: str = "relu"
38
- hidden_features: int = 512
39
- heads: List[HeadSpec] = field(default_factory=lambda: [])
40
-
41
- cfg: Config
42
-
43
- def configure(self):
44
- self.model, _, self.preprocess = open_clip.create_model_and_transforms(
45
- self.cfg.model, pretrained=self.cfg.pretrain
46
- )
47
- self.model.eval()
48
-
49
- # Do not add the weights in self.model to the optimizer
50
- for param in self.model.parameters():
51
- param.requires_grad = False
52
-
53
- assert len(self.cfg.heads) > 0
54
- heads = {}
55
- for head in self.cfg.heads:
56
- head_layers = []
57
-
58
- for i in range(head.n_hidden_layers):
59
- head_layers += [
60
- nn.Linear(
61
- self.cfg.hidden_features,
62
- self.cfg.hidden_features,
63
- ),
64
- self.make_activation(self.cfg.activation),
65
- ]
66
-
67
- head_layers = [nn.Sequential(*head_layers)]
68
- head_layers += [
69
- nn.Sequential(
70
- nn.Linear(
71
- self.cfg.hidden_features,
72
- self.cfg.hidden_features,
73
- ),
74
- self.make_activation(self.cfg.activation),
75
- nn.Linear(self.cfg.hidden_features, 1),
76
- )
77
- for _ in range(2)
78
- ]
79
- heads[head.name] = nn.ModuleList(head_layers)
80
- self.heads = nn.ModuleDict(heads)
81
-
82
- def make_activation(self, activation):
83
- if activation == "relu":
84
- return nn.ReLU(inplace=True)
85
- elif activation == "silu":
86
- return nn.SiLU(inplace=True)
87
- else:
88
- raise NotImplementedError
89
-
90
- def forward(
91
- self,
92
- cond_image: Float[Tensor, "B 1 H W 3"],
93
- sample: bool = True,
94
- ) -> dict[str, Any]:
95
- # Run the model
96
- # Resize cond_image to 224
97
- cond_image = nn.functional.interpolate(
98
- cond_image.flatten(0, 1).permute(0, 3, 1, 2),
99
- size=(224, 224),
100
- mode="bilinear",
101
- align_corners=False,
102
- )
103
- cond_image = Normalize(
104
- mean=open_clip.constants.OPENAI_DATASET_MEAN,
105
- std=open_clip.constants.OPENAI_DATASET_STD,
106
- )(cond_image)
107
- image_features = self.model.encode_image(cond_image)
108
-
109
- # Run the heads
110
- outputs = {}
111
-
112
- for head_dict in self.cfg.heads:
113
- head_name = head_dict.name
114
- shared_head, d1_h, d2_h = self.heads[head_name]
115
- shared_features = shared_head(image_features)
116
- d1, d2 = [head(shared_features).squeeze(-1) for head in [d1_h, d2_h]]
117
- if self.cfg.distribution == "normal":
118
- mean = d1
119
- var = d2
120
- if mean.shape[-1] == 1:
121
- outputs[head_name] = torch.distributions.Normal(
122
- mean + head_dict.output_bias,
123
- torch.nn.functional.softplus(var),
124
- )
125
- else:
126
- outputs[head_name] = torch.distributions.MultivariateNormal(
127
- mean + head_dict.output_bias,
128
- torch.nn.functional.softplus(var).diag_embed(),
129
- )
130
- elif self.cfg.distribution == "beta":
131
- outputs[head_name] = torch.distributions.Beta(
132
- torch.nn.functional.softplus(d1 + head_dict.output_bias),
133
- torch.nn.functional.softplus(d2 + head_dict.output_bias),
134
- )
135
- else:
136
- raise NotImplementedError
137
-
138
- if sample:
139
- for head_dict in self.cfg.heads:
140
- head_name = head_dict.name
141
- dist = outputs[head_name]
142
-
143
- if self.cfg.distribution_eval == "mean":
144
- out = dist.mean
145
- elif self.cfg.distribution_eval == "mode":
146
- out = dist.mode
147
- elif self.cfg.distribution_eval == "sample_mean":
148
- out = dist.sample([10]).mean(-1)
149
- else:
150
- # use rsample if gradient is needed
151
- out = dist.rsample() if self.training else dist.sample()
152
-
153
- outputs[head_name] = get_activation(head_dict.output_activation)(out)
154
- outputs[f"{head_name}_dist"] = dist
155
-
156
- for head in self.cfg.heads:
157
- if head.shape:
158
- if not sample:
159
- raise ValueError(
160
- "Cannot reshape non-sampled probabilisitic outputs"
161
- )
162
- outputs[head.name] = outputs[head.name].reshape(*head.shape)
163
-
164
- if head.add_to_decoder_features:
165
- outputs[f"decoder_{head.name}"] = outputs[head.name]
166
- del outputs[head.name]
167
-
168
- return outputs
 
sf3d/models/isosurface.py DELETED
@@ -1,229 +0,0 @@
1
- from typing import Optional, Tuple
2
-
3
- import numpy as np
4
- import torch
5
- import torch.nn as nn
6
- from jaxtyping import Float, Integer
7
- from torch import Tensor
8
-
9
- from .mesh import Mesh
10
-
11
-
12
- class IsosurfaceHelper(nn.Module):
13
- points_range: Tuple[float, float] = (0, 1)
14
-
15
- @property
16
- def grid_vertices(self) -> Float[Tensor, "N 3"]:
17
- raise NotImplementedError
18
-
19
- @property
20
- def requires_instance_per_batch(self) -> bool:
21
- return False
22
-
23
-
24
- class MarchingTetrahedraHelper(IsosurfaceHelper):
25
- def __init__(self, resolution: int, tets_path: str):
26
- super().__init__()
27
- self.resolution = resolution
28
- self.tets_path = tets_path
29
-
30
- self.triangle_table: Float[Tensor, "..."]
31
- self.register_buffer(
32
- "triangle_table",
33
- torch.as_tensor(
34
- [
35
- [-1, -1, -1, -1, -1, -1],
36
- [1, 0, 2, -1, -1, -1],
37
- [4, 0, 3, -1, -1, -1],
38
- [1, 4, 2, 1, 3, 4],
39
- [3, 1, 5, -1, -1, -1],
40
- [2, 3, 0, 2, 5, 3],
41
- [1, 4, 0, 1, 5, 4],
42
- [4, 2, 5, -1, -1, -1],
43
- [4, 5, 2, -1, -1, -1],
44
- [4, 1, 0, 4, 5, 1],
45
- [3, 2, 0, 3, 5, 2],
46
- [1, 3, 5, -1, -1, -1],
47
- [4, 1, 2, 4, 3, 1],
48
- [3, 0, 4, -1, -1, -1],
49
- [2, 0, 1, -1, -1, -1],
50
- [-1, -1, -1, -1, -1, -1],
51
- ],
52
- dtype=torch.long,
53
- ),
54
- persistent=False,
55
- )
56
- self.num_triangles_table: Integer[Tensor, "..."]
57
- self.register_buffer(
58
- "num_triangles_table",
59
- torch.as_tensor(
60
- [0, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 0], dtype=torch.long
61
- ),
62
- persistent=False,
63
- )
64
- self.base_tet_edges: Integer[Tensor, "..."]
65
- self.register_buffer(
66
- "base_tet_edges",
67
- torch.as_tensor([0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3], dtype=torch.long),
68
- persistent=False,
69
- )
70
-
71
- tets = np.load(self.tets_path)
72
- self._grid_vertices: Float[Tensor, "..."]
73
- self.register_buffer(
74
- "_grid_vertices",
75
- torch.from_numpy(tets["vertices"]).float(),
76
- persistent=False,
77
- )
78
- self.indices: Integer[Tensor, "..."]
79
- self.register_buffer(
80
- "indices", torch.from_numpy(tets["indices"]).long(), persistent=False
81
- )
82
-
83
- self._all_edges: Optional[Integer[Tensor, "Ne 2"]] = None
84
-
85
- center_indices, boundary_indices = self.get_center_boundary_index(
86
- self._grid_vertices
87
- )
88
- self.center_indices: Integer[Tensor, "..."]
89
- self.register_buffer("center_indices", center_indices, persistent=False)
90
- self.boundary_indices: Integer[Tensor, "..."]
91
- self.register_buffer("boundary_indices", boundary_indices, persistent=False)
92
-
93
- def get_center_boundary_index(self, verts):
94
- magn = torch.sum(verts**2, dim=-1)
95
-
96
- center_idx = torch.argmin(magn)
97
- boundary_neg = verts == verts.max()
98
- boundary_pos = verts == verts.min()
99
-
100
- boundary = torch.bitwise_or(boundary_pos, boundary_neg)
101
- boundary = torch.sum(boundary.float(), dim=-1)
102
-
103
- boundary_idx = torch.nonzero(boundary)
104
- return center_idx, boundary_idx.squeeze(dim=-1)
105
-
106
- def normalize_grid_deformation(
107
- self, grid_vertex_offsets: Float[Tensor, "Nv 3"]
108
- ) -> Float[Tensor, "Nv 3"]:
109
- return (
110
- (self.points_range[1] - self.points_range[0])
111
- / self.resolution # half tet size is approximately 1 / self.resolution
112
- * torch.tanh(grid_vertex_offsets)
113
- ) # FIXME: hard-coded activation
114
-
115
- @property
116
- def grid_vertices(self) -> Float[Tensor, "Nv 3"]:
117
- return self._grid_vertices
118
-
119
- @property
120
- def all_edges(self) -> Integer[Tensor, "Ne 2"]:
121
- if self._all_edges is None:
122
- # compute edges on GPU, or it would be VERY SLOW (basically due to the unique operation)
123
- edges = torch.tensor(
124
- [0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3],
125
- dtype=torch.long,
126
- device=self.indices.device,
127
- )
128
- _all_edges = self.indices[:, edges].reshape(-1, 2)
129
- _all_edges_sorted = torch.sort(_all_edges, dim=1)[0]
130
- _all_edges = torch.unique(_all_edges_sorted, dim=0)
131
- self._all_edges = _all_edges
132
- return self._all_edges
133
-
134
- def sort_edges(self, edges_ex2):
135
- with torch.no_grad():
136
- order = (edges_ex2[:, 0] > edges_ex2[:, 1]).long()
137
- order = order.unsqueeze(dim=1)
138
-
139
- a = torch.gather(input=edges_ex2, index=order, dim=1)
140
- b = torch.gather(input=edges_ex2, index=1 - order, dim=1)
141
-
142
- return torch.stack([a, b], -1)
143
-
144
- def _forward(self, pos_nx3, sdf_n, tet_fx4):
145
- with torch.no_grad():
146
- occ_n = sdf_n > 0
147
- occ_fx4 = occ_n[tet_fx4.reshape(-1)].reshape(-1, 4)
148
- occ_sum = torch.sum(occ_fx4, -1)
149
- valid_tets = (occ_sum > 0) & (occ_sum < 4)
150
- occ_sum = occ_sum[valid_tets]
151
-
152
- # find all vertices
153
- all_edges = tet_fx4[valid_tets][:, self.base_tet_edges].reshape(-1, 2)
154
- all_edges = self.sort_edges(all_edges)
155
- unique_edges, idx_map = torch.unique(all_edges, dim=0, return_inverse=True)
156
-
157
- unique_edges = unique_edges.long()
158
- mask_edges = occ_n[unique_edges.reshape(-1)].reshape(-1, 2).sum(-1) == 1
159
- mapping = (
160
- torch.ones(
161
- (unique_edges.shape[0]), dtype=torch.long, device=pos_nx3.device
162
- )
163
- * -1
164
- )
165
- mapping[mask_edges] = torch.arange(
166
- mask_edges.sum(), dtype=torch.long, device=pos_nx3.device
167
- )
168
- idx_map = mapping[idx_map] # map edges to verts
169
-
170
- interp_v = unique_edges[mask_edges]
171
- edges_to_interp = pos_nx3[interp_v.reshape(-1)].reshape(-1, 2, 3)
172
- edges_to_interp_sdf = sdf_n[interp_v.reshape(-1)].reshape(-1, 2, 1)
173
- edges_to_interp_sdf[:, -1] *= -1
174
-
175
- denominator = edges_to_interp_sdf.sum(1, keepdim=True)
176
-
177
- edges_to_interp_sdf = torch.flip(edges_to_interp_sdf, [1]) / denominator
178
- verts = (edges_to_interp * edges_to_interp_sdf).sum(1)
179
-
180
- idx_map = idx_map.reshape(-1, 6)
181
-
182
- v_id = torch.pow(2, torch.arange(4, dtype=torch.long, device=pos_nx3.device))
183
- tetindex = (occ_fx4[valid_tets] * v_id.unsqueeze(0)).sum(-1)
184
- num_triangles = self.num_triangles_table[tetindex]
185
-
186
- # Generate triangle indices
187
- faces = torch.cat(
188
- (
189
- torch.gather(
190
- input=idx_map[num_triangles == 1],
191
- dim=1,
192
- index=self.triangle_table[tetindex[num_triangles == 1]][:, :3],
193
- ).reshape(-1, 3),
194
- torch.gather(
195
- input=idx_map[num_triangles == 2],
196
- dim=1,
197
- index=self.triangle_table[tetindex[num_triangles == 2]][:, :6],
198
- ).reshape(-1, 3),
199
- ),
200
- dim=0,
201
- )
202
-
203
- return verts, faces
204
-
205
- def forward(
206
- self,
207
- level: Float[Tensor, "N3 1"],
208
- deformation: Optional[Float[Tensor, "N3 3"]] = None,
209
- ) -> Mesh:
210
- if deformation is not None:
211
- grid_vertices = self.grid_vertices + self.normalize_grid_deformation(
212
- deformation
213
- )
214
- else:
215
- grid_vertices = self.grid_vertices
216
-
217
- v_pos, t_pos_idx = self._forward(grid_vertices, level, self.indices)
218
-
219
- mesh = Mesh(
220
- v_pos=v_pos,
221
- t_pos_idx=t_pos_idx,
222
- # extras
223
- grid_vertices=grid_vertices,
224
- tet_edges=self.all_edges,
225
- grid_level=level,
226
- grid_deformation=deformation,
227
- )
228
-
229
- return mesh
 
sf3d/models/mesh.py DELETED
@@ -1,172 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any, Dict, Optional
4
-
5
- import torch
6
- import torch.nn.functional as F
7
- from jaxtyping import Float, Integer
8
- from torch import Tensor
9
-
10
- from sf3d.box_uv_unwrap import box_projection_uv_unwrap
11
- from sf3d.models.utils import dot
12
-
13
-
14
- class Mesh:
15
- def __init__(
16
- self, v_pos: Float[Tensor, "Nv 3"], t_pos_idx: Integer[Tensor, "Nf 3"], **kwargs
17
- ) -> None:
18
- self.v_pos: Float[Tensor, "Nv 3"] = v_pos
19
- self.t_pos_idx: Integer[Tensor, "Nf 3"] = t_pos_idx
20
- self._v_nrm: Optional[Float[Tensor, "Nv 3"]] = None
21
- self._v_tng: Optional[Float[Tensor, "Nv 3"]] = None
22
- self._v_tex: Optional[Float[Tensor, "Nt 3"]] = None
23
- self._edges: Optional[Integer[Tensor, "Ne 2"]] = None
24
- self.extras: Dict[str, Any] = {}
25
- for k, v in kwargs.items():
26
- self.add_extra(k, v)
27
-
28
- def add_extra(self, k, v) -> None:
29
- self.extras[k] = v
30
-
31
- @property
32
- def requires_grad(self):
33
- return self.v_pos.requires_grad
34
-
35
- @property
36
- def v_nrm(self):
37
- if self._v_nrm is None:
38
- self._v_nrm = self._compute_vertex_normal()
39
- return self._v_nrm
40
-
41
- @property
42
- def v_tng(self):
43
- if self._v_tng is None:
44
- self._v_tng = self._compute_vertex_tangent()
45
- return self._v_tng
46
-
47
- @property
48
- def v_tex(self):
49
- if self._v_tex is None:
50
- self.unwrap_uv()
51
- return self._v_tex
52
-
53
- @property
54
- def edges(self):
55
- if self._edges is None:
56
- self._edges = self._compute_edges()
57
- return self._edges
58
-
59
- def _compute_vertex_normal(self):
60
- i0 = self.t_pos_idx[:, 0]
61
- i1 = self.t_pos_idx[:, 1]
62
- i2 = self.t_pos_idx[:, 2]
63
-
64
- v0 = self.v_pos[i0, :]
65
- v1 = self.v_pos[i1, :]
66
- v2 = self.v_pos[i2, :]
67
-
68
- face_normals = torch.cross(v1 - v0, v2 - v0, dim=-1)
69
-
70
- # Splat face normals to vertices
71
- v_nrm = torch.zeros_like(self.v_pos)
72
- v_nrm.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals)
73
- v_nrm.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals)
74
- v_nrm.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals)
75
-
76
- # Normalize, replace zero (degenerated) normals with some default value
77
- v_nrm = torch.where(
78
- dot(v_nrm, v_nrm) > 1e-20, v_nrm, torch.as_tensor([0.0, 0.0, 1.0]).to(v_nrm)
79
- )
80
- v_nrm = F.normalize(v_nrm, dim=1)
81
-
82
- if torch.is_anomaly_enabled():
83
- assert torch.all(torch.isfinite(v_nrm))
84
-
85
- return v_nrm
86
-
87
- def _compute_vertex_tangent(self):
88
- vn_idx = [None] * 3
89
- pos = [None] * 3
90
- tex = [None] * 3
91
- for i in range(0, 3):
92
- pos[i] = self.v_pos[self.t_pos_idx[:, i]]
93
- tex[i] = self.v_tex[self.t_pos_idx[:, i]]
94
- # t_nrm_idx is always the same as t_pos_idx
95
- vn_idx[i] = self.t_pos_idx[:, i]
96
-
97
- tangents = torch.zeros_like(self.v_nrm)
98
- tansum = torch.zeros_like(self.v_nrm)
99
-
100
- # Compute tangent space for each triangle
101
- duv1 = tex[1] - tex[0]
102
- duv2 = tex[2] - tex[0]
103
- dpos1 = pos[1] - pos[0]
104
- dpos2 = pos[2] - pos[0]
105
-
106
- tng_nom = dpos1 * duv2[..., 1:2] - dpos2 * duv1[..., 1:2]
107
-
108
- denom = duv1[..., 0:1] * duv2[..., 1:2] - duv1[..., 1:2] * duv2[..., 0:1]
109
-
110
- # Avoid division by zero for degenerated texture coordinates
111
- denom_safe = denom.clip(1e-6)
112
- tang = tng_nom / denom_safe
113
-
114
- # Update all 3 vertices
115
- for i in range(0, 3):
116
- idx = vn_idx[i][:, None].repeat(1, 3)
117
- tangents.scatter_add_(0, idx, tang) # tangents[n_i] = tangents[n_i] + tang
118
- tansum.scatter_add_(
119
- 0, idx, torch.ones_like(tang)
120
- ) # tansum[n_i] = tansum[n_i] + 1
121
- # Also normalize it. Here we do not normalize the individual triangles first so larger area
122
- # triangles influence the tangent space more
123
- tangents = tangents / tansum
124
-
125
- # Normalize and make sure tangent is perpendicular to normal
126
- tangents = F.normalize(tangents, dim=1)
127
- tangents = F.normalize(tangents - dot(tangents, self.v_nrm) * self.v_nrm)
128
-
129
- if torch.is_anomaly_enabled():
130
- assert torch.all(torch.isfinite(tangents))
131
-
132
- return tangents
133
-
134
- @torch.no_grad()
135
- def unwrap_uv(
136
- self,
137
- island_padding: float = 0.02,
138
- ) -> Mesh:
139
- uv, indices = box_projection_uv_unwrap(
140
- self.v_pos, self.v_nrm, self.t_pos_idx, island_padding
141
- )
142
-
143
- # Do store per vertex UVs.
144
- # This means we need to duplicate some vertices at the seams
145
- individual_vertices = self.v_pos[self.t_pos_idx].reshape(-1, 3)
146
- individual_faces = torch.arange(
147
- individual_vertices.shape[0],
148
- device=individual_vertices.device,
149
- dtype=self.t_pos_idx.dtype,
150
- ).reshape(-1, 3)
151
- uv_flat = uv[indices].reshape((-1, 2))
152
- # uv_flat[:, 1] = 1 - uv_flat[:, 1]
153
-
154
- self.v_pos = individual_vertices
155
- self.t_pos_idx = individual_faces
156
- self._v_tex = uv_flat
157
- self._v_nrm = self._compute_vertex_normal()
158
- self._v_tng = self._compute_vertex_tangent()
159
-
160
- def _compute_edges(self):
161
- # Compute edges
162
- edges = torch.cat(
163
- [
164
- self.t_pos_idx[:, [0, 1]],
165
- self.t_pos_idx[:, [1, 2]],
166
- self.t_pos_idx[:, [2, 0]],
167
- ],
168
- dim=0,
169
- )
170
- edges = edges.sort()[0]
171
- edges = torch.unique(edges, dim=0)
172
- return edges
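
Reviewer note on the mesh code above: `_compute_vertex_normal` accumulates *unnormalized* face normals onto their three vertices with `scatter_add_`, so larger triangles contribute more to each vertex normal, and degenerate vertices fall back to +Z before the final normalize. A minimal standalone sketch of that area-weighted splatting (hypothetical tensors, not tied to the `Mesh` class):

```python
import torch
import torch.nn.functional as F

def area_weighted_vertex_normals(v_pos: torch.Tensor, t_pos_idx: torch.Tensor) -> torch.Tensor:
    # v_pos: (V, 3) vertex positions, t_pos_idx: (F, 3) triangle vertex indices
    i0, i1, i2 = t_pos_idx[:, 0], t_pos_idx[:, 1], t_pos_idx[:, 2]
    v0, v1, v2 = v_pos[i0], v_pos[i1], v_pos[i2]
    # The cross product's length is proportional to triangle area, so this is area-weighted
    face_normals = torch.cross(v1 - v0, v2 - v0, dim=-1)
    v_nrm = torch.zeros_like(v_pos)
    for idx in (i0, i1, i2):
        v_nrm.scatter_add_(0, idx[:, None].repeat(1, 3), face_normals)
    # Vertices with a (near-)zero accumulated normal get a default +Z direction
    v_nrm = torch.where(
        (v_nrm * v_nrm).sum(-1, keepdim=True) > 1e-20,
        v_nrm,
        torch.tensor([0.0, 0.0, 1.0], device=v_pos.device),
    )
    return F.normalize(v_nrm, dim=-1)

# Example: a single right triangle in the XY plane -> all vertex normals point along +Z
verts = torch.tensor([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
faces = torch.tensor([[0, 1, 2]])
print(area_weighted_vertex_normals(verts, faces))
```
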
sf3d/models/network.py DELETED
@@ -1,195 +0,0 @@
1
- from dataclasses import dataclass, field
2
- from typing import Callable, List, Optional
3
-
4
- import torch
5
- import torch.nn as nn
6
- import torch.nn.functional as F
7
- from einops import rearrange
8
- from jaxtyping import Float
9
- from torch import Tensor
10
- from torch.autograd import Function
11
- from torch.cuda.amp import custom_bwd, custom_fwd
12
-
13
- from sf3d.models.utils import BaseModule, normalize
14
-
15
-
16
- class PixelShuffleUpsampleNetwork(BaseModule):
17
- @dataclass
18
- class Config(BaseModule.Config):
19
- in_channels: int = 1024
20
- out_channels: int = 40
21
- scale_factor: int = 4
22
-
23
- conv_layers: int = 4
24
- conv_kernel_size: int = 3
25
-
26
- cfg: Config
27
-
28
- def configure(self) -> None:
29
- layers = []
30
- output_channels = self.cfg.out_channels * self.cfg.scale_factor**2
31
-
32
- in_channels = self.cfg.in_channels
33
- for i in range(self.cfg.conv_layers):
34
- cur_out_channels = (
35
- in_channels if i != self.cfg.conv_layers - 1 else output_channels
36
- )
37
- layers.append(
38
- nn.Conv2d(
39
- in_channels,
40
- cur_out_channels,
41
- self.cfg.conv_kernel_size,
42
- padding=(self.cfg.conv_kernel_size - 1) // 2,
43
- )
44
- )
45
- if i != self.cfg.conv_layers - 1:
46
- layers.append(nn.ReLU(inplace=True))
47
-
48
- layers.append(nn.PixelShuffle(self.cfg.scale_factor))
49
-
50
- self.upsample = nn.Sequential(*layers)
51
-
52
- def forward(
53
- self, triplanes: Float[Tensor, "B 3 Ci Hp Wp"]
54
- ) -> Float[Tensor, "B 3 Co Hp2 Wp2"]:
55
- return rearrange(
56
- self.upsample(
57
- rearrange(triplanes, "B Np Ci Hp Wp -> (B Np) Ci Hp Wp", Np=3)
58
- ),
59
- "(B Np) Co Hp Wp -> B Np Co Hp Wp",
60
- Np=3,
61
- )
62
-
63
-
64
- class _TruncExp(Function): # pylint: disable=abstract-method
65
- # Implementation from torch-ngp:
66
- # https://github.com/ashawkey/torch-ngp/blob/93b08a0d4ec1cc6e69d85df7f0acdfb99603b628/activation.py
67
- @staticmethod
68
- @custom_fwd(cast_inputs=torch.float32)
69
- def forward(ctx, x): # pylint: disable=arguments-differ
70
- ctx.save_for_backward(x)
71
- return torch.exp(x)
72
-
73
- @staticmethod
74
- @custom_bwd
75
- def backward(ctx, g): # pylint: disable=arguments-differ
76
- x = ctx.saved_tensors[0]
77
- return g * torch.exp(torch.clamp(x, max=15))
78
-
79
-
80
- trunc_exp = _TruncExp.apply
81
-
82
-
83
- def get_activation(name) -> Callable:
84
- if name is None:
85
- return lambda x: x
86
- name = name.lower()
87
- if name == "none" or name == "linear" or name == "identity":
88
- return lambda x: x
89
- elif name == "lin2srgb":
90
- return lambda x: torch.where(
91
- x > 0.0031308,
92
- torch.pow(torch.clamp(x, min=0.0031308), 1.0 / 2.4) * 1.055 - 0.055,
93
- 12.92 * x,
94
- ).clamp(0.0, 1.0)
95
- elif name == "exp":
96
- return lambda x: torch.exp(x)
97
- elif name == "shifted_exp":
98
- return lambda x: torch.exp(x - 1.0)
99
- elif name == "trunc_exp":
100
- return trunc_exp
101
- elif name == "shifted_trunc_exp":
102
- return lambda x: trunc_exp(x - 1.0)
103
- elif name == "sigmoid":
104
- return lambda x: torch.sigmoid(x)
105
- elif name == "tanh":
106
- return lambda x: torch.tanh(x)
107
- elif name == "shifted_softplus":
108
- return lambda x: F.softplus(x - 1.0)
109
- elif name == "scale_-11_01":
110
- return lambda x: x * 0.5 + 0.5
111
- elif name == "negative":
112
- return lambda x: -x
113
- elif name == "normalize_channel_last":
114
- return lambda x: normalize(x)
115
- elif name == "normalize_channel_first":
116
- return lambda x: normalize(x, dim=1)
117
- else:
118
- try:
119
- return getattr(F, name)
120
- except AttributeError:
121
- raise ValueError(f"Unknown activation function: {name}")
122
-
123
-
124
- @dataclass
125
- class HeadSpec:
126
- name: str
127
- out_channels: int
128
- n_hidden_layers: int
129
- output_activation: Optional[str] = None
130
- out_bias: float = 0.0
131
-
132
-
133
- class MaterialMLP(BaseModule):
134
- @dataclass
135
- class Config(BaseModule.Config):
136
- in_channels: int = 120
137
- n_neurons: int = 64
138
- activation: str = "silu"
139
- heads: List[HeadSpec] = field(default_factory=lambda: [])
140
-
141
- cfg: Config
142
-
143
- def configure(self) -> None:
144
- assert len(self.cfg.heads) > 0
145
- heads = {}
146
- for head in self.cfg.heads:
147
- head_layers = []
148
- for i in range(head.n_hidden_layers):
149
- head_layers += [
150
- nn.Linear(
151
- self.cfg.in_channels if i == 0 else self.cfg.n_neurons,
152
- self.cfg.n_neurons,
153
- ),
154
- self.make_activation(self.cfg.activation),
155
- ]
156
- head_layers += [
157
- nn.Linear(
158
- self.cfg.n_neurons,
159
- head.out_channels,
160
- ),
161
- ]
162
- heads[head.name] = nn.Sequential(*head_layers)
163
- self.heads = nn.ModuleDict(heads)
164
-
165
- def make_activation(self, activation):
166
- if activation == "relu":
167
- return nn.ReLU(inplace=True)
168
- elif activation == "silu":
169
- return nn.SiLU(inplace=True)
170
- else:
171
- raise NotImplementedError
172
-
173
- def keys(self):
174
- return self.heads.keys()
175
-
176
- def forward(
177
- self, x, include: Optional[List] = None, exclude: Optional[List] = None
178
- ):
179
- if include is not None and exclude is not None:
180
- raise ValueError("Cannot specify both include and exclude.")
181
- if include is not None:
182
- heads = [h for h in self.cfg.heads if h.name in include]
183
- elif exclude is not None:
184
- heads = [h for h in self.cfg.heads if h.name not in exclude]
185
- else:
186
- heads = self.cfg.heads
187
-
188
- out = {
189
- head.name: get_activation(head.output_activation)(
190
- self.heads[head.name](x) + head.out_bias
191
- )
192
- for head in heads
193
- }
194
-
195
- return out
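
Reviewer note on `network.py`: the `PixelShuffleUpsampleNetwork` makes its final conv emit `out_channels * scale_factor**2` channels, which `nn.PixelShuffle` then trades for spatial resolution after the three planes are folded into the batch dimension. A rough shape-bookkeeping sketch using the default config values above (1024 input channels, 40 output channels, scale factor 4); the conv stack is replaced by a single stand-in layer:

```python
import torch
import torch.nn as nn
from einops import rearrange

in_ch, out_ch, scale = 1024, 40, 4  # defaults from the Config above

# Stand-in for the conv stack: only the channel bookkeeping matters for this sketch
head = nn.Conv2d(in_ch, out_ch * scale**2, kernel_size=3, padding=1)
shuffle = nn.PixelShuffle(scale)

triplanes = torch.randn(2, 3, in_ch, 96, 96)              # (B, Np, Ci, Hp, Wp)
x = rearrange(triplanes, "B Np C H W -> (B Np) C H W")    # fold the 3 planes into the batch
x = shuffle(head(x))                                      # (B*Np, 40, 384, 384)
out = rearrange(x, "(B Np) C H W -> B Np C H W", Np=3)
print(out.shape)  # torch.Size([2, 3, 40, 384, 384])
```
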
sf3d/models/tokenizers/dinov2.py DELETED
@@ -1,1196 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """PyTorch DINOv2 model."""
16
-
17
- import collections.abc
18
- import math
19
- from dataclasses import dataclass
20
- from typing import Dict, List, Optional, Set, Tuple, Union
21
-
22
- import torch
23
- import torch.nn.functional as F
24
- import torch.utils.checkpoint
25
- from torch import nn
26
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
27
- from transformers.activations import ACT2FN
28
- from transformers.modeling_outputs import (
29
- BackboneOutput,
30
- BaseModelOutput,
31
- BaseModelOutputWithPooling,
32
- ImageClassifierOutput,
33
- )
34
- from transformers.modeling_utils import PreTrainedModel
35
- from transformers.models.dinov2.configuration_dinov2 import Dinov2Config
36
- from transformers.pytorch_utils import (
37
- find_pruneable_heads_and_indices,
38
- prune_linear_layer,
39
- )
40
- from transformers.utils import (
41
- add_code_sample_docstrings,
42
- add_start_docstrings,
43
- add_start_docstrings_to_model_forward,
44
- logging,
45
- replace_return_docstrings,
46
- )
47
- from transformers.utils.backbone_utils import BackboneMixin
48
-
49
- logger = logging.get_logger(__name__)
50
-
51
- # General docstring
52
- _CONFIG_FOR_DOC = "Dinov2Config"
53
-
54
- # Base docstring
55
- _CHECKPOINT_FOR_DOC = "facebook/dinov2-base"
56
- _EXPECTED_OUTPUT_SHAPE = [1, 257, 768]
57
-
58
- # Image classification docstring
59
- _IMAGE_CLASS_CHECKPOINT = "facebook/dinov2-base"
60
-
61
-
62
- DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
63
- "facebook/dinov2-base",
64
- # See all DINOv2 models at https://huggingface.co/models?filter=dinov2
65
- ]
66
-
67
-
68
- class Dinov2Embeddings(nn.Module):
69
- """
70
- Construct the CLS token, mask token, position and patch embeddings.
71
- """
72
-
73
- def __init__(self, config: Dinov2Config) -> None:
74
- super().__init__()
75
-
76
- self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
77
- # register as mask token as it's not used in optimization
78
- # to avoid the use of find_unused_parameters_true
79
- # self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
80
- self.register_buffer("mask_token", torch.zeros(1, config.hidden_size))
81
- self.patch_embeddings = Dinov2PatchEmbeddings(config)
82
- num_patches = self.patch_embeddings.num_patches
83
- self.position_embeddings = nn.Parameter(
84
- torch.randn(1, num_patches + 1, config.hidden_size)
85
- )
86
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
87
- self.config = config
88
-
89
- def interpolate_pos_encoding(
90
- self, embeddings: torch.Tensor, height: int, width: int
91
- ) -> torch.Tensor:
92
- """
93
- This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
94
- resolution images.
95
-
96
- Source:
97
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
98
- """
99
-
100
- num_patches = embeddings.shape[1] - 1
101
- num_positions = self.position_embeddings.shape[1] - 1
102
- if num_patches == num_positions and height == width:
103
- return self.position_embeddings
104
- class_pos_embed = self.position_embeddings[:, 0]
105
- patch_pos_embed = self.position_embeddings[:, 1:]
106
- dim = embeddings.shape[-1]
107
- height = height // self.config.patch_size
108
- width = width // self.config.patch_size
109
- # we add a small number to avoid floating point error in the interpolation
110
- # see discussion at https://github.com/facebookresearch/dino/issues/8
111
- height, width = height + 0.1, width + 0.1
112
- patch_pos_embed = patch_pos_embed.reshape(
113
- 1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
114
- )
115
- patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
116
- patch_pos_embed = nn.functional.interpolate(
117
- patch_pos_embed,
118
- scale_factor=(
119
- height / math.sqrt(num_positions),
120
- width / math.sqrt(num_positions),
121
- ),
122
- mode="bicubic",
123
- align_corners=False,
124
- )
125
- if (
126
- int(height) != patch_pos_embed.shape[-2]
127
- or int(width) != patch_pos_embed.shape[-1]
128
- ):
129
- raise ValueError(
130
- "Width or height does not match with the interpolated position embeddings"
131
- )
132
- patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
133
- return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
134
-
135
- def forward(
136
- self,
137
- pixel_values: torch.Tensor,
138
- bool_masked_pos: Optional[torch.Tensor] = None,
139
- ) -> torch.Tensor:
140
- batch_size, _, height, width = pixel_values.shape
141
- patch_embeddings = self.patch_embeddings(pixel_values)
142
- embeddings = patch_embeddings
143
-
144
- if bool_masked_pos is not None:
145
- embeddings = torch.where(
146
- bool_masked_pos.unsqueeze(-1),
147
- self.mask_token.to(embeddings.dtype).unsqueeze(0),
148
- embeddings,
149
- )
150
-
151
- # add the [CLS] token to the embedded patch tokens
152
- cls_tokens = self.cls_token.expand(batch_size, -1, -1)
153
- embeddings = torch.cat((cls_tokens, embeddings), dim=1)
154
-
155
- # add positional encoding to each token
156
- embeddings = embeddings + self.interpolate_pos_encoding(
157
- embeddings, height, width
158
- )
159
-
160
- embeddings = self.dropout(embeddings)
161
-
162
- return embeddings
163
-
164
-
165
- class Dinov2PatchEmbeddings(nn.Module):
166
- """
167
- This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
168
- `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
169
- Transformer.
170
- """
171
-
172
- def __init__(self, config):
173
- super().__init__()
174
- image_size, patch_size = config.image_size, config.patch_size
175
- num_channels, hidden_size = config.num_channels, config.hidden_size
176
-
177
- image_size = (
178
- image_size
179
- if isinstance(image_size, collections.abc.Iterable)
180
- else (image_size, image_size)
181
- )
182
- patch_size = (
183
- patch_size
184
- if isinstance(patch_size, collections.abc.Iterable)
185
- else (patch_size, patch_size)
186
- )
187
- num_patches = (image_size[1] // patch_size[1]) * (
188
- image_size[0] // patch_size[0]
189
- )
190
- self.image_size = image_size
191
- self.patch_size = patch_size
192
- self.num_channels = num_channels
193
- self.num_patches = num_patches
194
-
195
- self.projection = nn.Conv2d(
196
- num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
197
- )
198
-
199
- def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
200
- """
201
- num_channels = pixel_values.shape[1]
202
- if num_channels != self.num_channels:
203
- raise ValueError(
204
- "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
205
- f" Expected {self.num_channels} but got {num_channels}."
206
- )
207
- """
208
- embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
209
- return embeddings
210
-
211
-
212
- # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->Dinov2
213
- class Dinov2SelfAttention(nn.Module):
214
- def __init__(self, config: Dinov2Config) -> None:
215
- super().__init__()
216
- if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
217
- config, "embedding_size"
218
- ):
219
- raise ValueError(
220
- f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
221
- f"heads {config.num_attention_heads}."
222
- )
223
-
224
- self.num_attention_heads = config.num_attention_heads
225
- self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
226
- self.all_head_size = self.num_attention_heads * self.attention_head_size
227
- self.attention_probs_dropout_prob = config.attention_probs_dropout_prob
228
-
229
- self.query = nn.Linear(
230
- config.hidden_size, self.all_head_size, bias=config.qkv_bias
231
- )
232
- self.key = nn.Linear(
233
- config.hidden_size, self.all_head_size, bias=config.qkv_bias
234
- )
235
- self.value = nn.Linear(
236
- config.hidden_size, self.all_head_size, bias=config.qkv_bias
237
- )
238
-
239
- self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
240
-
241
- def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
242
- new_x_shape = x.size()[:-1] + (
243
- self.num_attention_heads,
244
- self.attention_head_size,
245
- )
246
- x = x.view(new_x_shape)
247
- return x.permute(0, 2, 1, 3)
248
-
249
- def forward(
250
- self,
251
- hidden_states,
252
- head_mask: Optional[torch.Tensor] = None,
253
- output_attentions: bool = False,
254
- ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
255
- mixed_query_layer = self.query(hidden_states)
256
-
257
- if hasattr(F, "scaled_dot_product_attention"):
258
- assert head_mask is None and not output_attentions
259
- new_size = hidden_states.size()[:-1] + (
260
- self.num_attention_heads,
261
- self.attention_head_size,
262
- )
263
- key_layer = self.key(hidden_states).reshape(new_size).transpose(1, 2)
264
- value_layer = self.value(hidden_states).reshape(new_size).transpose(1, 2)
265
- query_layer = mixed_query_layer.reshape(new_size).transpose(1, 2)
266
- context_layer = F.scaled_dot_product_attention(
267
- query_layer,
268
- key_layer,
269
- value_layer,
270
- dropout_p=self.attention_probs_dropout_prob,
271
- is_causal=False,
272
- )
273
- context_layer = context_layer.transpose(1, 2).reshape(
274
- *hidden_states.size()[:-1], -1
275
- )
276
- else:
277
- key_layer = self.transpose_for_scores(self.key(hidden_states))
278
- value_layer = self.transpose_for_scores(self.value(hidden_states))
279
- query_layer = self.transpose_for_scores(mixed_query_layer)
280
-
281
- # Take the dot product between "query" and "key" to get the raw attention scores.
282
- attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
283
-
284
- attention_scores = attention_scores / math.sqrt(self.attention_head_size)
285
-
286
- # Normalize the attention scores to probabilities.
287
- attention_probs = nn.functional.softmax(attention_scores, dim=-1)
288
-
289
- # This is actually dropping out entire tokens to attend to, which might
290
- # seem a bit unusual, but is taken from the original Transformer paper.
291
- attention_probs = self.dropout(attention_probs)
292
-
293
- # Mask heads if we want to
294
- if head_mask is not None:
295
- attention_probs = attention_probs * head_mask
296
-
297
- context_layer = torch.matmul(attention_probs, value_layer)
298
-
299
- context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
300
- new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
301
- context_layer = context_layer.view(new_context_layer_shape)
302
-
303
- outputs = (
304
- (context_layer, attention_probs) if output_attentions else (context_layer,)
305
- )
306
-
307
- return outputs
308
-
309
-
310
- # Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Dinov2
311
- class Dinov2SelfOutput(nn.Module):
312
- """
313
- The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
314
- layernorm applied before each block.
315
- """
316
-
317
- def __init__(self, config: Dinov2Config) -> None:
318
- super().__init__()
319
- self.dense = nn.Linear(config.hidden_size, config.hidden_size)
320
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
321
-
322
- def forward(
323
- self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
324
- ) -> torch.Tensor:
325
- hidden_states = self.dense(hidden_states)
326
- hidden_states = self.dropout(hidden_states)
327
-
328
- return hidden_states
329
-
330
-
331
- # Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->Dinov2
332
- class Dinov2Attention(nn.Module):
333
- def __init__(self, config: Dinov2Config) -> None:
334
- super().__init__()
335
- self.attention = Dinov2SelfAttention(config)
336
- self.output = Dinov2SelfOutput(config)
337
- self.pruned_heads = set()
338
-
339
- def prune_heads(self, heads: Set[int]) -> None:
340
- if len(heads) == 0:
341
- return
342
- heads, index = find_pruneable_heads_and_indices(
343
- heads,
344
- self.attention.num_attention_heads,
345
- self.attention.attention_head_size,
346
- self.pruned_heads,
347
- )
348
-
349
- # Prune linear layers
350
- self.attention.query = prune_linear_layer(self.attention.query, index)
351
- self.attention.key = prune_linear_layer(self.attention.key, index)
352
- self.attention.value = prune_linear_layer(self.attention.value, index)
353
- self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
354
-
355
- # Update hyper params and store pruned heads
356
- self.attention.num_attention_heads = self.attention.num_attention_heads - len(
357
- heads
358
- )
359
- self.attention.all_head_size = (
360
- self.attention.attention_head_size * self.attention.num_attention_heads
361
- )
362
- self.pruned_heads = self.pruned_heads.union(heads)
363
-
364
- def forward(
365
- self,
366
- hidden_states: torch.Tensor,
367
- head_mask: Optional[torch.Tensor] = None,
368
- output_attentions: bool = False,
369
- ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
370
- self_outputs = self.attention(hidden_states, head_mask, output_attentions)
371
-
372
- attention_output = self.output(self_outputs[0], hidden_states)
373
-
374
- outputs = (attention_output,) + self_outputs[
375
- 1:
376
- ] # add attentions if we output them
377
- return outputs
378
-
379
-
380
- class Dinov2LayerScale(nn.Module):
381
- def __init__(self, config) -> None:
382
- super().__init__()
383
- self.lambda1 = nn.Parameter(
384
- config.layerscale_value * torch.ones(config.hidden_size)
385
- )
386
-
387
- def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
388
- return hidden_state * self.lambda1
389
-
390
-
391
- # Copied from transformers.models.beit.modeling_beit.drop_path
392
- def drop_path(
393
- input: torch.Tensor, drop_prob: float = 0.0, training: bool = False
394
- ) -> torch.Tensor:
395
- """
396
- Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
397
-
398
- Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
399
- however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
400
- See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
401
- layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
402
- argument.
403
- """
404
- if drop_prob == 0.0 or not training:
405
- return input
406
- keep_prob = 1 - drop_prob
407
- shape = (input.shape[0],) + (1,) * (
408
- input.ndim - 1
409
- ) # work with diff dim tensors, not just 2D ConvNets
410
- random_tensor = keep_prob + torch.rand(
411
- shape, dtype=input.dtype, device=input.device
412
- )
413
- random_tensor.floor_() # binarize
414
- output = input.div(keep_prob) * random_tensor
415
- return output
416
-
417
-
418
- # Copied from transformers.models.beit.modeling_beit.BeitDropPath
419
- class Dinov2DropPath(nn.Module):
420
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
421
-
422
- def __init__(self, drop_prob: Optional[float] = None) -> None:
423
- super().__init__()
424
- self.drop_prob = drop_prob
425
-
426
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
427
- return drop_path(hidden_states, self.drop_prob, self.training)
428
-
429
- def extra_repr(self) -> str:
430
- return "p={}".format(self.drop_prob)
431
-
432
-
433
- class Dinov2MLP(nn.Module):
434
- def __init__(self, config) -> None:
435
- super().__init__()
436
- in_features = out_features = config.hidden_size
437
- hidden_features = int(config.hidden_size * config.mlp_ratio)
438
- self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
439
- if isinstance(config.hidden_act, str):
440
- self.activation = ACT2FN[config.hidden_act]
441
- else:
442
- self.activation = config.hidden_act
443
- self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
444
-
445
- def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
446
- hidden_state = self.fc1(hidden_state)
447
- hidden_state = self.activation(hidden_state)
448
- hidden_state = self.fc2(hidden_state)
449
- return hidden_state
450
-
451
-
452
- class Dinov2SwiGLUFFN(nn.Module):
453
- def __init__(self, config) -> None:
454
- super().__init__()
455
- in_features = out_features = config.hidden_size
456
- hidden_features = int(config.hidden_size * config.mlp_ratio)
457
- hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
458
-
459
- self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
460
- self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
461
-
462
- def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
463
- hidden_state = self.weights_in(hidden_state)
464
- x1, x2 = hidden_state.chunk(2, dim=-1)
465
- hidden = nn.functional.silu(x1) * x2
466
- return self.weights_out(hidden)
467
-
468
-
469
- class Dinov2Layer(nn.Module):
470
- """This corresponds to the Block class in the original implementation."""
471
-
472
- def __init__(self, config: Dinov2Config) -> None:
473
- super().__init__()
474
-
475
- self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
476
- self.norm1_modulation = None
477
- self.attention = Dinov2Attention(config)
478
- self.layer_scale1 = Dinov2LayerScale(config)
479
- self.drop_path1 = (
480
- Dinov2DropPath(config.drop_path_rate)
481
- if config.drop_path_rate > 0.0
482
- else nn.Identity()
483
- )
484
-
485
- self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
486
- self.norm2_modulation = None
487
-
488
- if config.use_swiglu_ffn:
489
- self.mlp = Dinov2SwiGLUFFN(config)
490
- else:
491
- self.mlp = Dinov2MLP(config)
492
- self.layer_scale2 = Dinov2LayerScale(config)
493
- self.drop_path2 = (
494
- Dinov2DropPath(config.drop_path_rate)
495
- if config.drop_path_rate > 0.0
496
- else nn.Identity()
497
- )
498
-
499
- def forward(
500
- self,
501
- hidden_states: torch.Tensor,
502
- head_mask: Optional[torch.Tensor] = None,
503
- modulation_cond: Optional[torch.Tensor] = None,
504
- output_attentions: bool = False,
505
- ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
506
- hidden_states_norm = self.norm1(hidden_states)
507
- if self.norm1_modulation is not None:
508
- assert modulation_cond is not None
509
- hidden_states_norm = self.norm1_modulation(
510
- hidden_states_norm, modulation_cond
511
- )
512
- self_attention_outputs = self.attention(
513
- hidden_states_norm, # in Dinov2, layernorm is applied before self-attention
514
- head_mask,
515
- output_attentions=output_attentions,
516
- )
517
- attention_output = self_attention_outputs[0]
518
-
519
- attention_output = self.layer_scale1(attention_output)
520
- outputs = self_attention_outputs[
521
- 1:
522
- ] # add self attentions if we output attention weights
523
-
524
- # first residual connection
525
- hidden_states = attention_output + hidden_states
526
-
527
- # in Dinov2, layernorm is also applied after self-attention
528
- layer_output = self.norm2(hidden_states)
529
- if self.norm2_modulation is not None:
530
- assert modulation_cond is not None
531
- layer_output = self.norm2_modulation(layer_output, modulation_cond)
532
- layer_output = self.mlp(layer_output)
533
- layer_output = self.layer_scale2(layer_output)
534
-
535
- # second residual connection
536
- layer_output = layer_output + hidden_states
537
-
538
- outputs = (layer_output,) + outputs
539
-
540
- return outputs
541
-
542
- def register_ada_norm_modulation(self, norm1_mod: nn.Module, norm2_mod: nn.Module):
543
- self.norm1_modulation = norm1_mod
544
- self.norm2_modulation = norm2_mod
545
-
546
-
547
- # Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->Dinov2
548
- class Dinov2Encoder(nn.Module):
549
- def __init__(self, config: Dinov2Config) -> None:
550
- super().__init__()
551
- self.config = config
552
- self.layer = nn.ModuleList(
553
- [Dinov2Layer(config) for _ in range(config.num_hidden_layers)]
554
- )
555
- self.gradient_checkpointing = False
556
-
557
- def forward(
558
- self,
559
- hidden_states: torch.Tensor,
560
- head_mask: Optional[torch.Tensor] = None,
561
- modulation_cond: Optional[torch.Tensor] = None,
562
- output_attentions: bool = False,
563
- output_hidden_states: bool = False,
564
- return_dict: bool = True,
565
- ) -> Union[tuple, BaseModelOutput]:
566
- all_hidden_states = () if output_hidden_states else None
567
- all_self_attentions = () if output_attentions else None
568
-
569
- for i, layer_module in enumerate(self.layer):
570
- if output_hidden_states:
571
- all_hidden_states = all_hidden_states + (hidden_states,)
572
-
573
- layer_head_mask = head_mask[i] if head_mask is not None else None
574
-
575
- if self.gradient_checkpointing and self.training:
576
-
577
- def create_custom_forward(module):
578
- def custom_forward(*inputs):
579
- return module(*inputs, output_attentions)
580
-
581
- return custom_forward
582
-
583
- layer_outputs = torch.utils.checkpoint.checkpoint(
584
- create_custom_forward(layer_module),
585
- hidden_states,
586
- layer_head_mask,
587
- modulation_cond,
588
- use_reentrant=False,
589
- )
590
- else:
591
- layer_outputs = layer_module(
592
- hidden_states, layer_head_mask, modulation_cond, output_attentions
593
- )
594
-
595
- hidden_states = layer_outputs[0]
596
-
597
- if output_attentions:
598
- all_self_attentions = all_self_attentions + (layer_outputs[1],)
599
-
600
- if output_hidden_states:
601
- all_hidden_states = all_hidden_states + (hidden_states,)
602
-
603
- if not return_dict:
604
- return tuple(
605
- v
606
- for v in [hidden_states, all_hidden_states, all_self_attentions]
607
- if v is not None
608
- )
609
- return BaseModelOutput(
610
- last_hidden_state=hidden_states,
611
- hidden_states=all_hidden_states,
612
- attentions=all_self_attentions,
613
- )
614
-
615
-
616
- class Dinov2PreTrainedModel(PreTrainedModel):
617
- """
618
- An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
619
- models.
620
- """
621
-
622
- config_class = Dinov2Config
623
- base_model_prefix = "dinov2"
624
- main_input_name = "pixel_values"
625
- supports_gradient_checkpointing = True
626
-
627
- def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
628
- """Initialize the weights"""
629
- if isinstance(module, (nn.Linear, nn.Conv2d)):
630
- # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
631
- # `trunc_normal_cpu` not implemented in `half` issues
632
- module.weight.data = nn.init.trunc_normal_(
633
- module.weight.data.to(torch.float32),
634
- mean=0.0,
635
- std=self.config.initializer_range,
636
- ).to(module.weight.dtype)
637
- if module.bias is not None:
638
- module.bias.data.zero_()
639
- elif isinstance(module, nn.LayerNorm):
640
- module.bias.data.zero_()
641
- module.weight.data.fill_(1.0)
642
- elif isinstance(module, Dinov2Embeddings):
643
- module.position_embeddings.data = nn.init.trunc_normal_(
644
- module.position_embeddings.data.to(torch.float32),
645
- mean=0.0,
646
- std=self.config.initializer_range,
647
- ).to(module.position_embeddings.dtype)
648
-
649
- module.cls_token.data = nn.init.trunc_normal_(
650
- module.cls_token.data.to(torch.float32),
651
- mean=0.0,
652
- std=self.config.initializer_range,
653
- ).to(module.cls_token.dtype)
654
-
655
- def _set_gradient_checkpointing(
656
- self, module: Dinov2Encoder, value: bool = False
657
- ) -> None:
658
- if isinstance(module, Dinov2Encoder):
659
- module.gradient_checkpointing = value
660
-
661
-
662
- DINOV2_START_DOCSTRING = r"""
663
- This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
664
- as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
665
- behavior.
666
-
667
- Parameters:
668
- config ([`Dinov2Config`]): Model configuration class with all the parameters of the model.
669
- Initializing with a config file does not load the weights associated with the model, only the
670
- configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
671
- """
672
-
673
- DINOV2_BASE_INPUTS_DOCSTRING = r"""
674
- Args:
675
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
676
- Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
677
- [`BitImageProcessor.preprocess`] for details.
678
-
679
- bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
680
- Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
681
- pre-training.
682
-
683
- head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
684
- Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
685
-
686
- - 1 indicates the head is **not masked**,
687
- - 0 indicates the head is **masked**.
688
-
689
- output_attentions (`bool`, *optional*):
690
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
691
- tensors for more detail.
692
- output_hidden_states (`bool`, *optional*):
693
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
694
- more detail.
695
- return_dict (`bool`, *optional*):
696
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
697
- """
698
-
699
- DINOV2_INPUTS_DOCSTRING = r"""
700
- Args:
701
- pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
702
- Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
703
- [`BitImageProcessor.preprocess`] for details.
704
-
705
- head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
706
- Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
707
-
708
- - 1 indicates the head is **not masked**,
709
- - 0 indicates the head is **masked**.
710
-
711
- output_attentions (`bool`, *optional*):
712
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
713
- tensors for more detail.
714
- output_hidden_states (`bool`, *optional*):
715
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
716
- more detail.
717
- return_dict (`bool`, *optional*):
718
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
719
- """
720
-
721
-
722
- @dataclass
723
- class CustomBaseModelOutputWithPooling(BaseModelOutputWithPooling):
724
- patch_embeddings: Optional[torch.FloatTensor] = None
725
-
726
-
727
- @add_start_docstrings(
728
- "The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.",
729
- DINOV2_START_DOCSTRING,
730
- )
731
- class Dinov2Model(Dinov2PreTrainedModel):
732
- def __init__(self, config: Dinov2Config):
733
- super().__init__(config)
734
- self.config = config
735
-
736
- self.embeddings = Dinov2Embeddings(config)
737
- self.encoder = Dinov2Encoder(config)
738
-
739
- self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
740
-
741
- # Initialize weights and apply final processing
742
- self.post_init()
743
-
744
- def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
745
- return self.embeddings.patch_embeddings
746
-
747
- def expand_input_channels(self, extra_input_channels: int) -> None:
748
- if extra_input_channels == 0:
749
- return
750
- conv_old = self.embeddings.patch_embeddings.projection
751
- conv_new = nn.Conv2d(
752
- self.config.num_channels + extra_input_channels,
753
- self.config.hidden_size,
754
- kernel_size=self.config.patch_size,
755
- stride=self.config.patch_size,
756
- ).to(self.device)
757
- with torch.no_grad():
758
- conv_new.weight[:, :3] = conv_old.weight
759
- conv_new.bias = conv_old.bias
760
- self.embeddings.patch_embeddings.projection = conv_new
761
- del conv_old
762
-
763
- def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
764
- """
765
- Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
766
- class PreTrainedModel
767
- """
768
- for layer, heads in heads_to_prune.items():
769
- self.encoder.layer[layer].attention.prune_heads(heads)
770
-
771
- @add_start_docstrings_to_model_forward(DINOV2_BASE_INPUTS_DOCSTRING)
772
- @add_code_sample_docstrings(
773
- checkpoint=_CHECKPOINT_FOR_DOC,
774
- output_type=BaseModelOutputWithPooling,
775
- config_class=_CONFIG_FOR_DOC,
776
- modality="vision",
777
- expected_output=_EXPECTED_OUTPUT_SHAPE,
778
- )
779
- def forward(
780
- self,
781
- pixel_values: Optional[torch.Tensor] = None,
782
- bool_masked_pos: Optional[torch.Tensor] = None,
783
- head_mask: Optional[torch.Tensor] = None,
784
- modulation_cond: Optional[torch.Tensor] = None,
785
- output_attentions: Optional[bool] = None,
786
- output_hidden_states: Optional[bool] = None,
787
- return_dict: Optional[bool] = None,
788
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
789
- output_attentions = (
790
- output_attentions
791
- if output_attentions is not None
792
- else self.config.output_attentions
793
- )
794
- output_hidden_states = (
795
- output_hidden_states
796
- if output_hidden_states is not None
797
- else self.config.output_hidden_states
798
- )
799
- return_dict = (
800
- return_dict if return_dict is not None else self.config.use_return_dict
801
- )
802
-
803
- if pixel_values is None:
804
- raise ValueError("You have to specify pixel_values")
805
-
806
- # Prepare head mask if needed
807
- # 1.0 in head_mask indicate we keep the head
808
- # attention_probs has shape bsz x n_heads x N x N
809
- # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
810
- # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
811
- head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
812
-
813
- embedding_output = self.embeddings(
814
- pixel_values, bool_masked_pos=bool_masked_pos
815
- )
816
-
817
- encoder_outputs = self.encoder(
818
- embedding_output,
819
- head_mask=head_mask,
820
- modulation_cond=modulation_cond,
821
- output_attentions=output_attentions,
822
- output_hidden_states=output_hidden_states,
823
- return_dict=return_dict,
824
- )
825
- sequence_output = encoder_outputs[0]
826
- sequence_output = self.layernorm(sequence_output)
827
- pooled_output = sequence_output[:, 0, :]
828
-
829
- if not return_dict:
830
- head_outputs = (sequence_output, pooled_output)
831
- return head_outputs + encoder_outputs[1:]
832
-
833
- return CustomBaseModelOutputWithPooling(
834
- last_hidden_state=sequence_output,
835
- pooler_output=pooled_output,
836
- hidden_states=encoder_outputs.hidden_states,
837
- attentions=encoder_outputs.attentions,
838
- patch_embeddings=embedding_output,
839
- )
840
-
841
- def set_gradient_checkpointing(self, value: bool = False) -> None:
842
- self._set_gradient_checkpointing(self.encoder, value)
843
-
844
-
845
- @add_start_docstrings(
846
- """
847
- Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
848
- of the [CLS] token) e.g. for ImageNet.
849
- """,
850
- DINOV2_START_DOCSTRING,
851
- )
852
- class Dinov2ForImageClassification(Dinov2PreTrainedModel):
853
- def __init__(self, config: Dinov2Config) -> None:
854
- super().__init__(config)
855
-
856
- self.num_labels = config.num_labels
857
- self.dinov2 = Dinov2Model(config)
858
-
859
- # Classifier head
860
- self.classifier = (
861
- nn.Linear(config.hidden_size * 2, config.num_labels)
862
- if config.num_labels > 0
863
- else nn.Identity()
864
- )
865
-
866
- # Initialize weights and apply final processing
867
- self.post_init()
868
-
869
- @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
870
- @add_code_sample_docstrings(
871
- checkpoint=_IMAGE_CLASS_CHECKPOINT,
872
- output_type=ImageClassifierOutput,
873
- config_class=_CONFIG_FOR_DOC,
874
- )
875
- def forward(
876
- self,
877
- pixel_values: Optional[torch.Tensor] = None,
878
- head_mask: Optional[torch.Tensor] = None,
879
- labels: Optional[torch.Tensor] = None,
880
- output_attentions: Optional[bool] = None,
881
- output_hidden_states: Optional[bool] = None,
882
- return_dict: Optional[bool] = None,
883
- ) -> Union[tuple, ImageClassifierOutput]:
884
- r"""
885
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
886
- Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
887
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
888
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
889
- """
890
- return_dict = (
891
- return_dict if return_dict is not None else self.config.use_return_dict
892
- )
893
-
894
- outputs = self.dinov2(
895
- pixel_values,
896
- head_mask=head_mask,
897
- output_attentions=output_attentions,
898
- output_hidden_states=output_hidden_states,
899
- return_dict=return_dict,
900
- )
901
-
902
- sequence_output = outputs[0] # batch_size, sequence_length, hidden_size
903
-
904
- cls_token = sequence_output[:, 0]
905
- patch_tokens = sequence_output[:, 1:]
906
-
907
- linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
908
-
909
- logits = self.classifier(linear_input)
910
-
911
- loss = None
912
- if labels is not None:
913
- # move labels to correct device to enable model parallelism
914
- labels = labels.to(logits.device)
915
- if self.config.problem_type is None:
916
- if self.num_labels == 1:
917
- self.config.problem_type = "regression"
918
- elif self.num_labels > 1 and (
919
- labels.dtype == torch.long or labels.dtype == torch.int
920
- ):
921
- self.config.problem_type = "single_label_classification"
922
- else:
923
- self.config.problem_type = "multi_label_classification"
924
-
925
- if self.config.problem_type == "regression":
926
- loss_fct = MSELoss()
927
- if self.num_labels == 1:
928
- loss = loss_fct(logits.squeeze(), labels.squeeze())
929
- else:
930
- loss = loss_fct(logits, labels)
931
- elif self.config.problem_type == "single_label_classification":
932
- loss_fct = CrossEntropyLoss()
933
- loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
934
- elif self.config.problem_type == "multi_label_classification":
935
- loss_fct = BCEWithLogitsLoss()
936
- loss = loss_fct(logits, labels)
937
-
938
- if not return_dict:
939
- output = (logits,) + outputs[2:]
940
- return ((loss,) + output) if loss is not None else output
941
-
942
- return ImageClassifierOutput(
943
- loss=loss,
944
- logits=logits,
945
- hidden_states=outputs.hidden_states,
946
- attentions=outputs.attentions,
947
- )
948
-
949
-
950
- @add_start_docstrings(
951
- """
952
- Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
953
- """,
954
- DINOV2_START_DOCSTRING,
955
- )
956
- class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
957
- def __init__(self, config):
958
- super().__init__(config)
959
- super()._init_backbone(config)
960
-
961
- self.num_features = [
962
- config.hidden_size for _ in range(config.num_hidden_layers + 1)
963
- ]
964
- self.embeddings = Dinov2Embeddings(config)
965
- self.encoder = Dinov2Encoder(config)
966
-
967
- self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
968
-
969
- # Initialize weights and apply final processing
970
- self.post_init()
971
-
972
- def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
973
- return self.embeddings.patch_embeddings
974
-
975
- @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
976
- @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
977
- def forward(
978
- self,
979
- pixel_values: torch.Tensor,
980
- output_hidden_states: Optional[bool] = None,
981
- output_attentions: Optional[bool] = None,
982
- return_dict: Optional[bool] = None,
983
- ) -> BackboneOutput:
984
- """
985
- Returns:
986
-
987
- Examples:
988
-
989
- ```python
990
- >>> from transformers import AutoImageProcessor, AutoBackbone
991
- >>> import torch
992
- >>> from PIL import Image
993
- >>> import requests
994
-
995
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
996
- >>> image = Image.open(requests.get(url, stream=True).raw)
997
-
998
- >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
999
- >>> model = AutoBackbone.from_pretrained(
1000
- ... "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
1001
- ... )
1002
-
1003
- >>> inputs = processor(image, return_tensors="pt")
1004
-
1005
- >>> outputs = model(**inputs)
1006
- >>> feature_maps = outputs.feature_maps
1007
- >>> list(feature_maps[-1].shape)
1008
- [1, 768, 16, 16]
1009
- ```"""
1010
- return_dict = (
1011
- return_dict if return_dict is not None else self.config.use_return_dict
1012
- )
1013
- output_hidden_states = (
1014
- output_hidden_states
1015
- if output_hidden_states is not None
1016
- else self.config.output_hidden_states
1017
- )
1018
- output_attentions = (
1019
- output_attentions
1020
- if output_attentions is not None
1021
- else self.config.output_attentions
1022
- )
1023
-
1024
- embedding_output = self.embeddings(pixel_values)
1025
-
1026
- outputs = self.encoder(
1027
- embedding_output,
1028
- output_hidden_states=True,
1029
- output_attentions=output_attentions,
1030
- return_dict=return_dict,
1031
- )
1032
-
1033
- hidden_states = outputs.hidden_states if return_dict else outputs[1]
1034
-
1035
- feature_maps = ()
1036
- for stage, hidden_state in zip(self.stage_names, hidden_states):
1037
- if stage in self.out_features:
1038
- if self.config.apply_layernorm:
1039
- hidden_state = self.layernorm(hidden_state)
1040
- if self.config.reshape_hidden_states:
1041
- batch_size, _, height, width = pixel_values.shape
1042
- patch_size = self.config.patch_size
1043
- hidden_state = hidden_state[:, 1:, :].reshape(
1044
- batch_size, width // patch_size, height // patch_size, -1
1045
- )
1046
- hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
1047
- feature_maps += (hidden_state,)
1048
-
1049
- if not return_dict:
1050
- if output_hidden_states:
1051
- output = (feature_maps,) + outputs[1:]
1052
- else:
1053
- output = (feature_maps,) + outputs[2:]
1054
- return output
1055
-
1056
- return BackboneOutput(
1057
- feature_maps=feature_maps,
1058
- hidden_states=outputs.hidden_states if output_hidden_states else None,
1059
- attentions=outputs.attentions if output_attentions else None,
1060
- )
1061
-
1062
-
1063
- class CustomPatchEmbeddings(nn.Module):
1064
- """
1065
- This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
1066
- `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
1067
- Transformer.
1068
- """
1069
-
1070
- def __init__(
1071
- self, image_size: int, patch_size: int, num_channels: int, hidden_size: int
1072
- ):
1073
- super().__init__()
1074
-
1075
- image_size = (
1076
- image_size
1077
- if isinstance(image_size, collections.abc.Iterable)
1078
- else (image_size, image_size)
1079
- )
1080
- patch_size = (
1081
- patch_size
1082
- if isinstance(patch_size, collections.abc.Iterable)
1083
- else (patch_size, patch_size)
1084
- )
1085
- num_patches = (image_size[1] // patch_size[1]) * (
1086
- image_size[0] // patch_size[0]
1087
- )
1088
- self.image_size = image_size
1089
- self.patch_size = patch_size
1090
- self.num_channels = num_channels
1091
- self.num_patches = num_patches
1092
-
1093
- self.projection = nn.Conv2d(
1094
- num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
1095
- )
1096
-
1097
- def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
1098
- num_channels = pixel_values.shape[1]
1099
- if num_channels != self.num_channels:
1100
- raise ValueError(
1101
- "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
1102
- f" Expected {self.num_channels} but got {num_channels}."
1103
- )
1104
- embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
1105
- return embeddings
1106
-
1107
-
1108
- class CustomEmbeddings(nn.Module):
1109
- """
1110
- Construct the CLS token, mask token, position and patch embeddings.
1111
- """
1112
-
1113
- def __init__(
1114
- self, image_size: int, patch_size: int, num_channels: int, hidden_size: int
1115
- ) -> None:
1116
- super().__init__()
1117
-
1118
- self.image_size = image_size
1119
- self.patch_size = patch_size
1120
- self.num_channels = num_channels
1121
- self.hidden_size = hidden_size
1122
-
1123
- self.cls_token = nn.Parameter(torch.randn(1, 1, self.hidden_size))
1124
-
1125
- self.patch_embeddings = CustomPatchEmbeddings(
1126
- image_size, patch_size, num_channels, hidden_size
1127
- )
1128
- num_patches = self.patch_embeddings.num_patches
1129
- self.position_embeddings = nn.Parameter(
1130
- torch.randn(1, num_patches + 1, self.hidden_size)
1131
- )
1132
-
1133
- def interpolate_pos_encoding(
1134
- self, embeddings: torch.Tensor, height: int, width: int
1135
- ) -> torch.Tensor:
1136
- """
1137
- This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
1138
- resolution images.
1139
-
1140
- Source:
1141
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
1142
- """
1143
-
1144
- num_patches = embeddings.shape[1] - 1
1145
- num_positions = self.position_embeddings.shape[1] - 1
1146
- if num_patches == num_positions and height == width:
1147
- return self.position_embeddings
1148
- class_pos_embed = self.position_embeddings[:, 0]
1149
- patch_pos_embed = self.position_embeddings[:, 1:]
1150
- dim = embeddings.shape[-1]
1151
- height = height // self.patch_size
1152
- width = width // self.patch_size
1153
- # we add a small number to avoid floating point error in the interpolation
1154
- # see discussion at https://github.com/facebookresearch/dino/issues/8
1155
- height, width = height + 0.1, width + 0.1
1156
- patch_pos_embed = patch_pos_embed.reshape(
1157
- 1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
1158
- )
1159
- patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
1160
- patch_pos_embed = nn.functional.interpolate(
1161
- patch_pos_embed,
1162
- scale_factor=(
1163
- height / math.sqrt(num_positions),
1164
- width / math.sqrt(num_positions),
1165
- ),
1166
- mode="bicubic",
1167
- align_corners=False,
1168
- )
1169
- if (
1170
- int(height) != patch_pos_embed.shape[-2]
1171
- or int(width) != patch_pos_embed.shape[-1]
1172
- ):
1173
- raise ValueError(
1174
- "Width or height does not match with the interpolated position embeddings"
1175
- )
1176
- patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
1177
- return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
1178
-
1179
- def forward(
1180
- self,
1181
- pixel_values: torch.Tensor,
1182
- ) -> torch.Tensor:
1183
- batch_size, _, height, width = pixel_values.shape
1184
- patch_embeddings = self.patch_embeddings(pixel_values)
1185
- embeddings = patch_embeddings
1186
-
1187
- # add the [CLS] token to the embedded patch tokens
1188
- cls_tokens = self.cls_token.expand(batch_size, -1, -1)
1189
- embeddings = torch.cat((cls_tokens, embeddings), dim=1)
1190
-
1191
- # add positional encoding to each token
1192
- embeddings = embeddings + self.interpolate_pos_encoding(
1193
- embeddings, height, width
1194
- )
1195
-
1196
- return embeddings
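
Reviewer note on `dinov2.py`: this file closely tracks the upstream `transformers` DINOv2 implementation, with local tweaks such as the buffer-registered mask token, the SDPA fast path, the AdaLN modulation hooks (`register_ada_norm_modulation`), and `expand_input_channels`. The recurring trick worth a note is `interpolate_pos_encoding`: the learned patch position embeddings are reshaped back to their square grid and bicubically resized so the ViT can run at a different input resolution. A stripped-down sketch of just that resampling (hypothetical sizes, independent of the classes above, and using `size=` instead of the scale-factor form):

```python
import math
import torch
import torch.nn.functional as F

def resize_patch_pos_embed(pos_embed: torch.Tensor, new_h: int, new_w: int) -> torch.Tensor:
    """pos_embed: (1, N, dim) patch-only embeddings laid out on a sqrt(N) x sqrt(N) grid."""
    n, dim = pos_embed.shape[1], pos_embed.shape[2]
    side = int(math.sqrt(n))
    grid = pos_embed.reshape(1, side, side, dim).permute(0, 3, 1, 2)  # (1, dim, side, side)
    grid = F.interpolate(grid, size=(new_h, new_w), mode="bicubic", align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(1, new_h * new_w, dim)

# e.g. a 16x16 grid of 768-d embeddings resampled for a 20x30 patch grid
pos = torch.randn(1, 256, 768)
print(resize_patch_pos_embed(pos, 20, 30).shape)  # torch.Size([1, 600, 768])
```
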
sf3d/models/tokenizers/image.py DELETED
@@ -1,99 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Optional
3
-
4
- import torch
5
- import torch.nn as nn
6
- from einops import rearrange
7
- from jaxtyping import Float
8
- from torch import Tensor
9
-
10
- from sf3d.models.tokenizers.dinov2 import Dinov2Model
11
- from sf3d.models.transformers.attention import Modulation
12
- from sf3d.models.utils import BaseModule
13
-
14
-
15
- class DINOV2SingleImageTokenizer(BaseModule):
16
- @dataclass
17
- class Config(BaseModule.Config):
18
- pretrained_model_name_or_path: str = "facebook/dinov2-large"
19
- width: int = 512
20
- height: int = 512
21
- modulation_cond_dim: int = 768
22
-
23
- cfg: Config
24
-
25
- def configure(self) -> None:
26
- self.model = Dinov2Model.from_pretrained(self.cfg.pretrained_model_name_or_path)
27
-
28
- for p in self.model.parameters():
29
- p.requires_grad_(False)
30
- self.model.eval()
31
-
32
- self.model.set_gradient_checkpointing(False)
33
-
34
- # add modulation
35
- modulations = []
36
- for layer in self.model.encoder.layer:
37
- norm1_modulation = Modulation(
38
- self.model.config.hidden_size,
39
- self.cfg.modulation_cond_dim,
40
- zero_init=True,
41
- single_layer=True,
42
- )
43
- norm2_modulation = Modulation(
44
- self.model.config.hidden_size,
45
- self.cfg.modulation_cond_dim,
46
- zero_init=True,
47
- single_layer=True,
48
- )
49
- layer.register_ada_norm_modulation(norm1_modulation, norm2_modulation)
50
- modulations += [norm1_modulation, norm2_modulation]
51
- self.modulations = nn.ModuleList(modulations)
52
-
53
- self.register_buffer(
54
- "image_mean",
55
- torch.as_tensor([0.485, 0.456, 0.406]).reshape(1, 1, 3, 1, 1),
56
- persistent=False,
57
- )
58
- self.register_buffer(
59
- "image_std",
60
- torch.as_tensor([0.229, 0.224, 0.225]).reshape(1, 1, 3, 1, 1),
61
- persistent=False,
62
- )
63
-
64
- def forward(
65
- self,
66
- images: Float[Tensor, "B *N C H W"],
67
- modulation_cond: Optional[Float[Tensor, "B *N Cc"]],
68
- **kwargs,
69
- ) -> Float[Tensor, "B *N Ct Nt"]:
70
- model = self.model
71
-
72
- packed = False
73
- if images.ndim == 4:
74
- packed = True
75
- images = images.unsqueeze(1)
76
- if modulation_cond is not None:
77
- assert modulation_cond.ndim == 2
78
- modulation_cond = modulation_cond.unsqueeze(1)
79
-
80
- batch_size, n_input_views = images.shape[:2]
81
- images = (images - self.image_mean) / self.image_std
82
- out = model(
83
- rearrange(images, "B N C H W -> (B N) C H W"),
84
- modulation_cond=rearrange(modulation_cond, "B N Cc -> (B N) Cc")
85
- if modulation_cond is not None
86
- else None,
87
- )
88
- local_features = out.last_hidden_state
89
- local_features = local_features.permute(0, 2, 1)
90
- local_features = rearrange(
91
- local_features, "(B N) Ct Nt -> B N Ct Nt", B=batch_size
92
- )
93
- if packed:
94
- local_features = local_features.squeeze(1)
95
-
96
- return local_features
97
-
98
- def detokenize(self, *args, **kwargs):
99
- raise NotImplementedError
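
For reference, a minimal usage sketch of the tokenizer removed above. The config values and tensor shapes are illustrative assumptions, not values taken from the released config:

import torch
from sf3d.models.tokenizers.image import DINOV2SingleImageTokenizer

# Hypothetical config; the real values live in the model's config.yaml
tokenizer = DINOV2SingleImageTokenizer(
    {"pretrained_model_name_or_path": "facebook/dinov2-large",
     "modulation_cond_dim": 768}
)
images = torch.rand(2, 3, 512, 512)   # [B, C, H, W] in [0, 1]; normalization happens inside
camera_cond = torch.rand(2, 768)      # per-image modulation condition
tokens = tokenizer(images, modulation_cond=camera_cond)  # -> [B, Ct, Nt]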
 
sf3d/models/tokenizers/triplane.py DELETED
@@ -1,49 +0,0 @@
1
- import math
2
- from dataclasses import dataclass
3
-
4
- import torch
5
- import torch.nn as nn
6
- from einops import rearrange, repeat
7
- from jaxtyping import Float
8
- from torch import Tensor
9
-
10
- from sf3d.models.utils import BaseModule
11
-
12
-
13
- class TriplaneLearnablePositionalEmbedding(BaseModule):
14
- @dataclass
15
- class Config(BaseModule.Config):
16
- plane_size: int = 96
17
- num_channels: int = 1024
18
-
19
- cfg: Config
20
-
21
- def configure(self) -> None:
22
- self.embeddings = nn.Parameter(
23
- torch.randn(
24
- (3, self.cfg.num_channels, self.cfg.plane_size, self.cfg.plane_size),
25
- dtype=torch.float32,
26
- )
27
- * 1
28
- / math.sqrt(self.cfg.num_channels)
29
- )
30
-
31
- def forward(self, batch_size: int) -> Float[Tensor, "B Ct Nt"]:
32
- return rearrange(
33
- repeat(self.embeddings, "Np Ct Hp Wp -> B Np Ct Hp Wp", B=batch_size),
34
- "B Np Ct Hp Wp -> B Ct (Np Hp Wp)",
35
- )
36
-
37
- def detokenize(
38
- self, tokens: Float[Tensor, "B Ct Nt"]
39
- ) -> Float[Tensor, "B 3 Ct Hp Wp"]:
40
- batch_size, Ct, Nt = tokens.shape
41
- assert Nt == self.cfg.plane_size**2 * 3
42
- assert Ct == self.cfg.num_channels
43
- return rearrange(
44
- tokens,
45
- "B Ct (Np Hp Wp) -> B Np Ct Hp Wp",
46
- Np=3,
47
- Hp=self.cfg.plane_size,
48
- Wp=self.cfg.plane_size,
49
- )
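
A quick shape check of the learnable triplane tokens removed above; tokenize/detokenize is a pure reshape, so the round trip recovers the three planes (plane size and channel count are the class defaults):

import torch
from sf3d.models.tokenizers.triplane import TriplaneLearnablePositionalEmbedding

tri = TriplaneLearnablePositionalEmbedding({"plane_size": 96, "num_channels": 1024})
tokens = tri(batch_size=2)        # [B, Ct, 3 * Hp * Wp] = [2, 1024, 27648]
planes = tri.detokenize(tokens)   # [B, 3, Ct, Hp, Wp]
assert planes.shape == (2, 3, 1024, 96, 96)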
 
sf3d/models/transformers/attention.py DELETED
@@ -1,31 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
-
5
- class Modulation(nn.Module):
6
- def __init__(
7
- self,
8
- embedding_dim: int,
9
- condition_dim: int,
10
- zero_init: bool = False,
11
- single_layer: bool = False,
12
- ):
13
- super().__init__()
14
- self.silu = nn.SiLU()
15
- if single_layer:
16
- self.linear1 = nn.Identity()
17
- else:
18
- self.linear1 = nn.Linear(condition_dim, condition_dim)
19
-
20
- self.linear2 = nn.Linear(condition_dim, embedding_dim * 2)
21
-
22
- # Only zero init the last linear layer
23
- if zero_init:
24
- nn.init.zeros_(self.linear2.weight)
25
- nn.init.zeros_(self.linear2.bias)
26
-
27
- def forward(self, x: torch.Tensor, condition: torch.Tensor) -> torch.Tensor:
28
- emb = self.linear2(self.silu(self.linear1(condition)))
29
- scale, shift = torch.chunk(emb, 2, dim=1)
30
- x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
31
- return x
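
The Modulation block removed above is adaLN-style conditioning: the condition is projected to a scale and a shift, and the tokens are transformed as x * (1 + scale) + shift. A small sketch with illustrative dimensions; with zero_init=True the block starts out as an identity:

import torch
from sf3d.models.transformers.attention import Modulation

mod = Modulation(embedding_dim=1024, condition_dim=768, zero_init=True, single_layer=True)
x = torch.randn(2, 257, 1024)   # token sequence [B, N, C]
cond = torch.randn(2, 768)      # per-sample condition [B, Cc]
out = mod(x, cond)              # zero-init => identical to x at initialization
assert torch.allclose(out, x)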
 
sf3d/models/transformers/backbone.py DELETED
@@ -1,515 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Optional
3
-
4
- import torch
5
- import torch.nn.functional as F
6
- from torch import nn
7
-
8
- from sf3d.models.utils import BaseModule
9
-
10
-
11
- class GEGLU(nn.Module):
12
- r"""
13
- A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
14
-
15
- Parameters:
16
- dim_in (`int`): The number of channels in the input.
17
- dim_out (`int`): The number of channels in the output.
18
- """
19
-
20
- def __init__(self, dim_in: int, dim_out: int):
21
- super().__init__()
22
- self.proj = nn.Linear(dim_in, dim_out * 2)
23
-
24
- def gelu(self, gate: torch.Tensor) -> torch.Tensor:
25
- if gate.device.type != "mps":
26
- return F.gelu(gate)
27
- # mps: gelu is not implemented for float16
28
- return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
29
-
30
- def forward(self, hidden_states, scale: float = 1.0):
31
- args = ()
32
- hidden_states, gate = self.proj(hidden_states, *args).chunk(2, dim=-1)
33
- return hidden_states * self.gelu(gate)
34
-
35
-
36
- class CrossAttention(nn.Module):
37
- def __init__(
38
- self,
39
- dim,
40
- kv_dim=None,
41
- num_heads=16,
42
- qkv_bias=False,
43
- attn_drop=0.0,
44
- proj_drop=0.0,
45
- ):
46
- super().__init__()
47
- self.num_heads = num_heads
48
- head_dim = dim // num_heads
49
- self.scale = head_dim**-0.5
50
- kv_dim = dim if not kv_dim else kv_dim
51
- self.wq = nn.Linear(dim, dim, bias=qkv_bias)
52
- self.wk = nn.Linear(kv_dim, dim, bias=qkv_bias)
53
- self.wv = nn.Linear(kv_dim, dim, bias=qkv_bias)
54
- self.attn_drop = attn_drop
55
- self.proj = nn.Linear(dim, dim)
56
- self.proj_drop = nn.Dropout(proj_drop)
57
-
58
- def forward(self, x_q, x_kv):
59
- B, N_q, C = x_q.shape
60
- B, N_kv, _ = x_kv.shape
61
- # [B, N_q, C] -> [B, N_q, H, C/H]
62
- q = self.wq(x_q).reshape(B, N_q, self.num_heads, C // self.num_heads)
63
- # [B, N_kv, C] -> [B, N_kv, H, C/H]
64
- k = self.wk(x_kv).reshape(B, N_kv, self.num_heads, C // self.num_heads)
65
- v = self.wv(x_kv).reshape(B, N_kv, self.num_heads, C // self.num_heads)
66
-
67
- # attention
68
- x = torch.nn.functional.scaled_dot_product_attention(
69
- q.permute(0, 2, 1, 3),
70
- k.permute(0, 2, 1, 3),
71
- v.permute(0, 2, 1, 3),
72
- attn_mask=None,
73
- dropout_p=self.attn_drop,
74
- scale=self.scale,
75
- ).permute(0, 2, 1, 3)
76
-
77
- # [B, N_q, H, C/H] -> [B, N_q, C]
78
- x = x.reshape(B, N_q, C)
79
- x = self.proj(x)
80
- x = self.proj_drop(x)
81
- return x
82
-
83
-
84
- class FeedForward(nn.Module):
85
- def __init__(
86
- self,
87
- dim: int,
88
- dim_out: Optional[int] = None,
89
- mult: int = 4,
90
- dropout: float = 0.0,
91
- ):
92
- super().__init__()
93
- inner_dim = int(dim * mult)
94
- dim_out = dim_out if dim_out is not None else dim
95
- act_fn = GEGLU(dim, inner_dim)
96
- self.net = nn.ModuleList([])
97
- self.net.append(act_fn)
98
- self.net.append(nn.Dropout(dropout))
99
- self.net.append(nn.Linear(inner_dim, dim_out))
100
-
101
- def forward(self, x: torch.Tensor) -> torch.Tensor:
102
- for module in self.net:
103
- x = module(x)
104
- return x
105
-
106
-
107
- class BasicBlock(nn.Module):
108
- def __init__(
109
- self,
110
- dim: int,
111
- kv_dim: Optional[int] = None,
112
- num_heads: int = 16,
113
- qkv_bias: bool = False,
114
- attn_drop: float = 0.0,
115
- proj_drop: float = 0.0,
116
- ff_drop: float = 0.0,
117
- ):
118
- super().__init__()
119
- self.norm1 = nn.LayerNorm(dim)
120
- self.attn1 = CrossAttention(
121
- dim,
122
- kv_dim=dim,
123
- num_heads=num_heads,
124
- qkv_bias=qkv_bias,
125
- attn_drop=attn_drop,
126
- proj_drop=proj_drop,
127
- )
128
- self.norm2 = nn.LayerNorm(dim)
129
- self.attn2 = CrossAttention(
130
- dim,
131
- kv_dim=kv_dim,
132
- num_heads=num_heads,
133
- qkv_bias=qkv_bias,
134
- attn_drop=attn_drop,
135
- proj_drop=proj_drop,
136
- )
137
- self.norm3 = nn.LayerNorm(dim)
138
- self.ff = FeedForward(dim, dropout=ff_drop)
139
-
140
- def forward(self, z, x):
141
- z_norm = self.norm1(z)
142
- z = z + self.attn1(z_norm, z_norm)
143
- # TODO: do we need to have the second attention when x is None?
144
- z_norm = self.norm2(z)
145
- z = z + self.attn2(z_norm, x if x is not None else z_norm)
146
- z_norm = self.norm3(z)
147
- z = z + self.ff(z_norm)
148
- return z
149
-
150
-
151
- class SingleStreamTransformer(BaseModule):
152
- @dataclass
153
- class Config(BaseModule.Config):
154
- num_attention_heads: int = 16
155
- attention_head_dim: int = 88
156
- in_channels: Optional[int] = None
157
- out_channels: Optional[int] = None
158
- num_layers: int = 16
159
- dropout: float = 0.0
160
- norm_num_groups: int = 32
161
- cross_attention_dim: Optional[int] = None
162
- attention_bias: bool = False
163
-
164
- cfg: Config
165
-
166
- def configure(self) -> None:
167
- self.num_attention_heads = self.cfg.num_attention_heads
168
- self.attention_head_dim = self.cfg.attention_head_dim
169
- inner_dim = self.num_attention_heads * self.attention_head_dim
170
-
171
- # Define input layers
172
- self.norm = torch.nn.GroupNorm(
173
- num_groups=self.cfg.norm_num_groups,
174
- num_channels=self.cfg.in_channels,
175
- eps=1e-6,
176
- affine=True,
177
- )
178
- self.proj_in = nn.Linear(self.cfg.in_channels, inner_dim)
179
-
180
- # Define transformers blocks
181
- self.transformer_blocks = nn.ModuleList(
182
- [
183
- BasicBlock(
184
- inner_dim,
185
- kv_dim=self.cfg.cross_attention_dim,
186
- num_heads=self.num_attention_heads,
187
- qkv_bias=self.cfg.attention_bias,
188
- proj_drop=self.cfg.dropout,
189
- ff_drop=self.cfg.dropout,
190
- )
191
- for d in range(self.cfg.num_layers)
192
- ]
193
- )
194
-
195
- # Define output layers
196
- self.proj_out = nn.Linear(inner_dim, self.cfg.in_channels)
197
-
198
- def forward(self, hidden_states, encoder_hidden_states=None, **kwargs):
199
- residual = hidden_states
200
- hidden_states = self.norm(hidden_states)
201
- hidden_states = hidden_states.permute(0, 2, 1)
202
- hidden_states = self.proj_in(hidden_states)
203
- for block in self.transformer_blocks:
204
- hidden_states = block(hidden_states, encoder_hidden_states)
205
- hidden_states = self.proj_out(hidden_states).permute(0, 2, 1).contiguous()
206
- # TODO: do we really need to add the residual?
207
- hidden_states = hidden_states + residual
208
- return hidden_states
209
-
210
-
211
- class FuseBlock(nn.Module):
212
- """
213
- Fuse X into Z with cross-attention
214
- """
215
-
216
- def __init__(
217
- self,
218
- dim_z: int,
219
- dim_x: int,
220
- num_heads: int = 16,
221
- qkv_bias: bool = False,
222
- attn_drop: float = 0.0,
223
- proj_drop: float = 0.0,
224
- ff_drop: float = 0.0,
225
- norm_x_input: bool = True,
226
- ):
227
- super().__init__()
228
- self.norm_x_input = norm_x_input
229
- if self.norm_x_input:
230
- self.norm_x = nn.LayerNorm(dim_x)
231
- self.attn = CrossAttention(
232
- dim_z,
233
- kv_dim=dim_x,
234
- num_heads=num_heads,
235
- qkv_bias=qkv_bias,
236
- attn_drop=attn_drop,
237
- proj_drop=proj_drop,
238
- )
239
- self.norm_z1 = nn.LayerNorm(dim_z)
240
- self.norm_z2 = nn.LayerNorm(dim_z)
241
- self.ff = FeedForward(dim_z, dropout=ff_drop)
242
-
243
- def forward(self, z, x):
244
- # TODO: do we need to normalize x?
245
- z = z + self.attn(self.norm_z1(z), self.norm_x(x) if self.norm_x_input else x)
246
- z = z + self.ff(self.norm_z2(z))
247
- return z
248
-
249
-
250
- @torch.no_grad()
251
- def get_triplane_attention_mask(res):
252
- N = 3 * res * res
253
- attn_mask = torch.zeros(3, res, res, 3, res, res)
254
-
255
- i, j = torch.meshgrid(torch.arange(res), torch.arange(res))
256
-
257
- attn_mask[0, i, j, 1, i, :] = 1.0
258
- attn_mask[0, i, j, 2, j, :] = 1.0
259
- attn_mask[1, i, j, 0, i, :] = 1.0
260
- attn_mask[1, i, j, 2, :, j] = 1.0
261
- attn_mask[2, i, j, 0, :, i] = 1.0
262
- attn_mask[2, i, j, 1, :, j] = 1.0
263
- attn_mask = attn_mask.bool()
264
-
265
- attn_bias = torch.empty_like(attn_mask, dtype=torch.float)
266
- attn_bias.masked_fill_(attn_mask, 0.0)
267
- attn_bias.masked_fill_(~attn_mask, float("-inf"))
268
-
269
- return attn_bias.reshape(N, N)
270
-
271
-
272
- class TriplaneAttention(nn.Module):
273
- def __init__(
274
- self,
275
- dim: int,
276
- resolution: int,
277
- num_heads: int = 16,
278
- qkv_bias: bool = False,
279
- attn_drop: float = 0.0,
280
- proj_drop: float = 0.0,
281
- full_attention: bool = False,
282
- ):
283
- super().__init__()
284
- self.num_heads = num_heads
285
- head_dim = dim // num_heads
286
- self.scale = head_dim**-0.5
287
- self.wq = nn.Linear(dim, dim, bias=qkv_bias)
288
- self.wk = nn.Linear(dim, dim, bias=qkv_bias)
289
- self.wv = nn.Linear(dim, dim, bias=qkv_bias)
290
- self.attn_drop = attn_drop
291
- self.proj = nn.Linear(dim, dim)
292
- self.proj_drop = nn.Dropout(proj_drop)
293
-
294
- self.resolution = resolution
295
- self.full_attention = full_attention
296
- self.attn_mask = (
297
- get_triplane_attention_mask(resolution) if not full_attention else None
298
- )
299
-
300
- def forward(self, x):
301
- B, N, C = x.shape
302
- # [B, N, C] -> [B, N, H, C/H]
303
- q = self.wq(x).reshape(B, N, self.num_heads, C // self.num_heads)
304
- k = self.wk(x).reshape(B, N, self.num_heads, C // self.num_heads)
305
- v = self.wv(x).reshape(B, N, self.num_heads, C // self.num_heads)
306
-
307
- # detokenize the planes
308
- assert N == self.resolution**2 * 3
309
- attn_bias = (
310
- self.attn_mask.to(q)
311
- .unsqueeze(0)
312
- .unsqueeze(0)
313
- .expand(B, self.num_heads, -1, -1)
314
- if not self.full_attention
315
- else None
316
- )
317
-
318
- # full attention
319
- x = torch.nn.functional.scaled_dot_product_attention(
320
- q.permute(0, 2, 1, 3),
321
- k.permute(0, 2, 1, 3),
322
- v.permute(0, 2, 1, 3),
323
- attn_mask=attn_bias,
324
- dropout_p=self.attn_drop,
325
- scale=self.scale,
326
- ).permute(0, 2, 1, 3)
327
-
328
- # [B, N_q, H, C/H] -> [B, N_q, C]
329
- x = x.reshape(B, N, C)
330
- x = self.proj(x)
331
- x = self.proj_drop(x)
332
- return x
333
-
334
-
335
- class TwoStreamBlock(nn.Module):
336
- def __init__(
337
- self,
338
- dim_latent: int,
339
- dim_input: int,
340
- num_basic_blocks: int = 4,
341
- num_heads: int = 16,
342
- qkv_bias: bool = False,
343
- attn_drop: float = 0.0,
344
- proj_drop: float = 0.0,
345
- ff_drop: float = 0.0,
346
- norm_x_input: bool = True,
347
- dim_cross: Optional[int] = None,
348
- ):
349
- super().__init__()
350
-
351
- # Define the fuse block that fuse the input into the latent
352
- self.fuse_block_in = FuseBlock(
353
- dim_latent,
354
- dim_input,
355
- num_heads=num_heads,
356
- qkv_bias=qkv_bias,
357
- attn_drop=attn_drop,
358
- proj_drop=proj_drop,
359
- ff_drop=ff_drop,
360
- norm_x_input=norm_x_input,
361
- )
362
-
363
- # Define the transformer block that process the latent
364
- self.transformer_block = nn.ModuleList(
365
- [
366
- BasicBlock(
367
- dim_latent,
368
- kv_dim=dim_cross,
369
- num_heads=num_heads,
370
- qkv_bias=qkv_bias,
371
- proj_drop=proj_drop,
372
- ff_drop=ff_drop,
373
- )
374
- for _ in range(num_basic_blocks)
375
- ]
376
- )
377
-
378
- # Define the fuse block that fuse the latent into the input
379
- self.fuse_block_out = FuseBlock(
380
- dim_input,
381
- dim_latent,
382
- num_heads=num_heads,
383
- qkv_bias=qkv_bias,
384
- attn_drop=attn_drop,
385
- proj_drop=proj_drop,
386
- ff_drop=ff_drop,
387
- norm_x_input=norm_x_input,
388
- )
389
-
390
- def forward(self, latent, input, cross_input):
391
- latent = self.fuse_block_in(latent, input)
392
- for block in self.transformer_block:
393
- latent = block(latent, cross_input)
394
- input = self.fuse_block_out(input, latent)
395
- return latent, input
396
-
397
-
398
- class TwoStreamInterleaveTransformer(BaseModule):
399
- @dataclass
400
- class Config(BaseModule.Config):
401
- num_attention_heads: int = 16
402
- attention_head_dim: int = 64
403
- raw_triplane_channels: int = 1024
404
- triplane_channels: int = 1024
405
- raw_image_channels: int = 1024
406
- num_latents: int = 1792
407
- num_blocks: int = 4
408
- num_basic_blocks: int = 3
409
- dropout: float = 0.0
410
- latent_init_std: float = 0.02
411
- norm_num_groups: int = 32
412
- attention_bias: bool = False
413
- norm_x_input: bool = False
414
- cross_attention_dim: int = 1024
415
- mix_latent: bool = True
416
-
417
- cfg: Config
418
-
419
- def configure(self) -> None:
420
- self.mix_latent = self.cfg.mix_latent
421
-
422
- # Define the dimensions
423
- self.num_attention_heads = self.cfg.num_attention_heads
424
- self.attention_head_dim = self.cfg.attention_head_dim
425
- self.num_latents = self.cfg.num_latents
426
- self.latent_dim = self.num_attention_heads * self.attention_head_dim
427
-
428
- # Define input layers
429
- if self.cfg.norm_num_groups > 0:
430
- self.norm_triplane = torch.nn.GroupNorm(
431
- num_groups=self.cfg.norm_num_groups,
432
- num_channels=self.cfg.raw_triplane_channels,
433
- eps=1e-6,
434
- affine=True,
435
- )
436
- else:
437
- self.norm_triplane = nn.LayerNorm(self.cfg.raw_triplane_channels)
438
- self.proj_triplane = nn.Linear(
439
- self.cfg.raw_triplane_channels, self.cfg.triplane_channels
440
- )
441
- if self.mix_latent:
442
- self.norm_image = nn.LayerNorm(self.cfg.raw_image_channels)
443
- self.proj_image = nn.Linear(self.cfg.raw_image_channels, self.latent_dim)
444
- self.norm_latent = nn.LayerNorm(self.latent_dim)
445
- self.proj_latent = nn.Linear(self.latent_dim, self.latent_dim)
446
-
447
- # Define the latents
448
- self.latent_init = nn.Parameter(
449
- torch.zeros(1, self.num_latents, self.latent_dim)
450
- )
451
- nn.init.normal_(self.latent_init, std=self.cfg.latent_init_std)
452
-
453
- # Define the transformer blocks
454
- self.main_blocks = nn.ModuleList(
455
- [
456
- TwoStreamBlock(
457
- self.latent_dim,
458
- self.cfg.triplane_channels,
459
- num_basic_blocks=self.cfg.num_basic_blocks,
460
- num_heads=self.num_attention_heads,
461
- qkv_bias=self.cfg.attention_bias,
462
- proj_drop=self.cfg.dropout,
463
- ff_drop=self.cfg.dropout,
464
- norm_x_input=self.cfg.norm_x_input,
465
- dim_cross=self.cfg.cross_attention_dim,
466
- )
467
- for _ in range(self.cfg.num_blocks)
468
- ]
469
- )
470
-
471
- # Define output layers
472
- self.proj_out = nn.Linear(
473
- self.cfg.triplane_channels, self.cfg.raw_triplane_channels
474
- )
475
-
476
- def forward(self, hidden_states, encoder_hidden_states, **kwargs):
477
- # hidden_states: [B, triplane_dim, N_triplane] is triplane tokens
478
- # encoder_hidden_states: [B, N_image, image_dim] is the image tokens
479
- if isinstance(self.norm_triplane, nn.GroupNorm):
480
- triplane_tokens = self.norm_triplane(hidden_states)
481
- triplane_tokens = triplane_tokens.permute(
482
- 0, 2, 1
483
- ) # [B, N_triplane, triplane_dim]
484
- elif isinstance(self.norm_triplane, nn.LayerNorm):
485
- triplane_tokens = self.norm_triplane(hidden_states.permute(0, 2, 1))
486
- else:
487
- raise ValueError("Unknown normalization layer")
488
- triplane_tokens = self.proj_triplane(triplane_tokens)
489
- if self.mix_latent:
490
- image_tokens = self.norm_image(
491
- encoder_hidden_states
492
- ) # [B, N_image, image_dim]
493
- image_tokens = self.proj_image(image_tokens)
494
- init_latents = self.latent_init.expand(
495
- hidden_states.shape[0], -1, -1
496
- ) # [B, N_latent_init, latent_dim]
497
- init_latents = self.norm_latent(init_latents)
498
- init_latents = self.proj_latent(init_latents)
499
- if self.mix_latent:
500
- latent_tokens = torch.cat(
501
- [image_tokens, init_latents], dim=1
502
- ) # [B, N_latent, latent_dim]
503
- else:
504
- latent_tokens = init_latents
505
-
506
- # forward the main blocks
507
- for block in self.main_blocks:
508
- latent_tokens, triplane_tokens = block(
509
- latent_tokens, triplane_tokens, encoder_hidden_states
510
- )
511
-
512
- # project the triplane tokens back to the original dimension
513
- triplane_tokens = self.proj_out(triplane_tokens).permute(0, 2, 1).contiguous()
514
- triplane_tokens = triplane_tokens + hidden_states
515
- return triplane_tokens
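
get_triplane_attention_mask above only allows a token to attend to tokens in the other two planes that share its row or column; all other pairs, including same-plane pairs, receive a -inf additive bias. A quick check at a small, illustrative resolution:

import torch
from sf3d.models.transformers.backbone import get_triplane_attention_mask

res = 8
bias = get_triplane_attention_mask(res)      # [3*res*res, 3*res*res] additive attention bias
assert bias.shape == (3 * res * res, 3 * res * res)
print(torch.isfinite(bias).float().mean())   # fraction of token pairs allowed to attend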
 
sf3d/models/utils.py DELETED
@@ -1,292 +0,0 @@
1
- import dataclasses
2
- import importlib
3
- import math
4
- from dataclasses import dataclass
5
- from typing import Any, List, Optional, Tuple, Union
6
-
7
- import numpy as np
8
- import PIL
9
- import torch
10
- import torch.nn as nn
11
- import torch.nn.functional as F
12
- from jaxtyping import Bool, Float, Int, Num
13
- from omegaconf import DictConfig, OmegaConf
14
- from torch import Tensor
15
-
16
-
17
- class BaseModule(nn.Module):
18
- @dataclass
19
- class Config:
20
- pass
21
-
22
- cfg: Config # add this to every subclass of BaseModule to enable static type checking
23
-
24
- def __init__(
25
- self, cfg: Optional[Union[dict, DictConfig]] = None, *args, **kwargs
26
- ) -> None:
27
- super().__init__()
28
- self.cfg = parse_structured(self.Config, cfg)
29
- self.configure(*args, **kwargs)
30
-
31
- def configure(self, *args, **kwargs) -> None:
32
- raise NotImplementedError
33
-
34
-
35
- def find_class(cls_string):
36
- module_string = ".".join(cls_string.split(".")[:-1])
37
- cls_name = cls_string.split(".")[-1]
38
- module = importlib.import_module(module_string, package=None)
39
- cls = getattr(module, cls_name)
40
- return cls
41
-
42
-
43
- def parse_structured(fields: Any, cfg: Optional[Union[dict, DictConfig]] = None) -> Any:
44
- # Check if cfg.keys are in fields
45
- cfg_ = cfg.copy()
46
- keys = list(cfg_.keys())
47
-
48
- field_names = {f.name for f in dataclasses.fields(fields)}
49
- for key in keys:
50
- # This is helpful when swapping out modules from CLI
51
- if key not in field_names:
52
- print(f"Ignoring {key} as it's not supported by {fields}")
53
- cfg_.pop(key)
54
- scfg = OmegaConf.merge(OmegaConf.structured(fields), cfg_)
55
- return scfg
56
-
57
-
58
- EPS_DTYPE = {
59
- torch.float16: 1e-4,
60
- torch.bfloat16: 1e-4,
61
- torch.float32: 1e-7,
62
- torch.float64: 1e-8,
63
- }
64
-
65
-
66
- def dot(x, y, dim=-1):
67
- return torch.sum(x * y, dim, keepdim=True)
68
-
69
-
70
- def reflect(x, n):
71
- return x - 2 * dot(x, n) * n
72
-
73
-
74
- def normalize(x, dim=-1, eps=None):
75
- if eps is None:
76
- eps = EPS_DTYPE[x.dtype]
77
- return F.normalize(x, dim=dim, p=2, eps=eps)
78
-
79
-
80
- def tri_winding(tri: Float[Tensor, "*B 3 2"]) -> Float[Tensor, "*B 3 3"]:
81
- # One pad for determinant
82
- tri_sq = F.pad(tri, (0, 1), "constant", 1.0)
83
- det_tri = torch.det(tri_sq)
84
- tri_rev = torch.cat(
85
- (tri_sq[..., 0:1, :], tri_sq[..., 2:3, :], tri_sq[..., 1:2, :]), -2
86
- )
87
- tri_sq[det_tri < 0] = tri_rev[det_tri < 0]
88
- return tri_sq
89
-
90
-
91
- def triangle_intersection_2d(
92
- t1: Float[Tensor, "*B 3 2"],
93
- t2: Float[Tensor, "*B 3 2"],
94
- eps=1e-12,
95
- ) -> Float[Tensor, "*B"]: # noqa: F821
96
- """Returns True if triangles collide, False otherwise"""
97
-
98
- def chk_edge(x: Float[Tensor, "*B 3 3"]) -> Bool[Tensor, "*B"]: # noqa: F821
99
- logdetx = torch.logdet(x.double())
100
- if eps is None:
101
- return ~torch.isfinite(logdetx)
102
- return ~(torch.isfinite(logdetx) & (logdetx > math.log(eps)))
103
-
104
- t1s = tri_winding(t1)
105
- t2s = tri_winding(t2)
106
-
107
- # Assume the triangles do not collide in the beginning
108
- ret = torch.zeros(t1.shape[0], dtype=torch.bool, device=t1.device)
109
- for i in range(3):
110
- edge = torch.roll(t1s, i, dims=1)[:, :2, :]
111
- # Check if all points of triangle 2 lay on the external side of edge E.
112
- # If this is the case the triangle do not collide
113
- upd = (
114
- chk_edge(torch.cat((edge, t2s[:, 0:1]), 1))
115
- & chk_edge(torch.cat((edge, t2s[:, 1:2]), 1))
116
- & chk_edge(torch.cat((edge, t2s[:, 2:3]), 1))
117
- )
118
- # Here no collision is still True due to inversion
119
- ret = ret | upd
120
-
121
- for i in range(3):
122
- edge = torch.roll(t2s, i, dims=1)[:, :2, :]
123
-
124
- upd = (
125
- chk_edge(torch.cat((edge, t1s[:, 0:1]), 1))
126
- & chk_edge(torch.cat((edge, t1s[:, 1:2]), 1))
127
- & chk_edge(torch.cat((edge, t1s[:, 2:3]), 1))
128
- )
129
- # Here no collision is still True due to inversion
130
- ret = ret | upd
131
-
132
- return ~ret # Do the inversion
133
-
134
-
135
- ValidScale = Union[Tuple[float, float], Num[Tensor, "2 D"]]
136
-
137
-
138
- def scale_tensor(
139
- dat: Num[Tensor, "... D"], inp_scale: ValidScale, tgt_scale: ValidScale
140
- ):
141
- if inp_scale is None:
142
- inp_scale = (0, 1)
143
- if tgt_scale is None:
144
- tgt_scale = (0, 1)
145
- if isinstance(tgt_scale, Tensor):
146
- assert dat.shape[-1] == tgt_scale.shape[-1]
147
- dat = (dat - inp_scale[0]) / (inp_scale[1] - inp_scale[0])
148
- dat = dat * (tgt_scale[1] - tgt_scale[0]) + tgt_scale[0]
149
- return dat
150
-
151
-
152
- def dilate_fill(img, mask, iterations=10):
153
- oldMask = mask.float()
154
- oldImg = img
155
-
156
- mask_kernel = torch.ones(
157
- (1, 1, 3, 3),
158
- dtype=oldMask.dtype,
159
- device=oldMask.device,
160
- )
161
-
162
- for i in range(iterations):
163
- newMask = torch.nn.functional.max_pool2d(oldMask, 3, 1, 1)
164
-
165
- # Fill the extension with mean color of old valid regions
166
- img_unfold = F.unfold(oldImg, (3, 3)).view(1, 3, 3 * 3, -1)
167
- mask_unfold = F.unfold(oldMask, (3, 3)).view(1, 1, 3 * 3, -1)
168
- new_mask_unfold = F.unfold(newMask, (3, 3)).view(1, 1, 3 * 3, -1)
169
-
170
- # Average color of the valid region
171
- mean_color = (img_unfold.sum(dim=2) / mask_unfold.sum(dim=2).clip(1)).unsqueeze(
172
- 2
173
- )
174
- # Extend it to the new region
175
- fill_color = (mean_color * new_mask_unfold).view(1, 3 * 3 * 3, -1)
176
-
177
- mask_conv = F.conv2d(
178
- newMask, mask_kernel, padding=1
179
- ) # Get the sum for each kernel patch
180
- newImg = F.fold(
181
- fill_color, (img.shape[-2], img.shape[-1]), (3, 3)
182
- ) / mask_conv.clamp(1)
183
-
184
- diffMask = newMask - oldMask
185
-
186
- oldMask = newMask
187
- oldImg = torch.lerp(oldImg, newImg, diffMask)
188
-
189
- return oldImg
190
-
191
-
192
- def float32_to_uint8_np(
193
- x: Float[np.ndarray, "*B H W C"],
194
- dither: bool = True,
195
- dither_mask: Optional[Float[np.ndarray, "*B H W C"]] = None,
196
- dither_strength: float = 1.0,
197
- ) -> Int[np.ndarray, "*B H W C"]:
198
- if dither:
199
- dither = (
200
- dither_strength * np.random.rand(*x[..., :1].shape).astype(np.float32) - 0.5
201
- )
202
- if dither_mask is not None:
203
- dither = dither * dither_mask
204
- return np.clip(np.floor((256.0 * x + dither)), 0, 255).astype(np.uint8)
205
- return np.clip(np.floor((256.0 * x)), 0, 255).astype(np.uint8)
206
-
207
-
208
- def convert_data(data):
209
- if data is None:
210
- return None
211
- elif isinstance(data, np.ndarray):
212
- return data
213
- elif isinstance(data, torch.Tensor):
214
- if data.dtype in [torch.float16, torch.bfloat16]:
215
- data = data.float()
216
- return data.detach().cpu().numpy()
217
- elif isinstance(data, list):
218
- return [convert_data(d) for d in data]
219
- elif isinstance(data, dict):
220
- return {k: convert_data(v) for k, v in data.items()}
221
- else:
222
- raise TypeError(
223
- "Data must be in type numpy.ndarray, torch.Tensor, list or dict, getting",
224
- type(data),
225
- )
226
-
227
-
228
- class ImageProcessor:
229
- def convert_and_resize(
230
- self,
231
- image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
232
- size: int,
233
- ):
234
- if isinstance(image, PIL.Image.Image):
235
- image = torch.from_numpy(np.array(image).astype(np.float32) / 255.0)
236
- elif isinstance(image, np.ndarray):
237
- if image.dtype == np.uint8:
238
- image = torch.from_numpy(image.astype(np.float32) / 255.0)
239
- else:
240
- image = torch.from_numpy(image)
241
- elif isinstance(image, torch.Tensor):
242
- pass
243
-
244
- batched = image.ndim == 4
245
-
246
- if not batched:
247
- image = image[None, ...]
248
- image = F.interpolate(
249
- image.permute(0, 3, 1, 2),
250
- (size, size),
251
- mode="bilinear",
252
- align_corners=False,
253
- antialias=True,
254
- ).permute(0, 2, 3, 1)
255
- if not batched:
256
- image = image[0]
257
- return image
258
-
259
- def __call__(
260
- self,
261
- image: Union[
262
- PIL.Image.Image,
263
- np.ndarray,
264
- torch.FloatTensor,
265
- List[PIL.Image.Image],
266
- List[np.ndarray],
267
- List[torch.FloatTensor],
268
- ],
269
- size: int,
270
- ) -> Any:
271
- if isinstance(image, (np.ndarray, torch.FloatTensor)) and image.ndim == 4:
272
- image = self.convert_and_resize(image, size)
273
- else:
274
- if not isinstance(image, list):
275
- image = [image]
276
- image = [self.convert_and_resize(im, size) for im in image]
277
- image = torch.stack(image, dim=0)
278
- return image
279
-
280
-
281
- def get_intrinsic_from_fov(fov, H, W, bs=-1):
282
- focal_length = 0.5 * H / np.tan(0.5 * fov)
283
- intrinsic = np.identity(3, dtype=np.float32)
284
- intrinsic[0, 0] = focal_length
285
- intrinsic[1, 1] = focal_length
286
- intrinsic[0, 2] = W / 2.0
287
- intrinsic[1, 2] = H / 2.0
288
-
289
- if bs > 0:
290
- intrinsic = intrinsic[None].repeat(bs, axis=0)
291
-
292
- return torch.from_numpy(intrinsic)
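
Two of the helpers above show up throughout the pipeline; a short sketch of their behaviour (the inputs are illustrative):

import numpy as np
import torch
from sf3d.models.utils import get_intrinsic_from_fov, scale_tensor

# Map points from the tet grid's [0, 1] range into a [-1, 1] bounding box
pts = torch.rand(16, 3)
pts_bbox = scale_tensor(pts, (0.0, 1.0), (-1.0, 1.0))

# Pinhole intrinsics for a 40 degree FOV at 512x512:
# fx = fy = 0.5 * H / tan(fov / 2), principal point at the image center
K = get_intrinsic_from_fov(np.deg2rad(40.0), H=512, W=512)
assert K.shape == (3, 3)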
 
sf3d/system.py DELETED
@@ -1,482 +0,0 @@
1
- import os
2
- from dataclasses import dataclass, field
3
- from typing import Any, List, Optional, Tuple
4
-
5
- import numpy as np
6
- import torch
7
- import torch.nn.functional as F
8
- import trimesh
9
- from einops import rearrange
10
- from huggingface_hub import hf_hub_download
11
- from jaxtyping import Float
12
- from omegaconf import OmegaConf
13
- from PIL import Image
14
- from safetensors.torch import load_model
15
- from torch import Tensor
16
-
17
- from sf3d.models.isosurface import MarchingTetrahedraHelper
18
- from sf3d.models.mesh import Mesh
19
- from sf3d.models.utils import (
20
- BaseModule,
21
- ImageProcessor,
22
- convert_data,
23
- dilate_fill,
24
- dot,
25
- find_class,
26
- float32_to_uint8_np,
27
- normalize,
28
- scale_tensor,
29
- )
30
- from sf3d.utils import create_intrinsic_from_fov_deg, default_cond_c2w
31
-
32
- from .texture_baker import TextureBaker
33
-
34
-
35
- class SF3D(BaseModule):
36
- @dataclass
37
- class Config(BaseModule.Config):
38
- cond_image_size: int
39
- isosurface_resolution: int
40
- isosurface_threshold: float = 10.0
41
- radius: float = 1.0
42
- background_color: list[float] = field(default_factory=lambda: [0.5, 0.5, 0.5])
43
- default_fovy_deg: float = 40.0
44
- default_distance: float = 1.6
45
-
46
- camera_embedder_cls: str = ""
47
- camera_embedder: dict = field(default_factory=dict)
48
-
49
- image_tokenizer_cls: str = ""
50
- image_tokenizer: dict = field(default_factory=dict)
51
-
52
- tokenizer_cls: str = ""
53
- tokenizer: dict = field(default_factory=dict)
54
-
55
- backbone_cls: str = ""
56
- backbone: dict = field(default_factory=dict)
57
-
58
- post_processor_cls: str = ""
59
- post_processor: dict = field(default_factory=dict)
60
-
61
- decoder_cls: str = ""
62
- decoder: dict = field(default_factory=dict)
63
-
64
- image_estimator_cls: str = ""
65
- image_estimator: dict = field(default_factory=dict)
66
-
67
- global_estimator_cls: str = ""
68
- global_estimator: dict = field(default_factory=dict)
69
-
70
- cfg: Config
71
-
72
- @classmethod
73
- def from_pretrained(
74
- cls, pretrained_model_name_or_path: str, config_name: str, weight_name: str
75
- ):
76
- if os.path.isdir(pretrained_model_name_or_path):
77
- config_path = os.path.join(pretrained_model_name_or_path, config_name)
78
- weight_path = os.path.join(pretrained_model_name_or_path, weight_name)
79
- else:
80
- config_path = hf_hub_download(
81
- repo_id=pretrained_model_name_or_path, filename=config_name
82
- )
83
- weight_path = hf_hub_download(
84
- repo_id=pretrained_model_name_or_path, filename=weight_name
85
- )
86
-
87
- cfg = OmegaConf.load(config_path)
88
- OmegaConf.resolve(cfg)
89
- model = cls(cfg)
90
- load_model(model, weight_path)
91
- return model
92
-
93
- @property
94
- def device(self):
95
- return next(self.parameters()).device
96
-
97
- def configure(self):
98
- self.image_tokenizer = find_class(self.cfg.image_tokenizer_cls)(
99
- self.cfg.image_tokenizer
100
- )
101
- self.tokenizer = find_class(self.cfg.tokenizer_cls)(self.cfg.tokenizer)
102
- self.camera_embedder = find_class(self.cfg.camera_embedder_cls)(
103
- self.cfg.camera_embedder
104
- )
105
- self.backbone = find_class(self.cfg.backbone_cls)(self.cfg.backbone)
106
- self.post_processor = find_class(self.cfg.post_processor_cls)(
107
- self.cfg.post_processor
108
- )
109
- self.decoder = find_class(self.cfg.decoder_cls)(self.cfg.decoder)
110
- self.image_estimator = find_class(self.cfg.image_estimator_cls)(
111
- self.cfg.image_estimator
112
- )
113
- self.global_estimator = find_class(self.cfg.global_estimator_cls)(
114
- self.cfg.global_estimator
115
- )
116
-
117
- self.bbox: Float[Tensor, "2 3"]
118
- self.register_buffer(
119
- "bbox",
120
- torch.as_tensor(
121
- [
122
- [-self.cfg.radius, -self.cfg.radius, -self.cfg.radius],
123
- [self.cfg.radius, self.cfg.radius, self.cfg.radius],
124
- ],
125
- dtype=torch.float32,
126
- ),
127
- )
128
- self.isosurface_helper = MarchingTetrahedraHelper(
129
- self.cfg.isosurface_resolution,
130
- os.path.join(
131
- os.path.dirname(__file__),
132
- "..",
133
- "load",
134
- "tets",
135
- f"{self.cfg.isosurface_resolution}_tets.npz",
136
- ),
137
- )
138
-
139
- self.baker = TextureBaker()
140
- self.image_processor = ImageProcessor()
141
-
142
- def triplane_to_meshes(
143
- self, triplanes: Float[Tensor, "B 3 Cp Hp Wp"]
144
- ) -> list[Mesh]:
145
- meshes = []
146
- for i in range(triplanes.shape[0]):
147
- triplane = triplanes[i]
148
- grid_vertices = scale_tensor(
149
- self.isosurface_helper.grid_vertices.to(triplanes.device),
150
- self.isosurface_helper.points_range,
151
- self.bbox,
152
- )
153
-
154
- values = self.query_triplane(grid_vertices, triplane)
155
- decoded = self.decoder(values, include=["vertex_offset", "density"])
156
- sdf = decoded["density"] - self.cfg.isosurface_threshold
157
-
158
- deform = decoded["vertex_offset"].squeeze(0)
159
-
160
- mesh: Mesh = self.isosurface_helper(
161
- sdf.view(-1, 1), deform.view(-1, 3) if deform is not None else None
162
- )
163
- mesh.v_pos = scale_tensor(
164
- mesh.v_pos, self.isosurface_helper.points_range, self.bbox
165
- )
166
-
167
- meshes.append(mesh)
168
-
169
- return meshes
170
-
171
- def query_triplane(
172
- self,
173
- positions: Float[Tensor, "*B N 3"],
174
- triplanes: Float[Tensor, "*B 3 Cp Hp Wp"],
175
- ) -> Float[Tensor, "*B N F"]:
176
- batched = positions.ndim == 3
177
- if not batched:
178
- # no batch dimension
179
- triplanes = triplanes[None, ...]
180
- positions = positions[None, ...]
181
- assert triplanes.ndim == 5 and positions.ndim == 3
182
-
183
- positions = scale_tensor(
184
- positions, (-self.cfg.radius, self.cfg.radius), (-1, 1)
185
- )
186
-
187
- indices2D: Float[Tensor, "B 3 N 2"] = torch.stack(
188
- (positions[..., [0, 1]], positions[..., [0, 2]], positions[..., [1, 2]]),
189
- dim=-3,
190
- ).to(triplanes.dtype)
191
- out: Float[Tensor, "B3 Cp 1 N"] = F.grid_sample(
192
- rearrange(triplanes, "B Np Cp Hp Wp -> (B Np) Cp Hp Wp", Np=3).float(),
193
- rearrange(indices2D, "B Np N Nd -> (B Np) () N Nd", Np=3).float(),
194
- align_corners=True,
195
- mode="bilinear",
196
- )
197
- out = rearrange(out, "(B Np) Cp () N -> B N (Np Cp)", Np=3)
198
-
199
- return out
200
-
201
- def get_scene_codes(self, batch) -> Float[Tensor, "B 3 C H W"]:
202
- # if batch[rgb_cond] is only one view, add a view dimension
203
- if len(batch["rgb_cond"].shape) == 4:
204
- batch["rgb_cond"] = batch["rgb_cond"].unsqueeze(1)
205
- batch["mask_cond"] = batch["mask_cond"].unsqueeze(1)
206
- batch["c2w_cond"] = batch["c2w_cond"].unsqueeze(1)
207
- batch["intrinsic_cond"] = batch["intrinsic_cond"].unsqueeze(1)
208
- batch["intrinsic_normed_cond"] = batch["intrinsic_normed_cond"].unsqueeze(1)
209
- batch_size, n_input_views = batch["rgb_cond"].shape[:2]
210
-
211
- camera_embeds: Optional[Float[Tensor, "B Nv Cc"]]
212
- camera_embeds = self.camera_embedder(**batch)
213
-
214
- input_image_tokens: Float[Tensor, "B Nv Cit Nit"] = self.image_tokenizer(
215
- rearrange(batch["rgb_cond"], "B Nv H W C -> B Nv C H W"),
216
- modulation_cond=camera_embeds,
217
- )
218
-
219
- input_image_tokens = rearrange(
220
- input_image_tokens, "B Nv C Nt -> B (Nv Nt) C", Nv=n_input_views
221
- )
222
-
223
- tokens: Float[Tensor, "B Ct Nt"] = self.tokenizer(batch_size)
224
-
225
- tokens = self.backbone(
226
- tokens,
227
- encoder_hidden_states=input_image_tokens,
228
- modulation_cond=None,
229
- )
230
-
231
- direct_codes = self.tokenizer.detokenize(tokens)
232
- scene_codes = self.post_processor(direct_codes)
233
- return scene_codes, direct_codes
234
-
235
- def run_image(
236
- self,
237
- image: Image,
238
- bake_resolution: int,
239
- estimate_illumination: bool = False,
240
- ) -> Tuple[trimesh.Trimesh, dict[str, Any]]:
241
- if image.mode != "RGBA":
242
- raise ValueError("Image must be in RGBA mode")
243
- img_cond = (
244
- torch.from_numpy(
245
- np.asarray(
246
- image.resize((self.cfg.cond_image_size, self.cfg.cond_image_size))
247
- ).astype(np.float32)
248
- / 255.0
249
- )
250
- .float()
251
- .clip(0, 1)
252
- .to(self.device)
253
- )
254
- mask_cond = img_cond[:, :, -1:]
255
- rgb_cond = torch.lerp(
256
- torch.tensor(self.cfg.background_color, device=self.device)[None, None, :],
257
- img_cond[:, :, :3],
258
- mask_cond,
259
- )
260
-
261
- c2w_cond = default_cond_c2w(self.cfg.default_distance).to(self.device)
262
- intrinsic, intrinsic_normed_cond = create_intrinsic_from_fov_deg(
263
- self.cfg.default_fovy_deg,
264
- self.cfg.cond_image_size,
265
- self.cfg.cond_image_size,
266
- )
267
-
268
- batch = {
269
- "rgb_cond": rgb_cond,
270
- "mask_cond": mask_cond,
271
- "c2w_cond": c2w_cond.unsqueeze(0),
272
- "intrinsic_cond": intrinsic.to(self.device).unsqueeze(0),
273
- "intrinsic_normed_cond": intrinsic_normed_cond.to(self.device).unsqueeze(0),
274
- }
275
-
276
- meshes, global_dict = self.generate_mesh(
277
- batch, bake_resolution, estimate_illumination
278
- )
279
- return meshes[0], global_dict
280
-
281
- def generate_mesh(
282
- self,
283
- batch,
284
- bake_resolution: int,
285
- estimate_illumination: bool = False,
286
- ) -> Tuple[List[trimesh.Trimesh], dict[str, Any]]:
287
- batch["rgb_cond"] = self.image_processor(
288
- batch["rgb_cond"], self.cfg.cond_image_size
289
- )
290
- batch["mask_cond"] = self.image_processor(
291
- batch["mask_cond"], self.cfg.cond_image_size
292
- )
293
- scene_codes, non_postprocessed_codes = self.get_scene_codes(batch)
294
-
295
- global_dict = {}
296
- if self.image_estimator is not None:
297
- global_dict.update(
298
- self.image_estimator(batch["rgb_cond"] * batch["mask_cond"])
299
- )
300
- if self.global_estimator is not None and estimate_illumination:
301
- global_dict.update(self.global_estimator(non_postprocessed_codes))
302
-
303
- with torch.no_grad():
304
- with torch.autocast(device_type="cuda", enabled=False):
305
- meshes = self.triplane_to_meshes(scene_codes)
306
-
307
- rets = []
308
- for i, mesh in enumerate(meshes):
309
- # Check for empty mesh
310
- if mesh.v_pos.shape[0] == 0:
311
- rets.append(trimesh.Trimesh())
312
- continue
313
-
314
- mesh.unwrap_uv()
315
-
316
- # Build textures
317
- rast = self.baker.rasterize(
318
- mesh.v_tex, mesh.t_pos_idx, bake_resolution
319
- )
320
- bake_mask = self.baker.get_mask(rast)
321
-
322
- pos_bake = self.baker.interpolate(
323
- mesh.v_pos,
324
- rast,
325
- mesh.t_pos_idx,
326
- mesh.v_tex,
327
- )
328
- gb_pos = pos_bake[bake_mask]
329
-
330
- tri_query = self.query_triplane(gb_pos, scene_codes[i])[0]
331
- decoded = self.decoder(
332
- tri_query, exclude=["density", "vertex_offset"]
333
- )
334
-
335
- nrm = self.baker.interpolate(
336
- mesh.v_nrm,
337
- rast,
338
- mesh.t_pos_idx,
339
- mesh.v_tex,
340
- )
341
- gb_nrm = F.normalize(nrm[bake_mask], dim=-1)
342
- decoded["normal"] = gb_nrm
343
-
344
- # Check if any keys in global_dict start with decoder_
345
- for k, v in global_dict.items():
346
- if k.startswith("decoder_"):
347
- decoded[k.replace("decoder_", "")] = v[i]
348
-
349
- mat_out = {
350
- "albedo": decoded["features"],
351
- "roughness": decoded["roughness"],
352
- "metallic": decoded["metallic"],
353
- "normal": normalize(decoded["perturb_normal"]),
354
- "bump": None,
355
- }
356
-
357
- for k, v in mat_out.items():
358
- if v is None:
359
- continue
360
- if v.shape[0] == 1:
361
- # Skip and directly add a single value
362
- mat_out[k] = v[0]
363
- else:
364
- f = torch.zeros(
365
- bake_resolution,
366
- bake_resolution,
367
- v.shape[-1],
368
- dtype=v.dtype,
369
- device=v.device,
370
- )
371
- if v.shape == f.shape:
372
- continue
373
- if k == "normal":
374
- # Use un-normalized tangents here so that larger or smaller tris
375
- # don't affect the tangents that much
376
- tng = self.baker.interpolate(
377
- mesh.v_tng,
378
- rast,
379
- mesh.t_pos_idx,
380
- mesh.v_tex,
381
- )
382
- gb_tng = tng[bake_mask]
383
- gb_tng = F.normalize(gb_tng, dim=-1)
384
- gb_btng = F.normalize(
385
- torch.cross(gb_tng, gb_nrm, dim=-1), dim=-1
386
- )
387
- normal = F.normalize(mat_out["normal"], dim=-1)
388
-
389
- bump = torch.cat(
390
- # Check if we have to flip some things
391
- (
392
- dot(normal, gb_tng),
393
- dot(normal, gb_btng),
394
- dot(normal, gb_nrm).clip(
395
- 0.3, 1
396
- ), # Never go below 0.3; a lower value would indicate a flipped (or nearly flipped) normal
397
- ),
398
- -1,
399
- )
400
- bump = (bump * 0.5 + 0.5).clamp(0, 1)
401
-
402
- f[bake_mask] = bump.view(-1, 3)
403
- mat_out["bump"] = f
404
- else:
405
- f[bake_mask] = v.view(-1, v.shape[-1])
406
- mat_out[k] = f
407
-
408
- def uv_padding(arr):
409
- if arr.ndim == 1:
410
- return arr
411
- return (
412
- dilate_fill(
413
- arr.permute(2, 0, 1)[None, ...],
414
- bake_mask.unsqueeze(0).unsqueeze(0),
415
- iterations=bake_resolution // 150,
416
- )
417
- .squeeze(0)
418
- .permute(1, 2, 0)
419
- )
420
-
421
- verts_np = convert_data(mesh.v_pos)
422
- faces = convert_data(mesh.t_pos_idx)
423
- uvs = convert_data(mesh.v_tex)
424
-
425
- basecolor_tex = Image.fromarray(
426
- float32_to_uint8_np(convert_data(uv_padding(mat_out["albedo"])))
427
- ).convert("RGB")
428
- basecolor_tex.format = "JPEG"
429
-
430
- metallic = mat_out["metallic"].squeeze().cpu().item()
431
- roughness = mat_out["roughness"].squeeze().cpu().item()
432
-
433
- if "bump" in mat_out and mat_out["bump"] is not None:
434
- bump_np = convert_data(uv_padding(mat_out["bump"]))
435
- bump_up = np.ones_like(bump_np)
436
- bump_up[..., :2] = 0.5
437
- bump_up[..., 2:] = 1
438
- bump_tex = Image.fromarray(
439
- float32_to_uint8_np(
440
- bump_np,
441
- dither=True,
442
- # Do not dither if something is perfectly flat
443
- dither_mask=np.all(
444
- bump_np == bump_up, axis=-1, keepdims=True
445
- ).astype(np.float32),
446
- )
447
- ).convert("RGB")
448
- bump_tex.format = (
449
- "JPEG" # PNG would be better but the assets are larger
450
- )
451
- else:
452
- bump_tex = None
453
-
454
- material = trimesh.visual.material.PBRMaterial(
455
- baseColorTexture=basecolor_tex,
456
- roughnessFactor=roughness,
457
- metallicFactor=metallic,
458
- normalTexture=bump_tex,
459
- )
460
-
461
- tmesh = trimesh.Trimesh(
462
- vertices=verts_np,
463
- faces=faces,
464
- visual=trimesh.visual.texture.TextureVisuals(
465
- uv=uvs, material=material
466
- ),
467
- )
468
- rot = trimesh.transformations.rotation_matrix(
469
- np.radians(-90), [1, 0, 0]
470
- )
471
- tmesh.apply_transform(rot)
472
- tmesh.apply_transform(
473
- trimesh.transformations.rotation_matrix(
474
- np.radians(90), [0, 1, 0]
475
- )
476
- )
477
-
478
- tmesh.invert()
479
-
480
- rets.append(tmesh)
481
-
482
- return rets, global_dict
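
SF3D above is the public entry point; a minimal end-to-end sketch. The repo id, file names and example image follow the ones used elsewhere in this Space, and the input is assumed to already carry an alpha matte (run_image requires RGBA):

import torch
from PIL import Image
from sf3d.system import SF3D

model = SF3D.from_pretrained(
    "stabilityai/stable-fast-3d",
    config_name="config.yaml",
    weight_name="model.safetensors",
).eval().cuda()

image = Image.open("demo_files/examples/chair1.png").convert("RGBA")
with torch.no_grad():
    mesh, global_dict = model.run_image(image, bake_resolution=1024)
mesh.export("chair.glb", file_type="glb", include_normals=True)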
 
sf3d/texture_baker.py DELETED
@@ -1,87 +0,0 @@
1
- import os
2
-
3
- import slangtorch
4
- import torch
5
- import torch.nn as nn
6
- from jaxtyping import Bool, Float
7
- from torch import Tensor
8
-
9
-
10
- class TextureBaker(nn.Module):
11
- def __init__(self):
12
- super().__init__()
13
- self.baker = slangtorch.loadModule(
14
- os.path.join(os.path.dirname(__file__), "texture_baker.slang")
15
- )
16
-
17
- def rasterize(
18
- self,
19
- uv: Float[Tensor, "Nv 2"],
20
- face_indices: Float[Tensor, "Nf 3"],
21
- bake_resolution: int,
22
- ) -> Float[Tensor, "bake_resolution bake_resolution 4"]:
23
- if not face_indices.is_cuda or not uv.is_cuda:
24
- raise ValueError("All input tensors must be on cuda")
25
-
26
- face_indices = face_indices.to(torch.int32)
27
- uv = uv.to(torch.float32)
28
-
29
- rast_result = torch.empty(
30
- bake_resolution, bake_resolution, 4, device=uv.device, dtype=torch.float32
31
- )
32
-
33
- block_size = 16
34
- grid_size = bake_resolution // block_size
35
- self.baker.bake_uv(uv=uv, indices=face_indices, output=rast_result).launchRaw(
36
- blockSize=(block_size, block_size, 1), gridSize=(grid_size, grid_size, 1)
37
- )
38
-
39
- return rast_result
40
-
41
- def get_mask(
42
- self, rast: Float[Tensor, "bake_resolution bake_resolution 4"]
43
- ) -> Bool[Tensor, "bake_resolution bake_resolution"]:
44
- return rast[..., -1] >= 0
45
-
46
- def interpolate(
47
- self,
48
- attr: Float[Tensor, "Nv 3"],
49
- rast: Float[Tensor, "bake_resolution bake_resolution 4"],
50
- face_indices: Float[Tensor, "Nf 3"],
51
- uv: Float[Tensor, "Nv 2"],
52
- ) -> Float[Tensor, "bake_resolution bake_resolution 3"]:
53
- # Make sure all input tensors are on CUDA
54
- if not attr.is_cuda or not face_indices.is_cuda or not rast.is_cuda:
55
- raise ValueError("All input tensors must be on cuda")
56
-
57
- attr = attr.to(torch.float32)
58
- face_indices = face_indices.to(torch.int32)
59
- uv = uv.to(torch.float32)
60
-
61
- pos_bake = torch.zeros(
62
- rast.shape[0],
63
- rast.shape[1],
64
- 3,
65
- device=attr.device,
66
- dtype=attr.dtype,
67
- )
68
-
69
- block_size = 16
70
- grid_size = rast.shape[0] // block_size
71
- self.baker.interpolate(
72
- attr=attr, indices=face_indices, rast=rast, output=pos_bake
73
- ).launchRaw(
74
- blockSize=(block_size, block_size, 1), gridSize=(grid_size, grid_size, 1)
75
- )
76
-
77
- return pos_bake
78
-
79
- def forward(
80
- self,
81
- attr: Float[Tensor, "Nv 3"],
82
- uv: Float[Tensor, "Nv 2"],
83
- face_indices: Float[Tensor, "Nf 3"],
84
- bake_resolution: int,
85
- ) -> Float[Tensor, "bake_resolution bake_resolution 3"]:
86
- rast = self.rasterize(uv, face_indices, bake_resolution)
87
- return self.interpolate(attr, rast, face_indices, uv)
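
A small sketch of driving the baker removed above with a single UV-mapped triangle; the Slang kernels only accept CUDA tensors:

import torch
from sf3d.texture_baker import TextureBaker

baker = TextureBaker()
uv = torch.tensor([[0.1, 0.1], [0.9, 0.1], [0.5, 0.9]], device="cuda")
faces = torch.tensor([[0, 1, 2]], device="cuda", dtype=torch.int32)
attr = torch.rand(3, 3, device="cuda")    # e.g. per-vertex positions or colors

rast = baker.rasterize(uv, faces, bake_resolution=256)   # [256, 256, 4]: barycentrics + triangle id
mask = baker.get_mask(rast)                              # texels covered by a triangle
baked = baker.interpolate(attr, rast, faces, uv)         # [256, 256, 3]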
 
sf3d/texture_baker.slang DELETED
@@ -1,93 +0,0 @@
1
- // xy: 2D test position
2
- // v1: vertex position 1
3
- // v2: vertex position 2
4
- // v3: vertex position 3
5
- //
6
- bool barycentric_coordinates(float2 xy, float2 v1, float2 v2, float2 v3, out float u, out float v, out float w)
7
- {
8
- // Return true if the point (x,y) is inside the triangle defined by the vertices v1, v2, v3.
9
- // If the point is inside the triangle, the barycentric coordinates are stored in u, v, and w.
10
- float2 v1v2 = v2 - v1;
11
- float2 v1v3 = v3 - v1;
12
- float2 xyv1 = xy - v1;
13
-
14
- float d00 = dot(v1v2, v1v2);
15
- float d01 = dot(v1v2, v1v3);
16
- float d11 = dot(v1v3, v1v3);
17
- float d20 = dot(xyv1, v1v2);
18
- float d21 = dot(xyv1, v1v3);
19
-
20
- float denom = d00 * d11 - d01 * d01;
21
- v = (d11 * d20 - d01 * d21) / denom;
22
- w = (d00 * d21 - d01 * d20) / denom;
23
- u = 1.0 - v - w;
24
-
25
- return (v >= 0.0) && (w >= 0.0) && (v + w <= 1.0);
26
- }
27
-
28
- [AutoPyBindCUDA]
29
- [CUDAKernel]
30
- void interpolate(
31
- TensorView<float3> attr,
32
- TensorView<int3> indices,
33
- TensorView<float4> rast,
34
- TensorView<float3> output)
35
- {
36
- // Interpolate the attr into output based on the rast result (barycentric coordinates, + triangle idx)
37
-
38
- uint3 dispatch_id = cudaBlockIdx() * cudaBlockDim() + cudaThreadIdx();
39
-
40
- if (dispatch_id.x > output.size(0) || dispatch_id.y > output.size(1))
41
- return;
42
-
43
- float4 barycentric = rast[dispatch_id.x, dispatch_id.y];
44
- int triangle_idx = int(barycentric.w);
45
-
46
- if (triangle_idx < 0) {
47
- output[dispatch_id.x, dispatch_id.y] = float3(0.0, 0.0, 0.0);
48
- return;
49
- }
50
-
51
- float3 v1 = attr[indices[triangle_idx].x];
52
- float3 v2 = attr[indices[triangle_idx].y];
53
- float3 v3 = attr[indices[triangle_idx].z];
54
-
55
- output[dispatch_id.x, dispatch_id.y] = v1 * barycentric.x + v2 * barycentric.y + v3 * barycentric.z;
56
- }
57
-
58
- [AutoPyBindCUDA]
59
- [CUDAKernel]
60
- void bake_uv(
61
- TensorView<float2> uv,
62
- TensorView<int3> indices,
63
- TensorView<float4> output)
64
- {
65
- uint3 dispatch_id = cudaBlockIdx() * cudaBlockDim() + cudaThreadIdx();
66
-
67
- if (dispatch_id.y > output.size(0) || dispatch_id.x > output.size(1))
68
- return;
69
-
70
- // We index x,y but the original coords are HW. So swap them
71
- float2 pixel_coord = float2(dispatch_id.y, dispatch_id.x);
72
- // Normalize to [0, 1]
73
- pixel_coord /= float2(output.size(1), output.size(0));
74
- pixel_coord = clamp(pixel_coord, 0.0, 1.0);
75
- // Flip x-axis
76
- pixel_coord.y = 1 - pixel_coord.y;
77
-
78
- for (int i = 0; i < indices.size(0); i++) {
79
- float2 v1 = float2(uv[indices[i].x].x, uv[indices[i].x].y);
80
- float2 v2 = float2(uv[indices[i].y].x, uv[indices[i].y].y);
81
- float2 v3 = float2(uv[indices[i].z].x, uv[indices[i].z].y);
82
-
83
- float u, v, w;
84
- bool hit = barycentric_coordinates(pixel_coord, v1, v2, v3, u, v, w);
85
-
86
- if (hit){
87
- output[dispatch_id.x, dispatch_id.y] = float4(u, v, w, i);
88
- return;
89
- }
90
- }
91
-
92
- output[dispatch_id.x, dispatch_id.y] = float4(0.0, 0.0, 0.0, -1);
93
- }
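
The barycentric test in the kernel above is the standard one; a NumPy equivalent is handy for checking it on the CPU (purely illustrative, not part of the repo):

import numpy as np

def barycentric_coordinates(xy, v1, v2, v3):
    # Mirrors the Slang helper: returns (inside, u, v, w)
    v1v2, v1v3, xyv1 = v2 - v1, v3 - v1, xy - v1
    d00, d01, d11 = v1v2 @ v1v2, v1v2 @ v1v3, v1v3 @ v1v3
    d20, d21 = xyv1 @ v1v2, xyv1 @ v1v3
    denom = d00 * d11 - d01 * d01
    v = (d11 * d20 - d01 * d21) / denom
    w = (d00 * d21 - d01 * d20) / denom
    u = 1.0 - v - w
    return (v >= 0.0) and (w >= 0.0) and (v + w <= 1.0), u, v, w

inside, u, v, w = barycentric_coordinates(
    np.array([0.4, 0.3]), np.array([0.0, 0.0]), np.array([1.0, 0.0]), np.array([0.0, 1.0])
)
assert inside and abs(u + v + w - 1.0) < 1e-6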
 
sf3d/utils.py DELETED
@@ -1,91 +0,0 @@
1
- from typing import Any
2
-
3
- import numpy as np
4
- import rembg
5
- import torch
6
- from PIL import Image
7
-
8
- import sf3d.models.utils as sf3d_utils
9
-
10
-
11
- def create_intrinsic_from_fov_deg(fov_deg: float, cond_height: int, cond_width: int):
12
- intrinsic = sf3d_utils.get_intrinsic_from_fov(
13
- np.deg2rad(fov_deg),
14
- H=cond_height,
15
- W=cond_width,
16
- )
17
- intrinsic_normed_cond = intrinsic.clone()
18
- intrinsic_normed_cond[..., 0, 2] /= cond_width
19
- intrinsic_normed_cond[..., 1, 2] /= cond_height
20
- intrinsic_normed_cond[..., 0, 0] /= cond_width
21
- intrinsic_normed_cond[..., 1, 1] /= cond_height
22
-
23
- return intrinsic, intrinsic_normed_cond
24
-
25
-
26
- def default_cond_c2w(distance: float):
27
- c2w_cond = torch.as_tensor(
28
- [
29
- [0, 0, 1, distance],
30
- [1, 0, 0, 0],
31
- [0, 1, 0, 0],
32
- [0, 0, 0, 1],
33
- ]
34
- ).float()
35
- return c2w_cond
36
-
37
-
38
- def remove_background(
39
- image: Image,
40
- rembg_session: Any = None,
41
- force: bool = False,
42
- **rembg_kwargs,
43
- ) -> Image:
44
- do_remove = True
45
- if image.mode == "RGBA" and image.getextrema()[3][0] < 255:
46
- do_remove = False
47
- do_remove = do_remove or force
48
- if do_remove:
49
- image = rembg.remove(image, session=rembg_session, **rembg_kwargs)
50
- return image
51
-
52
-
53
- def resize_foreground(
54
- image: Image,
55
- ratio: float,
56
- ) -> Image:
57
- image = np.array(image)
58
- assert image.shape[-1] == 4
59
- alpha = np.where(image[..., 3] > 0)
60
- y1, y2, x1, x2 = (
61
- alpha[0].min(),
62
- alpha[0].max(),
63
- alpha[1].min(),
64
- alpha[1].max(),
65
- )
66
- # crop the foreground
67
- fg = image[y1:y2, x1:x2]
68
- # pad to square
69
- size = max(fg.shape[0], fg.shape[1])
70
- ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
71
- ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
72
- new_image = np.pad(
73
- fg,
74
- ((ph0, ph1), (pw0, pw1), (0, 0)),
75
- mode="constant",
76
- constant_values=((0, 0), (0, 0), (0, 0)),
77
- )
78
-
79
- # compute padding according to the ratio
80
- new_size = int(new_image.shape[0] / ratio)
81
- # pad to size, double side
82
- ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
83
- ph1, pw1 = new_size - size - ph0, new_size - size - pw0
84
- new_image = np.pad(
85
- new_image,
86
- ((ph0, ph1), (pw0, pw1), (0, 0)),
87
- mode="constant",
88
- constant_values=((0, 0), (0, 0), (0, 0)),
89
- )
90
- new_image = Image.fromarray(new_image, mode="RGBA")
91
- return new_image
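
A short sketch of chaining the two helpers above to prepare an input photo; the file name, session and ratio are illustrative:

import rembg
from PIL import Image
from sf3d.utils import remove_background, resize_foreground

session = rembg.new_session()
img = Image.open("photo.png").convert("RGBA")      # hypothetical input
img = remove_background(img, rembg_session=session)
img = resize_foreground(img, ratio=0.85)           # crop to the alpha bbox, pad back to a square
img.save("prepared.png")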
 
stable_fast.py DELETED
@@ -1,355 +0,0 @@
1
- import os
2
- import tempfile
3
- import time
4
- from functools import lru_cache
5
- from typing import Any
6
-
7
- import gradio as gr
8
- import numpy as np
9
- import rembg
10
- import torch
11
- from gradio_litmodel3d import LitModel3D
12
- from PIL import Image
13
-
14
- import sf3d.utils as sf3d_utils
15
- from sf3d.system import SF3D
16
-
17
- rembg_session = rembg.new_session()
18
-
19
- COND_WIDTH = 512
20
- COND_HEIGHT = 512
21
- COND_DISTANCE = 1.6
22
- COND_FOVY_DEG = 40
23
- BACKGROUND_COLOR = [0.5, 0.5, 0.5]
24
-
25
- # Cached. Doesn't change
26
- c2w_cond = sf3d_utils.default_cond_c2w(COND_DISTANCE)
27
- intrinsic, intrinsic_normed_cond = sf3d_utils.create_intrinsic_from_fov_deg(
28
- COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
29
- )
30
-
31
-
32
- model = SF3D.from_pretrained(
33
- "stabilityai/stable-fast-3d",
34
- config_name="config.yaml",
35
- weight_name="model.safetensors",
36
- )
37
- model.eval().cuda()
38
-
39
- example_files = [
40
- os.path.join("demo_files/examples", f) for f in os.listdir("demo_files/examples")
41
- ]
42
-
43
-
- def run_model(input_image):
-     start = time.time()
-     with torch.no_grad():
-         with torch.autocast(device_type="cuda", dtype=torch.float16):
-             model_batch = create_batch(input_image)
-             model_batch = {k: v.cuda() for k, v in model_batch.items()}
-             trimesh_mesh, _glob_dict = model.generate_mesh(model_batch, 1024)
-             trimesh_mesh = trimesh_mesh[0]
-
-     # Create new tmp file
-     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".glb")
-
-     trimesh_mesh.export(tmp_file.name, file_type="glb", include_normals=True)
-
-     print("Generation took:", time.time() - start, "s")
-
-     return tmp_file.name
-
-
- def create_batch(input_image: Image) -> dict[str, Any]:
-     img_cond = (
-         torch.from_numpy(
-             np.asarray(input_image.resize((COND_WIDTH, COND_HEIGHT))).astype(np.float32)
-             / 255.0
-         )
-         .float()
-         .clip(0, 1)
-     )
-     mask_cond = img_cond[:, :, -1:]
-     rgb_cond = torch.lerp(
-         torch.tensor(BACKGROUND_COLOR)[None, None, :], img_cond[:, :, :3], mask_cond
-     )
-
-     batch_elem = {
-         "rgb_cond": rgb_cond,
-         "mask_cond": mask_cond,
-         "c2w_cond": c2w_cond.unsqueeze(0),
-         "intrinsic_cond": intrinsic.unsqueeze(0),
-         "intrinsic_normed_cond": intrinsic_normed_cond.unsqueeze(0),
-     }
-     # Add batch dim
-     batched = {k: v.unsqueeze(0) for k, v in batch_elem.items()}
-     return batched
-
-
- @lru_cache
- def checkerboard(squares: int, size: int, min_value: float = 0.5):
-     base = np.zeros((squares, squares)) + min_value
-     base[1::2, ::2] = 1
-     base[::2, 1::2] = 1
-
-     repeat_mult = size // squares
-     return (
-         base.repeat(repeat_mult, axis=0)
-         .repeat(repeat_mult, axis=1)[:, :, None]
-         .repeat(3, axis=-1)
-     )
-
-
- def remove_background(input_image: Image) -> Image:
-     return rembg.remove(input_image, session=rembg_session)
-
-
- def resize_foreground(
-     image: Image,
-     ratio: float,
- ) -> Image:
-     image = np.array(image)
-     assert image.shape[-1] == 4
-     alpha = np.where(image[..., 3] > 0)
-     y1, y2, x1, x2 = (
-         alpha[0].min(),
-         alpha[0].max(),
-         alpha[1].min(),
-         alpha[1].max(),
-     )
-     # crop the foreground
-     fg = image[y1:y2, x1:x2]
-     # pad to square
-     size = max(fg.shape[0], fg.shape[1])
-     ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
-     ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
-     new_image = np.pad(
-         fg,
-         ((ph0, ph1), (pw0, pw1), (0, 0)),
-         mode="constant",
-         constant_values=((0, 0), (0, 0), (0, 0)),
-     )
-
-     # compute padding according to the ratio
-     new_size = int(new_image.shape[0] / ratio)
-     # pad to size, double side
-     ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
-     ph1, pw1 = new_size - size - ph0, new_size - size - pw0
-     new_image = np.pad(
-         new_image,
-         ((ph0, ph1), (pw0, pw1), (0, 0)),
-         mode="constant",
-         constant_values=((0, 0), (0, 0), (0, 0)),
-     )
-     new_image = Image.fromarray(new_image, mode="RGBA").resize(
-         (COND_WIDTH, COND_HEIGHT)
-     )
-     return new_image
-
-
- def square_crop(input_image: Image) -> Image:
-     # Perform a center square crop
-     min_size = min(input_image.size)
-     left = (input_image.size[0] - min_size) // 2
-     top = (input_image.size[1] - min_size) // 2
-     right = (input_image.size[0] + min_size) // 2
-     bottom = (input_image.size[1] + min_size) // 2
-     return input_image.crop((left, top, right, bottom)).resize(
-         (COND_WIDTH, COND_HEIGHT)
-     )
-
-
- def show_mask_img(input_image: Image) -> Image:
-     img_numpy = np.array(input_image)
-     alpha = img_numpy[:, :, 3] / 255.0
-     chkb = checkerboard(32, 512) * 255
-     new_img = img_numpy[..., :3] * alpha[:, :, None] + chkb * (1 - alpha[:, :, None])
-     return Image.fromarray(new_img.astype(np.uint8), mode="RGB")
-
-
- def run_button(run_btn, input_image, background_state, foreground_ratio):
-     if run_btn == "Run":
-         glb_file: str = run_model(background_state)
-
-         return (
-             gr.update(),
-             gr.update(),
-             gr.update(),
-             gr.update(),
-             gr.update(value=glb_file, visible=True),
-             gr.update(visible=True),
-         )
-     elif run_btn == "Remove Background":
-         rem_removed = remove_background(input_image)
-
-         sqr_crop = square_crop(rem_removed)
-         fr_res = resize_foreground(sqr_crop, foreground_ratio)
-
-         return (
-             gr.update(value="Run", visible=True),
-             sqr_crop,
-             fr_res,
-             gr.update(value=show_mask_img(fr_res), visible=True),
-             gr.update(value=None, visible=False),
-             gr.update(visible=False),
-         )
-
-
- def requires_bg_remove(image, fr):
-     if image is None:
-         return (
-             gr.update(visible=False, value="Run"),
-             None,
-             None,
-             gr.update(value=None, visible=False),
-             gr.update(visible=False),
-             gr.update(visible=False),
-         )
-     alpha_channel = np.array(image.getchannel("A"))
-     min_alpha = alpha_channel.min()
-
-     if min_alpha == 0:
-         print("Already has alpha")
-         sqr_crop = square_crop(image)
-         fr_res = resize_foreground(sqr_crop, fr)
-         return (
-             gr.update(value="Run", visible=True),
-             sqr_crop,
-             fr_res,
-             gr.update(value=show_mask_img(fr_res), visible=True),
-             gr.update(visible=False),
-             gr.update(visible=False),
-         )
-     return (
-         gr.update(value="Remove Background", visible=True),
-         None,
-         None,
-         gr.update(value=None, visible=False),
-         gr.update(visible=False),
-         gr.update(visible=False),
-     )
-
-
- def update_foreground_ratio(img_proc, fr):
-     foreground_res = resize_foreground(img_proc, fr)
-     return (
-         foreground_res,
-         gr.update(value=show_mask_img(foreground_res)),
-     )
-
-
- with gr.Blocks() as demo:
-     img_proc_state = gr.State()
-     background_remove_state = gr.State()
-     gr.Markdown("""
- # SF3D: Stable Fast 3D Mesh Reconstruction with UV-unwrapping and Illumination Disentanglement
-
- **SF3D** is a state-of-the-art method for 3D mesh reconstruction from a single image.
- This demo allows you to upload an image and generate a 3D mesh model from it.
-
- **Tips**
- 1. If the image already has an alpha channel, you can skip the background removal step.
- 2. You can adjust the foreground ratio to control the size of the foreground object. This can influence the shape
- 3. You can upload your own HDR environment map to light the 3D model.
-     """)
-     with gr.Row(variant="panel"):
-         with gr.Column():
-             with gr.Row():
-                 input_img = gr.Image(
-                     type="pil", label="Input Image", sources="upload", image_mode="RGBA"
-                 )
-                 preview_removal = gr.Image(
-                     label="Preview Background Removal",
-                     type="pil",
-                     image_mode="RGB",
-                     interactive=False,
-                     visible=False,
-                 )
-
-             foreground_ratio = gr.Slider(
-                 label="Foreground Ratio",
-                 minimum=0.5,
-                 maximum=1.0,
-                 value=0.85,
-                 step=0.05,
-             )
-
-             foreground_ratio.change(
-                 update_foreground_ratio,
-                 inputs=[img_proc_state, foreground_ratio],
-                 outputs=[background_remove_state, preview_removal],
-             )
-
-             run_btn = gr.Button("Run", variant="primary", visible=False)
-
-         with gr.Column():
-             output_3d = LitModel3D(
-                 label="3D Model",
-                 visible=False,
-                 clear_color=[0.0, 0.0, 0.0, 0.0],
-                 tonemapping="aces",
-                 contrast=1.0,
-                 scale=1.0,
-             )
-             with gr.Column(visible=False, scale=1.0) as hdr_row:
-                 gr.Markdown("""## HDR Environment Map
-
- Select an HDR environment map to light the 3D model. You can also upload your own HDR environment maps.
-                 """)
-
-                 with gr.Row():
-                     hdr_illumination_file = gr.File(
-                         label="HDR Env Map", file_types=[".hdr"], file_count="single"
-                     )
-                     example_hdris = [
-                         os.path.join("demo_files/hdri", f)
-                         for f in os.listdir("demo_files/hdri")
-                     ]
-                     hdr_illumination_example = gr.Examples(
-                         examples=example_hdris,
-                         inputs=hdr_illumination_file,
-                     )
-
-                 hdr_illumination_file.change(
-                     lambda x: gr.update(env_map=x.name if x is not None else None),
-                     inputs=hdr_illumination_file,
-                     outputs=[output_3d],
-                 )
-
-     examples = gr.Examples(
-         examples=example_files,
-         inputs=input_img,
-     )
-
-     input_img.change(
-         requires_bg_remove,
-         inputs=[input_img, foreground_ratio],
-         outputs=[
-             run_btn,
-             img_proc_state,
-             background_remove_state,
-             preview_removal,
-             output_3d,
-             hdr_row,
-         ],
-     )
-
-     run_btn.click(
-         run_button,
-         inputs=[
-             run_btn,
-             input_img,
-             background_remove_state,
-             foreground_ratio,
-         ],
-         outputs=[
-             run_btn,
-             img_proc_state,
-             background_remove_state,
-             preview_removal,
-             output_3d,
-             hdr_row,
-         ],
-     )
-
- demo.launch()
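
For anyone reconstructing the deleted demo, its control flow boils down to: remove the background when the upload has no transparency, square-crop, resize the foreground to the chosen ratio, then hand the result to run_model, which bakes a GLB. The following headless sketch is not part of the commit; it assumes the stabilityai/stable-fast-3d weights are downloadable, a CUDA device is present, and that the helpers above are importable from a module named stable_fast (that module name is an assumption, since this commit removes the file):

import numpy as np
from PIL import Image

# Hypothetical headless driver; reuses the helpers defined in the deleted stable_fast.py above.
from stable_fast import remove_background, resize_foreground, run_model, square_crop

img = Image.open("demo_files/examples/raccoon_wizard.png").convert("RGBA")

# Mirror requires_bg_remove(): a fully opaque alpha channel means the object still
# needs cutting out; any existing transparency means rembg can be skipped.
if np.array(img.getchannel("A")).min() > 0:
    img = remove_background(img)

img = resize_foreground(square_crop(img), 0.85)  # same default as the Gradio slider
glb_path = run_model(img)                        # writes a temporary .glb and returns its path
print("Mesh written to", glb_path)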