huanngzh committed · Commit a207590 · 0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1,37 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.glb filter=lfs diff=lfs merge=lfs -text
37
+ *.so filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: MV Adapter Img2Texture
3
+ emoji: 🔮
4
+ colorFrom: purple
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.23.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: Generate 3D texture from image
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,293 @@
1
+ import os
2
+ import random
3
+ import shutil
4
+ import subprocess
5
+ from typing import List
6
+
7
+ import gradio as gr
8
+ import numpy as np
9
+ import spaces
10
+ import torch
11
+ from huggingface_hub import hf_hub_download, snapshot_download
12
+ from PIL import Image
13
+ from torchvision import transforms
14
+ from transformers import AutoModelForImageSegmentation
15
+
16
+ from inference_ig2mv_sdxl import (
17
+ prepare_pipeline,
18
+ preprocess_image,
19
+ remove_bg,
20
+ run_pipeline,
21
+ )
22
+ from mvadapter.utils import get_orthogonal_camera, make_image_grid, tensor_to_image
23
+
24
+ # install others
25
+ subprocess.run("pip install spandrel==0.4.1 --no-deps", shell=True, check=True)
26
+
27
+
28
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
29
+ DTYPE = torch.float16
30
+ MAX_SEED = np.iinfo(np.int32).max
31
+ NUM_VIEWS = 6
32
+ HEIGHT = 768
33
+ WIDTH = 768
34
+
35
+ TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
36
+ os.makedirs(TMP_DIR, exist_ok=True)
37
+
38
+
39
+ HEADER = """
40
+ # 🔮 Image to Texture with [MV-Adapter](https://github.com/huanngzh/MV-Adapter)
41
+ ## State-of-the-art Open Source Texture Generation Using a Multi-View Diffusion Model
42
+ """
43
+
44
+ EXAMPLES = [
45
+ ["examples/001.jpeg", "examples/001.glb"],
46
+ ["examples/002.jpeg", "examples/002.glb"],
47
+ ]
48
+
49
+ # MV-Adapter
50
+ pipe = prepare_pipeline(
51
+ base_model="stabilityai/stable-diffusion-xl-base-1.0",
52
+ vae_model="madebyollin/sdxl-vae-fp16-fix",
53
+ unet_model=None,
54
+ lora_model=None,
55
+ adapter_path="huanngzh/mv-adapter",
56
+ scheduler=None,
57
+ num_views=NUM_VIEWS,
58
+ device=DEVICE,
59
+ dtype=DTYPE,
60
+ )
61
+ birefnet = AutoModelForImageSegmentation.from_pretrained(
62
+ "ZhengPeng7/BiRefNet", trust_remote_code=True
63
+ )
64
+ birefnet.to(DEVICE)
65
+ transform_image = transforms.Compose(
66
+ [
67
+ transforms.Resize((1024, 1024)),
68
+ transforms.ToTensor(),
69
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
70
+ ]
71
+ )
72
+ remove_bg_fn = lambda x: remove_bg(x, birefnet, transform_image, DEVICE)
73
+
74
+ if not os.path.exists("checkpoints/RealESRGAN_x2plus.pth"):
75
+ hf_hub_download(
76
+ "dtarnow/UPscaler", filename="RealESRGAN_x2plus.pth", local_dir="checkpoints"
77
+ )
78
+ if not os.path.exists("checkpoints/big-lama.pt"):
79
+ subprocess.run(
80
+ "wget -P checkpoints/ https://github.com/Sanster/models/releases/download/add_big_lama/big-lama.pt",
81
+ shell=True,
82
+ check=True,
83
+ )
84
+
85
+
86
+ device = "cuda" if torch.cuda.is_available() else "cpu"
87
+
88
+
89
+ def start_session(req: gr.Request):
90
+ save_dir = os.path.join(TMP_DIR, str(req.session_hash))
91
+ os.makedirs(save_dir, exist_ok=True)
92
+ print("start session, mkdir", save_dir)
93
+
94
+
95
+ def end_session(req: gr.Request):
96
+ save_dir = os.path.join(TMP_DIR, str(req.session_hash))
97
+ shutil.rmtree(save_dir)
98
+
99
+
100
+ def get_random_hex():
101
+ random_bytes = os.urandom(8)
102
+ random_hex = random_bytes.hex()
103
+ return random_hex
104
+
105
+
106
+ def get_random_seed(randomize_seed, seed):
107
+ if randomize_seed:
108
+ seed = random.randint(0, MAX_SEED)
109
+ return seed
110
+
111
+
112
+ @spaces.GPU(duration=90)
113
+ @torch.no_grad()
114
+ def run_mvadapter(
115
+ mesh_path,
116
+ prompt,
117
+ image,
118
+ seed=42,
119
+ guidance_scale=3.0,
120
+ num_inference_steps=30,
121
+ reference_conditioning_scale=1.0,
122
+ negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast",
123
+ progress=gr.Progress(track_tqdm=True),
124
+ ):
125
+ # pre-process the reference image
126
+ image = Image.open(image).convert("RGB") if isinstance(image, str) else image
127
+ image = remove_bg_fn(image)
128
+ image = preprocess_image(image, HEIGHT, WIDTH)
129
+
130
+ if isinstance(seed, str):
131
+ try:
132
+ seed = int(seed.strip())
133
+ except ValueError:
134
+ seed = 42
135
+
136
+ images, _, _, _ = run_pipeline(
137
+ pipe,
138
+ mesh_path=mesh_path,
139
+ num_views=NUM_VIEWS,
140
+ text=prompt,
141
+ image=image,
142
+ height=HEIGHT,
143
+ width=WIDTH,
144
+ num_inference_steps=num_inference_steps,
145
+ guidance_scale=guidance_scale,
146
+ seed=seed,
147
+ remove_bg_fn=None,
148
+ reference_conditioning_scale=reference_conditioning_scale,
149
+ negative_prompt=negative_prompt,
150
+ device=DEVICE,
151
+ )
152
+
153
+ torch.cuda.empty_cache()
154
+
155
+ return images, image
156
+
157
+
158
+ @spaces.GPU(duration=90)
159
+ @torch.no_grad()
160
+ def run_texturing(
161
+ mesh_path: str,
162
+ mv_images: List[Image.Image],
163
+ uv_unwarp: bool,
164
+ preprocess_mesh: bool,
165
+ uv_size: int,
166
+ req: gr.Request,
167
+ ):
168
+ save_dir = os.path.join(TMP_DIR, str(req.session_hash))
169
+ mv_image_path = os.path.join(save_dir, f"mv_adapter_{get_random_hex()}.png")
170
+ mv_images = [item[0] for item in mv_images]
171
+ make_image_grid(mv_images, rows=1).save(mv_image_path)
172
+
173
+ from texture import ModProcessConfig, TexturePipeline
174
+
175
+ texture_pipe = TexturePipeline(
176
+ upscaler_ckpt_path="checkpoints/RealESRGAN_x2plus.pth",
177
+ inpaint_ckpt_path="checkpoints/big-lama.pt",
178
+ device=DEVICE,
179
+ )
180
+
181
+ textured_glb_path = texture_pipe(
182
+ mesh_path=mesh_path,
183
+ save_dir=save_dir,
184
+ save_name=f"texture_mesh_{get_random_hex()}",
185
+ uv_unwarp=uv_unwarp,
186
+ preprocess_mesh=preprocess_mesh,
187
+ uv_size=uv_size,
188
+ rgb_path=mv_image_path,
189
+ rgb_process_config=ModProcessConfig(view_upscale=True, inpaint_mode="view"),
190
+ camera_azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
191
+ ).shaded_model_save_path
192
+
193
+ torch.cuda.empty_cache()
194
+
195
+ return textured_glb_path, textured_glb_path
196
+
197
+
198
+ with gr.Blocks(title="MVAdapter") as demo:
199
+ gr.Markdown(HEADER)
200
+
201
+ with gr.Row():
202
+ with gr.Column():
203
+ with gr.Row():
204
+ input_mesh = gr.Model3D(label="Input 3D mesh")
205
+ image_prompt = gr.Image(label="Input Image", type="pil")
206
+
207
+ with gr.Accordion("Generation Settings", open=False):
208
+ prompt = gr.Textbox(
209
+ label="Prompt (Optional)",
210
+ placeholder="Enter your prompt",
211
+ value="high quality",
212
+ )
213
+ seed = gr.Slider(
214
+ label="Seed", minimum=0, maximum=MAX_SEED, step=0, value=0
215
+ )
216
+ randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
217
+ num_inference_steps = gr.Slider(
218
+ label="Number of inference steps",
219
+ minimum=8,
220
+ maximum=50,
221
+ step=1,
222
+ value=25,
223
+ )
224
+ guidance_scale = gr.Slider(
225
+ label="CFG scale",
226
+ minimum=0.0,
227
+ maximum=20.0,
228
+ step=0.1,
229
+ value=3.0,
230
+ )
231
+ reference_conditioning_scale = gr.Slider(
232
+ label="Image conditioning scale",
233
+ minimum=0.0,
234
+ maximum=2.0,
235
+ step=0.1,
236
+ value=1.0,
237
+ )
238
+
239
+ with gr.Accordion("Texture Settings", open=False):
240
+ with gr.Row():
241
+ uv_unwarp = gr.Checkbox(label="Unwrap UV", value=True)
242
+ preprocess_mesh = gr.Checkbox(label="Preprocess Mesh", value=False)
243
+ uv_size = gr.Slider(
244
+ label="UV Size", minimum=1024, maximum=8192, step=512, value=4096
245
+ )
246
+
247
+ gen_button = gr.Button("Generate Texture", variant="primary")
248
+
249
+ examples = gr.Examples(
250
+ examples=EXAMPLES,
251
+ inputs=[image_prompt, input_mesh],
252
+ outputs=[image_prompt],
253
+ )
254
+
255
+ with gr.Column():
256
+ mv_result = gr.Gallery(
257
+ label="Multi-View Results",
258
+ show_label=False,
259
+ columns=[3],
260
+ rows=[2],
261
+ object_fit="contain",
262
+ height="auto",
263
+ type="pil",
264
+ )
265
+ textured_model_output = gr.Model3D(label="Textured GLB", interactive=False)
266
+ download_glb = gr.DownloadButton(label="Download GLB", interactive=False)
267
+
268
+ gen_button.click(
269
+ get_random_seed, inputs=[randomize_seed, seed], outputs=[seed]
270
+ ).then(
271
+ run_mvadapter,
272
+ inputs=[
273
+ input_mesh,
274
+ prompt,
275
+ image_prompt,
276
+ seed,
277
+ guidance_scale,
278
+ num_inference_steps,
279
+ reference_conditioning_scale,
280
+ ],
281
+ outputs=[mv_result, image_prompt],
282
+ ).then(
283
+ run_texturing,
284
+ inputs=[input_mesh, mv_result, uv_unwarp, preprocess_mesh, uv_size],
285
+ outputs=[textured_model_output, download_glb],
286
+ ).then(
287
+ lambda: gr.Button(interactive=True), outputs=[download_glb]
288
+ )
289
+
290
+ demo.load(start_session)
291
+ demo.unload(end_session)
292
+
293
+ demo.launch()
inference_ig2mv_sdxl.py ADDED
@@ -0,0 +1,286 @@
1
+ import argparse
2
+
3
+ import numpy as np
4
+ import torch
5
+ from diffusers import AutoencoderKL, DDPMScheduler, LCMScheduler, UNet2DConditionModel
6
+ from PIL import Image
7
+ from torchvision import transforms
8
+ from tqdm import tqdm
9
+ from transformers import AutoModelForImageSegmentation
10
+
11
+ from mvadapter.models.attention_processor import DecoupledMVRowColSelfAttnProcessor2_0
12
+ from mvadapter.pipelines.pipeline_mvadapter_i2mv_sdxl import MVAdapterI2MVSDXLPipeline
13
+ from mvadapter.schedulers.scheduling_shift_snr import ShiftSNRScheduler
14
+ from mvadapter.utils import get_orthogonal_camera, make_image_grid, tensor_to_image
15
+ from mvadapter.utils.render import NVDiffRastContextWrapper, load_mesh, render
16
+
17
+
18
+ def prepare_pipeline(
19
+ base_model,
20
+ vae_model,
21
+ unet_model,
22
+ lora_model,
23
+ adapter_path,
24
+ scheduler,
25
+ num_views,
26
+ device,
27
+ dtype,
28
+ ):
29
+ # Load vae and unet if provided
30
+ pipe_kwargs = {}
31
+ if vae_model is not None:
32
+ pipe_kwargs["vae"] = AutoencoderKL.from_pretrained(vae_model)
33
+ if unet_model is not None:
34
+ pipe_kwargs["unet"] = UNet2DConditionModel.from_pretrained(unet_model)
35
+
36
+ # Prepare pipeline
37
+ pipe: MVAdapterI2MVSDXLPipeline
38
+ pipe = MVAdapterI2MVSDXLPipeline.from_pretrained(base_model, **pipe_kwargs)
39
+
40
+ # Load scheduler if provided
41
+ scheduler_class = None
42
+ if scheduler == "ddpm":
43
+ scheduler_class = DDPMScheduler
44
+ elif scheduler == "lcm":
45
+ scheduler_class = LCMScheduler
46
+
47
+ pipe.scheduler = ShiftSNRScheduler.from_scheduler(
48
+ pipe.scheduler,
49
+ shift_mode="interpolated",
50
+ shift_scale=8.0,
51
+ scheduler_class=scheduler_class,
52
+ )
53
+ pipe.init_custom_adapter(
54
+ num_views=num_views, self_attn_processor=DecoupledMVRowColSelfAttnProcessor2_0
55
+ )
56
+ pipe.load_custom_adapter(
57
+ adapter_path, weight_name="mvadapter_ig2mv_sdxl.safetensors"
58
+ )
59
+
60
+ pipe.to(device=device, dtype=dtype)
61
+ pipe.cond_encoder.to(device=device, dtype=dtype)
62
+
63
+ # load lora if provided
64
+ if lora_model is not None:
65
+ model_, name_ = lora_model.rsplit("/", 1)
66
+ pipe.load_lora_weights(model_, weight_name=name_)
67
+
68
+ return pipe
69
+
70
+
71
+ def remove_bg(image, net, transform, device):
72
+ image_size = image.size
73
+ input_images = transform(image).unsqueeze(0).to(device)
74
+ with torch.no_grad():
75
+ preds = net(input_images)[-1].sigmoid().cpu()
76
+ pred = preds[0].squeeze()
77
+ pred_pil = transforms.ToPILImage()(pred)
78
+ mask = pred_pil.resize(image_size)
79
+ image.putalpha(mask)
80
+ return image
81
+
82
+
83
+ def preprocess_image(image: Image.Image, height, width):
84
+ image = np.array(image)
85
+ alpha = image[..., 3] > 0
86
+ H, W = alpha.shape
87
+ # get the bounding box of alpha
88
+ y, x = np.where(alpha)
89
+ y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)
90
+ x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)
91
+ image_center = image[y0:y1, x0:x1]
92
+ # resize the longer side to H * 0.9
93
+ H, W, _ = image_center.shape
94
+ if H > W:
95
+ W = int(W * (height * 0.9) / H)
96
+ H = int(height * 0.9)
97
+ else:
98
+ H = int(H * (width * 0.9) / W)
99
+ W = int(width * 0.9)
100
+ image_center = np.array(Image.fromarray(image_center).resize((W, H)))
101
+ # pad to H, W
102
+ start_h = (height - H) // 2
103
+ start_w = (width - W) // 2
104
+ image = np.zeros((height, width, 4), dtype=np.uint8)
105
+ image[start_h : start_h + H, start_w : start_w + W] = image_center
106
+ image = image.astype(np.float32) / 255.0
107
+ image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
108
+ image = (image * 255).clip(0, 255).astype(np.uint8)
109
+ image = Image.fromarray(image)
110
+
111
+ return image
112
+
113
+
114
+ def run_pipeline(
115
+ pipe,
116
+ mesh_path,
117
+ num_views,
118
+ text,
119
+ image,
120
+ height,
121
+ width,
122
+ num_inference_steps,
123
+ guidance_scale,
124
+ seed,
125
+ remove_bg_fn=None,
126
+ reference_conditioning_scale=1.0,
127
+ negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast",
128
+ lora_scale=1.0,
129
+ device="cuda",
130
+ ):
131
+ # Prepare cameras
132
+ cameras = get_orthogonal_camera(
133
+ elevation_deg=[0, 0, 0, 0, 89.99, -89.99],
134
+ distance=[1.8] * num_views,
135
+ left=-0.55,
136
+ right=0.55,
137
+ bottom=-0.55,
138
+ top=0.55,
139
+ azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
140
+ device=device,
141
+ )
142
+ ctx = NVDiffRastContextWrapper(device=device, context_type="cuda")
143
+
144
+ mesh = load_mesh(mesh_path, rescale=True, device=device)
145
+ render_out = render(
146
+ ctx,
147
+ mesh,
148
+ cameras,
149
+ height=height,
150
+ width=width,
151
+ render_attr=False,
152
+ normal_background=0.0,
153
+ )
154
+ pos_images = tensor_to_image((render_out.pos + 0.5).clamp(0, 1), batched=True)
155
+ normal_images = tensor_to_image(
156
+ (render_out.normal / 2 + 0.5).clamp(0, 1), batched=True
157
+ )
158
+ control_images = (
159
+ torch.cat(
160
+ [
161
+ (render_out.pos + 0.5).clamp(0, 1),
162
+ (render_out.normal / 2 + 0.5).clamp(0, 1),
163
+ ],
164
+ dim=-1,
165
+ )
166
+ .permute(0, 3, 1, 2)
167
+ .to(device)
168
+ )
169
+
170
+ # Prepare image
171
+ reference_image = Image.open(image) if isinstance(image, str) else image
172
+ if remove_bg_fn is not None:
173
+ reference_image = remove_bg_fn(reference_image)
174
+ reference_image = preprocess_image(reference_image, height, width)
175
+ elif reference_image.mode == "RGBA":
176
+ reference_image = preprocess_image(reference_image, height, width)
177
+
178
+ pipe_kwargs = {}
179
+ if seed != -1 and isinstance(seed, int):
180
+ pipe_kwargs["generator"] = torch.Generator(device=device).manual_seed(seed)
181
+
182
+ images = pipe(
183
+ text,
184
+ height=height,
185
+ width=width,
186
+ num_inference_steps=num_inference_steps,
187
+ guidance_scale=guidance_scale,
188
+ num_images_per_prompt=num_views,
189
+ control_image=control_images,
190
+ control_conditioning_scale=1.0,
191
+ reference_image=reference_image,
192
+ reference_conditioning_scale=reference_conditioning_scale,
193
+ negative_prompt=negative_prompt,
194
+ cross_attention_kwargs={"scale": lora_scale},
195
+ **pipe_kwargs,
196
+ ).images
197
+
198
+ return images, pos_images, normal_images, reference_image
199
+
200
+
201
+ if __name__ == "__main__":
202
+ parser = argparse.ArgumentParser()
203
+ # Models
204
+ parser.add_argument(
205
+ "--base_model", type=str, default="stabilityai/stable-diffusion-xl-base-1.0"
206
+ )
207
+ parser.add_argument(
208
+ "--vae_model", type=str, default="madebyollin/sdxl-vae-fp16-fix"
209
+ )
210
+ parser.add_argument("--unet_model", type=str, default=None)
211
+ parser.add_argument("--scheduler", type=str, default=None)
212
+ parser.add_argument("--lora_model", type=str, default=None)
213
+ parser.add_argument("--adapter_path", type=str, default="huanngzh/mv-adapter")
214
+ parser.add_argument("--num_views", type=int, default=6)
215
+ # Device
216
+ parser.add_argument("--device", type=str, default="cuda")
217
+ # Inference
218
+ parser.add_argument("--mesh", type=str, required=True)
219
+ parser.add_argument("--image", type=str, required=True)
220
+ parser.add_argument("--text", type=str, required=False, default="high quality")
221
+ parser.add_argument("--num_inference_steps", type=int, default=50)
222
+ parser.add_argument("--guidance_scale", type=float, default=3.0)
223
+ parser.add_argument("--seed", type=int, default=-1)
224
+ parser.add_argument("--lora_scale", type=float, default=1.0)
225
+ parser.add_argument("--reference_conditioning_scale", type=float, default=1.0)
226
+ parser.add_argument(
227
+ "--negative_prompt",
228
+ type=str,
229
+ default="watermark, ugly, deformed, noisy, blurry, low contrast",
230
+ )
231
+ parser.add_argument("--output", type=str, default="output.png")
232
+ # Extra
233
+ parser.add_argument("--remove_bg", action="store_true", help="Remove background")
234
+ args = parser.parse_args()
235
+
236
+ pipe = prepare_pipeline(
237
+ base_model=args.base_model,
238
+ vae_model=args.vae_model,
239
+ unet_model=args.unet_model,
240
+ lora_model=args.lora_model,
241
+ adapter_path=args.adapter_path,
242
+ scheduler=args.scheduler,
243
+ num_views=args.num_views,
244
+ device=args.device,
245
+ dtype=torch.float16,
246
+ )
247
+
248
+ if args.remove_bg:
249
+ birefnet = AutoModelForImageSegmentation.from_pretrained(
250
+ "ZhengPeng7/BiRefNet", trust_remote_code=True
251
+ )
252
+ birefnet.to(args.device)
253
+ transform_image = transforms.Compose(
254
+ [
255
+ transforms.Resize((1024, 1024)),
256
+ transforms.ToTensor(),
257
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
258
+ ]
259
+ )
260
+ remove_bg_fn = lambda x: remove_bg(x, birefnet, transform_image, args.device)
261
+ else:
262
+ remove_bg_fn = None
263
+
264
+ images, pos_images, normal_images, reference_image = run_pipeline(
265
+ pipe,
266
+ mesh_path=args.mesh,
267
+ num_views=args.num_views,
268
+ text=args.text,
269
+ image=args.image,
270
+ height=768,
271
+ width=768,
272
+ num_inference_steps=args.num_inference_steps,
273
+ guidance_scale=args.guidance_scale,
274
+ seed=args.seed,
275
+ lora_scale=args.lora_scale,
276
+ reference_conditioning_scale=args.reference_conditioning_scale,
277
+ negative_prompt=args.negative_prompt,
278
+ device=args.device,
279
+ remove_bg_fn=remove_bg_fn,
280
+ )
281
+ make_image_grid(images, rows=1).save(args.output)
282
+ make_image_grid(pos_images, rows=1).save(args.output.rsplit(".", 1)[0] + "_pos.png")
283
+ make_image_grid(normal_images, rows=1).save(
284
+ args.output.rsplit(".", 1)[0] + "_nor.png"
285
+ )
286
+ reference_image.save(args.output.rsplit(".", 1)[0] + "_reference.png")
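For reference, a minimal programmatic sketch of the two entry points defined above (`prepare_pipeline` and `run_pipeline`), used outside the argparse CLI. This is not part of the commit: the mesh and image paths are placeholders, the model defaults mirror the `__main__` block, and a CUDA device is required because rendering uses an nvdiffrast CUDA context.

```python
# Hedged usage sketch, not part of the commit. Paths below are placeholders.
import torch

from inference_ig2mv_sdxl import prepare_pipeline, run_pipeline
from mvadapter.utils import make_image_grid

pipe = prepare_pipeline(
    base_model="stabilityai/stable-diffusion-xl-base-1.0",
    vae_model="madebyollin/sdxl-vae-fp16-fix",
    unet_model=None,
    lora_model=None,
    adapter_path="huanngzh/mv-adapter",
    scheduler=None,
    num_views=6,
    device="cuda",
    dtype=torch.float16,
)

images, _, _, _ = run_pipeline(
    pipe,
    mesh_path="example.glb",  # placeholder input mesh
    num_views=6,
    text="high quality",
    image="example.png",  # placeholder reference image
    height=768,
    width=768,
    num_inference_steps=30,
    guidance_scale=3.0,
    seed=42,
    device="cuda",
)
make_image_grid(images, rows=1).save("mv_output.png")
```

The `__main__` block above exposes the same flow on the command line via `--mesh`, `--image`, and `--output`.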
mvadapter/__init__.py ADDED
File without changes
mvadapter/loaders/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .custom_adapter import CustomAdapterMixin
mvadapter/loaders/custom_adapter.py ADDED
@@ -0,0 +1,98 @@
1
+ import os
2
+ from typing import Dict, Optional, Union
3
+
4
+ import safetensors
5
+ import torch
6
+ from diffusers.utils import _get_model_file, logging
7
+ from safetensors import safe_open
8
+
9
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
10
+
11
+
12
+ class CustomAdapterMixin:
13
+ def init_custom_adapter(self, *args, **kwargs):
14
+ self._init_custom_adapter(*args, **kwargs)
15
+
16
+ def _init_custom_adapter(self, *args, **kwargs):
17
+ raise NotImplementedError
18
+
19
+ def load_custom_adapter(
20
+ self,
21
+ pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
22
+ weight_name: str,
23
+ subfolder: Optional[str] = None,
24
+ **kwargs,
25
+ ):
26
+ # Load the main state dict first.
27
+ cache_dir = kwargs.pop("cache_dir", None)
28
+ force_download = kwargs.pop("force_download", False)
29
+ proxies = kwargs.pop("proxies", None)
30
+ local_files_only = kwargs.pop("local_files_only", None)
31
+ token = kwargs.pop("token", None)
32
+ revision = kwargs.pop("revision", None)
33
+
34
+ user_agent = {
35
+ "file_type": "attn_procs_weights",
36
+ "framework": "pytorch",
37
+ }
38
+
39
+ if not isinstance(pretrained_model_name_or_path_or_dict, dict):
40
+ model_file = _get_model_file(
41
+ pretrained_model_name_or_path_or_dict,
42
+ weights_name=weight_name,
43
+ subfolder=subfolder,
44
+ cache_dir=cache_dir,
45
+ force_download=force_download,
46
+ proxies=proxies,
47
+ local_files_only=local_files_only,
48
+ token=token,
49
+ revision=revision,
50
+ user_agent=user_agent,
51
+ )
52
+ if weight_name.endswith(".safetensors"):
53
+ state_dict = {}
54
+ with safe_open(model_file, framework="pt", device="cpu") as f:
55
+ for key in f.keys():
56
+ state_dict[key] = f.get_tensor(key)
57
+ else:
58
+ state_dict = torch.load(model_file, map_location="cpu")
59
+ else:
60
+ state_dict = pretrained_model_name_or_path_or_dict
61
+
62
+ self._load_custom_adapter(state_dict)
63
+
64
+ def _load_custom_adapter(self, state_dict):
65
+ raise NotImplementedError
66
+
67
+ def save_custom_adapter(
68
+ self,
69
+ save_directory: Union[str, os.PathLike],
70
+ weight_name: str,
71
+ safe_serialization: bool = False,
72
+ **kwargs,
73
+ ):
74
+ if os.path.isfile(save_directory):
75
+ logger.error(
76
+ f"Provided path ({save_directory}) should be a directory, not a file"
77
+ )
78
+ return
79
+
80
+ if safe_serialization:
81
+
82
+ def save_function(weights, filename):
83
+ return safetensors.torch.save_file(
84
+ weights, filename, metadata={"format": "pt"}
85
+ )
86
+
87
+ else:
88
+ save_function = torch.save
89
+
90
+ # Save the model
91
+ state_dict = self._save_custom_adapter(**kwargs)
92
+ save_function(state_dict, os.path.join(save_directory, weight_name))
93
+ logger.info(
94
+ f"Custom adapter weights saved in {os.path.join(save_directory, weight_name)}"
95
+ )
96
+
97
+ def _save_custom_adapter(self):
98
+ raise NotImplementedError
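To illustrate the contract of `CustomAdapterMixin`, a hypothetical minimal subclass is sketched below. The concrete implementations in this repo live in the pipeline classes (e.g. `MVAdapterI2MVSDXLPipeline`); the class and attribute names here are invented purely for illustration.

```python
# Hypothetical sketch, not part of the commit: the three hooks the mixin calls.
from mvadapter.loaders import CustomAdapterMixin


class ToyAdapterPipeline(CustomAdapterMixin):
    def _init_custom_adapter(self, num_views: int = 6, **kwargs):
        # build the extra adapter modules (e.g. decoupled attention processors)
        self.adapter_state = {}

    def _load_custom_adapter(self, state_dict):
        # copy loaded tensors into the adapter modules created above
        self.adapter_state = dict(state_dict)

    def _save_custom_adapter(self):
        # return exactly what _load_custom_adapter expects to receive
        return self.adapter_state
```

With these hooks in place, the mixin's `init_custom_adapter`, `load_custom_adapter`, and `save_custom_adapter` handle the download, (safetensors) deserialization, and serialization of the adapter weights.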
mvadapter/models/__init__.py ADDED
File without changes
mvadapter/models/attention_processor.py ADDED
@@ -0,0 +1,743 @@
1
+ import math
2
+ from typing import Callable, List, Optional, Union
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from diffusers.models.attention_processor import Attention
7
+ from diffusers.models.unets import UNet2DConditionModel
8
+ from diffusers.utils import deprecate, logging
9
+ from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
10
+ from einops import rearrange, repeat
11
+ from torch import nn
12
+
13
+
14
+ def default_set_attn_proc_func(
15
+ name: str,
16
+ hidden_size: int,
17
+ cross_attention_dim: Optional[int],
18
+ ori_attn_proc: object,
19
+ ) -> object:
20
+ return ori_attn_proc
21
+
22
+
23
+ def set_unet_2d_condition_attn_processor(
24
+ unet: UNet2DConditionModel,
25
+ set_self_attn_proc_func: Callable = default_set_attn_proc_func,
26
+ set_cross_attn_proc_func: Callable = default_set_attn_proc_func,
27
+ set_custom_attn_proc_func: Callable = default_set_attn_proc_func,
28
+ set_self_attn_module_names: Optional[List[str]] = None,
29
+ set_cross_attn_module_names: Optional[List[str]] = None,
30
+ set_custom_attn_module_names: Optional[List[str]] = None,
31
+ ) -> None:
32
+ do_set_processor = lambda name, module_names: (
33
+ any([name.startswith(module_name) for module_name in module_names])
34
+ if module_names is not None
35
+ else True
36
+ ) # prefix match
37
+
38
+ attn_procs = {}
39
+ for name, attn_processor in unet.attn_processors.items():
40
+ # set attn_processor by default, if module_names is None
41
+ set_self_attn_processor = do_set_processor(name, set_self_attn_module_names)
42
+ set_cross_attn_processor = do_set_processor(name, set_cross_attn_module_names)
43
+ set_custom_attn_processor = do_set_processor(name, set_custom_attn_module_names)
44
+
45
+ if name.startswith("mid_block"):
46
+ hidden_size = unet.config.block_out_channels[-1]
47
+ elif name.startswith("up_blocks"):
48
+ block_id = int(name[len("up_blocks.")])
49
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
50
+ elif name.startswith("down_blocks"):
51
+ block_id = int(name[len("down_blocks.")])
52
+ hidden_size = unet.config.block_out_channels[block_id]
53
+
54
+ is_custom = "attn_mid_blocks" in name or "attn_post_blocks" in name
55
+ if is_custom:
56
+ attn_procs[name] = (
57
+ set_custom_attn_proc_func(name, hidden_size, None, attn_processor)
58
+ if set_custom_attn_processor
59
+ else attn_processor
60
+ )
61
+ else:
62
+ cross_attention_dim = (
63
+ None
64
+ if name.endswith("attn1.processor")
65
+ else unet.config.cross_attention_dim
66
+ )
67
+ if cross_attention_dim is None or "motion_modules" in name:
68
+ # self attention
69
+ attn_procs[name] = (
70
+ set_self_attn_proc_func(
71
+ name, hidden_size, cross_attention_dim, attn_processor
72
+ )
73
+ if set_self_attn_processor
74
+ else attn_processor
75
+ )
76
+ else:
77
+ # cross attention
78
+ attn_procs[name] = (
79
+ set_cross_attn_proc_func(
80
+ name, hidden_size, cross_attention_dim, attn_processor
81
+ )
82
+ if set_cross_attn_processor
83
+ else attn_processor
84
+ )
85
+
86
+ unet.set_attn_processor(attn_procs)
87
+
88
+
89
+ class DecoupledMVRowSelfAttnProcessor2_0(torch.nn.Module):
90
+ r"""
91
+ Attention processor for Decoupled Row-wise Self-Attention and Image Cross-Attention for PyTorch 2.0.
92
+ """
93
+
94
+ def __init__(
95
+ self,
96
+ query_dim: int,
97
+ inner_dim: int,
98
+ num_views: int = 1,
99
+ name: Optional[str] = None,
100
+ use_mv: bool = True,
101
+ use_ref: bool = False,
102
+ ):
103
+ if not hasattr(F, "scaled_dot_product_attention"):
104
+ raise ImportError(
105
+ "DecoupledMVRowSelfAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
106
+ )
107
+
108
+ super().__init__()
109
+
110
+ self.num_views = num_views
111
+ self.name = name # NOTE: need for image cross-attention
112
+ self.use_mv = use_mv
113
+ self.use_ref = use_ref
114
+
115
+ if self.use_mv:
116
+ self.to_q_mv = nn.Linear(
117
+ in_features=query_dim, out_features=inner_dim, bias=False
118
+ )
119
+ self.to_k_mv = nn.Linear(
120
+ in_features=query_dim, out_features=inner_dim, bias=False
121
+ )
122
+ self.to_v_mv = nn.Linear(
123
+ in_features=query_dim, out_features=inner_dim, bias=False
124
+ )
125
+ self.to_out_mv = nn.ModuleList(
126
+ [
127
+ nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True),
128
+ nn.Dropout(0.0),
129
+ ]
130
+ )
131
+
132
+ if self.use_ref:
133
+ self.to_q_ref = nn.Linear(
134
+ in_features=query_dim, out_features=inner_dim, bias=False
135
+ )
136
+ self.to_k_ref = nn.Linear(
137
+ in_features=query_dim, out_features=inner_dim, bias=False
138
+ )
139
+ self.to_v_ref = nn.Linear(
140
+ in_features=query_dim, out_features=inner_dim, bias=False
141
+ )
142
+ self.to_out_ref = nn.ModuleList(
143
+ [
144
+ nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True),
145
+ nn.Dropout(0.0),
146
+ ]
147
+ )
148
+
149
+ def __call__(
150
+ self,
151
+ attn: Attention,
152
+ hidden_states: torch.FloatTensor,
153
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
154
+ attention_mask: Optional[torch.FloatTensor] = None,
155
+ temb: Optional[torch.FloatTensor] = None,
156
+ mv_scale: float = 1.0,
157
+ ref_hidden_states: Optional[torch.FloatTensor] = None,
158
+ ref_scale: float = 1.0,
159
+ cache_hidden_states: Optional[List[torch.FloatTensor]] = None,
160
+ use_mv: bool = True,
161
+ use_ref: bool = True,
162
+ num_views: Optional[int] = None,
163
+ *args,
164
+ **kwargs,
165
+ ) -> torch.FloatTensor:
166
+ """
167
+ New args:
168
+ mv_scale (float): scale for multi-view self-attention.
169
+ ref_hidden_states (torch.FloatTensor): reference encoder hidden states for image cross-attention.
170
+ ref_scale (float): scale for image cross-attention.
171
+ cache_hidden_states (List[torch.FloatTensor]): cache hidden states from reference unet.
172
+
173
+ """
174
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
175
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
176
+ deprecate("scale", "1.0.0", deprecation_message)
177
+
178
+ if num_views is not None:
179
+ self.num_views = num_views
180
+
181
+ # NEW: cache hidden states for reference unet
182
+ if cache_hidden_states is not None:
183
+ cache_hidden_states[self.name] = hidden_states.clone()
184
+
185
+ # NEW: whether to use multi-view attention and image cross-attention
186
+ use_mv = self.use_mv and use_mv
187
+ use_ref = self.use_ref and use_ref
188
+
189
+ residual = hidden_states
190
+ if attn.spatial_norm is not None:
191
+ hidden_states = attn.spatial_norm(hidden_states, temb)
192
+
193
+ input_ndim = hidden_states.ndim
194
+
195
+ if input_ndim == 4:
196
+ batch_size, channel, height, width = hidden_states.shape
197
+ hidden_states = hidden_states.view(
198
+ batch_size, channel, height * width
199
+ ).transpose(1, 2)
200
+
201
+ batch_size, sequence_length, _ = (
202
+ hidden_states.shape
203
+ if encoder_hidden_states is None
204
+ else encoder_hidden_states.shape
205
+ )
206
+
207
+ if attention_mask is not None:
208
+ attention_mask = attn.prepare_attention_mask(
209
+ attention_mask, sequence_length, batch_size
210
+ )
211
+ # scaled_dot_product_attention expects attention_mask shape to be
212
+ # (batch, heads, source_length, target_length)
213
+ attention_mask = attention_mask.view(
214
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
215
+ )
216
+
217
+ if attn.group_norm is not None:
218
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
219
+ 1, 2
220
+ )
221
+
222
+ query = attn.to_q(hidden_states)
223
+
224
+ # NEW: for decoupled multi-view attention
225
+ if use_mv:
226
+ query_mv = self.to_q_mv(hidden_states)
227
+
228
+ # NEW: for decoupled reference cross attention
229
+ if use_ref:
230
+ query_ref = self.to_q_ref(hidden_states)
231
+
232
+ if encoder_hidden_states is None:
233
+ encoder_hidden_states = hidden_states
234
+ elif attn.norm_cross:
235
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
236
+ encoder_hidden_states
237
+ )
238
+
239
+ key = attn.to_k(encoder_hidden_states)
240
+ value = attn.to_v(encoder_hidden_states)
241
+
242
+ inner_dim = key.shape[-1]
243
+ head_dim = inner_dim // attn.heads
244
+
245
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
246
+
247
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
248
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
249
+
250
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
251
+ # TODO: add support for attn.scale when we move to Torch 2.1
252
+ hidden_states = F.scaled_dot_product_attention(
253
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
254
+ )
255
+
256
+ hidden_states = hidden_states.transpose(1, 2).reshape(
257
+ batch_size, -1, attn.heads * head_dim
258
+ )
259
+ hidden_states = hidden_states.to(query.dtype)
260
+
261
+ ####### Decoupled multi-view self-attention ########
262
+ if use_mv:
263
+ key_mv = self.to_k_mv(encoder_hidden_states)
264
+ value_mv = self.to_v_mv(encoder_hidden_states)
265
+
266
+ query_mv = query_mv.view(batch_size, -1, attn.heads, head_dim)
267
+ key_mv = key_mv.view(batch_size, -1, attn.heads, head_dim)
268
+ value_mv = value_mv.view(batch_size, -1, attn.heads, head_dim)
269
+
270
+ height = width = math.isqrt(sequence_length)
271
+
272
+ # row self-attention
273
+ query_mv = rearrange(
274
+ query_mv,
275
+ "(b nv) (ih iw) h c -> (b nv ih) iw h c",
276
+ nv=self.num_views,
277
+ ih=height,
278
+ iw=width,
279
+ ).transpose(1, 2)
280
+ key_mv = rearrange(
281
+ key_mv,
282
+ "(b nv) (ih iw) h c -> b ih (nv iw) h c",
283
+ nv=self.num_views,
284
+ ih=height,
285
+ iw=width,
286
+ )
287
+ key_mv = (
288
+ key_mv.repeat_interleave(self.num_views, dim=0)
289
+ .view(batch_size * height, -1, attn.heads, head_dim)
290
+ .transpose(1, 2)
291
+ )
292
+ value_mv = rearrange(
293
+ value_mv,
294
+ "(b nv) (ih iw) h c -> b ih (nv iw) h c",
295
+ nv=self.num_views,
296
+ ih=height,
297
+ iw=width,
298
+ )
299
+ value_mv = (
300
+ value_mv.repeat_interleave(self.num_views, dim=0)
301
+ .view(batch_size * height, -1, attn.heads, head_dim)
302
+ .transpose(1, 2)
303
+ )
304
+
305
+ hidden_states_mv = F.scaled_dot_product_attention(
306
+ query_mv,
307
+ key_mv,
308
+ value_mv,
309
+ dropout_p=0.0,
310
+ is_causal=False,
311
+ )
312
+ hidden_states_mv = rearrange(
313
+ hidden_states_mv,
314
+ "(b nv ih) h iw c -> (b nv) (ih iw) (h c)",
315
+ nv=self.num_views,
316
+ ih=height,
317
+ )
318
+ hidden_states_mv = hidden_states_mv.to(query.dtype)
319
+
320
+ # linear proj
321
+ hidden_states_mv = self.to_out_mv[0](hidden_states_mv)
322
+ # dropout
323
+ hidden_states_mv = self.to_out_mv[1](hidden_states_mv)
324
+
325
+ if use_ref:
326
+ reference_hidden_states = ref_hidden_states[self.name]
327
+
328
+ key_ref = self.to_k_ref(reference_hidden_states)
329
+ value_ref = self.to_v_ref(reference_hidden_states)
330
+
331
+ query_ref = query_ref.view(batch_size, -1, attn.heads, head_dim).transpose(
332
+ 1, 2
333
+ )
334
+ key_ref = key_ref.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
335
+ value_ref = value_ref.view(batch_size, -1, attn.heads, head_dim).transpose(
336
+ 1, 2
337
+ )
338
+
339
+ hidden_states_ref = F.scaled_dot_product_attention(
340
+ query_ref, key_ref, value_ref, dropout_p=0.0, is_causal=False
341
+ )
342
+
343
+ hidden_states_ref = hidden_states_ref.transpose(1, 2).reshape(
344
+ batch_size, -1, attn.heads * head_dim
345
+ )
346
+ hidden_states_ref = hidden_states_ref.to(query.dtype)
347
+
348
+ # linear proj
349
+ hidden_states_ref = self.to_out_ref[0](hidden_states_ref)
350
+ # dropout
351
+ hidden_states_ref = self.to_out_ref[1](hidden_states_ref)
352
+
353
+ # linear proj
354
+ hidden_states = attn.to_out[0](hidden_states)
355
+ # dropout
356
+ hidden_states = attn.to_out[1](hidden_states)
357
+
358
+ if use_mv:
359
+ hidden_states = hidden_states + hidden_states_mv * mv_scale
360
+
361
+ if use_ref:
362
+ hidden_states = hidden_states + hidden_states_ref * ref_scale
363
+
364
+ if input_ndim == 4:
365
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
366
+ batch_size, channel, height, width
367
+ )
368
+
369
+ if attn.residual_connection:
370
+ hidden_states = hidden_states + residual
371
+
372
+ hidden_states = hidden_states / attn.rescale_output_factor
373
+
374
+ return hidden_states
375
+
376
+ def set_num_views(self, num_views: int) -> None:
377
+ self.num_views = num_views
378
+
379
+
380
+ class DecoupledMVRowColSelfAttnProcessor2_0(torch.nn.Module):
381
+ r"""
382
+ Attention processor for Decoupled Row-wise and Column-wise Self-Attention and Image Cross-Attention for PyTorch 2.0.
383
+ """
384
+
385
+ def __init__(
386
+ self,
387
+ query_dim: int,
388
+ inner_dim: int,
389
+ num_views: int = 1,
390
+ name: Optional[str] = None,
391
+ use_mv: bool = True,
392
+ use_ref: bool = False,
393
+ ):
394
+ if not hasattr(F, "scaled_dot_product_attention"):
395
+ raise ImportError(
396
+ "DecoupledMVRowSelfAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
397
+ )
398
+
399
+ super().__init__()
400
+
401
+ self.num_views = num_views
402
+ self.name = name # NOTE: need for image cross-attention
403
+ self.use_mv = use_mv
404
+ self.use_ref = use_ref
405
+
406
+ if self.use_mv:
407
+ self.to_q_mv = nn.Linear(
408
+ in_features=query_dim, out_features=inner_dim, bias=False
409
+ )
410
+ self.to_k_mv = nn.Linear(
411
+ in_features=query_dim, out_features=inner_dim, bias=False
412
+ )
413
+ self.to_v_mv = nn.Linear(
414
+ in_features=query_dim, out_features=inner_dim, bias=False
415
+ )
416
+ self.to_out_mv = nn.ModuleList(
417
+ [
418
+ nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True),
419
+ nn.Dropout(0.0),
420
+ ]
421
+ )
422
+
423
+ if self.use_ref:
424
+ self.to_q_ref = nn.Linear(
425
+ in_features=query_dim, out_features=inner_dim, bias=False
426
+ )
427
+ self.to_k_ref = nn.Linear(
428
+ in_features=query_dim, out_features=inner_dim, bias=False
429
+ )
430
+ self.to_v_ref = nn.Linear(
431
+ in_features=query_dim, out_features=inner_dim, bias=False
432
+ )
433
+ self.to_out_ref = nn.ModuleList(
434
+ [
435
+ nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True),
436
+ nn.Dropout(0.0),
437
+ ]
438
+ )
439
+
440
+ def __call__(
441
+ self,
442
+ attn: Attention,
443
+ hidden_states: torch.FloatTensor,
444
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
445
+ attention_mask: Optional[torch.FloatTensor] = None,
446
+ temb: Optional[torch.FloatTensor] = None,
447
+ mv_scale: float = 1.0,
448
+ ref_hidden_states: Optional[torch.FloatTensor] = None,
449
+ ref_scale: float = 1.0,
450
+ cache_hidden_states: Optional[List[torch.FloatTensor]] = None,
451
+ use_mv: bool = True,
452
+ use_ref: bool = True,
453
+ num_views: Optional[int] = None,
454
+ *args,
455
+ **kwargs,
456
+ ) -> torch.FloatTensor:
457
+ """
458
+ New args:
459
+ mv_scale (float): scale for multi-view self-attention.
460
+ ref_hidden_states (torch.FloatTensor): reference encoder hidden states for image cross-attention.
461
+ ref_scale (float): scale for image cross-attention.
462
+ cache_hidden_states (List[torch.FloatTensor]): cache hidden states from reference unet.
463
+
464
+ """
465
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
466
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
467
+ deprecate("scale", "1.0.0", deprecation_message)
468
+
469
+ if num_views is not None:
470
+ self.num_views = num_views
471
+
472
+ # NEW: cache hidden states for reference unet
473
+ if cache_hidden_states is not None:
474
+ cache_hidden_states[self.name] = hidden_states.clone()
475
+
476
+ # NEW: whether to use multi-view attention and image cross-attention
477
+ use_mv = self.use_mv and use_mv
478
+ use_ref = self.use_ref and use_ref
479
+
480
+ residual = hidden_states
481
+ if attn.spatial_norm is not None:
482
+ hidden_states = attn.spatial_norm(hidden_states, temb)
483
+
484
+ input_ndim = hidden_states.ndim
485
+
486
+ if input_ndim == 4:
487
+ batch_size, channel, height, width = hidden_states.shape
488
+ hidden_states = hidden_states.view(
489
+ batch_size, channel, height * width
490
+ ).transpose(1, 2)
491
+
492
+ batch_size, sequence_length, _ = (
493
+ hidden_states.shape
494
+ if encoder_hidden_states is None
495
+ else encoder_hidden_states.shape
496
+ )
497
+
498
+ if attention_mask is not None:
499
+ attention_mask = attn.prepare_attention_mask(
500
+ attention_mask, sequence_length, batch_size
501
+ )
502
+ # scaled_dot_product_attention expects attention_mask shape to be
503
+ # (batch, heads, source_length, target_length)
504
+ attention_mask = attention_mask.view(
505
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
506
+ )
507
+
508
+ if attn.group_norm is not None:
509
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
510
+ 1, 2
511
+ )
512
+
513
+ query = attn.to_q(hidden_states)
514
+
515
+ # NEW: for decoupled multi-view attention
516
+ if use_mv:
517
+ query_mv = self.to_q_mv(hidden_states)
518
+
519
+ # NEW: for decoupled reference cross attention
520
+ if use_ref:
521
+ query_ref = self.to_q_ref(hidden_states)
522
+
523
+ if encoder_hidden_states is None:
524
+ encoder_hidden_states = hidden_states
525
+ elif attn.norm_cross:
526
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
527
+ encoder_hidden_states
528
+ )
529
+
530
+ key = attn.to_k(encoder_hidden_states)
531
+ value = attn.to_v(encoder_hidden_states)
532
+
533
+ inner_dim = key.shape[-1]
534
+ head_dim = inner_dim // attn.heads
535
+
536
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
537
+
538
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
539
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
540
+
541
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
542
+ # TODO: add support for attn.scale when we move to Torch 2.1
543
+ hidden_states = F.scaled_dot_product_attention(
544
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
545
+ )
546
+
547
+ hidden_states = hidden_states.transpose(1, 2).reshape(
548
+ batch_size, -1, attn.heads * head_dim
549
+ )
550
+ hidden_states = hidden_states.to(query.dtype)
551
+
552
+ ####### Decoupled multi-view self-attention ########
553
+ if use_mv:
554
+ key_mv = self.to_k_mv(encoder_hidden_states)
555
+ value_mv = self.to_v_mv(encoder_hidden_states)
556
+
557
+ query_mv = query_mv.view(batch_size, -1, attn.heads, head_dim)
558
+ key_mv = key_mv.view(batch_size, -1, attn.heads, head_dim)
559
+ value_mv = value_mv.view(batch_size, -1, attn.heads, head_dim)
560
+
561
+ height = width = math.isqrt(sequence_length)
562
+
563
+ query_mv = rearrange(
564
+ query_mv,
565
+ "(b nv) (ih iw) h c -> b nv ih iw h c",
566
+ nv=self.num_views,
567
+ ih=height,
568
+ iw=width,
569
+ )
570
+ key_mv = rearrange(
571
+ key_mv,
572
+ "(b nv) (ih iw) h c -> b nv ih iw h c",
573
+ nv=self.num_views,
574
+ ih=height,
575
+ iw=width,
576
+ )
577
+ value_mv = rearrange(
578
+ value_mv,
579
+ "(b nv) (ih iw) h c -> b nv ih iw h c",
580
+ nv=self.num_views,
581
+ ih=height,
582
+ iw=width,
583
+ )
584
+
585
+ # row-wise attention for view 0123 (front, right, back, left)
586
+ query_mv_0123 = rearrange(
587
+ query_mv[:, 0:4], "b nv ih iw h c -> (b ih) h (nv iw) c"
588
+ )
589
+ key_mv_0123 = rearrange(
590
+ key_mv[:, 0:4], "b nv ih iw h c -> (b ih) h (nv iw) c"
591
+ )
592
+ value_mv_0123 = rearrange(
593
+ value_mv[:, 0:4], "b nv ih iw h c -> (b ih) h (nv iw) c"
594
+ )
595
+ hidden_states_mv_0123 = F.scaled_dot_product_attention(
596
+ query_mv_0123,
597
+ key_mv_0123,
598
+ value_mv_0123,
599
+ dropout_p=0.0,
600
+ is_causal=False,
601
+ )
602
+ hidden_states_mv_0123 = rearrange(
603
+ hidden_states_mv_0123,
604
+ "(b ih) h (nv iw) c -> b nv (ih iw) (h c)",
605
+ ih=height,
606
+ iw=height,
607
+ )
608
+
609
+ # col-wise attention for view 0245 (front, back, top, bottom)
610
+ # flip first
611
+ query_mv_0245 = torch.cat(
612
+ [
613
+ torch.flip(query_mv[:, [0]], [3]), # horizontal flip
614
+ query_mv[:, [2, 4, 5]],
615
+ ],
616
+ dim=1,
617
+ )
618
+ key_mv_0245 = torch.cat(
619
+ [
620
+ torch.flip(key_mv[:, [0]], [3]), # horizontal flip
621
+ key_mv[:, [2, 4, 5]],
622
+ ],
623
+ dim=1,
624
+ )
625
+ value_mv_0245 = torch.cat(
626
+ [
627
+ torch.flip(value_mv[:, [0]], [3]), # horizontal flip
628
+ value_mv[:, [2, 4, 5]],
629
+ ],
630
+ dim=1,
631
+ )
632
+ # attention
633
+ query_mv_0245 = rearrange(
634
+ query_mv_0245, "b nv ih iw h c -> (b iw) h (nv ih) c"
635
+ )
636
+ key_mv_0245 = rearrange(key_mv_0245, "b nv ih iw h c -> (b iw) h (nv ih) c")
637
+ value_mv_0245 = rearrange(
638
+ value_mv_0245, "b nv ih iw h c -> (b iw) h (nv ih) c"
639
+ )
640
+ hidden_states_mv_0245 = F.scaled_dot_product_attention(
641
+ query_mv_0245,
642
+ key_mv_0245,
643
+ value_mv_0245,
644
+ dropout_p=0.0,
645
+ is_causal=False,
646
+ )
647
+ # flip back
648
+ hidden_states_mv_0245 = rearrange(
649
+ hidden_states_mv_0245,
650
+ "(b iw) h (nv ih) c -> b nv ih iw (h c)",
651
+ ih=height,
652
+ iw=height,
653
+ )
654
+ hidden_states_mv_0245 = torch.cat(
655
+ [
656
+ torch.flip(hidden_states_mv_0245[:, [0]], [3]), # horizontal flip
657
+ hidden_states_mv_0245[:, [1, 2, 3]],
658
+ ],
659
+ dim=1,
660
+ )
661
+ hidden_states_mv_0245 = hidden_states_mv_0245.view(
662
+ hidden_states_mv_0245.shape[0],
663
+ hidden_states_mv_0245.shape[1],
664
+ -1,
665
+ hidden_states_mv_0245.shape[-1],
666
+ )
667
+
668
+ # combine row and col
669
+ hidden_states_mv = torch.stack(
670
+ [
671
+ (hidden_states_mv_0123[:, 0] + hidden_states_mv_0245[:, 0]) / 2,
672
+ hidden_states_mv_0123[:, 1],
673
+ (hidden_states_mv_0123[:, 2] + hidden_states_mv_0245[:, 1]) / 2,
674
+ hidden_states_mv_0123[:, 3],
675
+ hidden_states_mv_0245[:, 2],
676
+ hidden_states_mv_0245[:, 3],
677
+ ],
678
+ dim=1,
679
+ )
680
+
681
+ hidden_states_mv = hidden_states_mv.view(
682
+ -1, hidden_states_mv.shape[-2], hidden_states_mv.shape[-1]
683
+ )
684
+ hidden_states_mv = hidden_states_mv.to(query.dtype)
685
+
686
+ # linear proj
687
+ hidden_states_mv = self.to_out_mv[0](hidden_states_mv)
688
+ # dropout
689
+ hidden_states_mv = self.to_out_mv[1](hidden_states_mv)
690
+
691
+ if use_ref:
692
+ reference_hidden_states = ref_hidden_states[self.name]
693
+
694
+ key_ref = self.to_k_ref(reference_hidden_states)
695
+ value_ref = self.to_v_ref(reference_hidden_states)
696
+
697
+ query_ref = query_ref.view(batch_size, -1, attn.heads, head_dim).transpose(
698
+ 1, 2
699
+ )
700
+ key_ref = key_ref.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
701
+ value_ref = value_ref.view(batch_size, -1, attn.heads, head_dim).transpose(
702
+ 1, 2
703
+ )
704
+
705
+ hidden_states_ref = F.scaled_dot_product_attention(
706
+ query_ref, key_ref, value_ref, dropout_p=0.0, is_causal=False
707
+ )
708
+
709
+ hidden_states_ref = hidden_states_ref.transpose(1, 2).reshape(
710
+ batch_size, -1, attn.heads * head_dim
711
+ )
712
+ hidden_states_ref = hidden_states_ref.to(query.dtype)
713
+
714
+ # linear proj
715
+ hidden_states_ref = self.to_out_ref[0](hidden_states_ref)
716
+ # dropout
717
+ hidden_states_ref = self.to_out_ref[1](hidden_states_ref)
718
+
719
+ # linear proj
720
+ hidden_states = attn.to_out[0](hidden_states)
721
+ # dropout
722
+ hidden_states = attn.to_out[1](hidden_states)
723
+
724
+ if use_mv:
725
+ hidden_states = hidden_states + hidden_states_mv * mv_scale
726
+
727
+ if use_ref:
728
+ hidden_states = hidden_states + hidden_states_ref * ref_scale
729
+
730
+ if input_ndim == 4:
731
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
732
+ batch_size, channel, height, width
733
+ )
734
+
735
+ if attn.residual_connection:
736
+ hidden_states = hidden_states + residual
737
+
738
+ hidden_states = hidden_states / attn.rescale_output_factor
739
+
740
+ return hidden_states
741
+
742
+ def set_num_views(self, num_views: int) -> None:
743
+ self.num_views = num_views
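A hedged sketch of how the helper above can install the decoupled processors on a UNet's self-attention layers. The exact wiring used by the pipelines happens inside their `init_custom_adapter`, so treat the `query_dim`/`inner_dim` choice and `num_views=6` here as assumptions for illustration; the new projections remain randomly initialized until adapter weights are loaded.

```python
# Hedged sketch, not part of the commit.
from diffusers import UNet2DConditionModel

from mvadapter.models.attention_processor import (
    DecoupledMVRowColSelfAttnProcessor2_0,
    set_unet_2d_condition_attn_processor,
)

unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
)


def make_mv_processor(name, hidden_size, cross_attention_dim, ori_attn_proc):
    # replace plain self-attention with decoupled row/col multi-view attention
    return DecoupledMVRowColSelfAttnProcessor2_0(
        query_dim=hidden_size,
        inner_dim=hidden_size,
        num_views=6,  # the row/col scheme assumes the 6-view layout used above
        name=name,
        use_mv=True,
        use_ref=True,
    )


set_unet_2d_condition_attn_processor(
    unet, set_self_attn_proc_func=make_mv_processor
)
```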
mvadapter/pipelines/pipeline_mvadapter_i2mv_sd.py ADDED
@@ -0,0 +1,777 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import inspect
15
+ from typing import Any, Callable, Dict, List, Optional, Union
16
+
17
+ import PIL
18
+ import torch
19
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
20
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
21
+ from diffusers.models import AutoencoderKL, T2IAdapter, UNet2DConditionModel
22
+ from diffusers.pipelines.stable_diffusion.pipeline_output import (
23
+ StableDiffusionPipelineOutput,
24
+ )
25
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
26
+ StableDiffusionPipeline,
27
+ rescale_noise_cfg,
28
+ retrieve_timesteps,
29
+ )
30
+ from diffusers.pipelines.stable_diffusion.safety_checker import (
31
+ StableDiffusionSafetyChecker,
32
+ )
33
+ from diffusers.schedulers import KarrasDiffusionSchedulers
34
+ from diffusers.utils import deprecate, is_torch_xla_available, logging
35
+ from diffusers.utils.torch_utils import randn_tensor
36
+ from transformers import (
37
+ CLIPImageProcessor,
38
+ CLIPTextModel,
39
+ CLIPTokenizer,
40
+ CLIPVisionModelWithProjection,
41
+ )
42
+
43
+ from ..loaders import CustomAdapterMixin
44
+ from ..models.attention_processor import (
45
+ DecoupledMVRowSelfAttnProcessor2_0,
46
+ set_unet_2d_condition_attn_processor,
47
+ )
48
+
49
+ if is_torch_xla_available():
50
+ import torch_xla.core.xla_model as xm
51
+
52
+ XLA_AVAILABLE = True
53
+ else:
54
+ XLA_AVAILABLE = False
55
+
56
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
57
+
58
+
59
+ def retrieve_latents(
60
+ encoder_output: torch.Tensor,
61
+ generator: Optional[torch.Generator] = None,
62
+ sample_mode: str = "sample",
63
+ ):
64
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
65
+ return encoder_output.latent_dist.sample(generator)
66
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
67
+ return encoder_output.latent_dist.mode()
68
+ elif hasattr(encoder_output, "latents"):
69
+ return encoder_output.latents
70
+ else:
71
+ raise AttributeError("Could not access latents of provided encoder_output")
72
+
73
+
74
+ class MVAdapterI2MVSDPipeline(StableDiffusionPipeline, CustomAdapterMixin):
75
+ def __init__(
76
+ self,
77
+ vae: AutoencoderKL,
78
+ text_encoder: CLIPTextModel,
79
+ tokenizer: CLIPTokenizer,
80
+ unet: UNet2DConditionModel,
81
+ scheduler: KarrasDiffusionSchedulers,
82
+ safety_checker: StableDiffusionSafetyChecker,
83
+ feature_extractor: CLIPImageProcessor,
84
+ image_encoder: CLIPVisionModelWithProjection = None,
85
+ requires_safety_checker: bool = False,
86
+ ):
87
+ super().__init__(
88
+ vae=vae,
89
+ text_encoder=text_encoder,
90
+ tokenizer=tokenizer,
91
+ unet=unet,
92
+ scheduler=scheduler,
93
+ safety_checker=safety_checker,
94
+ feature_extractor=feature_extractor,
95
+ image_encoder=image_encoder,
96
+ requires_safety_checker=requires_safety_checker,
97
+ )
98
+
99
+ self.control_image_processor = VaeImageProcessor(
100
+ vae_scale_factor=self.vae_scale_factor,
101
+ do_convert_rgb=True,
102
+ do_normalize=False,
103
+ )
104
+
105
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.prepare_latents
106
+ def prepare_image_latents(
107
+ self,
108
+ image,
109
+ timestep,
110
+ batch_size,
111
+ num_images_per_prompt,
112
+ dtype,
113
+ device,
114
+ generator=None,
115
+ add_noise=True,
116
+ ):
117
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
118
+ raise ValueError(
119
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
120
+ )
121
+
122
+ image = image.to(device=device, dtype=dtype)
123
+
124
+ batch_size = batch_size * num_images_per_prompt
125
+
126
+ if image.shape[1] == 4:
127
+ init_latents = image
128
+
129
+ else:
130
+ if isinstance(generator, list) and len(generator) != batch_size:
131
+ raise ValueError(
132
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
133
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
134
+ )
135
+
136
+ elif isinstance(generator, list):
137
+ init_latents = [
138
+ retrieve_latents(
139
+ self.vae.encode(image[i : i + 1]), generator=generator[i]
140
+ )
141
+ for i in range(batch_size)
142
+ ]
143
+ init_latents = torch.cat(init_latents, dim=0)
144
+ else:
145
+ init_latents = retrieve_latents(
146
+ self.vae.encode(image), generator=generator
147
+ )
148
+
149
+ init_latents = self.vae.config.scaling_factor * init_latents
150
+
151
+ if (
152
+ batch_size > init_latents.shape[0]
153
+ and batch_size % init_latents.shape[0] == 0
154
+ ):
155
+ # expand init_latents for batch_size
156
+ additional_image_per_prompt = batch_size // init_latents.shape[0]
157
+ init_latents = torch.cat(
158
+ [init_latents] * additional_image_per_prompt, dim=0
159
+ )
160
+ elif (
161
+ batch_size > init_latents.shape[0]
162
+ and batch_size % init_latents.shape[0] != 0
163
+ ):
164
+ raise ValueError(
165
+ f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
166
+ )
167
+ else:
168
+ init_latents = torch.cat([init_latents], dim=0)
169
+
170
+ if add_noise:
171
+ shape = init_latents.shape
172
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
173
+ # get latents
174
+ init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
175
+
176
+ latents = init_latents
177
+
178
+ return latents
179
+
180
+ def prepare_control_image(
181
+ self,
182
+ image,
183
+ width,
184
+ height,
185
+ batch_size,
186
+ num_images_per_prompt,
187
+ device,
188
+ dtype,
189
+ do_classifier_free_guidance=False,
190
+ num_empty_images=0, # for concat in batch like ImageDream
191
+ ):
192
+ assert hasattr(
193
+ self, "control_image_processor"
194
+ ), "control_image_processor is not initialized"
195
+
196
+ image = self.control_image_processor.preprocess(
197
+ image, height=height, width=width
198
+ ).to(dtype=torch.float32)
199
+
200
+ if num_empty_images > 0:
201
+ image = torch.cat(
202
+ [image, torch.zeros_like(image[:num_empty_images])], dim=0
203
+ )
204
+
205
+ image_batch_size = image.shape[0]
206
+
207
+ if image_batch_size == 1:
208
+ repeat_by = batch_size
209
+ else:
210
+ # image batch size is the same as prompt batch size
211
+ repeat_by = num_images_per_prompt # always 1 for control image
212
+
213
+ image = image.repeat_interleave(repeat_by, dim=0)
214
+
215
+ image = image.to(device=device, dtype=dtype)
216
+
217
+ if do_classifier_free_guidance:
218
+ image = torch.cat([image] * 2)
219
+
220
+ return image
221
+
222
+ @torch.no_grad()
223
+ def __call__(
224
+ self,
225
+ prompt: Union[str, List[str]] = None,
226
+ height: Optional[int] = None,
227
+ width: Optional[int] = None,
228
+ num_inference_steps: int = 50,
229
+ timesteps: List[int] = None,
230
+ sigmas: List[float] = None,
231
+ guidance_scale: float = 7.5,
232
+ negative_prompt: Optional[Union[str, List[str]]] = None,
233
+ num_images_per_prompt: Optional[int] = 1,
234
+ eta: float = 0.0,
235
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
236
+ latents: Optional[torch.Tensor] = None,
237
+ prompt_embeds: Optional[torch.Tensor] = None,
238
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
239
+ ip_adapter_image: Optional[PipelineImageInput] = None,
240
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
241
+ output_type: Optional[str] = "pil",
242
+ return_dict: bool = True,
243
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
244
+ guidance_rescale: float = 0.0,
245
+ clip_skip: Optional[int] = None,
246
+ callback_on_step_end: Optional[
247
+ Union[
248
+ Callable[[int, int, Dict], None],
249
+ PipelineCallback,
250
+ MultiPipelineCallbacks,
251
+ ]
252
+ ] = None,
253
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
254
+ # NEW
255
+ mv_scale: float = 1.0,
256
+ # Camera or geometry condition
257
+ control_image: Optional[PipelineImageInput] = None,
258
+ control_conditioning_scale: Optional[float] = 1.0,
259
+ control_conditioning_factor: float = 1.0,
260
+ # Image condition
261
+ reference_image: Optional[PipelineImageInput] = None,
262
+ reference_conditioning_scale: Optional[float] = 1.0,
263
+ # Optional. controlnet
264
+ controlnet_image: Optional[PipelineImageInput] = None,
265
+ controlnet_conditioning_scale: Optional[float] = 1.0,
266
+ **kwargs,
267
+ ):
268
+ r"""
269
+ The call function to the pipeline for generation.
270
+
271
+ Args:
272
+ prompt (`str` or `List[str]`, *optional*):
273
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
274
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
275
+ The height in pixels of the generated image.
276
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
277
+ The width in pixels of the generated image.
278
+ num_inference_steps (`int`, *optional*, defaults to 50):
279
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
280
+ expense of slower inference.
281
+ timesteps (`List[int]`, *optional*):
282
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
283
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
284
+ passed will be used. Must be in descending order.
285
+ sigmas (`List[float]`, *optional*):
286
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
287
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
288
+ will be used.
289
+ guidance_scale (`float`, *optional*, defaults to 7.5):
290
+ A higher guidance scale value encourages the model to generate images closely linked to the text
291
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
292
+ negative_prompt (`str` or `List[str]`, *optional*):
293
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
294
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
295
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
296
+ The number of images to generate per prompt.
297
+ eta (`float`, *optional*, defaults to 0.0):
298
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
299
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
300
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
301
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
302
+ generation deterministic.
303
+ latents (`torch.Tensor`, *optional*):
304
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
305
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
306
+ tensor is generated by sampling using the supplied random `generator`.
307
+ prompt_embeds (`torch.Tensor`, *optional*):
308
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
309
+ provided, text embeddings are generated from the `prompt` input argument.
310
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
311
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
312
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
313
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
314
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
315
+ Pre-generated image embeddings for IP-Adapter. It should be a list whose length equals the number of
316
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
317
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
318
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
319
+ output_type (`str`, *optional*, defaults to `"pil"`):
320
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
321
+ return_dict (`bool`, *optional*, defaults to `True`):
322
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
323
+ plain tuple.
324
+ cross_attention_kwargs (`dict`, *optional*):
325
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
326
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
327
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
328
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
329
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
330
+ using zero terminal SNR.
331
+ clip_skip (`int`, *optional*):
332
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
333
+ the output of the pre-final layer will be used for computing the prompt embeddings.
334
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
335
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
336
+ each denoising step during inference, with the following arguments: `callback_on_step_end(self:
337
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
338
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
339
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
340
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
341
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
342
+ `._callback_tensor_inputs` attribute of your pipeline class.
343
+
344
+ Examples:
345
+
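+ A minimal illustrative sketch (all names and values here are assumptions): `pipe` stands for an
+ already loaded `MVAdapterI2MVSDPipeline` whose custom adapter has been initialized, `control_images`
+ for the per-view camera/geometry maps, and `reference_image` for a single reference PIL image.
+ 
+ >>> num_views = 6
+ >>> output = pipe(
+ ...     prompt="high quality",
+ ...     num_images_per_prompt=num_views,  # one generated image per view
+ ...     control_image=control_images,
+ ...     reference_image=reference_image,
+ ...     reference_conditioning_scale=1.0,
+ ...     mv_scale=1.0,  # strength of the multi-view row attention
+ ...     num_inference_steps=30,
+ ...     guidance_scale=3.0,
+ ... )
+ >>> images = output.images  # list of `num_views` PIL images
+ 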
346
+ Returns:
347
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
348
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
349
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
350
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
351
+ "not-safe-for-work" (nsfw) content.
352
+ """
353
+
354
+ callback = kwargs.pop("callback", None)
355
+ callback_steps = kwargs.pop("callback_steps", None)
356
+
357
+ if callback is not None:
358
+ deprecate(
359
+ "callback",
360
+ "1.0.0",
361
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
362
+ )
363
+ if callback_steps is not None:
364
+ deprecate(
365
+ "callback_steps",
366
+ "1.0.0",
367
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
368
+ )
369
+
370
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
371
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
372
+
373
+ # 0. Default height and width to unet
374
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
375
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
376
+ # to deal with lora scaling and other possible forward hooks
377
+
378
+ # 1. Check inputs. Raise error if not correct
379
+ self.check_inputs(
380
+ prompt,
381
+ height,
382
+ width,
383
+ callback_steps,
384
+ negative_prompt,
385
+ prompt_embeds,
386
+ negative_prompt_embeds,
387
+ ip_adapter_image,
388
+ ip_adapter_image_embeds,
389
+ callback_on_step_end_tensor_inputs,
390
+ )
391
+
392
+ self._guidance_scale = guidance_scale
393
+ self._guidance_rescale = guidance_rescale
394
+ self._clip_skip = clip_skip
395
+ self._cross_attention_kwargs = cross_attention_kwargs
396
+ self._interrupt = False
397
+
398
+ # 2. Define call parameters
399
+ if prompt is not None and isinstance(prompt, str):
400
+ batch_size = 1
401
+ elif prompt is not None and isinstance(prompt, list):
402
+ batch_size = len(prompt)
403
+ else:
404
+ batch_size = prompt_embeds.shape[0]
405
+
406
+ device = self._execution_device
407
+
408
+ # 3. Encode input prompt
409
+ lora_scale = (
410
+ self.cross_attention_kwargs.get("scale", None)
411
+ if self.cross_attention_kwargs is not None
412
+ else None
413
+ )
414
+
415
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
416
+ prompt,
417
+ device,
418
+ num_images_per_prompt,
419
+ self.do_classifier_free_guidance,
420
+ negative_prompt,
421
+ prompt_embeds=prompt_embeds,
422
+ negative_prompt_embeds=negative_prompt_embeds,
423
+ lora_scale=lora_scale,
424
+ clip_skip=self.clip_skip,
425
+ )
426
+
427
+ # For classifier free guidance, we need to do two forward passes.
428
+ # Here we concatenate the unconditional and text embeddings into a single batch
429
+ # to avoid doing two forward passes
430
+ if self.do_classifier_free_guidance:
431
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
432
+
433
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
434
+ image_embeds = self.prepare_ip_adapter_image_embeds(
435
+ ip_adapter_image,
436
+ ip_adapter_image_embeds,
437
+ device,
438
+ batch_size * num_images_per_prompt,
439
+ self.do_classifier_free_guidance,
440
+ )
441
+
442
+ # 4. Prepare timesteps
443
+ timesteps, num_inference_steps = retrieve_timesteps(
444
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
445
+ )
446
+
447
+ # 5. Prepare latent variables
448
+ num_channels_latents = self.unet.config.in_channels
449
+ latents = self.prepare_latents(
450
+ batch_size * num_images_per_prompt,
451
+ num_channels_latents,
452
+ height,
453
+ width,
454
+ prompt_embeds.dtype,
455
+ device,
456
+ generator,
457
+ latents,
458
+ )
459
+
460
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
461
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
462
+
463
+ # 6.1 Add image embeds for IP-Adapter
464
+ added_cond_kwargs = (
465
+ {"image_embeds": image_embeds}
466
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
467
+ else None
468
+ )
469
+
470
+ # 6.2 Optionally get Guidance Scale Embedding
471
+ timestep_cond = None
472
+ if self.unet.config.time_cond_proj_dim is not None:
473
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
474
+ batch_size * num_images_per_prompt
475
+ )
476
+ timestep_cond = self.get_guidance_scale_embedding(
477
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
478
+ ).to(device=device, dtype=latents.dtype)
479
+
480
+ # Preprocess reference image
481
+ reference_image = self.image_processor.preprocess(reference_image)
482
+ reference_latents = self.prepare_image_latents(
483
+ reference_image,
484
+ timesteps[:1].repeat(batch_size * num_images_per_prompt), # unused since add_noise=False
485
+ batch_size,
486
+ 1,
487
+ prompt_embeds.dtype,
488
+ device,
489
+ generator,
490
+ add_noise=False,
491
+ )
492
+
493
+ ref_timesteps = torch.zeros_like(timesteps[0])
494
+ ref_hidden_states = {}
495
+ with torch.no_grad():
496
+ self.unet(
497
+ reference_latents,
498
+ ref_timesteps,
499
+ encoder_hidden_states=prompt_embeds[-1:],
500
+ cross_attention_kwargs={
501
+ "cache_hidden_states": ref_hidden_states,
502
+ "use_mv": False,
503
+ "use_ref": False,
504
+ },
505
+ return_dict=False,
506
+ )
507
+ ref_hidden_states = {
508
+ k: v.repeat_interleave(num_images_per_prompt, dim=0)
509
+ for k, v in ref_hidden_states.items()
510
+ }
511
+ if self.do_classifier_free_guidance:
512
+ ref_hidden_states = {
513
+ k: torch.cat([torch.zeros_like(v), v], dim=0)
514
+ for k, v in ref_hidden_states.items()
515
+ }
516
+
517
+ cross_attention_kwargs = {
518
+ "num_views": num_images_per_prompt,
519
+ "mv_scale": mv_scale,
520
+ "ref_hidden_states": {k: v.clone() for k, v in ref_hidden_states.items()},
521
+ "ref_scale": reference_conditioning_scale,
522
+ **(self.cross_attention_kwargs or {}),
523
+ }
524
+
525
+ # Preprocess control image
526
+ control_image_feature = self.prepare_control_image(
527
+ image=control_image,
528
+ width=width,
529
+ height=height,
530
+ batch_size=batch_size * num_images_per_prompt,
531
+ num_images_per_prompt=1, # NOTE: always 1 for control images
532
+ device=device,
533
+ dtype=latents.dtype,
534
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
535
+ )
536
+ control_image_feature = control_image_feature.to(
537
+ device=device, dtype=latents.dtype
538
+ )
539
+
540
+ adapter_state = self.cond_encoder(control_image_feature)
541
+ for i, state in enumerate(adapter_state):
542
+ adapter_state[i] = state * control_conditioning_scale
543
+
544
+ # Preprocess controlnet image if provided
545
+ do_controlnet = controlnet_image is not None and hasattr(self, "controlnet")
546
+ if do_controlnet:
547
+ controlnet_image = self.prepare_control_image(
548
+ image=controlnet_image,
549
+ width=width,
550
+ height=height,
551
+ batch_size=batch_size * num_images_per_prompt,
552
+ num_images_per_prompt=1, # NOTE: always 1 for control images
553
+ device=device,
554
+ dtype=latents.dtype,
555
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
556
+ )
557
+ controlnet_image = controlnet_image.to(device=device, dtype=latents.dtype)
558
+
559
+ # 7. Denoising loop
560
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
561
+ self._num_timesteps = len(timesteps)
562
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
563
+ for i, t in enumerate(timesteps):
564
+ if self.interrupt:
565
+ continue
566
+
567
+ # expand the latents if we are doing classifier free guidance
568
+ latent_model_input = (
569
+ torch.cat([latents] * 2)
570
+ if self.do_classifier_free_guidance
571
+ else latents
572
+ )
573
+ latent_model_input = self.scheduler.scale_model_input(
574
+ latent_model_input, t
575
+ )
576
+
577
+ if i < int(num_inference_steps * control_conditioning_factor):
578
+ down_intrablock_additional_residuals = [
579
+ state.clone() for state in adapter_state
580
+ ]
581
+ else:
582
+ down_intrablock_additional_residuals = None
583
+
584
+ unet_add_kwargs = {}
585
+
586
+ # Do controlnet if provided
587
+ if do_controlnet:
588
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
589
+ latent_model_input,
590
+ t,
591
+ encoder_hidden_states=prompt_embeds,
592
+ controlnet_cond=controlnet_image,
593
+ conditioning_scale=controlnet_conditioning_scale,
594
+ guess_mode=False,
595
+ added_cond_kwargs=added_cond_kwargs,
596
+ return_dict=False,
597
+ )
598
+ unet_add_kwargs.update(
599
+ {
600
+ "down_block_additional_residuals": down_block_res_samples,
601
+ "mid_block_additional_residual": mid_block_res_sample,
602
+ }
603
+ )
604
+
605
+ # predict the noise residual
606
+ noise_pred = self.unet(
607
+ latent_model_input,
608
+ t,
609
+ encoder_hidden_states=prompt_embeds,
610
+ timestep_cond=timestep_cond,
611
+ cross_attention_kwargs=cross_attention_kwargs,
612
+ down_intrablock_additional_residuals=down_intrablock_additional_residuals,
613
+ added_cond_kwargs=added_cond_kwargs,
614
+ return_dict=False,
615
+ **unet_add_kwargs,
616
+ )[0]
617
+
618
+ # perform guidance
619
+ if self.do_classifier_free_guidance:
620
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
621
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
622
+ noise_pred_text - noise_pred_uncond
623
+ )
624
+
625
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
626
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
627
+ noise_pred = rescale_noise_cfg(
628
+ noise_pred,
629
+ noise_pred_text,
630
+ guidance_rescale=self.guidance_rescale,
631
+ )
632
+
633
+ # compute the previous noisy sample x_t -> x_t-1
634
+ latents = self.scheduler.step(
635
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
636
+ )[0]
637
+
638
+ if callback_on_step_end is not None:
639
+ callback_kwargs = {}
640
+ for k in callback_on_step_end_tensor_inputs:
641
+ callback_kwargs[k] = locals()[k]
642
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
643
+
644
+ latents = callback_outputs.pop("latents", latents)
645
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
646
+ negative_prompt_embeds = callback_outputs.pop(
647
+ "negative_prompt_embeds", negative_prompt_embeds
648
+ )
649
+
650
+ # call the callback, if provided
651
+ if i == len(timesteps) - 1 or (
652
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
653
+ ):
654
+ progress_bar.update()
655
+ if callback is not None and i % callback_steps == 0:
656
+ step_idx = i // getattr(self.scheduler, "order", 1)
657
+ callback(step_idx, t, latents)
658
+
659
+ if XLA_AVAILABLE:
660
+ xm.mark_step()
661
+
662
+ if not output_type == "latent":
663
+ image = self.vae.decode(
664
+ latents / self.vae.config.scaling_factor,
665
+ return_dict=False,
666
+ generator=generator,
667
+ )[0]
668
+ image, has_nsfw_concept = self.run_safety_checker(
669
+ image, device, prompt_embeds.dtype
670
+ )
671
+ else:
672
+ image = latents
673
+ has_nsfw_concept = None
674
+
675
+ if has_nsfw_concept is None:
676
+ do_denormalize = [True] * image.shape[0]
677
+ else:
678
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
679
+ image = self.image_processor.postprocess(
680
+ image, output_type=output_type, do_denormalize=do_denormalize
681
+ )
682
+
683
+ # Offload all models
684
+ self.maybe_free_model_hooks()
685
+
686
+ if not return_dict:
687
+ return (image, has_nsfw_concept)
688
+
689
+ return StableDiffusionPipelineOutput(
690
+ images=image, nsfw_content_detected=has_nsfw_concept
691
+ )
692
+
693
+ ### NEW: adapters ###
694
+ def _init_custom_adapter(
695
+ self,
696
+ # Multi-view adapter
697
+ num_views: int = 1,
698
+ self_attn_processor: Any = DecoupledMVRowSelfAttnProcessor2_0,
699
+ # Condition encoder
700
+ cond_in_channels: int = 6,
701
+ # For training
702
+ copy_attn_weights: bool = True,
703
+ zero_init_module_keys: List[str] = [],
704
+ ):
705
+ # Condition encoder
706
+ self.cond_encoder = T2IAdapter(
707
+ in_channels=cond_in_channels,
708
+ channels=self.unet.config.block_out_channels,
709
+ num_res_blocks=self.unet.config.layers_per_block,
710
+ downscale_factor=8,
711
+ )
712
+
713
+ # set custom attn processor for multi-view attention
714
+ self.unet: UNet2DConditionModel
715
+ set_unet_2d_condition_attn_processor(
716
+ self.unet,
717
+ set_self_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
718
+ query_dim=hs,
719
+ inner_dim=hs,
720
+ num_views=num_views,
721
+ name=name,
722
+ use_mv=True,
723
+ use_ref=True,
724
+ ),
725
+ set_cross_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
726
+ query_dim=hs,
727
+ inner_dim=hs,
728
+ num_views=num_views,
729
+ name=name,
730
+ use_mv=False,
731
+ use_ref=False,
732
+ ),
733
+ )
734
+
735
+ # copy decoupled attention weights from original unet
736
+ if copy_attn_weights:
737
+ state_dict = self.unet.state_dict()
738
+ for key in state_dict.keys():
739
+ if "_mv" in key:
740
+ compatible_key = key.replace("_mv", "").replace("processor.", "")
741
+ elif "_ref" in key:
742
+ compatible_key = key.replace("_ref", "").replace("processor.", "")
743
+ else:
744
+ compatible_key = key
745
+
746
+ is_zero_init_key = any([k in key for k in zero_init_module_keys])
747
+ if is_zero_init_key:
748
+ state_dict[key] = torch.zeros_like(state_dict[compatible_key])
749
+ else:
750
+ state_dict[key] = state_dict[compatible_key].clone()
751
+ self.unet.load_state_dict(state_dict)
752
+
753
+ def _load_custom_adapter(self, state_dict):
754
+ self.unet.load_state_dict(state_dict, strict=False)
755
+ self.cond_encoder.load_state_dict(state_dict, strict=False)
756
+
757
+ def _save_custom_adapter(
758
+ self,
759
+ include_keys: Optional[List[str]] = None,
760
+ exclude_keys: Optional[List[str]] = None,
761
+ ):
762
+ def include_fn(k):
763
+ is_included = False
764
+
765
+ if include_keys is not None:
766
+ is_included = is_included or any([key in k for key in include_keys])
767
+ if exclude_keys is not None:
768
+ is_included = is_included and not any(
769
+ [key in k for key in exclude_keys]
770
+ )
771
+
772
+ return is_included
773
+
774
+ state_dict = {k: v for k, v in self.unet.state_dict().items() if include_fn(k)}
775
+ state_dict.update(self.cond_encoder.state_dict())
776
+
777
+ return state_dict
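+ 
+ # A rough usage sketch for the three adapter hooks above (all assumptions: `pipe` is an already
+ # constructed MVAdapterI2MVSDPipeline; the public entry points of `CustomAdapterMixin` are not
+ # shown here, so the underscore methods are called directly for illustration):
+ #
+ #   pipe._init_custom_adapter(num_views=6)  # install MV/ref attention processors + cond encoder
+ #   adapter_state = pipe._save_custom_adapter(include_keys=["_mv", "_ref"])
+ #   # `_save_custom_adapter` always adds the condition-encoder weights on top of the filtered keys
+ #   pipe._load_custom_adapter(adapter_state)  # non-strict load back into unet and cond_encoder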
mvadapter/pipelines/pipeline_mvadapter_i2mv_sdxl.py ADDED
@@ -0,0 +1,962 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import PIL
20
+ import torch
21
+ import torch.nn as nn
22
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
23
+ from diffusers.models import (
24
+ AutoencoderKL,
25
+ ImageProjection,
26
+ T2IAdapter,
27
+ UNet2DConditionModel,
28
+ )
29
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_output import (
30
+ StableDiffusionXLPipelineOutput,
31
+ )
32
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
33
+ StableDiffusionXLPipeline,
34
+ rescale_noise_cfg,
35
+ retrieve_timesteps,
36
+ )
37
+ from diffusers.schedulers import KarrasDiffusionSchedulers
38
+ from diffusers.utils import deprecate, logging
39
+ from diffusers.utils.torch_utils import randn_tensor
40
+ from einops import rearrange
41
+ from transformers import (
42
+ CLIPImageProcessor,
43
+ CLIPTextModel,
44
+ CLIPTextModelWithProjection,
45
+ CLIPTokenizer,
46
+ CLIPVisionModelWithProjection,
47
+ )
48
+
49
+ from ..loaders import CustomAdapterMixin
50
+ from ..models.attention_processor import (
51
+ DecoupledMVRowSelfAttnProcessor2_0,
52
+ set_unet_2d_condition_attn_processor,
53
+ )
54
+
55
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
56
+
57
+
58
+ def retrieve_latents(
59
+ encoder_output: torch.Tensor,
60
+ generator: Optional[torch.Generator] = None,
61
+ sample_mode: str = "sample",
62
+ ):
63
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
64
+ return encoder_output.latent_dist.sample(generator)
65
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
66
+ return encoder_output.latent_dist.mode()
67
+ elif hasattr(encoder_output, "latents"):
68
+ return encoder_output.latents
69
+ else:
70
+ raise AttributeError("Could not access latents of provided encoder_output")
71
+
72
+
73
+ class MVAdapterI2MVSDXLPipeline(StableDiffusionXLPipeline, CustomAdapterMixin):
74
+ def __init__(
75
+ self,
76
+ vae: AutoencoderKL,
77
+ text_encoder: CLIPTextModel,
78
+ text_encoder_2: CLIPTextModelWithProjection,
79
+ tokenizer: CLIPTokenizer,
80
+ tokenizer_2: CLIPTokenizer,
81
+ unet: UNet2DConditionModel,
82
+ scheduler: KarrasDiffusionSchedulers,
83
+ image_encoder: CLIPVisionModelWithProjection = None,
84
+ feature_extractor: CLIPImageProcessor = None,
85
+ force_zeros_for_empty_prompt: bool = True,
86
+ add_watermarker: Optional[bool] = None,
87
+ ):
88
+ super().__init__(
89
+ vae=vae,
90
+ text_encoder=text_encoder,
91
+ text_encoder_2=text_encoder_2,
92
+ tokenizer=tokenizer,
93
+ tokenizer_2=tokenizer_2,
94
+ unet=unet,
95
+ scheduler=scheduler,
96
+ image_encoder=image_encoder,
97
+ feature_extractor=feature_extractor,
98
+ force_zeros_for_empty_prompt=force_zeros_for_empty_prompt,
99
+ add_watermarker=add_watermarker,
100
+ )
101
+
102
+ self.control_image_processor = VaeImageProcessor(
103
+ vae_scale_factor=self.vae_scale_factor,
104
+ do_convert_rgb=True,
105
+ do_normalize=False,
106
+ )
107
+
108
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.prepare_latents
109
+ def prepare_image_latents(
110
+ self,
111
+ image,
112
+ timestep,
113
+ batch_size,
114
+ num_images_per_prompt,
115
+ dtype,
116
+ device,
117
+ generator=None,
118
+ add_noise=True,
119
+ ):
120
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
121
+ raise ValueError(
122
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
123
+ )
124
+
125
+ latents_mean = latents_std = None
126
+ if (
127
+ hasattr(self.vae.config, "latents_mean")
128
+ and self.vae.config.latents_mean is not None
129
+ ):
130
+ latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
131
+ if (
132
+ hasattr(self.vae.config, "latents_std")
133
+ and self.vae.config.latents_std is not None
134
+ ):
135
+ latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
136
+
137
+ # Offload text encoder if `enable_model_cpu_offload` was enabled
138
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
139
+ self.text_encoder_2.to("cpu")
140
+ torch.cuda.empty_cache()
141
+
142
+ image = image.to(device=device, dtype=dtype)
143
+
144
+ batch_size = batch_size * num_images_per_prompt
145
+
146
+ if image.shape[1] == 4:
147
+ init_latents = image
148
+
149
+ else:
150
+ # make sure the VAE is in float32 mode, as it overflows in float16
151
+ if self.vae.config.force_upcast:
152
+ image = image.float()
153
+ self.vae.to(dtype=torch.float32)
154
+
155
+ if isinstance(generator, list) and len(generator) != batch_size:
156
+ raise ValueError(
157
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
158
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
159
+ )
160
+
161
+ elif isinstance(generator, list):
162
+ if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
163
+ image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
164
+ elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
165
+ raise ValueError(
166
+ f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
167
+ )
168
+
169
+ init_latents = [
170
+ retrieve_latents(
171
+ self.vae.encode(image[i : i + 1]), generator=generator[i]
172
+ )
173
+ for i in range(batch_size)
174
+ ]
175
+ init_latents = torch.cat(init_latents, dim=0)
176
+ else:
177
+ init_latents = retrieve_latents(
178
+ self.vae.encode(image), generator=generator
179
+ )
180
+
181
+ if self.vae.config.force_upcast:
182
+ self.vae.to(dtype)
183
+
184
+ init_latents = init_latents.to(dtype)
185
+ if latents_mean is not None and latents_std is not None:
186
+ latents_mean = latents_mean.to(device=device, dtype=dtype)
187
+ latents_std = latents_std.to(device=device, dtype=dtype)
188
+ init_latents = (
189
+ (init_latents - latents_mean)
190
+ * self.vae.config.scaling_factor
191
+ / latents_std
192
+ )
193
+ else:
194
+ init_latents = self.vae.config.scaling_factor * init_latents
195
+
196
+ if (
197
+ batch_size > init_latents.shape[0]
198
+ and batch_size % init_latents.shape[0] == 0
199
+ ):
200
+ # expand init_latents for batch_size
201
+ additional_image_per_prompt = batch_size // init_latents.shape[0]
202
+ init_latents = torch.cat(
203
+ [init_latents] * additional_image_per_prompt, dim=0
204
+ )
205
+ elif (
206
+ batch_size > init_latents.shape[0]
207
+ and batch_size % init_latents.shape[0] != 0
208
+ ):
209
+ raise ValueError(
210
+ f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
211
+ )
212
+ else:
213
+ init_latents = torch.cat([init_latents], dim=0)
214
+
215
+ if add_noise:
216
+ shape = init_latents.shape
217
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
218
+ # get latents
219
+ init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
220
+
221
+ latents = init_latents
222
+
223
+ return latents
224
+
225
+ def prepare_control_image(
226
+ self,
227
+ image,
228
+ width,
229
+ height,
230
+ batch_size,
231
+ num_images_per_prompt,
232
+ device,
233
+ dtype,
234
+ do_classifier_free_guidance=False,
235
+ num_empty_images=0, # for concat in batch like ImageDream
236
+ ):
237
+ assert hasattr(
238
+ self, "control_image_processor"
239
+ ), "control_image_processor is not initialized"
240
+
241
+ image = self.control_image_processor.preprocess(
242
+ image, height=height, width=width
243
+ ).to(dtype=torch.float32)
244
+
245
+ if num_empty_images > 0:
246
+ image = torch.cat(
247
+ [image, torch.zeros_like(image[:num_empty_images])], dim=0
248
+ )
249
+
250
+ image_batch_size = image.shape[0]
251
+
252
+ if image_batch_size == 1:
253
+ repeat_by = batch_size
254
+ else:
255
+ # image batch size is the same as prompt batch size
256
+ repeat_by = num_images_per_prompt # always 1 for control image
257
+
258
+ image = image.repeat_interleave(repeat_by, dim=0)
259
+
260
+ image = image.to(device=device, dtype=dtype)
261
+
262
+ if do_classifier_free_guidance:
263
+ image = torch.cat([image] * 2)
264
+
265
+ return image
266
+
267
+ @torch.no_grad()
268
+ def __call__(
269
+ self,
270
+ prompt: Union[str, List[str]] = None,
271
+ prompt_2: Optional[Union[str, List[str]]] = None,
272
+ height: Optional[int] = None,
273
+ width: Optional[int] = None,
274
+ num_inference_steps: int = 50,
275
+ timesteps: List[int] = None,
276
+ denoising_end: Optional[float] = None,
277
+ guidance_scale: float = 5.0,
278
+ negative_prompt: Optional[Union[str, List[str]]] = None,
279
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
280
+ num_images_per_prompt: Optional[int] = 1,
281
+ eta: float = 0.0,
282
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
283
+ latents: Optional[torch.FloatTensor] = None,
284
+ prompt_embeds: Optional[torch.FloatTensor] = None,
285
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
286
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
287
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
288
+ ip_adapter_image: Optional[PipelineImageInput] = None,
289
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
290
+ output_type: Optional[str] = "pil",
291
+ return_dict: bool = True,
292
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
293
+ guidance_rescale: float = 0.0,
294
+ original_size: Optional[Tuple[int, int]] = None,
295
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
296
+ target_size: Optional[Tuple[int, int]] = None,
297
+ negative_original_size: Optional[Tuple[int, int]] = None,
298
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
299
+ negative_target_size: Optional[Tuple[int, int]] = None,
300
+ clip_skip: Optional[int] = None,
301
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
302
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
303
+ # NEW
304
+ mv_scale: float = 1.0,
305
+ # Camera or geometry condition
306
+ control_image: Optional[PipelineImageInput] = None,
307
+ control_conditioning_scale: Optional[float] = 1.0,
308
+ control_conditioning_factor: float = 1.0,
309
+ # Image condition
310
+ reference_image: Optional[PipelineImageInput] = None,
311
+ reference_conditioning_scale: Optional[float] = 1.0,
312
+ **kwargs,
313
+ ):
314
+ r"""
315
+ Function invoked when calling the pipeline for generation.
316
+
317
+ Args:
318
+ prompt (`str` or `List[str]`, *optional*):
319
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
320
+ instead.
321
+ prompt_2 (`str` or `List[str]`, *optional*):
322
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
323
+ used in both text-encoders.
324
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
325
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
326
+ Anything below 512 pixels won't work well for
327
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
328
+ and checkpoints that are not specifically fine-tuned on low resolutions.
329
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
330
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
331
+ Anything below 512 pixels won't work well for
332
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
333
+ and checkpoints that are not specifically fine-tuned on low resolutions.
334
+ num_inference_steps (`int`, *optional*, defaults to 50):
335
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
336
+ expense of slower inference.
337
+ timesteps (`List[int]`, *optional*):
338
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
339
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
340
+ passed will be used. Must be in descending order.
341
+ denoising_end (`float`, *optional*):
342
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
343
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
344
+ still retain a substantial amount of noise as determined by the discrete timesteps selected by the
345
+ scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
346
+ "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
347
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
348
+ guidance_scale (`float`, *optional*, defaults to 5.0):
349
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
350
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
351
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
352
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
353
+ usually at the expense of lower image quality.
354
+ negative_prompt (`str` or `List[str]`, *optional*):
355
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
356
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
357
+ less than `1`).
358
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
359
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
360
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
361
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
362
+ The number of images to generate per prompt.
363
+ eta (`float`, *optional*, defaults to 0.0):
364
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
365
+ [`schedulers.DDIMScheduler`], will be ignored for others.
366
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
367
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
368
+ to make generation deterministic.
369
+ latents (`torch.FloatTensor`, *optional*):
370
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
371
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
372
+ tensor will be generated by sampling using the supplied random `generator`.
373
+ prompt_embeds (`torch.FloatTensor`, *optional*):
374
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
375
+ provided, text embeddings will be generated from `prompt` input argument.
376
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
377
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
378
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
379
+ argument.
380
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
381
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
382
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
383
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
384
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
385
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
386
+ input argument.
387
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
388
+ ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
389
+ Pre-generated image embeddings for IP-Adapter. It should be a list whose length equals the number of
390
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
391
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
392
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
393
+ output_type (`str`, *optional*, defaults to `"pil"`):
394
+ The output format of the generated image. Choose between
395
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
396
+ return_dict (`bool`, *optional*, defaults to `True`):
397
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
398
+ of a plain tuple.
399
+ cross_attention_kwargs (`dict`, *optional*):
400
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
401
+ `self.processor` in
402
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
403
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
404
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
405
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf), where `guidance_scale` is defined as `φ` in equation 16 of
406
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
407
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
408
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
409
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
410
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
411
+ explained in section 2.2 of
412
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
413
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
414
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
415
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
416
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
417
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
418
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
419
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
420
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
421
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
422
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
423
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
424
+ micro-conditioning as explained in section 2.2 of
425
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
426
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
427
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
428
+ To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
429
+ micro-conditioning as explained in section 2.2 of
430
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
431
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
432
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
433
+ To negatively condition the generation process based on a target image resolution. It should be the same
434
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
435
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
436
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
437
+ callback_on_step_end (`Callable`, *optional*):
438
+ A function that is called at the end of each denoising step during inference. It is invoked
439
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
440
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
441
+ `callback_on_step_end_tensor_inputs`.
442
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
443
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
444
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
445
+ `._callback_tensor_inputs` attribute of your pipeline class.
446
+
447
+ Examples:
448
+
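+ A minimal illustrative sketch (all names and values here are assumptions): `pipe` stands for an
+ already loaded `MVAdapterI2MVSDXLPipeline` with its custom adapter initialized, `control_images`
+ for the per-view camera/geometry maps, and `reference_image` for a single reference PIL image.
+ 
+ >>> num_views = 6
+ >>> output = pipe(
+ ...     prompt="high quality",
+ ...     height=768,
+ ...     width=768,
+ ...     num_images_per_prompt=num_views,  # one generated image per view
+ ...     control_image=control_images,
+ ...     reference_image=reference_image,
+ ...     reference_conditioning_scale=1.0,
+ ...     mv_scale=1.0,
+ ...     num_inference_steps=30,
+ ...     guidance_scale=3.0,
+ ... )
+ >>> images = output.images  # list of `num_views` PIL images
+ 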
449
+ Returns:
450
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
451
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
452
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
453
+ """
454
+
455
+ callback = kwargs.pop("callback", None)
456
+ callback_steps = kwargs.pop("callback_steps", None)
457
+
458
+ if callback is not None:
459
+ deprecate(
460
+ "callback",
461
+ "1.0.0",
462
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
463
+ )
464
+ if callback_steps is not None:
465
+ deprecate(
466
+ "callback_steps",
467
+ "1.0.0",
468
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
469
+ )
470
+
471
+ # 0. Default height and width to unet
472
+ height = height or self.default_sample_size * self.vae_scale_factor
473
+ width = width or self.default_sample_size * self.vae_scale_factor
474
+
475
+ original_size = original_size or (height, width)
476
+ target_size = target_size or (height, width)
477
+
478
+ # 1. Check inputs. Raise error if not correct
479
+ self.check_inputs(
480
+ prompt,
481
+ prompt_2,
482
+ height,
483
+ width,
484
+ callback_steps,
485
+ negative_prompt,
486
+ negative_prompt_2,
487
+ prompt_embeds,
488
+ negative_prompt_embeds,
489
+ pooled_prompt_embeds,
490
+ negative_pooled_prompt_embeds,
491
+ ip_adapter_image,
492
+ ip_adapter_image_embeds,
493
+ callback_on_step_end_tensor_inputs,
494
+ )
495
+
496
+ self._guidance_scale = guidance_scale
497
+ self._guidance_rescale = guidance_rescale
498
+ self._clip_skip = clip_skip
499
+ self._cross_attention_kwargs = cross_attention_kwargs
500
+ self._denoising_end = denoising_end
501
+ self._interrupt = False
502
+
503
+ # 2. Define call parameters
504
+ if prompt is not None and isinstance(prompt, str):
505
+ batch_size = 1
506
+ elif prompt is not None and isinstance(prompt, list):
507
+ batch_size = len(prompt)
508
+ else:
509
+ batch_size = prompt_embeds.shape[0]
510
+
511
+ device = self._execution_device
512
+
513
+ # 3. Encode input prompt
514
+ lora_scale = (
515
+ self.cross_attention_kwargs.get("scale", None)
516
+ if self.cross_attention_kwargs is not None
517
+ else None
518
+ )
519
+
520
+ (
521
+ prompt_embeds,
522
+ negative_prompt_embeds,
523
+ pooled_prompt_embeds,
524
+ negative_pooled_prompt_embeds,
525
+ ) = self.encode_prompt(
526
+ prompt=prompt,
527
+ prompt_2=prompt_2,
528
+ device=device,
529
+ num_images_per_prompt=num_images_per_prompt,
530
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
531
+ negative_prompt=negative_prompt,
532
+ negative_prompt_2=negative_prompt_2,
533
+ prompt_embeds=prompt_embeds,
534
+ negative_prompt_embeds=negative_prompt_embeds,
535
+ pooled_prompt_embeds=pooled_prompt_embeds,
536
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
537
+ lora_scale=lora_scale,
538
+ clip_skip=self.clip_skip,
539
+ )
540
+
541
+ # 4. Prepare timesteps
542
+ timesteps, num_inference_steps = retrieve_timesteps(
543
+ self.scheduler, num_inference_steps, device, timesteps
544
+ )
545
+
546
+ # 5. Prepare latent variables
547
+ num_channels_latents = self.unet.config.in_channels
548
+ latents = self.prepare_latents(
549
+ batch_size * num_images_per_prompt,
550
+ num_channels_latents,
551
+ height,
552
+ width,
553
+ prompt_embeds.dtype,
554
+ device,
555
+ generator,
556
+ latents,
557
+ )
558
+
559
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
560
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
561
+
562
+ # 7. Prepare added time ids & embeddings
563
+ add_text_embeds = pooled_prompt_embeds
564
+ if self.text_encoder_2 is None:
565
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
566
+ else:
567
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
568
+
569
+ add_time_ids = self._get_add_time_ids(
570
+ original_size,
571
+ crops_coords_top_left,
572
+ target_size,
573
+ dtype=prompt_embeds.dtype,
574
+ text_encoder_projection_dim=text_encoder_projection_dim,
575
+ )
576
+ if negative_original_size is not None and negative_target_size is not None:
577
+ negative_add_time_ids = self._get_add_time_ids(
578
+ negative_original_size,
579
+ negative_crops_coords_top_left,
580
+ negative_target_size,
581
+ dtype=prompt_embeds.dtype,
582
+ text_encoder_projection_dim=text_encoder_projection_dim,
583
+ )
584
+ else:
585
+ negative_add_time_ids = add_time_ids
586
+
587
+ if self.do_classifier_free_guidance:
588
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
589
+ add_text_embeds = torch.cat(
590
+ [negative_pooled_prompt_embeds, add_text_embeds], dim=0
591
+ )
592
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
593
+
594
+ prompt_embeds = prompt_embeds.to(device)
595
+ add_text_embeds = add_text_embeds.to(device)
596
+ add_time_ids = add_time_ids.to(device).repeat(
597
+ batch_size * num_images_per_prompt, 1
598
+ )
599
+
600
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
601
+ image_embeds = self.prepare_ip_adapter_image_embeds(
602
+ ip_adapter_image,
603
+ ip_adapter_image_embeds,
604
+ device,
605
+ batch_size * num_images_per_prompt,
606
+ self.do_classifier_free_guidance,
607
+ )
608
+
609
+ # Preprocess reference image
610
+ reference_image = self.image_processor.preprocess(reference_image)
611
+ reference_latents = self.prepare_image_latents(
612
+ reference_image,
613
+ timesteps[:1].repeat(batch_size * num_images_per_prompt), # unused since add_noise=False
614
+ batch_size,
615
+ 1,
616
+ prompt_embeds.dtype,
617
+ device,
618
+ generator,
619
+ add_noise=False,
620
+ )
621
+
622
+ with torch.no_grad():
623
+ ref_timesteps = torch.zeros_like(timesteps[0])
624
+ ref_hidden_states = {}
625
+
626
+ self.unet(
627
+ reference_latents,
628
+ ref_timesteps,
629
+ encoder_hidden_states=prompt_embeds[-1:],
630
+ added_cond_kwargs={
631
+ "text_embeds": add_text_embeds[-1:],
632
+ "time_ids": add_time_ids[-1:],
633
+ },
634
+ cross_attention_kwargs={
635
+ "cache_hidden_states": ref_hidden_states,
636
+ "use_mv": False,
637
+ "use_ref": False,
638
+ },
639
+ return_dict=False,
640
+ )
641
+ ref_hidden_states = {
642
+ k: v.repeat_interleave(num_images_per_prompt, dim=0)
643
+ for k, v in ref_hidden_states.items()
644
+ }
645
+ if self.do_classifier_free_guidance:
646
+ ref_hidden_states = {
647
+ k: torch.cat([torch.zeros_like(v), v], dim=0)
648
+ for k, v in ref_hidden_states.items()
649
+ }
650
+
651
+ cross_attention_kwargs = {
652
+ "mv_scale": mv_scale,
653
+ "ref_hidden_states": {k: v.clone() for k, v in ref_hidden_states.items()},
654
+ "ref_scale": reference_conditioning_scale,
655
+ "num_views": num_images_per_prompt,
656
+ **(self.cross_attention_kwargs or {}),
657
+ }
658
+
659
+ # Preprocess control image
660
+ control_image_feature = self.prepare_control_image(
661
+ image=control_image,
662
+ width=width,
663
+ height=height,
664
+ batch_size=batch_size * num_images_per_prompt,
665
+ num_images_per_prompt=1, # NOTE: always 1 for control images
666
+ device=device,
667
+ dtype=latents.dtype,
668
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
669
+ )
670
+ control_image_feature = control_image_feature.to(
671
+ device=device, dtype=latents.dtype
672
+ )
673
+
674
+ adapter_state = self.cond_encoder(control_image_feature)
675
+ for i, state in enumerate(adapter_state):
676
+ adapter_state[i] = state * control_conditioning_scale
677
+
678
+ # 8. Denoising loop
679
+ num_warmup_steps = max(
680
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
681
+ )
682
+
683
+ # 8.1 Apply denoising_end
684
+ if (
685
+ self.denoising_end is not None
686
+ and isinstance(self.denoising_end, float)
687
+ and self.denoising_end > 0
688
+ and self.denoising_end < 1
689
+ ):
690
+ discrete_timestep_cutoff = int(
691
+ round(
692
+ self.scheduler.config.num_train_timesteps
693
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
694
+ )
695
+ )
696
+ num_inference_steps = len(
697
+ list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))
698
+ )
699
+ timesteps = timesteps[:num_inference_steps]
700
+
701
+ # 9. Optionally get Guidance Scale Embedding
702
+ timestep_cond = None
703
+ if self.unet.config.time_cond_proj_dim is not None:
704
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
705
+ batch_size * num_images_per_prompt
706
+ )
707
+ timestep_cond = self.get_guidance_scale_embedding(
708
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
709
+ ).to(device=device, dtype=latents.dtype)
710
+
711
+ self._num_timesteps = len(timesteps)
712
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
713
+ for i, t in enumerate(timesteps):
714
+ if self.interrupt:
715
+ continue
716
+
717
+ # expand the latents if we are doing classifier free guidance
718
+ latent_model_input = (
719
+ torch.cat([latents] * 2)
720
+ if self.do_classifier_free_guidance
721
+ else latents
722
+ )
723
+
724
+ latent_model_input = self.scheduler.scale_model_input(
725
+ latent_model_input, t
726
+ )
727
+
728
+ added_cond_kwargs = {
729
+ "text_embeds": add_text_embeds,
730
+ "time_ids": add_time_ids,
731
+ }
732
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
733
+ added_cond_kwargs["image_embeds"] = image_embeds
734
+
735
+ if i < int(num_inference_steps * control_conditioning_factor):
736
+ down_intrablock_additional_residuals = [
737
+ state.clone() for state in adapter_state
738
+ ]
739
+ else:
740
+ down_intrablock_additional_residuals = None
741
+
742
+ # predict the noise residual
743
+ noise_pred = self.unet(
744
+ latent_model_input,
745
+ t,
746
+ encoder_hidden_states=prompt_embeds,
747
+ timestep_cond=timestep_cond,
748
+ cross_attention_kwargs=cross_attention_kwargs,
749
+ down_intrablock_additional_residuals=down_intrablock_additional_residuals,
750
+ added_cond_kwargs=added_cond_kwargs,
751
+ return_dict=False,
752
+ )[0]
753
+
754
+ # perform guidance
755
+ if self.do_classifier_free_guidance:
756
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
757
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
758
+ noise_pred_text - noise_pred_uncond
759
+ )
760
+
761
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
762
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
763
+ noise_pred = rescale_noise_cfg(
764
+ noise_pred,
765
+ noise_pred_text,
766
+ guidance_rescale=self.guidance_rescale,
767
+ )
768
+
769
+ # compute the previous noisy sample x_t -> x_t-1
770
+ latents_dtype = latents.dtype
771
+ latents = self.scheduler.step(
772
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
773
+ )[0]
774
+ if latents.dtype != latents_dtype:
775
+ if torch.backends.mps.is_available():
776
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
777
+ latents = latents.to(latents_dtype)
778
+
779
+ if callback_on_step_end is not None:
780
+ callback_kwargs = {}
781
+ for k in callback_on_step_end_tensor_inputs:
782
+ callback_kwargs[k] = locals()[k]
783
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
784
+
785
+ latents = callback_outputs.pop("latents", latents)
786
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
787
+ negative_prompt_embeds = callback_outputs.pop(
788
+ "negative_prompt_embeds", negative_prompt_embeds
789
+ )
790
+ add_text_embeds = callback_outputs.pop(
791
+ "add_text_embeds", add_text_embeds
792
+ )
793
+ negative_pooled_prompt_embeds = callback_outputs.pop(
794
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
795
+ )
796
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
797
+ negative_add_time_ids = callback_outputs.pop(
798
+ "negative_add_time_ids", negative_add_time_ids
799
+ )
800
+
801
+ # call the callback, if provided
802
+ if i == len(timesteps) - 1 or (
803
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
804
+ ):
805
+ progress_bar.update()
806
+ if callback is not None and i % callback_steps == 0:
807
+ step_idx = i // getattr(self.scheduler, "order", 1)
808
+ callback(step_idx, t, latents)
809
+
810
+ if not output_type == "latent":
811
+ # make sure the VAE is in float32 mode, as it overflows in float16
812
+ needs_upcasting = (
813
+ self.vae.dtype == torch.float16 and self.vae.config.force_upcast
814
+ )
815
+
816
+ if needs_upcasting:
817
+ self.upcast_vae()
818
+ latents = latents.to(
819
+ next(iter(self.vae.post_quant_conv.parameters())).dtype
820
+ )
821
+ elif latents.dtype != self.vae.dtype:
822
+ if torch.backends.mps.is_available():
823
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
824
+ self.vae = self.vae.to(latents.dtype)
825
+
826
+ # unscale/denormalize the latents
827
+ # denormalize with the mean and std if available and not None
828
+ has_latents_mean = (
829
+ hasattr(self.vae.config, "latents_mean")
830
+ and self.vae.config.latents_mean is not None
831
+ )
832
+ has_latents_std = (
833
+ hasattr(self.vae.config, "latents_std")
834
+ and self.vae.config.latents_std is not None
835
+ )
836
+ if has_latents_mean and has_latents_std:
837
+ latents_mean = (
838
+ torch.tensor(self.vae.config.latents_mean)
839
+ .view(1, 4, 1, 1)
840
+ .to(latents.device, latents.dtype)
841
+ )
842
+ latents_std = (
843
+ torch.tensor(self.vae.config.latents_std)
844
+ .view(1, 4, 1, 1)
845
+ .to(latents.device, latents.dtype)
846
+ )
847
+ latents = (
848
+ latents * latents_std / self.vae.config.scaling_factor
849
+ + latents_mean
850
+ )
851
+ else:
852
+ latents = latents / self.vae.config.scaling_factor
853
+
854
+ image = self.vae.decode(latents, return_dict=False)[0]
855
+
856
+ # cast back to fp16 if needed
857
+ if needs_upcasting:
858
+ self.vae.to(dtype=torch.float16)
859
+ else:
860
+ image = latents
861
+
862
+ if not output_type == "latent":
863
+ # apply watermark if available
864
+ if self.watermark is not None:
865
+ image = self.watermark.apply_watermark(image)
866
+
867
+ image = self.image_processor.postprocess(image, output_type=output_type)
868
+
869
+ # Offload all models
870
+ self.maybe_free_model_hooks()
871
+
872
+ if not return_dict:
873
+ return (image,)
874
+
875
+ return StableDiffusionXLPipelineOutput(images=image)
876
+
877
+ ### NEW: adapters ###
878
+ def _init_custom_adapter(
879
+ self,
880
+ # Multi-view adapter
881
+ num_views: int = 1,
882
+ self_attn_processor: Any = DecoupledMVRowSelfAttnProcessor2_0,
883
+ # Condition encoder
884
+ cond_in_channels: int = 6,
885
+ # For training
886
+ copy_attn_weights: bool = True,
887
+ zero_init_module_keys: List[str] = [],
888
+ ):
889
+ # Condition encoder
890
+ self.cond_encoder = T2IAdapter(
891
+ in_channels=cond_in_channels,
892
+ channels=(320, 640, 1280, 1280),
893
+ num_res_blocks=2,
894
+ downscale_factor=16,
895
+ adapter_type="full_adapter_xl",
896
+ )
897
+
898
+ # set custom attn processor for multi-view attention and image cross-attention
899
+ self.unet: UNet2DConditionModel
900
+ set_unet_2d_condition_attn_processor(
901
+ self.unet,
902
+ set_self_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
903
+ query_dim=hs,
904
+ inner_dim=hs,
905
+ num_views=num_views,
906
+ name=name,
907
+ use_mv=True,
908
+ use_ref=True,
909
+ ),
910
+ set_cross_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
911
+ query_dim=hs,
912
+ inner_dim=hs,
913
+ num_views=num_views,
914
+ name=name,
915
+ use_mv=False,
916
+ use_ref=False,
917
+ ),
918
+ )
919
+
920
+ # copy decoupled attention weights from original unet
921
+ if copy_attn_weights:
922
+ state_dict = self.unet.state_dict()
923
+ for key in state_dict.keys():
924
+ if "_mv" in key:
925
+ compatible_key = key.replace("_mv", "").replace("processor.", "")
926
+ elif "_ref" in key:
927
+ compatible_key = key.replace("_ref", "").replace("processor.", "")
928
+ else:
929
+ compatible_key = key
930
+
931
+ is_zero_init_key = any([k in key for k in zero_init_module_keys])
932
+ if is_zero_init_key:
933
+ state_dict[key] = torch.zeros_like(state_dict[compatible_key])
934
+ else:
935
+ state_dict[key] = state_dict[compatible_key].clone()
936
+ self.unet.load_state_dict(state_dict)
937
+
938
+ def _load_custom_adapter(self, state_dict):
939
+ self.unet.load_state_dict(state_dict, strict=False)
940
+ self.cond_encoder.load_state_dict(state_dict, strict=False)
941
+
942
+ def _save_custom_adapter(
943
+ self,
944
+ include_keys: Optional[List[str]] = None,
945
+ exclude_keys: Optional[List[str]] = None,
946
+ ):
947
+ def include_fn(k):
948
+ is_included = False
949
+
950
+ if include_keys is not None:
951
+ is_included = is_included or any([key in k for key in include_keys])
952
+ if exclude_keys is not None:
953
+ is_included = is_included and not any(
954
+ [key in k for key in exclude_keys]
955
+ )
956
+
957
+ return is_included
958
+
959
+ state_dict = {k: v for k, v in self.unet.state_dict().items() if include_fn(k)}
960
+ state_dict.update(self.cond_encoder.state_dict())
961
+
962
+ return state_dict
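The adapter hooks above only install, extract, and restore the MV-Adapter-specific weights; they leave the rest of the SDXL UNet untouched. Below is a minimal usage sketch (an editor's illustration, not part of this commit) of how they might be exercised together. `pipe` is assumed to be an instance of the pipeline class defined in this file, and the view count and output path are placeholders.

import torch


def attach_and_save_adapter(pipe, num_views=6, out_path="mvadapter_weights.pt"):
    # Install the decoupled multi-view ("_mv") and reference ("_ref") attention
    # branches plus the 6-channel T2IAdapter condition encoder, copying the base
    # attention weights as initialization (see _init_custom_adapter above).
    pipe._init_custom_adapter(num_views=num_views, cond_in_channels=6)

    # Keep only the newly added UNet parameters (keys containing "_mv"/"_ref");
    # _save_custom_adapter always appends the condition encoder weights as well.
    adapter_state = pipe._save_custom_adapter(include_keys=["_mv", "_ref"])
    torch.save(adapter_state, out_path)


def load_adapter(pipe, num_views=6, path="mvadapter_weights.pt"):
    # The decoupled processors must exist before loading, so install them first;
    # loading is non-strict, so only adapter keys present in the file are updated.
    pipe._init_custom_adapter(num_views=num_views, cond_in_channels=6)
    pipe._load_custom_adapter(torch.load(path, map_location="cpu"))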
mvadapter/pipelines/pipeline_mvadapter_t2mv_sd.py ADDED
@@ -0,0 +1,634 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import inspect
15
+ from typing import Any, Callable, Dict, List, Optional, Union
16
+
17
+ import torch
18
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
19
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
20
+ from diffusers.models import AutoencoderKL, T2IAdapter, UNet2DConditionModel
21
+ from diffusers.pipelines.stable_diffusion.pipeline_output import (
22
+ StableDiffusionPipelineOutput,
23
+ )
24
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
25
+ StableDiffusionPipeline,
26
+ rescale_noise_cfg,
27
+ retrieve_timesteps,
28
+ )
29
+ from diffusers.pipelines.stable_diffusion.safety_checker import (
30
+ StableDiffusionSafetyChecker,
31
+ )
32
+ from diffusers.schedulers import KarrasDiffusionSchedulers
33
+ from diffusers.utils import deprecate, is_torch_xla_available, logging
34
+ from diffusers.utils.torch_utils import randn_tensor
35
+ from packaging import version
36
+ from transformers import (
37
+ CLIPImageProcessor,
38
+ CLIPTextModel,
39
+ CLIPTokenizer,
40
+ CLIPVisionModelWithProjection,
41
+ )
42
+
43
+ from ..loaders import CustomAdapterMixin
44
+ from ..models.attention_processor import (
45
+ DecoupledMVRowSelfAttnProcessor2_0,
46
+ set_unet_2d_condition_attn_processor,
47
+ )
48
+
49
+ if is_torch_xla_available():
50
+ import torch_xla.core.xla_model as xm
51
+
52
+ XLA_AVAILABLE = True
53
+ else:
54
+ XLA_AVAILABLE = False
55
+
56
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
57
+
58
+
59
+ class MVAdapterT2MVSDPipeline(StableDiffusionPipeline, CustomAdapterMixin):
60
+ def __init__(
61
+ self,
62
+ vae: AutoencoderKL,
63
+ text_encoder: CLIPTextModel,
64
+ tokenizer: CLIPTokenizer,
65
+ unet: UNet2DConditionModel,
66
+ scheduler: KarrasDiffusionSchedulers,
67
+ safety_checker: StableDiffusionSafetyChecker,
68
+ feature_extractor: CLIPImageProcessor,
69
+ image_encoder: CLIPVisionModelWithProjection = None,
70
+ requires_safety_checker: bool = True,
71
+ ):
72
+ super().__init__(
73
+ vae=vae,
74
+ text_encoder=text_encoder,
75
+ tokenizer=tokenizer,
76
+ unet=unet,
77
+ scheduler=scheduler,
78
+ safety_checker=safety_checker,
79
+ feature_extractor=feature_extractor,
80
+ image_encoder=image_encoder,
81
+ requires_safety_checker=requires_safety_checker,
82
+ )
83
+
84
+ self.control_image_processor = VaeImageProcessor(
85
+ vae_scale_factor=self.vae_scale_factor,
86
+ do_convert_rgb=True,
87
+ do_normalize=False,
88
+ )
89
+
90
+ def prepare_control_image(
91
+ self,
92
+ image,
93
+ width,
94
+ height,
95
+ batch_size,
96
+ num_images_per_prompt,
97
+ device,
98
+ dtype,
99
+ do_classifier_free_guidance=False,
100
+ ):
101
+ assert hasattr(
102
+ self, "control_image_processor"
103
+ ), "control_image_processor is not initialized"
104
+
105
+ image = self.control_image_processor.preprocess(
106
+ image, height=height, width=width
107
+ ).to(dtype=torch.float32)
108
+ image_batch_size = image.shape[0]
109
+
110
+ if image_batch_size == 1:
111
+ repeat_by = batch_size
112
+ else:
113
+ # image batch size is the same as prompt batch size
114
+ repeat_by = num_images_per_prompt # always 1 for control image
115
+
116
+ image = image.repeat_interleave(repeat_by, dim=0)
117
+
118
+ image = image.to(device=device, dtype=dtype)
119
+
120
+ if do_classifier_free_guidance:
121
+ image = torch.cat([image] * 2)
122
+
123
+ return image
124
+
125
+ @torch.no_grad()
126
+ def __call__(
127
+ self,
128
+ prompt: Union[str, List[str]] = None,
129
+ height: Optional[int] = None,
130
+ width: Optional[int] = None,
131
+ num_inference_steps: int = 50,
132
+ timesteps: List[int] = None,
133
+ sigmas: List[float] = None,
134
+ guidance_scale: float = 7.5,
135
+ negative_prompt: Optional[Union[str, List[str]]] = None,
136
+ num_images_per_prompt: Optional[int] = 1,
137
+ eta: float = 0.0,
138
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
139
+ latents: Optional[torch.Tensor] = None,
140
+ prompt_embeds: Optional[torch.Tensor] = None,
141
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
142
+ ip_adapter_image: Optional[PipelineImageInput] = None,
143
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
144
+ output_type: Optional[str] = "pil",
145
+ return_dict: bool = True,
146
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
147
+ guidance_rescale: float = 0.0,
148
+ clip_skip: Optional[int] = None,
149
+ callback_on_step_end: Optional[
150
+ Union[
151
+ Callable[[int, int, Dict], None],
152
+ PipelineCallback,
153
+ MultiPipelineCallbacks,
154
+ ]
155
+ ] = None,
156
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
157
+ # NEW
158
+ mv_scale: float = 1.0,
159
+ # Camera or geometry condition
160
+ control_image: Optional[PipelineImageInput] = None,
161
+ control_conditioning_scale: Optional[float] = 1.0,
162
+ control_conditioning_factor: float = 1.0,
163
+ # Optional. controlnet
164
+ controlnet_image: Optional[PipelineImageInput] = None,
165
+ controlnet_conditioning_scale: Optional[float] = 1.0,
166
+ **kwargs,
167
+ ):
168
+ r"""
169
+ The call function to the pipeline for generation.
170
+
171
+ Args:
172
+ prompt (`str` or `List[str]`, *optional*):
173
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
174
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
175
+ The height in pixels of the generated image.
176
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
177
+ The width in pixels of the generated image.
178
+ num_inference_steps (`int`, *optional*, defaults to 50):
179
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
180
+ expense of slower inference.
181
+ timesteps (`List[int]`, *optional*):
182
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
183
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
184
+ passed will be used. Must be in descending order.
185
+ sigmas (`List[float]`, *optional*):
186
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
187
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
188
+ will be used.
189
+ guidance_scale (`float`, *optional*, defaults to 7.5):
190
+ A higher guidance scale value encourages the model to generate images closely linked to the text
191
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
192
+ negative_prompt (`str` or `List[str]`, *optional*):
193
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
194
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
195
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
196
+ The number of images to generate per prompt.
197
+ eta (`float`, *optional*, defaults to 0.0):
198
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
199
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
200
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
201
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
202
+ generation deterministic.
203
+ latents (`torch.Tensor`, *optional*):
204
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
205
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
206
+ tensor is generated by sampling using the supplied random `generator`.
207
+ prompt_embeds (`torch.Tensor`, *optional*):
208
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
209
+ provided, text embeddings are generated from the `prompt` input argument.
210
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
211
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
212
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
213
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
214
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
215
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
216
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
217
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
218
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
219
+ output_type (`str`, *optional*, defaults to `"pil"`):
220
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
221
+ return_dict (`bool`, *optional*, defaults to `True`):
222
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
223
+ plain tuple.
224
+ cross_attention_kwargs (`dict`, *optional*):
225
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
226
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
227
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
228
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
229
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
230
+ using zero terminal SNR.
231
+ clip_skip (`int`, *optional*):
232
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
233
+ the output of the pre-final layer will be used for computing the prompt embeddings.
234
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
235
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
236
+ each denoising step during inference, with the following arguments: `callback_on_step_end(self:
237
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
238
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
239
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
240
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
241
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
242
+ `._callback_tensor_inputs` attribute of your pipeline class.
243
+
244
+ Examples:
245
+
246
+ Returns:
247
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
248
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
249
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
250
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
251
+ "not-safe-for-work" (nsfw) content.
252
+ """
253
+
254
+ callback = kwargs.pop("callback", None)
255
+ callback_steps = kwargs.pop("callback_steps", None)
256
+
257
+ if callback is not None:
258
+ deprecate(
259
+ "callback",
260
+ "1.0.0",
261
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
262
+ )
263
+ if callback_steps is not None:
264
+ deprecate(
265
+ "callback_steps",
266
+ "1.0.0",
267
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
268
+ )
269
+
270
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
271
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
272
+
273
+ # 0. Default height and width to unet
274
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
275
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
276
+ # to deal with lora scaling and other possible forward hooks
277
+
278
+ # 1. Check inputs. Raise error if not correct
279
+ self.check_inputs(
280
+ prompt,
281
+ height,
282
+ width,
283
+ callback_steps,
284
+ negative_prompt,
285
+ prompt_embeds,
286
+ negative_prompt_embeds,
287
+ ip_adapter_image,
288
+ ip_adapter_image_embeds,
289
+ callback_on_step_end_tensor_inputs,
290
+ )
291
+
292
+ self._guidance_scale = guidance_scale
293
+ self._guidance_rescale = guidance_rescale
294
+ self._clip_skip = clip_skip
295
+ self._cross_attention_kwargs = cross_attention_kwargs
296
+ self._interrupt = False
297
+
298
+ # 2. Define call parameters
299
+ if prompt is not None and isinstance(prompt, str):
300
+ batch_size = 1
301
+ elif prompt is not None and isinstance(prompt, list):
302
+ batch_size = len(prompt)
303
+ else:
304
+ batch_size = prompt_embeds.shape[0]
305
+
306
+ device = self._execution_device
307
+
308
+ # 3. Encode input prompt
309
+ lora_scale = (
310
+ self.cross_attention_kwargs.get("scale", None)
311
+ if self.cross_attention_kwargs is not None
312
+ else None
313
+ )
314
+
315
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
316
+ prompt,
317
+ device,
318
+ num_images_per_prompt,
319
+ self.do_classifier_free_guidance,
320
+ negative_prompt,
321
+ prompt_embeds=prompt_embeds,
322
+ negative_prompt_embeds=negative_prompt_embeds,
323
+ lora_scale=lora_scale,
324
+ clip_skip=self.clip_skip,
325
+ )
326
+
327
+ # For classifier free guidance, we need to do two forward passes.
328
+ # Here we concatenate the unconditional and text embeddings into a single batch
329
+ # to avoid doing two forward passes
330
+ if self.do_classifier_free_guidance:
331
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
332
+
333
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
334
+ image_embeds = self.prepare_ip_adapter_image_embeds(
335
+ ip_adapter_image,
336
+ ip_adapter_image_embeds,
337
+ device,
338
+ batch_size * num_images_per_prompt,
339
+ self.do_classifier_free_guidance,
340
+ )
341
+
342
+ # 4. Prepare timesteps
343
+ timesteps, num_inference_steps = retrieve_timesteps(
344
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
345
+ )
346
+
347
+ # 5. Prepare latent variables
348
+ num_channels_latents = self.unet.config.in_channels
349
+ latents = self.prepare_latents(
350
+ batch_size * num_images_per_prompt,
351
+ num_channels_latents,
352
+ height,
353
+ width,
354
+ prompt_embeds.dtype,
355
+ device,
356
+ generator,
357
+ latents,
358
+ )
359
+
360
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
361
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
362
+
363
+ # 6.1 Add image embeds for IP-Adapter
364
+ added_cond_kwargs = (
365
+ {"image_embeds": image_embeds}
366
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
367
+ else None
368
+ )
369
+
370
+ # 6.2 Optionally get Guidance Scale Embedding
371
+ timestep_cond = None
372
+ if self.unet.config.time_cond_proj_dim is not None:
373
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
374
+ batch_size * num_images_per_prompt
375
+ )
376
+ timestep_cond = self.get_guidance_scale_embedding(
377
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
378
+ ).to(device=device, dtype=latents.dtype)
379
+
380
+ # Preprocess control image
381
+ control_image_feature = self.prepare_control_image(
382
+ image=control_image,
383
+ width=width,
384
+ height=height,
385
+ batch_size=batch_size * num_images_per_prompt,
386
+ num_images_per_prompt=1, # NOTE: always 1 for control images
387
+ device=device,
388
+ dtype=latents.dtype,
389
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
390
+ )
391
+ control_image_feature = control_image_feature.to(
392
+ device=device, dtype=latents.dtype
393
+ )
394
+
395
+ adapter_state = self.cond_encoder(control_image_feature)
396
+ for i, state in enumerate(adapter_state):
397
+ adapter_state[i] = state * control_conditioning_scale
398
+
399
+ # Preprocess controlnet image if provided
400
+ do_controlnet = controlnet_image is not None and hasattr(self, "controlnet")
401
+ if do_controlnet:
402
+ controlnet_image = self.prepare_control_image(
403
+ image=controlnet_image,
404
+ width=width,
405
+ height=height,
406
+ batch_size=batch_size * num_images_per_prompt,
407
+ num_images_per_prompt=1, # NOTE: always 1 for control images
408
+ device=device,
409
+ dtype=latents.dtype,
410
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
411
+ )
412
+ controlnet_image = controlnet_image.to(device=device, dtype=latents.dtype)
413
+
414
+ # 7. Denoising loop
415
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
416
+ self._num_timesteps = len(timesteps)
417
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
418
+ for i, t in enumerate(timesteps):
419
+ if self.interrupt:
420
+ continue
421
+
422
+ # expand the latents if we are doing classifier free guidance
423
+ latent_model_input = (
424
+ torch.cat([latents] * 2)
425
+ if self.do_classifier_free_guidance
426
+ else latents
427
+ )
428
+ latent_model_input = self.scheduler.scale_model_input(
429
+ latent_model_input, t
430
+ )
431
+
432
+ if i < int(num_inference_steps * control_conditioning_factor):
433
+ down_intrablock_additional_residuals = [
434
+ state.clone() for state in adapter_state
435
+ ]
436
+ else:
437
+ down_intrablock_additional_residuals = None
438
+
439
+ unet_add_kwargs = {}
440
+
441
+ # Do controlnet if provided
442
+ if do_controlnet:
443
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
444
+ latent_model_input,
445
+ t,
446
+ encoder_hidden_states=prompt_embeds,
447
+ controlnet_cond=controlnet_image,
448
+ conditioning_scale=controlnet_conditioning_scale,
449
+ guess_mode=False,
450
+ added_cond_kwargs=added_cond_kwargs,
451
+ return_dict=False,
452
+ )
453
+ unet_add_kwargs.update(
454
+ {
455
+ "down_block_additional_residuals": down_block_res_samples,
456
+ "mid_block_additional_residual": mid_block_res_sample,
457
+ }
458
+ )
459
+
460
+ # predict the noise residual
461
+ noise_pred = self.unet(
462
+ latent_model_input,
463
+ t,
464
+ encoder_hidden_states=prompt_embeds,
465
+ timestep_cond=timestep_cond,
466
+ cross_attention_kwargs={
467
+ "mv_scale": mv_scale,
468
+ "num_views": num_images_per_prompt,
469
+ **(self.cross_attention_kwargs or {}),
470
+ },
471
+ down_intrablock_additional_residuals=down_intrablock_additional_residuals,
472
+ added_cond_kwargs=added_cond_kwargs,
473
+ return_dict=False,
474
+ **unet_add_kwargs,
475
+ )[0]
476
+
477
+ # perform guidance
478
+ if self.do_classifier_free_guidance:
479
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
480
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
481
+ noise_pred_text - noise_pred_uncond
482
+ )
483
+
484
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
485
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
486
+ noise_pred = rescale_noise_cfg(
487
+ noise_pred,
488
+ noise_pred_text,
489
+ guidance_rescale=self.guidance_rescale,
490
+ )
491
+
492
+ # compute the previous noisy sample x_t -> x_t-1
493
+ latents = self.scheduler.step(
494
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
495
+ )[0]
496
+
497
+ if callback_on_step_end is not None:
498
+ callback_kwargs = {}
499
+ for k in callback_on_step_end_tensor_inputs:
500
+ callback_kwargs[k] = locals()[k]
501
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
502
+
503
+ latents = callback_outputs.pop("latents", latents)
504
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
505
+ negative_prompt_embeds = callback_outputs.pop(
506
+ "negative_prompt_embeds", negative_prompt_embeds
507
+ )
508
+
509
+ # call the callback, if provided
510
+ if i == len(timesteps) - 1 or (
511
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
512
+ ):
513
+ progress_bar.update()
514
+ if callback is not None and i % callback_steps == 0:
515
+ step_idx = i // getattr(self.scheduler, "order", 1)
516
+ callback(step_idx, t, latents)
517
+
518
+ if XLA_AVAILABLE:
519
+ xm.mark_step()
520
+
521
+ if not output_type == "latent":
522
+ image = self.vae.decode(
523
+ latents / self.vae.config.scaling_factor,
524
+ return_dict=False,
525
+ generator=generator,
526
+ )[0]
527
+ image, has_nsfw_concept = self.run_safety_checker(
528
+ image, device, prompt_embeds.dtype
529
+ )
530
+ else:
531
+ image = latents
532
+ has_nsfw_concept = None
533
+
534
+ if has_nsfw_concept is None:
535
+ do_denormalize = [True] * image.shape[0]
536
+ else:
537
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
538
+ image = self.image_processor.postprocess(
539
+ image, output_type=output_type, do_denormalize=do_denormalize
540
+ )
541
+
542
+ # Offload all models
543
+ self.maybe_free_model_hooks()
544
+
545
+ if not return_dict:
546
+ return (image, has_nsfw_concept)
547
+
548
+ return StableDiffusionPipelineOutput(
549
+ images=image, nsfw_content_detected=has_nsfw_concept
550
+ )
551
+
552
+ ### NEW: adapters ###
553
+ def _init_custom_adapter(
554
+ self,
555
+ # Multi-view adapter
556
+ num_views: int = 1,
557
+ self_attn_processor: Any = DecoupledMVRowSelfAttnProcessor2_0,
558
+ # Condition encoder
559
+ cond_in_channels: int = 6,
560
+ # For training
561
+ copy_attn_weights: bool = True,
562
+ zero_init_module_keys: List[str] = [],
563
+ ):
564
+ # Condition encoder
565
+ self.cond_encoder = T2IAdapter(
566
+ in_channels=cond_in_channels,
567
+ channels=self.unet.config.block_out_channels,
568
+ num_res_blocks=self.unet.config.layers_per_block,
569
+ downscale_factor=8,
570
+ )
571
+
572
+ # set custom attn processor for multi-view attention
573
+ self.unet: UNet2DConditionModel
574
+ set_unet_2d_condition_attn_processor(
575
+ self.unet,
576
+ set_self_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
577
+ query_dim=hs,
578
+ inner_dim=hs,
579
+ num_views=num_views,
580
+ name=name,
581
+ use_mv=True,
582
+ use_ref=False,
583
+ ),
584
+ set_cross_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
585
+ query_dim=hs,
586
+ inner_dim=hs,
587
+ num_views=num_views,
588
+ name=name,
589
+ use_mv=False,
590
+ use_ref=False,
591
+ ),
592
+ )
593
+
594
+ # copy decoupled attention weights from original unet
595
+ if copy_attn_weights:
596
+ state_dict = self.unet.state_dict()
597
+ for key in state_dict.keys():
598
+ if "_mv" in key:
599
+ compatible_key = key.replace("_mv", "").replace("processor.", "")
600
+ else:
601
+ compatible_key = key
602
+
603
+ is_zero_init_key = any([k in key for k in zero_init_module_keys])
604
+ if is_zero_init_key:
605
+ state_dict[key] = torch.zeros_like(state_dict[compatible_key])
606
+ else:
607
+ state_dict[key] = state_dict[compatible_key].clone()
608
+ self.unet.load_state_dict(state_dict)
609
+
610
+ def _load_custom_adapter(self, state_dict):
611
+ self.unet.load_state_dict(state_dict, strict=False)
612
+ self.cond_encoder.load_state_dict(state_dict, strict=False)
613
+
614
+ def _save_custom_adapter(
615
+ self,
616
+ include_keys: Optional[List[str]] = None,
617
+ exclude_keys: Optional[List[str]] = None,
618
+ ):
619
+ def include_fn(k):
620
+ is_included = False
621
+
622
+ if include_keys is not None:
623
+ is_included = is_included or any([key in k for key in include_keys])
624
+ if exclude_keys is not None:
625
+ is_included = is_included and not any(
626
+ [key in k for key in exclude_keys]
627
+ )
628
+
629
+ return is_included
630
+
631
+ state_dict = {k: v for k, v in self.unet.state_dict().items() if include_fn(k)}
632
+ state_dict.update(self.cond_encoder.state_dict())
633
+
634
+ return state_dict
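As a point of orientation, here is a minimal text-to-multiview sketch (an editor's illustration, not part of this commit) showing how `MVAdapterT2MVSDPipeline` and the hooks above might be wired together. The base checkpoint id, adapter weight path, and the dummy conditioning tensor are placeholders; in practice the repository's own utilities prepare the per-view camera/geometry maps.

import torch

from mvadapter.pipelines.pipeline_mvadapter_t2mv_sd import MVAdapterT2MVSDPipeline

NUM_VIEWS = 6  # illustrative; must match the adapter's training setup

# Placeholder base checkpoint; any SD 1.5-style checkpoint with the standard
# component layout should load into the subclass via from_pretrained.
pipe = MVAdapterT2MVSDPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# Install the row-wise multi-view attention processors and the T2IAdapter-based
# condition encoder, then load trained adapter weights (non-strict). The weight
# path is a placeholder.
pipe._init_custom_adapter(num_views=NUM_VIEWS)
pipe._load_custom_adapter(torch.load("mvadapter_t2mv_sd.pt", map_location="cpu"))

pipe.to("cuda")
# The condition encoder is created outside register_modules, so move it explicitly.
pipe.cond_encoder.to("cuda")

# Dummy stand-in for the 6-channel per-view camera/geometry maps that would
# normally be rendered for each of the NUM_VIEWS views.
camera_maps = torch.zeros(NUM_VIEWS, 6, 512, 512)

views = pipe(
    prompt="a wooden treasure chest, 3d asset",
    num_inference_steps=30,
    guidance_scale=7.5,
    num_images_per_prompt=NUM_VIEWS,  # one generated image per view
    mv_scale=1.0,                     # strength of the cross-view attention
    control_image=camera_maps,
    control_conditioning_scale=1.0,
).images                              # list of NUM_VIEWS PIL images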
mvadapter/pipelines/pipeline_mvadapter_t2mv_sdxl.py ADDED
@@ -0,0 +1,801 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
19
+ from diffusers.models import AutoencoderKL, T2IAdapter, UNet2DConditionModel
20
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_output import (
21
+ StableDiffusionXLPipelineOutput,
22
+ )
23
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
24
+ StableDiffusionXLPipeline,
25
+ rescale_noise_cfg,
26
+ retrieve_timesteps,
27
+ )
28
+ from diffusers.schedulers import KarrasDiffusionSchedulers
29
+ from diffusers.utils import deprecate, logging
30
+ from transformers import (
31
+ CLIPImageProcessor,
32
+ CLIPTextModel,
33
+ CLIPTextModelWithProjection,
34
+ CLIPTokenizer,
35
+ CLIPVisionModelWithProjection,
36
+ )
37
+
38
+ from ..loaders import CustomAdapterMixin
39
+ from ..models.attention_processor import (
40
+ DecoupledMVRowSelfAttnProcessor2_0,
41
+ set_unet_2d_condition_attn_processor,
42
+ )
43
+
44
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
45
+
46
+
47
+ class MVAdapterT2MVSDXLPipeline(StableDiffusionXLPipeline, CustomAdapterMixin):
48
+ def __init__(
49
+ self,
50
+ vae: AutoencoderKL,
51
+ text_encoder: CLIPTextModel,
52
+ text_encoder_2: CLIPTextModelWithProjection,
53
+ tokenizer: CLIPTokenizer,
54
+ tokenizer_2: CLIPTokenizer,
55
+ unet: UNet2DConditionModel,
56
+ scheduler: KarrasDiffusionSchedulers,
57
+ image_encoder: CLIPVisionModelWithProjection = None,
58
+ feature_extractor: CLIPImageProcessor = None,
59
+ force_zeros_for_empty_prompt: bool = True,
60
+ add_watermarker: Optional[bool] = None,
61
+ ):
62
+ super().__init__(
63
+ vae=vae,
64
+ text_encoder=text_encoder,
65
+ text_encoder_2=text_encoder_2,
66
+ tokenizer=tokenizer,
67
+ tokenizer_2=tokenizer_2,
68
+ unet=unet,
69
+ scheduler=scheduler,
70
+ image_encoder=image_encoder,
71
+ feature_extractor=feature_extractor,
72
+ force_zeros_for_empty_prompt=force_zeros_for_empty_prompt,
73
+ add_watermarker=add_watermarker,
74
+ )
75
+
76
+ self.control_image_processor = VaeImageProcessor(
77
+ vae_scale_factor=self.vae_scale_factor,
78
+ do_convert_rgb=True,
79
+ do_normalize=False,
80
+ )
81
+
82
+ def prepare_control_image(
83
+ self,
84
+ image,
85
+ width,
86
+ height,
87
+ batch_size,
88
+ num_images_per_prompt,
89
+ device,
90
+ dtype,
91
+ do_classifier_free_guidance=False,
92
+ ):
93
+ assert hasattr(
94
+ self, "control_image_processor"
95
+ ), "control_image_processor is not initialized"
96
+
97
+ image = self.control_image_processor.preprocess(
98
+ image, height=height, width=width
99
+ ).to(dtype=torch.float32)
100
+ image_batch_size = image.shape[0]
101
+
102
+ if image_batch_size == 1:
103
+ repeat_by = batch_size
104
+ else:
105
+ # image batch size is the same as prompt batch size
106
+ repeat_by = num_images_per_prompt # always 1 for control image
107
+
108
+ image = image.repeat_interleave(repeat_by, dim=0)
109
+
110
+ image = image.to(device=device, dtype=dtype)
111
+
112
+ if do_classifier_free_guidance:
113
+ image = torch.cat([image] * 2)
114
+
115
+ return image
116
+
117
+ @torch.no_grad()
118
+ def __call__(
119
+ self,
120
+ prompt: Union[str, List[str]] = None,
121
+ prompt_2: Optional[Union[str, List[str]]] = None,
122
+ height: Optional[int] = None,
123
+ width: Optional[int] = None,
124
+ num_inference_steps: int = 50,
125
+ timesteps: List[int] = None,
126
+ denoising_end: Optional[float] = None,
127
+ guidance_scale: float = 5.0,
128
+ negative_prompt: Optional[Union[str, List[str]]] = None,
129
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
130
+ num_images_per_prompt: Optional[int] = 1,
131
+ eta: float = 0.0,
132
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
133
+ latents: Optional[torch.FloatTensor] = None,
134
+ prompt_embeds: Optional[torch.FloatTensor] = None,
135
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
136
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
137
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
138
+ ip_adapter_image: Optional[PipelineImageInput] = None,
139
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
140
+ output_type: Optional[str] = "pil",
141
+ return_dict: bool = True,
142
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
143
+ guidance_rescale: float = 0.0,
144
+ original_size: Optional[Tuple[int, int]] = None,
145
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
146
+ target_size: Optional[Tuple[int, int]] = None,
147
+ negative_original_size: Optional[Tuple[int, int]] = None,
148
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
149
+ negative_target_size: Optional[Tuple[int, int]] = None,
150
+ clip_skip: Optional[int] = None,
151
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
152
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
153
+ # NEW
154
+ mv_scale: float = 1.0,
155
+ # Camera or geometry condition
156
+ control_image: Optional[PipelineImageInput] = None,
157
+ control_conditioning_scale: Optional[float] = 1.0,
158
+ control_conditioning_factor: float = 1.0,
159
+ # Optional. controlnet
160
+ controlnet_image: Optional[PipelineImageInput] = None,
161
+ controlnet_conditioning_scale: Optional[float] = 1.0,
162
+ **kwargs,
163
+ ):
164
+ r"""
165
+ Function invoked when calling the pipeline for generation.
166
+
167
+ Args:
168
+ prompt (`str` or `List[str]`, *optional*):
169
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
170
+ instead.
171
+ prompt_2 (`str` or `List[str]`, *optional*):
172
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
173
+ used in both text-encoders
174
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
175
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
176
+ Anything below 512 pixels won't work well for
177
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
178
+ and checkpoints that are not specifically fine-tuned on low resolutions.
179
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
180
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
181
+ Anything below 512 pixels won't work well for
182
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
183
+ and checkpoints that are not specifically fine-tuned on low resolutions.
184
+ num_inference_steps (`int`, *optional*, defaults to 50):
185
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
186
+ expense of slower inference.
187
+ timesteps (`List[int]`, *optional*):
188
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
189
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
190
+ passed will be used. Must be in descending order.
191
+ denoising_end (`float`, *optional*):
192
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
193
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
194
+ still retain a substantial amount of noise as determined by the discrete timesteps selected by the
195
+ scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
196
+ "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
197
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
198
+ guidance_scale (`float`, *optional*, defaults to 5.0):
199
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
200
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
201
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
202
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
203
+ usually at the expense of lower image quality.
204
+ negative_prompt (`str` or `List[str]`, *optional*):
205
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
206
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
207
+ less than `1`).
208
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
209
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
210
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
211
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
212
+ The number of images to generate per prompt.
213
+ eta (`float`, *optional*, defaults to 0.0):
214
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
215
+ [`schedulers.DDIMScheduler`], will be ignored for others.
216
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
217
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
218
+ to make generation deterministic.
219
+ latents (`torch.FloatTensor`, *optional*):
220
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
221
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
222
+ tensor will be generated by sampling using the supplied random `generator`.
223
+ prompt_embeds (`torch.FloatTensor`, *optional*):
224
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
225
+ provided, text embeddings will be generated from `prompt` input argument.
226
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
227
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
228
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
229
+ argument.
230
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
231
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
232
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
233
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
234
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
235
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
236
+ input argument.
237
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
238
+ ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
239
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
240
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
241
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
242
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
243
+ output_type (`str`, *optional*, defaults to `"pil"`):
244
+ The output format of the generated image. Choose between
245
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
246
+ return_dict (`bool`, *optional*, defaults to `True`):
247
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
248
+ of a plain tuple.
249
+ cross_attention_kwargs (`dict`, *optional*):
250
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
251
+ `self.processor` in
252
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
253
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
254
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
255
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
256
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
257
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
258
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
259
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
260
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
261
+ explained in section 2.2 of
262
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
263
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
264
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
265
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
266
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
267
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
268
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
269
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
270
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
271
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
272
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
273
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
274
+ micro-conditioning as explained in section 2.2 of
275
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
276
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
277
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
278
+ To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
279
+ micro-conditioning as explained in section 2.2 of
280
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
281
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
282
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
283
+ To negatively condition the generation process based on a target image resolution. It should be the same
284
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
285
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
286
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
287
+ callback_on_step_end (`Callable`, *optional*):
288
+ A function that is called at the end of each denoising step during inference. The function is called
289
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
290
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
291
+ `callback_on_step_end_tensor_inputs`.
292
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
293
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
294
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
295
+ `._callback_tensor_inputs` attribute of your pipeline class.
296
+
297
+ Examples:
298
+
299
+ Returns:
300
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
301
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
302
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
303
+ """
304
+
305
+ callback = kwargs.pop("callback", None)
306
+ callback_steps = kwargs.pop("callback_steps", None)
307
+
308
+ if callback is not None:
309
+ deprecate(
310
+ "callback",
311
+ "1.0.0",
312
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
313
+ )
314
+ if callback_steps is not None:
315
+ deprecate(
316
+ "callback_steps",
317
+ "1.0.0",
318
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
319
+ )
320
+
321
+ # 0. Default height and width to unet
322
+ height = height or self.default_sample_size * self.vae_scale_factor
323
+ width = width or self.default_sample_size * self.vae_scale_factor
324
+
325
+ original_size = original_size or (height, width)
326
+ target_size = target_size or (height, width)
327
+
328
+ # 1. Check inputs. Raise error if not correct
329
+ self.check_inputs(
330
+ prompt,
331
+ prompt_2,
332
+ height,
333
+ width,
334
+ callback_steps,
335
+ negative_prompt,
336
+ negative_prompt_2,
337
+ prompt_embeds,
338
+ negative_prompt_embeds,
339
+ pooled_prompt_embeds,
340
+ negative_pooled_prompt_embeds,
341
+ ip_adapter_image,
342
+ ip_adapter_image_embeds,
343
+ callback_on_step_end_tensor_inputs,
344
+ )
345
+
346
+ self._guidance_scale = guidance_scale
347
+ self._guidance_rescale = guidance_rescale
348
+ self._clip_skip = clip_skip
349
+ self._cross_attention_kwargs = cross_attention_kwargs
350
+ self._denoising_end = denoising_end
351
+ self._interrupt = False
352
+
353
+ # 2. Define call parameters
354
+ if prompt is not None and isinstance(prompt, str):
355
+ batch_size = 1
356
+ elif prompt is not None and isinstance(prompt, list):
357
+ batch_size = len(prompt)
358
+ else:
359
+ batch_size = prompt_embeds.shape[0]
360
+
361
+ device = self._execution_device
362
+
363
+ # 3. Encode input prompt
364
+ lora_scale = (
365
+ self.cross_attention_kwargs.get("scale", None)
366
+ if self.cross_attention_kwargs is not None
367
+ else None
368
+ )
369
+
370
+ (
371
+ prompt_embeds,
372
+ negative_prompt_embeds,
373
+ pooled_prompt_embeds,
374
+ negative_pooled_prompt_embeds,
375
+ ) = self.encode_prompt(
376
+ prompt=prompt,
377
+ prompt_2=prompt_2,
378
+ device=device,
379
+ num_images_per_prompt=num_images_per_prompt,
380
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
381
+ negative_prompt=negative_prompt,
382
+ negative_prompt_2=negative_prompt_2,
383
+ prompt_embeds=prompt_embeds,
384
+ negative_prompt_embeds=negative_prompt_embeds,
385
+ pooled_prompt_embeds=pooled_prompt_embeds,
386
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
387
+ lora_scale=lora_scale,
388
+ clip_skip=self.clip_skip,
389
+ )
390
+
391
+ # 4. Prepare timesteps
392
+ timesteps, num_inference_steps = retrieve_timesteps(
393
+ self.scheduler, num_inference_steps, device, timesteps
394
+ )
395
+
396
+ # 5. Prepare latent variables
397
+ num_channels_latents = self.unet.config.in_channels
398
+ latents = self.prepare_latents(
399
+ batch_size * num_images_per_prompt,
400
+ num_channels_latents,
401
+ height,
402
+ width,
403
+ prompt_embeds.dtype,
404
+ device,
405
+ generator,
406
+ latents,
407
+ )
408
+
409
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
410
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
411
+
412
+ # 7. Prepare added time ids & embeddings
413
+ add_text_embeds = pooled_prompt_embeds
414
+ if self.text_encoder_2 is None:
415
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
416
+ else:
417
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
418
+
419
+ add_time_ids = self._get_add_time_ids(
420
+ original_size,
421
+ crops_coords_top_left,
422
+ target_size,
423
+ dtype=prompt_embeds.dtype,
424
+ text_encoder_projection_dim=text_encoder_projection_dim,
425
+ )
426
+ if negative_original_size is not None and negative_target_size is not None:
427
+ negative_add_time_ids = self._get_add_time_ids(
428
+ negative_original_size,
429
+ negative_crops_coords_top_left,
430
+ negative_target_size,
431
+ dtype=prompt_embeds.dtype,
432
+ text_encoder_projection_dim=text_encoder_projection_dim,
433
+ )
434
+ else:
435
+ negative_add_time_ids = add_time_ids
436
+
437
+ if self.do_classifier_free_guidance:
438
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
439
+ add_text_embeds = torch.cat(
440
+ [negative_pooled_prompt_embeds, add_text_embeds], dim=0
441
+ )
442
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
443
+
444
+ prompt_embeds = prompt_embeds.to(device)
445
+ add_text_embeds = add_text_embeds.to(device)
446
+ add_time_ids = add_time_ids.to(device).repeat(
447
+ batch_size * num_images_per_prompt, 1
448
+ )
449
+
450
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
451
+ image_embeds = self.prepare_ip_adapter_image_embeds(
452
+ ip_adapter_image,
453
+ ip_adapter_image_embeds,
454
+ device,
455
+ batch_size * num_images_per_prompt,
456
+ self.do_classifier_free_guidance,
457
+ )
458
+
459
+ # Preprocess control image
460
+ control_image_feature = self.prepare_control_image(
461
+ image=control_image,
462
+ width=width,
463
+ height=height,
464
+ batch_size=batch_size * num_images_per_prompt,
465
+ num_images_per_prompt=1, # NOTE: always 1 for control images
466
+ device=device,
467
+ dtype=latents.dtype,
468
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
469
+ )
470
+ control_image_feature = control_image_feature.to(
471
+ device=device, dtype=latents.dtype
472
+ )
473
+
474
+ adapter_state = self.cond_encoder(control_image_feature)
475
+ for i, state in enumerate(adapter_state):
476
+ adapter_state[i] = state * control_conditioning_scale
477
+
478
+ # Preprocess controlnet image if provided
479
+ do_controlnet = controlnet_image is not None and hasattr(self, "controlnet")
480
+ if do_controlnet:
481
+ controlnet_image = self.prepare_control_image(
482
+ image=controlnet_image,
483
+ width=width,
484
+ height=height,
485
+ batch_size=batch_size * num_images_per_prompt,
486
+ num_images_per_prompt=1, # NOTE: always 1 for control images
487
+ device=device,
488
+ dtype=latents.dtype,
489
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
490
+ )
491
+ controlnet_image = controlnet_image.to(device=device, dtype=latents.dtype)
492
+
493
+ # 8. Denoising loop
494
+ num_warmup_steps = max(
495
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
496
+ )
497
+
498
+ # 8.1 Apply denoising_end
499
+ if (
500
+ self.denoising_end is not None
501
+ and isinstance(self.denoising_end, float)
502
+ and self.denoising_end > 0
503
+ and self.denoising_end < 1
504
+ ):
505
+ discrete_timestep_cutoff = int(
506
+ round(
507
+ self.scheduler.config.num_train_timesteps
508
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
509
+ )
510
+ )
511
+ num_inference_steps = len(
512
+ list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))
513
+ )
514
+ timesteps = timesteps[:num_inference_steps]
515
+
516
+ # 9. Optionally get Guidance Scale Embedding
517
+ timestep_cond = None
518
+ if self.unet.config.time_cond_proj_dim is not None:
519
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
520
+ batch_size * num_images_per_prompt
521
+ )
522
+ timestep_cond = self.get_guidance_scale_embedding(
523
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
524
+ ).to(device=device, dtype=latents.dtype)
525
+
526
+ self._num_timesteps = len(timesteps)
527
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
528
+ for i, t in enumerate(timesteps):
529
+ if self.interrupt:
530
+ continue
531
+
532
+ # expand the latents if we are doing classifier free guidance
533
+ latent_model_input = (
534
+ torch.cat([latents] * 2)
535
+ if self.do_classifier_free_guidance
536
+ else latents
537
+ )
538
+
539
+ latent_model_input = self.scheduler.scale_model_input(
540
+ latent_model_input, t
541
+ )
542
+
543
+ added_cond_kwargs = {
544
+ "text_embeds": add_text_embeds,
545
+ "time_ids": add_time_ids,
546
+ }
547
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
548
+ added_cond_kwargs["image_embeds"] = image_embeds
549
+
550
+ if i < int(num_inference_steps * control_conditioning_factor):
551
+ down_intrablock_additional_residuals = [
552
+ state.clone() for state in adapter_state
553
+ ]
554
+ else:
555
+ down_intrablock_additional_residuals = None
556
+
557
+ unet_add_kwargs = {}
558
+
559
+ # Do controlnet if provided
560
+ if do_controlnet:
561
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
562
+ latent_model_input,
563
+ t,
564
+ encoder_hidden_states=prompt_embeds,
565
+ controlnet_cond=controlnet_image,
566
+ conditioning_scale=controlnet_conditioning_scale,
567
+ guess_mode=False,
568
+ added_cond_kwargs=added_cond_kwargs,
569
+ return_dict=False,
570
+ )
571
+ unet_add_kwargs.update(
572
+ {
573
+ "down_block_additional_residuals": down_block_res_samples,
574
+ "mid_block_additional_residual": mid_block_res_sample,
575
+ }
576
+ )
577
+
578
+ # predict the noise residual
579
+ noise_pred = self.unet(
580
+ latent_model_input,
581
+ t,
582
+ encoder_hidden_states=prompt_embeds,
583
+ timestep_cond=timestep_cond,
584
+ cross_attention_kwargs={
585
+ "mv_scale": mv_scale,
586
+ "num_views": num_images_per_prompt,
587
+ **(self.cross_attention_kwargs or {}),
588
+ },
589
+ down_intrablock_additional_residuals=down_intrablock_additional_residuals,
590
+ added_cond_kwargs=added_cond_kwargs,
591
+ return_dict=False,
592
+ **unet_add_kwargs,
593
+ )[0]
594
+
595
+ # perform guidance
596
+ if self.do_classifier_free_guidance:
597
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
598
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
599
+ noise_pred_text - noise_pred_uncond
600
+ )
601
+
602
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
603
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
604
+ noise_pred = rescale_noise_cfg(
605
+ noise_pred,
606
+ noise_pred_text,
607
+ guidance_rescale=self.guidance_rescale,
608
+ )
609
+
610
+ # compute the previous noisy sample x_t -> x_t-1
611
+ latents_dtype = latents.dtype
612
+ latents = self.scheduler.step(
613
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
614
+ )[0]
615
+ if latents.dtype != latents_dtype:
616
+ if torch.backends.mps.is_available():
617
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
618
+ latents = latents.to(latents_dtype)
619
+
620
+ if callback_on_step_end is not None:
621
+ callback_kwargs = {}
622
+ for k in callback_on_step_end_tensor_inputs:
623
+ callback_kwargs[k] = locals()[k]
624
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
625
+
626
+ latents = callback_outputs.pop("latents", latents)
627
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
628
+ negative_prompt_embeds = callback_outputs.pop(
629
+ "negative_prompt_embeds", negative_prompt_embeds
630
+ )
631
+ add_text_embeds = callback_outputs.pop(
632
+ "add_text_embeds", add_text_embeds
633
+ )
634
+ negative_pooled_prompt_embeds = callback_outputs.pop(
635
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
636
+ )
637
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
638
+ negative_add_time_ids = callback_outputs.pop(
639
+ "negative_add_time_ids", negative_add_time_ids
640
+ )
641
+
642
+ # call the callback, if provided
643
+ if i == len(timesteps) - 1 or (
644
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
645
+ ):
646
+ progress_bar.update()
647
+ if callback is not None and i % callback_steps == 0:
648
+ step_idx = i // getattr(self.scheduler, "order", 1)
649
+ callback(step_idx, t, latents)
650
+
651
+ if not output_type == "latent":
652
+ # make sure the VAE is in float32 mode, as it overflows in float16
653
+ needs_upcasting = (
654
+ self.vae.dtype == torch.float16 and self.vae.config.force_upcast
655
+ )
656
+
657
+ if needs_upcasting:
658
+ self.upcast_vae()
659
+ latents = latents.to(
660
+ next(iter(self.vae.post_quant_conv.parameters())).dtype
661
+ )
662
+ elif latents.dtype != self.vae.dtype:
663
+ if torch.backends.mps.is_available():
664
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
665
+ self.vae = self.vae.to(latents.dtype)
666
+
667
+ # unscale/denormalize the latents
668
+ # denormalize with the mean and std if available and not None
669
+ has_latents_mean = (
670
+ hasattr(self.vae.config, "latents_mean")
671
+ and self.vae.config.latents_mean is not None
672
+ )
673
+ has_latents_std = (
674
+ hasattr(self.vae.config, "latents_std")
675
+ and self.vae.config.latents_std is not None
676
+ )
677
+ if has_latents_mean and has_latents_std:
678
+ latents_mean = (
679
+ torch.tensor(self.vae.config.latents_mean)
680
+ .view(1, 4, 1, 1)
681
+ .to(latents.device, latents.dtype)
682
+ )
683
+ latents_std = (
684
+ torch.tensor(self.vae.config.latents_std)
685
+ .view(1, 4, 1, 1)
686
+ .to(latents.device, latents.dtype)
687
+ )
688
+ latents = (
689
+ latents * latents_std / self.vae.config.scaling_factor
690
+ + latents_mean
691
+ )
692
+ else:
693
+ latents = latents / self.vae.config.scaling_factor
694
+
695
+ image = self.vae.decode(latents, return_dict=False)[0]
696
+
697
+ # cast back to fp16 if needed
698
+ if needs_upcasting:
699
+ self.vae.to(dtype=torch.float16)
700
+ else:
701
+ image = latents
702
+
703
+ if not output_type == "latent":
704
+ # apply watermark if available
705
+ if self.watermark is not None:
706
+ image = self.watermark.apply_watermark(image)
707
+
708
+ image = self.image_processor.postprocess(image, output_type=output_type)
709
+
710
+ # Offload all models
711
+ self.maybe_free_model_hooks()
712
+
713
+ if not return_dict:
714
+ return (image,)
715
+
716
+ return StableDiffusionXLPipelineOutput(images=image)
717
+
718
+ ### NEW: adapters ###
719
+ def _init_custom_adapter(
720
+ self,
721
+ # Multi-view adapter
722
+ num_views: int = 1,
723
+ self_attn_processor: Any = DecoupledMVRowSelfAttnProcessor2_0,
724
+ # Condition encoder
725
+ cond_in_channels: int = 6,
726
+ # For training
727
+ copy_attn_weights: bool = True,
728
+ zero_init_module_keys: List[str] = [],
729
+ ):
730
+ # Condition encoder
731
+ self.cond_encoder = T2IAdapter(
732
+ in_channels=cond_in_channels,
733
+ channels=(320, 640, 1280, 1280),
734
+ num_res_blocks=2,
735
+ downscale_factor=16,
736
+ adapter_type="full_adapter_xl",
737
+ )
738
+
739
+ # set custom attn processor for multi-view attention
740
+ self.unet: UNet2DConditionModel
741
+ set_unet_2d_condition_attn_processor(
742
+ self.unet,
743
+ set_self_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
744
+ query_dim=hs,
745
+ inner_dim=hs,
746
+ num_views=num_views,
747
+ name=name,
748
+ use_mv=True,
749
+ use_ref=False,
750
+ ),
751
+ set_cross_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
752
+ query_dim=hs,
753
+ inner_dim=hs,
754
+ num_views=num_views,
755
+ name=name,
756
+ use_mv=False,
757
+ use_ref=False,
758
+ ),
759
+ )
760
+
761
+ # copy decoupled attention weights from original unet
762
+ if copy_attn_weights:
763
+ state_dict = self.unet.state_dict()
764
+ for key in state_dict.keys():
765
+ if "_mv" in key:
766
+ compatible_key = key.replace("_mv", "").replace("processor.", "")
767
+ else:
768
+ compatible_key = key
769
+
770
+ is_zero_init_key = any([k in key for k in zero_init_module_keys])
771
+ if is_zero_init_key:
772
+ state_dict[key] = torch.zeros_like(state_dict[compatible_key])
773
+ else:
774
+ state_dict[key] = state_dict[compatible_key].clone()
775
+ self.unet.load_state_dict(state_dict)
776
+
777
+ def _load_custom_adapter(self, state_dict):
778
+ self.unet.load_state_dict(state_dict, strict=False)
779
+ self.cond_encoder.load_state_dict(state_dict, strict=False)
780
+
781
+ def _save_custom_adapter(
782
+ self,
783
+ include_keys: Optional[List[str]] = None,
784
+ exclude_keys: Optional[List[str]] = None,
785
+ ):
786
+ def include_fn(k):
787
+ is_included = False
788
+
789
+ if include_keys is not None:
790
+ is_included = is_included or any([key in k for key in include_keys])
791
+ if exclude_keys is not None:
792
+ is_included = is_included and not any(
793
+ [key in k for key in exclude_keys]
794
+ )
795
+
796
+ return is_included
797
+
798
+ state_dict = {k: v for k, v in self.unet.state_dict().items() if include_fn(k)}
799
+ state_dict.update(self.cond_encoder.state_dict())
800
+
801
+ return state_dict
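For context, a minimal sketch of how the adapter helpers above might be used to persist and reload only the custom weights. `pipe`, the `"_mv"` filter key, and the file name are illustrative assumptions based on the key-renaming logic in `_init_custom_adapter`, not a documented API.

import torch

# Assumes `pipe` is an instance of this pipeline with _init_custom_adapter() already called.
# Keep only the decoupled multi-view ("_mv") attention weights from the UNet; the
# T2IAdapter condition encoder is always included by _save_custom_adapter.
adapter_state = pipe._save_custom_adapter(include_keys=["_mv"])
torch.save(adapter_state, "mvadapter_weights.pt")  # illustrative file name

# Later: restore the weights into a freshly initialized pipeline (both loads are non-strict).
state = torch.load("mvadapter_weights.pt", map_location="cpu")
pipe._load_custom_adapter(state)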
mvadapter/schedulers/scheduler_utils.py ADDED
@@ -0,0 +1,70 @@
1
+ import torch
2
+
3
+
4
+ def get_sigmas(noise_scheduler, timesteps, n_dim=4, dtype=torch.float32, device=None):
5
+ sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
6
+ schedule_timesteps = noise_scheduler.timesteps.to(device)
7
+ timesteps = timesteps.to(device)
8
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
9
+ sigma = sigmas[step_indices].flatten()
10
+ while len(sigma.shape) < n_dim:
11
+ sigma = sigma.unsqueeze(-1)
12
+ return sigma
13
+
14
+
15
+ def SNR_to_betas(snr):
16
+ """
17
+ Converts SNR to betas
18
+ """
19
+ # Derivation: snr = alpha_t^2 / (1 - alpha_t^2), where alpha_t^2 = alphas_cumprod[t],
20
+ # so alpha_t = sqrt(snr / (1 + snr)); the betas are then recovered from the ratio of
21
+ # consecutive alphas_cumprod values.
22
+ alpha_t = (snr / (1 + snr)) ** 0.5
23
+ alphas_cumprod = alpha_t**2
24
+ alphas = alphas_cumprod / torch.cat(
25
+ [torch.ones(1, device=snr.device), alphas_cumprod[:-1]]
26
+ )
27
+ betas = 1 - alphas
28
+ return betas
29
+
30
+
31
+ def compute_snr(timesteps, noise_scheduler):
32
+ """
33
+ Computes SNR as per Min-SNR-Diffusion-Training/guided_diffusion/gaussian_diffusion.py at 521b624bd70c67cee4bdf49225915f5
34
+ """
35
+ alphas_cumprod = noise_scheduler.alphas_cumprod
36
+ sqrt_alphas_cumprod = alphas_cumprod**0.5
37
+ sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
38
+
39
+ # Expand the tensors.
40
+ # Adapted from Min-SNR-Diffusion-Training/guided_diffusion/gaussian_diffusion.py at 521b624bd70c67cee4bdf49225915f5
41
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[
42
+ timesteps
43
+ ].float()
44
+ while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
45
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
46
+ alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
47
+
48
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
49
+ device=timesteps.device
50
+ )[timesteps].float()
51
+ while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
52
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
53
+ sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
54
+
55
+ # Compute SNR.
56
+ snr = (alpha / sigma) ** 2
57
+ return snr
58
+
59
+
60
+ def compute_alpha(timesteps, noise_scheduler):
61
+ alphas_cumprod = noise_scheduler.alphas_cumprod
62
+ sqrt_alphas_cumprod = alphas_cumprod**0.5
63
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[
64
+ timesteps
65
+ ].float()
66
+ while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
67
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
68
+ alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
69
+
70
+ return alpha
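As a quick sanity check of the helpers above, a hedged sketch that recovers a beta schedule from the SNR curve of a stock diffusers `DDPMScheduler`; the scheduler settings are illustrative.

import torch
from diffusers import DDPMScheduler

from mvadapter.schedulers.scheduler_utils import SNR_to_betas, compute_snr

scheduler = DDPMScheduler(num_train_timesteps=1000)
timesteps = torch.arange(0, scheduler.config.num_train_timesteps)

snr = compute_snr(timesteps, scheduler)  # per-timestep (alpha / sigma) ** 2
betas = SNR_to_betas(snr)                # beta schedule that reproduces this SNR curve
print(snr.shape, betas.shape)            # torch.Size([1000]) torch.Size([1000])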
mvadapter/schedulers/scheduling_shift_snr.py ADDED
@@ -0,0 +1,138 @@
1
+ from typing import Any
2
+
3
+ import torch
4
+
5
+ from .scheduler_utils import SNR_to_betas, compute_snr
6
+
7
+
8
+ class ShiftSNRScheduler:
9
+ def __init__(
10
+ self,
11
+ noise_scheduler: Any,
12
+ timesteps: Any,
13
+ shift_scale: float,
14
+ scheduler_class: Any,
15
+ ):
16
+ self.noise_scheduler = noise_scheduler
17
+ self.timesteps = timesteps
18
+ self.shift_scale = shift_scale
19
+ self.scheduler_class = scheduler_class
20
+
21
+ def _get_shift_scheduler(self):
22
+ """
23
+ Prepare scheduler for shifted betas.
24
+
25
+ :return: A scheduler object configured with shifted betas
26
+ """
27
+ snr = compute_snr(self.timesteps, self.noise_scheduler)
28
+ shifted_betas = SNR_to_betas(snr / self.shift_scale)
29
+
30
+ return self.scheduler_class.from_config(
31
+ self.noise_scheduler.config, trained_betas=shifted_betas.numpy()
32
+ )
33
+
34
+ def _get_interpolated_shift_scheduler(self):
35
+ """
36
+ Prepare scheduler for shifted betas and interpolate with the original betas in log space.
37
+
38
+ :return: A scheduler object configured with interpolated shifted betas
39
+ """
40
+ snr = compute_snr(self.timesteps, self.noise_scheduler)
41
+ shifted_snr = snr / self.shift_scale
42
+
43
+ weighting = self.timesteps.float() / (
44
+ self.noise_scheduler.config.num_train_timesteps - 1
45
+ )
46
+ interpolated_snr = torch.exp(
47
+ torch.log(snr) * (1 - weighting) + torch.log(shifted_snr) * weighting
48
+ )
49
+
50
+ shifted_betas = SNR_to_betas(interpolated_snr)
51
+
52
+ return self.scheduler_class.from_config(
53
+ self.noise_scheduler.config, trained_betas=shifted_betas.numpy()
54
+ )
55
+
56
+ @classmethod
57
+ def from_scheduler(
58
+ cls,
59
+ noise_scheduler: Any,
60
+ shift_mode: str = "default",
61
+ timesteps: Any = None,
62
+ shift_scale: float = 1.0,
63
+ scheduler_class: Any = None,
64
+ ):
65
+ # Check input
66
+ if timesteps is None:
67
+ timesteps = torch.arange(0, noise_scheduler.config.num_train_timesteps)
68
+ if scheduler_class is None:
69
+ scheduler_class = noise_scheduler.__class__
70
+
71
+ # Create scheduler
72
+ shift_scheduler = cls(
73
+ noise_scheduler=noise_scheduler,
74
+ timesteps=timesteps,
75
+ shift_scale=shift_scale,
76
+ scheduler_class=scheduler_class,
77
+ )
78
+
79
+ if shift_mode == "default":
80
+ return shift_scheduler._get_shift_scheduler()
81
+ elif shift_mode == "interpolated":
82
+ return shift_scheduler._get_interpolated_shift_scheduler()
83
+ else:
84
+ raise ValueError(f"Unknown shift_mode: {shift_mode}")
85
+
86
+
87
+ if __name__ == "__main__":
88
+ """
89
+ Compare the alpha values for different noise schedulers.
90
+ """
91
+ import matplotlib.pyplot as plt
92
+ from diffusers import DDPMScheduler
93
+
94
+ from .scheduler_utils import compute_alpha
95
+
96
+ # Base
97
+ timesteps = torch.arange(0, 1000)
98
+ noise_scheduler_base = DDPMScheduler.from_pretrained(
99
+ "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
100
+ )
101
+ alpha = compute_alpha(timesteps, noise_scheduler_base)
102
+ plt.plot(timesteps.numpy(), alpha.numpy(), label="Base")
103
+
104
+ # Kolors
105
+ num_train_timesteps_ = 1100
106
+ timesteps_ = torch.arange(0, num_train_timesteps_)
107
+ noise_kwargs = {"beta_end": 0.014, "num_train_timesteps": num_train_timesteps_}
108
+ noise_scheduler_kolors = DDPMScheduler.from_config(
109
+ noise_scheduler_base.config, **noise_kwargs
110
+ )
111
+ alpha = compute_alpha(timesteps_, noise_scheduler_kolors)
112
+ plt.plot(timesteps_.numpy(), alpha.numpy(), label="Kolors")
113
+
114
+ # Shift betas
115
+ shift_scale = 8.0
116
+ noise_scheduler_shift = ShiftSNRScheduler.from_scheduler(
117
+ noise_scheduler_base, shift_mode="default", shift_scale=shift_scale
118
+ )
119
+ alpha = compute_alpha(timesteps, noise_scheduler_shift)
120
+ plt.plot(timesteps.numpy(), alpha.numpy(), label="Shift Noise (scale 8.0)")
121
+
122
+ # Shift betas (interpolated)
123
+ noise_scheduler_inter = ShiftSNRScheduler.from_scheduler(
124
+ noise_scheduler_base, shift_mode="interpolated", shift_scale=shift_scale
125
+ )
126
+ alpha = compute_alpha(timesteps, noise_scheduler_inter)
127
+ plt.plot(timesteps.numpy(), alpha.numpy(), label="Interpolated (scale 8.0)")
128
+
129
+ # ZeroSNR
130
+ noise_scheduler = DDPMScheduler.from_config(
131
+ noise_scheduler_base.config, rescale_betas_zero_snr=True
132
+ )
133
+ alpha = compute_alpha(timesteps, noise_scheduler)
134
+ plt.plot(timesteps.numpy(), alpha.numpy(), label="ZeroSNR")
135
+
136
+ plt.legend()
137
+ plt.grid()
138
+ plt.savefig("check_alpha.png")
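A minimal usage sketch for the wrapper above, assuming a diffusers `DDPMScheduler` as the base noise scheduler; the SDXL model id and the shift scale of 8.0 are illustrative choices, not requirements of the class.

from diffusers import DDPMScheduler

from mvadapter.schedulers.scheduling_shift_snr import ShiftSNRScheduler

base_scheduler = DDPMScheduler.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"  # illustrative model id
)
noise_scheduler = ShiftSNRScheduler.from_scheduler(
    base_scheduler,
    shift_mode="interpolated",   # or "default" for a plain SNR shift
    shift_scale=8.0,             # larger values push training towards higher noise levels
    scheduler_class=DDPMScheduler,
)
# `noise_scheduler` is an ordinary DDPMScheduler built from the shifted betas and can be
# used for add_noise() during training just like the original scheduler.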
mvadapter/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .camera import get_camera, get_orthogonal_camera
2
+ from .geometry import get_plucker_embeds_from_cameras_ortho
3
+ from .saving import make_image_grid, tensor_to_image
mvadapter/utils/camera.py ADDED
@@ -0,0 +1,211 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import trimesh
9
+ from PIL import Image
10
+ from torch import BoolTensor, FloatTensor
11
+
12
+ LIST_TYPE = Union[list, np.ndarray, torch.Tensor]
13
+
14
+
15
+ def list_to_pt(
16
+ x: LIST_TYPE, dtype: Optional[torch.dtype] = None, device: Optional[str] = None
17
+ ) -> torch.Tensor:
18
+ if isinstance(x, list) or isinstance(x, np.ndarray):
19
+ return torch.tensor(x, dtype=dtype, device=device)
20
+ return x.to(dtype=dtype, device=device)
21
+
22
+
23
+ def get_c2w(
24
+ elevation_deg: LIST_TYPE,
25
+ distance: LIST_TYPE,
26
+ azimuth_deg: Optional[LIST_TYPE],
27
+ num_views: Optional[int] = 1,
28
+ device: Optional[str] = None,
29
+ ) -> torch.FloatTensor:
30
+ if azimuth_deg is None:
31
+ assert (
32
+ num_views is not None
33
+ ), "num_views must be provided if azimuth_deg is None."
34
+ azimuth_deg = torch.linspace(
35
+ 0, 360, num_views + 1, dtype=torch.float32, device=device
36
+ )[:-1]
37
+ else:
38
+ num_views = len(azimuth_deg)
39
+ azimuth_deg = list_to_pt(azimuth_deg, dtype=torch.float32, device=device)
40
+ elevation_deg = list_to_pt(elevation_deg, dtype=torch.float32, device=device)
41
+ camera_distances = list_to_pt(distance, dtype=torch.float32, device=device)
42
+ elevation = elevation_deg * math.pi / 180
43
+ azimuth = azimuth_deg * math.pi / 180
44
+ camera_positions = torch.stack(
45
+ [
46
+ camera_distances * torch.cos(elevation) * torch.cos(azimuth),
47
+ camera_distances * torch.cos(elevation) * torch.sin(azimuth),
48
+ camera_distances * torch.sin(elevation),
49
+ ],
50
+ dim=-1,
51
+ )
52
+ center = torch.zeros_like(camera_positions)
53
+ up = torch.tensor([0, 0, 1], dtype=torch.float32, device=device)[None, :].repeat(
54
+ num_views, 1
55
+ )
56
+ lookat = F.normalize(center - camera_positions, dim=-1)
57
+ right = F.normalize(torch.cross(lookat, up, dim=-1), dim=-1)
58
+ up = F.normalize(torch.cross(right, lookat, dim=-1), dim=-1)
59
+ c2w3x4 = torch.cat(
60
+ [torch.stack([right, up, -lookat], dim=-1), camera_positions[:, :, None]],
61
+ dim=-1,
62
+ )
63
+ c2w = torch.cat([c2w3x4, torch.zeros_like(c2w3x4[:, :1])], dim=1)
64
+ c2w[:, 3, 3] = 1.0
65
+ return c2w
66
+
67
+
68
+ def get_projection_matrix(
69
+ fovy_deg: LIST_TYPE,
70
+ aspect_wh: float = 1.0,
71
+ near: float = 0.1,
72
+ far: float = 100.0,
73
+ device: Optional[str] = None,
74
+ ) -> torch.FloatTensor:
75
+ fovy_deg = list_to_pt(fovy_deg, dtype=torch.float32, device=device)
76
+ batch_size = fovy_deg.shape[0]
77
+ fovy = fovy_deg * math.pi / 180
78
+ tan_half_fovy = torch.tan(fovy / 2)
79
+ projection_matrix = torch.zeros(
80
+ batch_size, 4, 4, dtype=torch.float32, device=device
81
+ )
82
+ projection_matrix[:, 0, 0] = 1 / (aspect_wh * tan_half_fovy)
83
+ projection_matrix[:, 1, 1] = -1 / tan_half_fovy
84
+ projection_matrix[:, 2, 2] = -(far + near) / (far - near)
85
+ projection_matrix[:, 2, 3] = -2 * far * near / (far - near)
86
+ projection_matrix[:, 3, 2] = -1
87
+ return projection_matrix
88
+
89
+
90
+ def get_orthogonal_projection_matrix(
91
+ batch_size: int,
92
+ left: float,
93
+ right: float,
94
+ bottom: float,
95
+ top: float,
96
+ near: float = 0.1,
97
+ far: float = 100.0,
98
+ device: Optional[str] = None,
99
+ ) -> torch.FloatTensor:
100
+ projection_matrix = torch.zeros(
101
+ batch_size, 4, 4, dtype=torch.float32, device=device
102
+ )
103
+ projection_matrix[:, 0, 0] = 2 / (right - left)
104
+ projection_matrix[:, 1, 1] = -2 / (top - bottom)
105
+ projection_matrix[:, 2, 2] = -2 / (far - near)
106
+ projection_matrix[:, 0, 3] = -(right + left) / (right - left)
107
+ projection_matrix[:, 1, 3] = -(top + bottom) / (top - bottom)
108
+ projection_matrix[:, 2, 3] = -(far + near) / (far - near)
109
+ projection_matrix[:, 3, 3] = 1
110
+ return projection_matrix
111
+
112
+
113
+ @dataclass
114
+ class Camera:
115
+ c2w: Optional[torch.FloatTensor]
116
+ w2c: torch.FloatTensor
117
+ proj_mtx: torch.FloatTensor
118
+ mvp_mtx: torch.FloatTensor
119
+ cam_pos: Optional[torch.FloatTensor]
120
+
121
+ def __getitem__(self, index):
122
+ if isinstance(index, int):
123
+ sl = slice(index, index + 1)
124
+ elif isinstance(index, slice):
125
+ sl = index
126
+ else:
127
+ raise NotImplementedError
128
+
129
+ return Camera(
130
+ c2w=self.c2w[sl] if self.c2w is not None else None,
131
+ w2c=self.w2c[sl],
132
+ proj_mtx=self.proj_mtx[sl],
133
+ mvp_mtx=self.mvp_mtx[sl],
134
+ cam_pos=self.cam_pos[sl] if self.cam_pos is not None else None,
135
+ )
136
+
137
+ def to(self, device: Optional[str] = None):
138
+ if self.c2w is not None:
139
+ self.c2w = self.c2w.to(device)
140
+ self.w2c = self.w2c.to(device)
141
+ self.proj_mtx = self.proj_mtx.to(device)
142
+ self.mvp_mtx = self.mvp_mtx.to(device)
143
+ if self.cam_pos is not None:
144
+ self.cam_pos = self.cam_pos.to(device)
145
+
146
+ def __len__(self):
147
+ return self.c2w.shape[0]
148
+
149
+
150
+ def get_camera(
151
+ elevation_deg: Optional[LIST_TYPE] = None,
152
+ distance: Optional[LIST_TYPE] = None,
153
+ fovy_deg: Optional[LIST_TYPE] = None,
154
+ azimuth_deg: Optional[LIST_TYPE] = None,
155
+ num_views: Optional[int] = 1,
156
+ c2w: Optional[torch.FloatTensor] = None,
157
+ w2c: Optional[torch.FloatTensor] = None,
158
+ proj_mtx: Optional[torch.FloatTensor] = None,
159
+ aspect_wh: float = 1.0,
160
+ near: float = 0.1,
161
+ far: float = 100.0,
162
+ device: Optional[str] = None,
163
+ ):
164
+ if w2c is None:
165
+ if c2w is None:
166
+ c2w = get_c2w(elevation_deg, distance, azimuth_deg, num_views, device)
167
+ camera_positions = c2w[:, :3, 3]
168
+ w2c = torch.linalg.inv(c2w)
169
+ else:
170
+ camera_positions = None
171
+ c2w = None
172
+ if proj_mtx is None:
173
+ proj_mtx = get_projection_matrix(
174
+ fovy_deg, aspect_wh=aspect_wh, near=near, far=far, device=device
175
+ )
176
+ mvp_mtx = proj_mtx @ w2c
177
+ return Camera(
178
+ c2w=c2w, w2c=w2c, proj_mtx=proj_mtx, mvp_mtx=mvp_mtx, cam_pos=camera_positions
179
+ )
180
+
181
+
182
+ def get_orthogonal_camera(
183
+ elevation_deg: LIST_TYPE,
184
+ distance: LIST_TYPE,
185
+ left: float,
186
+ right: float,
187
+ bottom: float,
188
+ top: float,
189
+ azimuth_deg: Optional[LIST_TYPE] = None,
190
+ num_views: Optional[int] = 1,
191
+ near: float = 0.1,
192
+ far: float = 100.0,
193
+ device: Optional[str] = None,
194
+ ):
195
+ c2w = get_c2w(elevation_deg, distance, azimuth_deg, num_views, device)
196
+ camera_positions = c2w[:, :3, 3]
197
+ w2c = torch.linalg.inv(c2w)
198
+ proj_mtx = get_orthogonal_projection_matrix(
199
+ batch_size=c2w.shape[0],
200
+ left=left,
201
+ right=right,
202
+ bottom=bottom,
203
+ top=top,
204
+ near=near,
205
+ far=far,
206
+ device=device,
207
+ )
208
+ mvp_mtx = proj_mtx @ w2c
209
+ return Camera(
210
+ c2w=c2w, w2c=w2c, proj_mtx=proj_mtx, mvp_mtx=mvp_mtx, cam_pos=camera_positions
211
+ )
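A small sketch of building a multi-view orthographic rig with the helper above; the six azimuths, the camera distance of 1.8, and the ±0.55 frustum bounds are illustrative values, not defaults taken from this repository.

from mvadapter.utils.camera import get_orthogonal_camera

cameras = get_orthogonal_camera(
    elevation_deg=[0.0] * 6,                              # one elevation per view
    distance=[1.8] * 6,                                   # one camera distance per view
    left=-0.55, right=0.55, bottom=-0.55, top=0.55,       # orthographic frustum bounds
    azimuth_deg=[0.0, 60.0, 120.0, 180.0, 240.0, 300.0],  # six views around the object
)
print(len(cameras))           # 6
print(cameras.mvp_mtx.shape)  # torch.Size([6, 4, 4]) model-view-projection matrices
front_view = cameras[0]       # indexing returns a Camera with all matrices kept in sync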
mvadapter/utils/geometry.py ADDED
@@ -0,0 +1,253 @@
1
+ from typing import List, Optional, Tuple
2
+
3
+ import numpy as np
4
+ import torch
5
+ from torch.nn import functional as F
6
+
7
+
8
+ def get_position_map_from_depth(depth, mask, intrinsics, extrinsics, image_wh=None):
9
+ """Compute the position map from the depth map and the camera parameters for a batch of views.
10
+
11
+ Args:
12
+ depth (torch.Tensor): The depth maps with the shape (B, H, W, 1).
13
+ mask (torch.Tensor): The masks with the shape (B, H, W, 1).
14
+ intrinsics (torch.Tensor): The camera intrinsics matrices with the shape (B, 3, 3).
15
+ extrinsics (torch.Tensor): The camera extrinsics matrices with the shape (B, 4, 4).
16
+ image_wh (Tuple[int, int]): The image width and height.
17
+
18
+ Returns:
19
+ torch.Tensor: The position maps with the shape (B, H, W, 3).
20
+ """
21
+ if image_wh is None:
22
+ image_wh = depth.shape[2], depth.shape[1]
23
+
24
+ B, H, W, _ = depth.shape
25
+ depth = depth.squeeze(-1)
26
+
27
+ u_coord, v_coord = torch.meshgrid(
28
+ torch.arange(image_wh[0]), torch.arange(image_wh[1]), indexing="xy"
29
+ )
30
+ u_coord = u_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1)
31
+ v_coord = v_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1)
32
+
33
+ # Compute the position map by back-projecting depth pixels to 3D space
34
+ x = (
35
+ (u_coord - intrinsics[:, 0, 2].unsqueeze(-1).unsqueeze(-1))
36
+ * depth
37
+ / intrinsics[:, 0, 0].unsqueeze(-1).unsqueeze(-1)
38
+ )
39
+ y = (
40
+ (v_coord - intrinsics[:, 1, 2].unsqueeze(-1).unsqueeze(-1))
41
+ * depth
42
+ / intrinsics[:, 1, 1].unsqueeze(-1).unsqueeze(-1)
43
+ )
44
+ z = depth
45
+
46
+ # Concatenate to form the 3D coordinates in the camera frame
47
+ camera_coords = torch.stack([x, y, z], dim=-1)
48
+
49
+ # Apply the extrinsic matrix to get coordinates in the world frame
50
+ coords_homogeneous = torch.nn.functional.pad(
51
+ camera_coords, (0, 1), "constant", 1.0
52
+ ) # Add a homogeneous coordinate
53
+ world_coords = torch.matmul(
54
+ coords_homogeneous.view(B, -1, 4), extrinsics.transpose(1, 2)
55
+ ).view(B, H, W, 4)
56
+
57
+ # Apply the mask to the position map
58
+ position_map = world_coords[..., :3] * mask
59
+
60
+ return position_map
61
+
62
+
63
+ def get_position_map_from_depth_ortho(
64
+ depth, mask, extrinsics, ortho_scale, image_wh=None
65
+ ):
66
+ """Compute the position map from the depth map and the camera parameters for a batch of views
67
+ using orthographic projection with a given ortho_scale.
68
+
69
+ Args:
70
+ depth (torch.Tensor): The depth maps with the shape (B, H, W, 1).
71
+ mask (torch.Tensor): The masks with the shape (B, H, W, 1).
72
+ extrinsics (torch.Tensor): The camera extrinsics matrices with the shape (B, 4, 4).
73
+ ortho_scale (torch.Tensor): The scaling factor for the orthographic projection with the shape (B, 1, 1, 1).
74
+ image_wh (Tuple[int, int]): Optional. The image width and height.
75
+
76
+ Returns:
77
+ torch.Tensor: The position maps with the shape (B, H, W, 3).
78
+ """
79
+ if image_wh is None:
80
+ image_wh = depth.shape[2], depth.shape[1]
81
+
82
+ B, H, W, _ = depth.shape
83
+ depth = depth.squeeze(-1)
84
+
85
+ # Generating grid of coordinates in the image space
86
+ u_coord, v_coord = torch.meshgrid(
87
+ torch.arange(0, image_wh[0]), torch.arange(0, image_wh[1]), indexing="xy"
88
+ )
89
+ u_coord = u_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1)
90
+ v_coord = v_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1)
91
+
92
+ # Compute the position map using orthographic projection with ortho_scale
93
+ x = (u_coord - image_wh[0] / 2) * ortho_scale / image_wh[0]
94
+ y = (v_coord - image_wh[1] / 2) * ortho_scale / image_wh[1]
95
+ z = depth
96
+
97
+ # Concatenate to form the 3D coordinates in the camera frame
98
+ camera_coords = torch.stack([x, y, z], dim=-1)
99
+
100
+ # Apply the extrinsic matrix to get coordinates in the world frame
101
+ coords_homogeneous = torch.nn.functional.pad(
102
+ camera_coords, (0, 1), "constant", 1.0
103
+ ) # Add a homogeneous coordinate
104
+ world_coords = torch.matmul(
105
+ coords_homogeneous.view(B, -1, 4), extrinsics.transpose(1, 2)
106
+ ).view(B, H, W, 4)
107
+
108
+ # Apply the mask to the position map
109
+ position_map = world_coords[..., :3] * mask
110
+
111
+ return position_map
112
+
113
+
114
+ def get_opencv_from_blender(matrix_world, fov=None, image_size=None):
115
+ # convert matrix_world to opencv format extrinsics
116
+ opencv_world_to_cam = matrix_world.inverse()
117
+ opencv_world_to_cam[1, :] *= -1
118
+ opencv_world_to_cam[2, :] *= -1
119
+ R, T = opencv_world_to_cam[:3, :3], opencv_world_to_cam[:3, 3]
120
+
121
+ if fov is None: # orthographic camera
122
+ return R, T
123
+
124
+ R, T = R.unsqueeze(0), T.unsqueeze(0)
125
+ # convert fov to opencv format intrinsics
126
+ focal = 1 / np.tan(fov / 2)
127
+ intrinsics = np.diag(np.array([focal, focal, 1])).astype(np.float32)
128
+ opencv_cam_matrix = (
129
+ torch.from_numpy(intrinsics).unsqueeze(0).float().to(matrix_world.device)
130
+ )
131
+ opencv_cam_matrix[:, :2, -1] += torch.tensor([image_size / 2, image_size / 2]).to(
132
+ matrix_world.device
133
+ )
134
+ opencv_cam_matrix[:, [0, 1], [0, 1]] *= image_size / 2
135
+
136
+ return R, T, opencv_cam_matrix
137
+
138
+
139
+ def get_ray_directions(
140
+ H: int,
141
+ W: int,
142
+ focal: float,
143
+ principal: Optional[Tuple[float, float]] = None,
144
+ use_pixel_centers: bool = True,
145
+ ) -> torch.Tensor:
146
+ """
147
+ Get ray directions for all pixels in camera coordinate.
148
+ Args:
149
+ H, W, focal, principal, use_pixel_centers: image height, width, focal length, principal point and whether to use pixel centers
150
+ Outputs:
151
+ directions: (H, W, 3), the direction of the rays in camera coordinate
152
+ """
153
+ pixel_center = 0.5 if use_pixel_centers else 0
154
+ cx, cy = (W / 2, H / 2) if principal is None else principal  # parenthesized so the default applies to both coordinates
155
+ i, j = torch.meshgrid(
156
+ torch.arange(W, dtype=torch.float32) + pixel_center,
157
+ torch.arange(H, dtype=torch.float32) + pixel_center,
158
+ indexing="xy",
159
+ )
160
+ directions = torch.stack(
161
+ [(i - cx) / focal, -(j - cy) / focal, -torch.ones_like(i)], -1
162
+ )
163
+ return F.normalize(directions, dim=-1)
164
+
165
+
166
+ def get_rays(
167
+ directions: torch.Tensor, c2w: torch.Tensor
168
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
169
+ """
170
+ Get ray origins and directions from camera coordinates to world coordinates
171
+ Args:
172
+ directions: (H, W, 3) ray directions in camera coordinates
173
+ c2w: (4, 4) camera-to-world transformation matrix
174
+ Outputs:
175
+ rays_o, rays_d: (H, W, 3) ray origins and directions in world coordinates
176
+ """
177
+ # Rotate ray directions from camera coordinate to the world coordinate
178
+ rays_d = directions @ c2w[:3, :3].T
179
+ rays_o = c2w[:3, 3].expand(rays_d.shape)
180
+ return rays_o, rays_d
181
+
182
+
183
+ def compute_plucker_embed(
184
+ c2w: torch.Tensor, image_width: int, image_height: int, focal: float
185
+ ) -> torch.Tensor:
186
+ """
187
+ Computes Plucker coordinates for a camera.
188
+ Args:
189
+ c2w: (4, 4) camera-to-world transformation matrix
190
+ image_width: Image width
191
+ image_height: Image height
192
+ focal: Focal length of the camera
193
+ Returns:
194
+ plucker: (6, H, W) Plucker embedding
195
+ """
196
+ directions = get_ray_directions(image_height, image_width, focal)
197
+ rays_o, rays_d = get_rays(directions, c2w)
198
+ # Cross product to get Plucker coordinates
199
+ cross = torch.cross(rays_o, rays_d, dim=-1)
200
+ plucker = torch.cat((rays_d, cross), dim=-1)
201
+ return plucker.permute(2, 0, 1)
202
+
203
+
204
+ def get_plucker_embeds_from_cameras(
205
+ c2w: List[torch.Tensor], fov: List[float], image_size: int
206
+ ) -> torch.Tensor:
207
+ """
208
+ Given lists of camera transformations and fov, returns the batched plucker embeddings.
209
+ Args:
210
+ c2w: list of camera-to-world transformation matrices
211
+ fov: list of field of view values
212
+ image_size: size of the image
213
+ Returns:
214
+ plucker_embeds: (B, 6, H, W) batched plucker embeddings
215
+ """
216
+ plucker_embeds = []
217
+ for cam_matrix, cam_fov in zip(c2w, fov):
218
+ focal = 0.5 * image_size / np.tan(0.5 * cam_fov)
219
+ plucker = compute_plucker_embed(cam_matrix, image_size, image_size, focal)
220
+ plucker_embeds.append(plucker)
221
+ return torch.stack(plucker_embeds)
222
+
223
+
224
+ def get_plucker_embeds_from_cameras_ortho(
225
+ c2w: List[torch.Tensor], ortho_scale: List[float], image_size: int
226
+ ):
227
+ """
228
+ Given lists of camera transformations and orthographic scales, returns the batched Plucker embeddings.
229
+
230
+ Parameters:
231
+ c2w: list of camera-to-world transformation matrices
232
+ ortho_scale: list of orthographic scale values
233
+ image_size: size of the image
234
+
235
+ Returns:
236
+ plucker_embeds: plucker embeddings (B, 6, H, W)
237
+ """
238
+ plucker_embeds = []
239
+ # compute pairwise mask and plucker embeddings
240
+ for cam_matrix, scale in zip(c2w, ortho_scale):
241
+ # blender to opencv to pytorch3d
242
+ R, T = get_opencv_from_blender(cam_matrix)
243
+ cam_pos = -R.T @ T
244
+ view_dir = R.T @ torch.tensor([0, 0, 1]).float().to(cam_matrix.device)
245
+ # normalize camera position
246
+ cam_pos = F.normalize(cam_pos, dim=0)
247
+ plucker = torch.concat([view_dir, cam_pos])
248
+ plucker = plucker.unsqueeze(-1).unsqueeze(-1).repeat(1, image_size, image_size)
249
+ plucker_embeds.append(plucker)
250
+
251
+ plucker_embeds = torch.stack(plucker_embeds)
252
+
253
+ return plucker_embeds
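A hedged sketch tying the two utility modules together: the orthographic cameras from camera.py are converted into the per-view Plucker-style embeddings computed above. The scale values and resolution are illustrative.

from mvadapter.utils.camera import get_orthogonal_camera
from mvadapter.utils.geometry import get_plucker_embeds_from_cameras_ortho

cameras = get_orthogonal_camera(
    elevation_deg=[0.0] * 6,
    distance=[1.8] * 6,
    left=-0.55, right=0.55, bottom=-0.55, top=0.55,
    azimuth_deg=[0.0, 60.0, 120.0, 180.0, 240.0, 300.0],
)
plucker = get_plucker_embeds_from_cameras_ortho(
    c2w=list(cameras.c2w),  # list of (4, 4) camera-to-world matrices
    ortho_scale=[1.1] * 6,  # illustrative orthographic scales
    image_size=256,
)
print(plucker.shape)        # torch.Size([6, 6, 256, 256]) -> (views, channels, H, W)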
mvadapter/utils/logging.py ADDED
@@ -0,0 +1,340 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Optuna, Hugging Face
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Logging utilities."""
16
+
17
+ import logging
18
+ import os
19
+ import sys
20
+ import threading
21
+ from logging import CRITICAL # NOQA
22
+ from logging import DEBUG # NOQA
23
+ from logging import ERROR # NOQA
24
+ from logging import FATAL # NOQA
25
+ from logging import INFO # NOQA
26
+ from logging import NOTSET # NOQA
27
+ from logging import WARN # NOQA
28
+ from logging import WARNING # NOQA
29
+ from typing import Dict, Optional
30
+
31
+ from tqdm import auto as tqdm_lib
32
+
33
+ _lock = threading.Lock()
34
+ _default_handler: Optional[logging.Handler] = None
35
+
36
+ log_levels = {
37
+ "debug": logging.DEBUG,
38
+ "info": logging.INFO,
39
+ "warning": logging.WARNING,
40
+ "error": logging.ERROR,
41
+ "critical": logging.CRITICAL,
42
+ }
43
+
44
+ _default_log_level = logging.INFO
45
+
46
+ _tqdm_active = True
47
+
48
+
49
+ def _get_default_logging_level() -> int:
50
+ """
51
+ If the LATEXTURE_VERBOSITY env var is set to one of the valid choices, return that as the new default
52
+ level; otherwise fall back to `_default_log_level`.
53
+ """
54
+ env_level_str = os.getenv("LATEXTURE_VERBOSITY", None)
55
+ if env_level_str:
56
+ if env_level_str in log_levels:
57
+ return log_levels[env_level_str]
58
+ else:
59
+ logging.getLogger().warning(
60
+ f"Unknown option LATEXTURE_VERBOSITY={env_level_str}, "
61
+ f"has to be one of: { ', '.join(log_levels.keys()) }"
62
+ )
63
+ return _default_log_level
64
+
65
+
66
+ def _get_library_name() -> str:
67
+ return __name__.split(".")[0]
68
+
69
+
70
+ def _get_library_root_logger() -> logging.Logger:
71
+ return logging.getLogger(_get_library_name())
72
+
73
+
74
+ def _configure_library_root_logger() -> None:
75
+ global _default_handler
76
+
77
+ with _lock:
78
+ if _default_handler:
79
+ # This library has already configured the library root logger.
80
+ return
81
+ _default_handler = logging.StreamHandler() # Set sys.stderr as stream.
82
+ _default_handler.flush = sys.stderr.flush
83
+
84
+ # Apply our default configuration to the library root logger.
85
+ library_root_logger = _get_library_root_logger()
86
+ library_root_logger.addHandler(_default_handler)
87
+ library_root_logger.setLevel(_get_default_logging_level())
88
+ library_root_logger.propagate = False
89
+
90
+ enable_explicit_format()
91
+
92
+
93
+ def _reset_library_root_logger() -> None:
94
+ global _default_handler
95
+
96
+ with _lock:
97
+ if not _default_handler:
98
+ return
99
+
100
+ library_root_logger = _get_library_root_logger()
101
+ library_root_logger.removeHandler(_default_handler)
102
+ library_root_logger.setLevel(logging.NOTSET)
103
+ _default_handler = None
104
+
105
+
106
+ def get_log_levels_dict() -> Dict[str, int]:
107
+ return log_levels
108
+
109
+
110
+ def get_logger(name: Optional[str] = None) -> logging.Logger:
111
+ """
112
+ Return a logger with the specified name.
113
+
114
+ This function is not supposed to be directly accessed unless you are writing a custom diffusers module.
115
+ """
116
+
117
+ if name is None:
118
+ name = _get_library_name()
119
+
120
+ _configure_library_root_logger()
121
+ return logging.getLogger(name)
122
+
123
+
124
+ def get_verbosity() -> int:
125
+ """
126
+ Return the current level for the 🤗 Diffusers' root logger as an `int`.
127
+
128
+ Returns:
129
+ `int`:
130
+ Logging level integers which can be one of:
131
+
132
+ - `50`: `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL`
133
+ - `40`: `diffusers.logging.ERROR`
134
+ - `30`: `diffusers.logging.WARNING` or `diffusers.logging.WARN`
135
+ - `20`: `diffusers.logging.INFO`
136
+ - `10`: `diffusers.logging.DEBUG`
137
+
138
+ """
139
+
140
+ _configure_library_root_logger()
141
+ return _get_library_root_logger().getEffectiveLevel()
142
+
143
+
144
+ def set_verbosity(verbosity: int) -> None:
145
+ """
146
+ Set the verbosity level for the 🤗 Diffusers' root logger.
147
+
148
+ Args:
149
+ verbosity (`int`):
150
+ Logging level which can be one of:
151
+
152
+ - `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL`
153
+ - `diffusers.logging.ERROR`
154
+ - `diffusers.logging.WARNING` or `diffusers.logging.WARN`
155
+ - `diffusers.logging.INFO`
156
+ - `diffusers.logging.DEBUG`
157
+ """
158
+
159
+ _configure_library_root_logger()
160
+ _get_library_root_logger().setLevel(verbosity)
161
+
162
+
163
+ def set_verbosity_info() -> None:
164
+ """Set the verbosity to the `INFO` level."""
165
+ return set_verbosity(INFO)
166
+
167
+
168
+ def set_verbosity_warning() -> None:
169
+ """Set the verbosity to the `WARNING` level."""
170
+ return set_verbosity(WARNING)
171
+
172
+
173
+ def set_verbosity_debug() -> None:
174
+ """Set the verbosity to the `DEBUG` level."""
175
+ return set_verbosity(DEBUG)
176
+
177
+
178
+ def set_verbosity_error() -> None:
179
+ """Set the verbosity to the `ERROR` level."""
180
+ return set_verbosity(ERROR)
181
+
182
+
183
+ def disable_default_handler() -> None:
184
+ """Disable the default handler of the 🤗 Diffusers' root logger."""
185
+
186
+ _configure_library_root_logger()
187
+
188
+ assert _default_handler is not None
189
+ _get_library_root_logger().removeHandler(_default_handler)
190
+
191
+
192
+ def enable_default_handler() -> None:
193
+ """Enable the default handler of the 🤗 Diffusers' root logger."""
194
+
195
+ _configure_library_root_logger()
196
+
197
+ assert _default_handler is not None
198
+ _get_library_root_logger().addHandler(_default_handler)
199
+
200
+
201
+ def add_handler(handler: logging.Handler) -> None:
202
+ """adds a handler to the HuggingFace Diffusers' root logger."""
203
+
204
+ _configure_library_root_logger()
205
+
206
+ assert handler is not None
207
+ _get_library_root_logger().addHandler(handler)
208
+
209
+
210
+ def remove_handler(handler: logging.Handler) -> None:
211
+ """removes given handler from the HuggingFace Diffusers' root logger."""
212
+
213
+ _configure_library_root_logger()
214
+
215
+ assert handler is not None and handler in _get_library_root_logger().handlers
216
+ _get_library_root_logger().removeHandler(handler)
217
+
218
+
219
+ def disable_propagation() -> None:
220
+ """
221
+ Disable propagation of the library log outputs. Note that log propagation is disabled by default.
222
+ """
223
+
224
+ _configure_library_root_logger()
225
+ _get_library_root_logger().propagate = False
226
+
227
+
228
+ def enable_propagation() -> None:
229
+ """
230
+ Enable propagation of the library log outputs. Please disable the HuggingFace Diffusers' default handler to prevent
231
+ double logging if the root logger has been configured.
232
+ """
233
+
234
+ _configure_library_root_logger()
235
+ _get_library_root_logger().propagate = True
236
+
237
+
238
+ def enable_explicit_format() -> None:
239
+ """
240
+ Enable explicit formatting for every 🤗 Diffusers' logger. The explicit formatter is as follows:
241
+ ```
242
+ [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE
243
+ ```
244
+ All handlers currently bound to the root logger are affected by this method.
245
+ """
246
+ handlers = _get_library_root_logger().handlers
247
+
248
+ for handler in handlers:
249
+ formatter = logging.Formatter(
250
+ "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s"
251
+ )
252
+ handler.setFormatter(formatter)
253
+
254
+
255
+ def reset_format() -> None:
256
+ """
257
+ Resets the formatting for 🤗 Diffusers' loggers.
258
+
259
+ All handlers currently bound to the root logger are affected by this method.
260
+ """
261
+ handlers = _get_library_root_logger().handlers
262
+
263
+ for handler in handlers:
264
+ handler.setFormatter(None)
265
+
266
+
267
+ def warning_advice(self, *args, **kwargs) -> None:
268
+ """
269
+ This method is identical to `logger.warning()`, but if env var LATEXTURE_NO_ADVISORY_WARNINGS=1 is set, this
270
+ warning will not be printed
271
+ """
272
+ no_advisory_warnings = os.getenv("LATEXTURE_NO_ADVISORY_WARNINGS", False)
273
+ if no_advisory_warnings:
274
+ return
275
+ self.warning(*args, **kwargs)
276
+
277
+
278
+ logging.Logger.warning_advice = warning_advice
279
+
280
+
281
+ class EmptyTqdm:
282
+ """Dummy tqdm which doesn't do anything."""
283
+
284
+ def __init__(self, *args, **kwargs): # pylint: disable=unused-argument
285
+ self._iterator = args[0] if args else None
286
+
287
+ def __iter__(self):
288
+ return iter(self._iterator)
289
+
290
+ def __getattr__(self, _):
291
+ """Return empty function."""
292
+
293
+ def empty_fn(*args, **kwargs): # pylint: disable=unused-argument
294
+ return
295
+
296
+ return empty_fn
297
+
298
+ def __enter__(self):
299
+ return self
300
+
301
+ def __exit__(self, type_, value, traceback):
302
+ return
303
+
304
+
305
+ class _tqdm_cls:
306
+ def __call__(self, *args, **kwargs):
307
+ if _tqdm_active:
308
+ return tqdm_lib.tqdm(*args, **kwargs)
309
+ else:
310
+ return EmptyTqdm(*args, **kwargs)
311
+
312
+ def set_lock(self, *args, **kwargs):
313
+ self._lock = None
314
+ if _tqdm_active:
315
+ return tqdm_lib.tqdm.set_lock(*args, **kwargs)
316
+
317
+ def get_lock(self):
318
+ if _tqdm_active:
319
+ return tqdm_lib.tqdm.get_lock()
320
+
321
+
322
+ tqdm = _tqdm_cls()
323
+
324
+
325
+ def is_progress_bar_enabled() -> bool:
326
+ """Return a boolean indicating whether tqdm progress bars are enabled."""
327
+ global _tqdm_active
328
+ return bool(_tqdm_active)
329
+
330
+
331
+ def enable_progress_bar() -> None:
332
+ """Enable tqdm progress bar."""
333
+ global _tqdm_active
334
+ _tqdm_active = True
335
+
336
+
337
+ def disable_progress_bar() -> None:
338
+ """Disable tqdm progress bar."""
339
+ global _tqdm_active
340
+ _tqdm_active = False
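Finally, a short sketch of how this logging module is meant to be consumed elsewhere in the package; the message text is illustrative.

from mvadapter.utils import logging

logger = logging.get_logger(__name__)  # routed through the library root logger
logger.info("Loading multi-view adapter weights ...")

logging.set_verbosity_debug()          # programmatic verbosity control ...
# ... or via the environment variable read in _get_default_logging_level():
#   LATEXTURE_VERBOSITY=warning python app.py
logging.disable_progress_bar()         # silences the tqdm wrapper exported by this module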
mvadapter/utils/render.py ADDED
@@ -0,0 +1,499 @@
1
+ import math
2
+ from abc import ABC, abstractmethod
3
+ from dataclasses import dataclass
4
+ from datetime import datetime
5
+ from typing import List, Optional, Union
6
+
7
+ import numpy as np
8
+ import nvdiffrast.torch as dr
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import trimesh
12
+ from PIL import Image
13
+ from torch import BoolTensor, FloatTensor
14
+
15
+ from . import logging
16
+ from .camera import Camera
17
+
18
+ logger = logging.get_logger(__name__)
19
+
20
+
21
+ def dot(x: torch.FloatTensor, y: torch.FloatTensor) -> torch.FloatTensor:
22
+ return torch.sum(x * y, -1, keepdim=True)
23
+
24
+
25
+ @dataclass
26
+ class TexturedMesh:
27
+ v_pos: torch.FloatTensor
28
+ t_pos_idx: torch.LongTensor
29
+
30
+ # texture coordinates
31
+ v_tex: Optional[torch.FloatTensor] = None
32
+ t_tex_idx: Optional[torch.LongTensor] = None
33
+
34
+ # texture map
35
+ texture: Optional[torch.FloatTensor] = None
36
+
37
+ # vertices, faces after vertex merging
38
+ _stitched_v_pos: Optional[torch.FloatTensor] = None
39
+ _stitched_t_pos_idx: Optional[torch.LongTensor] = None
40
+
41
+ _v_nrm: Optional[torch.FloatTensor] = None
42
+
43
+ @property
44
+ def v_nrm(self) -> torch.FloatTensor:
45
+ if self._v_nrm is None:
46
+ self._v_nrm = self._compute_vertex_normal()
47
+ return self._v_nrm
48
+
49
+ def set_stitched_mesh(
50
+ self, v_pos: torch.FloatTensor, t_pos_idx: torch.LongTensor
51
+ ) -> None:
52
+ self._stitched_v_pos = v_pos
53
+ self._stitched_t_pos_idx = t_pos_idx
54
+
55
+ @property
56
+ def stitched_v_pos(self) -> torch.FloatTensor:
57
+ if self._stitched_v_pos is None:
58
+ logger.warning("Stitched vertices not available, using original vertices!")
59
+ return self.v_pos
60
+ return self._stitched_v_pos
61
+
62
+ @property
63
+ def stitched_t_pos_idx(self) -> torch.LongTensor:
64
+ if self._stitched_t_pos_idx is None:
65
+ logger.warning("Stitched faces not available, using original faces!")
66
+ return self.t_pos_idx
67
+ return self._stitched_t_pos_idx
68
+
69
+ def _compute_vertex_normal(self) -> torch.FloatTensor:
70
+ if self._stitched_v_pos is None or self._stitched_t_pos_idx is None:
71
+ logger.warning(
72
+ "Stitched vertices and faces not available, computing vertex normals on original mesh, which can be erroneous!"
73
+ )
74
+ v_pos, t_pos_idx = self.v_pos, self.t_pos_idx
75
+ else:
76
+ v_pos, t_pos_idx = self._stitched_v_pos, self._stitched_t_pos_idx
77
+
78
+ i0 = t_pos_idx[:, 0]
79
+ i1 = t_pos_idx[:, 1]
80
+ i2 = t_pos_idx[:, 2]
81
+
82
+ v0 = v_pos[i0, :]
83
+ v1 = v_pos[i1, :]
84
+ v2 = v_pos[i2, :]
85
+
86
+ face_normals = torch.cross(v1 - v0, v2 - v0)
87
+
88
+ # Splat face normals to vertices
89
+ v_nrm = torch.zeros_like(v_pos)
90
+ v_nrm.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals)
91
+ v_nrm.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals)
92
+ v_nrm.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals)
93
+
94
+ # Normalize, replace zero (degenerated) normals with some default value
95
+ v_nrm = torch.where(
96
+ dot(v_nrm, v_nrm) > 1e-20, v_nrm, torch.as_tensor([0.0, 0.0, 1.0]).to(v_nrm)
97
+ )
98
+ v_nrm = F.normalize(v_nrm, dim=1)
99
+
100
+ if torch.is_anomaly_enabled():
101
+ assert torch.all(torch.isfinite(v_nrm))
102
+
103
+ return v_nrm
104
+
105
+ def to(self, device: Optional[str] = None):
106
+ self.v_pos = self.v_pos.to(device)
107
+ self.t_pos_idx = self.t_pos_idx.to(device)
108
+ if self.v_tex is not None:
109
+ self.v_tex = self.v_tex.to(device)
110
+ if self.t_tex_idx is not None:
111
+ self.t_tex_idx = self.t_tex_idx.to(device)
112
+ if self.texture is not None:
113
+ self.texture = self.texture.to(device)
114
+ if self._stitched_v_pos is not None:
115
+ self._stitched_v_pos = self._stitched_v_pos.to(device)
116
+ if self._stitched_t_pos_idx is not None:
117
+ self._stitched_t_pos_idx = self._stitched_t_pos_idx.to(device)
118
+ if self._v_nrm is not None:
119
+ self._v_nrm = self._v_nrm.to(device)
120
+
121
+
122
+ def load_mesh(
123
+ mesh_path: str,
124
+ rescale: bool = False,
125
+ move_to_center: bool = False,
126
+ scale: float = 0.5,
127
+ flip_uv: bool = True,
128
+ merge_vertices: bool = True,
129
+ default_uv_size: int = 2048,
130
+ shape_init_mesh_up: str = "+y",
131
+ shape_init_mesh_front: str = "+x",
132
+ front_x_to_y: bool = False,
133
+ device: Optional[str] = None,
134
+ return_transform: bool = False,
135
+ ) -> TexturedMesh:
136
+ scene = trimesh.load(mesh_path, force="mesh", process=False)
137
+ if isinstance(scene, trimesh.Trimesh):
138
+ mesh = scene
139
+ elif isinstance(scene, trimesh.scene.Scene):
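+ # Flatten a multi-geometry scene into a single mesh; per-part materials may not survive concatenation.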
140
+ mesh = trimesh.Trimesh()
141
+ for obj in scene.geometry.values():
142
+ mesh = trimesh.util.concatenate([mesh, obj])
143
+ else:
144
+ raise ValueError(f"Unknown mesh type at {mesh_path}.")
145
+
146
+ # move to center
147
+ if move_to_center:
148
+ centroid = mesh.vertices.mean(0)
149
+ mesh.vertices = mesh.vertices - centroid
150
+
151
+ # rescale
152
+ if rescale:
153
+ max_scale = np.abs(mesh.vertices).max()
154
+ mesh.vertices = mesh.vertices / max_scale * scale
155
+
156
+ dirs = ["+x", "+y", "+z", "-x", "-y", "-z"]
157
+ dir2vec = {
158
+ "+x": np.array([1, 0, 0]),
159
+ "+y": np.array([0, 1, 0]),
160
+ "+z": np.array([0, 0, 1]),
161
+ "-x": np.array([-1, 0, 0]),
162
+ "-y": np.array([0, -1, 0]),
163
+ "-z": np.array([0, 0, -1]),
164
+ }
165
+ if shape_init_mesh_up not in dirs or shape_init_mesh_front not in dirs:
166
+ raise ValueError(
167
+ f"shape_init_mesh_up and shape_init_mesh_front must be one of {dirs}."
168
+ )
169
+ if shape_init_mesh_up[1] == shape_init_mesh_front[1]:
170
+ raise ValueError(
171
+ "shape_init_mesh_up and shape_init_mesh_front must be orthogonal."
172
+ )
173
+ z_, x_ = (
174
+ dir2vec[shape_init_mesh_up],
175
+ dir2vec[shape_init_mesh_front],
176
+ )
177
+ y_ = np.cross(z_, x_)
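+ # mesh2std rotates vertices so the declared front axis lands on +x and the up axis on +z (y = up x front completes the right-handed frame).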
178
+ std2mesh = np.stack([x_, y_, z_], axis=0).T
179
+ mesh2std = np.linalg.inv(std2mesh)
180
+ mesh.vertices = np.dot(mesh2std, mesh.vertices.T).T
181
+ if front_x_to_y:
182
+ x = mesh.vertices[:, 1].copy()
183
+ y = -mesh.vertices[:, 0].copy()
184
+ mesh.vertices[:, 0] = x
185
+ mesh.vertices[:, 1] = y
186
+
187
+ v_pos = torch.tensor(mesh.vertices, dtype=torch.float32)
188
+ t_pos_idx = torch.tensor(mesh.faces, dtype=torch.int64)
189
+
190
+ if hasattr(mesh, "visual") and hasattr(mesh.visual, "uv"):
191
+ v_tex = torch.tensor(mesh.visual.uv, dtype=torch.float32)
192
+ if flip_uv:
193
+ v_tex[:, 1] = 1.0 - v_tex[:, 1]
194
+ t_tex_idx = t_pos_idx.clone()
195
+ if (
196
+ hasattr(mesh.visual.material, "baseColorTexture")
197
+ and mesh.visual.material.baseColorTexture
198
+ ):
199
+ texture = torch.tensor(
200
+ np.array(mesh.visual.material.baseColorTexture) / 255.0,
201
+ dtype=torch.float32,
202
+ )[..., :3]
203
+ else:
204
+ texture = torch.zeros(
205
+ (default_uv_size, default_uv_size, 3), dtype=torch.float32
206
+ )
207
+ else:
208
+ v_tex = None
209
+ t_tex_idx = None
210
+ texture = None
211
+
212
+ textured_mesh = TexturedMesh(
213
+ v_pos=v_pos,
214
+ t_pos_idx=t_pos_idx,
215
+ v_tex=v_tex,
216
+ t_tex_idx=t_tex_idx,
217
+ texture=texture,
218
+ )
219
+
220
+ if merge_vertices:
221
+ mesh.merge_vertices(merge_tex=True)
222
+ textured_mesh.set_stitched_mesh(
223
+ torch.tensor(mesh.vertices, dtype=torch.float32),
224
+ torch.tensor(mesh.faces, dtype=torch.int64),
225
+ )
226
+
227
+ textured_mesh.to(device)
228
+
229
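+ # NOTE: return_transform assumes move_to_center and rescale were enabled; otherwise centroid and max_scale are undefined here.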
+ if return_transform:
230
+ return textured_mesh, np.array(centroid), max_scale / scale
231
+
232
+ return textured_mesh
233
+
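+ # Example usage (a minimal sketch; "assets/model.glb" is a hypothetical path):
+ #   mesh = load_mesh("assets/model.glb", rescale=True, move_to_center=True, device="cuda")
+ #   print(mesh.v_pos.shape, mesh.t_pos_idx.shape, mesh.v_nrm.shape)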
234
+
235
+ @dataclass
236
+ class RenderOutput:
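+ # Per-view rasterization buffers produced by render(): attr = sampled texture color, mask = pixel coverage, depth = normalized view-space depth, normal = interpolated object-space normals, pos = interpolated object-space positions.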
237
+ attr: Optional[torch.FloatTensor] = None
238
+ mask: Optional[torch.BoolTensor] = None
239
+ depth: Optional[torch.FloatTensor] = None
240
+ normal: Optional[torch.FloatTensor] = None
241
+ pos: Optional[torch.FloatTensor] = None
242
+
243
+
244
+ class NVDiffRastContextWrapper:
245
+ def __init__(self, device: str, context_type: str = "gl"):
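+ # "gl" uses nvdiffrast's OpenGL rasterizer (requires an EGL/OpenGL-capable environment); "cuda" uses the pure CUDA rasterizer.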
246
+ if context_type == "gl":
247
+ self.ctx = dr.RasterizeGLContext(device=device)
248
+ elif context_type == "cuda":
249
+ self.ctx = dr.RasterizeCudaContext(device=device)
250
+ else:
251
+ raise NotImplementedError
252
+
253
+ def rasterize(self, pos, tri, resolution, ranges=None, grad_db=True):
254
+ """
255
+ Rasterize triangles.
256
+
257
+ All input tensors must be contiguous and reside in GPU memory except for the ranges tensor that, if specified, has to reside in CPU memory. The output tensors will be contiguous and reside in GPU memory.
258
+
259
+ Arguments:
260
+ glctx Rasterizer context (RasterizeGLContext or RasterizeCudaContext); supplied internally by this wrapper via self.ctx.
261
+ pos Vertex position tensor with dtype torch.float32. To enable range mode, this tensor should have a 2D shape [num_vertices, 4]. To enable instanced mode, use a 3D shape [minibatch_size, num_vertices, 4].
262
+ tri Triangle tensor with shape [num_triangles, 3] and dtype torch.int32.
263
+ resolution Output resolution as integer tuple (height, width).
264
+ ranges In range mode, tensor with shape [minibatch_size, 2] and dtype torch.int32, specifying start indices and counts into tri. Ignored in instanced mode.
265
+ grad_db Propagate gradients of image-space derivatives of barycentrics into pos in backward pass. Ignored if using an OpenGL context that was not configured to output image-space derivatives.
266
+ Returns:
267
+ A tuple of two tensors. The first output tensor has shape [minibatch_size, height, width, 4] and contains the main rasterizer output in order (u, v, z/w, triangle_id). If the OpenGL context was configured to output image-space derivatives of barycentrics, the second output tensor will also have shape [minibatch_size, height, width, 4] and contain said derivatives in order (du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape [minibatch_size, height, width, 0].
268
+ """
269
+ return dr.rasterize(
270
+ self.ctx, pos.float(), tri.int(), resolution, ranges, grad_db
271
+ )
272
+
273
+ def interpolate(self, attr, rast, tri, rast_db=None, diff_attrs=None):
274
+ """
275
+ Interpolate vertex attributes.
276
+
277
+ All input tensors must be contiguous and reside in GPU memory. The output tensors will be contiguous and reside in GPU memory.
278
+
279
+ Arguments:
280
+ attr Attribute tensor with dtype torch.float32. Shape is [num_vertices, num_attributes] in range mode, or [minibatch_size, num_vertices, num_attributes] in instanced mode. Broadcasting is supported along the minibatch axis.
281
+ rast Main output tensor from rasterize().
282
+ tri Triangle tensor with shape [num_triangles, 3] and dtype torch.int32.
283
+ rast_db (Optional) Tensor containing image-space derivatives of barycentrics, i.e., the second output tensor from rasterize(). Enables computing image-space derivatives of attributes.
284
+ diff_attrs (Optional) List of attribute indices for which image-space derivatives are to be computed. Special value 'all' is equivalent to list [0, 1, ..., num_attributes - 1].
285
+ Returns:
286
+ A tuple of two tensors. The first output tensor contains interpolated attributes and has shape [minibatch_size, height, width, num_attributes]. If rast_db and diff_attrs were specified, the second output tensor contains the image-space derivatives of the selected attributes and has shape [minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc. Otherwise, the second output tensor will be an empty tensor with shape [minibatch_size, height, width, 0].
287
+ """
288
+ return dr.interpolate(attr.float(), rast, tri.int(), rast_db, diff_attrs)
289
+
290
+ def texture(
291
+ self,
292
+ tex,
293
+ uv,
294
+ uv_da=None,
295
+ mip_level_bias=None,
296
+ mip=None,
297
+ filter_mode="auto",
298
+ boundary_mode="wrap",
299
+ max_mip_level=None,
300
+ ):
301
+ """
302
+ Perform texture sampling.
303
+
304
+ All input tensors must be contiguous and reside in GPU memory. The output tensor will be contiguous and reside in GPU memory.
305
+
306
+ Arguments:
307
+ tex Texture tensor with dtype torch.float32. For 2D textures, must have shape [minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures, must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where tex_width and tex_height are equal. Note that boundary_mode must also be set to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis.
308
+ uv Tensor containing per-pixel texture coordinates. When sampling a 2D texture, must have shape [minibatch_size, height, width, 2]. When sampling a cube map texture, must have shape [minibatch_size, height, width, 3].
309
+ uv_da (Optional) Tensor containing image-space derivatives of texture coordinates. Must have same shape as uv except for the last dimension that is to be twice as long.
310
+ mip_level_bias (Optional) Per-pixel bias for mip level selection. If uv_da is omitted, determines mip level directly. Must have shape [minibatch_size, height, width].
311
+ mip (Optional) Preconstructed mipmap stack from a texture_construct_mip() call, or a list of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack, the tensors in the list must follow the same format as tex except for width and height that must follow the usual rules for mipmap sizes. The base level texture is still supplied in tex and must not be included in the list. Gradients of a custom mipmap stack are not automatically propagated to base texture but the mipmap tensors will receive gradients of their own. If a mipmap stack is not specified but the chosen filter mode requires it, the mipmap stack is constructed internally and discarded afterwards.
312
+ filter_mode Texture filtering mode to be used. Valid values are 'auto', 'nearest', 'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto' selects 'linear' if neither uv_da or mip_level_bias is specified, and 'linear-mipmap-linear' when at least one of them is specified, these being the highest-quality modes possible depending on the availability of the image-space derivatives of the texture coordinates or direct mip level information.
313
+ boundary_mode Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If tex defines a cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional part of texture coordinates. Mode 'clamp' clamps texture coordinates to the centers of the boundary texels. Mode 'zero' virtually extends the texture with all-zero values in all directions.
314
+ max_mip_level If specified, limits the number of mipmaps constructed and used in mipmap-based filter modes.
315
+ Returns:
316
+ A tensor containing the results of the texture sampling with shape [minibatch_size, height, width, tex_channels]. Cube map fetches with invalid uv coordinates (e.g., zero vectors) output all zeros and do not propagate gradients.
317
+ """
318
+ return dr.texture(
319
+ tex.float(),
320
+ uv.float(),
321
+ uv_da,
322
+ mip_level_bias,
323
+ mip,
324
+ filter_mode,
325
+ boundary_mode,
326
+ max_mip_level,
327
+ )
328
+
329
+ def antialias(
330
+ self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0
331
+ ):
332
+ """
333
+ Perform antialiasing.
334
+
335
+ All input tensors must be contiguous and reside in GPU memory. The output tensor will be contiguous and reside in GPU memory.
336
+
337
+ Note that silhouette edge determination is based on vertex indices in the triangle tensor. For it to work properly, a vertex belonging to multiple triangles must be referred to using the same vertex index in each triangle. Otherwise, nvdiffrast will always classify the adjacent edges as silhouette edges, which leads to bad performance and potentially incorrect gradients. If you are unsure whether your data is good, check which pixels are modified by the antialias operation and compare to the example in the documentation.
338
+
339
+ Arguments:
340
+ color Input image to antialias with shape [minibatch_size, height, width, num_channels].
341
+ rast Main output tensor from rasterize().
342
+ pos Vertex position tensor used in the rasterization operation.
343
+ tri Triangle tensor used in the rasterization operation.
344
+ topology_hash (Optional) Preconstructed topology hash for the triangle tensor. If not specified, the topology hash is constructed internally and discarded afterwards.
345
+ pos_gradient_boost (Optional) Multiplier for gradients propagated to pos.
346
+ Returns:
347
+ A tensor containing the antialiased image with the same shape as color input tensor.
348
+ """
349
+ return dr.antialias(
350
+ color.float(),
351
+ rast,
352
+ pos.float(),
353
+ tri.int(),
354
+ topology_hash,
355
+ pos_gradient_boost,
356
+ )
357
+
358
+
359
+ def get_clip_space_position(pos: torch.FloatTensor, mvp_mtx: torch.FloatTensor):
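+ # Append w = 1 and multiply by the transposed MVP matrix, giving clip-space positions of shape [batch, num_vertices, 4].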
360
+ pos_homo = torch.cat([pos, torch.ones([pos.shape[0], 1]).to(pos)], dim=-1)
361
+ return torch.matmul(pos_homo, mvp_mtx.permute(0, 2, 1))
362
+
363
+
364
+ def transform_points_homo(pos: torch.FloatTensor, mtx: torch.FloatTensor):
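+ # Apply a batched 4x4 transform to points of shape [batch, ..., 3] via homogeneous coordinates, returning the transformed 3D points.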
365
+ batch_size = pos.shape[0]
366
+ pos_shape = pos.shape[1:-1]
367
+ pos = pos.reshape(batch_size, -1, 3)
368
+ pos_homo = torch.cat([pos, torch.ones_like(pos[..., 0:1])], dim=-1)
369
+ pos = (pos_homo.unsqueeze(2) * mtx.unsqueeze(1)).sum(-1)[..., :3]
370
+ pos = pos.reshape(batch_size, *pos_shape, 3)
371
+ return pos
372
+
373
+
374
+ class DepthNormalizationStrategy(ABC):
375
+ @abstractmethod
376
+ def __init__(self, *args, **kwargs):
377
+ pass
378
+
379
+ @abstractmethod
380
+ def __call__(
381
+ self, depth: torch.FloatTensor, mask: torch.BoolTensor
382
+ ) -> torch.FloatTensor:
383
+ pass
384
+
385
+
386
+ class DepthControlNetNormalization(DepthNormalizationStrategy):
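+ # Inverted per-view min-max normalization: nearer surfaces map toward near_clip (brighter), farther toward far_clip, and background pixels are set to bg_value.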
387
+ def __init__(
388
+ self, far_clip: float = 0.25, near_clip: float = 1.0, bg_value: float = 0.0
389
+ ):
390
+ self.far_clip = far_clip
391
+ self.near_clip = near_clip
392
+ self.bg_value = bg_value
393
+
394
+ def __call__(
395
+ self, depth: torch.FloatTensor, mask: torch.BoolTensor
396
+ ) -> torch.FloatTensor:
397
+ batch_size = depth.shape[0]
398
+ min_depth = depth.view(batch_size, -1).min(dim=-1)[0][:, None, None]
399
+ max_depth = depth.view(batch_size, -1).max(dim=-1)[0][:, None, None]
400
+ depth = 1.0 - ((depth - min_depth) / (max_depth - min_depth + 1e-5)).clamp(
401
+ 0.0, 1.0
402
+ )
403
+ depth = depth * (self.near_clip - self.far_clip) + self.far_clip
404
+ depth[~mask] = self.bg_value
405
+ return depth
406
+
407
+
408
+ class Zero123PlusPlusNormalization(DepthNormalizationStrategy):
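+ # Plain per-view min-max normalization (near = 0, far = 1) with background pixels set to bg_value.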
409
+ def __init__(self, bg_value: float = 0.8):
410
+ self.bg_value = bg_value
411
+
412
+ def __call__(self, depth: torch.FloatTensor, mask: torch.BoolTensor) -> torch.FloatTensor:
413
+ batch_size = depth.shape[0]
414
+ min_depth = depth.view(batch_size, -1).min(dim=-1)[0][:, None, None]
415
+ max_depth = depth.view(batch_size, -1).max(dim=-1)[0][:, None, None]
416
+ depth = ((depth - min_depth) / (max_depth - min_depth + 1e-5)).clamp(0.0, 1.0)
417
+ depth[~mask] = self.bg_value
418
+ return depth
419
+
420
+
421
+ class SimpleNormalization(DepthNormalizationStrategy):
422
+ def __init__(
423
+ self,
424
+ scale: float = 1.0,
425
+ offset: float = -1.0,
426
+ clamp: bool = True,
427
+ bg_value: float = 1.0,
428
+ ):
429
+ self.scale = scale
430
+ self.offset = offset
431
+ self.clamp = clamp
432
+ self.bg_value = bg_value
433
+
434
+ def __call__(self, depth: torch.FloatTensor, mask: torch.BoolTensor) -> torch.FloatTensor:
435
+ depth = depth * self.scale + self.offset
436
+ if self.clamp:
437
+ depth = depth.clamp(0.0, 1.0)
438
+ depth[~mask] = self.bg_value
439
+ return depth
440
+
441
+
442
+ def render(
443
+ ctx: NVDiffRastContextWrapper,
444
+ mesh: TexturedMesh,
445
+ cam: Camera,
446
+ height: int,
447
+ width: int,
448
+ render_attr: bool = True,
449
+ render_depth: bool = True,
450
+ render_normal: bool = True,
451
+ depth_normalization_strategy: DepthNormalizationStrategy = DepthControlNetNormalization(),
452
+ attr_background: Union[float, torch.FloatTensor] = 0.5,
453
+ antialias_attr=False,
454
+ normal_background: Union[float, torch.FloatTensor] = 0.5,
455
+ texture_override=None,
456
+ texture_filter_mode: str = "linear",
457
+ ) -> RenderOutput:
458
+ output_dict = {}
459
+
460
+ v_pos_clip = get_clip_space_position(mesh.v_pos, cam.mvp_mtx)
461
+ rast, _ = ctx.rasterize(v_pos_clip, mesh.t_pos_idx, (height, width), grad_db=True)
462
+ mask = rast[..., 3] > 0
463
+
464
+ gb_pos, _ = ctx.interpolate(mesh.v_pos[None], rast, mesh.t_pos_idx)
465
+ output_dict.update({"mask": mask, "pos": gb_pos})
466
+
467
+ if render_depth:
468
+ gb_pos_vs = transform_points_homo(gb_pos, cam.w2c)
469
+ gb_depth = -gb_pos_vs[..., 2]
470
+ # set background pixels to min depth value for correct min/max calculation
471
+ gb_depth = torch.where(
472
+ mask,
473
+ gb_depth,
474
+ gb_depth.view(gb_depth.shape[0], -1).min(dim=-1)[0][:, None, None],
475
+ )
476
+ gb_depth = depth_normalization_strategy(gb_depth, mask)
477
+ output_dict["depth"] = gb_depth
478
+
479
+ if render_attr:
480
+ tex_c, _ = ctx.interpolate(mesh.v_tex[None], rast, mesh.t_tex_idx)
481
+ texture = (
482
+ texture_override[None]
483
+ if texture_override is not None
484
+ else mesh.texture[None]
485
+ )
486
+ gb_rgb_fg = ctx.texture(texture, tex_c, filter_mode=texture_filter_mode)
487
+ gb_rgb_bg = torch.ones_like(gb_rgb_fg) * attr_background
488
+ gb_rgb = torch.where(mask[..., None], gb_rgb_fg, gb_rgb_bg)
489
+ if antialias_attr:
490
+ gb_rgb = ctx.antialias(gb_rgb, rast, v_pos_clip, mesh.t_pos_idx)
491
+ output_dict["attr"] = gb_rgb
492
+
493
+ if render_normal:
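+ # Vertex normals live on the stitched (vertex-merged) topology, so interpolate them with the stitched face indices.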
494
+ gb_nrm, _ = ctx.interpolate(mesh.v_nrm[None], rast, mesh.stitched_t_pos_idx)
495
+ gb_nrm = F.normalize(gb_nrm, dim=-1, p=2)
496
+ gb_nrm[~mask] = normal_background
497
+ output_dict["normal"] = gb_nrm
498
+
499
+ return RenderOutput(**output_dict)
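+ 
+ # Example usage (a minimal sketch; the camera is assumed to come from mvadapter.utils.get_orthogonal_camera):
+ #   ctx = NVDiffRastContextWrapper(device="cuda", context_type="cuda")
+ #   mesh = load_mesh("assets/model.glb", rescale=True, device="cuda")  # hypothetical path
+ #   out = render(ctx, mesh, cam, height=768, width=768)
+ #   depth_maps, normal_maps = out.depth, out.normal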
mvadapter/utils/saving.py ADDED
@@ -0,0 +1,88 @@
1
+ import math
2
+ from typing import List, Optional, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ from PIL import Image
7
+
8
+
9
+ def tensor_to_image(
10
+ data: Union[Image.Image, torch.Tensor, np.ndarray],
11
+ batched: bool = False,
12
+ format: str = "HWC",
13
+ ) -> Union[Image.Image, List[Image.Image]]:
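+ # Float inputs are assumed to be in [0, 1]; boolean masks become 0/255; "CHW" tensors are transposed to HWC before conversion.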
14
+ if isinstance(data, Image.Image):
15
+ return data
16
+ if isinstance(data, torch.Tensor):
17
+ data = data.detach().cpu().numpy()
18
+ if data.dtype == np.float32 or data.dtype == np.float16:
19
+ data = (data * 255).astype(np.uint8)
20
+ elif data.dtype == np.bool_:
21
+ data = data.astype(np.uint8) * 255
22
+ assert data.dtype == np.uint8
23
+ if format == "CHW":
24
+ if batched and data.ndim == 4:
25
+ data = data.transpose((0, 2, 3, 1))
26
+ elif not batched and data.ndim == 3:
27
+ data = data.transpose((1, 2, 0))
28
+
29
+ if batched:
30
+ return [Image.fromarray(d) for d in data]
31
+ return Image.fromarray(data)
32
+
33
+
34
+ def largest_factor_near_sqrt(n: int) -> int:
35
+ """
36
+ Finds the largest factor of n that is closest to the square root of n.
37
+
38
+ Args:
39
+ n (int): The integer for which to find the largest factor near its square root.
40
+
41
+ Returns:
42
+ int: The largest factor of n that is closest to the square root of n.
43
+ """
44
+ sqrt_n = int(math.sqrt(n)) # Get the integer part of the square root
45
+
46
+ # First, check if the square root itself is a factor
47
+ if sqrt_n * sqrt_n == n:
48
+ return sqrt_n
49
+
50
+ # Otherwise, find the largest factor by iterating from sqrt_n downwards
51
+ for i in range(sqrt_n, 0, -1):
52
+ if n % i == 0:
53
+ return i
54
+
55
+ # If n is 1, return 1
56
+ return 1
57
+
58
+
59
+ def make_image_grid(
60
+ images: List[Image.Image],
61
+ rows: Optional[int] = None,
62
+ cols: Optional[int] = None,
63
+ resize: Optional[int] = None,
64
+ ) -> Image.Image:
65
+ """
66
+ Prepares a single grid of images. Useful for visualization purposes.
67
+ """
68
+ if rows is None and cols is not None:
69
+ assert len(images) % cols == 0
70
+ rows = len(images) // cols
71
+ elif cols is None and rows is not None:
72
+ assert len(images) % rows == 0
73
+ cols = len(images) // rows
74
+ elif rows is None and cols is None:
75
+ rows = largest_factor_near_sqrt(len(images))
76
+ cols = len(images) // rows
77
+
78
+ assert len(images) == rows * cols
79
+
80
+ if resize is not None:
81
+ images = [img.resize((resize, resize)) for img in images]
82
+
83
+ w, h = images[0].size
84
+ grid = Image.new("RGB", size=(cols * w, rows * h))
85
+
86
+ for i, img in enumerate(images):
87
+ grid.paste(img, box=(i % cols * w, i // cols * h))
88
+ return grid
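+ 
+ # Example (a minimal sketch): tile 6 view images into a 2x3 grid at 512 px per cell
+ #   grid = make_image_grid(view_images, rows=2, cols=3, resize=512)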
requirements.txt ADDED
@@ -0,0 +1,23 @@
1
+ torchvision
2
+ diffusers
3
+ transformers==4.49.0
4
+ einops
5
+ huggingface_hub
6
+ opencv-python
7
+ trimesh==4.5.3
8
+ omegaconf
9
+ scikit-image
10
+ numpy
11
+ peft
12
+ scipy==1.11.4
13
+ jaxtyping
14
+ typeguard
15
+ pymeshlab==2022.2.post4
16
+ open3d
17
+ timm
18
+ kornia
19
+ ninja
20
+ https://huggingface.co/spaces/JeffreyXiang/TRELLIS/resolve/main/wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl?download=true
21
+ cvcuda_cu12
22
+ gltflib
23
+ torch-cluster