blanchon commited on
Commit
a9af355
Β·
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Auto detect text files and perform LF normalization
2
+ *.7z filter=lfs diff=lfs merge=lfs -text
3
+ *.arrow filter=lfs diff=lfs merge=lfs -text
4
+ *.bin filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
7
+ *.ftz filter=lfs diff=lfs merge=lfs -text
8
+ *.gz filter=lfs diff=lfs merge=lfs -text
9
+ *.h5 filter=lfs diff=lfs merge=lfs -text
10
+ *.joblib filter=lfs diff=lfs merge=lfs -text
11
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ * text=auto
38
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+
3
+ .venv/
4
+ rgb2x/model_cache
5
+ x2rgb/model_cache
LICENSE ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ADOBE RESEARCH LICENSE
2
+
3
+ This license agreement (the β€œLicense”) between Adobe Inc., having a place of business at 345 Park Avenue, San Jose, California 95110-2704 (β€œAdobe”), and you, the individual or entity exercising rights under this License (β€œyou” or β€œyour”), sets forth the terms for your use of certain research materials that are owned by Adobe (the β€œLicensed Materials”). By exercising rights under this License, you accept and agree to be bound by its terms. If you are exercising rights under this license on behalf of an entity, then β€œyou” means you and such entity, and you (personally) represent and warrant that you (personally) have all necessary authority to bind that entity to the terms of this License.
4
+
5
+ 1. **GRANT OF LICENSE.**
6
+
7
+ 1.1. Adobe grants you a nonexclusive, worldwide, royalty-free, revocable, fully paid license to (A) reproduce, use, modify, and publicly display the Licensed Materials for noncommercial research purposes only; and (B) redistribute the Licensed Materials, and modifications or derivative works thereof, for noncommercial research purposes only, provided that you give recipients a copy of this License.
8
+
9
+ 1.2. You may add your own copyright statement to your modifications and may provide additional or different license terms for use, reproduction, modification, public display, and redistribution of your modifications and derivative works, provided that such license terms limit the use, reproduction, modification, public display, and redistribution of such modifications and derivative works to noncommercial research purposes only.
10
+
11
+ 1.3. For purposes of this License, noncommercial research purposes include academic research and teaching but do not include commercial licensing or distribution, development of commercial products, or any other activity which results in commercial gain.
12
+
13
+
14
+ 2. **OWNERSHIP AND ATTRIBUTION.** Adobe and its licensors own all right, title, and interest in the Licensed Materials. You must keep intact any copyright or other notices or disclaimers in the Licensed Materials.
15
+
16
+ 3. **DISCLAIMER OF WARRANTIES.** THE LICENSED MATERIALS ARE PROVIDED β€œAS IS” WITHOUT WARRANTY OF ANY KIND. THE ENTIRE RISK AS TO THE RESULTS AND PERFORMANCE OF THE LICENSED MATERIALS IS ASSUMED BY YOU. ADOBE DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED OR STATUTORY, WITH REGARD TO ANY LICENSED MATERIALS PROVIDED UNDER THIS LICENSE, INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT OF THIRD-PARTY RIGHTS.
17
+
18
+ 4. **LIMITATION OF LIABILITY.** IN NO EVENT WILL ADOBE BE LIABLE FOR ANY ACTUAL, INCIDENTAL, SPECIAL OR CONSEQUENTIAL DAMAGES OF ANY NATURE WHATSOEVER, INCLUDING WITHOUT LIMITATION, LOSS OF PROFITS OR OTHER COMMERCIAL LOSS, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ANY LICENSED MATERIALS PROVIDED UNDER THIS LICENSE, EVEN IF ADOBE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
19
+
20
+ 5. **TERM AND TERMINATION.**
21
+
22
+ 5.1. The License is effective upon acceptance by you and will remain in effect unless terminated earlier as permitted under this License.
23
+
24
+ 5.2. If you breach any material provision of this License, then your rights will terminate immediately.
25
+
26
+ 5.3. All clauses which by their nature should survive the termination of this License will survive such termination. In addition, and without limiting the generality of the preceding sentence, Sections 2 (Ownership and Attribution), 3 (Disclaimer of Warranties), and 4 (Limitation of Liability) will survive termination of this License.
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Rgbx
3
+ emoji: πŸš€
4
+ colorFrom: gray
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.5.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+ <h1 align="center"> RGB↔X: Image Decomposition and Synthesis Using Material- and Lighting-aware Diffusion Models </h1>
12
+
13
+ <p align="center"><a href="https://zheng95z.github.io/" target="_blank">Zheng Zeng</a>, <a href="https://valentin.deschaintre.fr/" target="_blank">Valentin Deschaintre</a>, <a href="https://www.iliyan.com/" target="_blank">Iliyan Georgiev</a>, <a href="https://yannickhold.com/" target="_blank">Yannick Hold-Geoffroy</a>, <a href="https://yiweihu.netlify.app/" target="_blank">Yiwei Hu</a>, <a href="https://luanfujun.com/" target="_blank">Fujun Luan</a>, <a href="https://sites.cs.ucsb.edu/~lingqi/" target="_blank">Ling-Qi Yan</a>, <a href="http://www.miloshasan.net/" target="_blank">MiloΕ‘ HaΕ‘an</a></p>
14
+
15
+ <p align="center">ACM SIGGRAPH 2024</p>
16
+
17
+ <p align="center"><img src="assets/rgbx24_teaser.png"></p>
18
+
19
+ The three areas of realistic forward rendering, per-pixel inverse rendering, and generative image synthesis may seem like separate and unrelated sub-fields of graphics and vision. However, recent work has demonstrated improved estimation of per-pixel intrinsic channels (albedo, roughness, metallicity) based on a diffusion architecture; we call this the RGB→X problem. We further show that the reverse problem of synthesizing realistic images given intrinsic channels, X→RGB, can also be addressed in a diffusion framework.
20
+
21
+ Focusing on the image domain of interior scenes, we introduce an improved diffusion model for RGB→X, which also estimates lighting, as well as the first diffusion X→RGB model capable of synthesizing realistic images from (full or partial) intrinsic channels. Our X→RGB model explores a middle ground between traditional rendering and generative models: we can specify only certain appearance properties that should be followed, and give freedom to the model to hallucinate a plausible version of the rest.
22
+
23
+ This flexibility makes it possible to use a mix of heterogeneous training datasets, which differ in the available channels. We use multiple existing datasets and extend them with our own synthetic and real data, resulting in a model capable of extracting scene properties better than previous work and of generating highly realistic images of interior scenes.
24
+
25
+ ## Structure
26
+ ```
27
+ β”œβ”€β”€ assets <- Assets used by the README.md
28
+ β”œβ”€β”€ rgb2x <- Code for the RGBβ†’X model
29
+ β”‚ β”œβ”€β”€ example <- Example photo
30
+ β”‚ └── model_cache <- Model weights (automatically downloaded when running the inference script)
31
+ β”œβ”€β”€ x2rgb <- Code for the Xβ†’RGB model
32
+ β”‚ β”œβ”€β”€ example <- Example photo
33
+ β”‚ └── model_cache <- Model weights (automatically downloaded when running the inference script)
34
+ β”œβ”€β”€ environment.yaml <- Env file for creating conda environment
35
+ β”œβ”€β”€ LICENSE
36
+ └── README.md
37
+ ```
38
+
39
+ ## Model Weights
40
+ You don't need to manually download the model weights. The weights will be downloaded automatically to `/rgb2x/model_cache/` and `/x2rgb/model_cache/` when you run the inference scripts.
41
+
42
+ You can manually acquire the weights by cloning the models from Hugging Face:
43
+ ```bash
44
+ git-lfs install
45
+ git clone https://huggingface.co/zheng95z/x-to-rgb
46
+ git clone https://huggingface.co/zheng95z/rgb-to-x
47
+ ```
48
+
49
+ ## Installation
50
+ Create a conda environment using the provided `environment.yaml` file.
51
+
52
+ ```bash
53
+ conda env create -n rgbx -f environment.yaml
54
+ conda activate rgbx
55
+ ```
56
+
57
+ Note that this environment is only compatible with NVIDIA GPUs. Additionally, we recommend using a GPU with a minimum of 12GB of memory.
58
+
59
+ ## Inference
60
+ When you run the inference scripts, gradio demos will be hosted on your local machine. You can access the demos by opening the URLs (shown in the terminal) in your browser.
61
+
62
+ ### RGB→X
63
+ ```bash
64
+ cd rgb2x
65
+ python gradio_demo_rgb2x.py
66
+ ```
67
+
68
+ **Please note that the metallicity channel prediction might behave differently between the demo and the paper. This is because the demo utilizes a checkpoint that predicts roughness and metallicity separately, whereas in the paper, we used a checkpoint where the roughness and metallicity channels were combined into a single RGB image (with the blue channel set to 0). Unfortunately, the latter checkpoint was lost during the transition between computing platforms, and we apologize for the inconvenience. We plan to resolve this issue and will provide an updated demo in the near future.**
69
+
70
+ ### X→RGB
71
+ ```bash
72
+ cd x2rgb
73
+ python gradio_demo_x2rgb.py
74
+ ```
75
+
76
+ ## Acknowledgements
77
+
78
+ This implementation builds upon Hugging Face’s [Diffusers](https://github.com/huggingface/diffusers) library. We also acknowledge [Gradio](https://www.gradio.app/) for providing an easy-to-use interface that allowed us to create the inference demos for our models.
assets/rgbx24_teaser.png ADDED

Git LFS Details

  • SHA256: 19c429cd26b2eb8dd9565d11d7a7a1107f350c82be9e9ef7c1e813e7b6eb43b4
  • Pointer size: 132 Bytes
  • Size of remote file: 2.05 MB
environment.yml ADDED
Binary file (648 Bytes). View file
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.5.1
2
+ torchaudio==2.5.1
3
+ torchvision==0.20.1
4
+ diffusers==0.20.0
5
+ gradio==5.5.0
6
+ imageio==2.34.1
7
+ numpy==1.26.4
8
+ opencv-python==4.9.0.80
9
+ transformers==4.40.2
10
+ spaces==0.30.4
rgb2x/example/Castlereagh_corridor_photo.png ADDED

Git LFS Details

  • SHA256: 8f77a445168dd92b97e214034f11291b8b3c0d98f3f12e34d591f56c39998fb4
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
rgb2x/gradio_demo_rgb2x.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import os
3
+ from typing import cast
4
+ import gradio as gr
5
+ from PIL import Image
6
+ import torch
7
+ import torchvision
8
+ from diffusers import DDIMScheduler
9
+ from load_image import load_exr_image, load_ldr_image
10
+ from pipeline_rgb2x import StableDiffusionAOVMatEstPipeline
11
+
12
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
13
+
14
+ current_directory = os.path.dirname(os.path.abspath(__file__))
15
+
16
+ _pipe = StableDiffusionAOVMatEstPipeline.from_pretrained(
17
+ "zheng95z/rgb-to-x",
18
+ torch_dtype=torch.float16,
19
+ cache_dir=os.path.join(current_directory, "model_cache"),
20
+ ).to("cuda")
21
+ pipe = cast(StableDiffusionAOVMatEstPipeline, _pipe)
22
+ pipe.scheduler = DDIMScheduler.from_config(
23
+ pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
24
+ )
25
+ pipe.set_progress_bar_config(disable=True)
26
+ pipe.to("cuda")
27
+ pipe = cast(StableDiffusionAOVMatEstPipeline, pipe)
28
+
29
+
30
+ @spaces.GPU
31
+ def generate(
32
+ photo,
33
+ seed: int,
34
+ inference_step: int,
35
+ num_samples: int,
36
+ ) -> list[Image.Image]:
37
+ generator = torch.Generator(device="cuda").manual_seed(seed)
38
+
39
+ if photo.name.endswith(".exr"):
40
+ photo = load_exr_image(photo.name, tonemaping=True, clamp=True).to("cuda")
41
+ elif (
42
+ photo.name.endswith(".png")
43
+ or photo.name.endswith(".jpg")
44
+ or photo.name.endswith(".jpeg")
45
+ ):
46
+ photo = load_ldr_image(photo.name, from_srgb=True).to("cuda")
47
+
48
+ # Check if the width and height are multiples of 8. If not, crop it using torchvision.transforms.CenterCrop
49
+ old_height = photo.shape[1]
50
+ old_width = photo.shape[2]
51
+ new_height = old_height
52
+ new_width = old_width
53
+ radio = old_height / old_width
54
+ max_side = 1000
55
+ if old_height > old_width:
56
+ new_height = max_side
57
+ new_width = int(new_height / radio)
58
+ else:
59
+ new_width = max_side
60
+ new_height = int(new_width * radio)
61
+
62
+ if new_width % 8 != 0 or new_height % 8 != 0:
63
+ new_width = new_width // 8 * 8
64
+ new_height = new_height // 8 * 8
65
+
66
+ photo = torchvision.transforms.Resize((new_height, new_width))(photo)
67
+
68
+ required_aovs = ["albedo", "normal", "roughness", "metallic", "irradiance"]
69
+ prompts = {
70
+ "albedo": "Albedo (diffuse basecolor)",
71
+ "normal": "Camera-space Normal",
72
+ "roughness": "Roughness",
73
+ "metallic": "Metallicness",
74
+ "irradiance": "Irradiance (diffuse lighting)",
75
+ }
76
+
77
+ return_list = []
78
+ for i in range(num_samples):
79
+ for aov_name in required_aovs:
80
+ prompt = prompts[aov_name]
81
+ generated_image = pipe(
82
+ prompt=prompt,
83
+ photo=photo,
84
+ num_inference_steps=inference_step,
85
+ height=new_height,
86
+ width=new_width,
87
+ generator=generator,
88
+ required_aovs=[aov_name],
89
+ ).images[0][0] # type: ignore
90
+
91
+ generated_image = torchvision.transforms.Resize((old_height, old_width))(
92
+ generated_image
93
+ )
94
+
95
+ generated_image = (generated_image, f"Generated {aov_name} {i}")
96
+ return_list.append(generated_image)
97
+
98
+ return return_list
99
+
100
+
101
+ with gr.Blocks() as demo:
102
+ with gr.Row():
103
+ gr.Markdown("## Model RGB -> X (Realistic image -> Intrinsic channels)")
104
+ with gr.Row():
105
+ # Input side
106
+ with gr.Column():
107
+ gr.Markdown("### Given Image")
108
+ photo = gr.File(label="Photo", file_types=[".exr", ".png", ".jpg"])
109
+
110
+ gr.Markdown("### Parameters")
111
+ run_button = gr.Button(value="Run")
112
+ with gr.Accordion("Advanced options", open=False):
113
+ seed = gr.Slider(
114
+ label="Seed",
115
+ minimum=-1,
116
+ maximum=2147483647,
117
+ step=1,
118
+ randomize=True,
119
+ )
120
+ inference_step = gr.Slider(
121
+ label="Inference Step",
122
+ minimum=1,
123
+ maximum=100,
124
+ step=1,
125
+ value=50,
126
+ )
127
+ num_samples = gr.Slider(
128
+ label="Samples",
129
+ minimum=1,
130
+ maximum=100,
131
+ step=1,
132
+ value=1,
133
+ )
134
+
135
+ # Output side
136
+ with gr.Column():
137
+ gr.Markdown("### Output Gallery")
138
+ result_gallery = gr.Gallery(
139
+ label="Output",
140
+ show_label=False,
141
+ elem_id="gallery",
142
+ columns=2,
143
+ )
144
+
145
+ run_button.click(
146
+ fn=generate,
147
+ inputs=[photo, seed, inference_step, num_samples],
148
+ outputs=result_gallery,
149
+ queue=True,
150
+ )
151
+
152
+
153
+ if __name__ == "__main__":
154
+ demo.launch(debug=False, share=False, show_api=False)
rgb2x/load_image.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import cv2
4
+ import torch
5
+
6
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
7
+ import numpy as np
8
+
9
+
10
+ def convert_rgb_2_XYZ(rgb):
11
+ # Reference: https://web.archive.org/web/20191027010220/http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
12
+ # rgb: (h, w, 3)
13
+ # XYZ: (h, w, 3)
14
+ XYZ = torch.ones_like(rgb)
15
+ XYZ[:, :, 0] = (
16
+ 0.4124564 * rgb[:, :, 0] + 0.3575761 * rgb[:, :, 1] + 0.1804375 * rgb[:, :, 2]
17
+ )
18
+ XYZ[:, :, 1] = (
19
+ 0.2126729 * rgb[:, :, 0] + 0.7151522 * rgb[:, :, 1] + 0.0721750 * rgb[:, :, 2]
20
+ )
21
+ XYZ[:, :, 2] = (
22
+ 0.0193339 * rgb[:, :, 0] + 0.1191920 * rgb[:, :, 1] + 0.9503041 * rgb[:, :, 2]
23
+ )
24
+ return XYZ
25
+
26
+
27
+ def convert_XYZ_2_Yxy(XYZ):
28
+ # XYZ: (h, w, 3)
29
+ # Yxy: (h, w, 3)
30
+ Yxy = torch.ones_like(XYZ)
31
+ Yxy[:, :, 0] = XYZ[:, :, 1]
32
+ sum = torch.sum(XYZ, dim=2)
33
+ inv_sum = 1.0 / torch.clamp(sum, min=1e-4)
34
+ Yxy[:, :, 1] = XYZ[:, :, 0] * inv_sum
35
+ Yxy[:, :, 2] = XYZ[:, :, 1] * inv_sum
36
+ return Yxy
37
+
38
+
39
+ def convert_rgb_2_Yxy(rgb):
40
+ # rgb: (h, w, 3)
41
+ # Yxy: (h, w, 3)
42
+ return convert_XYZ_2_Yxy(convert_rgb_2_XYZ(rgb))
43
+
44
+
45
+ def convert_XYZ_2_rgb(XYZ):
46
+ # XYZ: (h, w, 3)
47
+ # rgb: (h, w, 3)
48
+ rgb = torch.ones_like(XYZ)
49
+ rgb[:, :, 0] = (
50
+ 3.2404542 * XYZ[:, :, 0] - 1.5371385 * XYZ[:, :, 1] - 0.4985314 * XYZ[:, :, 2]
51
+ )
52
+ rgb[:, :, 1] = (
53
+ -0.9692660 * XYZ[:, :, 0] + 1.8760108 * XYZ[:, :, 1] + 0.0415560 * XYZ[:, :, 2]
54
+ )
55
+ rgb[:, :, 2] = (
56
+ 0.0556434 * XYZ[:, :, 0] - 0.2040259 * XYZ[:, :, 1] + 1.0572252 * XYZ[:, :, 2]
57
+ )
58
+ return rgb
59
+
60
+
61
+ def convert_Yxy_2_XYZ(Yxy):
62
+ # Yxy: (h, w, 3)
63
+ # XYZ: (h, w, 3)
64
+ XYZ = torch.ones_like(Yxy)
65
+ XYZ[:, :, 0] = Yxy[:, :, 1] / torch.clamp(Yxy[:, :, 2], min=1e-6) * Yxy[:, :, 0]
66
+ XYZ[:, :, 1] = Yxy[:, :, 0]
67
+ XYZ[:, :, 2] = (
68
+ (1.0 - Yxy[:, :, 1] - Yxy[:, :, 2])
69
+ / torch.clamp(Yxy[:, :, 2], min=1e-4)
70
+ * Yxy[:, :, 0]
71
+ )
72
+ return XYZ
73
+
74
+
75
+ def convert_Yxy_2_rgb(Yxy):
76
+ # Yxy: (h, w, 3)
77
+ # rgb: (h, w, 3)
78
+ return convert_XYZ_2_rgb(convert_Yxy_2_XYZ(Yxy))
79
+
80
+
81
+ def load_ldr_image(image_path, from_srgb=False, clamp=False, normalize=False):
82
+ # Load png or jpg image
83
+ image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
84
+ image = torch.from_numpy(image.astype(np.float32) / 255.0) # (h, w, c)
85
+ image[~torch.isfinite(image)] = 0
86
+ if from_srgb:
87
+ # Convert from sRGB to linear RGB
88
+ image = image**2.2
89
+ if clamp:
90
+ image = torch.clamp(image, min=0.0, max=1.0)
91
+ if normalize:
92
+ # Normalize to [-1, 1]
93
+ image = image * 2.0 - 1.0
94
+ image = torch.nn.functional.normalize(image, dim=-1, eps=1e-6)
95
+ return image.permute(2, 0, 1) # returns (c, h, w)
96
+
97
+
98
+ def load_exr_image(image_path, tonemaping=False, clamp=False, normalize=False):
99
+ image = cv2.cvtColor(cv2.imread(image_path, -1), cv2.COLOR_BGR2RGB)
100
+ image = torch.from_numpy(image.astype("float32")) # (h, w, c)
101
+ image[~torch.isfinite(image)] = 0
102
+ if tonemaping:
103
+ # Exposure adjuestment
104
+ image_Yxy = convert_rgb_2_Yxy(image)
105
+ lum = (
106
+ image[:, :, 0:1] * 0.2125
107
+ + image[:, :, 1:2] * 0.7154
108
+ + image[:, :, 2:3] * 0.0721
109
+ )
110
+ lum = torch.log(torch.clamp(lum, min=1e-6))
111
+ lum_mean = torch.exp(torch.mean(lum))
112
+ lp = image_Yxy[:, :, 0:1] * 0.18 / torch.clamp(lum_mean, min=1e-6)
113
+ image_Yxy[:, :, 0:1] = lp
114
+ image = convert_Yxy_2_rgb(image_Yxy)
115
+ if clamp:
116
+ image = torch.clamp(image, min=0.0, max=1.0)
117
+ if normalize:
118
+ image = torch.nn.functional.normalize(image, dim=-1, eps=1e-6)
119
+ return image.permute(2, 0, 1) # returns (c, h, w)
rgb2x/pipeline_rgb2x.py ADDED
@@ -0,0 +1,821 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ from dataclasses import dataclass
3
+ from typing import Callable, List, Optional, Union
4
+
5
+ import numpy as np
6
+ import PIL
7
+ import torch
8
+ from diffusers.configuration_utils import register_to_config
9
+ from diffusers.image_processor import VaeImageProcessor
10
+ from diffusers.loaders import (
11
+ LoraLoaderMixin,
12
+ TextualInversionLoaderMixin,
13
+ )
14
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
15
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
16
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
17
+ rescale_noise_cfg,
18
+ )
19
+ from diffusers.schedulers import KarrasDiffusionSchedulers
20
+ from diffusers.utils import (
21
+ CONFIG_NAME,
22
+ BaseOutput,
23
+ deprecate,
24
+ logging,
25
+ )
26
+ from diffusers.utils.torch_utils import randn_tensor
27
+ from transformers import CLIPTextModel, CLIPTokenizer
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+
32
+ class VaeImageProcrssorAOV(VaeImageProcessor):
33
+ """
34
+ Image processor for VAE AOV.
35
+
36
+ Args:
37
+ do_resize (`bool`, *optional*, defaults to `True`):
38
+ Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
39
+ vae_scale_factor (`int`, *optional*, defaults to `8`):
40
+ VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
41
+ resample (`str`, *optional*, defaults to `lanczos`):
42
+ Resampling filter to use when resizing the image.
43
+ do_normalize (`bool`, *optional*, defaults to `True`):
44
+ Whether to normalize the image to [-1,1].
45
+ """
46
+
47
+ config_name = CONFIG_NAME
48
+
49
+ @register_to_config
50
+ def __init__(
51
+ self,
52
+ do_resize: bool = True,
53
+ vae_scale_factor: int = 8,
54
+ resample: str = "lanczos",
55
+ do_normalize: bool = True,
56
+ ):
57
+ super().__init__()
58
+
59
+ def postprocess(
60
+ self,
61
+ image: torch.FloatTensor,
62
+ output_type: str = "pil",
63
+ do_denormalize: Optional[List[bool]] = None,
64
+ do_gamma_correction: bool = True,
65
+ ):
66
+ if not isinstance(image, torch.Tensor):
67
+ raise ValueError(
68
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
69
+ )
70
+ if output_type not in ["latent", "pt", "np", "pil"]:
71
+ deprecation_message = (
72
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
73
+ "`pil`, `np`, `pt`, `latent`"
74
+ )
75
+ deprecate(
76
+ "Unsupported output_type",
77
+ "1.0.0",
78
+ deprecation_message,
79
+ standard_warn=False,
80
+ )
81
+ output_type = "np"
82
+
83
+ if output_type == "latent":
84
+ return image
85
+
86
+ if do_denormalize is None:
87
+ do_denormalize = [self.config.do_normalize] * image.shape[0]
88
+
89
+ image = torch.stack(
90
+ [
91
+ self.denormalize(image[i]) if do_denormalize[i] else image[i]
92
+ for i in range(image.shape[0])
93
+ ]
94
+ )
95
+
96
+ # Gamma correction
97
+ if do_gamma_correction:
98
+ image = torch.pow(image, 1.0 / 2.2)
99
+
100
+ if output_type == "pt":
101
+ return image
102
+
103
+ image = self.pt_to_numpy(image)
104
+
105
+ if output_type == "np":
106
+ return image
107
+
108
+ if output_type == "pil":
109
+ return self.numpy_to_pil(image)
110
+
111
+ def preprocess_normal(
112
+ self,
113
+ image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
114
+ height: Optional[int] = None,
115
+ width: Optional[int] = None,
116
+ ) -> torch.Tensor:
117
+ image = torch.stack([image], axis=0)
118
+ return image
119
+
120
+
121
+ @dataclass
122
+ class StableDiffusionAOVPipelineOutput(BaseOutput):
123
+ """
124
+ Output class for Stable Diffusion AOV pipelines.
125
+
126
+ Args:
127
+ images (`List[PIL.Image.Image]` or `np.ndarray`)
128
+ List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
129
+ num_channels)`.
130
+ nsfw_content_detected (`List[bool]`)
131
+ List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
132
+ `None` if safety checking could not be performed.
133
+ """
134
+
135
+ images: Union[List[PIL.Image.Image], np.ndarray]
136
+
137
+
138
+ class StableDiffusionAOVMatEstPipeline(
139
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin
140
+ ):
141
+ r"""
142
+ Pipeline for AOVs.
143
+
144
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
145
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
146
+
147
+ The pipeline also inherits the following loading methods:
148
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
149
+ - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
150
+ - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
151
+
152
+ Args:
153
+ vae ([`AutoencoderKL`]):
154
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
155
+ text_encoder ([`~transformers.CLIPTextModel`]):
156
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
157
+ tokenizer ([`~transformers.CLIPTokenizer`]):
158
+ A `CLIPTokenizer` to tokenize text.
159
+ unet ([`UNet2DConditionModel`]):
160
+ A `UNet2DConditionModel` to denoise the encoded image latents.
161
+ scheduler ([`SchedulerMixin`]):
162
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
163
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
164
+ """
165
+
166
+ def __init__(
167
+ self,
168
+ vae: AutoencoderKL,
169
+ text_encoder: CLIPTextModel,
170
+ tokenizer: CLIPTokenizer,
171
+ unet: UNet2DConditionModel,
172
+ scheduler: KarrasDiffusionSchedulers,
173
+ ):
174
+ super().__init__()
175
+
176
+ self.register_modules(
177
+ vae=vae,
178
+ text_encoder=text_encoder,
179
+ tokenizer=tokenizer,
180
+ unet=unet,
181
+ scheduler=scheduler,
182
+ )
183
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
184
+ self.image_processor = VaeImageProcrssorAOV(
185
+ vae_scale_factor=self.vae_scale_factor
186
+ )
187
+ self.register_to_config()
188
+
189
+ def _encode_prompt(
190
+ self,
191
+ prompt,
192
+ device,
193
+ num_images_per_prompt,
194
+ do_classifier_free_guidance,
195
+ negative_prompt=None,
196
+ prompt_embeds: Optional[torch.FloatTensor] = None,
197
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
198
+ ):
199
+ r"""
200
+ Encodes the prompt into text encoder hidden states.
201
+
202
+ Args:
203
+ prompt (`str` or `List[str]`, *optional*):
204
+ prompt to be encoded
205
+ device: (`torch.device`):
206
+ torch device
207
+ num_images_per_prompt (`int`):
208
+ number of images that should be generated per prompt
209
+ do_classifier_free_guidance (`bool`):
210
+ whether to use classifier free guidance or not
211
+ negative_ prompt (`str` or `List[str]`, *optional*):
212
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
213
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
214
+ less than `1`).
215
+ prompt_embeds (`torch.FloatTensor`, *optional*):
216
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
217
+ provided, text embeddings will be generated from `prompt` input argument.
218
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
219
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
220
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
221
+ argument.
222
+ """
223
+ if prompt is not None and isinstance(prompt, str):
224
+ batch_size = 1
225
+ elif prompt is not None and isinstance(prompt, list):
226
+ batch_size = len(prompt)
227
+ else:
228
+ batch_size = prompt_embeds.shape[0]
229
+
230
+ if prompt_embeds is None:
231
+ # textual inversion: procecss multi-vector tokens if necessary
232
+ if isinstance(self, TextualInversionLoaderMixin):
233
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
234
+
235
+ text_inputs = self.tokenizer(
236
+ prompt,
237
+ padding="max_length",
238
+ max_length=self.tokenizer.model_max_length,
239
+ truncation=True,
240
+ return_tensors="pt",
241
+ )
242
+ text_input_ids = text_inputs.input_ids
243
+ untruncated_ids = self.tokenizer(
244
+ prompt, padding="longest", return_tensors="pt"
245
+ ).input_ids
246
+
247
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[
248
+ -1
249
+ ] and not torch.equal(text_input_ids, untruncated_ids):
250
+ removed_text = self.tokenizer.batch_decode(
251
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
252
+ )
253
+ logger.warning(
254
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
255
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
256
+ )
257
+
258
+ if (
259
+ hasattr(self.text_encoder.config, "use_attention_mask")
260
+ and self.text_encoder.config.use_attention_mask
261
+ ):
262
+ attention_mask = text_inputs.attention_mask.to(device)
263
+ else:
264
+ attention_mask = None
265
+
266
+ prompt_embeds = self.text_encoder(
267
+ text_input_ids.to(device),
268
+ attention_mask=attention_mask,
269
+ )
270
+ prompt_embeds = prompt_embeds[0]
271
+
272
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
273
+
274
+ bs_embed, seq_len, _ = prompt_embeds.shape
275
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
276
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
277
+ prompt_embeds = prompt_embeds.view(
278
+ bs_embed * num_images_per_prompt, seq_len, -1
279
+ )
280
+
281
+ # get unconditional embeddings for classifier free guidance
282
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
283
+ uncond_tokens: List[str]
284
+ if negative_prompt is None:
285
+ uncond_tokens = [""] * batch_size
286
+ elif type(prompt) is not type(negative_prompt):
287
+ raise TypeError(
288
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
289
+ f" {type(prompt)}."
290
+ )
291
+ elif isinstance(negative_prompt, str):
292
+ uncond_tokens = [negative_prompt]
293
+ elif batch_size != len(negative_prompt):
294
+ raise ValueError(
295
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
296
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
297
+ " the batch size of `prompt`."
298
+ )
299
+ else:
300
+ uncond_tokens = negative_prompt
301
+
302
+ # textual inversion: procecss multi-vector tokens if necessary
303
+ if isinstance(self, TextualInversionLoaderMixin):
304
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
305
+
306
+ max_length = prompt_embeds.shape[1]
307
+ uncond_input = self.tokenizer(
308
+ uncond_tokens,
309
+ padding="max_length",
310
+ max_length=max_length,
311
+ truncation=True,
312
+ return_tensors="pt",
313
+ )
314
+
315
+ if (
316
+ hasattr(self.text_encoder.config, "use_attention_mask")
317
+ and self.text_encoder.config.use_attention_mask
318
+ ):
319
+ attention_mask = uncond_input.attention_mask.to(device)
320
+ else:
321
+ attention_mask = None
322
+
323
+ negative_prompt_embeds = self.text_encoder(
324
+ uncond_input.input_ids.to(device),
325
+ attention_mask=attention_mask,
326
+ )
327
+ negative_prompt_embeds = negative_prompt_embeds[0]
328
+
329
+ if do_classifier_free_guidance:
330
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
331
+ seq_len = negative_prompt_embeds.shape[1]
332
+
333
+ negative_prompt_embeds = negative_prompt_embeds.to(
334
+ dtype=self.text_encoder.dtype, device=device
335
+ )
336
+
337
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
338
+ 1, num_images_per_prompt, 1
339
+ )
340
+ negative_prompt_embeds = negative_prompt_embeds.view(
341
+ batch_size * num_images_per_prompt, seq_len, -1
342
+ )
343
+
344
+ # For classifier free guidance, we need to do two forward passes.
345
+ # Here we concatenate the unconditional and text embeddings into a single batch
346
+ # to avoid doing two forward passes
347
+ # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
348
+ prompt_embeds = torch.cat(
349
+ [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
350
+ )
351
+
352
+ return prompt_embeds
353
+
354
+ def prepare_extra_step_kwargs(self, generator, eta):
355
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
356
+ # eta (Ξ·) is only used with the DDIMScheduler, it will be ignored for other schedulers.
357
+ # eta corresponds to Ξ· in DDIM paper: https://arxiv.org/abs/2010.02502
358
+ # and should be between [0, 1]
359
+
360
+ accepts_eta = "eta" in set(
361
+ inspect.signature(self.scheduler.step).parameters.keys()
362
+ )
363
+ extra_step_kwargs = {}
364
+ if accepts_eta:
365
+ extra_step_kwargs["eta"] = eta
366
+
367
+ # check if the scheduler accepts generator
368
+ accepts_generator = "generator" in set(
369
+ inspect.signature(self.scheduler.step).parameters.keys()
370
+ )
371
+ if accepts_generator:
372
+ extra_step_kwargs["generator"] = generator
373
+ return extra_step_kwargs
374
+
375
+ def check_inputs(
376
+ self,
377
+ prompt,
378
+ callback_steps,
379
+ negative_prompt=None,
380
+ prompt_embeds=None,
381
+ negative_prompt_embeds=None,
382
+ ):
383
+ if (callback_steps is None) or (
384
+ callback_steps is not None
385
+ and (not isinstance(callback_steps, int) or callback_steps <= 0)
386
+ ):
387
+ raise ValueError(
388
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
389
+ f" {type(callback_steps)}."
390
+ )
391
+
392
+ if prompt is not None and prompt_embeds is not None:
393
+ raise ValueError(
394
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
395
+ " only forward one of the two."
396
+ )
397
+ elif prompt is None and prompt_embeds is None:
398
+ raise ValueError(
399
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
400
+ )
401
+ elif prompt is not None and (
402
+ not isinstance(prompt, str) and not isinstance(prompt, list)
403
+ ):
404
+ raise ValueError(
405
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
406
+ )
407
+
408
+ if negative_prompt is not None and negative_prompt_embeds is not None:
409
+ raise ValueError(
410
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
411
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
412
+ )
413
+
414
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
415
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
416
+ raise ValueError(
417
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
418
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
419
+ f" {negative_prompt_embeds.shape}."
420
+ )
421
+
422
+ def prepare_latents(
423
+ self,
424
+ batch_size,
425
+ num_channels_latents,
426
+ height,
427
+ width,
428
+ dtype,
429
+ device,
430
+ generator,
431
+ latents=None,
432
+ ):
433
+ shape = (
434
+ batch_size,
435
+ num_channels_latents,
436
+ height // self.vae_scale_factor,
437
+ width // self.vae_scale_factor,
438
+ )
439
+ if isinstance(generator, list) and len(generator) != batch_size:
440
+ raise ValueError(
441
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
442
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
443
+ )
444
+
445
+ if latents is None:
446
+ latents = randn_tensor(
447
+ shape, generator=generator, device=device, dtype=dtype
448
+ )
449
+ else:
450
+ latents = latents.to(device)
451
+
452
+ # scale the initial noise by the standard deviation required by the scheduler
453
+ latents = latents * self.scheduler.init_noise_sigma
454
+ return latents
455
+
456
+ def prepare_image_latents(
457
+ self,
458
+ image,
459
+ batch_size,
460
+ num_images_per_prompt,
461
+ dtype,
462
+ device,
463
+ do_classifier_free_guidance,
464
+ generator=None,
465
+ ):
466
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
467
+ raise ValueError(
468
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
469
+ )
470
+
471
+ image = image.to(device=device, dtype=dtype)
472
+
473
+ batch_size = batch_size * num_images_per_prompt
474
+
475
+ if image.shape[1] == 4:
476
+ image_latents = image
477
+ else:
478
+ if isinstance(generator, list) and len(generator) != batch_size:
479
+ raise ValueError(
480
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
481
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
482
+ )
483
+
484
+ if isinstance(generator, list):
485
+ image_latents = [
486
+ self.vae.encode(image[i : i + 1]).latent_dist.mode()
487
+ for i in range(batch_size)
488
+ ]
489
+ image_latents = torch.cat(image_latents, dim=0)
490
+ else:
491
+ image_latents = self.vae.encode(image).latent_dist.mode()
492
+
493
+ if (
494
+ batch_size > image_latents.shape[0]
495
+ and batch_size % image_latents.shape[0] == 0
496
+ ):
497
+ # expand image_latents for batch_size
498
+ deprecation_message = (
499
+ f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
500
+ " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
501
+ " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
502
+ " your script to pass as many initial images as text prompts to suppress this warning."
503
+ )
504
+ deprecate(
505
+ "len(prompt) != len(image)",
506
+ "1.0.0",
507
+ deprecation_message,
508
+ standard_warn=False,
509
+ )
510
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
511
+ image_latents = torch.cat(
512
+ [image_latents] * additional_image_per_prompt, dim=0
513
+ )
514
+ elif (
515
+ batch_size > image_latents.shape[0]
516
+ and batch_size % image_latents.shape[0] != 0
517
+ ):
518
+ raise ValueError(
519
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
520
+ )
521
+ else:
522
+ image_latents = torch.cat([image_latents], dim=0)
523
+
524
+ if do_classifier_free_guidance:
525
+ uncond_image_latents = torch.zeros_like(image_latents)
526
+ image_latents = torch.cat(
527
+ [image_latents, image_latents, uncond_image_latents], dim=0
528
+ )
529
+
530
+ return image_latents
531
+
532
+ @torch.no_grad()
533
+ def __call__(
534
+ self,
535
+ prompt: Union[str, List[str]] = None,
536
+ photo: Union[
537
+ torch.FloatTensor,
538
+ PIL.Image.Image,
539
+ np.ndarray,
540
+ List[torch.FloatTensor],
541
+ List[PIL.Image.Image],
542
+ List[np.ndarray],
543
+ ] = None,
544
+ height: Optional[int] = None,
545
+ width: Optional[int] = None,
546
+ num_inference_steps: int = 100,
547
+ required_aovs: List[str] = ["albedo"],
548
+ negative_prompt: Optional[Union[str, List[str]]] = None,
549
+ num_images_per_prompt: Optional[int] = 1,
550
+ use_default_scaling_factor: Optional[bool] = False,
551
+ guidance_scale: float = 0.0,
552
+ image_guidance_scale: float = 0.0,
553
+ guidance_rescale: float = 0.0,
554
+ eta: float = 0.0,
555
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
556
+ latents: Optional[torch.FloatTensor] = None,
557
+ prompt_embeds: Optional[torch.FloatTensor] = None,
558
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
559
+ output_type: Optional[str] = "pil",
560
+ return_dict: bool = True,
561
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
562
+ callback_steps: int = 1,
563
+ ):
564
+ r"""
565
+ The call function to the pipeline for generation.
566
+
567
+ Args:
568
+ prompt (`str` or `List[str]`, *optional*):
569
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
570
+ image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
571
+ `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept
572
+ image latents as `image`, but if passing latents directly it is not encoded again.
573
+ num_inference_steps (`int`, *optional*, defaults to 100):
574
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
575
+ expense of slower inference.
576
+ guidance_scale (`float`, *optional*, defaults to 7.5):
577
+ A higher guidance scale value encourages the model to generate images closely linked to the text
578
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
579
+ image_guidance_scale (`float`, *optional*, defaults to 1.5):
580
+ Push the generated image towards the inital `image`. Image guidance scale is enabled by setting
581
+ `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
582
+ linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
583
+ value of at least `1`.
584
+ negative_prompt (`str` or `List[str]`, *optional*):
585
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
586
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
587
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
588
+ The number of images to generate per prompt.
589
+ eta (`float`, *optional*, defaults to 0.0):
590
+ Corresponds to parameter eta (Ξ·) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
591
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
592
+ generator (`torch.Generator`, *optional*):
593
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
594
+ generation deterministic.
595
+ latents (`torch.FloatTensor`, *optional*):
596
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
597
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
598
+ tensor is generated by sampling using the supplied random `generator`.
599
+ prompt_embeds (`torch.FloatTensor`, *optional*):
600
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
601
+ provided, text embeddings are generated from the `prompt` input argument.
602
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
603
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
604
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
605
+ output_type (`str`, *optional*, defaults to `"pil"`):
606
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
607
+ return_dict (`bool`, *optional*, defaults to `True`):
608
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
609
+ plain tuple.
610
+ callback (`Callable`, *optional*):
611
+ A function that calls every `callback_steps` steps during inference. The function is called with the
612
+ following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
613
+ callback_steps (`int`, *optional*, defaults to 1):
614
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
615
+ every step.
616
+
617
+ Examples:
618
+
619
+ ```py
620
+ >>> import PIL
621
+ >>> import requests
622
+ >>> import torch
623
+ >>> from io import BytesIO
624
+
625
+ >>> from diffusers import StableDiffusionInstructPix2PixPipeline
626
+
627
+
628
+ >>> def download_image(url):
629
+ ... response = requests.get(url)
630
+ ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
631
+
632
+
633
+ >>> img_url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
634
+
635
+ >>> image = download_image(img_url).resize((512, 512))
636
+
637
+ >>> pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
638
+ ... "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
639
+ ... )
640
+ >>> pipe = pipe.to("cuda")
641
+
642
+ >>> prompt = "make the mountains snowy"
643
+ >>> image = pipe(prompt=prompt, image=image).images[0]
644
+ ```
645
+
646
+ Returns:
647
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
648
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
649
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
650
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
651
+ "not-safe-for-work" (nsfw) content.
652
+ """
653
+ # 0. Check inputs
654
+ self.check_inputs(
655
+ prompt,
656
+ callback_steps,
657
+ negative_prompt,
658
+ prompt_embeds,
659
+ negative_prompt_embeds,
660
+ )
661
+
662
+ # 1. Define call parameters
663
+ if prompt is not None and isinstance(prompt, str):
664
+ batch_size = 1
665
+ elif prompt is not None and isinstance(prompt, list):
666
+ batch_size = len(prompt)
667
+ else:
668
+ batch_size = prompt_embeds.shape[0]
669
+
670
+ device = self._execution_device
671
+ do_classifier_free_guidance = (
672
+ guidance_scale > 1.0 and image_guidance_scale >= 1.0
673
+ )
674
+ # check if scheduler is in sigmas space
675
+ scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas")
676
+
677
+ # 2. Encode input prompt
678
+ prompt_embeds = self._encode_prompt(
679
+ prompt,
680
+ device,
681
+ num_images_per_prompt,
682
+ do_classifier_free_guidance,
683
+ negative_prompt,
684
+ prompt_embeds=prompt_embeds,
685
+ negative_prompt_embeds=negative_prompt_embeds,
686
+ )
687
+
688
+ # 3. Preprocess image
689
+ # Normalize image to [-1,1]
690
+ preprocessed_photo = self.image_processor.preprocess(photo)
691
+
692
+ # 4. set timesteps
693
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
694
+ timesteps = self.scheduler.timesteps
695
+
696
+ # 5. Prepare Image latents
697
+ image_latents = self.prepare_image_latents(
698
+ preprocessed_photo,
699
+ batch_size,
700
+ num_images_per_prompt,
701
+ prompt_embeds.dtype,
702
+ device,
703
+ do_classifier_free_guidance,
704
+ generator,
705
+ )
706
+ image_latents = image_latents * self.vae.config.scaling_factor
707
+
708
+ height, width = image_latents.shape[-2:]
709
+ height = height * self.vae_scale_factor
710
+ width = width * self.vae_scale_factor
711
+
712
+ # 6. Prepare latent variables
713
+ num_channels_latents = self.unet.config.out_channels
714
+ latents = self.prepare_latents(
715
+ batch_size * num_images_per_prompt,
716
+ num_channels_latents,
717
+ height,
718
+ width,
719
+ prompt_embeds.dtype,
720
+ device,
721
+ generator,
722
+ latents,
723
+ )
724
+
725
+ # 7. Check that shapes of latents and image match the UNet channels
726
+ num_channels_image = image_latents.shape[1]
727
+ if num_channels_latents + num_channels_image != self.unet.config.in_channels:
728
+ raise ValueError(
729
+ f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
730
+ f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
731
+ f" `num_channels_image`: {num_channels_image} "
732
+ f" = {num_channels_latents+num_channels_image}. Please verify the config of"
733
+ " `pipeline.unet` or your `image` input."
734
+ )
735
+
736
+ # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
737
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
738
+
739
+ # 9. Denoising loop
740
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
741
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
742
+ for i, t in enumerate(timesteps):
743
+ # Expand the latents if we are doing classifier free guidance.
744
+ # The latents are expanded 3 times because for pix2pix the guidance\
745
+ # is applied for both the text and the input image.
746
+ latent_model_input = (
747
+ torch.cat([latents] * 3) if do_classifier_free_guidance else latents
748
+ )
749
+
750
+ # concat latents, image_latents in the channel dimension
751
+ scaled_latent_model_input = self.scheduler.scale_model_input(
752
+ latent_model_input, t
753
+ )
754
+ scaled_latent_model_input = torch.cat(
755
+ [scaled_latent_model_input, image_latents], dim=1
756
+ )
757
+
758
+ # predict the noise residual
759
+ noise_pred = self.unet(
760
+ scaled_latent_model_input,
761
+ t,
762
+ encoder_hidden_states=prompt_embeds,
763
+ return_dict=False,
764
+ )[0]
765
+
766
+ # perform guidance
767
+ if do_classifier_free_guidance:
768
+ (
769
+ noise_pred_text,
770
+ noise_pred_image,
771
+ noise_pred_uncond,
772
+ ) = noise_pred.chunk(3)
773
+ noise_pred = (
774
+ noise_pred_uncond
775
+ + guidance_scale * (noise_pred_text - noise_pred_image)
776
+ + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
777
+ )
778
+
779
+ if do_classifier_free_guidance and guidance_rescale > 0.0:
780
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
781
+ noise_pred = rescale_noise_cfg(
782
+ noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
783
+ )
784
+
785
+ # compute the previous noisy sample x_t -> x_t-1
786
+ latents = self.scheduler.step(
787
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
788
+ )[0]
789
+
790
+ # call the callback, if provided
791
+ if i == len(timesteps) - 1 or (
792
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
793
+ ):
794
+ progress_bar.update()
795
+ if callback is not None and i % callback_steps == 0:
796
+ callback(i, t, latents)
797
+
798
+ aov_latents = latents / self.vae.config.scaling_factor
799
+ aov = self.vae.decode(aov_latents, return_dict=False)[0]
800
+ do_denormalize = [True] * aov.shape[0]
801
+ aov_name = required_aovs[0]
802
+ if aov_name == "albedo" or aov_name == "irradiance":
803
+ do_gamma_correction = True
804
+ else:
805
+ do_gamma_correction = False
806
+
807
+ if aov_name == "roughness" or aov_name == "metallic":
808
+ aov = aov[:, 0:1].repeat(1, 3, 1, 1)
809
+
810
+ aov = self.image_processor.postprocess(
811
+ aov,
812
+ output_type=output_type,
813
+ do_denormalize=do_denormalize,
814
+ do_gamma_correction=do_gamma_correction,
815
+ )
816
+ aovs = [aov]
817
+
818
+ # Offload last model to CPU
819
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
820
+ self.final_offload_hook.offload()
821
+ return StableDiffusionAOVPipelineOutput(images=aovs)
x2rgb/example/kitchen-albedo.png ADDED

Git LFS Details

  • SHA256: d2b3e2ae5001c4214d5c87041e57933708cbb424eca6f7a2659c2c3e91a6a8ce
  • Pointer size: 131 Bytes
  • Size of remote file: 313 kB
x2rgb/example/kitchen-irradiance.png ADDED

Git LFS Details

  • SHA256: 259b873bba6405d72a87a30321f4572a47d296726368eba2f0303d4ab3bcd269
  • Pointer size: 131 Bytes
  • Size of remote file: 959 kB
x2rgb/example/kitchen-metallic.png ADDED

Git LFS Details

  • SHA256: cd6fec250659c8915c821b9063b851da4da59c2459c56fa08338cb81c5e6b70d
  • Pointer size: 130 Bytes
  • Size of remote file: 33.9 kB
x2rgb/example/kitchen-normal.png ADDED

Git LFS Details

  • SHA256: abf769887d2ee8fa050f56f50285502fcf8dbb8b69c28c1f3910ad1ee2874068
  • Pointer size: 131 Bytes
  • Size of remote file: 415 kB
x2rgb/example/kitchen-ref.png ADDED

Git LFS Details

  • SHA256: 19e57fc6737291cb59611786c9894fd4c2bedb0ba14b875942241195afff3534
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
x2rgb/example/kitchen-roughness.png ADDED

Git LFS Details

  • SHA256: 9d1195686031d170151798b00d095c48a40e1a8a508d65c5a841fcabb0ae8fad
  • Pointer size: 130 Bytes
  • Size of remote file: 84 kB
x2rgb/gradio_demo_x2rgb.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import os
3
+ from typing import cast
4
+ import gradio as gr
5
+ import numpy as np
6
+ import torch
7
+ from PIL import Image
8
+ from diffusers import DDIMScheduler
9
+ from load_image import load_exr_image, load_ldr_image
10
+ from pipeline_x2rgb import StableDiffusionAOVDropoutPipeline
11
+
12
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
13
+
14
+ current_directory = os.path.dirname(os.path.abspath(__file__))
15
+
16
+ _pipe = StableDiffusionAOVDropoutPipeline.from_pretrained(
17
+ "zheng95z/x-to-rgb",
18
+ torch_dtype=torch.float16,
19
+ cache_dir=os.path.join(current_directory, "model_cache"),
20
+ ).to("cuda")
21
+ pipe = cast(StableDiffusionAOVDropoutPipeline, _pipe)
22
+ pipe.scheduler = DDIMScheduler.from_config(
23
+ pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
24
+ )
25
+ pipe.set_progress_bar_config(disable=True)
26
+ pipe.to("cuda")
27
+ pipe = cast(StableDiffusionAOVDropoutPipeline, pipe)
28
+
29
+
30
+ @spaces.GPU
31
+ def generate(
32
+ albedo,
33
+ normal,
34
+ roughness,
35
+ metallic,
36
+ irradiance,
37
+ prompt: str,
38
+ seed: int,
39
+ inference_step: int,
40
+ num_samples: int,
41
+ guidance_scale: float,
42
+ image_guidance_scale: float,
43
+ ) -> list[Image.Image]:
44
+ generator = torch.Generator(device="cuda").manual_seed(seed)
45
+
46
+ # Load and process each intrinsic channel image
47
+ def process_image(file, **kwargs):
48
+ if file is None:
49
+ return None
50
+ if file.name.endswith(".exr"):
51
+ return load_exr_image(file.name, **kwargs).to("cuda")
52
+ elif file.name.endswith((".png", ".jpg", ".jpeg")):
53
+ return load_ldr_image(file.name, **kwargs).to("cuda")
54
+ return None
55
+
56
+ albedo_image = process_image(albedo, clamp=True)
57
+ normal_image = process_image(normal, normalize=True)
58
+ roughness_image = process_image(roughness, clamp=True)
59
+ metallic_image = process_image(metallic, clamp=True)
60
+ irradiance_image = process_image(irradiance, tonemaping=True, clamp=True)
61
+
62
+ # Set default height and width based on the first available image
63
+ height, width = 768, 768
64
+ for img in [
65
+ albedo_image,
66
+ normal_image,
67
+ roughness_image,
68
+ metallic_image,
69
+ irradiance_image,
70
+ ]:
71
+ if img is not None:
72
+ height, width = img.shape[1], img.shape[2]
73
+ break
74
+
75
+ required_aovs = ["albedo", "normal", "roughness", "metallic", "irradiance"]
76
+ return_list = []
77
+
78
+ for i in range(num_samples):
79
+ generated_image = pipe(
80
+ prompt=prompt,
81
+ albedo=albedo_image,
82
+ normal=normal_image,
83
+ roughness=roughness_image,
84
+ metallic=metallic_image,
85
+ irradiance=irradiance_image,
86
+ num_inference_steps=inference_step,
87
+ height=height,
88
+ width=width,
89
+ generator=generator,
90
+ required_aovs=required_aovs,
91
+ guidance_scale=guidance_scale,
92
+ image_guidance_scale=image_guidance_scale,
93
+ guidance_rescale=0.7,
94
+ output_type="np",
95
+ ).images[0] # type: ignore
96
+
97
+ return_list.append((generated_image, f"Generated Image {i}"))
98
+
99
+ # Append additional images to the output gallery
100
+ def post_process_image(img, **kwargs):
101
+ if img is not None:
102
+ return (img.cpu().numpy().transpose(1, 2, 0), kwargs.get("label", "Image"))
103
+ return np.zeros((height, width, 3))
104
+
105
+ return_list.extend(
106
+ [
107
+ post_process_image(albedo_image, label="Albedo"),
108
+ post_process_image(normal_image, label="Normal"),
109
+ post_process_image(roughness_image, label="Roughness"),
110
+ post_process_image(metallic_image, label="Metallic"),
111
+ post_process_image(irradiance_image, label="Irradiance"),
112
+ ]
113
+ )
114
+
115
+ return return_list
116
+
117
+
118
+ with gr.Blocks() as demo:
119
+ with gr.Row():
120
+ gr.Markdown("## Model X -> RGB (Intrinsic channels -> realistic image)")
121
+ with gr.Row():
122
+ # Input side
123
+ with gr.Column():
124
+ gr.Markdown("### Given intrinsic channels")
125
+ albedo = gr.File(label="Albedo", file_types=[".exr", ".png", ".jpg"])
126
+ normal = gr.File(label="Normal", file_types=[".exr", ".png", ".jpg"])
127
+ roughness = gr.File(label="Roughness", file_types=[".exr", ".png", ".jpg"])
128
+ metallic = gr.File(label="Metallic", file_types=[".exr", ".png", ".jpg"])
129
+ irradiance = gr.File(
130
+ label="Irradiance", file_types=[".exr", ".png", ".jpg"]
131
+ )
132
+
133
+ gr.Markdown("### Parameters")
134
+ prompt = gr.Textbox(label="Prompt")
135
+ run_button = gr.Button(value="Run")
136
+ with gr.Accordion("Advanced options", open=False):
137
+ seed = gr.Slider(
138
+ label="Seed",
139
+ minimum=-1,
140
+ maximum=2147483647,
141
+ step=1,
142
+ randomize=True,
143
+ )
144
+ inference_step = gr.Slider(
145
+ label="Inference Step",
146
+ minimum=1,
147
+ maximum=100,
148
+ step=1,
149
+ value=50,
150
+ )
151
+ num_samples = gr.Slider(
152
+ label="Samples",
153
+ minimum=1,
154
+ maximum=100,
155
+ step=1,
156
+ value=1,
157
+ )
158
+ guidance_scale = gr.Slider(
159
+ label="Guidance Scale",
160
+ minimum=0.0,
161
+ maximum=10.0,
162
+ step=0.1,
163
+ value=7.5,
164
+ )
165
+ image_guidance_scale = gr.Slider(
166
+ label="Image Guidance Scale",
167
+ minimum=0.0,
168
+ maximum=10.0,
169
+ step=0.1,
170
+ value=1.5,
171
+ )
172
+
173
+ # Output side
174
+ with gr.Column():
175
+ gr.Markdown("### Output Gallery")
176
+ result_gallery = gr.Gallery(
177
+ label="Output",
178
+ show_label=False,
179
+ elem_id="gallery",
180
+ columns=2,
181
+ )
182
+
183
+ run_button.click(
184
+ fn=generate,
185
+ inputs=[
186
+ albedo,
187
+ normal,
188
+ roughness,
189
+ metallic,
190
+ irradiance,
191
+ prompt,
192
+ seed,
193
+ inference_step,
194
+ num_samples,
195
+ guidance_scale,
196
+ image_guidance_scale,
197
+ ],
198
+ outputs=result_gallery,
199
+ queue=True,
200
+ )
201
+
202
+
203
+ if __name__ == "__main__":
204
+ demo.launch(debug=False, share=False, show_api=False)
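Since `generate` only reads uploaded files through their `.name` attribute, the demo can also be driven without the UI. A hedged sketch, assuming the script is importable (importing it downloads the model and requires a CUDA device) and is run from the x2rgb directory; the `upload` helper is illustrative, not part of the repository:

from types import SimpleNamespace

from gradio_demo_x2rgb import generate

def upload(path: str):
    # gr.File values are only used via `.name`, so a tiny stand-in object suffices
    return SimpleNamespace(name=path)

results = generate(
    albedo=upload("example/kitchen-albedo.png"),
    normal=upload("example/kitchen-normal.png"),
    roughness=upload("example/kitchen-roughness.png"),
    metallic=upload("example/kitchen-metallic.png"),
    irradiance=upload("example/kitchen-irradiance.png"),
    prompt="a photo of a modern kitchen",
    seed=42,
    inference_step=50,
    num_samples=1,
    guidance_scale=7.5,
    image_guidance_scale=1.5,
)
# the first entry is the generated sample; the remaining entries are the conditioning channels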
x2rgb/load_image.py ADDED
@@ -0,0 +1,119 @@
1
+ import os
2
+
3
+ import cv2
4
+ import torch
5
+
6
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
7
+ import numpy as np
8
+
9
+
10
+ def convert_rgb_2_XYZ(rgb):
11
+ # Reference: https://web.archive.org/web/20191027010220/http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
12
+ # rgb: (h, w, 3)
13
+ # XYZ: (h, w, 3)
14
+ XYZ = torch.ones_like(rgb)
15
+ XYZ[:, :, 0] = (
16
+ 0.4124564 * rgb[:, :, 0] + 0.3575761 * rgb[:, :, 1] + 0.1804375 * rgb[:, :, 2]
17
+ )
18
+ XYZ[:, :, 1] = (
19
+ 0.2126729 * rgb[:, :, 0] + 0.7151522 * rgb[:, :, 1] + 0.0721750 * rgb[:, :, 2]
20
+ )
21
+ XYZ[:, :, 2] = (
22
+ 0.0193339 * rgb[:, :, 0] + 0.1191920 * rgb[:, :, 1] + 0.9503041 * rgb[:, :, 2]
23
+ )
24
+ return XYZ
25
+
26
+
27
+ def convert_XYZ_2_Yxy(XYZ):
28
+ # XYZ: (h, w, 3)
29
+ # Yxy: (h, w, 3)
30
+ Yxy = torch.ones_like(XYZ)
31
+ Yxy[:, :, 0] = XYZ[:, :, 1]
32
+ sum = torch.sum(XYZ, dim=2)
33
+ inv_sum = 1.0 / torch.clamp(sum, min=1e-4)
34
+ Yxy[:, :, 1] = XYZ[:, :, 0] * inv_sum
35
+ Yxy[:, :, 2] = XYZ[:, :, 1] * inv_sum
36
+ return Yxy
37
+
38
+
39
+ def convert_rgb_2_Yxy(rgb):
40
+ # rgb: (h, w, 3)
41
+ # Yxy: (h, w, 3)
42
+ return convert_XYZ_2_Yxy(convert_rgb_2_XYZ(rgb))
43
+
44
+
45
+ def convert_XYZ_2_rgb(XYZ):
46
+ # XYZ: (h, w, 3)
47
+ # rgb: (h, w, 3)
48
+ rgb = torch.ones_like(XYZ)
49
+ rgb[:, :, 0] = (
50
+ 3.2404542 * XYZ[:, :, 0] - 1.5371385 * XYZ[:, :, 1] - 0.4985314 * XYZ[:, :, 2]
51
+ )
52
+ rgb[:, :, 1] = (
53
+ -0.9692660 * XYZ[:, :, 0] + 1.8760108 * XYZ[:, :, 1] + 0.0415560 * XYZ[:, :, 2]
54
+ )
55
+ rgb[:, :, 2] = (
56
+ 0.0556434 * XYZ[:, :, 0] - 0.2040259 * XYZ[:, :, 1] + 1.0572252 * XYZ[:, :, 2]
57
+ )
58
+ return rgb
59
+
60
+
61
+ def convert_Yxy_2_XYZ(Yxy):
62
+ # Yxy: (h, w, 3)
63
+ # XYZ: (h, w, 3)
64
+ XYZ = torch.ones_like(Yxy)
65
+ XYZ[:, :, 0] = Yxy[:, :, 1] / torch.clamp(Yxy[:, :, 2], min=1e-6) * Yxy[:, :, 0]
66
+ XYZ[:, :, 1] = Yxy[:, :, 0]
67
+ XYZ[:, :, 2] = (
68
+ (1.0 - Yxy[:, :, 1] - Yxy[:, :, 2])
69
+ / torch.clamp(Yxy[:, :, 2], min=1e-4)
70
+ * Yxy[:, :, 0]
71
+ )
72
+ return XYZ
73
+
74
+
75
+ def convert_Yxy_2_rgb(Yxy):
76
+ # Yxy: (h, w, 3)
77
+ # rgb: (h, w, 3)
78
+ return convert_XYZ_2_rgb(convert_Yxy_2_XYZ(Yxy))
79
+
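# The RGB <-> XYZ <-> Yxy helpers above form an approximate round trip; the clamps on the
# chromaticity denominators keep near-black pixels finite. A quick sanity check (illustrative
# only, not part of the committed file; assumes it is run from the x2rgb directory):

import torch
from load_image import convert_rgb_2_Yxy, convert_Yxy_2_rgb

rgb = torch.rand(4, 4, 3)  # (h, w, 3), linear RGB
back = convert_Yxy_2_rgb(convert_rgb_2_Yxy(rgb))
print(torch.allclose(rgb, back, atol=1e-3))  # True for non-degenerate pixels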
80
+
81
+ def load_ldr_image(image_path, from_srgb=False, clamp=False, normalize=False):
82
+ # Load png or jpg image
83
+ image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
84
+ image = torch.from_numpy(image.astype(np.float32) / 255.0) # (h, w, c)
85
+ image[~torch.isfinite(image)] = 0
86
+ if from_srgb:
87
+ # Convert from sRGB to linear RGB
88
+ image = image**2.2
89
+ if clamp:
90
+ image = torch.clamp(image, min=0.0, max=1.0)
91
+ if normalize:
92
+ # Normalize to [-1, 1]
93
+ image = image * 2.0 - 1.0
94
+ image = torch.nn.functional.normalize(image, dim=-1, eps=1e-6)
95
+ return image.permute(2, 0, 1) # returns (c, h, w)
96
+
97
+
98
+ def load_exr_image(image_path, tonemaping=False, clamp=False, normalize=False):
99
+ image = cv2.cvtColor(cv2.imread(image_path, -1), cv2.COLOR_BGR2RGB)
100
+ image = torch.from_numpy(image.astype("float32")) # (h, w, c)
101
+ image[~torch.isfinite(image)] = 0
102
+ if tonemaping:
103
+ # Exposure adjustment
104
+ image_Yxy = convert_rgb_2_Yxy(image)
105
+ lum = (
106
+ image[:, :, 0:1] * 0.2125
107
+ + image[:, :, 1:2] * 0.7154
108
+ + image[:, :, 2:3] * 0.0721
109
+ )
110
+ lum = torch.log(torch.clamp(lum, min=1e-6))
111
+ lum_mean = torch.exp(torch.mean(lum))
112
+ lp = image_Yxy[:, :, 0:1] * 0.18 / torch.clamp(lum_mean, min=1e-6)
113
+ image_Yxy[:, :, 0:1] = lp
114
+ image = convert_Yxy_2_rgb(image_Yxy)
115
+ if clamp:
116
+ image = torch.clamp(image, min=0.0, max=1.0)
117
+ if normalize:
118
+ image = torch.nn.functional.normalize(image, dim=-1, eps=1e-6)
119
+ return image.permute(2, 0, 1) # returns (c, h, w)
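Both loaders return CHW float tensors: LDR inputs can be linearized from sRGB, clamped to [0, 1], or remapped to unit normals, while EXR inputs can additionally be exposure-normalized with a log-average-luminance key of 0.18 before clamping. A hedged usage sketch (paths assume the x2rgb working directory; the EXR line is a placeholder since only PNG examples ship with the commit):

from load_image import load_exr_image, load_ldr_image

albedo = load_ldr_image("example/kitchen-albedo.png", clamp=True)      # (3, h, w), values in [0, 1]
normal = load_ldr_image("example/kitchen-normal.png", normalize=True)  # unit-length normals per pixel
# irradiance = load_exr_image("path/to/irradiance.exr", tonemaping=True, clamp=True)
print(albedo.shape, float(albedo.min()), float(albedo.max()))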
x2rgb/pipeline_x2rgb.py ADDED
@@ -0,0 +1,967 @@
1
+ import inspect
2
+ from dataclasses import dataclass
3
+ from typing import Callable, List, Optional, Union
4
+
5
+ import numpy as np
6
+ import PIL
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from diffusers.configuration_utils import register_to_config
10
+ from diffusers.image_processor import VaeImageProcessor
11
+ from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
12
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
13
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
14
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
15
+ rescale_noise_cfg,
16
+ )
17
+ from diffusers.schedulers import KarrasDiffusionSchedulers
18
+ from diffusers.utils import CONFIG_NAME, BaseOutput, deprecate, logging, randn_tensor
19
+ from transformers import CLIPTextModel, CLIPTokenizer
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+
24
+ class VaeImageProcrssorAOV(VaeImageProcessor):
25
+ """
26
+ Image processor for VAE AOV.
27
+
28
+ Args:
29
+ do_resize (`bool`, *optional*, defaults to `True`):
30
+ Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
31
+ vae_scale_factor (`int`, *optional*, defaults to `8`):
32
+ VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
33
+ resample (`str`, *optional*, defaults to `lanczos`):
34
+ Resampling filter to use when resizing the image.
35
+ do_normalize (`bool`, *optional*, defaults to `True`):
36
+ Whether to normalize the image to [-1,1].
37
+ """
38
+
39
+ config_name = CONFIG_NAME
40
+
41
+ @register_to_config
42
+ def __init__(
43
+ self,
44
+ do_resize: bool = True,
45
+ vae_scale_factor: int = 8,
46
+ resample: str = "lanczos",
47
+ do_normalize: bool = True,
48
+ ):
49
+ super().__init__()
50
+
51
+ def postprocess(
52
+ self,
53
+ image: torch.FloatTensor,
54
+ output_type: str = "pil",
55
+ do_denormalize: Optional[List[bool]] = None,
56
+ do_gamma_correction: bool = True,
57
+ ):
58
+ if not isinstance(image, torch.Tensor):
59
+ raise ValueError(
60
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
61
+ )
62
+ if output_type not in ["latent", "pt", "np", "pil"]:
63
+ deprecation_message = (
64
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
65
+ "`pil`, `np`, `pt`, `latent`"
66
+ )
67
+ deprecate(
68
+ "Unsupported output_type",
69
+ "1.0.0",
70
+ deprecation_message,
71
+ standard_warn=False,
72
+ )
73
+ output_type = "np"
74
+
75
+ if output_type == "latent":
76
+ return image
77
+
78
+ if do_denormalize is None:
79
+ do_denormalize = [self.config.do_normalize] * image.shape[0]
80
+
81
+ image = torch.stack(
82
+ [
83
+ self.denormalize(image[i]) if do_denormalize[i] else image[i]
84
+ for i in range(image.shape[0])
85
+ ]
86
+ )
87
+
88
+ # Gamma correction
89
+ if do_gamma_correction:
90
+ image = torch.pow(image, 1.0 / 2.2)
91
+
92
+ if output_type == "pt":
93
+ return image
94
+
95
+ image = self.pt_to_numpy(image)
96
+
97
+ if output_type == "np":
98
+ return image
99
+
100
+ if output_type == "pil":
101
+ return self.numpy_to_pil(image)
102
+
103
+ def preprocess_normal(
104
+ self,
105
+ image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
106
+ height: Optional[int] = None,
107
+ width: Optional[int] = None,
108
+ ) -> torch.Tensor:
109
+ image = torch.stack([image], axis=0)
110
+ return image
111
+
112
+
113
+ @dataclass
114
+ class StableDiffusionAOVPipelineOutput(BaseOutput):
115
+ """
116
+ Output class for Stable Diffusion AOV pipelines.
117
+
118
+ Args:
119
+ images (`List[PIL.Image.Image]` or `np.ndarray`)
120
+ List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
121
+ num_channels)`.
122
+ predicted_x0_images (`List[PIL.Image.Image]` or `np.ndarray`, *optional*)
123
+ Intermediate predicted-x0 images decoded during denoising, populated only when the pipeline is
124
+ called with `return_predicted_x0s=True`; otherwise `None`.
125
+ """
126
+
127
+ images: Union[List[PIL.Image.Image], np.ndarray]
128
+ predicted_x0_images: Optional[Union[List[PIL.Image.Image], np.ndarray]] = None
129
+
130
+
131
+ class StableDiffusionAOVDropoutPipeline(
132
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin
133
+ ):
134
+ r"""
135
+ Pipeline for AOVs.
136
+
137
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
138
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
139
+
140
+ The pipeline also inherits the following loading methods:
141
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
142
+ - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
143
+ - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
144
+
145
+ Args:
146
+ vae ([`AutoencoderKL`]):
147
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
148
+ text_encoder ([`~transformers.CLIPTextModel`]):
149
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
150
+ tokenizer ([`~transformers.CLIPTokenizer`]):
151
+ A `CLIPTokenizer` to tokenize text.
152
+ unet ([`UNet2DConditionModel`]):
153
+ A `UNet2DConditionModel` to denoise the encoded image latents.
154
+ scheduler ([`SchedulerMixin`]):
155
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
156
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
157
+ """
158
+
159
+ def __init__(
160
+ self,
161
+ vae: AutoencoderKL,
162
+ text_encoder: CLIPTextModel,
163
+ tokenizer: CLIPTokenizer,
164
+ unet: UNet2DConditionModel,
165
+ scheduler: KarrasDiffusionSchedulers,
166
+ ):
167
+ super().__init__()
168
+
169
+ self.register_modules(
170
+ vae=vae,
171
+ text_encoder=text_encoder,
172
+ tokenizer=tokenizer,
173
+ unet=unet,
174
+ scheduler=scheduler,
175
+ )
176
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
177
+ self.image_processor = VaeImageProcrssorAOV(
178
+ vae_scale_factor=self.vae_scale_factor
179
+ )
180
+ self.register_to_config()
181
+
182
+ def _encode_prompt(
183
+ self,
184
+ prompt,
185
+ device,
186
+ num_images_per_prompt,
187
+ do_classifier_free_guidance,
188
+ negative_prompt=None,
189
+ prompt_embeds: Optional[torch.FloatTensor] = None,
190
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
191
+ ):
192
+ r"""
193
+ Encodes the prompt into text encoder hidden states.
194
+
195
+ Args:
196
+ prompt (`str` or `List[str]`, *optional*):
197
+ prompt to be encoded
198
+ device: (`torch.device`):
199
+ torch device
200
+ num_images_per_prompt (`int`):
201
+ number of images that should be generated per prompt
202
+ do_classifier_free_guidance (`bool`):
203
+ whether to use classifier free guidance or not
204
+ negative_prompt (`str` or `List[str]`, *optional*):
205
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
206
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
207
+ less than `1`).
208
+ prompt_embeds (`torch.FloatTensor`, *optional*):
209
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
210
+ provided, text embeddings will be generated from `prompt` input argument.
211
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
212
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
213
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
214
+ argument.
215
+ """
216
+ if prompt is not None and isinstance(prompt, str):
217
+ batch_size = 1
218
+ elif prompt is not None and isinstance(prompt, list):
219
+ batch_size = len(prompt)
220
+ else:
221
+ batch_size = prompt_embeds.shape[0]
222
+
223
+ if prompt_embeds is None:
224
+ # textual inversion: process multi-vector tokens if necessary
225
+ if isinstance(self, TextualInversionLoaderMixin):
226
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
227
+
228
+ text_inputs = self.tokenizer(
229
+ prompt,
230
+ padding="max_length",
231
+ max_length=self.tokenizer.model_max_length,
232
+ truncation=True,
233
+ return_tensors="pt",
234
+ )
235
+ text_input_ids = text_inputs.input_ids
236
+ untruncated_ids = self.tokenizer(
237
+ prompt, padding="longest", return_tensors="pt"
238
+ ).input_ids
239
+
240
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[
241
+ -1
242
+ ] and not torch.equal(text_input_ids, untruncated_ids):
243
+ removed_text = self.tokenizer.batch_decode(
244
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
245
+ )
246
+ logger.warning(
247
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
248
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
249
+ )
250
+
251
+ if (
252
+ hasattr(self.text_encoder.config, "use_attention_mask")
253
+ and self.text_encoder.config.use_attention_mask
254
+ ):
255
+ attention_mask = text_inputs.attention_mask.to(device)
256
+ else:
257
+ attention_mask = None
258
+
259
+ prompt_embeds = self.text_encoder(
260
+ text_input_ids.to(device),
261
+ attention_mask=attention_mask,
262
+ )
263
+ prompt_embeds = prompt_embeds[0]
264
+
265
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
266
+
267
+ bs_embed, seq_len, _ = prompt_embeds.shape
268
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
269
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
270
+ prompt_embeds = prompt_embeds.view(
271
+ bs_embed * num_images_per_prompt, seq_len, -1
272
+ )
273
+
274
+ # get unconditional embeddings for classifier free guidance
275
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
276
+ uncond_tokens: List[str]
277
+ if negative_prompt is None:
278
+ uncond_tokens = [""] * batch_size
279
+ elif type(prompt) is not type(negative_prompt):
280
+ raise TypeError(
281
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
282
+ f" {type(prompt)}."
283
+ )
284
+ elif isinstance(negative_prompt, str):
285
+ uncond_tokens = [negative_prompt]
286
+ elif batch_size != len(negative_prompt):
287
+ raise ValueError(
288
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
289
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
290
+ " the batch size of `prompt`."
291
+ )
292
+ else:
293
+ uncond_tokens = negative_prompt
294
+
295
+ # textual inversion: process multi-vector tokens if necessary
296
+ if isinstance(self, TextualInversionLoaderMixin):
297
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
298
+
299
+ max_length = prompt_embeds.shape[1]
300
+ uncond_input = self.tokenizer(
301
+ uncond_tokens,
302
+ padding="max_length",
303
+ max_length=max_length,
304
+ truncation=True,
305
+ return_tensors="pt",
306
+ )
307
+
308
+ if (
309
+ hasattr(self.text_encoder.config, "use_attention_mask")
310
+ and self.text_encoder.config.use_attention_mask
311
+ ):
312
+ attention_mask = uncond_input.attention_mask.to(device)
313
+ else:
314
+ attention_mask = None
315
+
316
+ negative_prompt_embeds = self.text_encoder(
317
+ uncond_input.input_ids.to(device),
318
+ attention_mask=attention_mask,
319
+ )
320
+ negative_prompt_embeds = negative_prompt_embeds[0]
321
+
322
+ if do_classifier_free_guidance:
323
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
324
+ seq_len = negative_prompt_embeds.shape[1]
325
+
326
+ negative_prompt_embeds = negative_prompt_embeds.to(
327
+ dtype=self.text_encoder.dtype, device=device
328
+ )
329
+
330
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
331
+ 1, num_images_per_prompt, 1
332
+ )
333
+ negative_prompt_embeds = negative_prompt_embeds.view(
334
+ batch_size * num_images_per_prompt, seq_len, -1
335
+ )
336
+
337
+ # For classifier free guidance, we need to do two forward passes.
338
+ # Here we concatenate the unconditional and text embeddings into a single batch
339
+ # to avoid doing two forward passes
340
+ # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
341
+ prompt_embeds = torch.cat(
342
+ [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
343
+ )
344
+
345
+ return prompt_embeds
346
+
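# For classifier-free guidance with an image condition, _encode_prompt returns a single batch with
# three copies of the text embeddings ordered [text, negative, negative], so one UNet forward pass
# covers every guidance branch. A shape-only sketch of that layout (illustrative; the zeros merely
# stand in for the "" negative-prompt embedding):

import torch

batch_size, seq_len, dim = 1, 77, 768  # typical CLIP text-encoder shapes
text = torch.randn(batch_size, seq_len, dim)
negative = torch.zeros(batch_size, seq_len, dim)
prompt_embeds = torch.cat([text, negative, negative])
print(prompt_embeds.shape)  # torch.Size([3, 77, 768])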
347
+ def prepare_extra_step_kwargs(self, generator, eta):
348
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
349
+ # eta (Ξ·) is only used with the DDIMScheduler, it will be ignored for other schedulers.
350
+ # eta corresponds to Ξ· in DDIM paper: https://arxiv.org/abs/2010.02502
351
+ # and should be between [0, 1]
352
+
353
+ accepts_eta = "eta" in set(
354
+ inspect.signature(self.scheduler.step).parameters.keys()
355
+ )
356
+ extra_step_kwargs = {}
357
+ if accepts_eta:
358
+ extra_step_kwargs["eta"] = eta
359
+
360
+ # check if the scheduler accepts generator
361
+ accepts_generator = "generator" in set(
362
+ inspect.signature(self.scheduler.step).parameters.keys()
363
+ )
364
+ if accepts_generator:
365
+ extra_step_kwargs["generator"] = generator
366
+ return extra_step_kwargs
367
+
368
+ def check_inputs(
369
+ self,
370
+ prompt,
371
+ callback_steps,
372
+ negative_prompt=None,
373
+ prompt_embeds=None,
374
+ negative_prompt_embeds=None,
375
+ ):
376
+ if (callback_steps is None) or (
377
+ callback_steps is not None
378
+ and (not isinstance(callback_steps, int) or callback_steps <= 0)
379
+ ):
380
+ raise ValueError(
381
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
382
+ f" {type(callback_steps)}."
383
+ )
384
+
385
+ if prompt is not None and prompt_embeds is not None:
386
+ raise ValueError(
387
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
388
+ " only forward one of the two."
389
+ )
390
+ elif prompt is None and prompt_embeds is None:
391
+ raise ValueError(
392
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
393
+ )
394
+ elif prompt is not None and (
395
+ not isinstance(prompt, str) and not isinstance(prompt, list)
396
+ ):
397
+ raise ValueError(
398
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
399
+ )
400
+
401
+ if negative_prompt is not None and negative_prompt_embeds is not None:
402
+ raise ValueError(
403
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
404
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
405
+ )
406
+
407
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
408
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
409
+ raise ValueError(
410
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
411
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
412
+ f" {negative_prompt_embeds.shape}."
413
+ )
414
+
415
+ def prepare_latents(
416
+ self,
417
+ batch_size,
418
+ num_channels_latents,
419
+ height,
420
+ width,
421
+ dtype,
422
+ device,
423
+ generator,
424
+ latents=None,
425
+ ):
426
+ shape = (
427
+ batch_size,
428
+ num_channels_latents,
429
+ height // self.vae_scale_factor,
430
+ width // self.vae_scale_factor,
431
+ )
432
+ if isinstance(generator, list) and len(generator) != batch_size:
433
+ raise ValueError(
434
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
435
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
436
+ )
437
+
438
+ if latents is None:
439
+ latents = randn_tensor(
440
+ shape, generator=generator, device=device, dtype=dtype
441
+ )
442
+ else:
443
+ latents = latents.to(device)
444
+
445
+ # scale the initial noise by the standard deviation required by the scheduler
446
+ latents = latents * self.scheduler.init_noise_sigma
447
+ return latents
448
+
449
+ def prepare_image_latents(
450
+ self,
451
+ image,
452
+ batch_size,
453
+ num_images_per_prompt,
454
+ dtype,
455
+ device,
456
+ do_classifier_free_guidance,
457
+ generator=None,
458
+ ):
459
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
460
+ raise ValueError(
461
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
462
+ )
463
+
464
+ image = image.to(device=device, dtype=dtype)
465
+
466
+ batch_size = batch_size * num_images_per_prompt
467
+
468
+ if image.shape[1] == 4:
469
+ image_latents = image
470
+ else:
471
+ if isinstance(generator, list) and len(generator) != batch_size:
472
+ raise ValueError(
473
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
474
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
475
+ )
476
+
477
+ if isinstance(generator, list):
478
+ image_latents = [
479
+ self.vae.encode(image[i : i + 1]).latent_dist.mode()
480
+ for i in range(batch_size)
481
+ ]
482
+ image_latents = torch.cat(image_latents, dim=0)
483
+ else:
484
+ image_latents = self.vae.encode(image).latent_dist.mode()
485
+
486
+ if (
487
+ batch_size > image_latents.shape[0]
488
+ and batch_size % image_latents.shape[0] == 0
489
+ ):
490
+ # expand image_latents for batch_size
491
+ deprecation_message = (
492
+ f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
493
+ " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
494
+ " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
495
+ " your script to pass as many initial images as text prompts to suppress this warning."
496
+ )
497
+ deprecate(
498
+ "len(prompt) != len(image)",
499
+ "1.0.0",
500
+ deprecation_message,
501
+ standard_warn=False,
502
+ )
503
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
504
+ image_latents = torch.cat(
505
+ [image_latents] * additional_image_per_prompt, dim=0
506
+ )
507
+ elif (
508
+ batch_size > image_latents.shape[0]
509
+ and batch_size % image_latents.shape[0] != 0
510
+ ):
511
+ raise ValueError(
512
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
513
+ )
514
+ else:
515
+ image_latents = torch.cat([image_latents], dim=0)
516
+
517
+ if do_classifier_free_guidance:
518
+ uncond_image_latents = torch.zeros_like(image_latents)
519
+ image_latents = torch.cat(
520
+ [image_latents, image_latents, uncond_image_latents], dim=0
521
+ )
522
+
523
+ return image_latents
524
+
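# The image-conditioning latents use the complementary ordering [image, image, zeros], so the three
# chunks of the UNet output correspond to (text + image), (image only), and (unconditional)
# predictions. The denoising loop below combines them as
#   uncond + guidance_scale * (text - image) + image_guidance_scale * (image - uncond).
# A small numerical sketch of that combination (illustrative only):

import torch

noise_pred_text, noise_pred_image, noise_pred_uncond = torch.randn(3, 1, 4, 8, 8).unbind(0)
guidance_scale, image_guidance_scale = 7.5, 1.5
noise_pred = (
    noise_pred_uncond
    + guidance_scale * (noise_pred_text - noise_pred_image)
    + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
)
print(noise_pred.shape)  # torch.Size([1, 4, 8, 8])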
525
+ @torch.no_grad()
526
+ def __call__(
527
+ self,
528
+ height: int,
529
+ width: int,
530
+ prompt: Union[str, List[str]] = None,
531
+ albedo: Optional[
532
+ Union[
533
+ torch.FloatTensor,
534
+ PIL.Image.Image,
535
+ np.ndarray,
536
+ List[torch.FloatTensor],
537
+ List[PIL.Image.Image],
538
+ List[np.ndarray],
539
+ ]
540
+ ] = None,
541
+ normal: Optional[
542
+ Union[
543
+ torch.FloatTensor,
544
+ PIL.Image.Image,
545
+ np.ndarray,
546
+ List[torch.FloatTensor],
547
+ List[PIL.Image.Image],
548
+ List[np.ndarray],
549
+ ]
550
+ ] = None,
551
+ roughness: Optional[
552
+ Union[
553
+ torch.FloatTensor,
554
+ PIL.Image.Image,
555
+ np.ndarray,
556
+ List[torch.FloatTensor],
557
+ List[PIL.Image.Image],
558
+ List[np.ndarray],
559
+ ]
560
+ ] = None,
561
+ metallic: Optional[
562
+ Union[
563
+ torch.FloatTensor,
564
+ PIL.Image.Image,
565
+ np.ndarray,
566
+ List[torch.FloatTensor],
567
+ List[PIL.Image.Image],
568
+ List[np.ndarray],
569
+ ]
570
+ ] = None,
571
+ irradiance: Optional[
572
+ Union[
573
+ torch.FloatTensor,
574
+ PIL.Image.Image,
575
+ np.ndarray,
576
+ List[torch.FloatTensor],
577
+ List[PIL.Image.Image],
578
+ List[np.ndarray],
579
+ ]
580
+ ] = None,
581
+ guidance_scale: float = 0.0,
582
+ image_guidance_scale: float = 0.0,
583
+ guidance_rescale: float = 0.0,
584
+ num_inference_steps: int = 100,
585
+ required_aovs: List[str] = ["albedo"],
586
+ return_predicted_x0s: bool = False,
587
+ negative_prompt: Optional[Union[str, List[str]]] = None,
588
+ num_images_per_prompt: Optional[int] = 1,
589
+ eta: float = 0.0,
590
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
591
+ latents: Optional[torch.FloatTensor] = None,
592
+ prompt_embeds: Optional[torch.FloatTensor] = None,
593
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
594
+ output_type: Optional[str] = "pil",
595
+ return_dict: bool = True,
596
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
597
+ callback_steps: int = 1,
598
+ ):
599
+ r"""
600
+ The call function to the pipeline for generation.
601
+
602
+ Args:
603
+ prompt (`str` or `List[str]`, *optional*):
604
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
605
+ image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
606
+ `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept
607
+ image latents as `image`, but if passing latents directly it is not encoded again.
608
+ num_inference_steps (`int`, *optional*, defaults to 100):
609
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
610
+ expense of slower inference.
611
+ guidance_scale (`float`, *optional*, defaults to 0.0):
612
+ A higher guidance scale value encourages the model to generate images closely linked to the text
613
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
614
+ image_guidance_scale (`float`, *optional*, defaults to 0.0):
615
+ Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
616
+ `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
617
+ linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
618
+ value of at least `1`.
619
+ negative_prompt (`str` or `List[str]`, *optional*):
620
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
621
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
622
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
623
+ The number of images to generate per prompt.
624
+ eta (`float`, *optional*, defaults to 0.0):
625
+ Corresponds to parameter eta (Ξ·) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
626
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
627
+ generator (`torch.Generator`, *optional*):
628
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
629
+ generation deterministic.
630
+ latents (`torch.FloatTensor`, *optional*):
631
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
632
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
633
+ tensor is generated by sampling using the supplied random `generator`.
634
+ prompt_embeds (`torch.FloatTensor`, *optional*):
635
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
636
+ provided, text embeddings are generated from the `prompt` input argument.
637
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
638
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
639
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
640
+ output_type (`str`, *optional*, defaults to `"pil"`):
641
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
642
+ return_dict (`bool`, *optional*, defaults to `True`):
643
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
644
+ plain tuple.
645
+ callback (`Callable`, *optional*):
646
+ A function that calls every `callback_steps` steps during inference. The function is called with the
647
+ following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
648
+ callback_steps (`int`, *optional*, defaults to 1):
649
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
650
+ every step.
651
+
652
+ Examples:
653
+
654
+ ```py
655
+ >>> import PIL
656
+ >>> import requests
657
+ >>> import torch
658
+ >>> from io import BytesIO
659
+
660
+ >>> from diffusers import StableDiffusionInstructPix2PixPipeline
661
+
662
+
663
+ >>> def download_image(url):
664
+ ... response = requests.get(url)
665
+ ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
666
+
667
+
668
+ >>> img_url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
669
+
670
+ >>> image = download_image(img_url).resize((512, 512))
671
+
672
+ >>> pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
673
+ ... "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
674
+ ... )
675
+ >>> pipe = pipe.to("cuda")
676
+
677
+ >>> prompt = "make the mountains snowy"
678
+ >>> image = pipe(prompt=prompt, image=image).images[0]
679
+ ```
680
+
681
+ Returns:
682
+ [`StableDiffusionAOVPipelineOutput`] or `np.ndarray`/`List[PIL.Image.Image]`:
683
+ If `return_dict` is `True`, a [`StableDiffusionAOVPipelineOutput`] is returned with the generated
684
+ images and, when `return_predicted_x0s=True`, the intermediate predicted-x0 images decoded during
685
+ sampling. If `return_dict` is `False`, only the generated images are returned, without a
686
+ wrapping output class.
687
+ """
688
+ # 0. Check inputs
689
+ self.check_inputs(
690
+ prompt,
691
+ callback_steps,
692
+ negative_prompt,
693
+ prompt_embeds,
694
+ negative_prompt_embeds,
695
+ )
696
+
697
+ # 1. Define call parameters
698
+ if prompt is not None and isinstance(prompt, str):
699
+ batch_size = 1
700
+ elif prompt is not None and isinstance(prompt, list):
701
+ batch_size = len(prompt)
702
+ else:
703
+ batch_size = prompt_embeds.shape[0]
704
+
705
+ device = self._execution_device
706
+ do_classifier_free_guidance = (
707
+ guidance_scale >= 1.0 and image_guidance_scale >= 1.0
708
+ )
709
+ # check if scheduler is in sigmas space
710
+ scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas")
711
+
712
+ # 2. Encode input prompt
713
+ prompt_embeds = self._encode_prompt(
714
+ prompt,
715
+ device,
716
+ num_images_per_prompt,
717
+ do_classifier_free_guidance,
718
+ negative_prompt,
719
+ prompt_embeds=prompt_embeds,
720
+ negative_prompt_embeds=negative_prompt_embeds,
721
+ )
722
+
723
+ # 3. Preprocess image
724
+ # For normal, the preprocessing does nothing
725
+ # For others, the preprocessing remaps the values to [-1, 1]
726
+ preprocessed_aovs = {}
727
+ for aov_name in required_aovs:
728
+ if aov_name == "albedo":
729
+ if albedo is not None:
730
+ preprocessed_aovs[aov_name] = self.image_processor.preprocess(
731
+ albedo
732
+ )
733
+ else:
734
+ preprocessed_aovs[aov_name] = None
735
+
736
+ if aov_name == "normal":
737
+ if normal is not None:
738
+ preprocessed_aovs[aov_name] = (
739
+ self.image_processor.preprocess_normal(normal)
740
+ )
741
+ else:
742
+ preprocessed_aovs[aov_name] = None
743
+
744
+ if aov_name == "roughness":
745
+ if roughness is not None:
746
+ preprocessed_aovs[aov_name] = self.image_processor.preprocess(
747
+ roughness
748
+ )
749
+ else:
750
+ preprocessed_aovs[aov_name] = None
751
+ if aov_name == "metallic":
752
+ if metallic is not None:
753
+ preprocessed_aovs[aov_name] = self.image_processor.preprocess(
754
+ metallic
755
+ )
756
+ else:
757
+ preprocessed_aovs[aov_name] = None
758
+ if aov_name == "irradiance":
759
+ if irradiance is not None:
760
+ preprocessed_aovs[aov_name] = self.image_processor.preprocess(
761
+ irradiance
762
+ )
763
+ else:
764
+ preprocessed_aovs[aov_name] = None
765
+
766
+ # 4. set timesteps
767
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
768
+ timesteps = self.scheduler.timesteps
769
+
770
+ # 5. Prepare latent variables
771
+ num_channels_latents = self.vae.config.latent_channels
772
+ latents = self.prepare_latents(
773
+ batch_size * num_images_per_prompt,
774
+ num_channels_latents,
775
+ height,
776
+ width,
777
+ prompt_embeds.dtype,
778
+ device,
779
+ generator,
780
+ latents,
781
+ )
782
+
783
+ height_latent, width_latent = latents.shape[-2:]
784
+
785
+ # 6. Prepare Image latents
786
+ image_latents = []
787
+ # Magic scaling factors for each AOV (calculated from the training data)
788
+ scaling_factors = {
789
+ "albedo": 0.17301377137652138,
790
+ "normal": 0.17483895473058078,
791
+ "roughness": 0.1680724853626448,
792
+ "metallic": 0.13135013390855135,
793
+ }
794
+ for aov_name, aov in preprocessed_aovs.items():
795
+ if aov is None:
796
+ image_latent = torch.zeros(
797
+ batch_size,
798
+ num_channels_latents,
799
+ height_latent,
800
+ width_latent,
801
+ dtype=prompt_embeds.dtype,
802
+ device=device,
803
+ )
804
+ if aov_name == "irradiance":
805
+ image_latent = image_latent[:, 0:3]
806
+ if do_classifier_free_guidance:
807
+ image_latents.append(
808
+ torch.cat([image_latent, image_latent, image_latent], dim=0)
809
+ )
810
+ else:
811
+ image_latents.append(image_latent)
812
+ else:
813
+ if aov_name == "irradiance":
814
+ image_latent = F.interpolate(
815
+ aov.to(device=device, dtype=prompt_embeds.dtype),
816
+ size=(height_latent, width_latent),
817
+ mode="bilinear",
818
+ align_corners=False,
819
+ antialias=True,
820
+ )
821
+ if do_classifier_free_guidance:
822
+ uncond_image_latent = torch.zeros_like(image_latent)
823
+ image_latent = torch.cat(
824
+ [image_latent, image_latent, uncond_image_latent], dim=0
825
+ )
826
+ else:
827
+ scaling_factor = scaling_factors[aov_name]
828
+ image_latent = (
829
+ self.prepare_image_latents(
830
+ aov,
831
+ batch_size,
832
+ num_images_per_prompt,
833
+ prompt_embeds.dtype,
834
+ device,
835
+ do_classifier_free_guidance,
836
+ generator,
837
+ )
838
+ * scaling_factor
839
+ )
840
+ image_latents.append(image_latent)
841
+ image_latents = torch.cat(image_latents, dim=1)
842
+
843
+ # 7. Check that shapes of latents and image match the UNet channels
844
+ num_channels_image = image_latents.shape[1]
845
+ if num_channels_latents + num_channels_image != self.unet.config.in_channels:
846
+ raise ValueError(
847
+ f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
848
+ f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
849
+ f" `num_channels_image`: {num_channels_image} "
850
+ f" = {num_channels_latents+num_channels_image}. Please verify the config of"
851
+ " `pipeline.unet` or your `image` input."
852
+ )
853
+
854
+ # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
855
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
856
+
857
+ predicted_x0s = []
858
+
859
+ # 9. Denoising loop
860
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
861
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
862
+ for i, t in enumerate(timesteps):
863
+ # Expand the latents if we are doing classifier free guidance.
864
+ # The latents are expanded 3 times because for pix2pix the guidance
865
+ # is applied for both the text and the input image.
866
+ latent_model_input = (
867
+ torch.cat([latents] * 3) if do_classifier_free_guidance else latents
868
+ )
869
+
870
+ # concat latents, image_latents in the channel dimension
871
+ scaled_latent_model_input = self.scheduler.scale_model_input(
872
+ latent_model_input, t
873
+ )
874
+ scaled_latent_model_input = torch.cat(
875
+ [scaled_latent_model_input, image_latents], dim=1
876
+ )
877
+
878
+ # predict the noise residual
879
+ noise_pred = self.unet(
880
+ scaled_latent_model_input,
881
+ t,
882
+ encoder_hidden_states=prompt_embeds,
883
+ return_dict=False,
884
+ )[0]
885
+
886
+ # perform guidance
887
+ if do_classifier_free_guidance:
888
+ (
889
+ noise_pred_text,
890
+ noise_pred_image,
891
+ noise_pred_uncond,
892
+ ) = noise_pred.chunk(3)
893
+ noise_pred = (
894
+ noise_pred_uncond
895
+ + guidance_scale * (noise_pred_text - noise_pred_image)
896
+ + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
897
+ )
898
+
899
+ if do_classifier_free_guidance and guidance_rescale > 0.0:
900
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
901
+ noise_pred = rescale_noise_cfg(
902
+ noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
903
+ )
904
+
905
+ # compute the previous noisy sample x_t -> x_t-1
906
+ output = self.scheduler.step(
907
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=True
908
+ )
909
+
910
+ latents = output[0]
911
+
912
+ if return_predicted_x0s:
913
+ predicted_x0s.append(output[1])
914
+
915
+ # call the callback, if provided
916
+ if i == len(timesteps) - 1 or (
917
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
918
+ ):
919
+ progress_bar.update()
920
+ if callback is not None and i % callback_steps == 0:
921
+ callback(i, t, latents)
922
+
923
+ if not output_type == "latent":
924
+ image = self.vae.decode(
925
+ latents / self.vae.config.scaling_factor, return_dict=False
926
+ )[0]
927
+
928
+ if return_predicted_x0s:
929
+ predicted_x0_images = [
930
+ self.vae.decode(
931
+ predicted_x0 / self.vae.config.scaling_factor, return_dict=False
932
+ )[0]
933
+ for predicted_x0 in predicted_x0s
934
+ ]
935
+ else:
936
+ image = latents
937
+ predicted_x0_images = predicted_x0s
938
+
939
+ do_denormalize = [True] * image.shape[0]
940
+
941
+ image = self.image_processor.postprocess(
942
+ image, output_type=output_type, do_denormalize=do_denormalize
943
+ )
944
+
945
+ if return_predicted_x0s:
946
+ predicted_x0_images = [
947
+ self.image_processor.postprocess(
948
+ predicted_x0_image,
949
+ output_type=output_type,
950
+ do_denormalize=do_denormalize,
951
+ )
952
+ for predicted_x0_image in predicted_x0_images
953
+ ]
954
+
955
+ # Offload last model to CPU
956
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
957
+ self.final_offload_hook.offload()
958
+
959
+ if not return_dict:
960
+ return image
961
+
962
+ if return_predicted_x0s:
963
+ return StableDiffusionAOVPipelineOutput(
964
+ images=image, predicted_x0_images=predicted_x0_images
965
+ )
966
+ else:
967
+ return StableDiffusionAOVPipelineOutput(images=image)
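Putting the pieces together, the Gradio demo above is a thin wrapper around the following call pattern. A hedged end-to-end sketch: the model id and scheduler settings mirror the demo, the paths assume the x2rgb working directory, the inputs are assumed to be sized to a multiple of 8, and omitted channels (roughness, metallic, irradiance here) are simply dropped out by the pipeline:

import torch
from diffusers import DDIMScheduler

from load_image import load_ldr_image
from pipeline_x2rgb import StableDiffusionAOVDropoutPipeline

pipe = StableDiffusionAOVDropoutPipeline.from_pretrained(
    "zheng95z/x-to-rgb", torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = DDIMScheduler.from_config(
    pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
)

albedo = load_ldr_image("example/kitchen-albedo.png", clamp=True).to("cuda")
normal = load_ldr_image("example/kitchen-normal.png", normalize=True).to("cuda")

result = pipe(
    prompt="a photo of a kitchen",
    albedo=albedo,
    normal=normal,
    height=albedo.shape[1],
    width=albedo.shape[2],
    num_inference_steps=50,
    required_aovs=["albedo", "normal", "roughness", "metallic", "irradiance"],
    guidance_scale=7.5,
    image_guidance_scale=1.5,
    guidance_rescale=0.7,
    generator=torch.Generator(device="cuda").manual_seed(42),
    output_type="pil",
)
result.images[0].save("kitchen-generated.png")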