Commit 7c0f7b6 committed by blanchon
Parent: cebcba2

Remove X2RGB and add examples
rgb2x/gradio_demo_rgb2x.py CHANGED
@@ -141,6 +141,18 @@ with gr.Blocks() as demo:
                 elem_id="gallery",
                 columns=2,
             )
+            examples = gr.Examples(
+                examples=[
+                    [
+                        "rgb2x/example/Castlereagh_corridor_photo.png",
+                    ]
+                ],
+                inputs=[photo],
+                outputs=[result_gallery],
+                fn=generate,
+                cache_mode="eager",
+                cache_examples=True,
+            )
 
         run_button.click(
             fn=generate,
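For context, a minimal self-contained sketch of how the added gr.Examples block behaves, assuming a Gradio version that supports cache_mode (the photo/result_gallery components and the trivial generate stub below are placeholders, not the Space's real implementation): with cache_examples=True and cache_mode="eager", Gradio runs fn on every example row at launch and serves the cached gallery output when a user clicks the example.

    import gradio as gr

    def generate(photo):
        # Placeholder for the Space's real generate() function, which returns
        # a list of intrinsic-channel images for the gallery.
        return [photo]

    with gr.Blocks() as demo:
        photo = gr.Image(type="filepath", label="Photo")        # assumed input component
        result_gallery = gr.Gallery(label="Output", columns=2)  # assumed output component
        gr.Examples(
            # Path taken from the commit; swap in any local image to run this sketch.
            examples=[["rgb2x/example/Castlereagh_corridor_photo.png"]],
            inputs=[photo],
            outputs=[result_gallery],
            fn=generate,
            cache_mode="eager",      # pre-compute example outputs at startup
            cache_examples=True,
        )

    if __name__ == "__main__":
        demo.launch()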
x2rgb/example/kitchen-albedo.png DELETED

Git LFS Details

  • SHA256: d2b3e2ae5001c4214d5c87041e57933708cbb424eca6f7a2659c2c3e91a6a8ce
  • Pointer size: 131 Bytes
  • Size of remote file: 313 kB
x2rgb/example/kitchen-irradiance.png DELETED

Git LFS Details

  • SHA256: 259b873bba6405d72a87a30321f4572a47d296726368eba2f0303d4ab3bcd269
  • Pointer size: 131 Bytes
  • Size of remote file: 959 kB
x2rgb/example/kitchen-metallic.png DELETED

Git LFS Details

  • SHA256: cd6fec250659c8915c821b9063b851da4da59c2459c56fa08338cb81c5e6b70d
  • Pointer size: 130 Bytes
  • Size of remote file: 33.9 kB
x2rgb/example/kitchen-normal.png DELETED

Git LFS Details

  • SHA256: abf769887d2ee8fa050f56f50285502fcf8dbb8b69c28c1f3910ad1ee2874068
  • Pointer size: 131 Bytes
  • Size of remote file: 415 kB
x2rgb/example/kitchen-ref.png DELETED

Git LFS Details

  • SHA256: 19e57fc6737291cb59611786c9894fd4c2bedb0ba14b875942241195afff3534
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
x2rgb/example/kitchen-roughness.png DELETED

Git LFS Details

  • SHA256: 9d1195686031d170151798b00d095c48a40e1a8a508d65c5a841fcabb0ae8fad
  • Pointer size: 130 Bytes
  • Size of remote file: 84 kB
x2rgb/gradio_demo_x2rgb.py DELETED
@@ -1,204 +0,0 @@
1
- import spaces
2
- import os
3
- from typing import cast
4
- import gradio as gr
5
- import numpy as np
6
- import torch
7
- from PIL import Image
8
- from diffusers import DDIMScheduler
9
- from load_image import load_exr_image, load_ldr_image
10
- from pipeline_x2rgb import StableDiffusionAOVDropoutPipeline
11
-
12
- os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
13
-
14
- current_directory = os.path.dirname(os.path.abspath(__file__))
15
-
16
- _pipe = StableDiffusionAOVDropoutPipeline.from_pretrained(
17
- "zheng95z/x-to-rgb",
18
- torch_dtype=torch.float16,
19
- cache_dir=os.path.join(current_directory, "model_cache"),
20
- ).to("cuda")
21
- pipe = cast(StableDiffusionAOVDropoutPipeline, _pipe)
22
- pipe.scheduler = DDIMScheduler.from_config(
23
- pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
24
- )
25
- pipe.set_progress_bar_config(disable=True)
26
- pipe.to("cuda")
27
- pipe = cast(StableDiffusionAOVDropoutPipeline, pipe)
28
-
29
-
30
- @spaces.GPU
31
- def generate(
32
- albedo,
33
- normal,
34
- roughness,
35
- metallic,
36
- irradiance,
37
- prompt: str,
38
- seed: int,
39
- inference_step: int,
40
- num_samples: int,
41
- guidance_scale: float,
42
- image_guidance_scale: float,
43
- ) -> list[Image.Image]:
44
- generator = torch.Generator(device="cuda").manual_seed(seed)
45
-
46
- # Load and process each intrinsic channel image
47
- def process_image(file, **kwargs):
48
- if file is None:
49
- return None
50
- if file.name.endswith(".exr"):
51
- return load_exr_image(file.name, **kwargs).to("cuda")
52
- elif file.name.endswith((".png", ".jpg", ".jpeg")):
53
- return load_ldr_image(file.name, **kwargs).to("cuda")
54
- return None
55
-
56
- albedo_image = process_image(albedo, clamp=True)
57
- normal_image = process_image(normal, normalize=True)
58
- roughness_image = process_image(roughness, clamp=True)
59
- metallic_image = process_image(metallic, clamp=True)
60
- irradiance_image = process_image(irradiance, tonemaping=True, clamp=True)
61
-
62
- # Set default height and width based on the first available image
63
- height, width = 768, 768
64
- for img in [
65
- albedo_image,
66
- normal_image,
67
- roughness_image,
68
- metallic_image,
69
- irradiance_image,
70
- ]:
71
- if img is not None:
72
- height, width = img.shape[1], img.shape[2]
73
- break
74
-
75
- required_aovs = ["albedo", "normal", "roughness", "metallic", "irradiance"]
76
- return_list = []
77
-
78
- for i in range(num_samples):
79
- generated_image = pipe(
80
- prompt=prompt,
81
- albedo=albedo_image,
82
- normal=normal_image,
83
- roughness=roughness_image,
84
- metallic=metallic_image,
85
- irradiance=irradiance_image,
86
- num_inference_steps=inference_step,
87
- height=height,
88
- width=width,
89
- generator=generator,
90
- required_aovs=required_aovs,
91
- guidance_scale=guidance_scale,
92
- image_guidance_scale=image_guidance_scale,
93
- guidance_rescale=0.7,
94
- output_type="np",
95
- ).images[0] # type: ignore
96
-
97
- return_list.append((generated_image, f"Generated Image {i}"))
98
-
99
- # Append additional images to the output gallery
100
- def post_process_image(img, **kwargs):
101
- if img is not None:
102
- return (img.cpu().numpy().transpose(1, 2, 0), kwargs.get("label", "Image"))
103
- return np.zeros((height, width, 3))
104
-
105
- return_list.extend(
106
- [
107
- post_process_image(albedo_image, label="Albedo"),
108
- post_process_image(normal_image, label="Normal"),
109
- post_process_image(roughness_image, label="Roughness"),
110
- post_process_image(metallic_image, label="Metallic"),
111
- post_process_image(irradiance_image, label="Irradiance"),
112
- ]
113
- )
114
-
115
- return return_list
116
-
117
-
118
- with gr.Blocks() as demo:
119
- with gr.Row():
120
- gr.Markdown("## Model X -> RGB (Intrinsic channels -> realistic image)")
121
- with gr.Row():
122
- # Input side
123
- with gr.Column():
124
- gr.Markdown("### Given intrinsic channels")
125
- albedo = gr.File(label="Albedo", file_types=[".exr", ".png", ".jpg"])
126
- normal = gr.File(label="Normal", file_types=[".exr", ".png", ".jpg"])
127
- roughness = gr.File(label="Roughness", file_types=[".exr", ".png", ".jpg"])
128
- metallic = gr.File(label="Metallic", file_types=[".exr", ".png", ".jpg"])
129
- irradiance = gr.File(
130
- label="Irradiance", file_types=[".exr", ".png", ".jpg"]
131
- )
132
-
133
- gr.Markdown("### Parameters")
134
- prompt = gr.Textbox(label="Prompt")
135
- run_button = gr.Button(value="Run")
136
- with gr.Accordion("Advanced options", open=False):
137
- seed = gr.Slider(
138
- label="Seed",
139
- minimum=-1,
140
- maximum=2147483647,
141
- step=1,
142
- randomize=True,
143
- )
144
- inference_step = gr.Slider(
145
- label="Inference Step",
146
- minimum=1,
147
- maximum=100,
148
- step=1,
149
- value=50,
150
- )
151
- num_samples = gr.Slider(
152
- label="Samples",
153
- minimum=1,
154
- maximum=100,
155
- step=1,
156
- value=1,
157
- )
158
- guidance_scale = gr.Slider(
159
- label="Guidance Scale",
160
- minimum=0.0,
161
- maximum=10.0,
162
- step=0.1,
163
- value=7.5,
164
- )
165
- image_guidance_scale = gr.Slider(
166
- label="Image Guidance Scale",
167
- minimum=0.0,
168
- maximum=10.0,
169
- step=0.1,
170
- value=1.5,
171
- )
172
-
173
- # Output side
174
- with gr.Column():
175
- gr.Markdown("### Output Gallery")
176
- result_gallery = gr.Gallery(
177
- label="Output",
178
- show_label=False,
179
- elem_id="gallery",
180
- columns=2,
181
- )
182
-
183
- run_button.click(
184
- fn=generate,
185
- inputs=[
186
- albedo,
187
- normal,
188
- roughness,
189
- metallic,
190
- irradiance,
191
- prompt,
192
- seed,
193
- inference_step,
194
- num_samples,
195
- guidance_scale,
196
- image_guidance_scale,
197
- ],
198
- outputs=result_gallery,
199
- queue=True,
200
- )
201
-
202
-
203
- if __name__ == "__main__":
204
- demo.launch(debug=False, share=False, show_api=False)
x2rgb/load_image.py DELETED
@@ -1,119 +0,0 @@
1
- import os
2
-
3
- import cv2
4
- import torch
5
-
6
- os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
7
- import numpy as np
8
-
9
-
10
- def convert_rgb_2_XYZ(rgb):
11
- # Reference: https://web.archive.org/web/20191027010220/http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
12
- # rgb: (h, w, 3)
13
- # XYZ: (h, w, 3)
14
- XYZ = torch.ones_like(rgb)
15
- XYZ[:, :, 0] = (
16
- 0.4124564 * rgb[:, :, 0] + 0.3575761 * rgb[:, :, 1] + 0.1804375 * rgb[:, :, 2]
17
- )
18
- XYZ[:, :, 1] = (
19
- 0.2126729 * rgb[:, :, 0] + 0.7151522 * rgb[:, :, 1] + 0.0721750 * rgb[:, :, 2]
20
- )
21
- XYZ[:, :, 2] = (
22
- 0.0193339 * rgb[:, :, 0] + 0.1191920 * rgb[:, :, 1] + 0.9503041 * rgb[:, :, 2]
23
- )
24
- return XYZ
25
-
26
-
27
- def convert_XYZ_2_Yxy(XYZ):
28
- # XYZ: (h, w, 3)
29
- # Yxy: (h, w, 3)
30
- Yxy = torch.ones_like(XYZ)
31
- Yxy[:, :, 0] = XYZ[:, :, 1]
32
- sum = torch.sum(XYZ, dim=2)
33
- inv_sum = 1.0 / torch.clamp(sum, min=1e-4)
34
- Yxy[:, :, 1] = XYZ[:, :, 0] * inv_sum
35
- Yxy[:, :, 2] = XYZ[:, :, 1] * inv_sum
36
- return Yxy
37
-
38
-
39
- def convert_rgb_2_Yxy(rgb):
40
- # rgb: (h, w, 3)
41
- # Yxy: (h, w, 3)
42
- return convert_XYZ_2_Yxy(convert_rgb_2_XYZ(rgb))
43
-
44
-
45
- def convert_XYZ_2_rgb(XYZ):
46
- # XYZ: (h, w, 3)
47
- # rgb: (h, w, 3)
48
- rgb = torch.ones_like(XYZ)
49
- rgb[:, :, 0] = (
50
- 3.2404542 * XYZ[:, :, 0] - 1.5371385 * XYZ[:, :, 1] - 0.4985314 * XYZ[:, :, 2]
51
- )
52
- rgb[:, :, 1] = (
53
- -0.9692660 * XYZ[:, :, 0] + 1.8760108 * XYZ[:, :, 1] + 0.0415560 * XYZ[:, :, 2]
54
- )
55
- rgb[:, :, 2] = (
56
- 0.0556434 * XYZ[:, :, 0] - 0.2040259 * XYZ[:, :, 1] + 1.0572252 * XYZ[:, :, 2]
57
- )
58
- return rgb
59
-
60
-
61
- def convert_Yxy_2_XYZ(Yxy):
62
- # Yxy: (h, w, 3)
63
- # XYZ: (h, w, 3)
64
- XYZ = torch.ones_like(Yxy)
65
- XYZ[:, :, 0] = Yxy[:, :, 1] / torch.clamp(Yxy[:, :, 2], min=1e-6) * Yxy[:, :, 0]
66
- XYZ[:, :, 1] = Yxy[:, :, 0]
67
- XYZ[:, :, 2] = (
68
- (1.0 - Yxy[:, :, 1] - Yxy[:, :, 2])
69
- / torch.clamp(Yxy[:, :, 2], min=1e-4)
70
- * Yxy[:, :, 0]
71
- )
72
- return XYZ
73
-
74
-
75
- def convert_Yxy_2_rgb(Yxy):
76
- # Yxy: (h, w, 3)
77
- # rgb: (h, w, 3)
78
- return convert_XYZ_2_rgb(convert_Yxy_2_XYZ(Yxy))
79
-
80
-
81
- def load_ldr_image(image_path, from_srgb=False, clamp=False, normalize=False):
82
- # Load png or jpg image
83
- image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
84
- image = torch.from_numpy(image.astype(np.float32) / 255.0) # (h, w, c)
85
- image[~torch.isfinite(image)] = 0
86
- if from_srgb:
87
- # Convert from sRGB to linear RGB
88
- image = image**2.2
89
- if clamp:
90
- image = torch.clamp(image, min=0.0, max=1.0)
91
- if normalize:
92
- # Normalize to [-1, 1]
93
- image = image * 2.0 - 1.0
94
- image = torch.nn.functional.normalize(image, dim=-1, eps=1e-6)
95
- return image.permute(2, 0, 1) # returns (c, h, w)
96
-
97
-
98
- def load_exr_image(image_path, tonemaping=False, clamp=False, normalize=False):
99
- image = cv2.cvtColor(cv2.imread(image_path, -1), cv2.COLOR_BGR2RGB)
100
- image = torch.from_numpy(image.astype("float32")) # (h, w, c)
101
- image[~torch.isfinite(image)] = 0
102
- if tonemaping:
103
- # Exposure adjuestment
104
- image_Yxy = convert_rgb_2_Yxy(image)
105
- lum = (
106
- image[:, :, 0:1] * 0.2125
107
- + image[:, :, 1:2] * 0.7154
108
- + image[:, :, 2:3] * 0.0721
109
- )
110
- lum = torch.log(torch.clamp(lum, min=1e-6))
111
- lum_mean = torch.exp(torch.mean(lum))
112
- lp = image_Yxy[:, :, 0:1] * 0.18 / torch.clamp(lum_mean, min=1e-6)
113
- image_Yxy[:, :, 0:1] = lp
114
- image = convert_Yxy_2_rgb(image_Yxy)
115
- if clamp:
116
- image = torch.clamp(image, min=0.0, max=1.0)
117
- if normalize:
118
- image = torch.nn.functional.normalize(image, dim=-1, eps=1e-6)
119
- return image.permute(2, 0, 1) # returns (c, h, w)
x2rgb/pipeline_x2rgb.py DELETED
@@ -1,967 +0,0 @@
1
- import inspect
2
- from dataclasses import dataclass
3
- from typing import Callable, List, Optional, Union
4
-
5
- import numpy as np
6
- import PIL
7
- import torch
8
- import torch.nn.functional as F
9
- from diffusers.configuration_utils import register_to_config
10
- from diffusers.image_processor import VaeImageProcessor
11
- from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
12
- from diffusers.models import AutoencoderKL, UNet2DConditionModel
13
- from diffusers.pipelines.pipeline_utils import DiffusionPipeline
14
- from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
15
- rescale_noise_cfg,
16
- )
17
- from diffusers.schedulers import KarrasDiffusionSchedulers
18
- from diffusers.utils import CONFIG_NAME, BaseOutput, deprecate, logging, randn_tensor
19
- from transformers import CLIPTextModel, CLIPTokenizer
20
-
21
- logger = logging.get_logger(__name__)
22
-
23
-
24
- class VaeImageProcrssorAOV(VaeImageProcessor):
25
- """
26
- Image processor for VAE AOV.
27
-
28
- Args:
29
- do_resize (`bool`, *optional*, defaults to `True`):
30
- Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
31
- vae_scale_factor (`int`, *optional*, defaults to `8`):
32
- VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
33
- resample (`str`, *optional*, defaults to `lanczos`):
34
- Resampling filter to use when resizing the image.
35
- do_normalize (`bool`, *optional*, defaults to `True`):
36
- Whether to normalize the image to [-1,1].
37
- """
38
-
39
- config_name = CONFIG_NAME
40
-
41
- @register_to_config
42
- def __init__(
43
- self,
44
- do_resize: bool = True,
45
- vae_scale_factor: int = 8,
46
- resample: str = "lanczos",
47
- do_normalize: bool = True,
48
- ):
49
- super().__init__()
50
-
51
- def postprocess(
52
- self,
53
- image: torch.FloatTensor,
54
- output_type: str = "pil",
55
- do_denormalize: Optional[List[bool]] = None,
56
- do_gamma_correction: bool = True,
57
- ):
58
- if not isinstance(image, torch.Tensor):
59
- raise ValueError(
60
- f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
61
- )
62
- if output_type not in ["latent", "pt", "np", "pil"]:
63
- deprecation_message = (
64
- f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
65
- "`pil`, `np`, `pt`, `latent`"
66
- )
67
- deprecate(
68
- "Unsupported output_type",
69
- "1.0.0",
70
- deprecation_message,
71
- standard_warn=False,
72
- )
73
- output_type = "np"
74
-
75
- if output_type == "latent":
76
- return image
77
-
78
- if do_denormalize is None:
79
- do_denormalize = [self.config.do_normalize] * image.shape[0]
80
-
81
- image = torch.stack(
82
- [
83
- self.denormalize(image[i]) if do_denormalize[i] else image[i]
84
- for i in range(image.shape[0])
85
- ]
86
- )
87
-
88
- # Gamma correction
89
- if do_gamma_correction:
90
- image = torch.pow(image, 1.0 / 2.2)
91
-
92
- if output_type == "pt":
93
- return image
94
-
95
- image = self.pt_to_numpy(image)
96
-
97
- if output_type == "np":
98
- return image
99
-
100
- if output_type == "pil":
101
- return self.numpy_to_pil(image)
102
-
103
- def preprocess_normal(
104
- self,
105
- image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
106
- height: Optional[int] = None,
107
- width: Optional[int] = None,
108
- ) -> torch.Tensor:
109
- image = torch.stack([image], axis=0)
110
- return image
111
-
112
-
113
- @dataclass
114
- class StableDiffusionAOVPipelineOutput(BaseOutput):
115
- """
116
- Output class for Stable Diffusion AOV pipelines.
117
-
118
- Args:
119
- images (`List[PIL.Image.Image]` or `np.ndarray`)
120
- List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
121
- num_channels)`.
122
- nsfw_content_detected (`List[bool]`)
123
- List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
124
- `None` if safety checking could not be performed.
125
- """
126
-
127
- images: Union[List[PIL.Image.Image], np.ndarray]
128
- predicted_x0_images: Optional[Union[List[PIL.Image.Image], np.ndarray]] = None
129
-
130
-
131
- class StableDiffusionAOVDropoutPipeline(
132
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin
133
- ):
134
- r"""
135
- Pipeline for AOVs.
136
-
137
- This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
138
- implemented for all pipelines (downloading, saving, running on a particular device, etc.).
139
-
140
- The pipeline also inherits the following loading methods:
141
- - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
142
- - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
143
- - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
144
-
145
- Args:
146
- vae ([`AutoencoderKL`]):
147
- Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
148
- text_encoder ([`~transformers.CLIPTextModel`]):
149
- Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
150
- tokenizer ([`~transformers.CLIPTokenizer`]):
151
- A `CLIPTokenizer` to tokenize text.
152
- unet ([`UNet2DConditionModel`]):
153
- A `UNet2DConditionModel` to denoise the encoded image latents.
154
- scheduler ([`SchedulerMixin`]):
155
- A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
156
- [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
157
- """
158
-
159
- def __init__(
160
- self,
161
- vae: AutoencoderKL,
162
- text_encoder: CLIPTextModel,
163
- tokenizer: CLIPTokenizer,
164
- unet: UNet2DConditionModel,
165
- scheduler: KarrasDiffusionSchedulers,
166
- ):
167
- super().__init__()
168
-
169
- self.register_modules(
170
- vae=vae,
171
- text_encoder=text_encoder,
172
- tokenizer=tokenizer,
173
- unet=unet,
174
- scheduler=scheduler,
175
- )
176
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
177
- self.image_processor = VaeImageProcrssorAOV(
178
- vae_scale_factor=self.vae_scale_factor
179
- )
180
- self.register_to_config()
181
-
182
- def _encode_prompt(
183
- self,
184
- prompt,
185
- device,
186
- num_images_per_prompt,
187
- do_classifier_free_guidance,
188
- negative_prompt=None,
189
- prompt_embeds: Optional[torch.FloatTensor] = None,
190
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
191
- ):
192
- r"""
193
- Encodes the prompt into text encoder hidden states.
194
-
195
- Args:
196
- prompt (`str` or `List[str]`, *optional*):
197
- prompt to be encoded
198
- device: (`torch.device`):
199
- torch device
200
- num_images_per_prompt (`int`):
201
- number of images that should be generated per prompt
202
- do_classifier_free_guidance (`bool`):
203
- whether to use classifier free guidance or not
204
- negative_ prompt (`str` or `List[str]`, *optional*):
205
- The prompt or prompts not to guide the image generation. If not defined, one has to pass
206
- `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
207
- less than `1`).
208
- prompt_embeds (`torch.FloatTensor`, *optional*):
209
- Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
210
- provided, text embeddings will be generated from `prompt` input argument.
211
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
212
- Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
213
- weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
214
- argument.
215
- """
216
- if prompt is not None and isinstance(prompt, str):
217
- batch_size = 1
218
- elif prompt is not None and isinstance(prompt, list):
219
- batch_size = len(prompt)
220
- else:
221
- batch_size = prompt_embeds.shape[0]
222
-
223
- if prompt_embeds is None:
224
- # textual inversion: procecss multi-vector tokens if necessary
225
- if isinstance(self, TextualInversionLoaderMixin):
226
- prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
227
-
228
- text_inputs = self.tokenizer(
229
- prompt,
230
- padding="max_length",
231
- max_length=self.tokenizer.model_max_length,
232
- truncation=True,
233
- return_tensors="pt",
234
- )
235
- text_input_ids = text_inputs.input_ids
236
- untruncated_ids = self.tokenizer(
237
- prompt, padding="longest", return_tensors="pt"
238
- ).input_ids
239
-
240
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
241
- -1
242
- ] and not torch.equal(text_input_ids, untruncated_ids):
243
- removed_text = self.tokenizer.batch_decode(
244
- untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
245
- )
246
- logger.warning(
247
- "The following part of your input was truncated because CLIP can only handle sequences up to"
248
- f" {self.tokenizer.model_max_length} tokens: {removed_text}"
249
- )
250
-
251
- if (
252
- hasattr(self.text_encoder.config, "use_attention_mask")
253
- and self.text_encoder.config.use_attention_mask
254
- ):
255
- attention_mask = text_inputs.attention_mask.to(device)
256
- else:
257
- attention_mask = None
258
-
259
- prompt_embeds = self.text_encoder(
260
- text_input_ids.to(device),
261
- attention_mask=attention_mask,
262
- )
263
- prompt_embeds = prompt_embeds[0]
264
-
265
- prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
266
-
267
- bs_embed, seq_len, _ = prompt_embeds.shape
268
- # duplicate text embeddings for each generation per prompt, using mps friendly method
269
- prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
270
- prompt_embeds = prompt_embeds.view(
271
- bs_embed * num_images_per_prompt, seq_len, -1
272
- )
273
-
274
- # get unconditional embeddings for classifier free guidance
275
- if do_classifier_free_guidance and negative_prompt_embeds is None:
276
- uncond_tokens: List[str]
277
- if negative_prompt is None:
278
- uncond_tokens = [""] * batch_size
279
- elif type(prompt) is not type(negative_prompt):
280
- raise TypeError(
281
- f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
282
- f" {type(prompt)}."
283
- )
284
- elif isinstance(negative_prompt, str):
285
- uncond_tokens = [negative_prompt]
286
- elif batch_size != len(negative_prompt):
287
- raise ValueError(
288
- f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
289
- f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
290
- " the batch size of `prompt`."
291
- )
292
- else:
293
- uncond_tokens = negative_prompt
294
-
295
- # textual inversion: procecss multi-vector tokens if necessary
296
- if isinstance(self, TextualInversionLoaderMixin):
297
- uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
298
-
299
- max_length = prompt_embeds.shape[1]
300
- uncond_input = self.tokenizer(
301
- uncond_tokens,
302
- padding="max_length",
303
- max_length=max_length,
304
- truncation=True,
305
- return_tensors="pt",
306
- )
307
-
308
- if (
309
- hasattr(self.text_encoder.config, "use_attention_mask")
310
- and self.text_encoder.config.use_attention_mask
311
- ):
312
- attention_mask = uncond_input.attention_mask.to(device)
313
- else:
314
- attention_mask = None
315
-
316
- negative_prompt_embeds = self.text_encoder(
317
- uncond_input.input_ids.to(device),
318
- attention_mask=attention_mask,
319
- )
320
- negative_prompt_embeds = negative_prompt_embeds[0]
321
-
322
- if do_classifier_free_guidance:
323
- # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
324
- seq_len = negative_prompt_embeds.shape[1]
325
-
326
- negative_prompt_embeds = negative_prompt_embeds.to(
327
- dtype=self.text_encoder.dtype, device=device
328
- )
329
-
330
- negative_prompt_embeds = negative_prompt_embeds.repeat(
331
- 1, num_images_per_prompt, 1
332
- )
333
- negative_prompt_embeds = negative_prompt_embeds.view(
334
- batch_size * num_images_per_prompt, seq_len, -1
335
- )
336
-
337
- # For classifier free guidance, we need to do two forward passes.
338
- # Here we concatenate the unconditional and text embeddings into a single batch
339
- # to avoid doing two forward passes
340
- # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
341
- prompt_embeds = torch.cat(
342
- [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
343
- )
344
-
345
- return prompt_embeds
346
-
347
- def prepare_extra_step_kwargs(self, generator, eta):
348
- # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
349
- # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
350
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
351
- # and should be between [0, 1]
352
-
353
- accepts_eta = "eta" in set(
354
- inspect.signature(self.scheduler.step).parameters.keys()
355
- )
356
- extra_step_kwargs = {}
357
- if accepts_eta:
358
- extra_step_kwargs["eta"] = eta
359
-
360
- # check if the scheduler accepts generator
361
- accepts_generator = "generator" in set(
362
- inspect.signature(self.scheduler.step).parameters.keys()
363
- )
364
- if accepts_generator:
365
- extra_step_kwargs["generator"] = generator
366
- return extra_step_kwargs
367
-
368
- def check_inputs(
369
- self,
370
- prompt,
371
- callback_steps,
372
- negative_prompt=None,
373
- prompt_embeds=None,
374
- negative_prompt_embeds=None,
375
- ):
376
- if (callback_steps is None) or (
377
- callback_steps is not None
378
- and (not isinstance(callback_steps, int) or callback_steps <= 0)
379
- ):
380
- raise ValueError(
381
- f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
382
- f" {type(callback_steps)}."
383
- )
384
-
385
- if prompt is not None and prompt_embeds is not None:
386
- raise ValueError(
387
- f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
388
- " only forward one of the two."
389
- )
390
- elif prompt is None and prompt_embeds is None:
391
- raise ValueError(
392
- "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
393
- )
394
- elif prompt is not None and (
395
- not isinstance(prompt, str) and not isinstance(prompt, list)
396
- ):
397
- raise ValueError(
398
- f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
399
- )
400
-
401
- if negative_prompt is not None and negative_prompt_embeds is not None:
402
- raise ValueError(
403
- f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
404
- f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
405
- )
406
-
407
- if prompt_embeds is not None and negative_prompt_embeds is not None:
408
- if prompt_embeds.shape != negative_prompt_embeds.shape:
409
- raise ValueError(
410
- "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
411
- f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
412
- f" {negative_prompt_embeds.shape}."
413
- )
414
-
415
- def prepare_latents(
416
- self,
417
- batch_size,
418
- num_channels_latents,
419
- height,
420
- width,
421
- dtype,
422
- device,
423
- generator,
424
- latents=None,
425
- ):
426
- shape = (
427
- batch_size,
428
- num_channels_latents,
429
- height // self.vae_scale_factor,
430
- width // self.vae_scale_factor,
431
- )
432
- if isinstance(generator, list) and len(generator) != batch_size:
433
- raise ValueError(
434
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
435
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
436
- )
437
-
438
- if latents is None:
439
- latents = randn_tensor(
440
- shape, generator=generator, device=device, dtype=dtype
441
- )
442
- else:
443
- latents = latents.to(device)
444
-
445
- # scale the initial noise by the standard deviation required by the scheduler
446
- latents = latents * self.scheduler.init_noise_sigma
447
- return latents
448
-
449
- def prepare_image_latents(
450
- self,
451
- image,
452
- batch_size,
453
- num_images_per_prompt,
454
- dtype,
455
- device,
456
- do_classifier_free_guidance,
457
- generator=None,
458
- ):
459
- if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
460
- raise ValueError(
461
- f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
462
- )
463
-
464
- image = image.to(device=device, dtype=dtype)
465
-
466
- batch_size = batch_size * num_images_per_prompt
467
-
468
- if image.shape[1] == 4:
469
- image_latents = image
470
- else:
471
- if isinstance(generator, list) and len(generator) != batch_size:
472
- raise ValueError(
473
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
474
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
475
- )
476
-
477
- if isinstance(generator, list):
478
- image_latents = [
479
- self.vae.encode(image[i : i + 1]).latent_dist.mode()
480
- for i in range(batch_size)
481
- ]
482
- image_latents = torch.cat(image_latents, dim=0)
483
- else:
484
- image_latents = self.vae.encode(image).latent_dist.mode()
485
-
486
- if (
487
- batch_size > image_latents.shape[0]
488
- and batch_size % image_latents.shape[0] == 0
489
- ):
490
- # expand image_latents for batch_size
491
- deprecation_message = (
492
- f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
493
- " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
494
- " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
495
- " your script to pass as many initial images as text prompts to suppress this warning."
496
- )
497
- deprecate(
498
- "len(prompt) != len(image)",
499
- "1.0.0",
500
- deprecation_message,
501
- standard_warn=False,
502
- )
503
- additional_image_per_prompt = batch_size // image_latents.shape[0]
504
- image_latents = torch.cat(
505
- [image_latents] * additional_image_per_prompt, dim=0
506
- )
507
- elif (
508
- batch_size > image_latents.shape[0]
509
- and batch_size % image_latents.shape[0] != 0
510
- ):
511
- raise ValueError(
512
- f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
513
- )
514
- else:
515
- image_latents = torch.cat([image_latents], dim=0)
516
-
517
- if do_classifier_free_guidance:
518
- uncond_image_latents = torch.zeros_like(image_latents)
519
- image_latents = torch.cat(
520
- [image_latents, image_latents, uncond_image_latents], dim=0
521
- )
522
-
523
- return image_latents
524
-
525
- @torch.no_grad()
526
- def __call__(
527
- self,
528
- height: int,
529
- width: int,
530
- prompt: Union[str, List[str]] = None,
531
- albedo: Optional[
532
- Union[
533
- torch.FloatTensor,
534
- PIL.Image.Image,
535
- np.ndarray,
536
- List[torch.FloatTensor],
537
- List[PIL.Image.Image],
538
- List[np.ndarray],
539
- ]
540
- ] = None,
541
- normal: Optional[
542
- Union[
543
- torch.FloatTensor,
544
- PIL.Image.Image,
545
- np.ndarray,
546
- List[torch.FloatTensor],
547
- List[PIL.Image.Image],
548
- List[np.ndarray],
549
- ]
550
- ] = None,
551
- roughness: Optional[
552
- Union[
553
- torch.FloatTensor,
554
- PIL.Image.Image,
555
- np.ndarray,
556
- List[torch.FloatTensor],
557
- List[PIL.Image.Image],
558
- List[np.ndarray],
559
- ]
560
- ] = None,
561
- metallic: Optional[
562
- Union[
563
- torch.FloatTensor,
564
- PIL.Image.Image,
565
- np.ndarray,
566
- List[torch.FloatTensor],
567
- List[PIL.Image.Image],
568
- List[np.ndarray],
569
- ]
570
- ] = None,
571
- irradiance: Optional[
572
- Union[
573
- torch.FloatTensor,
574
- PIL.Image.Image,
575
- np.ndarray,
576
- List[torch.FloatTensor],
577
- List[PIL.Image.Image],
578
- List[np.ndarray],
579
- ]
580
- ] = None,
581
- guidance_scale: float = 0.0,
582
- image_guidance_scale: float = 0.0,
583
- guidance_rescale: float = 0.0,
584
- num_inference_steps: int = 100,
585
- required_aovs: List[str] = ["albedo"],
586
- return_predicted_x0s: bool = False,
587
- negative_prompt: Optional[Union[str, List[str]]] = None,
588
- num_images_per_prompt: Optional[int] = 1,
589
- eta: float = 0.0,
590
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
591
- latents: Optional[torch.FloatTensor] = None,
592
- prompt_embeds: Optional[torch.FloatTensor] = None,
593
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
594
- output_type: Optional[str] = "pil",
595
- return_dict: bool = True,
596
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
597
- callback_steps: int = 1,
598
- ):
599
- r"""
600
- The call function to the pipeline for generation.
601
-
602
- Args:
603
- prompt (`str` or `List[str]`, *optional*):
604
- The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
605
- image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
606
- `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept
607
- image latents as `image`, but if passing latents directly it is not encoded again.
608
- num_inference_steps (`int`, *optional*, defaults to 100):
609
- The number of denoising steps. More denoising steps usually lead to a higher quality image at the
610
- expense of slower inference.
611
- guidance_scale (`float`, *optional*, defaults to 7.5):
612
- A higher guidance scale value encourages the model to generate images closely linked to the text
613
- `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
614
- image_guidance_scale (`float`, *optional*, defaults to 1.5):
615
- Push the generated image towards the inital `image`. Image guidance scale is enabled by setting
616
- `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
617
- linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
618
- value of at least `1`.
619
- negative_prompt (`str` or `List[str]`, *optional*):
620
- The prompt or prompts to guide what to not include in image generation. If not defined, you need to
621
- pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
622
- num_images_per_prompt (`int`, *optional*, defaults to 1):
623
- The number of images to generate per prompt.
624
- eta (`float`, *optional*, defaults to 0.0):
625
- Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
626
- to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
627
- generator (`torch.Generator`, *optional*):
628
- A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
629
- generation deterministic.
630
- latents (`torch.FloatTensor`, *optional*):
631
- Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
632
- generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
633
- tensor is generated by sampling using the supplied random `generator`.
634
- prompt_embeds (`torch.FloatTensor`, *optional*):
635
- Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
636
- provided, text embeddings are generated from the `prompt` input argument.
637
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
638
- Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
639
- not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
640
- output_type (`str`, *optional*, defaults to `"pil"`):
641
- The output format of the generated image. Choose between `PIL.Image` or `np.array`.
642
- return_dict (`bool`, *optional*, defaults to `True`):
643
- Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
644
- plain tuple.
645
- callback (`Callable`, *optional*):
646
- A function that calls every `callback_steps` steps during inference. The function is called with the
647
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
648
- callback_steps (`int`, *optional*, defaults to 1):
649
- The frequency at which the `callback` function is called. If not specified, the callback is called at
650
- every step.
651
-
652
- Examples:
653
-
654
- ```py
655
- >>> import PIL
656
- >>> import requests
657
- >>> import torch
658
- >>> from io import BytesIO
659
-
660
- >>> from diffusers import StableDiffusionInstructPix2PixPipeline
661
-
662
-
663
- >>> def download_image(url):
664
- ... response = requests.get(url)
665
- ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
666
-
667
-
668
- >>> img_url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
669
-
670
- >>> image = download_image(img_url).resize((512, 512))
671
-
672
- >>> pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
673
- ... "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
674
- ... )
675
- >>> pipe = pipe.to("cuda")
676
-
677
- >>> prompt = "make the mountains snowy"
678
- >>> image = pipe(prompt=prompt, image=image).images[0]
679
- ```
680
-
681
- Returns:
682
- [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
683
- If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
684
- otherwise a `tuple` is returned where the first element is a list with the generated images and the
685
- second element is a list of `bool`s indicating whether the corresponding generated image contains
686
- "not-safe-for-work" (nsfw) content.
687
- """
688
- # 0. Check inputs
689
- self.check_inputs(
690
- prompt,
691
- callback_steps,
692
- negative_prompt,
693
- prompt_embeds,
694
- negative_prompt_embeds,
695
- )
696
-
697
- # 1. Define call parameters
698
- if prompt is not None and isinstance(prompt, str):
699
- batch_size = 1
700
- elif prompt is not None and isinstance(prompt, list):
701
- batch_size = len(prompt)
702
- else:
703
- batch_size = prompt_embeds.shape[0]
704
-
705
- device = self._execution_device
706
- do_classifier_free_guidance = (
707
- guidance_scale >= 1.0 and image_guidance_scale >= 1.0
708
- )
709
- # check if scheduler is in sigmas space
710
- scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas")
711
-
712
- # 2. Encode input prompt
713
- prompt_embeds = self._encode_prompt(
714
- prompt,
715
- device,
716
- num_images_per_prompt,
717
- do_classifier_free_guidance,
718
- negative_prompt,
719
- prompt_embeds=prompt_embeds,
720
- negative_prompt_embeds=negative_prompt_embeds,
721
- )
722
-
723
- # 3. Preprocess image
724
- # For normal, the preprocessing does nothing
725
- # For others, the preprocessing remap the values to [-1, 1]
726
- preprocessed_aovs = {}
727
- for aov_name in required_aovs:
728
- if aov_name == "albedo":
729
- if albedo is not None:
730
- preprocessed_aovs[aov_name] = self.image_processor.preprocess(
731
- albedo
732
- )
733
- else:
734
- preprocessed_aovs[aov_name] = None
735
-
736
- if aov_name == "normal":
737
- if normal is not None:
738
- preprocessed_aovs[aov_name] = (
739
- self.image_processor.preprocess_normal(normal)
740
- )
741
- else:
742
- preprocessed_aovs[aov_name] = None
743
-
744
- if aov_name == "roughness":
745
- if roughness is not None:
746
- preprocessed_aovs[aov_name] = self.image_processor.preprocess(
747
- roughness
748
- )
749
- else:
750
- preprocessed_aovs[aov_name] = None
751
- if aov_name == "metallic":
752
- if metallic is not None:
753
- preprocessed_aovs[aov_name] = self.image_processor.preprocess(
754
- metallic
755
- )
756
- else:
757
- preprocessed_aovs[aov_name] = None
758
- if aov_name == "irradiance":
759
- if irradiance is not None:
760
- preprocessed_aovs[aov_name] = self.image_processor.preprocess(
761
- irradiance
762
- )
763
- else:
764
- preprocessed_aovs[aov_name] = None
765
-
766
- # 4. set timesteps
767
- self.scheduler.set_timesteps(num_inference_steps, device=device)
768
- timesteps = self.scheduler.timesteps
769
-
770
- # 5. Prepare latent variables
771
- num_channels_latents = self.vae.config.latent_channels
772
- latents = self.prepare_latents(
773
- batch_size * num_images_per_prompt,
774
- num_channels_latents,
775
- height,
776
- width,
777
- prompt_embeds.dtype,
778
- device,
779
- generator,
780
- latents,
781
- )
782
-
783
- height_latent, width_latent = latents.shape[-2:]
784
-
785
- # 6. Prepare Image latents
786
- image_latents = []
787
- # Magicial scaling factors for each AOV (calculated from the training data)
788
- scaling_factors = {
789
- "albedo": 0.17301377137652138,
790
- "normal": 0.17483895473058078,
791
- "roughness": 0.1680724853626448,
792
- "metallic": 0.13135013390855135,
793
- }
794
- for aov_name, aov in preprocessed_aovs.items():
795
- if aov is None:
796
- image_latent = torch.zeros(
797
- batch_size,
798
- num_channels_latents,
799
- height_latent,
800
- width_latent,
801
- dtype=prompt_embeds.dtype,
802
- device=device,
803
- )
804
- if aov_name == "irradiance":
805
- image_latent = image_latent[:, 0:3]
806
- if do_classifier_free_guidance:
807
- image_latents.append(
808
- torch.cat([image_latent, image_latent, image_latent], dim=0)
809
- )
810
- else:
811
- image_latents.append(image_latent)
812
- else:
813
- if aov_name == "irradiance":
814
- image_latent = F.interpolate(
815
- aov.to(device=device, dtype=prompt_embeds.dtype),
816
- size=(height_latent, width_latent),
817
- mode="bilinear",
818
- align_corners=False,
819
- antialias=True,
820
- )
821
- if do_classifier_free_guidance:
822
- uncond_image_latent = torch.zeros_like(image_latent)
823
- image_latent = torch.cat(
824
- [image_latent, image_latent, uncond_image_latent], dim=0
825
- )
826
- else:
827
- scaling_factor = scaling_factors[aov_name]
828
- image_latent = (
829
- self.prepare_image_latents(
830
- aov,
831
- batch_size,
832
- num_images_per_prompt,
833
- prompt_embeds.dtype,
834
- device,
835
- do_classifier_free_guidance,
836
- generator,
837
- )
838
- * scaling_factor
839
- )
840
- image_latents.append(image_latent)
841
- image_latents = torch.cat(image_latents, dim=1)
842
-
843
- # 7. Check that shapes of latents and image match the UNet channels
844
- num_channels_image = image_latents.shape[1]
845
- if num_channels_latents + num_channels_image != self.unet.config.in_channels:
846
- raise ValueError(
847
- f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
848
- f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
849
- f" `num_channels_image`: {num_channels_image} "
850
- f" = {num_channels_latents+num_channels_image}. Please verify the config of"
851
- " `pipeline.unet` or your `image` input."
852
- )
853
-
854
- # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
855
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
856
-
857
- predicted_x0s = []
858
-
859
- # 9. Denoising loop
860
- num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
861
- with self.progress_bar(total=num_inference_steps) as progress_bar:
862
- for i, t in enumerate(timesteps):
863
- # Expand the latents if we are doing classifier free guidance.
864
- # The latents are expanded 3 times because for pix2pix the guidance\
865
- # is applied for both the text and the input image.
866
- latent_model_input = (
867
- torch.cat([latents] * 3) if do_classifier_free_guidance else latents
868
- )
869
-
870
- # concat latents, image_latents in the channel dimension
871
- scaled_latent_model_input = self.scheduler.scale_model_input(
872
- latent_model_input, t
873
- )
874
- scaled_latent_model_input = torch.cat(
875
- [scaled_latent_model_input, image_latents], dim=1
876
- )
877
-
878
- # predict the noise residual
879
- noise_pred = self.unet(
880
- scaled_latent_model_input,
881
- t,
882
- encoder_hidden_states=prompt_embeds,
883
- return_dict=False,
884
- )[0]
885
-
886
- # perform guidance
887
- if do_classifier_free_guidance:
888
- (
889
- noise_pred_text,
890
- noise_pred_image,
891
- noise_pred_uncond,
892
- ) = noise_pred.chunk(3)
893
- noise_pred = (
894
- noise_pred_uncond
895
- + guidance_scale * (noise_pred_text - noise_pred_image)
896
- + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
897
- )
898
-
899
- if do_classifier_free_guidance and guidance_rescale > 0.0:
900
- # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
901
- noise_pred = rescale_noise_cfg(
902
- noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
903
- )
904
-
905
- # compute the previous noisy sample x_t -> x_t-1
906
- output = self.scheduler.step(
907
- noise_pred, t, latents, **extra_step_kwargs, return_dict=True
908
- )
909
-
910
- latents = output[0]
911
-
912
- if return_predicted_x0s:
913
- predicted_x0s.append(output[1])
914
-
915
- # call the callback, if provided
916
- if i == len(timesteps) - 1 or (
917
- (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
918
- ):
919
- progress_bar.update()
920
- if callback is not None and i % callback_steps == 0:
921
- callback(i, t, latents)
922
-
923
- if not output_type == "latent":
924
- image = self.vae.decode(
925
- latents / self.vae.config.scaling_factor, return_dict=False
926
- )[0]
927
-
928
- if return_predicted_x0s:
929
- predicted_x0_images = [
930
- self.vae.decode(
931
- predicted_x0 / self.vae.config.scaling_factor, return_dict=False
932
- )[0]
933
- for predicted_x0 in predicted_x0s
934
- ]
935
- else:
936
- image = latents
937
- predicted_x0_images = predicted_x0s
938
-
939
- do_denormalize = [True] * image.shape[0]
940
-
941
- image = self.image_processor.postprocess(
942
- image, output_type=output_type, do_denormalize=do_denormalize
943
- )
944
-
945
- if return_predicted_x0s:
946
- predicted_x0_images = [
947
- self.image_processor.postprocess(
948
- predicted_x0_image,
949
- output_type=output_type,
950
- do_denormalize=do_denormalize,
951
- )
952
- for predicted_x0_image in predicted_x0_images
953
- ]
954
-
955
- # Offload last model to CPU
956
- if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
957
- self.final_offload_hook.offload()
958
-
959
- if not return_dict:
960
- return image
961
-
962
- if return_predicted_x0s:
963
- return StableDiffusionAOVPipelineOutput(
964
- images=image, predicted_x0_images=predicted_x0_images
965
- )
966
- else:
967
- return StableDiffusionAOVPipelineOutput(images=image)