Spaces:

Profakerr
/

Inpaint

Running on Zero

App Files Files Community

Profakerr committed on Jan 2

Commit

bd92452

verified ·

1 Parent(s): 2aa689b

Upload 2 files

Browse files

Files changed (2) hide show

app.py +253 -0
pipeline_fill_sd_xl.py +521 -0

app.py ADDED Viewed

	@@ -0,0 +1,253 @@

+import gradio as gr
+import spaces
+from RealESRGAN import RealESRGAN
+import torch
+from diffusers import AutoencoderKL, TCDScheduler, DPMSolverMultistepScheduler
+from diffusers.models.model_loading_utils import load_state_dict
+from gradio_imageslider import ImageSlider
+from huggingface_hub import hf_hub_download
+from PIL import ImageDraw, ImageFont, Image
+from controlnet_union import ControlNetModel_Union
+from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
+# Display name -> Hugging Face repo id of the base checkpoints offered in the
+# "Model" dropdown of the UI.
+MODELS = {
+    "RealVisXL V5.0 Lightning": "SG161222/RealVisXL_V5.0_Lightning",
+}
+# Download the "promax" config of the ControlNet-Union SDXL repo and
+# instantiate the model class from it (weights are loaded further below).
+config_file = hf_hub_download(
+    "xinsir/controlnet-union-sdxl-1.0",
+    filename="config_promax.json",
+)
+config = ControlNetModel_Union.load_config(config_file)
+controlnet_model = ControlNetModel_Union.from_config(config)
+# Download the matching "promax" weights and load them into the model.
+model_file = hf_hub_download(
+    "xinsir/controlnet-union-sdxl-1.0",
+    filename="diffusion_pytorch_model_promax.safetensors",
+)
+state_dict = load_state_dict(model_file)
+# NOTE(review): `_load_pretrained_model` is a private helper and its 5-tuple
+# return shape is assumed here — confirm against the pinned diffusers version.
+model, _, _, _, _ = ControlNetModel_Union._load_pretrained_model(
+    controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0"
+)
+model.to(device="cuda", dtype=torch.float16)
+# VAE loaded in float16 and handed to the pipeline below.
+vae = AutoencoderKL.from_pretrained(
+    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
+).to("cuda")
+# Fill pipeline: RealVisXL Lightning checkpoint combined with the VAE and the
+# ControlNet loaded above.
+pipe = StableDiffusionXLFillPipeline.from_pretrained(
+    "SG161222/RealVisXL_V5.0_Lightning",
+    torch_dtype=torch.float16,
+    vae=vae,
+    controlnet=model,
+    variant="fp16",
+).to("cuda")
+# Replace the pipeline's scheduler with a TCDScheduler built from the existing
+# scheduler config.
+# NOTE(review): `algorithm_type` / `use_karras_sigmas` look like DPMSolver
+# options — confirm TCDScheduler actually consumes them.
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config,algorithm_type="dpmsolver++",use_karras_sigmas=True)
+# The models above are moved to "cuda" unconditionally; `device` (with its CPU
+# fallback) is only used for the RealESRGAN upscalers created below.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# 2x and 4x RealESRGAN upscalers; weights are downloaded on first load.
+model2 = RealESRGAN(device, scale=2)
+model2.load_weights('weights/RealESRGAN_x2.pth', download=True)
+model4 = RealESRGAN(device, scale=4)
+model4.load_weights('weights/RealESRGAN_x4.pth', download=True)
+@spaces.GPU
+def inference(image, size):
+    global model2
+    global model4
+    global model8
+    if image is None:
+        raise gr.Error("Image not uploaded")
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    if size == '2x':
+        try:
+            result = model2.predict(image.convert('RGB'))
+        except torch.cuda.OutOfMemoryError as e:
+            print(e)
+            model2 = RealESRGAN(device, scale=2)
+            model2.load_weights('weights/RealESRGAN_x2.pth', download=False)
+            result = model2.predict(image.convert('RGB'))
+    elif size == '4x':
+        try:
+            result = model4.predict(image.convert('RGB'))
+        except torch.cuda.OutOfMemoryError as e:
+            print(e)
+            model4 = RealESRGAN(device, scale=4)
+            model4.load_weights('weights/RealESRGAN_x4.pth', download=False)
+            result = model2.predict(image.convert('RGB'))
+    print(f"Image size ({device}): {size} ... OK")
+    return result
+def add_watermark(image, text="ProFaker", font_path="BRLNSDB.TTF", font_size=25):
+    # Load the Berlin Sans Demi font with the specified size
+    font = ImageFont.truetype(font_path, font_size)
+    # Position the watermark in the bottom right corner, adjusting for text size
+    text_bbox = font.getbbox(text)
+    text_width, text_height = text_bbox[2], text_bbox[3]
+    watermark_position = (image.width - text_width - 100, image.height - text_height - 150)
+    # Draw the watermark text with a translucent white color
+    draw = ImageDraw.Draw(image)
+    draw.text(watermark_position, text, font=font, fill=(255, 255, 255, 150))  # RGBA for transparency
+    return image
+@spaces.GPU
+def fill_image(prompt, negative_prompt, image, model_selection, paste_back, guidance_scale, num_steps, size):
+    (
+        prompt_embeds,
+        negative_prompt_embeds,
+        pooled_prompt_embeds,
+        negative_pooled_prompt_embeds,
+    ) = pipe.encode_prompt(prompt, "cuda", True,negative_prompt=negative_prompt)
+    source = image["background"]
+    mask = image["layers"][0]
+    alpha_channel = mask.split()[3]
+    binary_mask = alpha_channel.point(lambda p: p > 0 and 255)
+    cnet_image = source.copy()
+    cnet_image.paste(0, (0, 0), binary_mask)
+    for image in pipe(
+        prompt_embeds=prompt_embeds,
+        negative_prompt_embeds=negative_prompt_embeds,
+        pooled_prompt_embeds=pooled_prompt_embeds,
+        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        image=cnet_image,
+        guidance_scale = guidance_scale,
+        num_inference_steps = num_steps,
+    ):
+        yield image, cnet_image
+    print(f"{model_selection=}")
+    print(f"{paste_back=}")
+    if paste_back:
+        image = image.convert("RGBA")
+        cnet_image.paste(image, (0, 0), binary_mask)
+    else:
+        cnet_image = image
+    cnet_image = add_watermark(cnet_image)
+    if size !="0":
+        cnet_image = inference(cnet_image,size)
+    yield source, cnet_image
+def clear_result():
+    return gr.update(value=None)
+title = """<h1 align="center">ProFaker</h1>"""
+with gr.Blocks() as demo:
+    gr.HTML(title)
+    with gr.Row():
+        with gr.Column():
+            prompt = gr.Textbox(
+                label="Prompt",
+                info="Describe what to inpaint the mask with",
+                lines=3,
+            )
+            with gr.Accordion("Advanced Options", open=False):
+                negative_prompt = gr.Textbox(
+                    label="Negative Prompt",
+                    info="Describe what you dont want in the mask",
+                    lines=3,
+                )
+                guidance_scale = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    value=1.5,
+                    step=0.1,
+                    label="Guidance Scale"
+                )
+                num_steps = gr.Slider(
+                    minimum=5,
+                    maximum=100,
+                    value=10,
+                    step=1,
+                    label="Steps"
+                )
+                size = gr.Radio(["0", "2x", "4x"], type="value", value="0", label="Image Quality")
+            input_image = gr.ImageMask(
+                type="pil", label="Input Image", crop_size=(1024,1024), layers=False
+            )
+        with gr.Column():
+            model_selection = gr.Dropdown(
+                choices=list(MODELS.keys()),
+                value="RealVisXL V5.0 Lightning",
+                label="Model",
+            )
+            with gr.Row():
+                with gr.Column():
+                    run_button = gr.Button("Generate")
+                with gr.Column():
+                    paste_back = gr.Checkbox(True, label="Paste back original")
+            result = ImageSlider(
+                interactive=False,
+                label="Generated Image",
+                type="pil"
+            )
+    use_as_input_button = gr.Button("Use as Input Image", visible=False)
+    def use_output_as_input(output_image):
+        return gr.update(value=output_image[1])
+    use_as_input_button.click(
+        fn=use_output_as_input, inputs=[result], outputs=[input_image]
+    )
+    run_button.click(
+        fn=clear_result,
+        inputs=None,
+        outputs=result,
+    ).then(
+        fn=lambda: gr.update(visible=False),
+        inputs=None,
+        outputs=use_as_input_button,
+    ).then(
+        fn=fill_image,
+        inputs=[prompt, negative_prompt, input_image, model_selection, paste_back, guidance_scale, num_steps, size],
+        outputs=result,
+    ).then(
+        fn=lambda: gr.update(visible=True),
+        inputs=None,
+        outputs=use_as_input_button,
+    )
+    prompt.submit(
+        fn=clear_result,
+        inputs=None,
+        outputs=result,
+    ).then(
+        fn=lambda: gr.update(visible=False),
+        inputs=None,
+        outputs=use_as_input_button,
+    ).then(
+        fn=fill_image,
+        inputs=[prompt, negative_prompt, input_image, model_selection, paste_back, guidance_scale, num_steps, size],
+        outputs=result,
+    ).then(
+        fn=lambda: gr.update(visible=True),
+        inputs=None,
+        outputs=use_as_input_button,
+    )
+demo.queue(max_size=12).launch(share=False)

pipeline_fill_sd_xl.py ADDED Viewed

	@@ -0,0 +1,521 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional, Union
+import cv2
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers import DPMSolverMultistepScheduler
+from diffusers.utils.torch_utils import randn_tensor
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+from controlnet_union import ControlNetModel_Union
+def latents_to_rgb(latents):
+    weights = ((60, -60, 25, -70), (60, -5, 15, -50), (60, 10, -5, -35))
+    weights_tensor = torch.t(
+        torch.tensor(weights, dtype=latents.dtype).to(latents.device)
+    )
+    biases_tensor = torch.tensor((150, 140, 130), dtype=latents.dtype).to(
+        latents.device
+    )
+    rgb_tensor = torch.einsum(
+        "...lxy,lr -> ...rxy", latents, weights_tensor
+    ) + biases_tensor.unsqueeze(-1).unsqueeze(-1)
+    image_array = rgb_tensor.clamp(0, 255)[0].byte().cpu().numpy()
+    image_array = image_array.transpose(1, 2, 0)  # Change the order of dimensions
+    denoised_image = cv2.fastNlMeansDenoisingColored(image_array, None, 10, 10, 7, 21)
+    blurred_image = cv2.GaussianBlur(denoised_image, (5, 5), 0)
+    final_image = PIL.Image.fromarray(blurred_image)
+    width, height = final_image.size
+    final_image = final_image.resize(
+        (width * 8, height * 8), PIL.Image.Resampling.LANCZOS
+    )
+    return final_image
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    **kwargs,
+):
+    scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+    timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+class StableDiffusionXLFillPipeline(DiffusionPipeline, StableDiffusionMixin):
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+    ]
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        text_encoder_2: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        tokenizer_2: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: ControlNetModel_Union,
+        scheduler: DPMSolverMultistepScheduler,
+        force_zeros_for_empty_prompt: bool = True,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            text_encoder_2=text_encoder_2,
+            tokenizer=tokenizer,
+            tokenizer_2=tokenizer_2,
+            unet=unet,
+            controlnet=controlnet,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True
+        )
+        self.control_image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor,
+            do_convert_rgb=True,
+            do_normalize=False,
+        )
+        self.register_to_config(
+            force_zeros_for_empty_prompt=force_zeros_for_empty_prompt
+        )
+    def encode_prompt(
+        self,
+        prompt: str,
+        device: Optional[torch.device] = None,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[str] = None,
+    ):
+        device = device or self._execution_device
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            raise ValueError("Prompt cannot be None")
+        # Handle negative prompt
+        if negative_prompt is None:
+            negative_prompt = "" if do_classifier_free_guidance else None
+        negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+        # Define tokenizers and text encoders
+        tokenizers = (
+            [self.tokenizer, self.tokenizer_2]
+            if self.tokenizer is not None
+            else [self.tokenizer_2]
+        )
+        text_encoders = (
+            [self.text_encoder, self.text_encoder_2]
+            if self.text_encoder is not None
+            else [self.text_encoder_2]
+        )
+        prompt_2 = prompt
+        prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+        negative_prompt_2 = negative_prompt
+        negative_prompt_2 = [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+        # Process prompt embeddings
+        prompt_embeds_list = []
+        prompts = [prompt, prompt_2]
+        for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+            text_inputs = tokenizer(
+                prompt,
+                padding="max_length",
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_embeds = text_encoder(
+                text_input_ids.to(device),
+                output_hidden_states=True,
+            )
+            # We are only ALWAYS interested in the pooled output of the final text encoder
+            pooled_prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds.hidden_states[-2]
+            prompt_embeds_list.append(prompt_embeds)
+        prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
+        # Process negative prompt embeddings
+        negative_prompt_embeds_list = []
+        if do_classifier_free_guidance:
+            negative_prompts = [negative_prompt, negative_prompt_2]
+            for neg_prompt, tokenizer, text_encoder in zip(negative_prompts, tokenizers, text_encoders):
+                uncond_input = tokenizer(
+                    neg_prompt,
+                    padding="max_length",
+                    max_length=text_inputs.input_ids.shape[-1],
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                negative_prompt_embeds = text_encoder(
+                    uncond_input.input_ids.to(device),
+                    output_hidden_states=True,
+                )
+                # Get pooled and hidden state embeddings
+                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+                negative_prompt_embeds_list.append(negative_prompt_embeds)
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
+        else:
+            negative_prompt_embeds = None
+            negative_pooled_prompt_embeds = None
+        # Convert to proper dtype
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        if negative_prompt_embeds is not None:
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
+        # Reshape embeddings
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, 1, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * 1, seq_len, -1)
+        if do_classifier_free_guidance:
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, 1, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * 1, seq_len, -1)
+        # Handle pooled embeddings
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, 1).view(bs_embed * 1, -1)
+        if do_classifier_free_guidance:
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, 1).view(bs_embed * 1, -1)
+        return (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        )
+    def check_inputs(
+        self,
+        prompt_embeds,
+        negative_prompt_embeds,
+        pooled_prompt_embeds,
+        negative_pooled_prompt_embeds,
+        image,
+        controlnet_conditioning_scale=1.0,
+    ):
+        if prompt_embeds is None:
+            raise ValueError(
+                "Provide `prompt_embeds`. Cannot leave `prompt_embeds` undefined."
+            )
+        if negative_prompt_embeds is None:
+            raise ValueError(
+                "Provide `negative_prompt_embeds`. Cannot leave `negative_prompt_embeds` undefined."
+            )
+        if prompt_embeds.shape != negative_prompt_embeds.shape:
+            raise ValueError(
+                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                f" {negative_prompt_embeds.shape}."
+            )
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+        # Check `image`
+        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
+            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
+        )
+        if (
+            isinstance(self.controlnet, ControlNetModel_Union)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel_Union)
+        ):
+            if not isinstance(image, PIL.Image.Image):
+                raise TypeError(
+                    f"image must be passed and has to be a PIL image, but is {type(image)}"
+                )
+        else:
+            assert False
+        # Check `controlnet_conditioning_scale`
+        if (
+            isinstance(self.controlnet, ControlNetModel_Union)
+            or is_compiled
+            and isinstance(self.controlnet._orig_mod, ControlNetModel_Union)
+        ):
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError(
+                    "For single controlnet: `controlnet_conditioning_scale` must be type `float`."
+                )
+        else:
+            assert False
+    def prepare_image(self, image, device, dtype, do_classifier_free_guidance=False):
+        image = self.control_image_processor.preprocess(image).to(dtype=torch.float32)
+        image_batch_size = image.shape[0]
+        image = image.repeat_interleave(image_batch_size, dim=0)
+        image = image.to(device=device, dtype=dtype)
+        if do_classifier_free_guidance:
+            image = torch.cat([image] * 2)
+        return image
+    def prepare_latents(
+        self, batch_size, num_channels_latents, height, width, dtype, device
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        latents = randn_tensor(shape, device=device, dtype=dtype)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt_embeds: torch.Tensor,
+        negative_prompt_embeds: torch.Tensor,
+        pooled_prompt_embeds: torch.Tensor,
+        negative_pooled_prompt_embeds: torch.Tensor,
+        image: PipelineImageInput = None,
+        num_inference_steps: int = 15,
+        guidance_scale: float = 1.5,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+    ):
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+            image,
+            controlnet_conditioning_scale,
+        )
+        self._guidance_scale = guidance_scale
+        # 2. Define call parameters
+        batch_size = 1
+        device = self._execution_device
+        # 4. Prepare image
+        if isinstance(self.controlnet, ControlNetModel_Union):
+            image = self.prepare_image(
+                image=image,
+                device=device,
+                dtype=self.controlnet.dtype,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+            )
+            height, width = image.shape[-2:]
+        else:
+            assert False
+        # 5. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device
+        )
+        self._num_timesteps = len(timesteps)
+        # 6. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+        )
+        # 7 Prepare added time ids & embeddings
+        add_text_embeds = pooled_prompt_embeds
+        add_time_ids = negative_add_time_ids = torch.tensor(
+            image.shape[-2:] + torch.Size([0, 0]) + image.shape[-2:]
+        ).unsqueeze(0)
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            add_text_embeds = torch.cat(
+                [negative_pooled_prompt_embeds, add_text_embeds], dim=0
+            )
+            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+        prompt_embeds = prompt_embeds.to(device)
+        add_text_embeds = add_text_embeds.to(device)
+        add_time_ids = add_time_ids.to(device).repeat(batch_size, 1)
+        controlnet_image_list = [0, 0, 0, 0, 0, 0, image, 0]
+        union_control_type = (
+            torch.Tensor([0, 0, 0, 0, 0, 0, 1, 0])
+            .to(device, dtype=prompt_embeds.dtype)
+            .repeat(batch_size * 2, 1)
+        )
+        added_cond_kwargs = {
+            "text_embeds": add_text_embeds,
+            "time_ids": add_time_ids,
+            "control_type": union_control_type,
+        }
+        controlnet_prompt_embeds = prompt_embeds
+        controlnet_added_cond_kwargs = added_cond_kwargs
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = (
+                    torch.cat([latents] * 2)
+                    if self.do_classifier_free_guidance
+                    else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+                # controlnet(s) inference
+                control_model_input = latent_model_input
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    control_model_input,
+                    t,
+                    encoder_hidden_states=controlnet_prompt_embeds,
+                    controlnet_cond_list=controlnet_image_list,
+                    conditioning_scale=controlnet_conditioning_scale,
+                    guess_mode=False,
+                    added_cond_kwargs=controlnet_added_cond_kwargs,
+                    return_dict=False,
+                )
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=None,
+                    cross_attention_kwargs={},
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, return_dict=False
+                )[0]
+                if i == 2:
+                    prompt_embeds = prompt_embeds[-1:]
+                    add_text_embeds = add_text_embeds[-1:]
+                    add_time_ids = add_time_ids[-1:]
+                    union_control_type = union_control_type[-1:]
+                    added_cond_kwargs = {
+                        "text_embeds": add_text_embeds,
+                        "time_ids": add_time_ids,
+                        "control_type": union_control_type,
+                    }
+                    controlnet_prompt_embeds = prompt_embeds
+                    controlnet_added_cond_kwargs = added_cond_kwargs
+                    image = image[-1:]
+                    controlnet_image_list = [0, 0, 0, 0, 0, 0, image, 0]
+                    self._guidance_scale = 0.0
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
+                    progress_bar.update()
+                    yield latents_to_rgb(latents)
+        latents = latents / self.vae.config.scaling_factor
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = self.image_processor.postprocess(image)[0]
+        # Offload all models
+        self.maybe_free_model_hooks()
+        yield image