outpaint-video-zoom

Runtime error

File size: 9,791 Bytes

import gradio as gr
import spaces
import torch
from diffusers import AutoencoderKL, TCDScheduler
from diffusers.models.model_loading_utils import load_state_dict
from gradio_imageslider import ImageSlider
from huggingface_hub import hf_hub_download

from controlnet_union import ControlNetModel_Union
from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline

from PIL import Image, ImageDraw
import numpy as np

# Display name -> Hugging Face repo id of the base checkpoints offered in the UI.
# NOTE(review): within the visible file this mapping only feeds the "Model"
# dropdown; the pipeline below is built from a hard-coded repo id.
MODELS = {
    "RealVisXL V5.0 Lightning": "SG161222/RealVisXL_V5.0_Lightning",
}

# Fetch the "promax" ControlNet-Union config file from the Hugging Face Hub.
config_file = hf_hub_download(
    "xinsir/controlnet-union-sdxl-1.0",
    filename="config_promax.json",
)

# Instantiate the ControlNet from its config, then fetch the matching
# weights file and load it into that instance.
config = ControlNetModel_Union.load_config(config_file)
controlnet_model = ControlNetModel_Union.from_config(config)
model_file = hf_hub_download(
    "xinsir/controlnet-union-sdxl-1.0",
    filename="diffusion_pytorch_model_promax.safetensors",
)
state_dict = load_state_dict(model_file)
# NOTE(review): `_load_pretrained_model` is an underscore-prefixed (private)
# classmethod; its argument list and the 5-tuple unpacked here presumably
# depend on the installed diffusers version -- confirm against the pinned one.
model, _, _, _, _ = ControlNetModel_Union._load_pretrained_model(
    controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0"
)
# Move the ControlNet to the GPU in half precision.
model.to(device="cuda", dtype=torch.float16)

# Separate VAE checkpoint, loaded in float16 and moved to the GPU.
vae = AutoencoderKL.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
).to("cuda")

# Assemble the fill pipeline from the base checkpoint plus the VAE and
# ControlNet created above, then move it to the GPU.
pipe = StableDiffusionXLFillPipeline.from_pretrained(
    "SG161222/RealVisXL_V5.0_Lightning",
    torch_dtype=torch.float16,
    vae=vae,
    controlnet=model,
    variant="fp16",
).to("cuda")

# Swap the pipeline's scheduler for TCD, reusing the existing scheduler config.
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)

def resize_and_pad(image, target_size=(1024, 1024), resize_width=512):
    """Resize ``image`` to ``resize_width`` and centre it on a white canvas.

    Args:
        image: PIL image to place on the canvas.
        target_size: ``(width, height)`` of the canvas in pixels.
        resize_width: Width in pixels the image is resized to; the height
            follows from the image's aspect ratio.

    Returns:
        ``(canvas, mask)``: the RGB canvas with the resized image pasted in
        the centre, and an ``"L"`` mask of the same size that is 0 over the
        pasted image and 255 everywhere else.
    """
    aspect_ratio = image.height / image.width
    # Clamp to one pixel: a very wide input would otherwise truncate to a
    # zero height, which Image.resize rejects.
    new_height = max(1, int(resize_width * aspect_ratio))

    resized_image = image.resize((resize_width, new_height), Image.LANCZOS)

    new_image = Image.new('RGB', target_size, (255, 255, 255))

    paste_x = (target_size[0] - resize_width) // 2
    paste_y = (target_size[1] - new_height) // 2

    new_image.paste(resized_image, (paste_x, paste_y))

    mask = Image.new('L', target_size, 255)
    mask_draw = ImageDraw.Draw(mask)
    # ImageDraw.rectangle includes both corners, so the far corner must be the
    # last pixel of the pasted image (hence the -1). Without it, a one-pixel
    # strip of blank canvas would be marked as known content.
    mask_draw.rectangle(
        [
            paste_x,
            paste_y,
            paste_x + resize_width - 1,
            paste_y + new_height - 1,
        ],
        fill=0,
    )

    return new_image, mask

@spaces.GPU
def infer(image, model_selection, width, height, overlap_width, num_inference_steps, prompt_input=None, expand_mode=False):
    """Outpaint ``image`` and stream ``(before, after)`` pairs as a generator.

    Args:
        image: Input PIL image, or ``None`` when nothing was uploaded.
        model_selection: Accepted for interface compatibility; not used here.
        width: Target canvas width in pixels (ignored in expand mode).
        height: Target canvas height in pixels (ignored in expand mode).
        overlap_width: Number of pixels along each edge of the placed source
            that stay inside the masked region (ignored in expand mode).
        num_inference_steps: Step count forwarded to the pipeline.
        prompt_input: Optional text appended to the base prompt.
        expand_mode: When true, the canvas and mask come from
            ``resize_and_pad`` with its default arguments.

    Yields:
        ``(cnet_image, intermediate)`` for every item the pipeline produces,
        then a final ``(background, composite)`` pair.

    Raises:
        gr.Error: If no input image was provided, or the pipeline produced
            no image at all.
    """
    if image is None:
        # Generate was triggered without an upload; show a readable message
        # instead of failing with an AttributeError on None.
        raise gr.Error("Please upload an input image first.")

    if expand_mode:
        background, mask = resize_and_pad(image)
    else:
        source = image
        target_size = (width, height)

        # Fit the source inside the target while keeping its aspect ratio:
        # upscale when it is smaller on both axes, downscale when it
        # overflows on either axis. Both cases use the same scale factor.
        smaller_on_both = source.width < target_size[0] and source.height < target_size[1]
        larger_on_any = source.width > target_size[0] or source.height > target_size[1]
        if smaller_on_both or larger_on_any:
            scale_factor = min(target_size[0] / source.width, target_size[1] / source.height)
            # Clamp to one pixel so extreme aspect ratios cannot truncate to
            # a zero-sized image, which Image.resize rejects.
            new_width = max(1, int(source.width * scale_factor))
            new_height = max(1, int(source.height * scale_factor))
            source = source.resize((new_width, new_height), Image.LANCZOS)

        margin_x = (target_size[0] - source.width) // 2
        margin_y = (target_size[1] - source.height) // 2

        background = Image.new('RGB', target_size, (255, 255, 255))
        background.paste(source, (margin_x, margin_y))

        # Cap the overlap at half of each source dimension so the rectangle's
        # second corner never ends up before its first corner.
        overlap_x = min(overlap_width, source.width // 2)
        overlap_y = min(overlap_width, source.height // 2)

        mask = Image.new('L', target_size, 255)
        mask_draw = ImageDraw.Draw(mask)
        mask_draw.rectangle([
            (margin_x + overlap_x, margin_y + overlap_y),
            (margin_x + source.width - overlap_x, margin_y + source.height - overlap_y)
        ], fill=0)

    # Black out everything the mask marks (255); this is the image handed to
    # the pipeline.
    cnet_image = background.copy()
    cnet_image.paste(0, (0, 0), mask)

    final_prompt = "high quality"
    if prompt_input and prompt_input.strip():
        final_prompt = f"{final_prompt}, {prompt_input}"

    # NOTE(review): the third positional argument is presumably the
    # classifier-free-guidance flag -- confirm against the pipeline class.
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = pipe.encode_prompt(final_prompt, "cuda", True)

    # Use a dedicated name for pipeline output so the ``image`` argument is
    # not shadowed and an empty pipeline run can be detected.
    generated = None
    for generated in pipe(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        image=cnet_image,
        num_inference_steps=num_inference_steps
    ):
        yield cnet_image, generated

    if generated is None:
        raise gr.Error("The pipeline did not produce an image.")

    # Paste the last generated image into the masked region only, leaving the
    # unmasked source pixels in place.
    generated = generated.convert("RGBA")
    cnet_image.paste(generated, (0, 0), mask)

    yield background, cnet_image

def preload_presets(target_ratio):
    """Map a ratio preset name to slider sizes and panel visibility updates.

    Returns a 4-tuple ``(width, height, expand_checkbox_update,
    settings_panel_update)`` for a known preset. The expand checkbox is made
    visible only for "Expand" and the settings panel is opened only for
    "Custom". Any other value falls through and returns ``None``.
    """
    preset_sizes = (
        ("9:16", 720, 1280),
        ("16:9", 1280, 720),
        ("Expand", 1024, 1024),
        ("Custom", 720, 1280),
    )
    for preset_name, preset_width, preset_height in preset_sizes:
        if target_ratio == preset_name:
            show_expand = preset_name == "Expand"
            open_settings = preset_name == "Custom"
            return (
                preset_width,
                preset_height,
                gr.update(visible=show_expand),
                gr.update(open=open_settings),
            )
    return None

def clear_result():
    """Build a Gradio update that resets a component's value to ``None``."""
    cleared = gr.update(value=None)
    return cleared

# Custom stylesheet passed to gr.Blocks: forces the app container to a fixed
# 1200px width.
css = """
.gradio-container {
    width: 1200px !important;
}
"""

# HTML header rendered at the top of the page: heading, one-line usage hint,
# and a "Duplicate this Space" badge link.
title = """<h1 align="center">Diffusers Image Outpaint</h1>
<div align="center">Drop an image you would like to extend, pick your expected ratio and hit Generate.</div>
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
    <p style="display: flex;gap: 6px;">
         <a href="https://huggingface.co/spaces/fffiloni/diffusers-image-outpout?duplicate=true">
            <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-md.svg" alt="Duplicate this Space">
        </a> to skip the queue and enjoy faster inference on the GPU of your choice 
    </p>
</div>
"""

# UI definition: one row with two columns -- inputs and settings first, the
# before/after result slider second -- followed by the event wiring.
with gr.Blocks(css=css) as demo:
    with gr.Column():
        gr.HTML(title)

        with gr.Row():
            with gr.Column():
                # Source image; upload is the only enabled source and the
                # value is delivered to callbacks as a PIL image.
                input_image = gr.Image(
                    type="pil",
                    label="Input Image",
                    sources=["upload"],
                    height = 300
                )
                
                prompt_input = gr.Textbox(label="Prompt (Optional)")
                
                with gr.Row():
                    # Preset selector; changing it drives preload_presets.
                    target_ratio = gr.Radio(
                        label = "Expected Ratio",
                        choices = ["9:16", "16:9", "Expand", "Custom"],
                        value = "9:16",
                        scale = 2
                    )
                    
                    run_button = gr.Button("Generate", scale=1)

                # Hidden by default; preload_presets makes it visible when the
                # "Expand" preset is selected.
                # NOTE(review): the preset only toggles visibility, it does not
                # tick the box -- confirm that is the intended behaviour.
                expand_mode = gr.Checkbox(label="Use Expand Mode", visible=False)

                with gr.Accordion(label="Advanced settings", open=False) as settings_panel:
                    with gr.Column(): 
                        with gr.Row():
                            # Target canvas size; both sliders move in steps
                            # of 8 between 720 and 1440.
                            width_slider = gr.Slider(
                                label="Width",
                                minimum=720,
                                maximum=1440,
                                step=8,
                                value=720,
                            )
                            height_slider = gr.Slider(
                                label="Height",
                                minimum=720,
                                maximum=1440,
                                step=8,
                                value=1280,
                            )
                        with gr.Row():
                            model_selection = gr.Dropdown(
                                choices=list(MODELS.keys()),
                                value="RealVisXL V5.0 Lightning",
                                label="Model",
                            )
                            num_inference_steps = gr.Slider(label="Steps", minimum=4, maximum=12, step=1, value=8)

                        # Forwarded to infer as overlap_width.
                        overlap_width = gr.Slider(
                            label="Mask overlap width",
                            minimum=1,
                            maximum=50,
                            value=42,
                            step=1
                        )

                # Each example fills only the image, model, width and height
                # inputs; the remaining controls keep their current values.
                gr.Examples(
                    examples=[
                        ["./examples/example_1.webp", "RealVisXL V5.0 Lightning", 1280, 720],  
                        ["./examples/example_2.jpg", "RealVisXL V5.0 Lightning", 720, 1280],  
                        ["./examples/example_3.jpg", "RealVisXL V5.0 Lightning", 1024, 1024],  
                    ],
                    inputs=[input_image, model_selection, width_slider, height_slider],
                )

            with gr.Column():
                # Read-only comparison slider that receives the image pairs
                # yielded by infer.
                result = ImageSlider(
                    interactive=False,
                    label="Generated Image",
                )

    # Selecting a preset updates the size sliders, the expand checkbox
    # visibility and the settings accordion; this handler bypasses the queue.
    target_ratio.change(
        fn = preload_presets,
        inputs = [target_ratio],
        outputs = [width_slider, height_slider, expand_mode, settings_panel],
        queue = False
    )
    
    # Generate button: clear the previous result first, then run inference.
    run_button.click(
        fn=clear_result,
        inputs=None,
        outputs=result,
    ).then(
        fn=infer,
        inputs=[input_image, model_selection, width_slider, height_slider, overlap_width, num_inference_steps, prompt_input, expand_mode],
        outputs=result,
    )

    # Submitting the prompt textbox runs the same clear-then-infer chain as
    # the Generate button.
    prompt_input.submit(
        fn=clear_result,
        inputs=None,
        outputs=result,
    ).then(
        fn=infer,
        inputs=[input_image, model_selection, width_slider, height_slider, overlap_width, num_inference_steps, prompt_input, expand_mode],
        outputs=result,
    )

# Enable the request queue (max_size=12) and start the app with error
# messages shown in the UI, no public share link, and the API page hidden.
demo.queue(max_size=12).launch(share=False, show_error=True, show_api=False)