Spaces:

dbaranchuk
/

instruct-p2p-distill

Runtime error

File size: 15,922 Bytes

import spaces
import gradio as gr
import numpy as np
import random
import torch
from diffusers import DDPMScheduler, StableDiffusionPipeline, DDIMScheduler, UNet2DConditionModel
import p2p, generation, inversion

# Base Stable Diffusion checkpoint; also the source of the scheduler configs.
model_id = 'runwayml/stable-diffusion-v1-5'
dtype = torch.float16
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hub repositories shared by both pipelines; only the LoRA file differs.
_DISTILLED_UNET_REPO = "dbaranchuk/sd15-cfg-distill-unet"
_ICD_LORA_REPO = "dbaranchuk/icd-lora-sd15"


def _load_icd_pipeline(lora_weight_name):
    """Build one Stable Diffusion pipeline with the distilled UNet and a fused LoRA.

    The pipeline is created from ``model_id`` with a DDIM scheduler, its UNet
    is replaced by the one from ``_DISTILLED_UNET_REPO``, and the LoRA file
    ``lora_weight_name`` from ``_ICD_LORA_REPO`` is loaded and fused.

    Args:
        lora_weight_name: File name of the LoRA weights inside
            ``_ICD_LORA_REPO``.

    Returns:
        The configured ``StableDiffusionPipeline`` placed on ``device``.
    """
    scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id,
        scheduler=scheduler,
    ).to(device=device, dtype=dtype)

    # NOTE(review): the replacement UNet is loaded without an explicit dtype
    # while the rest of the pipeline was cast to `dtype` above -- confirm that
    # this mix is intentional.
    pipe.unet = UNet2DConditionModel.from_pretrained(_DISTILLED_UNET_REPO).to(device)

    pipe.load_lora_weights(_ICD_LORA_REPO, weight_name=lora_weight_name)
    pipe.fuse_lora()
    pipe.to(device)
    return pipe


# Reverse
# -----------------------------
pipe_reverse = _load_icd_pipeline('reverse-259-519-779-999.safetensors')
# -----------------------------

# Forward
# -----------------------------
pipe_forward = _load_icd_pipeline('forward-19-259-519-779.safetensors')
# -----------------------------

# Kept for backward compatibility: the module-level name `unet` used to end up
# bound to the forward pipeline's UNet.
unet = pipe_forward.unet

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

def _configure_p2p(num_ddim_steps, tokenizer):
    """Set the module-level knobs of ``p2p`` for the next stage of the run."""
    p2p.NUM_DDIM_STEPS = num_ddim_steps
    p2p.tokenizer = tokenizer
    # Use the device the pipelines were placed on instead of a hard-coded
    # 'cuda', so p2p stays consistent with the module-level `device`.
    p2p.device = device


@spaces.GPU(duration=30)
def infer(image_path, input_prompt, edited_prompt, guidance, tau,
          crs, srs, amplify_factor, amplify_word,
          blend_orig, blend_edited, is_replacement):
    """Invert the input image and re-generate it under the edited prompt.

    The image is first inverted with ``inversion.invert`` under
    ``input_prompt``; the resulting latent is then passed to
    ``generation.runner`` together with both prompts and an attention
    controller built by ``p2p.make_controller``.

    Args:
        image_path: Input image, forwarded unchanged to ``inversion.invert``.
        input_prompt: Prompt describing the original image.
        edited_prompt: Prompt describing the desired result.
        guidance: Guidance value from the UI; ``guidance - 1`` is what is
            passed to ``generation.runner`` as ``guidance_scale``.
        tau: Dynamic-guidance threshold, used for both ``tau1`` and ``tau2``.
        crs: Cross-attention replace steps (the ``'default_'`` entry).
        srs: Self-attention replace steps.
        amplify_factor: Value paired with ``amplify_word`` in ``eq_params``.
        amplify_word: Word whose entry in ``eq_params`` gets ``amplify_factor``.
        blend_orig: Blend word associated with the original prompt.
        blend_edited: Blend word associated with the edited prompt.
        is_replacement: Forwarded to ``p2p.make_controller``; True if only
            one word is changed between the prompts.

    Returns:
        The result of ``generation.to_pil_images`` applied to the item at
        index 1 of the generated batch (presumably the image for
        ``edited_prompt``, the second of the two prompts -- confirm against
        ``generation.runner``).
    """
    tokenizer = pipe_forward.tokenizer
    noise_scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")

    NUM_REVERSE_CONS_STEPS = 4
    REVERSE_TIMESTEPS = [259, 519, 779, 999]
    NUM_FORWARD_CONS_STEPS = 4
    FORWARD_TIMESTEPS = [19, 259, 519, 779]
    NUM_DDIM_STEPS = 50

    solver = generation.Generator(
        model=pipe_forward,
        noise_scheduler=noise_scheduler,
        n_steps=NUM_DDIM_STEPS,
        forward_cons_model=pipe_forward,
        forward_timesteps=FORWARD_TIMESTEPS,
        reverse_cons_model=pipe_reverse,
        reverse_timesteps=REVERSE_TIMESTEPS,
        num_endpoints=NUM_REVERSE_CONS_STEPS,
        num_forward_endpoints=NUM_FORWARD_CONS_STEPS,
        max_forward_timestep_index=49,
        start_timestep=19,
    )

    # Stage 1: inversion of the input image under the original prompt.
    _configure_p2p(NUM_DDIM_STEPS, tokenizer)

    (image_gt, image_rec), ddim_latent, uncond_embeddings = inversion.invert(
        # Playing params
        image_path=image_path,
        prompt=[input_prompt],

        # Fixed params
        is_cons_inversion=True,
        w_embed_dim=512,
        inv_guidance_scale=0.0,
        stop_step=50,
        solver=solver,
        seed=10500,
    )

    # Stage 2: generation; p2p is switched to 4 steps before the controller
    # is built.
    _configure_p2p(4, tokenizer)

    prompts = [input_prompt, edited_prompt]

    # Playing params
    cross_replace_steps = {'default_': crs}
    self_replace_steps = srs
    blend_word = ((blend_orig,), (blend_edited,))
    eq_params = {"words": (amplify_word,), "values": (amplify_factor,)}

    controller = p2p.make_controller(
        prompts,
        is_replacement,  # True if only one word is changed
        cross_replace_steps,
        self_replace_steps,
        blend_word,
        eq_params,
    )

    image, _ = generation.runner(
        # Playing params
        guidance_scale=guidance - 1,
        tau1=tau,  # Dynamic guidance if tau < 1.0
        tau2=tau,

        # Fixed params
        model=pipe_reverse,
        is_cons_forward=True,
        w_embed_dim=512,
        solver=solver,
        prompt=prompts,
        controller=controller,
        num_inference_steps=50,
        generator=None,
        latent=ddim_latent,
        uncond_embeddings=uncond_embeddings,
        return_type='image',
    )

    return generation.to_pil_images(image[1, :, :, :])

# Stylesheet passed to gr.Blocks: centers the #col-container column and caps
# its width.
css = (
    "\n"
    "#col-container {\n"
    "    margin: 0 auto;\n"
    "    max-width: 1024px;\n"
    "}\n"
)

# Human-readable name of the device shown in the page header.
power_device = "GPU" if torch.cuda.is_available() else "CPU"

# Rows offered in the example gallery. Each row lists its values in the same
# order as the `edit_inputs` components defined in the UI below:
#   input image, original prompt, edited prompt,
#   guidance scale, tau, crs, srs, amplify factor, amplify word,
#   blended word 1 (original), blended word 2 (edited), is replacement
examples = [
    ["examples/orig_3.jpg",
     "a photo of a basket of apples",
     "a photo of a basket of oranges",
     20, 0.6, 0.4, 0.6, 1, 'oranges', '', 'oranges', False],
    ["examples/orig_3.jpg",
     "a photo of a basket of apples",
     "a photo of a basket of puppies",
     20, 0.6, 0.4, 0.1, 2, 'puppies', '', 'puppies', True],
    ["examples/orig_3.jpg",
     "a photo of a basket of apples",
     "a photo of a basket of apples under snowfall",
     20, 0.6, 0.4, 0.4, 30, 'snowfall', '', 'snowfall', False],
    ["examples/orig_1.jpg",
     "a photo of an owl",
     "a photo of an yellow owl",
     20, 0.6, 0.9, 0.9, 20, 'yellow', 'owl', 'yellow', False],
    ["examples/orig_1.jpg",
     "a photo of an owl",
     "an anime-style painting of an owl",
     20, 0.8, 0.6, 0.3, 10, 'anime-style', 'painting', 'anime-style', False],
    ["examples/orig_1.jpg",
     "a photo of an owl",
     "a photo of an owl underwater with many fishes nearby",
     20, 0.8, 0.4, 0.4, 18, 'fishes', '', 'fishes', False],
    ["examples/orig_2.jpg",
     "a photograph of a teddy bear sitting on a wall",
     "a photograph of a teddy bear sitting on a wall surrounded by roses",
     20, 0.6, 0.4, 0.1, 25, 'roses', '', 'roses', False],
    ["examples/orig_2.jpg",
     "a photograph of a teddy bear sitting on a wall",
     "a photograph of a wooden bear sitting on a wall",
     20, 0.8, 0.5, 0.5, 14, 'wooden', '', 'wooden', True],
    ["examples/orig_2.jpg",
     "a photograph of a teddy bear sitting on a wall",
     "a photograph of a teddy rabbit sitting on a wall",
     20, 0.8, 0.4, 0.4, 3, 'rabbit', '', 'rabbit', True],
]

# Page layout: header text, prompt boxes, input/result images, advanced
# settings, the "Edit" button and the example gallery.
with gr.Blocks(css=css) as demo:

    with gr.Column(elem_id="col-container"):
        gr.Markdown(
            f"""
        # ⚡ Invertible Consistency Distillation ⚡ 
        # ⚡ Text-guided image editing with 8-step iCD-SD1.5 ⚡
        This is a demo for [Invertible Consistency Distillation](https://yandex-research.github.io/invertible-cd/), 
        a diffusion distillation method proposed in [Invertible Consistency Distillation for Text-Guided Image Editing in Around 7 Steps](https://arxiv.org/abs/2406.14539)
        by [Yandex Research](https://github.com/yandex-research).
        Currently running on {power_device}
        """
        )
        gr.Markdown(
            "**Please** check the examples to catch the intuition behind the hyperparameters, which are quite important for successful editing. A short description: <br />1. *Dynamic guidance tau*. Controls the interval where guidance is applied: if t < tau, then guidance is turned on for t < tau."
            " Lower tau values provide better reference preservation. We commonly use tau=0.6 and tau=0.8. <br />"
            "2. *Cross replace steps (crs)* and *self replace steps (srs)*. Controls the time step interval "
            "where the cross- and self-attention maps are replaced. Higher values lead to better preservation of the reference image. "
            "The optimal values depend on the particular image. "
            "Mostly, we use crs and srs from 0.2 to 0.6. <br />"
            "3. *Amplify word* and *Amplify factor*. Define the word that needs to be enhanced in the edited image. <br />"
            "4. *Blended word*. Specifies the object used for making local edits. That is, edit only selected objects. <br />"
            "5. *Is replacement*. You can set True, if you replace only one word in the original prompt. But False also works in these cases."
        )
        gr.Markdown(
            "Feel free to check out our [image generation demo](https://huggingface.co/spaces/dbaranchuk/iCD-image-generation) as well."
        )
        gr.Markdown(
            "If you enjoy the space, feel free to give a ⭐ to the <a href='https://github.com/yandex-research/invertible-cd' target='_blank'>Github Repo</a>. [![GitHub Stars](https://img.shields.io/github/stars/yandex-research/invertible-cd?style=social)](https://github.com/yandex-research/invertible-cd)"
        )

        with gr.Row():
            input_prompt = gr.Text(
                label="Original prompt",
                max_lines=1,
                placeholder="Enter your prompt",
            )
            prompt = gr.Text(
                label="Edited prompt",
                max_lines=1,
                placeholder="Enter your prompt",
            )

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Input image", height=512, width=512, show_label=False)
            with gr.Column():
                result = gr.Image(label="Result", height=512, width=512, show_label=False)

        with gr.Accordion("Advanced Settings", open=True):

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=1.0,
                    maximum=20.0,
                    step=1.0,
                    value=20.0,
                )
                tau = gr.Slider(
                    label="Dynamic guidance tau",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.2,
                    value=0.8,
                )

            with gr.Row():
                crs = gr.Slider(
                    label="Cross replace steps",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=0.4,
                )
                srs = gr.Slider(
                    label="Self replace steps",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=0.4,
                )

            with gr.Row():
                amplify_word = gr.Text(
                    label="Amplify word",
                    max_lines=1,
                    placeholder="Enter your word",
                )
                amplify_factor = gr.Slider(
                    label="Amplify factor",
                    minimum=0.0,
                    maximum=30,
                    step=1.0,
                    value=1,
                )

            with gr.Row():
                blend_orig = gr.Text(
                    label="Blended word 1",
                    max_lines=1,
                    placeholder="Enter your word",
                )
                blend_edited = gr.Text(
                    label="Blended word 2",
                    max_lines=1,
                    placeholder="Enter your word",
                )

            with gr.Row():
                is_replacement = gr.Checkbox(label="Is replacement?", value=False)

        with gr.Row():
            run_button = gr.Button("Edit", scale=0)

        # Single source of truth for the components fed to `infer`, in the
        # order of its parameters; shared by the example gallery and the
        # button so the two wirings cannot drift apart.
        edit_inputs = [
            input_image, input_prompt, prompt,
            guidance_scale, tau, crs, srs, amplify_factor, amplify_word,
            blend_orig, blend_edited, is_replacement,
        ]

        with gr.Row():
            gr.Examples(
                examples=examples,
                inputs=edit_inputs,
                outputs=[result],
                fn=infer,
                cache_examples=True,
            )

    run_button.click(
        fn=infer,
        inputs=edit_inputs,
        outputs=[result],
    )

demo.queue().launch()