Hatman committed on
Commit 65fe6f1 · verified · 1 Parent(s): 8e3f1b8

Upload 3 files

Files changed (2)
  1. app.py +159 -265
  2. requirements.txt +1 -11
app.py CHANGED
@@ -1,266 +1,160 @@
- import sys
- sys.path.append('./')
-
-
- import os
- import cv2
- import torch
- import random
- import numpy as np
- from PIL import Image
- from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline
-
- import spaces
- import gradio as gr
- from huggingface_hub import hf_hub_download
-
- from ip_adapter import IPAdapterXL
-
- import os
- os.system("git lfs install")
- os.system("git clone https://huggingface.co/h94/IP-Adapter")
- os.system("mv IP-Adapter/sdxl_models sdxl_models")
-
- # global variable
- MAX_SEED = np.iinfo(np.int32).max
- device = "cuda" if torch.cuda.is_available() else "cpu"
- dtype = torch.float16 if str(device).__contains__("cuda") else torch.float32
-
- # initialization
- base_model_path = "kandinsky-community/kandinsky-2-2-prior"
- image_encoder_path = "sdxl_models/image_encoder"
- ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
-
- controlnet_path = "kandinsky-community/kandinsky-2-2-controlnet-depth"
- controlnet = KandinskyV22ControlnetPipeline.from_pretrained(controlnet_path, use_safetensors=False, torch_dtype=torch.float16).to(device)
-
- # load SDXL pipeline
- pipe = KandinskyV22PriorPipeline.from_pretrained(
-     base_model_path,
-     controlnet=controlnet,
-     torch_dtype=torch.float16,
-     add_watermarker=False,
- )
-
- # load ip-adapter
- # target_blocks=["block"] for original IP-Adapter
- # target_blocks=["up_blocks.0.attentions.1"] for style blocks only
- # target_blocks = ["up_blocks.0.attentions.1", "down_blocks.2.attentions.1"] # for style+layout blocks
- ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device, target_blocks=["up_blocks.0.attentions.1"])
-
- def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-     if randomize_seed:
-         seed = random.randint(0, MAX_SEED)
-     return seed
-
- def resize_img(
-     input_image,
-     max_side=1280,
-     min_side=1024,
-     size=None,
-     pad_to_max_side=False,
-     mode=Image.BILINEAR,
-     base_pixel_number=64,
- ):
-     w, h = input_image.size
-     if size is not None:
-         w_resize_new, h_resize_new = size
-     else:
-         ratio = min_side / min(h, w)
-         w, h = round(ratio * w), round(ratio * h)
-         ratio = max_side / max(h, w)
-         input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
-         w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
-         h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
-     input_image = input_image.resize([w_resize_new, h_resize_new], mode)
-
-     if pad_to_max_side:
-         res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
-         offset_x = (max_side - w_resize_new) // 2
-         offset_y = (max_side - h_resize_new) // 2
-         res[
-             offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new
-         ] = np.array(input_image)
-         input_image = Image.fromarray(res)
-     return input_image
-
- @spaces.GPU(enable_queue=True)
- def create_image(image_pil,
-                  input_image,
-                  prompt,
-                  n_prompt,
-                  scale,
-                  control_scale,
-                  guidance_scale,
-                  num_samples,
-                  num_inference_steps,
-                  seed,
-                  target="Load only style blocks",
-                  neg_content_prompt=None,
-                  neg_content_scale=0):
-
-     if isinstance(image_pil, np.ndarray):
-         image_pil = Image.fromarray(image_pil)
-
-     if target =="Load original IP-Adapter":
-         # target_blocks=["blocks"] for original IP-Adapter
-         ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device, target_blocks=["blocks"])
-     elif target=="Load only style blocks":
-         # target_blocks=["up_blocks.0.attentions.1"] for style blocks only
-         ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device, target_blocks=["up_blocks.0.attentions.1"])
-     elif target=="Load only layout blocks":
-         # target_blocks=["up_blocks.0.attentions.1"] for style blocks only
-         ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device, target_blocks=["down_blocks.2.attentions.1"])
-     elif target == "Load style+layout block":
-         # target_blocks = ["up_blocks.0.attentions.1", "down_blocks.2.attentions.1"] # for style+layout blocks
-         ip_model = IPAdapterXL(pipe, image_encoder_path, ip_ckpt, device, target_blocks=["up_blocks.0.attentions.1", "down_blocks.2.attentions.1"])
-
-     if input_image is not None:
-         input_image = resize_img(input_image, max_side=1024)
-         cv_input_image = pil_to_cv2(input_image)
-         detected_map = cv2.Canny(cv_input_image, 50, 200)
-         canny_map = Image.fromarray(cv2.cvtColor(detected_map, cv2.COLOR_BGR2RGB))
-     else:
-         canny_map = Image.new('RGB', (1024, 1024), color=(255, 255, 255))
-         control_scale = 0
-
-     if float(control_scale) == 0:
-         canny_map = canny_map.resize((1024,1024))
-
-     if len(neg_content_prompt) > 0 and neg_content_scale != 0:
-         images = ip_model.generate(pil_image=image_pil,
-                                    prompt=prompt,
-                                    negative_prompt=n_prompt,
-                                    scale=scale,
-                                    guidance_scale=guidance_scale,
-                                    num_samples=num_samples,
-                                    num_inference_steps=num_inference_steps,
-                                    seed=seed,
-                                    image=canny_map,
-                                    controlnet_conditioning_scale=float(control_scale),
-                                    neg_content_prompt=neg_content_prompt,
-                                    neg_content_scale=neg_content_scale
-                                    )
-     else:
-         images = ip_model.generate(pil_image=image_pil,
-                                    prompt=prompt,
-                                    negative_prompt=n_prompt,
-                                    scale=scale,
-                                    guidance_scale=guidance_scale,
-                                    num_samples=num_samples,
-                                    num_inference_steps=num_inference_steps,
-                                    seed=seed,
-                                    image=canny_map,
-                                    controlnet_conditioning_scale=float(control_scale),
-                                    )
-     return images
-
- def pil_to_cv2(image_pil):
-     image_np = np.array(image_pil)
-     image_cv2 = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
-     return image_cv2
-
- # Description
- title = r"""
- <h1 align="center">InstantStyle</h1>
- """
-
- description = r"""
- How to use:<br>
- 1. Upload a style image.
- 2. Set stylization mode, only use style block by default.
- 2. Enter a text prompt, as done in normal text-to-image models.
- 3. Click the <b>Submit</b> button to begin customization.
- 4. Share your stylized photo with your friends and enjoy! 😊
-
-
- Advanced usage:<br>
- 1. Click advanced options.
- 2. Upload another source image for image-based stylization using ControlNet.
- 3. Enter negative content prompt to avoid content leakage.
- """
-
- article = r"""
- ---
- ```bibtex
- @article{wang2024instantstyle,
-   title={InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image Generation},
-   author={Wang, Haofan and Wang, Qixun and Bai, Xu and Qin, Zekui and Chen, Anthony},
-   journal={arXiv preprint arXiv:2404.02733},
-   year={2024}
- }
- ```
- """
-
- block = gr.Blocks().queue(max_size=10, api_open=True)
- with block:
-
-     # description
-     gr.Markdown(title)
-     gr.Markdown(description)
-
-     with gr.Tabs():
-         with gr.Row():
-             with gr.Column():
-
-                 with gr.Row():
-                     with gr.Column():
-                         image_pil = gr.Image(label="Style Image", type="numpy")
-
-                 target = gr.Radio(["Load only style blocks", "Load style+layout block", "Load original IP-Adapter"],
-                                   value="Load only style blocks",
-                                   label="Style mode")
-
-                 prompt = gr.Textbox(label="Prompt",
-                                     value="a cat, masterpiece, best quality, high quality")
-
-                 scale = gr.Slider(minimum=0,maximum=2.0, step=0.01,value=1.0, label="Scale")
-
-                 with gr.Accordion(open=False, label="Advanced Options"):
-
-                     with gr.Column():
-                         src_image_pil = gr.Image(label="Source Image (optional)", type='pil')
-                         control_scale = gr.Slider(minimum=0,maximum=1.0, step=0.01,value=0.5, label="Controlnet conditioning scale")
-
-                     n_prompt = gr.Textbox(label="Neg Prompt", value="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry")
-
-                     neg_content_prompt = gr.Textbox(label="Neg Content Prompt", value="")
-                     neg_content_scale = gr.Slider(minimum=0, maximum=1.0, step=0.01,value=0.5, label="Neg Content Scale")
-
-                     guidance_scale = gr.Slider(minimum=1,maximum=15.0, step=0.01,value=5.0, label="guidance scale")
-                     num_samples= gr.Slider(minimum=1,maximum=4.0, step=1.0,value=1.0, label="num samples")
-                     num_inference_steps = gr.Slider(minimum=5,maximum=50.0, step=1.0,value=20, label="num inference steps")
-                     seed = gr.Slider(minimum=-1000000,maximum=1000000,value=1, step=1, label="Seed Value")
-                     randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
-                 generate_button = gr.Button("Generate Image")
-
-             with gr.Column():
-                 generated_image = gr.Gallery(label="Generated Image")
-
-     generate_button.click(
-         fn=randomize_seed_fn,
-         inputs=[seed, randomize_seed],
-         outputs=seed,
-         queue=False,
-         api_name=False,
-     ).then(
-         fn=create_image,
-         inputs=[image_pil,
-                 src_image_pil,
-                 prompt,
-                 n_prompt,
-                 scale,
-                 control_scale,
-                 guidance_scale,
-                 num_samples,
-                 num_inference_steps,
-                 seed,
-                 target,
-                 neg_content_prompt,
-                 neg_content_scale],
-         outputs=[generated_image])
-
-     gr.Markdown(article)
-
+ import sys
+ sys.path.append('./')
+
+ import torch
+ import random
+ import spaces
+ import gradio as gr
+
+ from diffusers import AutoPipelineForText2Image
+ from diffusers.utils import load_image
+
+ # global variable
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = torch.float16 if str(device).__contains__("cuda") else torch.float32
+
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+     if randomize_seed:
+         seed = random.randint(0, 2000)
+     return seed
+
+ pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=dtype).to(device)
+ pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
+
+ @spaces.GPU(enable_queue=True)
+ def create_image(image_pil,
+                  prompt,
+                  n_prompt,
+                  scale,
+                  control_scale,
+                  guidance_scale,
+                  num_inference_steps,
+                  seed,
+                  target="Load only style blocks",
+                  ):
+
+
+     if target !="Load original IP-Adapter":
+         if target=="Load only style blocks":
+             scale = {
+                 "up": {"block_0": [0.0, control_scale, 0.0]},
+             }
+         elif target=="Load only layout blocks":
+             scale = {
+                 "down": {"block_2": [0.0, control_scale]},
+             }
+         elif target == "Load style+layout block":
+             scale = {
+                 "down": {"block_2": [0.0, control_scale]},
+                 "up": {"block_0": [0.0, control_scale, 0.0]},
+             }
+         pipeline.set_ip_adapter_scale(scale)
+
+
+     style_image = load_image(image_pil)
+
+     generator = torch.Generator(device="cpu").manual_seed(randomize_seed_fn(seed, False))
+     image = pipeline(
+         prompt=prompt,
+         ip_adapter_image=style_image,
+         negative_prompt=n_prompt,
+         guidance_scale=guidance_scale,
+         num_inference_steps=num_inference_steps,
+         generator=generator,
+     )
+     return image
+
+
+ # Description
+ title = r"""
+ <h1 align="center">InstantStyle</h1>
+ """
+
+ description = r"""
+ How to use:<br>
+ 1. Upload a style image.
+ 2. Set stylization mode, only use style block by default.
+ 2. Enter a text prompt, as done in normal text-to-image models.
+ 3. Click the <b>Submit</b> button to begin customization.
+ 4. Share your stylized photo with your friends and enjoy! 😊
+
+
+ Advanced usage:<br>
+ 1. Click advanced options.
+ 2. Upload another source image for image-based stylization using ControlNet.
+ 3. Enter negative content prompt to avoid content leakage.
+ """
+
+ article = r"""
+ ---
+ ```bibtex
+ @article{wang2024instantstyle,
+   title={InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image Generation},
+   author={Wang, Haofan and Wang, Qixun and Bai, Xu and Qin, Zekui and Chen, Anthony},
+   journal={arXiv preprint arXiv:2404.02733},
+   year={2024}
+ }
+ ```
+ """
+
+ block = gr.Blocks().queue(max_size=10, api_open=True)
+ with block:
+
+     # description
+     gr.Markdown(title)
+     gr.Markdown(description)
+
+     with gr.Tabs():
+         with gr.Row():
+             with gr.Column():
+
+                 with gr.Row():
+                     with gr.Column():
+                         image_pil = gr.Image(label="Style Image", type="numpy")
+
+                 target = gr.Radio(["Load only style blocks", "Load only layout blocks","Load style+layout block", "Load original IP-Adapter"],
+                                   value="Load only style blocks",
+                                   label="Style mode")
+
+                 prompt = gr.Textbox(label="Prompt",
+                                     value="a cat, masterpiece, best quality, high quality")
+
+                 scale = gr.Slider(minimum=0,maximum=2.0, step=0.01,value=1.0, label="Scale")
+
+                 with gr.Accordion(open=False, label="Advanced Options"):
+
+                     control_scale = gr.Slider(minimum=0,maximum=1.0, step=0.01,value=0.5, label="Controlnet conditioning scale")
+
+                     n_prompt = gr.Textbox(label="Neg Prompt", value="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry")
+                     guidance_scale = gr.Slider(minimum=1,maximum=15.0, step=0.01,value=5.0, label="guidance scale")
+                     num_inference_steps = gr.Slider(minimum=5,maximum=50.0, step=1.0,value=20, label="num inference steps")
+                     seed = gr.Slider(minimum=-1000000,maximum=1000000,value=1, step=1, label="Seed Value")
+                     randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+
+                 generate_button = gr.Button("Generate Image")
+
+             with gr.Column():
+                 generated_image = gr.Gallery(label="Generated Image")
+
+     generate_button.click(
+         fn=randomize_seed_fn,
+         inputs=[seed, randomize_seed],
+         outputs=seed,
+         queue=False,
+         api_name=False,
+     ).then(
+         fn=create_image,
+         inputs=[image_pil,
+                 prompt,
+                 n_prompt,
+                 scale,
+                 control_scale,
+                 guidance_scale,
+                 num_inference_steps,
+                 seed,
+                 target],
+         outputs=[generated_image])
+
+     gr.Markdown(article)
+
  block.launch(show_error=True)
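
Note: the rewritten app.py drops the custom `IPAdapterXL` wrapper (and the Kandinsky + ControlNet depth path) in favor of diffusers' built-in IP-Adapter support, where the InstantStyle modes are expressed as a per-block scale dictionary passed to `set_ip_adapter_scale`. Below is a minimal standalone sketch of that pattern outside Gradio; the model and adapter repos match the commit, while the `0.5` strength, the `style.jpg` path, the CUDA device, and the assumption of a diffusers release recent enough to accept dict-valued IP-Adapter scales are illustrative, not part of the commit.

```python
import torch
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image

# Load SDXL and attach the SDXL IP-Adapter weights (same repos as in app.py above).
# Assumes a CUDA device; use torch.float32 on CPU.
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

# "Load only style blocks" mode: inject the style image only into the middle
# attention layer of the "up" block_0 (up_blocks.0.attentions.1); everything else stays at 0.0.
pipe.set_ip_adapter_scale({"up": {"block_0": [0.0, 0.5, 0.0]}})  # 0.5 is an illustrative strength

style_image = load_image("style.jpg")  # placeholder path to a style reference image
image = pipe(
    prompt="a cat, masterpiece, best quality, high quality",
    ip_adapter_image=style_image,
    negative_prompt="text, watermark, lowres, low quality",
    guidance_scale=5.0,
    num_inference_steps=20,
    generator=torch.Generator(device="cpu").manual_seed(42),
).images[0]
image.save("styled_cat.png")
```

Switching modes in the app corresponds to swapping this dictionary, e.g. adding `"down": {"block_2": [0.0, 0.5]}` for the style+layout setting.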
requirements.txt CHANGED
@@ -1,16 +1,6 @@
  diffusers>=0.25.1
  torch>=2.0.0
- torchvision>=0.15.1
  transformers>=4.37.1
- accelerate
- safetensors
- einops
  spaces>=0.19.4
- omegaconf
- peft
  huggingface-hub>=0.20.2
- opencv-python
- gradio==4.38.0
- controlnet_aux
- gdown
- peft
+ gradio