Commit 6444ed9: init

Files changed:

- .gitattributes +62 -0
- .gitignore +178 -0
- README.md +11 -0
- app/gpt4_o/brushedit_all_in_one_pipeline.py +80 -0
- app/gpt4_o/brushedit_app.py +914 -0
- app/gpt4_o/instructions.py +106 -0
- app/gpt4_o/requirements.txt +18 -0
- app/gpt4_o/run_app.sh +5 -0
- app/gpt4_o/vlm_pipeline.py +138 -0
- app/utils/utils.py +197 -0
- assets/hedgehog_rm_fg/hedgehog.png +3 -0
- assets/hedgehog_rm_fg/image_edit_82314e18-c64c-4003-9ef9-52cebf254b2f_2.png +3 -0
- assets/hedgehog_rm_fg/mask_82314e18-c64c-4003-9ef9-52cebf254b2f.png +3 -0
- assets/hedgehog_rm_fg/masked_image_82314e18-c64c-4003-9ef9-52cebf254b2f.png +3 -0
- assets/hedgehog_rm_fg/prompt.txt +1 -0
- assets/hedgehog_rp_bg/hedgehog.png +3 -0
- assets/hedgehog_rp_bg/image_edit_db7f8bf8-8349-46d3-b14e-43d67fbe25d3_3.png +3 -0
- assets/hedgehog_rp_bg/mask_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png +3 -0
- assets/hedgehog_rp_bg/masked_image_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png +3 -0
- assets/hedgehog_rp_bg/prompt.txt +1 -0
- assets/hedgehog_rp_fg/hedgehog.png +3 -0
- assets/hedgehog_rp_fg/image_edit_5cab3448-5a3a-459c-9144-35cca3d34273_0.png +3 -0
- assets/hedgehog_rp_fg/mask_5cab3448-5a3a-459c-9144-35cca3d34273.png +3 -0
- assets/hedgehog_rp_fg/masked_image_5cab3448-5a3a-459c-9144-35cca3d34273.png +3 -0
- assets/hedgehog_rp_fg/prompt.txt +1 -0
- assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png +3 -0
- assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png +3 -0
- assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png +3 -0
- assets/mona_lisa/mona_lisa.png +3 -0
- assets/mona_lisa/prompt.txt +1 -0
- assets/sunflower_girl/image_edit_99cc50b4-7dc4-4de5-8748-ec10772f0317_3.png +3 -0
- assets/sunflower_girl/mask_99cc50b4-7dc4-4de5-8748-ec10772f0317.png +3 -0
- assets/sunflower_girl/masked_image_99cc50b4-7dc4-4de5-8748-ec10772f0317.png +3 -0
- assets/sunflower_girl/prompt.txt +1 -0
- assets/sunflower_girl/sunflower_girl.png +3 -0
- requirements.txt +20 -0
    	
.gitattributes ADDED
@@ -0,0 +1,62 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rm_fg/hedgehog.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rm_fg/image_edit_82314e18-c64c-4003-9ef9-52cebf254b2f_2.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rm_fg/mask_82314e18-c64c-4003-9ef9-52cebf254b2f.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rm_fg/masked_image_82314e18-c64c-4003-9ef9-52cebf254b2f.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_bg/masked_image_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_bg/hedgehog.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_bg/image_edit_db7f8bf8-8349-46d3-b14e-43d67fbe25d3_3.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_bg/mask_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_fg/hedgehog.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_fg/image_edit_5cab3448-5a3a-459c-9144-35cca3d34273_0.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_fg/mask_5cab3448-5a3a-459c-9144-35cca3d34273.png filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_fg/masked_image_5cab3448-5a3a-459c-9144-35cca3d34273.png filter=lfs diff=lfs merge=lfs -text
+assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png filter=lfs diff=lfs merge=lfs -text
+assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png filter=lfs diff=lfs merge=lfs -text
+assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png filter=lfs diff=lfs merge=lfs -text
+assets/mona_lisa/mona_lisa.png filter=lfs diff=lfs merge=lfs -text
+assets/sunflower_girl/image_edit_99cc50b4-7dc4-4de5-8748-ec10772f0317_3.png filter=lfs diff=lfs merge=lfs -text
+assets/sunflower_girl/mask_99cc50b4-7dc4-4de5-8748-ec10772f0317.png filter=lfs diff=lfs merge=lfs -text
+assets/sunflower_girl/masked_image_99cc50b4-7dc4-4de5-8748-ec10772f0317.png filter=lfs diff=lfs merge=lfs -text
+assets/sunflower_girl/sunflower_girl.png filter=lfs diff=lfs merge=lfs -text
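All binary and image assets above are routed through Git LFS. As a rough illustration (not part of the commit), a minimal Python sketch that checks a path against a few of these patterns; real gitattributes matching has extra rules (slash handling, ordering), so basename fnmatch is only an approximation:

    import fnmatch

    # A handful of the LFS patterns declared above (subset, for illustration).
    LFS_PATTERNS = ["*.png", "*.jpg", "*.safetensors", "*.pth", "*.ckpt"]

    def is_lfs_tracked(path: str) -> bool:
        """Approximate check: does the basename match any declared LFS pattern?"""
        basename = path.rsplit("/", 1)[-1]
        return any(fnmatch.fnmatch(basename, pat) for pat in LFS_PATTERNS)

    print(is_lfs_tracked("assets/mona_lisa/mona_lisa.png"))  # True
    print(is_lfs_tracked("app/gpt4_o/brushedit_app.py"))     # False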
    	
.gitignore ADDED
@@ -0,0 +1,178 @@
+# Initially taken from GitHub's Python gitignore file
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# tests and logs
+tests/fixtures/cached_*_text.txt
+logs/
+lightning_logs/
+lang_code_data/
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a Python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# vscode
+.vs
+.vscode
+
+# Pycharm
+.idea
+
+# TF code
+tensorflow_code
+
+# Models
+proc_data
+
+# examples
+runs
+/runs_old
+/wandb
+/examples/runs
+/examples/**/*.args
+/examples/rag/sweep
+
+# data
+/data
+serialization_dir
+
+# emacs
+*.*~
+debug.env
+
+# vim
+.*.swp
+
+# ctags
+tags
+
+# pre-commit
+.pre-commit*
+
+# .lock
+*.lock
+
+# DS_Store (MacOS)
+.DS_Store
+
+# RL pipelines may produce mp4 outputs
+*.mp4
+
+# dependencies
+/transformers
+
+# ruff
+.ruff_cache
+
+# wandb
+wandb
    	
README.md ADDED
@@ -0,0 +1,11 @@
+---
+title: BrushEdit
+emoji: 🤠
+colorFrom: indigo
+colorTo: gray
+sdk: gradio
+sdk_version: 4.38.1
+app_file: app/gpt4_o/brushedit_app.py
+pinned: false
+python_version: 3.10
+---
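This README front matter is the Hugging Face Spaces configuration: the Space builds with the Gradio SDK (4.38.1) and boots app/gpt4_o/brushedit_app.py as its entry point. As a hedged sketch of that entry-point contract (illustrative only, not code from this commit), the app_file simply has to construct and launch a Gradio app:

    import gradio as gr

    # Minimal stand-in for what an app_file must do: build a UI and launch it.
    with gr.Blocks() as demo:
        gr.Markdown("BrushEdit placeholder UI")

    demo.launch()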
    	
app/gpt4_o/brushedit_all_in_one_pipeline.py ADDED
@@ -0,0 +1,80 @@
+from PIL import Image, ImageEnhance
+from diffusers.image_processor import VaeImageProcessor
+
+import numpy as np
+import cv2
+
+
+def BrushEdit_Pipeline(pipe,
+                       prompts,
+                       mask_np,
+                       original_image,
+                       generator,
+                       num_inference_steps,
+                       guidance_scale,
+                       control_strength,
+                       negative_prompt,
+                       num_samples,
+                       blending):
+    if mask_np.ndim != 3:
+        mask_np = mask_np[:, :, np.newaxis]
+
+    mask_np = mask_np / 255
+    height, width = mask_np.shape[0], mask_np.shape[1]
+    # back/foreground
+    # if mask_np[94:547,94:546].sum() < mask_np.sum() - mask_np[94:547,94:546].sum() and mask_np[0,:].sum()>0 and mask_np[-1,:].sum()>0 and mask_np[:,0].sum()>0 and mask_np[:,-1].sum()>0 and mask_np[1,:].sum()>0 and mask_np[-2,:].sum()>0 and mask_np[:,1].sum()>0 and mask_np[:,-2].sum()>0:
+    #     mask_np = 1 - mask_np
+
+    ## resize the mask and original image to the same size, which must be divisible by vae_scale_factor
+    image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
+    height_new, width_new = image_processor.get_default_height_width(original_image, height, width)
+    mask_np = cv2.resize(mask_np, (width_new, height_new))[:, :, np.newaxis]
+    mask_blurred = cv2.GaussianBlur(mask_np * 255, (21, 21), 0) / 255
+    mask_blurred = mask_blurred[:, :, np.newaxis]
+
+    original_image = cv2.resize(original_image, (width_new, height_new))
+
+    init_image = original_image * (1 - mask_np)
+    init_image = Image.fromarray(init_image.astype(np.uint8)).convert("RGB")
+    mask_image = Image.fromarray((mask_np.repeat(3, -1) * 255).astype(np.uint8)).convert("RGB")
+
+    brushnet_conditioning_scale = float(control_strength)
+
+    images = pipe(
+        [prompts] * num_samples,
+        init_image,
+        mask_image,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+        generator=generator,
+        brushnet_conditioning_scale=brushnet_conditioning_scale,
+        negative_prompt=[negative_prompt] * num_samples,
+        height=height_new,
+        width=width_new,
+    ).images
+
+    if blending:
+        mask_blurred = mask_blurred * 0.5 + 0.5
+
+        ## convert to vae shape format, must be divisible by 8
+        original_image_pil = Image.fromarray(original_image).convert("RGB")
+        init_image_np = np.array(image_processor.preprocess(original_image_pil, height=height_new, width=width_new).squeeze())
+        init_image_np = ((init_image_np.transpose(1, 2, 0) + 1.) / 2.) * 255
+        init_image_np = init_image_np.astype(np.uint8)
+        image_all = []
+        for image_i in images:
+            image_np = np.array(image_i)
+            ## blending
+            image_pasted = init_image_np * (1 - mask_blurred) + mask_blurred * image_np
+            image_pasted = image_pasted.astype(np.uint8)
+            image = Image.fromarray(image_pasted)
+            image_all.append(image)
+    else:
+        image_all = images
+
+    return image_all, mask_image
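The blending branch above composites each diffusion sample back onto the source image through a Gaussian-blurred mask, so the edit fades in instead of leaving a hard seam. A self-contained sketch of just that compositing step, using dummy arrays in place of real images (illustrative, not from the commit):

    import cv2
    import numpy as np

    # Dummy source image, dummy diffusion output, and a hard rectangular mask.
    src = np.full((256, 256, 3), 40, dtype=np.uint8)
    gen = np.full((256, 256, 3), 220, dtype=np.uint8)
    mask = np.zeros((256, 256, 1), dtype=np.float32)
    mask[64:192, 64:192] = 1.0

    # Same recipe as the pipeline: blur the mask, then bias weights toward the edit.
    blurred = cv2.GaussianBlur(mask * 255, (21, 21), 0)[:, :, np.newaxis] / 255
    blurred = blurred * 0.5 + 0.5

    # Weighted paste: blurred weights pull pixels from the generated image.
    blended = (src * (1 - blurred) + blurred * gen).astype(np.uint8)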
    	
app/gpt4_o/brushedit_app.py ADDED
@@ -0,0 +1,914 @@
+##!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import os, random
+import numpy as np
+import torch
+
+import gradio as gr
+import spaces
+
+from PIL import Image
+
+
+from huggingface_hub import hf_hub_download
+
+from segment_anything import SamPredictor, build_sam, SamAutomaticMaskGenerator
+from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler
+from scipy.ndimage import binary_dilation, binary_erosion
+
+from app.gpt4_o.vlm_pipeline import (
+    vlm_response_editing_type,
+    vlm_response_object_wait_for_edit,
+    vlm_response_mask,
+    vlm_response_prompt_after_apply_instruction
+)
+from app.gpt4_o.brushedit_all_in_one_pipeline import BrushEdit_Pipeline
+from app.utils.utils import load_grounding_dino_model
+
+
+#### Description ####
+head = r"""
+<div style="text-align: center;">
+    <h1> BrushEdit: All-In-One Image Inpainting and Editing</h1>
+    <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
+        <a href='https://tencentarc.github.io/BrushNet/'><img src='https://img.shields.io/badge/Project_Page-BrushNet-green' alt='Project Page'></a>
+        <a href='https://github.com/TencentARC/BrushNet/blob/main/InstructionGuidedEditing/CVPR2024workshop_technique_report.pdf'><img src='https://img.shields.io/badge/Paper-Arxiv-blue'></a>
+        <a href='https://github.com/TencentARC/BrushNet'><img src='https://img.shields.io/badge/Code-Github-orange'></a>
+    </div>
+    </br>
+</div>
+"""
+descriptions = r"""
+Official Gradio Demo for <a href='https://tencentarc.github.io/BrushNet/'><b>BrushEdit: All-In-One Image Inpainting and Editing</b></a><br>
+🧙 BrushEdit enables precise, user-friendly instruction-based image editing via an inpainting model.<br>
+"""
+
+instructions = r"""
+Currently, we support two modes: <b>fully automated instruction-based editing</b> and <b>interactive instruction-based editing</b>.
+
+🛠️ <b>Fully automated instruction-based editing</b>:
+<ul>
+    <li> ⭐️ <b>step1:</b> Upload or select one image from the examples. </li>
+    <li> ⭐️ <b>step2:</b> Input the instructions (supports addition, deletion, and modification), e.g., remove xxx. </li>
+    <li> ⭐️ <b>step3:</b> Click the <b>Run</b> button to automatically edit the image. </li>
+</ul>
+
+🛠️ <b>Interactive instruction-based editing</b>:
+<ul>
+    <li> ⭐️ <b>step1:</b> Upload or select one image from the examples. </li>
+    <li> ⭐️ <b>step2:</b> Use a brush to outline the area you want to edit. </li>
+    <li> ⭐️ <b>step3:</b> Input the instructions. </li>
+    <li> ⭐️ <b>step4:</b> Click the <b>Run</b> button to automatically edit the image. </li>
+</ul>
+
+💡 <b>Some tips</b>:
+<ul>
+    <li> 🤠 After inputting the instructions, you can click the <b>Generate Mask</b> button. The mask generated by the VLM will be displayed in the preview panel on the right side. </li>
+    <li> 🤠 After generating the mask, or when you use the brush to draw one, you can perform operations such as <b>randomization</b>, <b>dilation</b>, <b>erosion</b>, and <b>movement</b>. </li>
+    <li> 🤠 After inputting the instructions, you can click the <b>Generate Target Prompt</b> button. The target prompt will be displayed in the text box, and you can modify it according to your ideas. </li>
+</ul>
+
+☕️ Have fun!
+"""
+
+
+# - - - - - examples  - - - - -  #
+EXAMPLES = [
+    # [
+    # {"background": Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").convert("RGBA"),
+    #  "layers": [Image.new("RGBA", (Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").width, Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").height), (0, 0, 0, 0))],
+    #  "composite": Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").convert("RGBA")},
+    # #  Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").convert("RGBA"),
+    #  "add a shining necklace",
+    # #  [Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.jpg")],
+    # #  [Image.open("assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png")],
+    # #  [Image.open("assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png")]
+    # ],
+
+    [
+    # load_image_from_url("https://github.com/liyaowei-stu/BrushEdit/blob/main/assets/mona_lisa/mona_lisa.png"),
+    Image.open("assets/mona_lisa/mona_lisa.png").convert("RGBA"),
+    "add a shining necklace",
+    #  [Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.jpg")],
+    #  [Image.open("assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png")],
+    #  [Image.open("assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png")]
+    ],
+]
+
+
+## init VLM
+from openai import OpenAI
+
+OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
+vlm = OpenAI(base_url="http://v2.open.venus.oa.com/llmproxy")
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+
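The vlm handle above is a standard openai-python client pointed at a proxy endpoint; the vlm_response_* helpers imported earlier presumably drive it. A hypothetical call shape (the model name and the proxy's OpenAI compatibility are assumptions, not confirmed by this commit):

    # Hypothetical usage of the client defined above; "gpt-4o" is an assumed model name.
    response = vlm.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "Classify this edit: 'remove the hat'."}],
    )
    print(response.choices[0].message.content)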
+# download hf models
+base_model_path = hf_hub_download(
+    repo_id="Yw22/BrushEdit",
+    subfolder="base_model/realisticVisionV60B1_v51VAE",
+    token=os.getenv("HF_TOKEN"),
+)
+
+
+brushnet_path = hf_hub_download(
+    repo_id="Yw22/BrushEdit",
+    subfolder="brushnetX",
+    token=os.getenv("HF_TOKEN"),
+)
+
+sam_path = hf_hub_download(
+    repo_id="Yw22/BrushEdit",
+    subfolder="sam",
+    filename="sam_vit_h_4b8939.pth",
+    token=os.getenv("HF_TOKEN"),
+)
+
+groundingdino_path = hf_hub_download(
+    repo_id="Yw22/BrushEdit",
+    subfolder="grounding_dino",
+    filename="groundingdino_swint_ogc.pth",
+    token=os.getenv("HF_TOKEN"),
+)
+
+
+# input brushnetX ckpt path
+brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch.float16)
+pipe = StableDiffusionBrushNetPipeline.from_pretrained(
+    base_model_path, brushnet=brushnet, torch_dtype=torch.float16, low_cpu_mem_usage=False
+)
+# speed up diffusion process with faster scheduler and memory optimization
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+# remove following line if xformers is not installed or when using Torch 2.0.
+# pipe.enable_xformers_memory_efficient_attention()
+pipe.enable_model_cpu_offload()
+
+
+## init SAM
+sam = build_sam(checkpoint=sam_path)
+sam.to(device=device)
+sam_predictor = SamPredictor(sam)
+sam_automask_generator = SamAutomaticMaskGenerator(sam)
+
+## init groundingdino_model
+config_file = 'third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
+groundingdino_model = load_grounding_dino_model(config_file, groundingdino_path, device=device)
+
+## Ordinary function
+def crop_and_resize(image: Image.Image,
+                    target_width: int,
+                    target_height: int) -> Image.Image:
+    """
+    Crops and resizes an image while preserving the aspect ratio.
+
+    Args:
+        image (Image.Image): Input PIL image to be cropped and resized.
+        target_width (int): Target width of the output image.
+        target_height (int): Target height of the output image.
+
+    Returns:
+        Image.Image: Cropped and resized image.
+    """
+    # Original dimensions
+    original_width, original_height = image.size
+    original_aspect = original_width / original_height
+    target_aspect = target_width / target_height
+
+    # Calculate crop box to maintain aspect ratio
+    if original_aspect > target_aspect:
+        # Crop horizontally
+        new_width = int(original_height * target_aspect)
+        new_height = original_height
+        left = (original_width - new_width) / 2
+        top = 0
+        right = left + new_width
+        bottom = original_height
+    else:
+        # Crop vertically
+        new_width = original_width
+        new_height = int(original_width / target_aspect)
+        left = 0
+        top = (original_height - new_height) / 2
+        right = original_width
+        bottom = top + new_height
+
+    # Crop and resize
+    cropped_image = image.crop((left, top, right, bottom))
+    resized_image = cropped_image.resize((target_width, target_height), Image.NEAREST)
+
+    return resized_image
+
+
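A quick usage sketch for crop_and_resize (illustrative values, assuming the function above is in scope): a 4:3 input headed for a 1:1 target is wider than the target, so the sides are cropped before resizing.

    from PIL import Image

    src = Image.new("RGB", (640, 480), (128, 128, 128))
    out = crop_and_resize(src, target_width=512, target_height=512)
    print(out.size)  # (512, 512): center 480x480 crop, then resize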
+def move_mask_func(mask, direction, units):
+    binary_mask = mask.squeeze() > 0
+    rows, cols = binary_mask.shape
+
+    moved_mask = np.zeros_like(binary_mask, dtype=bool)
+
+    if direction == 'down':
+        # shift mask content down
+        moved_mask[max(0, units):, :] = binary_mask[:rows - units, :]
+
+    elif direction == 'up':
+        # shift mask content up
+        moved_mask[:rows - units, :] = binary_mask[units:, :]
+
+    elif direction == 'right':
+        # shift mask content right
+        moved_mask[:, max(0, units):] = binary_mask[:, :cols - units]
+
+    elif direction == 'left':
+        # shift mask content left
+        moved_mask[:, :cols - units] = binary_mask[:, units:]
+
+    return moved_mask
+
+
| 235 | 
            +
            def random_mask_func(mask, dilation_type='square'):
         | 
| 236 | 
            +
                # Randomly select the size of dilation
         | 
| 237 | 
            +
                dilation_size = np.random.randint(20, 40)  # Randomly select the size of dilation
         | 
| 238 | 
            +
                binary_mask = mask.squeeze()>0
         | 
| 239 | 
            +
             | 
| 240 | 
            +
                if dilation_type == 'square_dilation':
         | 
| 241 | 
            +
                    structure = np.ones((dilation_size, dilation_size), dtype=bool)
         | 
| 242 | 
            +
                    dilated_mask = binary_dilation(binary_mask, structure=structure)
         | 
| 243 | 
            +
                elif dilation_type == 'square_erosion':
         | 
| 244 | 
            +
                    structure = np.ones((dilation_size, dilation_size), dtype=bool)
         | 
| 245 | 
            +
                    dilated_mask = binary_erosion(binary_mask, structure=structure)
         | 
| 246 | 
            +
                elif dilation_type == 'bounding_box':
         | 
| 247 | 
            +
                    # find the most left top and left bottom point
         | 
| 248 | 
            +
                    rows, cols = np.where(binary_mask)
         | 
| 249 | 
            +
                    if len(rows) == 0 or len(cols) == 0:
         | 
| 250 | 
            +
                        return mask  # return original mask if no valid points
         | 
| 251 | 
            +
             | 
| 252 | 
            +
                    min_row = np.min(rows)
         | 
| 253 | 
            +
                    max_row = np.max(rows)
         | 
| 254 | 
            +
                    min_col = np.min(cols)
         | 
| 255 | 
            +
                    max_col = np.max(cols)
         | 
| 256 | 
            +
             | 
| 257 | 
            +
                    # create a bounding box
         | 
| 258 | 
            +
                    dilated_mask = np.zeros_like(binary_mask, dtype=bool)
         | 
| 259 | 
            +
                    dilated_mask[min_row:max_row + 1, min_col:max_col + 1] = True
         | 
| 260 | 
            +
             | 
| 261 | 
            +
                elif dilation_type == 'bounding_ellipse':
         | 
| 262 | 
            +
                    # find the most left top and left bottom point
         | 
| 263 | 
            +
                    rows, cols = np.where(binary_mask)
         | 
| 264 | 
            +
                    if len(rows) == 0 or len(cols) == 0:
         | 
| 265 | 
            +
                        return mask  # return original mask if no valid points
         | 
| 266 | 
            +
             | 
| 267 | 
            +
                    min_row = np.min(rows)
         | 
| 268 | 
            +
                    max_row = np.max(rows)
         | 
| 269 | 
            +
                    min_col = np.min(cols)
         | 
| 270 | 
            +
                    max_col = np.max(cols)
         | 
| 271 | 
            +
             | 
| 272 | 
            +
                    # calculate the center and axis length of the ellipse
         | 
| 273 | 
            +
                    center = ((min_col + max_col) // 2, (min_row + max_row) // 2)
         | 
| 274 | 
            +
                    a = (max_col - min_col) // 2  # half long axis
         | 
| 275 | 
            +
                    b = (max_row - min_row) // 2  # half short axis
         | 
| 276 | 
            +
             | 
| 277 | 
            +
                    # create a bounding ellipse
         | 
| 278 | 
            +
                    y, x = np.ogrid[:mask.shape[0], :mask.shape[1]]
         | 
| 279 | 
            +
                    ellipse_mask = ((x - center[0])**2 / a**2 + (y - center[1])**2 / b**2) <= 1
         | 
| 280 | 
            +
                    dilated_mask = np.zeros_like(binary_mask, dtype=bool)
         | 
| 281 | 
            +
                    dilated_mask[ellipse_mask] = True
         | 
| 282 | 
            +
                else:
         | 
| 283 | 
            +
                    raise ValueError("dilation_type must be 'square' or 'ellipse'")
         | 
| 284 | 
            +
             | 
| 285 | 
            +
                # use binary dilation
         | 
| 286 | 
            +
                dilated_mask =  np.uint8(dilated_mask[:,:,np.newaxis]) * 255
         | 
| 287 | 
            +
                return dilated_mask
         | 
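
# Illustrative usage sketch (assumes the `toy` mask from the sketch above):
#
#   boxed = random_mask_func(toy, dilation_type='bounding_box')     # uint8 HxWx1 in {0, 255}
#   grown = random_mask_func(toy, dilation_type='square_dilation')  # dilated by a random 20-40 px kernel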


## Gradio component function
@spaces.GPU(duration=180)
def process(input_image,
            original_image,
            original_mask,
            prompt,
            negative_prompt,
            control_strength,
            seed,
            randomize_seed,
            guidance_scale,
            num_inference_steps,
            num_samples,
            blending,
            category,
            target_prompt,
            resize_and_crop):

    if original_image is None:
        raise gr.Error('Please upload the input image')
    if prompt is None or prompt == '':
        raise gr.Error("Please input your instructions, e.g., remove the xxx")

    # the user's brush strokes live in the alpha channel of the editor's first layer
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    if resize_and_crop:
        original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
        input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
        original_image = np.array(original_image)
        input_mask = np.array(input_mask)

    # a freshly drawn mask takes precedence over any previously generated one
    if input_mask.max() != 0:
        original_mask = input_mask[:, :, None]

    # load example image
    # if isinstance(original_image, str):
    #     original_image = input_image
    #     num_samples = 1
    #     blending = True

    if category is None:
        category = vlm_response_editing_type(vlm, original_image, prompt)

    if original_mask is not None:
        original_mask = np.clip(original_mask, 0, 255).astype(np.uint8)
    else:
        object_wait_for_edit = vlm_response_object_wait_for_edit(vlm,
                                                                 category,
                                                                 prompt)
        original_mask = vlm_response_mask(vlm,
                                          category,
                                          original_image,
                                          prompt,
                                          object_wait_for_edit,
                                          sam,
                                          sam_predictor,
                                          sam_automask_generator,
                                          groundingdino_model,
                                          )[:, :, None]

    if len(target_prompt) <= 1:
        prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(vlm,
                                                                                     original_image,
                                                                                     prompt)
    else:
        prompt_after_apply_instruction = target_prompt

    generator = torch.Generator("cuda").manual_seed(random.randint(0, 2147483647) if randomize_seed else seed)

    image, mask_image = BrushEdit_Pipeline(pipe,
                                           prompt_after_apply_instruction,
                                           original_mask,
                                           original_image,
                                           generator,
                                           num_inference_steps,
                                           guidance_scale,
                                           control_strength,
                                           negative_prompt,
                                           num_samples,
                                           blending)

    masked_image = original_image * (1 - (original_mask > 0))
    masked_image = masked_image.astype(np.uint8)
    masked_image = Image.fromarray(masked_image)

    import uuid
    edit_id = str(uuid.uuid4())  # named edit_id so it does not shadow the uuid module
    os.makedirs("outputs", exist_ok=True)  # assumes `os` is imported at the top of this file
    for idx, img in enumerate(image):  # save exactly num_samples results instead of a hard-coded four
        img.save(f"outputs/image_edit_{edit_id}_{idx}.png")
    mask_image.save(f"outputs/mask_{edit_id}.png")
    masked_image.save(f"outputs/masked_image_{edit_id}.png")
    return image, [mask_image], [masked_image], ''

def generate_target_prompt(input_image,
                           original_image,
                           prompt):
    # load example image
    if isinstance(original_image, str):
        original_image = input_image

    prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(vlm,
                                                                                 original_image,
                                                                                 prompt)
    return prompt_after_apply_instruction

def process_mask(input_image,
                 original_image,
                 prompt,
                 resize_and_crop):
    if original_image is None:
        raise gr.Error('Please upload the input image')
    if prompt is None or prompt == '':
        raise gr.Error("Please input your instructions, e.g., remove the xxx")

    ## load the user-drawn mask from the editor's first layer (alpha channel)
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.array(alpha_mask)

    # load example image
    if isinstance(original_image, str):
        original_image = input_image["background"]

    if resize_and_crop:
        original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
        input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
        original_image = np.array(original_image)
        input_mask = np.array(input_mask)

    if input_mask.max() == 0:
        # no brush strokes: let the VLM infer the editing category and target object,
        # then segment it
        category = vlm_response_editing_type(vlm, original_image, prompt)

        object_wait_for_edit = vlm_response_object_wait_for_edit(vlm,
                                                                 category,
                                                                 prompt)
        # original_mask: h, w, 1 in [0, 255]
        original_mask = vlm_response_mask(
            vlm,
            category,
            original_image,
            prompt,
            object_wait_for_edit,
            sam,
            sam_predictor,
            sam_automask_generator,
            groundingdino_model,
            )[:, :, None]
    else:
        original_mask = input_mask[:, :, None]
        category = None

    mask_image = Image.fromarray(original_mask.squeeze().astype(np.uint8)).convert("RGB")

    masked_image = original_image * (1 - (original_mask > 0))
    masked_image = masked_image.astype(np.uint8)
    masked_image = Image.fromarray(masked_image)

    ## does not work for the image editor component
    # background = input_image["background"]
    # mask_array = original_mask.squeeze()
    # layer_rgba = np.array(input_image['layers'][0])
    # layer_rgba[mask_array > 0] = [0, 0, 0, 255]
    # layer_rgba = Image.fromarray(layer_rgba, 'RGBA')
    # black_image = Image.new("RGBA", layer_rgba.size, (0, 0, 0, 255))
    # composite = Image.composite(black_image, background, layer_rgba)
    # output_base = {"layers": [layer_rgba], "background": background, "composite": composite}

    return [masked_image], [mask_image], original_mask.astype(np.uint8), category

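# Descriptive note (inferred from the handlers above): `input_image` is the dict
# produced by gr.ImageEditor, roughly {"background": PIL.Image,
# "layers": [RGBA PIL.Image, ...], "composite": PIL.Image}; every handler below
# recovers the user's brush strokes from the alpha channel of layers[0].
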
def process_random_mask(input_image, original_image, original_mask, resize_and_crop):

    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    if resize_and_crop:
        original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
        input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
        original_image = np.array(original_image)
        input_mask = np.array(input_mask)

    if input_mask.max() == 0:
        if original_mask is None:
            raise gr.Error('Please generate mask first')
    else:
        original_mask = input_mask[:, :, None]

    dilation_type = np.random.choice(['bounding_box', 'bounding_ellipse'])
    random_mask = random_mask_func(original_mask, dilation_type).squeeze()

    mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")

    masked_image = original_image * (1 - (random_mask[:, :, None] > 0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    return [masked_image], [mask_image], random_mask[:, :, None].astype(np.uint8)

def process_dilation_mask(input_image, original_image, original_mask, resize_and_crop):

    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    if resize_and_crop:
        original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
        input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
        original_image = np.array(original_image)
        input_mask = np.array(input_mask)

    if input_mask.max() == 0:
        if original_mask is None:
            raise gr.Error('Please generate mask first')
    else:
        original_mask = input_mask[:, :, None]

    dilation_type = 'square_dilation'
    random_mask = random_mask_func(original_mask, dilation_type).squeeze()

    mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")

    masked_image = original_image * (1 - (random_mask[:, :, None] > 0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    return [masked_image], [mask_image], random_mask[:, :, None].astype(np.uint8)

| 539 | 
            +
             | 
| 540 | 
            +
            def process_erosion_mask(input_image, original_image, original_mask, resize_and_crop):
         | 
| 541 | 
            +
                alpha_mask = input_image["layers"][0].split()[3]
         | 
| 542 | 
            +
                input_mask = np.asarray(alpha_mask)
         | 
| 543 | 
            +
                if resize_and_crop:
         | 
| 544 | 
            +
                    original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
         | 
| 545 | 
            +
                    input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
         | 
| 546 | 
            +
                    original_image = np.array(original_image)
         | 
| 547 | 
            +
                    input_mask = np.array(input_mask)
         | 
| 548 | 
            +
             | 
| 549 | 
            +
                if input_mask.max() == 0:
         | 
| 550 | 
            +
                    if original_mask is None:
         | 
| 551 | 
            +
                        raise gr.Error('Please generate mask first')
         | 
| 552 | 
            +
                    original_mask = original_mask
         | 
| 553 | 
            +
                else:
         | 
| 554 | 
            +
                    original_mask = input_mask[:,:,None]
         | 
| 555 | 
            +
             | 
| 556 | 
            +
                dilation_type = np.random.choice(['square_erosion'])
         | 
| 557 | 
            +
                random_mask = random_mask_func(original_mask, dilation_type).squeeze()
         | 
| 558 | 
            +
             | 
| 559 | 
            +
                mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
         | 
| 560 | 
            +
             | 
| 561 | 
            +
                masked_image = original_image * (1 - (random_mask[:,:,None]>0))
         | 
| 562 | 
            +
                masked_image = masked_image.astype(original_image.dtype)
         | 
| 563 | 
            +
                masked_image = Image.fromarray(masked_image)
         | 
| 564 | 
            +
             | 
| 565 | 
            +
             | 
| 566 | 
            +
                return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
         | 
| 567 | 
            +
             | 
| 568 | 
            +
             | 
def move_mask_left(input_image, original_image, original_mask, moving_pixels, resize_and_crop):

    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    if resize_and_crop:
        original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
        input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
        original_image = np.array(original_image)
        input_mask = np.array(input_mask)

    if input_mask.max() == 0:
        if original_mask is None:
            raise gr.Error('Please generate mask first')
    else:
        original_mask = input_mask[:, :, None]

    moved_mask = move_mask_func(original_mask, 'left', int(moving_pixels)).squeeze()
    mask_image = Image.fromarray(((moved_mask > 0).astype(np.uint8) * 255)).convert("RGB")

    masked_image = original_image * (1 - (moved_mask[:, :, None] > 0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    if moved_mask.max() <= 1:
        # move_mask_func returns a boolean mask; scale it back to [0, 255]
        moved_mask = ((moved_mask * 255)[:, :, None]).astype(np.uint8)
        original_mask = moved_mask
    return [masked_image], [mask_image], original_mask.astype(np.uint8)


def move_mask_right(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    if resize_and_crop:
        original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
        input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
        original_image = np.array(original_image)
        input_mask = np.array(input_mask)

    if input_mask.max() == 0:
        if original_mask is None:
            raise gr.Error('Please generate mask first')
    else:
        original_mask = input_mask[:, :, None]

    moved_mask = move_mask_func(original_mask, 'right', int(moving_pixels)).squeeze()

    mask_image = Image.fromarray(((moved_mask > 0).astype(np.uint8) * 255)).convert("RGB")

    masked_image = original_image * (1 - (moved_mask[:, :, None] > 0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:, :, None]).astype(np.uint8)
        original_mask = moved_mask

    return [masked_image], [mask_image], original_mask.astype(np.uint8)


def move_mask_up(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    if resize_and_crop:
        original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
        input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
        original_image = np.array(original_image)
        input_mask = np.array(input_mask)

    if input_mask.max() == 0:
        if original_mask is None:
            raise gr.Error('Please generate mask first')
    else:
        original_mask = input_mask[:, :, None]

    moved_mask = move_mask_func(original_mask, 'up', int(moving_pixels)).squeeze()
    mask_image = Image.fromarray(((moved_mask > 0).astype(np.uint8) * 255)).convert("RGB")

    masked_image = original_image * (1 - (moved_mask[:, :, None] > 0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:, :, None]).astype(np.uint8)
        original_mask = moved_mask

    return [masked_image], [mask_image], original_mask.astype(np.uint8)


def move_mask_down(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    if resize_and_crop:
        original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
        input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
        original_image = np.array(original_image)
        input_mask = np.array(input_mask)

    if input_mask.max() == 0:
        if original_mask is None:
            raise gr.Error('Please generate mask first')
    else:
        original_mask = input_mask[:, :, None]

    moved_mask = move_mask_func(original_mask, 'down', int(moving_pixels)).squeeze()
    mask_image = Image.fromarray(((moved_mask > 0).astype(np.uint8) * 255)).convert("RGB")

    masked_image = original_image * (1 - (moved_mask[:, :, None] > 0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:, :, None]).astype(np.uint8)
        original_mask = moved_mask

    return [masked_image], [mask_image], original_mask.astype(np.uint8)

| 690 | 
            +
             | 
| 691 | 
            +
            def store_img(base):
         | 
| 692 | 
            +
                import ipdb; ipdb.set_trace()
         | 
| 693 | 
            +
                image_pil = base["background"].convert("RGB")
         | 
| 694 | 
            +
                original_image = np.array(image_pil)
         | 
| 695 | 
            +
                # import ipdb; ipdb.set_trace()
         | 
| 696 | 
            +
                if max(original_image.shape[0], original_image.shape[1]) * 1.0 / min(original_image.shape[0], original_image.shape[1])>2.0:
         | 
| 697 | 
            +
                    raise gr.Error('image aspect ratio cannot be larger than 2.0')
         | 
| 698 | 
            +
                return base, original_image, None, "", None, None, None, None, None  
         | 
| 699 | 
            +
             | 
| 700 | 
            +
             | 
| 701 | 
            +
            def reset_func(input_image, original_image, original_mask, prompt, target_prompt):
         | 
| 702 | 
            +
                input_image = None
         | 
| 703 | 
            +
                original_image = None
         | 
| 704 | 
            +
                original_mask = None
         | 
| 705 | 
            +
                prompt = ''
         | 
| 706 | 
            +
                mask_gallery = []
         | 
| 707 | 
            +
                masked_gallery = []
         | 
| 708 | 
            +
                result_gallery = []
         | 
| 709 | 
            +
                target_prompt = ''
         | 
| 710 | 
            +
                return input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt
         | 
| 711 | 
            +
             | 
| 712 | 
            +
             | 
block = gr.Blocks(
        theme=gr.themes.Soft(
            radius_size=gr.themes.sizes.radius_none,
            text_size=gr.themes.sizes.text_md
        )
        ).queue()
with block as demo:
    with gr.Row():
        with gr.Column():
            gr.HTML(head)

    gr.Markdown(descriptions)

    with gr.Accordion(label="🧭 Instructions:", open=True, elem_id="accordion"):
        with gr.Row(equal_height=True):
            gr.Markdown(instructions)

    original_image = gr.State(value=None)
    original_mask = gr.State(value=None)
    category = gr.State(value=None)

    with gr.Row():
        with gr.Column():
            with gr.Row():
                input_image = gr.ImageEditor(
                    label="Input Image",
                    type="pil",
                    brush=gr.Brush(colors=["#000000"], default_size=30, color_mode="fixed"),
                    layers=False,
                    interactive=True,
                    height=800,
                    # transforms=("crop"),
                    # crop_size=(640, 640),
                    )

            prompt = gr.Textbox(label="Prompt", placeholder="Please input your instruction.", value='', lines=1)

            with gr.Row():
                mask_button = gr.Button("Generate Mask")
                random_mask_button = gr.Button("Random Generated Mask")
            with gr.Row():
                dilation_mask_button = gr.Button("Dilation Generated Mask")
                erosion_mask_button = gr.Button("Erosion Generated Mask")

            with gr.Row():
                generate_target_prompt_button = gr.Button("Generate Target Prompt")
                run_button = gr.Button("Run")

            target_prompt = gr.Text(
                        label="Target prompt",
                        max_lines=5,
                        placeholder="VLM-generated target prompt; you can generate it first and then modify it (optional)",
                        value='',
                        lines=2
                    )

            resize_and_crop = gr.Checkbox(label="Resize and Crop (640 x 640)", value=False)

            with gr.Accordion("More input params (highly recommended)", open=False, elem_id="accordion1"):
                negative_prompt = gr.Text(
                        label="Negative Prompt",
                        max_lines=5,
                        placeholder="Please input your negative prompt",
                        value='ugly, low quality', lines=1
                    )

                control_strength = gr.Slider(
                    label="Control Strength: ", show_label=True, minimum=0, maximum=1.1, value=1, step=0.01
                    )
                with gr.Group():
                    seed = gr.Slider(
                        label="Seed: ", minimum=0, maximum=2147483647, step=1, value=648464818
                    )
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=False)

                blending = gr.Checkbox(label="Blending mode", value=True)

                num_samples = gr.Slider(
                    label="Num samples", minimum=0, maximum=4, step=1, value=4
                )

                with gr.Group():
                    with gr.Row():
                        guidance_scale = gr.Slider(
                            label="Guidance scale",
                            minimum=1,
                            maximum=12,
                            step=0.1,
                            value=7.5,
                        )
                        num_inference_steps = gr.Slider(
                            label="Number of inference steps",
                            minimum=1,
                            maximum=50,
                            step=1,
                            value=50,
                        )

        with gr.Column():
            with gr.Row():
                with gr.Tabs(elem_classes=["feedback"]):
                    with gr.TabItem("Mask"):
                        mask_gallery = gr.Gallery(label='Mask', show_label=False, elem_id="gallery", preview=True, height=360)
                with gr.Tabs(elem_classes=["feedback"]):
                    with gr.TabItem("Masked Image"):
                        masked_gallery = gr.Gallery(label='Masked Image', show_label=False, elem_id="gallery", preview=True, height=360)

            moving_pixels = gr.Slider(
                    label="Moving pixels:", show_label=True, minimum=0, maximum=50, value=4, step=1
                    )
            with gr.Row():
                move_left_button = gr.Button("Move Left")
                move_right_button = gr.Button("Move Right")
            with gr.Row():
                move_up_button = gr.Button("Move Up")
                move_down_button = gr.Button("Move Down")

            with gr.Tabs(elem_classes=["feedback"]):
                with gr.TabItem("Outputs"):
                    result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery", preview=True, height=360)

            reset_button = gr.Button("Reset")


    with gr.Row():
        # example = gr.Examples(
        #     label="Quick Example",
        #     examples=EXAMPLES,
        #     inputs=[prompt, seed, result_gallery, mask_gallery, masked_gallery],
        #     examples_per_page=10,
        #     cache_examples=False,
        # )
        example = gr.Examples(
            label="Quick Example",
            examples=EXAMPLES,
            inputs=[input_image, prompt],
            examples_per_page=10,
            cache_examples=False,
        )
        # def process_example(prompt, seed, eg_output):
        #     eg_output_path = os.path.join("assets/", eg_output)
        #     return prompt, seed, [Image.open(eg_output_path)]
        # example = gr.Examples(
        #     label="Quick Example",
        #     examples=EXAMPLES,
        #     inputs=[prompt, seed, eg_output],
        #     outputs=[prompt, seed, result_gallery],
        #     fn=process_example,
        #     examples_per_page=10,
        #     run_on_click=True,
        #     cache_examples=False,
        # )

    input_image.upload(
        store_img,
        [input_image],
        [input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt]
    )

    ips = [input_image,
           original_image,
           original_mask,
           prompt,
           negative_prompt,
           control_strength,
           seed,
           randomize_seed,
           guidance_scale,
           num_inference_steps,
           num_samples,
           blending,
           category,
           target_prompt,
           resize_and_crop]

    ## run brushedit
    run_button.click(fn=process, inputs=ips, outputs=[result_gallery, mask_gallery, masked_gallery, target_prompt])

    ## mask funcs
    mask_button.click(fn=process_mask, inputs=[input_image, original_image, prompt, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask, category])
    random_mask_button.click(fn=process_random_mask, inputs=[input_image, original_image, original_mask, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
    dilation_mask_button.click(fn=process_dilation_mask, inputs=[input_image, original_image, original_mask, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
    erosion_mask_button.click(fn=process_erosion_mask, inputs=[input_image, original_image, original_mask, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])

    ## move mask funcs
    move_left_button.click(fn=move_mask_left, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
    move_right_button.click(fn=move_mask_right, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
    move_up_button.click(fn=move_mask_up, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
    move_down_button.click(fn=move_mask_down, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])

    ## prompt func
    generate_target_prompt_button.click(fn=generate_target_prompt, inputs=[input_image, original_image, prompt], outputs=[target_prompt])

    ## reset func
    reset_button.click(fn=reset_func, inputs=[input_image, original_image, original_mask, prompt, target_prompt], outputs=[input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt])

demo.launch(server_name="0.0.0.0")
    	
app/gpt4_o/instructions.py ADDED

| 1 | 
            +
            def create_editing_category_messages(editing_prompt):
         | 
| 2 | 
            +
                messages = [{
         | 
| 3 | 
            +
                        "role": "system",
         | 
| 4 | 
            +
                        "content": [
         | 
| 5 | 
            +
                            {
         | 
| 6 | 
            +
                            "type": "text",
         | 
| 7 | 
            +
                            "text": "I will give you an image and an editing instruction of the image. Please output which type of editing category it is in. You can choose from the following categories: \n\
         | 
| 8 | 
            +
                1. Addition: Adding new objects within the images, e.g., add a bird to the image \n\
         | 
| 9 | 
            +
                2. Remove: Removing objects, e.g., remove the mask \n\
         | 
| 10 | 
            +
                3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog) \n\
         | 
| 11 | 
            +
                4. Global: Edit the entire image, e.g., let's see it in winter \n\
         | 
| 12 | 
            +
                5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc.",
         | 
| 13 | 
            +
                            },]
         | 
| 14 | 
            +
                        },
         | 
| 15 | 
            +
                        {
         | 
| 16 | 
            +
                        "role": "user",
         | 
| 17 | 
            +
                        "content": [
         | 
| 18 | 
            +
                            {
         | 
| 19 | 
            +
                            "type": "text",
         | 
| 20 | 
            +
                            "text": editing_prompt
         | 
| 21 | 
            +
                            },
         | 
| 22 | 
            +
                        ]
         | 
| 23 | 
            +
                        }]
         | 
| 24 | 
            +
                return messages
         | 
| 25 | 
            +
                
         | 
| 26 | 
            +
             | 
def create_ori_object_messages(editing_prompt):

    messages = [
                {
                "role": "system",
                "content": [
                    {
                    "type": "text",
                    "text": "I will give you an editing instruction of the image. Please output the object needed to be edited. You only need to output the basic description of the object in no more than 5 words. The output should only contain one noun. \n \
                    For example, the editing instruction is 'Change the white cat to a black dog'. Then you need to output: 'white cat'. Only output the new content. Do not output anything else."
                    },]
                },
                {
                "role": "user",
                "content": [
                    {
                    "type": "text",
                    "text": editing_prompt
                    }
                ]
                }
            ]
    return messages

def create_add_object_messages(editing_prompt, base64_image, height=640, width=640):

    # coordinates are [x, y], so the bottom-right corner is [width, height]
    size_str = f"The image size is height {height}px and width {width}px. The top-left corner is coordinate [0, 0]. The bottom-right corner is coordinate [{width}, {height}]. "

    messages = [
                {
                "role": "user",
                "content": [
                    {
                    "type": "text",
                    "text": "I need to add an object to the image following the instruction: " + editing_prompt + ". " + size_str + " \n \
                    Can you give me a possible bounding box of the location for the added object? Please output with the format of [top-left x coordinate, top-left y coordinate, box width, box height]. You should only output the bounding box position and nothing else. Please refer to the example below for the desired format.\n\
                    [Examples]\n \
                    [19, 101, 32, 153]\n \
                    [54, 12, 242, 96]"
                    },
                    {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                        },
                    }
                ]
                }
            ]
    return messages

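The reply to this prompt is expected to be a bare 4-number list. A minimal sketch (hypothetical reply string; the app's real parsing lives in app/gpt4_o/vlm_pipeline.py) of turning that reply into a binary mask:

import re
import numpy as np

response_str = "[54, 12, 242, 96]"  # [top-left x, top-left y, box width, box height]
match = re.findall(r'\[\d{1,3}(?:,\s*\d{1,3}){3}\]', response_str)[0]
x, y, w, h = (int(v) for v in match[1:-1].split(","))

mask = np.zeros((640, 640), dtype=np.uint8)  # assumes a 640x640 image
mask[y:y + h, x:x + w] = 255                 # rows index y, columns index x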
def create_apply_editing_messages(editing_prompt, base64_image):
    messages = [
            {
            "role": "system",
            "content": [
                {
                "type": "text",
                "text": "I will provide an image along with an editing instruction. Please describe the new content that should be present in the image after applying the instruction. \n \
                    For example, if the original image content shows a grandmother wearing a mask and the instruction is 'remove the mask', your output should be: 'a grandmother'. The output should only include elements that remain in the image after the edit and should not mention elements that have been changed or removed, such as 'mask' in this example. Do not output 'sorry, xxx', even if it's a guess, directly output the answer you think is correct."
                },]
            },
            {
            "role": "user",
            "content": [
                {
                "type": "text",
                "text": editing_prompt
                },
                {"type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                    },
                },
            ]
            }
        ]
    return messages
    	
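How these builders are consumed (a minimal sketch, assuming OPENAI_API_KEY is set in the environment; in the app this call goes through run_gpt4o_vl_inference in app/gpt4_o/vlm_pipeline.py):

from openai import OpenAI
from app.gpt4_o.instructions import create_editing_category_messages

client = OpenAI()
messages = create_editing_category_messages("remove the hedgehog")
response = client.chat.completions.create(model="gpt-4o-2024-08-06", messages=messages)
print(response.choices[0].message.content)  # e.g. "Remove"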
app/gpt4_o/requirements.txt ADDED
@@ -0,0 +1,18 @@
torchvision
transformers>=4.25.1
ftfy
tensorboard
datasets
Pillow==9.5.0
opencv-python
imgaug
accelerate==0.20.3
image-reward
hpsv2
torchmetrics
open-clip-torch
clip
# gradio==4.44.1
gradio==4.38.1
segment_anything
openai
    	
app/gpt4_o/run_app.sh ADDED
@@ -0,0 +1,5 @@
export PYTHONPATH=.:$PYTHONPATH

export CUDA_VISIBLE_DEVICES=0

python app/gpt4_o/brushedit_app.py
    	
app/gpt4_o/vlm_pipeline.py ADDED
@@ -0,0 +1,138 @@

import base64
import re
import torch

from PIL import Image
from io import BytesIO
import numpy as np
import gradio as gr


from app.gpt4_o.instructions import (
    create_editing_category_messages,
    create_ori_object_messages,
    create_add_object_messages,
    create_apply_editing_messages)

from app.utils.utils import run_grounded_sam


def encode_image(img):
    img = Image.fromarray(img.astype('uint8'))
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    img_bytes = buffered.getvalue()
    return base64.b64encode(img_bytes).decode('utf-8')


def run_gpt4o_vl_inference(vlm,
                           messages):
    response = vlm.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=messages
    )
    response_str = response.choices[0].message.content
    return response_str

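encode_image accepts any HxWx3 uint8 array and returns a base64 string for the data-URL payloads built in app/gpt4_o/instructions.py. Note that it always PNG-encodes even though those data URLs declare image/jpeg; the mismatch is typically harmless. A quick sketch:

import numpy as np
from app.gpt4_o.vlm_pipeline import encode_image

dummy = np.zeros((4, 4, 3), dtype=np.uint8)  # tiny black image
b64 = encode_image(dummy)
print(b64[:16], "...")                       # base64 text for "data:image/...;base64,..."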
def vlm_response_editing_type(vlm,
                              image,
                              editing_prompt):

    base64_image = encode_image(image)

    messages = create_editing_category_messages(editing_prompt)

    response_str = run_gpt4o_vl_inference(vlm, messages)

    for category_name in ["Addition", "Remove", "Local", "Global", "Background"]:
        if category_name.lower() in response_str.lower():
            return category_name
    raise ValueError("Could not recognize the editing category; please input a valid add, remove, or modify instruction.")


def vlm_response_object_wait_for_edit(vlm,
                                      category,
                                      editing_prompt):
    if category in ["Background", "Global", "Addition"]:
        edit_object = "nan"
        return edit_object

    messages = create_ori_object_messages(editing_prompt)

    response_str = run_gpt4o_vl_inference(vlm, messages)
    return response_str

def vlm_response_mask(vlm,
                      category,
                      image,
                      editing_prompt,
                      object_wait_for_edit,
                      sam=None,
                      sam_predictor=None,
                      sam_automask_generator=None,
                      groundingdino_model=None,
                      ):
    mask = None
    if editing_prompt is None or len(editing_prompt) == 0:
        raise gr.Error("Please input the editing instruction!")
    height, width = image.shape[:2]
    if category == "Addition":
        base64_image = encode_image(image)
        messages = create_add_object_messages(editing_prompt, base64_image, height=height, width=width)
        try:
            response_str = run_gpt4o_vl_inference(vlm, messages)
            pattern = r'\[\d{1,3}(?:,\s*\d{1,3}){3}\]'
            box = re.findall(pattern, response_str)
            box = box[0][1:-1].split(",")
            for i in range(len(box)):
                box[i] = int(box[i])
            cus_mask = np.zeros((height, width))
            cus_mask[box[1]: box[1] + box[3], box[0]: box[0] + box[2]] = 255
            mask = cus_mask
        except Exception:
            raise gr.Error("Please set the mask manually, MLLM cannot output the mask!")

    elif category == "Background":
        labels = "background"
    elif category == "Global":
        mask = np.zeros((height, width))  # empty mask: global edits apply to the whole image
    else:
        labels = object_wait_for_edit

    if mask is None:
        # retry grounding with progressively lower box thresholds until a mask is found
        for thresh in [0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0]:
            try:
                device = "cuda" if torch.cuda.is_available() else "cpu"
                detections = run_grounded_sam(
                    input_image={"image": Image.fromarray(image.astype('uint8')),
                                 "mask": None},
                    text_prompt=labels,
                    task_type="seg",
                    box_threshold=thresh,
                    text_threshold=0.25,
                    iou_threshold=0.5,
                    scribble_mode="split",
                    sam=sam,
                    sam_predictor=sam_predictor,
                    sam_automask_generator=sam_automask_generator,
                    groundingdino_model=groundingdino_model,
                    device=device,
                )
                mask = np.array(detections[0, 0, ...].cpu()) * 255
                break
            except Exception:
                print(f"grounding failed at box threshold {thresh}, trying a lower one")
                continue
    return mask

def vlm_response_prompt_after_apply_instruction(vlm,
                                                image,
                                                editing_prompt):
    base64_image = encode_image(image)
    messages = create_apply_editing_messages(editing_prompt, base64_image)

    response_str = run_gpt4o_vl_inference(vlm, messages)
    return response_str
    	
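Taken together, the four helpers above form the VLM planning stage: classify the edit, name the object, derive a mask, and describe the post-edit content. A minimal sketch (the plan_edit wrapper is hypothetical; sam and groundingdino_model are assumed to be checkpoints already loaded via app/utils/utils.py):

from openai import OpenAI
import numpy as np

from app.gpt4_o.vlm_pipeline import (
    vlm_response_editing_type,
    vlm_response_object_wait_for_edit,
    vlm_response_mask,
    vlm_response_prompt_after_apply_instruction)

def plan_edit(image: np.ndarray, editing_prompt: str, sam, groundingdino_model):
    vlm = OpenAI()  # assumes OPENAI_API_KEY is set
    category = vlm_response_editing_type(vlm, image, editing_prompt)
    edit_object = vlm_response_object_wait_for_edit(vlm, category, editing_prompt)
    mask = vlm_response_mask(vlm, category, image, editing_prompt, edit_object,
                             sam=sam, groundingdino_model=groundingdino_model)
    target_prompt = vlm_response_prompt_after_apply_instruction(vlm, image, editing_prompt)
    return category, edit_object, mask, target_prompt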
app/utils/utils.py ADDED
@@ -0,0 +1,197 @@

import numpy as np
import torch
import torchvision

from scipy import ndimage

# BLIP
from transformers import BlipProcessor, BlipForConditionalGeneration

# SAM
from segment_anything import build_sam, SamPredictor, SamAutomaticMaskGenerator

# GroundingDINO
from groundingdino.datasets import transforms as T
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap

# lazily-initialized globals referenced by run_grounded_sam below
# (without these definitions the `global` lookup would raise NameError)
blip_processor = None
blip_model = None
inpaint_pipeline = None


def load_grounding_dino_model(model_config_path, model_checkpoint_path, device):
    args = SLConfig.fromfile(model_config_path)
    args.device = device
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(load_res)
    _ = model.eval()
    return model

def generate_caption(processor, blip_model, raw_image, device):
    # unconditional image captioning
    inputs = processor(raw_image, return_tensors="pt").to(device, torch.float16)
    out = blip_model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

def transform_image(image_pil):

    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image, _ = transform(image_pil, None)  # 3, h, w
    return image

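Despite the name, T.RandomResize([800], max_size=1333) is deterministic here: with a single candidate size, every image is resized so its short side is 800 px (long side capped at 1333), then normalized with ImageNet statistics. A quick usage sketch against one of the bundled assets:

from PIL import Image
from app.utils.utils import transform_image

tensor = transform_image(Image.open("assets/mona_lisa/mona_lisa.png").convert("RGB"))
print(tensor.shape)  # torch.Size([3, H, W]), short side resized to 800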
def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True):
    caption = caption.lower()
    caption = caption.strip()
    if not caption.endswith("."):
        caption = caption + "."

    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)

    # filter output by box confidence
    logits_filt = logits.clone()
    boxes_filt = boxes.clone()
    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
    logits_filt = logits_filt[filt_mask]  # num_filt, 256
    boxes_filt = boxes_filt[filt_mask]  # num_filt, 4

    # get phrase
    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)
    # build pred
    pred_phrases = []
    scores = []
    for logit, box in zip(logits_filt, boxes_filt):
        pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer)
        if with_logits:
            pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
        else:
            pred_phrases.append(pred_phrase)
        scores.append(logit.max().item())

    return boxes_filt, torch.Tensor(scores), pred_phrases

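get_grounding_output returns boxes in GroundingDINO's normalized (cx, cy, w, h) convention; run_grounded_sam below rescales them to pixel (x0, y0, x1, y1) corners before handing them to SAM. The same conversion as a standalone sketch (hypothetical helper name):

import torch

def cxcywh_to_xyxy_pixels(boxes: torch.Tensor, W: int, H: int) -> torch.Tensor:
    boxes = boxes * torch.tensor([W, H, W, H], dtype=boxes.dtype)
    xy0 = boxes[:, :2] - boxes[:, 2:] / 2  # top-left corner
    xy1 = xy0 + boxes[:, 2:]               # bottom-right corner
    return torch.cat([xy0, xy1], dim=1)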
def run_grounded_sam(input_image,
                     text_prompt,
                     task_type,
                     box_threshold,
                     text_threshold,
                     iou_threshold,
                     scribble_mode,
                     sam,
                     groundingdino_model,
                     sam_predictor=None,
                     sam_automask_generator=None,
                     device="cuda"):

    global blip_processor, blip_model, inpaint_pipeline

    # load image
    image = input_image["image"]
    scribble = input_image["mask"]
    size = image.size  # w, h

    if sam_predictor is None:
        sam_predictor = SamPredictor(sam)
        sam_automask_generator = SamAutomaticMaskGenerator(sam)

    image_pil = image.convert("RGB")
    image = np.array(image_pil)

    if task_type == 'scribble':
        sam_predictor.set_image(image)
        scribble = scribble.convert("RGB")
        scribble = np.array(scribble)
        scribble = scribble.transpose(2, 1, 0)[0]

        # label the connected components of the scribble
        labeled_array, num_features = ndimage.label(scribble >= 255)

        # compute the centroid of each connected component
        centers = ndimage.center_of_mass(scribble, labeled_array, range(1, num_features + 1))
        centers = np.array(centers)

        point_coords = torch.from_numpy(centers)
        point_coords = sam_predictor.transform.apply_coords_torch(point_coords, image.shape[:2])
        point_coords = point_coords.unsqueeze(0).to(device)
        point_labels = torch.from_numpy(np.array([1] * len(centers))).unsqueeze(0).to(device)
        if scribble_mode == 'split':
            point_coords = point_coords.permute(1, 0, 2)
            point_labels = point_labels.permute(1, 0)
        masks, _, _ = sam_predictor.predict_torch(
            point_coords=point_coords if len(point_coords) > 0 else None,
            point_labels=point_labels if len(point_coords) > 0 else None,
            mask_input=None,
            boxes=None,
            multimask_output=False,
        )
        return masks
    elif task_type == 'automask':
        masks = sam_automask_generator.generate(image)
        return masks
    else:
        transformed_image = transform_image(image_pil)

        if task_type == 'automatic':
            # generate caption and tags
            # Tag2Text (https://huggingface.co/spaces/xinyu1205/Tag2Text) can generate
            # better captions, but it still has some bugs
            blip_processor = blip_processor or BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
            blip_model = blip_model or BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to(device)
            text_prompt = generate_caption(blip_processor, blip_model, image_pil, device)
            print(f"Caption: {text_prompt}")

        # run the GroundingDINO model
        boxes_filt, scores, pred_phrases = get_grounding_output(
            groundingdino_model, transformed_image, text_prompt, box_threshold, text_threshold
        )

        # rescale normalized (cx, cy, w, h) boxes to pixel (x0, y0, x1, y1)
        H, W = size[1], size[0]
        for i in range(boxes_filt.size(0)):
            boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
            boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
            boxes_filt[i][2:] += boxes_filt[i][:2]

        boxes_filt = boxes_filt.cpu()

        if task_type in ('seg', 'inpainting', 'automatic'):
            sam_predictor.set_image(image)

            if task_type == 'automatic':
                # use NMS to handle overlapped boxes
                print(f"Before NMS: {boxes_filt.shape[0]} boxes")
                nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
                boxes_filt = boxes_filt[nms_idx]
                pred_phrases = [pred_phrases[idx] for idx in nms_idx]
                print(f"After NMS: {boxes_filt.shape[0]} boxes")
                print(f"Revise caption with number: {text_prompt}")

            transformed_boxes = sam_predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2]).to(device)

            masks, _, _ = sam_predictor.predict_torch(
                point_coords=None,
                point_labels=None,
                boxes=transformed_boxes,
                multimask_output=False,
            )
            return masks
        else:
            print(f"task_type: {task_type} error!")
    	
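A usage sketch for the text-prompted segmentation path (the checkpoint paths are assumptions; the ViT-H SAM and Swin-T GroundingDINO weights are the usual choices):

from PIL import Image
from segment_anything import build_sam

from app.utils.utils import load_grounding_dino_model, run_grounded_sam

sam = build_sam(checkpoint="checkpoints/sam_vit_h_4b8939.pth").to("cuda")  # assumed path
groundingdino_model = load_grounding_dino_model(
    "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",       # assumed path
    "checkpoints/groundingdino_swint_ogc.pth",                             # assumed path
    device="cuda")

masks = run_grounded_sam(
    input_image={"image": Image.open("assets/hedgehog_rm_fg/hedgehog.png"), "mask": None},
    text_prompt="hedgehog",
    task_type="seg",
    box_threshold=0.3,
    text_threshold=0.25,
    iou_threshold=0.5,
    scribble_mode="split",
    sam=sam,
    groundingdino_model=groundingdino_model,
    device="cuda")  # masks: (num_boxes, 1, H, W) boolean tensor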
assets/hedgehog_rm_fg/hedgehog.png ADDED (binary, stored with Git LFS)
assets/hedgehog_rm_fg/image_edit_82314e18-c64c-4003-9ef9-52cebf254b2f_2.png ADDED (binary, stored with Git LFS)
assets/hedgehog_rm_fg/mask_82314e18-c64c-4003-9ef9-52cebf254b2f.png ADDED (binary, stored with Git LFS)
assets/hedgehog_rm_fg/masked_image_82314e18-c64c-4003-9ef9-52cebf254b2f.png ADDED (binary, stored with Git LFS)

assets/hedgehog_rm_fg/prompt.txt ADDED
@@ -0,0 +1 @@
648464818: remove the hedgehog.

assets/hedgehog_rp_bg/hedgehog.png ADDED (binary, stored with Git LFS)
assets/hedgehog_rp_bg/image_edit_db7f8bf8-8349-46d3-b14e-43d67fbe25d3_3.png ADDED (binary, stored with Git LFS)
assets/hedgehog_rp_bg/mask_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png ADDED (binary, stored with Git LFS)
assets/hedgehog_rp_bg/masked_image_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png ADDED (binary, stored with Git LFS)

assets/hedgehog_rp_bg/prompt.txt ADDED
@@ -0,0 +1 @@
648464818: make the hedgehog in Italy.

assets/hedgehog_rp_fg/hedgehog.png ADDED (binary, stored with Git LFS)
assets/hedgehog_rp_fg/image_edit_5cab3448-5a3a-459c-9144-35cca3d34273_0.png ADDED (binary, stored with Git LFS)
assets/hedgehog_rp_fg/mask_5cab3448-5a3a-459c-9144-35cca3d34273.png ADDED (binary, stored with Git LFS)
assets/hedgehog_rp_fg/masked_image_5cab3448-5a3a-459c-9144-35cca3d34273.png ADDED (binary, stored with Git LFS)

assets/hedgehog_rp_fg/prompt.txt ADDED
@@ -0,0 +1 @@
648464818: replace the hedgehog to flamingo.

assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png ADDED (binary, stored with Git LFS)
assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png ADDED (binary, stored with Git LFS)
assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png ADDED (binary, stored with Git LFS)
assets/mona_lisa/mona_lisa.png ADDED (binary, stored with Git LFS)

assets/mona_lisa/prompt.txt ADDED
@@ -0,0 +1 @@
648464818: add a shining necklace.

assets/sunflower_girl/image_edit_99cc50b4-7dc4-4de5-8748-ec10772f0317_3.png ADDED (binary, stored with Git LFS)
assets/sunflower_girl/mask_99cc50b4-7dc4-4de5-8748-ec10772f0317.png ADDED (binary, stored with Git LFS)
assets/sunflower_girl/masked_image_99cc50b4-7dc4-4de5-8748-ec10772f0317.png ADDED (binary, stored with Git LFS)

assets/sunflower_girl/prompt.txt ADDED
@@ -0,0 +1 @@
648464818: add a wreath on head..

assets/sunflower_girl/sunflower_girl.png ADDED (binary, stored with Git LFS)
requirements.txt ADDED
@@ -0,0 +1,20 @@
torch
torchvision
torchaudio
transformers>=4.25.1
gradio==4.38.1
ftfy
tensorboard
datasets
Pillow==9.5.0
opencv-python
imgaug
accelerate==0.20.3
image-reward
hpsv2
torchmetrics
open-clip-torch
clip
segment_anything
git+https://github.com/liyaowei-stu/BrushEdit.git
git+https://github.com/IDEA-Research/Grounded-Segment-Anything.git#subdirectory=GroundingDINO