Commit c4e6a63 · 1 Parent(s): 04d6ff1
add code

Files changed:
- README.md +1 -1
- gradio_app.py +204 -0
- main.py +150 -0
- requirements.txt +121 -0
- src/attention_based_segmentation.py +67 -0
- src/attention_utils.py +99 -0
- src/diffusion_model_wrapper.py +252 -0
- src/null_text_inversion.py +201 -0
- src/prompt_mixing.py +86 -0
- src/prompt_to_prompt_controllers.py +205 -0
- style.css +3 -0
- vocab.json +0 -0
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
 colorTo: yellow
 sdk: gradio
 sdk_version: 3.23.0
-app_file:
+app_file: gradio_app.py
 pinned: false
 license: mit
 ---
gradio_app.py ADDED
@@ -0,0 +1,204 @@
from __future__ import annotations


import gradio as gr
import numpy as np
from PIL import Image

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from main import LPMConfig, main

DESCRIPTION = '''# Localizing Object-level Shape Variations with Text-to-Image Diffusion Models
This is a demo for our ''Localizing Object-level Shape Variations with Text-to-Image Diffusion Models'' [paper](https://arxiv.org/abs/2303.11306).
We introduce a method that generates object-level shape variation for a given image.
This demo allows using a real image as well as a generated image. For a real image, a matching prompt is required.
'''

def main_pipeline(
        prompt: str,
        object_of_interest: str,
        proxy_words: str,
        number_of_variations: int,
        start_prompt_range: int,
        end_prompt_range: int,
        objects_to_preserve: str,
        background_nouns: str,
        seed: int,
        input_image: str):
    prompt = prompt.replace(object_of_interest, '{word}')
    print(number_of_variations)
    print(proxy_words)
    proxy_words = proxy_words.split(',') if proxy_words != '' else []
    objects_to_preserve = objects_to_preserve.split(',') if objects_to_preserve != '' else []
    background_nouns = background_nouns.split(',') if background_nouns != '' else []
    args = LPMConfig(
        seed=seed,
        prompt=prompt,
        object_of_interest=object_of_interest,
        proxy_words=proxy_words,
        number_of_variations=number_of_variations,
        start_prompt_range=start_prompt_range,
        end_prompt_range=end_prompt_range,
        objects_to_preserve=objects_to_preserve,
        background_nouns=background_nouns,
        real_image_path="" if input_image is None else input_image
    )

    result_images, result_proxy_words = main(args)
    result_images = [im.permute(1, 2, 0).cpu().numpy() for im in result_images]
    result_images = [(im * 255).astype(np.uint8) for im in result_images]
    result_images = [Image.fromarray(im) for im in result_images]

    return result_images, ",".join(result_proxy_words)


with gr.Blocks(css='style.css') as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(
                label="Input image (optional)",
                type="filepath"
            )
            prompt = gr.Text(
                label='Prompt',
                max_lines=1,
                placeholder='A table below a lamp',
            )
            object_of_interest = gr.Text(
                label='Object of interest',
                max_lines=1,
                placeholder='lamp',
            )
            proxy_words = gr.Text(
                label='Proxy words - words used to obtain variations (a comma-separated list of words, can leave empty)',
                max_lines=1,
                placeholder=''
            )
            number_of_variations = gr.Slider(
                label='Number of variations (used only for automatic proxy-words)',
                minimum=2,
                maximum=30,
                value=20,
                step=1
            )
            start_prompt_range = gr.Slider(
                label='Number of steps before starting shape interval',
                minimum=0,
                maximum=50,
                value=7,
                step=1
            )
            end_prompt_range = gr.Slider(
                label='Number of steps before ending shape interval',
                minimum=1,
                maximum=50,
                value=17,
                step=1
            )
            objects_to_preserve = gr.Text(
                label='Words corresponding to objects to preserve (a comma-separated list of words, can leave empty)',
                max_lines=1,
                placeholder='table',
            )
            background_nouns = gr.Text(
                label='Words corresponding to objects that should be copied from original image (a comma-separated list of words, can leave empty)',
                max_lines=1,
                placeholder='',
            )
            seed = gr.Slider(
                label='Seed',
                minimum=1,
                maximum=100000,
                value=0,
                step=1
            )

            run_button = gr.Button('Generate')
        with gr.Column():
            result = gr.Gallery(label='Result').style(grid=4)
            proxy_words_result = gr.Text(label='Used proxy words')

    examples = [
        [
            "hamster eating watermelon on the beach",
            "watermelon",
            "",
            20,
            6,
            16,
            "",
            "hamster,beach",
            48,
            None
        ],
        [
            "A decorated lamp in the livingroom",
            "lamp",
            "",
            20,
            4,
            14,
            "livingroom",
            "",
            42,
            None
        ],
        [
            "a snake in the field eats an apple",
            "snake",
            "",
            20,
            7,
            17,
            "apple",
            "apple,field",
            10,
            None
        ]
    ]

    gr.Examples(examples=examples,
                inputs=[
                    prompt,
                    object_of_interest,
                    proxy_words,
                    number_of_variations,
                    start_prompt_range,
                    end_prompt_range,
                    objects_to_preserve,
                    background_nouns,
                    seed,
                    input_image
                ],
                outputs=[
                    result,
                    proxy_words_result
                ],
                fn=main_pipeline,
                cache_examples=False)


    inputs = [
        prompt,
        object_of_interest,
        proxy_words,
        number_of_variations,
        start_prompt_range,
        end_prompt_range,
        objects_to_preserve,
        background_nouns,
        seed,
        input_image
    ]
    outputs = [
        result,
        proxy_words_result
    ]
    run_button.click(fn=main_pipeline, inputs=inputs, outputs=outputs)

demo.queue(max_size=50).launch(share=False)
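
For clarity, a minimal illustration (with made-up values, not part of the commit) of how main_pipeline above normalizes the UI inputs before building an LPMConfig: the object of interest is replaced by the '{word}' placeholder and the comma-separated text fields become lists:

    # Illustrative values only; mirrors the first lines of main_pipeline.
    prompt = "a snake in the field eats an apple".replace("snake", "{word}")
    # -> "a {word} in the field eats an apple"
    background_nouns = "apple,field".split(',') if "apple,field" != '' else []
    # -> ["apple", "field"]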
main.py ADDED
@@ -0,0 +1,150 @@
import json
import os
from dataclasses import dataclass, field
from typing import List

import pyrallis
import torch
from torch.utils.data import DataLoader
from torchvision.utils import save_image
from torchvision.transforms import ToTensor
from tqdm import tqdm

from src.prompt_to_prompt_controllers import AttentionStore, AttentionReplace
from src.null_text_inversion import invert_image
from src.prompt_utils import get_proxy_prompts
from src.prompt_mixing import PromptMixing
from src.diffusion_model_wrapper import DiffusionModelWrapper, get_stable_diffusion_model, get_stable_diffusion_config, \
    generate_original_image


def save_args_dict(args, similar_words):
    exp_path = os.path.join(args.exp_dir, args.prompt.replace(' ', '-'), f"seed={args.seed}_{args.exp_name}")
    os.makedirs(exp_path, exist_ok=True)

    args_dict = vars(args)
    args_dict['similar_words'] = similar_words
    with open(os.path.join(exp_path, "opt.json"), 'w') as fp:
        json.dump(args_dict, fp, sort_keys=True, indent=4)

    return exp_path


def main(args):
    ldm_stable = get_stable_diffusion_model(args)
    ldm_stable_config = get_stable_diffusion_config(args)

    similar_words, prompts, another_prompts = get_proxy_prompts(args, ldm_stable)
    exp_path = save_args_dict(args, similar_words)

    images = []
    x_t = None
    uncond_embeddings = None

    if args.real_image_path != "":
        x_t, uncond_embeddings = invert_image(args, ldm_stable, ldm_stable_config, prompts, exp_path)

    image, x_t, orig_all_latents, orig_mask, average_attention = generate_original_image(args, ldm_stable, ldm_stable_config, prompts, x_t, uncond_embeddings)
    save_image(ToTensor()(image[0]), f"{exp_path}/{similar_words[0]}.jpg")
    save_image(torch.from_numpy(orig_mask).float(), f"{exp_path}/{similar_words[0]}_mask.jpg")
    images.append(image[0])

    object_of_interest_index = args.prompt.split().index('{word}') + 1
    pm = PromptMixing(args, object_of_interest_index, average_attention)

    do_other_obj_self_attn_masking = len(args.objects_to_preserve) > 0 and args.end_preserved_obj_self_attn_masking > 0
    do_self_or_cross_attn_inject = args.cross_attn_inject_steps != 0.0 or args.self_attn_inject_steps != 0.0
    if do_other_obj_self_attn_masking:
        print("Do self attn other obj masking")
    if do_self_or_cross_attn_inject:
        print(f'Do self attn inject for {args.self_attn_inject_steps} steps')
        print(f'Do cross attn inject for {args.cross_attn_inject_steps} steps')

    another_prompts_dataloader = DataLoader(another_prompts[1:], batch_size=args.batch_size, shuffle=False)

    for another_prompt_batch in tqdm(another_prompts_dataloader):
        batch_size = len(another_prompt_batch["word"])
        batch_prompts = prompts * batch_size
        batch_another_prompt = another_prompt_batch["prompt"]
        if do_self_or_cross_attn_inject or do_other_obj_self_attn_masking:
            batch_prompts.append(prompts[0])
            batch_another_prompt.insert(0, prompts[0])

        if do_self_or_cross_attn_inject:
            controller = AttentionReplace(batch_another_prompt, ldm_stable.tokenizer, ldm_stable.device,
                                          ldm_stable_config["low_resource"], ldm_stable_config["num_diffusion_steps"],
                                          cross_replace_steps=args.cross_attn_inject_steps,
                                          self_replace_steps=args.self_attn_inject_steps)
        else:
            controller = AttentionStore(ldm_stable_config["low_resource"])

        diffusion_model_wrapper = DiffusionModelWrapper(args, ldm_stable, ldm_stable_config, controller, prompt_mixing=pm)
        with torch.no_grad():
            image, x_t, _, mask = diffusion_model_wrapper.forward(batch_prompts, latent=x_t, other_prompt=batch_another_prompt,
                                                                  post_background=args.background_post_process, orig_all_latents=orig_all_latents,
                                                                  orig_mask=orig_mask, uncond_embeddings=uncond_embeddings)

        for i in range(batch_size):
            image_index = i + 1 if do_self_or_cross_attn_inject or do_other_obj_self_attn_masking else i
            save_image(ToTensor()(image[image_index]), f"{exp_path}/{another_prompt_batch['word'][i]}.jpg")
            if mask is not None:
                save_image(torch.from_numpy(mask).float(), f"{exp_path}/{another_prompt_batch['word'][i]}_mask.jpg")
            images.append(image[image_index])

    images = [ToTensor()(image) for image in images]
    save_image(images, f"{exp_path}/grid.jpg", nrow=min(max([i for i in range(2, 8) if len(images) % i == 0]), 8))
    return images, similar_words


@dataclass
class LPMConfig:

    # general config
    seed: int = 10
    batch_size: int = 1
    exp_dir: str = "results"
    exp_name: str = ""
    display_images: bool = False
    gpu_id: int = 0

    # Stable Diffusion config
    auth_token: str = ""
    low_resource: bool = True
    num_diffusion_steps: int = 50
    guidance_scale: float = 7.5
    max_num_words: int = 77

    # prompt-mixing
    prompt: str = "a {word} in the field eats an apple"
    object_of_interest: str = "snake"  # The object for which we generate variations
    proxy_words: List[str] = field(default_factory=lambda: [])  # Leave empty for automatic proxy words
    number_of_variations: int = 20
    start_prompt_range: int = 7  # Number of steps before prompt-mixing begins
    end_prompt_range: int = 17  # Number of steps before prompt-mixing ends

    # attention based shape localization
    objects_to_preserve: List[str] = field(default_factory=lambda: [])  # Objects to which attention-based shape localization is applied
    remove_obj_from_self_mask: bool = True  # If set to True, removes the object of interest from the self-attention mask
    obj_pixels_injection_threshold: float = 0.05
    end_preserved_obj_self_attn_masking: int = 40

    # real image
    real_image_path: str = ""

    # controllable background preservation
    background_post_process: bool = True
    background_nouns: List[str] = field(default_factory=lambda: [])  # Objects to take from the original image in addition to the background
    num_segments: int = 5  # Number of clusters for the segmentation
    background_segment_threshold: float = 0.3  # Threshold for the segments labeling
    background_blend_timestep: int = 35  # Number of steps before background blending

    # other
    cross_attn_inject_steps: float = 0.0
    self_attn_inject_steps: float = 0.0


if __name__ == '__main__':
    args = pyrallis.parse(config_class=LPMConfig)

    print(args)
    main(args)
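
For reference, a minimal sketch (assumptions noted in comments, not part of the commit) of driving LPMConfig and main above without the Gradio UI; pyrallis should also expose the same dataclass fields as command-line flags when main.py is run directly:

    from main import LPMConfig, main

    # Illustrative configuration: '{word}' marks the slot of the object of interest.
    args = LPMConfig(
        prompt="a {word} in the field eats an apple",
        object_of_interest="snake",
        proxy_words=[],                      # empty -> proxy words are chosen automatically
        number_of_variations=20,
        start_prompt_range=7,                # prompt-mixing interval, in diffusion steps
        end_prompt_range=17,
        objects_to_preserve=["apple"],
        background_nouns=["apple", "field"],
        seed=10,
    )
    images, proxy_words = main(args)         # image tensors and the proxy words that were used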
requirements.txt ADDED
@@ -0,0 +1,121 @@
accelerate==0.18.0
anyio==3.6.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.2.1
attrs==22.2.0
backcall==0.2.0
backports.functools-lru-cache==1.6.4
beautifulsoup4==4.11.2
bleach==6.0.0
brotlipy==0.7.0
certifi==2022.12.7
cffi==1.15.1
chardet==4.0.0
charset-normalizer==2.0.4
click==8.1.3
comm==0.1.2
contourpy==1.0.5
cryptography==38.0.4
cycler==0.11.0
debugpy==1.5.1
decorator==5.1.1
defusedxml==0.7.1
diffusers==0.10.2
entrypoints==0.4
executing==1.2.0
fastjsonschema==2.16.2
filelock==3.10.4
flit_core==3.6.0
fonttools==4.25.0
huggingface-hub==0.13.3
idna==3.4
importlib-metadata==6.0.0
importlib-resources==5.10.2
ipykernel==6.19.2
ipython==8.8.0
ipython-genutils==0.2.0
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonschema==4.17.3
jupyter-client==7.3.4
jupyter_core==4.12.0
jupyter-server==1.23.5
jupyterlab-pygments==0.2.2
kiwisolver==1.4.4
MarkupSafe==2.1.2
matplotlib==3.6.2
matplotlib-inline==0.1.6
mistune==2.0.5
mkl-fft==1.3.1
mkl-random==1.2.2
mkl-service==2.4.0
munkres==1.1.4
mypy-extensions==1.0.0
nbclassic==0.5.1
nbclient==0.7.2
nbconvert==7.2.9
nbformat==5.7.3
nest-asyncio==1.5.6
nltk==3.8.1
notebook==6.5.2
notebook_shim==0.2.2
numpy==1.23.5
opencv-python==4.7.0.72
packaging==23.0
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.3.0
pip==23.0.1
pkgutil_resolve_name==1.3.10
ply==3.11
prometheus-client==0.16.0
prompt-toolkit==3.0.36
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
Pygments==2.14.0
pyOpenSSL==22.0.0
pyparsing==3.0.9
PyQt5-sip==12.11.0
pyrallis==0.3.1
pyrsistent==0.19.3
PySocks==1.7.1
python-dateutil==2.8.2
PyYAML==6.0
pyzmq==25.0.0
regex==2023.3.23
requests==2.28.1
scikit-learn==1.2.2
scipy==1.10.1
Send2Trash==1.8.0
setuptools==65.6.3
sip==6.6.2
six==1.16.0
sniffio==1.3.0
soupsieve==2.3.2.post1
stack-data==0.6.2
terminado==0.17.1
threadpoolctl==3.1.0
tinycss2==1.2.1
tokenizers==0.13.2
toml==0.10.2
torch==1.13.1
torchaudio==0.13.1
torchvision==0.14.1
tornado==6.2
tqdm==4.65.0
traitlets==5.7.1
transformers==4.25.1
typing_extensions==4.4.0
typing-inspect==0.8.0
urllib3==1.26.14
wcwidth==0.2.6
webencodings==0.5.1
websocket-client==1.5.1
wheel==0.37.1
zipp==3.11.0
src/attention_based_segmentation.py ADDED
@@ -0,0 +1,67 @@
import nltk
from sklearn.cluster import KMeans
import numpy as np

from src.attention_utils import aggregate_attention


class Segmentor:

    def __init__(self, controller, prompts, num_segments, background_segment_threshold, res=32, background_nouns=[]):
        self.controller = controller
        self.prompts = prompts
        self.num_segments = num_segments
        self.background_segment_threshold = background_segment_threshold
        self.resolution = res
        self.background_nouns = background_nouns

        self.self_attention = aggregate_attention(controller, res=32, from_where=("up", "down"), prompts=prompts,
                                                  is_cross=False, select=len(prompts) - 1)
        self.cross_attention = aggregate_attention(controller, res=16, from_where=("up", "down"), prompts=prompts,
                                                   is_cross=True, select=len(prompts) - 1)
        tokenized_prompt = nltk.word_tokenize(prompts[-1])
        self.nouns = [(i, word) for (i, (word, pos)) in enumerate(nltk.pos_tag(tokenized_prompt)) if pos[:2] == 'NN']

    def __call__(self, *args, **kwargs):
        clusters = self.cluster()
        cluster2noun = self.cluster2noun(clusters)
        return cluster2noun

    def cluster(self):
        np.random.seed(1)
        resolution = self.self_attention.shape[0]
        attn = self.self_attention.cpu().numpy().reshape(resolution ** 2, resolution ** 2)
        kmeans = KMeans(n_clusters=self.num_segments, n_init=10).fit(attn)
        clusters = kmeans.labels_
        clusters = clusters.reshape(resolution, resolution)
        return clusters

    def cluster2noun(self, clusters):
        result = {}
        nouns_indices = [index for (index, word) in self.nouns]
        nouns_maps = self.cross_attention.cpu().numpy()[:, :, [i + 1 for i in nouns_indices]]
        normalized_nouns_maps = np.zeros_like(nouns_maps).repeat(2, axis=0).repeat(2, axis=1)
        for i in range(nouns_maps.shape[-1]):
            curr_noun_map = nouns_maps[:, :, i].repeat(2, axis=0).repeat(2, axis=1)
            normalized_nouns_maps[:, :, i] = (curr_noun_map - np.abs(curr_noun_map.min())) / curr_noun_map.max()
        for c in range(self.num_segments):
            cluster_mask = np.zeros_like(clusters)
            cluster_mask[clusters == c] = 1
            score_maps = [cluster_mask * normalized_nouns_maps[:, :, i] for i in range(len(nouns_indices))]
            scores = [score_map.sum() / cluster_mask.sum() for score_map in score_maps]
            result[c] = self.nouns[np.argmax(np.array(scores))] if max(scores) > self.background_segment_threshold else "BG"
        return result

    def get_background_mask(self, obj_token_index):
        clusters = self.cluster()
        cluster2noun = self.cluster2noun(clusters)
        mask = clusters.copy()
        obj_segments = [c for c in cluster2noun if cluster2noun[c][0] == obj_token_index - 1]
        background_segments = [c for c in cluster2noun if cluster2noun[c] == "BG" or cluster2noun[c][1] in self.background_nouns]
        for c in range(self.num_segments):
            if c in background_segments and c not in obj_segments:
                mask[clusters == c] = 0
            else:
                mask[clusters == c] = 1
        return mask
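
For context, a usage sketch of this Segmentor, mirroring generate_original_image in src/diffusion_model_wrapper.py; controller and prompts are assumed to come from a finished diffusion pass, and the other values are illustrative:

    from src.attention_based_segmentation import Segmentor

    # `controller` is an AttentionStore filled during generation, `prompts` the prompt list.
    segmentor = Segmentor(controller, prompts,
                          num_segments=5,                      # LPMConfig defaults
                          background_segment_threshold=0.3,
                          background_nouns=["apple", "field"])
    prompt_template = "a {word} in the field eats an apple"
    obj_token_index = prompt_template.split(' ').index("{word}") + 1   # +1 for the start-of-text token
    mask = segmentor.get_background_mask(obj_token_index)              # 0 marks background segments, 1 the rest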
src/attention_utils.py ADDED
@@ -0,0 +1,99 @@
import torch
import numpy as np
from typing import Tuple, List
from cv2 import putText, getTextSize, FONT_HERSHEY_SIMPLEX
import matplotlib.pyplot as plt
from PIL import Image

from src.prompt_to_prompt_controllers import AttentionStore

def aggregate_attention(attention_store: AttentionStore, res: int, from_where: List[str], is_cross: bool, select: int, prompts):
    out = []
    attention_maps = attention_store.get_average_attention()
    num_pixels = res ** 2
    for location in from_where:
        for item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]:
            if item.shape[1] == num_pixels:
                cross_maps = item.reshape(len(prompts), -1, res, res, item.shape[-1])[select]
                out.append(cross_maps)
    out = torch.cat(out, dim=0)
    out = out.sum(0) / out.shape[0]
    return out.cpu()


def show_cross_attention(attention_store: AttentionStore, res: int, from_where: List[str], prompts, tokenizer, select: int = 0):
    tokens = tokenizer.encode(prompts[select])
    decoder = tokenizer.decode
    attention_maps = aggregate_attention(attention_store, res, from_where, True, select, prompts)
    images = []
    for i in range(len(tokens)):
        image = attention_maps[:, :, i]
        image = 255 * image / image.max()
        image = image.unsqueeze(-1).expand(*image.shape, 3)
        image = image.numpy().astype(np.uint8)
        image = np.array(Image.fromarray(image).resize((256, 256)))
        image = text_under_image(image, decoder(int(tokens[i])))
        images.append(image)
    view_images(np.stack(images, axis=0))


def show_self_attention_comp(attention_store: AttentionStore, res: int, from_where: List[str], prompts,
                             max_com=10, select: int = 0):
    attention_maps = aggregate_attention(attention_store, res, from_where, False, select, prompts).numpy().reshape(
        (res ** 2, res ** 2))
    u, s, vh = np.linalg.svd(attention_maps - np.mean(attention_maps, axis=1, keepdims=True))
    images = []
    for i in range(max_com):
        image = vh[i].reshape(res, res)
        image = image - image.min()
        image = 255 * image / image.max()
        image = np.repeat(np.expand_dims(image, axis=2), 3, axis=2).astype(np.uint8)
        image = Image.fromarray(image).resize((256, 256))
        image = np.array(image)
        images.append(image)
    view_images(np.concatenate(images, axis=1))


def view_images(images, num_rows=1, offset_ratio=0.02):
    if type(images) is list:
        num_empty = len(images) % num_rows
    elif images.ndim == 4:
        num_empty = images.shape[0] % num_rows
    else:
        images = [images]
        num_empty = 0

    empty_images = np.ones(images[0].shape, dtype=np.uint8) * 255
    images = [image.astype(np.uint8) for image in images] + [empty_images] * num_empty
    num_items = len(images)

    h, w, c = images[0].shape
    offset = int(h * offset_ratio)
    num_cols = num_items // num_rows
    image_ = np.ones((h * num_rows + offset * (num_rows - 1),
                      w * num_cols + offset * (num_cols - 1), 3), dtype=np.uint8) * 255
    for i in range(num_rows):
        for j in range(num_cols):
            image_[i * (h + offset): i * (h + offset) + h:, j * (w + offset): j * (w + offset) + w] = images[
                i * num_cols + j]

    pil_img = Image.fromarray(image_)
    display(pil_img)


def text_under_image(image: np.ndarray, text: str, text_color: Tuple[int, int, int] = (0, 0, 0)):
    h, w, c = image.shape
    offset = int(h * .2)
    img = np.ones((h + offset, w, c), dtype=np.uint8) * 255
    font = FONT_HERSHEY_SIMPLEX
    img[:h] = image
    textsize = getTextSize(text, font, 1, 2)[0]
    text_x, text_y = (w - textsize[0]) // 2, h + offset - textsize[1] // 2
    putText(img, text, (text_x, text_y), font, 1, text_color, 2)
    return img


def display(image):
    global display_index
    plt.imshow(image)
    plt.show()
src/diffusion_model_wrapper.py ADDED
@@ -0,0 +1,252 @@
import torch
import numpy as np
from typing import Optional, List

from diffusers import DDIMScheduler, StableDiffusionPipeline
from tqdm import tqdm
from cv2 import dilate

from src.attention_utils import show_cross_attention
from src.attention_based_segmentation import Segmentor
from src.prompt_to_prompt_controllers import DummyController, AttentionStore


def get_stable_diffusion_model(args):
    device = torch.device(f'cuda:{args.gpu_id}') if torch.cuda.is_available() else torch.device('cpu')
    if args.real_image_path != "":
        scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
        ldm_stable = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=args.auth_token, scheduler=scheduler).to(device)
    else:
        ldm_stable = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=args.auth_token).to(device)

    return ldm_stable

def get_stable_diffusion_config(args):
    return {
        "low_resource": args.low_resource,
        "num_diffusion_steps": args.num_diffusion_steps,
        "guidance_scale": args.guidance_scale,
        "max_num_words": args.max_num_words
    }


def generate_original_image(args, ldm_stable, ldm_stable_config, prompts, latent, uncond_embeddings):
    g_cpu = torch.Generator(device=ldm_stable.device).manual_seed(args.seed)
    controller = AttentionStore(ldm_stable_config["low_resource"])
    diffusion_model_wrapper = DiffusionModelWrapper(args, ldm_stable, ldm_stable_config, controller, generator=g_cpu)
    image, x_t, orig_all_latents, _ = diffusion_model_wrapper.forward(prompts,
                                                                      latent=latent,
                                                                      uncond_embeddings=uncond_embeddings)
    orig_mask = Segmentor(controller, prompts, args.num_segments, args.background_segment_threshold, background_nouns=args.background_nouns)\
        .get_background_mask(args.prompt.split(' ').index("{word}") + 1)
    average_attention = controller.get_average_attention()
    return image, x_t, orig_all_latents, orig_mask, average_attention


class DiffusionModelWrapper:
    def __init__(self, args, model, model_config, controller=None, prompt_mixing=None, generator=None):
        self.args = args
        self.model = model
        self.model_config = model_config
        self.controller = controller
        if self.controller is None:
            self.controller = DummyController()
        self.prompt_mixing = prompt_mixing
        self.device = model.device
        self.generator = generator

        self.height = 512
        self.width = 512

        self.diff_step = 0
        self.register_attention_control()


    def diffusion_step(self, latents, context, t, other_context=None):
        if self.model_config["low_resource"]:
            self.uncond_pred = True
            noise_pred_uncond = self.model.unet(latents, t, encoder_hidden_states=(context[0], None))["sample"]
            self.uncond_pred = False
            noise_prediction_text = self.model.unet(latents, t, encoder_hidden_states=(context[1], other_context))["sample"]
        else:
            latents_input = torch.cat([latents] * 2)
            noise_pred = self.model.unet(latents_input, t, encoder_hidden_states=(context, other_context))["sample"]
            noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + self.model_config["guidance_scale"] * (noise_prediction_text - noise_pred_uncond)
        latents = self.model.scheduler.step(noise_pred, t, latents)["prev_sample"]
        latents = self.controller.step_callback(latents)
        return latents


    def latent2image(self, latents):
        latents = 1 / 0.18215 * latents
        image = self.model.vae.decode(latents)['sample']
        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).numpy()
        image = (image * 255).astype(np.uint8)
        return image


    def init_latent(self, latent, batch_size):
        if latent is None:
            latent = torch.randn(
                (1, self.model.unet.in_channels, self.height // 8, self.width // 8),
                generator=self.generator, device=self.model.device
            )
        latents = latent.expand(batch_size, self.model.unet.in_channels, self.height // 8, self.width // 8).to(self.device)
        return latent, latents


    def register_attention_control(self):
        def ca_forward(model_self, place_in_unet):
            to_out = model_self.to_out
            if type(to_out) is torch.nn.modules.container.ModuleList:
                to_out = model_self.to_out[0]
            else:
                to_out = model_self.to_out

            def forward(x, context=None, mask=None):
                batch_size, sequence_length, dim = x.shape
                h = model_self.heads
                q = model_self.to_q(x)
                is_cross = context is not None
                context = context if is_cross else (x, None)

                k = model_self.to_k(context[0])
                if is_cross and self.prompt_mixing is not None:
                    v_context = self.prompt_mixing.get_context_for_v(self.diff_step, context[0], context[1])
                    v = model_self.to_v(v_context)
                else:
                    v = model_self.to_v(context[0])

                q = model_self.reshape_heads_to_batch_dim(q)
                k = model_self.reshape_heads_to_batch_dim(k)
                v = model_self.reshape_heads_to_batch_dim(v)

                sim = torch.einsum("b i d, b j d -> b i j", q, k) * model_self.scale

                if mask is not None:
                    mask = mask.reshape(batch_size, -1)
                    max_neg_value = -torch.finfo(sim.dtype).max
                    mask = mask[:, None, :].repeat(h, 1, 1)
                    sim.masked_fill_(~mask, max_neg_value)

                # attention, what we cannot get enough of
                attn = sim.softmax(dim=-1)
                if self.enbale_attn_controller_changes:
                    attn = self.controller(attn, is_cross, place_in_unet)

                if is_cross and context[1] is not None and self.prompt_mixing is not None:
                    attn = self.prompt_mixing.get_cross_attn(self, self.diff_step, attn, place_in_unet, batch_size)

                if not is_cross and (not self.model_config["low_resource"] or not self.uncond_pred) and self.prompt_mixing is not None:
                    attn = self.prompt_mixing.get_self_attn(self, self.diff_step, attn, place_in_unet, batch_size)

                out = torch.einsum("b i j, b j d -> b i d", attn, v)
                out = model_self.reshape_batch_dim_to_heads(out)
                return to_out(out)

            return forward

        def register_recr(net_, count, place_in_unet):
            if net_.__class__.__name__ == 'CrossAttention':
                net_.forward = ca_forward(net_, place_in_unet)
                return count + 1
            elif hasattr(net_, 'children'):
                for net__ in net_.children():
                    count = register_recr(net__, count, place_in_unet)
            return count

        cross_att_count = 0
        sub_nets = self.model.unet.named_children()
        for net in sub_nets:
            if "down" in net[0]:
                cross_att_count += register_recr(net[1], 0, "down")
            elif "up" in net[0]:
                cross_att_count += register_recr(net[1], 0, "up")
            elif "mid" in net[0]:
                cross_att_count += register_recr(net[1], 0, "mid")
        self.controller.num_att_layers = cross_att_count


    def get_text_embedding(self, prompt: List[str], max_length=None, truncation=True):
        text_input = self.model.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.model.tokenizer.model_max_length if max_length is None else max_length,
            truncation=truncation,
            return_tensors="pt",
        )
        text_embeddings = self.model.text_encoder(text_input.input_ids.to(self.device))[0]
        max_length = text_input.input_ids.shape[-1]
        return text_embeddings, max_length


    @torch.no_grad()
    def forward(self, prompt: List[str], latent: Optional[torch.FloatTensor] = None,
                other_prompt: List[str] = None, post_background=False, orig_all_latents=None, orig_mask=None,
                uncond_embeddings=None, start_time=51, return_type='image'):
        self.enbale_attn_controller_changes = True
        batch_size = len(prompt)

        text_embeddings, max_length = self.get_text_embedding(prompt)
        if uncond_embeddings is None:
            uncond_embeddings_, _ = self.get_text_embedding([""] * batch_size, max_length=max_length, truncation=False)
        else:
            uncond_embeddings_ = None

        other_context = None
        if other_prompt is not None:
            other_text_embeddings, _ = self.get_text_embedding(other_prompt)
            other_context = other_text_embeddings

        latent, latents = self.init_latent(latent, batch_size)

        # set timesteps
        self.model.scheduler.set_timesteps(self.model_config["num_diffusion_steps"])
        all_latents = []

        object_mask = None
        self.diff_step = 0
        for i, t in enumerate(tqdm(self.model.scheduler.timesteps[-start_time:])):
            if uncond_embeddings_ is None:
                context = [uncond_embeddings[i].expand(*text_embeddings.shape), text_embeddings]
            else:
                context = [uncond_embeddings_, text_embeddings]
            if not self.model_config["low_resource"]:
                context = torch.cat(context)

            self.down_cross_index = 0
            self.mid_cross_index = 0
            self.up_cross_index = 0
            latents = self.diffusion_step(latents, context, t, other_context)

            if post_background and self.diff_step == self.args.background_blend_timestep:
                object_mask = Segmentor(self.controller,
                                        prompt,
                                        self.args.num_segments,
                                        self.args.background_segment_threshold,
                                        background_nouns=self.args.background_nouns)\
                    .get_background_mask(self.args.prompt.split(' ').index("{word}") + 1)
                self.enbale_attn_controller_changes = False
                mask = object_mask.astype(np.bool8) + orig_mask.astype(np.bool8)
                mask = torch.from_numpy(mask).float().cuda()
                shape = (1, 1, mask.shape[0], mask.shape[1])
                mask = torch.nn.Upsample(size=(64, 64), mode='nearest')(mask.view(shape))
                mask_eroded = dilate(mask.cpu().numpy()[0, 0], np.ones((3, 3), np.uint8), iterations=1)
                mask = torch.from_numpy(mask_eroded).float().cuda().view(1, 1, 64, 64)
                latents = mask * latents + (1 - mask) * orig_all_latents[self.diff_step]

            all_latents.append(latents)
            self.diff_step += 1

        if return_type == 'image':
            image = self.latent2image(latents)
        else:
            image = latents

        return image, latent, all_latents, object_mask


    def show_last_cross_attention(self, res: int, from_where: List[str], prompts, select: int = 0):
        show_cross_attention(self.controller, res, from_where, prompts, tokenizer=self.model.tokenizer, select=select)
src/null_text_inversion.py ADDED
@@ -0,0 +1,201 @@
from typing import Union

from torchvision.transforms import ToTensor
from torchvision.utils import save_image
from tqdm import tqdm
import torch
from torch.optim.adam import Adam
import torch.nn.functional as nnf
import numpy as np
from PIL import Image


def load_512(image_path, left=0, right=0, top=0, bottom=0):
    if type(image_path) is str:
        image = np.array(Image.open(image_path))[:, :, :3]
    else:
        image = image_path
    h, w, c = image.shape
    left = min(left, w-1)
    right = min(right, w - left - 1)
    top = min(top, h - left - 1)
    bottom = min(bottom, h - top - 1)
    image = image[top:h-bottom, left:w-right]
    h, w, c = image.shape
    if h < w:
        offset = (w - h) // 2
        image = image[:, offset:offset + h]
    elif w < h:
        offset = (h - w) // 2
        image = image[offset:offset + w]
    image = np.array(Image.fromarray(image).resize((512, 512)))
    return image


def invert_image(args, ldm_stable, ldm_stable_config, prompts, exp_path):
    print("Start null text inversion")
    null_inversion = NullInversion(ldm_stable, ldm_stable_config)
    (image_gt, image_enc), x_t, uncond_embeddings = null_inversion.invert(args.real_image_path, prompts[0], offsets=(0,0,0,0), verbose=True)
    save_image(ToTensor()(image_gt), f"{exp_path}/real_image.jpg")
    save_image(ToTensor()(image_enc), f"{exp_path}/image_enc.jpg")
    print("End null text inversion")
    return x_t, uncond_embeddings


class NullInversion:

    def __init__(self, model, model_config):
        self.model = model
        self.model_config = model_config
        self.tokenizer = self.model.tokenizer
        self.model.scheduler.set_timesteps(self.model_config["num_diffusion_steps"])
        self.prompt = None
        self.context = None


    def prev_step(self, model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, sample: Union[torch.FloatTensor, np.ndarray]):
        prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        pred_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
        pred_sample_direction = (1 - alpha_prod_t_prev) ** 0.5 * model_output
        prev_sample = alpha_prod_t_prev ** 0.5 * pred_original_sample + pred_sample_direction
        return prev_sample

    def next_step(self, model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, sample: Union[torch.FloatTensor, np.ndarray]):
        timestep, next_timestep = min(timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps, 999), timestep
        alpha_prod_t = self.scheduler.alphas_cumprod[timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
        alpha_prod_t_next = self.scheduler.alphas_cumprod[next_timestep]
        beta_prod_t = 1 - alpha_prod_t
        next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
        next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output
        next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction
        return next_sample

    def get_noise_pred_single(self, latents, t, context):
        noise_pred = self.model.unet(latents, t, encoder_hidden_states=context)["sample"]
        return noise_pred

    def get_noise_pred(self, latents, t, is_forward=True, context=None):
        latents_input = torch.cat([latents] * 2)
        if context is None:
            context = self.context
        guidance_scale = 1 if is_forward else self.model_config["guidance_scale"]
        noise_pred = self.model.unet(latents_input, t, encoder_hidden_states=context)["sample"]
        noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
        if is_forward:
            latents = self.next_step(noise_pred, t, latents)
        else:
            latents = self.prev_step(noise_pred, t, latents)
        return latents

    @torch.no_grad()
    def latent2image(self, latents, return_type='np'):
        latents = 1 / 0.18215 * latents.detach()
        image = self.model.vae.decode(latents)['sample']
        if return_type == 'np':
            image = (image / 2 + 0.5).clamp(0, 1)
            image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
            image = (image * 255).astype(np.uint8)
        return image

    @torch.no_grad()
    def image2latent(self, image):
        with torch.no_grad():
            if type(image) is Image:
                image = np.array(image)
            if type(image) is torch.Tensor and image.dim() == 4:
                latents = image
            else:
                image = torch.from_numpy(image).float() / 127.5 - 1
                image = image.permute(2, 0, 1).unsqueeze(0).to(self.model.device)
                latents = self.model.vae.encode(image)['latent_dist'].mean
                latents = latents * 0.18215
        return latents

    @torch.no_grad()
    def init_prompt(self, prompt: str):
        uncond_input = self.model.tokenizer(
            [""], padding="max_length", max_length=self.model.tokenizer.model_max_length,
            return_tensors="pt"
        )
        uncond_embeddings = self.model.text_encoder(uncond_input.input_ids.to(self.model.device))[0]
        text_input = self.model.tokenizer(
            [prompt],
            padding="max_length",
            max_length=self.model.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_embeddings = self.model.text_encoder(text_input.input_ids.to(self.model.device))[0]
        self.context = torch.cat([uncond_embeddings, text_embeddings])
        self.prompt = prompt

    @torch.no_grad()
    def ddim_loop(self, latent):
        uncond_embeddings, cond_embeddings = self.context.chunk(2)
        all_latent = [latent]
        latent = latent.clone().detach()
        for i in tqdm(range(self.model_config["num_diffusion_steps"])):
            t = self.model.scheduler.timesteps[len(self.model.scheduler.timesteps) - i - 1]
            noise_pred = self.get_noise_pred_single(latent, t, cond_embeddings)
            latent = self.next_step(noise_pred, t, latent)
            all_latent.append(latent)
        return all_latent

    @property
    def scheduler(self):
        return self.model.scheduler

    @torch.no_grad()
    def ddim_inversion(self, image):
        latent = self.image2latent(image)
        image_rec = self.latent2image(latent)
        ddim_latents = self.ddim_loop(latent)
        return image_rec, ddim_latents

    def null_optimization(self, latents, num_inner_steps, epsilon):
        uncond_embeddings, cond_embeddings = self.context.chunk(2)
        uncond_embeddings_list = []
        latent_cur = latents[-1]
        with tqdm(total=num_inner_steps * (self.model_config["num_diffusion_steps"])) as bar:
            for i in range(self.model_config["num_diffusion_steps"]):
                uncond_embeddings = uncond_embeddings.clone().detach()
                uncond_embeddings.requires_grad = True
                optimizer = Adam([uncond_embeddings], lr=1e-2 * (1. - i / 100.))
                latent_prev = latents[len(latents) - i - 2]
                t = self.model.scheduler.timesteps[i]
                with torch.no_grad():
                    noise_pred_cond = self.get_noise_pred_single(latent_cur, t, cond_embeddings)
                for j in range(num_inner_steps):
                    noise_pred_uncond = self.get_noise_pred_single(latent_cur, t, uncond_embeddings)
                    noise_pred = noise_pred_uncond + self.model_config["guidance_scale"] * (noise_pred_cond - noise_pred_uncond)
                    latents_prev_rec = self.prev_step(noise_pred, t, latent_cur)
                    loss = nnf.mse_loss(latents_prev_rec, latent_prev)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    loss_item = loss.item()
                    bar.update()
                    if loss_item < epsilon + i * 2e-5:
                        break
                bar.update(num_inner_steps - j - 1)
                uncond_embeddings_list.append(uncond_embeddings[:1].detach())
                with torch.no_grad():
                    context = torch.cat([uncond_embeddings, cond_embeddings])
                    latent_cur = self.get_noise_pred(latent_cur, t, False, context)
        # bar.close()
        return uncond_embeddings_list

    def invert(self, image_path: str, prompt: str, offsets=(0,0,0,0), num_inner_steps=10, early_stop_epsilon=1e-5, verbose=False):
        self.init_prompt(prompt)
        image_gt = load_512(image_path, *offsets)
        if verbose:
            print("DDIM inversion...")
        image_rec, ddim_latents = self.ddim_inversion(image_gt)
        if verbose:
            print("Null-text optimization...")
        uncond_embeddings = self.null_optimization(ddim_latents, num_inner_steps, early_stop_epsilon)
        return (image_gt, image_rec), ddim_latents[-1], uncond_embeddings
src/prompt_mixing.py ADDED
@@ -0,0 +1,86 @@
import torch
from scipy.signal import medfilt2d

class PromptMixing:
    def __init__(self, args, object_of_interest_index, avg_cross_attn=None):
        self.object_of_interest_index = object_of_interest_index
        self.objects_to_preserve = [args.prompt.split().index(o) + 1 for o in args.objects_to_preserve]
        self.obj_pixels_injection_threshold = args.obj_pixels_injection_threshold

        self.start_other_prompt_range = args.start_prompt_range
        self.end_other_prompt_range = args.end_prompt_range

        self.start_cross_attn_replace_range = args.num_diffusion_steps
        self.end_cross_attn_replace_range = args.num_diffusion_steps

        self.start_self_attn_replace_range = 0
        self.end_self_attn_replace_range = args.end_preserved_obj_self_attn_masking
        self.remove_obj_from_self_mask = args.remove_obj_from_self_mask
        self.avg_cross_attn = avg_cross_attn

        self.low_resource = args.low_resource

    def get_context_for_v(self, t, context, other_context):
        if other_context is not None and \
                self.start_other_prompt_range <= t < self.end_other_prompt_range:
            if self.low_resource:
                return other_context
            else:
                v_context = context.clone()
                # first half of context is for the unconditioned image
                v_context[v_context.shape[0]//2:] = other_context
                return v_context
        else:
            return context

    def get_cross_attn(self, diffusion_model_wrapper, t, attn, place_in_unet, batch_size):
        if self.start_cross_attn_replace_range <= t < self.end_cross_attn_replace_range:
            if self.low_resource:
                attn[:, :, self.object_of_interest_index] = 0.2 * torch.from_numpy(medfilt2d(attn[:, :, self.object_of_interest_index].cpu().numpy(), kernel_size=3)).to(attn.device) + \
                                                            0.8 * attn[:, :, self.object_of_interest_index]
            else:
                # first half of attn maps is for the unconditioned image
                min_h = attn.shape[0] // 2
                attn[min_h:, :, self.object_of_interest_index] = 0.2 * torch.from_numpy(medfilt2d(attn[min_h:, :, self.object_of_interest_index].cpu().numpy(), kernel_size=3)).to(attn.device) + \
                                                                 0.8 * attn[min_h:, :, self.object_of_interest_index]
        return attn

    def get_self_attn(self, diffusion_model_wrapper, t, attn, place_in_unet, batch_size):
        if attn.shape[1] <= 32 ** 2 and \
                self.avg_cross_attn is not None and \
                self.start_self_attn_replace_range <= t < self.end_self_attn_replace_range:

            key = f"{place_in_unet}_cross"
            attn_index = getattr(diffusion_model_wrapper, f'{key}_index')
            cr = self.avg_cross_attn[key][attn_index]
            setattr(diffusion_model_wrapper, f'{key}_index', attn_index+1)

            if self.low_resource:
                attn = self.mask_self_attn_patches(attn, cr, batch_size)
            else:
                # first half of attn maps is for the unconditioned image
                attn[attn.shape[0]//2:] = self.mask_self_attn_patches(attn[attn.shape[0]//2:], cr, batch_size//2)

        return attn

    def mask_self_attn_patches(self, self_attn, cross_attn, batch_size):
        h = self_attn.shape[0] // batch_size
        tokens = self.objects_to_preserve
        obj_token = self.object_of_interest_index

        normalized_cross_attn = cross_attn - cross_attn.min()
        normalized_cross_attn /= normalized_cross_attn.max()

        mask = torch.zeros_like(self_attn[0])
        for tk in tokens:
            mask_tk_in = torch.unique((normalized_cross_attn[:, :, tk] > self.obj_pixels_injection_threshold).nonzero(as_tuple=True)[1])
            mask[mask_tk_in, :] = 1
            mask[:, mask_tk_in] = 1

        if self.remove_obj_from_self_mask:
            obj_patches = torch.unique((normalized_cross_attn[:, :, obj_token] > self.obj_pixels_injection_threshold).nonzero(as_tuple=True)[1])
            mask[obj_patches, :] = 0
            mask[:, obj_patches] = 0

        self_attn[h:] = self_attn[h:] * (1 - mask) + self_attn[:h].repeat(batch_size - 1, 1, 1) * mask
        return self_attn
src/prompt_to_prompt_controllers.py
ADDED
@@ -0,0 +1,205 @@
import torch
import numpy as np
import abc
from typing import Optional, Union, Tuple, Dict
import src.seq_aligner as seq_aligner


class AttentionControl(abc.ABC):

    def step_callback(self, x_t):
        return x_t

    def between_steps(self):
        return

    @property
    def num_uncond_att_layers(self):
        return self.num_att_layers if self.low_resource else 0

    @abc.abstractmethod
    def forward(self, attn, is_cross: bool, place_in_unet: str):
        raise NotImplementedError

    def __call__(self, attn, is_cross: bool, place_in_unet: str):
        if self.cur_att_layer >= self.num_uncond_att_layers:
            if self.low_resource:
                attn = self.forward(attn, is_cross, place_in_unet)
            else:
                h = attn.shape[0]
                attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet)
        self.cur_att_layer += 1
        if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
            self.cur_att_layer = 0
            self.cur_step += 1
            self.between_steps()
        return attn

    def reset(self):
        self.cur_step = 0
        self.cur_att_layer = 0

    def __init__(self, low_resource):
        self.cur_step = 0
        self.num_att_layers = -1
        self.cur_att_layer = 0
        self.low_resource = low_resource


class EmptyControl(AttentionControl):

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        return attn


class DummyController:

    def __call__(self, *args):
        return args[0]

    def __init__(self):
        self.num_att_layers = 0


class AttentionStore(AttentionControl):

    @staticmethod
    def get_empty_store():
        return {"down_cross": [], "mid_cross": [], "up_cross": [],
                "down_self": [], "mid_self": [], "up_self": []}

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
        if attn.shape[1] <= 32 ** 2:  # avoid memory overhead
            self.step_store[key].append(attn)
        return attn

    def between_steps(self):
        if len(self.attention_store) == 0:
            self.attention_store = self.step_store
        else:
            for key in self.attention_store:
                for i in range(len(self.attention_store[key])):
                    self.attention_store[key][i] += self.step_store[key][i]
        self.step_store = self.get_empty_store()

    def get_average_attention(self):
        average_attention = {key: [item / self.cur_step for item in self.attention_store[key]] for key in
                             self.attention_store}
        return average_attention

    def reset(self):
        super(AttentionStore, self).reset()
        self.step_store = self.get_empty_store()
        self.attention_store = {}

    def __init__(self, low_resource):
        super(AttentionStore, self).__init__(low_resource)
        self.step_store = self.get_empty_store()
        self.attention_store = {}


class AttentionControlEdit(AttentionStore, abc.ABC):

    def step_callback(self, x_t):
        return x_t

    def replace_self_attention(self, attn_base, att_replace):
        if att_replace.shape[2] <= 16 ** 2:
            return attn_base.unsqueeze(0).expand(att_replace.shape[0], *attn_base.shape)
        else:
            return att_replace

    @abc.abstractmethod
    def replace_cross_attention(self, attn_base, att_replace):
        raise NotImplementedError

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet)
        if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]):
            h = attn.shape[0] // (self.batch_size)
            attn = attn.reshape(self.batch_size, h, *attn.shape[1:])
            attn_base, attn_repalce = attn[0], attn[1:]
            if is_cross:
                alpha_words = self.cross_replace_alpha[self.cur_step]
                attn_repalce_new = self.replace_cross_attention(attn_base, attn_repalce) * alpha_words + (
                        1 - alpha_words) * attn_repalce
                attn[1:] = attn_repalce_new
            else:
                attn[1:] = self.replace_self_attention(attn_base, attn_repalce)
            attn = attn.reshape(self.batch_size * h, *attn.shape[2:])
        return attn

    def __init__(self, prompts, tokenizer, device, low_resource, num_steps: int,
                 cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
                 self_replace_steps: Union[float, Tuple[float, float]]):
        super(AttentionControlEdit, self).__init__(low_resource)
        self.batch_size = len(prompts)
        self.tokenizer = tokenizer
        self.cross_replace_alpha = get_time_words_attention_alpha(prompts, num_steps, cross_replace_steps,
                                                                  self.tokenizer).to(device)
        if type(self_replace_steps) is float:
            self_replace_steps = 0, self_replace_steps
        self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1])


class AttentionReplace(AttentionControlEdit):

    def replace_cross_attention(self, attn_base, att_replace):
        return torch.einsum('hpw,bwn->bhpn', attn_base, self.mapper.to(attn_base.dtype))

    def __init__(self, prompts, tokenizer, device, low_resource, num_steps: int, cross_replace_steps: float, self_replace_steps: float):
        super(AttentionReplace, self).__init__(prompts, tokenizer, device, low_resource, num_steps, cross_replace_steps, self_replace_steps)
        self.mapper = seq_aligner.get_replacement_mapper(prompts, self.tokenizer).to(device)


def get_word_inds(text: str, word_place: int, tokenizer):
    split_text = text.split(" ")
    if type(word_place) is str:
        word_place = [i for i, word in enumerate(split_text) if word_place == word]
    elif type(word_place) is int:
        word_place = [word_place]
    out = []
    if len(word_place) > 0:
        words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
        cur_len, ptr = 0, 0

        for i in range(len(words_encode)):
            cur_len += len(words_encode[i])
            if ptr in word_place:
                out.append(i + 1)
            if cur_len >= len(split_text[ptr]):
                ptr += 1
                cur_len = 0
    return np.array(out)


def update_alpha_time_word(alpha, bounds: Union[float, Tuple[float, float]], prompt_ind: int, word_inds: Optional[torch.Tensor] = None):
    if type(bounds) is float:
        bounds = 0, bounds
    start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0])
    if word_inds is None:
        word_inds = torch.arange(alpha.shape[2])
    alpha[: start, prompt_ind, word_inds] = 0
    alpha[start: end, prompt_ind, word_inds] = 1
    alpha[end:, prompt_ind, word_inds] = 0
    return alpha


def get_time_words_attention_alpha(prompts, num_steps, cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
                                   tokenizer, max_num_words=77):
    if type(cross_replace_steps) is not dict:
        cross_replace_steps = {"default_": cross_replace_steps}
    if "default_" not in cross_replace_steps:
        cross_replace_steps["default_"] = (0., 1.)
    alpha_time_words = torch.zeros(num_steps + 1, len(prompts) - 1, max_num_words)
    for i in range(len(prompts) - 1):
        alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"], i)
    for key, item in cross_replace_steps.items():
        if key != "default_":
            inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))]
            for i, ind in enumerate(inds):
                if len(ind) > 0:
                    alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind)
    alpha_time_words = alpha_time_words.reshape(num_steps + 1, len(prompts) - 1, 1, 1, max_num_words)  # time, batch, heads, pixels, words
    return alpha_time_words
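A possible usage sketch (not part of the commit) for the controllers above: building an AttentionReplace controller for two prompts that differ in a single word. The CLIP tokenizer checkpoint, the prompts, and the step fractions are assumptions; the diffusion wrapper is expected to register the controller on every attention layer, set controller.num_att_layers, and call controller(attn, is_cross, place_in_unet) during sampling.

import torch
from transformers import CLIPTokenizer
from src.prompt_to_prompt_controllers import AttentionReplace

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
prompts = ["a photo of a cat on a bench", "a photo of a dog on a bench"]

controller = AttentionReplace(prompts, tokenizer, device="cpu", low_resource=True,
                              num_steps=50, cross_replace_steps=0.8, self_replace_steps=0.4)
# cross-attention maps of prompt B are replaced by prompt A's maps (remapped token-wise)
# for the first 80% of the steps; self-attention is shared for the first 40% of the steps.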
src/prompt_utils.py
ADDED
@@ -0,0 +1,64 @@
import json
import torch
import numpy as np
from tqdm import tqdm


def get_topk_similar_words(model, prompt, base_word, vocab, k=30):
    text_input = model.tokenizer(
        [prompt.format(word=base_word)],
        padding="max_length",
        max_length=model.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        encoder_output = model.text_encoder(text_input.input_ids.to(model.device))
        full_prompt_embedding = encoder_output.pooler_output
        full_prompt_embedding = full_prompt_embedding / full_prompt_embedding.norm(p=2, dim=-1, keepdim=True)

    prompts = [prompt.format(word=word) for word in vocab]
    batch_size = 1000
    all_prompts_embeddings = []
    for i in tqdm(range(0, len(prompts), batch_size)):
        curr_prompts = prompts[i:i + batch_size]
        with torch.no_grad():
            text_input = model.tokenizer(
                curr_prompts,
                padding="max_length",
                max_length=model.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            curr_embeddings = model.text_encoder(text_input.input_ids.to(model.device)).pooler_output
            all_prompts_embeddings.append(curr_embeddings)

    all_prompts_embeddings = torch.cat(all_prompts_embeddings)
    all_prompts_embeddings = all_prompts_embeddings / all_prompts_embeddings.norm(p=2, dim=-1, keepdim=True)
    prompts_similarities = all_prompts_embeddings.matmul(full_prompt_embedding.view(-1, 1))
    sorted_prompts_similarities = np.flip(prompts_similarities.cpu().numpy().reshape(-1).argsort())

    print(f"prompt: {prompt}")
    print(f"initial word: {base_word}")
    print(f"TOP {k} SIMILAR WORDS:")
    similar_words = [vocab[index] for index in sorted_prompts_similarities[:k]]
    print(similar_words)
    return similar_words


def get_proxy_words(args, ldm_stable):
    if len(args.proxy_words) > 0:
        return [args.object_of_interest] + args.proxy_words
    vocab = list(json.load(open("vocab.json")).keys())
    vocab = [word for word in vocab if word.isalpha() and len(word) > 1]
    filtered_vocab = get_topk_similar_words(ldm_stable, "a photo of a {word}", args.object_of_interest, vocab, k=50)
    proxy_words = get_topk_similar_words(ldm_stable, args.prompt, args.object_of_interest, filtered_vocab, k=args.number_of_variations)
    if proxy_words[0] != args.object_of_interest:
        proxy_words = [args.object_of_interest] + proxy_words

    return proxy_words


def get_proxy_prompts(args, ldm_stable):
    proxy_words = get_proxy_words(args, ldm_stable)
    prompts = [args.prompt.format(word=args.object_of_interest)]
    proxy_prompts = [{"word": word, "prompt": args.prompt.format(word=word)} for word in proxy_words]
    return proxy_words, prompts, proxy_prompts
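A usage sketch (not part of the commit) for get_proxy_prompts above, in the simple case where proxy words are supplied explicitly so the CLIP vocabulary search is skipped. SimpleNamespace stands in for the real config object and is an assumption; only the fields read by get_proxy_words/get_proxy_prompts are set.

from types import SimpleNamespace
from src.prompt_utils import get_proxy_prompts

args = SimpleNamespace(prompt="a photo of a {word} on a bench",
                       object_of_interest="cat",
                       proxy_words=["dog", "rabbit"],
                       number_of_variations=3)

# ldm_stable is only needed when proxy_words is empty (vocabulary search), so None is fine here
proxy_words, prompts, proxy_prompts = get_proxy_prompts(args, ldm_stable=None)
# proxy_words -> ["cat", "dog", "rabbit"]
# prompts     -> ["a photo of a cat on a bench"]
# proxy_prompts -> one {"word", "prompt"} dict per proxy word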
src/seq_aligner.py
ADDED
@@ -0,0 +1,195 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import numpy as np


class ScoreParams:

    def __init__(self, gap, match, mismatch):
        self.gap = gap
        self.match = match
        self.mismatch = mismatch

    def mis_match_char(self, x, y):
        if x != y:
            return self.mismatch
        else:
            return self.match


def get_matrix(size_x, size_y, gap):
    matrix = []
    for i in range(len(size_x) + 1):
        sub_matrix = []
        for j in range(len(size_y) + 1):
            sub_matrix.append(0)
        matrix.append(sub_matrix)
    for j in range(1, len(size_y) + 1):
        matrix[0][j] = j * gap
    for i in range(1, len(size_x) + 1):
        matrix[i][0] = i * gap
    return matrix


def get_matrix(size_x, size_y, gap):
    matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
    matrix[0, 1:] = (np.arange(size_y) + 1) * gap
    matrix[1:, 0] = (np.arange(size_x) + 1) * gap
    return matrix


def get_traceback_matrix(size_x, size_y):
    matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
    matrix[0, 1:] = 1
    matrix[1:, 0] = 2
    matrix[0, 0] = 4
    return matrix


def global_align(x, y, score):
    matrix = get_matrix(len(x), len(y), score.gap)
    trace_back = get_traceback_matrix(len(x), len(y))
    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            left = matrix[i, j - 1] + score.gap
            up = matrix[i - 1, j] + score.gap
            diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], y[j - 1])
            matrix[i, j] = max(left, up, diag)
            if matrix[i, j] == left:
                trace_back[i, j] = 1
            elif matrix[i, j] == up:
                trace_back[i, j] = 2
            else:
                trace_back[i, j] = 3
    return matrix, trace_back


def get_aligned_sequences(x, y, trace_back):
    x_seq = []
    y_seq = []
    i = len(x)
    j = len(y)
    mapper_y_to_x = []
    while i > 0 or j > 0:
        if trace_back[i, j] == 3:
            x_seq.append(x[i - 1])
            y_seq.append(y[j - 1])
            i = i - 1
            j = j - 1
            mapper_y_to_x.append((j, i))
        elif trace_back[i][j] == 1:
            x_seq.append('-')
            y_seq.append(y[j - 1])
            j = j - 1
            mapper_y_to_x.append((j, -1))
        elif trace_back[i][j] == 2:
            x_seq.append(x[i - 1])
            y_seq.append('-')
            i = i - 1
        elif trace_back[i][j] == 4:
            break
    mapper_y_to_x.reverse()
    return x_seq, y_seq, torch.tensor(mapper_y_to_x, dtype=torch.int64)


def get_mapper(x: str, y: str, tokenizer, max_len=77):
    x_seq = tokenizer.encode(x)
    y_seq = tokenizer.encode(y)
    score = ScoreParams(0, 1, -1)
    matrix, trace_back = global_align(x_seq, y_seq, score)
    mapper_base = get_aligned_sequences(x_seq, y_seq, trace_back)[-1]
    alphas = torch.ones(max_len)
    alphas[: mapper_base.shape[0]] = mapper_base[:, 1].ne(-1).float()
    mapper = torch.zeros(max_len, dtype=torch.int64)
    mapper[:mapper_base.shape[0]] = mapper_base[:, 1]
    mapper[mapper_base.shape[0]:] = len(y_seq) + torch.arange(max_len - len(y_seq))
    return mapper, alphas


def get_refinement_mapper(prompts, tokenizer, max_len=77):
    x_seq = prompts[0]
    mappers, alphas = [], []
    for i in range(1, len(prompts)):
        mapper, alpha = get_mapper(x_seq, prompts[i], tokenizer, max_len)
        mappers.append(mapper)
        alphas.append(alpha)
    return torch.stack(mappers), torch.stack(alphas)


def get_word_inds(text: str, word_place: int, tokenizer):
    split_text = text.split(" ")
    if type(word_place) is str:
        word_place = [i for i, word in enumerate(split_text) if word_place == word]
    elif type(word_place) is int:
        word_place = [word_place]
    out = []
    if len(word_place) > 0:
        words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
        cur_len, ptr = 0, 0

        for i in range(len(words_encode)):
            cur_len += len(words_encode[i])
            if ptr in word_place:
                out.append(i + 1)
            if cur_len >= len(split_text[ptr]):
                ptr += 1
                cur_len = 0
    return np.array(out)


def get_replacement_mapper_(x: str, y: str, tokenizer, max_len=77):
    words_x = x.split(' ')
    words_y = y.split(' ')
    if len(words_x) != len(words_y):
        raise ValueError(f"attention replacement edit can only be applied on prompts with the same length"
                         f" but prompt A has {len(words_x)} words and prompt B has {len(words_y)} words.")
    inds_replace = [i for i in range(len(words_y)) if words_y[i] != words_x[i]]
    inds_source = [get_word_inds(x, i, tokenizer) for i in inds_replace]
    inds_target = [get_word_inds(y, i, tokenizer) for i in inds_replace]
    mapper = np.zeros((max_len, max_len))
    i = j = 0
    cur_inds = 0
    while i < max_len and j < max_len:
        if cur_inds < len(inds_source) and inds_source[cur_inds][0] == i:
            inds_source_, inds_target_ = inds_source[cur_inds], inds_target[cur_inds]
            if len(inds_source_) == len(inds_target_):
                mapper[inds_source_, inds_target_] = 1
            else:
                ratio = 1 / len(inds_target_)
                for i_t in inds_target_:
                    mapper[inds_source_, i_t] = ratio
            cur_inds += 1
            i += len(inds_source_)
            j += len(inds_target_)
        elif cur_inds < len(inds_source):
            mapper[i, j] = 1
            i += 1
            j += 1
        else:
            mapper[j, j] = 1
            i += 1
            j += 1

    return torch.from_numpy(mapper).float()


def get_replacement_mapper(prompts, tokenizer, max_len=77):
    x_seq = prompts[0]
    mappers = []
    for i in range(1, len(prompts)):
        mapper = get_replacement_mapper_(x_seq, prompts[i], tokenizer, max_len)
        mappers.append(mapper)
    return torch.stack(mappers)
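A sketch (not part of the commit) of how the aligners above are typically called: a word-for-word replacement mapper for same-length prompts, and a refinement alignment when the edited prompt adds tokens. The CLIP tokenizer checkpoint and the example prompts are assumptions.

from transformers import CLIPTokenizer
from src.seq_aligner import get_refinement_mapper, get_replacement_mapper

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

# word-for-word replacement: prompts must have the same number of words
rep = get_replacement_mapper(["a photo of a cat", "a photo of a dog"], tokenizer)
print(rep.shape)    # torch.Size([1, 77, 77]) -- one token-mapping matrix per edited prompt

# refinement: the second prompt adds tokens, so a global alignment (mapper, alphas) is computed
mapper, alphas = get_refinement_mapper(["a photo of a cat", "a photo of a small fluffy cat"], tokenizer)
print(mapper.shape, alphas.shape)   # torch.Size([1, 77]) torch.Size([1, 77])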
style.css
ADDED
@@ -0,0 +1,3 @@
h1 {
  text-align: center;
}
vocab.json
ADDED
The diff for this file is too large to render.