rich-text-to-image

Runtime error

App Files Community

songweig committed on May 15, 2023

Commit

ea48617

1 Parent(s): 4d49165

token map v3

Browse files

Files changed (3) hide show

app.py +71 -69
models/region_diffusion.py +15 -28
utils/attention_utils.py +29 -17

app.py CHANGED Viewed

@@ -29,10 +29,10 @@ If you are encountering an error or not achieving your desired outcome, here are
 canvas_html = """<iframe id='rich-text-root' style='width:100%' height='360px' src='file=rich-text-to-json-iframe.html' frameborder='0' scrolling='no'></iframe>"""
 get_js_data = """
-async (text_input, negative_prompt, height, width, seed, steps, num_segments, segment_threshold, inject_interval, guidance_weight, color_guidance_weight, rich_text_input, background_aug) => {
   const richEl = document.getElementById("rich-text-root");
   const data = richEl? richEl.contentDocument.body._data : {};
-  return [text_input, negative_prompt, height, width, seed, steps, num_segments, segment_threshold, inject_interval, guidance_weight, color_guidance_weight, JSON.stringify(data), background_aug];
 }
 """
 set_js_data = """
@@ -66,27 +66,27 @@ def main():
     def generate(
         text_input: str,
         negative_text: str,
-        height: int,
-        width: int,
-        seed: int,
-        steps: int,
         num_segments: int,
         segment_threshold: float,
         inject_interval: float,
-        guidance_weight: float,
         color_guidance_weight: float,
         rich_text_input: str,
-        background_aug: bool,
     ):
         run_dir = 'results/'
         os.makedirs(run_dir, exist_ok=True)
         # Load region diffusion model.
-        height = int(height)
-        width = int(width)
         steps = 41 if not steps else steps
         guidance_weight = 8.5 if not guidance_weight else guidance_weight
-        text_input = rich_text_input if rich_text_input != '' else text_input
-        print('text_input', text_input)
         if (text_input == '' or rich_text_input == ''):
             raise gr.Error("Please enter some text.")
         # parse json to span attributes
@@ -132,25 +132,25 @@ def main():
                                                                512//8, 512//8, region_target_token_ids[:-1], seed,
                                                                base_tokens, segment_threshold=segment_threshold, num_segments=num_segments,
                                                                return_vis=True)
         color_obj_masks = [transforms.functional.resize(color_obj_mask, (height, width),
                                                         interpolation=transforms.InterpolationMode.BICUBIC,
                                                         antialias=True)
                            for color_obj_mask in color_obj_masks]
         text_format_dict['color_obj_atten'] = color_obj_masks
         model.remove_tokenmap_hooks()
         # generate image from rich text
         begin_time = time.time()
         seed_everything(seed)
-        if background_aug:
-            bg_aug_end = 500
-        else:
-            bg_aug_end = 1000
         rich_img = model.prompt_to_img(region_text_prompts, [negative_text],
                                        height=height, width=width, num_inference_steps=steps,
                                        guidance_scale=guidance_weight, use_guidance=use_grad_guidance,
                                        text_format_dict=text_format_dict, inject_selfattn=inject_interval,
-                                       bg_aug_end=bg_aug_end)
         print('time lapses to generate image from rich text: %.4f' %
               (time.time()-begin_time))
         return [plain_img[0], rich_img[0], segments_vis, token_maps]
@@ -191,6 +191,12 @@ def main():
                                             maximum=1,
                                             step=0.01,
                                             value=0.)
                 color_guidance_weight = gr.Slider(label='Color weight',
                                                   info='(To obtain more precise color, increase this, while too large value may cause artifacts.)',
                                                   minimum=0,
@@ -209,10 +215,6 @@ def main():
                                  value=6,
                                  elem_id="seed"
                                  )
-                background_aug = gr.Checkbox(
-                    label='Precise region alignment',
-                    info='(For strict region alignment, select this option, but beware of potential artifacts when using with style.)',
-                    value=True)
                 with gr.Accordion('Other Parameters', open=False):
                     steps = gr.Slider(label='Number of Steps',
                                       minimum=0,
@@ -266,32 +268,32 @@ def main():
                     5,
                     0.3,
                     0,
                     6,
-                    1,
                     None,
-                    True
                 ],
                 [
-                    '{"ops":[{"insert":"A "},{"attributes":{"link":"kitchen island with a stove with gas burners and a built-in oven "},"insert":"kitchen island"},{"insert":" next to a "},{"attributes":{"link":"an open refrigerator stocked with fresh produce, dairy products, and beverages. "},"insert":"refrigerator"},{"insert":", by James McDonald and Joarc Architects, home, interior, octane render, deviantart, cinematic, key art, hyperrealism, sun light, sunrays, canon eos c 300, ƒ 1.8, 35 mm, 8k, medium - format print"}]}',
                     '',
-                    6,
                     0.5,
                     0,
                     6,
-                    1,
                     None,
-                    True
                 ],
                 [
                     '{"ops":[{"insert":"A "},{"attributes":{"link":"Happy Kung fu panda art, elder, asian art, volumetric lighting, dramatic scene, ultra detailed, realism, chinese"},"insert":"panda"},{"insert":" standing on a cliff by a waterfall, wildlife photography, photograph, high quality, wildlife, f 1.8, soft focus, 8k, national geographic, award - winning photograph by nick nichols"}]}',
                     '',
-                    4,
                     0.3,
                     0,
                     4,
-                    1,
                     None,
-                    True
                 ],
             ]
@@ -303,10 +305,10 @@ def main():
                             num_segments,
                             segment_threshold,
                             inject_interval,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
-                            background_aug,
                         ],
                         outputs=[
                             plaintext_result,
@@ -315,42 +317,42 @@ def main():
                             token_map,
                         ],
                         fn=generate,
-                        # cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
             color_examples = [
                 [
-                    '{"ops":[{"insert":"a beautifule girl with big eye, skin, and long "},{"attributes":{"color":"#00ffff"},"insert":"hair"},{"insert":", t-shirt, bursting with vivid color, intricate, elegant, highly detailed, photorealistic, digital painting,  artstation, illustration, concept art."}]}',
                     'lowres, had anatomy, bad hands, cropped, worst quality',
-                    9,
-                    0.25,
                     0.3,
                     6,
                     0.5,
                     None,
-                    True
                 ],
                 [
-                    '{"ops":[{"insert":"a beautifule girl with big eye, skin, and long "},{"attributes":{"color":"#eeeeee"},"insert":"hair"},{"insert":", t-shirt, bursting with vivid color, intricate, elegant, highly detailed, photorealistic, digital painting,  artstation, illustration, concept art."}]}',
                     'lowres, had anatomy, bad hands, cropped, worst quality',
-                    9,
-                    0.25,
                     0.3,
                     6,
-                    0.1,
                     None,
-                    True
                 ],
                 [
                     '{"ops":[{"insert":"a Gothic "},{"attributes":{"color":"#FD6C9E"},"insert":"church"},{"insert":" in a the sunset with a beautiful landscape in the background."}]}',
                     '',
-                    5,
-                    0.3,
                     0.5,
                     6,
                     0.5,
                     None,
-                    False
                 ],
                 [
                     '{"ops":[{"insert":"A mesmerizing sight that captures the beauty of a "},{"attributes":{"color":"#4775fc"},"insert":"rose"},{"insert":" blooming, close up"}]}',
@@ -358,21 +360,21 @@ def main():
                     3,
                     0.3,
                     0,
                     9,
                     1,
                     None,
-                    False
                 ],
                 [
                     '{"ops":[{"insert":"A "},{"attributes":{"color":"#FFD700"},"insert":"marble statue of a wolf\'s head and shoulder"},{"insert":", surrounded by colorful flowers michelangelo, detailed, intricate, full of color, led lighting, trending on artstation, 4 k, hyperrealistic, 3 5 mm, focused, extreme details, unreal engine 5, masterpiece "}]}',
                     '',
                     5,
                     0.3,
-                    0,
                     5,
                     0.6,
                     None,
-                    False
                 ],
             ]
             gr.Examples(examples=color_examples,
@@ -383,10 +385,10 @@ def main():
                             num_segments,
                             segment_threshold,
                             inject_interval,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
-                            background_aug,
                         ],
                         outputs=[
                             plaintext_result,
@@ -395,7 +397,7 @@ def main():
                             token_map,
                         ],
                         fn=generate,
-                        # cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
@@ -403,13 +405,13 @@ def main():
                 [
                     '{"ops":[{"insert":"a "},{"attributes":{"font":"mirza"},"insert":"beautiful garden"},{"insert":" with a "},{"attributes":{"font":"roboto"},"insert":"snow mountain in the background"},{"insert":""}]}',
                     '',
-                    5,
-                    0.3,
                     0.2,
                     3,
-                    0.5,
                     None,
-                    False
                 ],
                 [
                     '{"ops":[{"attributes":{"link":"the awe-inspiring sky and ocean in the style of J.M.W. Turner"},"insert":"the awe-inspiring sky and sea"},{"insert":" by "},{"attributes":{"font":"mirza"},"insert":"a coast with flowers and grasses in spring"}]}',
@@ -417,21 +419,21 @@ def main():
                     5,
                     0.3,
                     0,
                     9,
                     0.5,
                     None,
-                    False
                 ],
                 [
                     '{"ops":[{"insert":"a "},{"attributes":{"font":"slabo"},"insert":"night sky filled with stars"},{"insert":" above a "},{"attributes":{"font":"roboto"},"insert":"turbulent sea with giant waves"}]}',
                     '',
                     2,
-                    0.4,
                     0,
                     6,
                     0.5,
                     None,
-                    False
                 ],
             ]
             gr.Examples(examples=style_examples,
@@ -442,10 +444,10 @@ def main():
                             num_segments,
                             segment_threshold,
                             inject_interval,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
-                            background_aug,
                         ],
                         outputs=[
                             plaintext_result,
@@ -454,7 +456,7 @@ def main():
                             token_map,
                         ],
                         fn=generate,
-                        # cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
@@ -465,10 +467,10 @@ def main():
                     5,
                     0.3,
                     0,
                     13,
                     1,
                     None,
-                    False
                 ],
                 [
                     '{"ops": [{"insert": "A pizza with pineapple, "}, {"attributes": {"size": "20px"}, "insert": "pepperoni"}, {"insert": ", and mushroom on the top, 4k, photorealistic"}]}',
@@ -476,10 +478,10 @@ def main():
                     5,
                     0.3,
                     0,
                     13,
                     1,
                     None,
-                    False
                 ],
                 [
                     '{"ops": [{"insert": "A pizza with pineapple, pepperoni, and "}, {"attributes": {"size": "70px"}, "insert": "mushroom"}, {"insert": " on the top, 4k, photorealistic"}]}',
@@ -487,10 +489,10 @@ def main():
                     5,
                     0.3,
                     0,
                     13,
                     1,
                     None,
-                    False
                 ],
             ]
             gr.Examples(examples=size_examples,
@@ -501,10 +503,10 @@ def main():
                             num_segments,
                             segment_threshold,
                             inject_interval,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
-                            background_aug,
                         ],
                         outputs=[
                             plaintext_result,
@@ -513,24 +515,24 @@ def main():
                             token_map,
                         ],
                         fn=generate,
-                        # cache_examples=True,
                         examples_per_page=20)
         generate_button.click(fn=lambda: gr.update(visible=False), inputs=None, outputs=share_row, queue=False).then(
             fn=generate,
             inputs=[
                 text_input,
                 negative_prompt,
-                height,
-                width,
-                seed,
-                steps,
                 num_segments,
                 segment_threshold,
                 inject_interval,
-                guidance_weight,
                 color_guidance_weight,
                 rich_text_input,
-                background_aug
             ],
             outputs=[plaintext_result, richtext_result, segments, token_map],
             _js=get_js_data

 canvas_html = """<iframe id='rich-text-root' style='width:100%' height='360px' src='file=rich-text-to-json-iframe.html' frameborder='0' scrolling='no'></iframe>"""
 get_js_data = """
+async (text_input, negative_prompt, num_segments, segment_threshold, inject_interval, inject_background, seed, color_guidance_weight, rich_text_input, height, width, steps, guidance_weights) => {
   const richEl = document.getElementById("rich-text-root");
   const data = richEl? richEl.contentDocument.body._data : {};
+  return [text_input, negative_prompt, num_segments, segment_threshold, inject_interval, inject_background, seed, color_guidance_weight, JSON.stringify(data), height, width, steps, guidance_weights];
 }
 """
 set_js_data = """
     def generate(
         text_input: str,
         negative_text: str,
         num_segments: int,
         segment_threshold: float,
         inject_interval: float,
+        inject_background: float,
+        seed: int,
         color_guidance_weight: float,
         rich_text_input: str,
+        height: int,
+        width: int,
+        steps: int,
+        guidance_weight: float,
     ):
         run_dir = 'results/'
         os.makedirs(run_dir, exist_ok=True)
         # Load region diffusion model.
+        height = int(height) if height else 512
+        width = int(width) if width else 512
         steps = 41 if not steps else steps
         guidance_weight = 8.5 if not guidance_weight else guidance_weight
+        text_input = rich_text_input if rich_text_input != '' and rich_text_input != None else text_input
+        print('text_input', text_input, width, height, steps, guidance_weight, num_segments, segment_threshold, inject_interval, inject_background, color_guidance_weight, negative_text)
         if (text_input == '' or rich_text_input == ''):
             raise gr.Error("Please enter some text.")
         # parse json to span attributes
                                                                512//8, 512//8, region_target_token_ids[:-1], seed,
                                                                base_tokens, segment_threshold=segment_threshold, num_segments=num_segments,
                                                                return_vis=True)
+        color_obj_atten_all = torch.zeros_like(color_obj_masks[-1])
+        for obj_mask in color_obj_masks[:-1]:
+            color_obj_atten_all += obj_mask
         color_obj_masks = [transforms.functional.resize(color_obj_mask, (height, width),
                                                         interpolation=transforms.InterpolationMode.BICUBIC,
                                                         antialias=True)
                            for color_obj_mask in color_obj_masks]
         text_format_dict['color_obj_atten'] = color_obj_masks
+        text_format_dict['color_obj_atten_all'] = color_obj_atten_all
         model.remove_tokenmap_hooks()
         # generate image from rich text
         begin_time = time.time()
         seed_everything(seed)
         rich_img = model.prompt_to_img(region_text_prompts, [negative_text],
                                        height=height, width=width, num_inference_steps=steps,
                                        guidance_scale=guidance_weight, use_guidance=use_grad_guidance,
                                        text_format_dict=text_format_dict, inject_selfattn=inject_interval,
+                                       inject_background=inject_background)
         print('time lapses to generate image from rich text: %.4f' %
               (time.time()-begin_time))
         return [plain_img[0], rich_img[0], segments_vis, token_maps]
                                             maximum=1,
                                             step=0.01,
                                             value=0.)
+                inject_background = gr.Slider(label='Unformatted token preservation',
+                                            info='(To affect less the tokens without any rich-text attributes, increase this.)',
+                                            minimum=0,
+                                            maximum=1,
+                                            step=0.01,
+                                            value=0.3)
                 color_guidance_weight = gr.Slider(label='Color weight',
                                                   info='(To obtain more precise color, increase this, while too large value may cause artifacts.)',
                                                   minimum=0,
                                  value=6,
                                  elem_id="seed"
                                  )
                 with gr.Accordion('Other Parameters', open=False):
                     steps = gr.Slider(label='Number of Steps',
                                       minimum=0,
                     5,
                     0.3,
                     0,
+                    0.5,
                     6,
+                    0,
                     None,
                 ],
                 [
+                    '{"ops":[{"insert":"A "},{"attributes":{"link":"Thor Kitchen 30 Inch Wide Freestanding Gas Range with Automatic Re-Ignition System"},"insert":"kitchen island"},{"insert":" next to a "},{"attributes":{"link":"an open refrigerator stocked with fresh produce, dairy products, and beverages. "},"insert":"refrigerator"},{"insert":", by James McDonald and Joarc Architects, home, interior, octane render, deviantart, cinematic, key art, hyperrealism, sun light, sunrays, canon eos c 300, ƒ 1.8, 35 mm, 8k, medium - format print"}]}',
                     '',
+                    7,
                     0.5,
                     0,
+                    0.5,
                     6,
+                    0,
                     None,
                 ],
                 [
                     '{"ops":[{"insert":"A "},{"attributes":{"link":"Happy Kung fu panda art, elder, asian art, volumetric lighting, dramatic scene, ultra detailed, realism, chinese"},"insert":"panda"},{"insert":" standing on a cliff by a waterfall, wildlife photography, photograph, high quality, wildlife, f 1.8, soft focus, 8k, national geographic, award - winning photograph by nick nichols"}]}',
                     '',
+                    5,
                     0.3,
                     0,
+                    0.1,
                     4,
+                    0,
                     None,
                 ],
             ]
                             num_segments,
                             segment_threshold,
                             inject_interval,
+                            inject_background,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
                         ],
                         outputs=[
                             plaintext_result,
                             token_map,
                         ],
                         fn=generate,
+                        cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
             color_examples = [
                 [
+                    '{"ops":[{"insert":"a beautifule girl with big eye, skin, and long "},{"attributes":{"color":"#04a704"},"insert":"hair"},{"insert":", t-shirt, bursting with vivid color, intricate, elegant, highly detailed, photorealistic, digital painting,  artstation, illustration, concept art."}]}',
                     'lowres, had anatomy, bad hands, cropped, worst quality',
+                    11,
+                    0.3,
+                    0.3,
                     0.3,
                     6,
                     0.5,
                     None,
                 ],
                 [
+                    '{"ops":[{"insert":"a beautifule girl with big eye, skin, and long "},{"attributes":{"color":"#999999"},"insert":"hair"},{"insert":", t-shirt, bursting with vivid color, intricate, elegant, highly detailed, photorealistic, digital painting,  artstation, illustration, concept art."}]}',
                     'lowres, had anatomy, bad hands, cropped, worst quality',
+                    11,
+                    0.3,
+                    0.3,
                     0.3,
                     6,
+                    0.5,
                     None,
                 ],
                 [
                     '{"ops":[{"insert":"a Gothic "},{"attributes":{"color":"#FD6C9E"},"insert":"church"},{"insert":" in a the sunset with a beautiful landscape in the background."}]}',
                     '',
+                    10,
+                    0.4,
                     0.5,
+                    0.3,
                     6,
                     0.5,
                     None,
                 ],
                 [
                     '{"ops":[{"insert":"A mesmerizing sight that captures the beauty of a "},{"attributes":{"color":"#4775fc"},"insert":"rose"},{"insert":" blooming, close up"}]}',
                     3,
                     0.3,
                     0,
+                    0,
                     9,
                     1,
                     None,
                 ],
                 [
                     '{"ops":[{"insert":"A "},{"attributes":{"color":"#FFD700"},"insert":"marble statue of a wolf\'s head and shoulder"},{"insert":", surrounded by colorful flowers michelangelo, detailed, intricate, full of color, led lighting, trending on artstation, 4 k, hyperrealistic, 3 5 mm, focused, extreme details, unreal engine 5, masterpiece "}]}',
                     '',
                     5,
+                    0.4,
+                    0.3,
                     0.3,
                     5,
                     0.6,
                     None,
                 ],
             ]
             gr.Examples(examples=color_examples,
                             num_segments,
                             segment_threshold,
                             inject_interval,
+                            inject_background,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
                         ],
                         outputs=[
                             plaintext_result,
                             token_map,
                         ],
                         fn=generate,
+                        cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
                 [
                     '{"ops":[{"insert":"a "},{"attributes":{"font":"mirza"},"insert":"beautiful garden"},{"insert":" with a "},{"attributes":{"font":"roboto"},"insert":"snow mountain in the background"},{"insert":""}]}',
                     '',
+                    10,
+                    0.4,
+                    0,
                     0.2,
                     3,
+                    0,
                     None,
                 ],
                 [
                     '{"ops":[{"attributes":{"link":"the awe-inspiring sky and ocean in the style of J.M.W. Turner"},"insert":"the awe-inspiring sky and sea"},{"insert":" by "},{"attributes":{"font":"mirza"},"insert":"a coast with flowers and grasses in spring"}]}',
                     5,
                     0.3,
                     0,
+                    0,
                     9,
                     0.5,
                     None,
                 ],
                 [
                     '{"ops":[{"insert":"a "},{"attributes":{"font":"slabo"},"insert":"night sky filled with stars"},{"insert":" above a "},{"attributes":{"font":"roboto"},"insert":"turbulent sea with giant waves"}]}',
                     '',
                     2,
+                    0.35,
+                    0,
                     0,
                     6,
                     0.5,
                     None,
                 ],
             ]
             gr.Examples(examples=style_examples,
                             num_segments,
                             segment_threshold,
                             inject_interval,
+                            inject_background,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
                         ],
                         outputs=[
                             plaintext_result,
                             token_map,
                         ],
                         fn=generate,
+                        cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
                     5,
                     0.3,
                     0,
+                    0,
                     13,
                     1,
                     None,
                 ],
                 [
                     '{"ops": [{"insert": "A pizza with pineapple, "}, {"attributes": {"size": "20px"}, "insert": "pepperoni"}, {"insert": ", and mushroom on the top, 4k, photorealistic"}]}',
                     5,
                     0.3,
                     0,
+                    0,
                     13,
                     1,
                     None,
                 ],
                 [
                     '{"ops": [{"insert": "A pizza with pineapple, pepperoni, and "}, {"attributes": {"size": "70px"}, "insert": "mushroom"}, {"insert": " on the top, 4k, photorealistic"}]}',
                     5,
                     0.3,
                     0,
+                    0,
                     13,
                     1,
                     None,
                 ],
             ]
             gr.Examples(examples=size_examples,
                             num_segments,
                             segment_threshold,
                             inject_interval,
+                            inject_background,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
                         ],
                         outputs=[
                             plaintext_result,
                             token_map,
                         ],
                         fn=generate,
+                        cache_examples=True,
                         examples_per_page=20)
         generate_button.click(fn=lambda: gr.update(visible=False), inputs=None, outputs=share_row, queue=False).then(
             fn=generate,
             inputs=[
                 text_input,
                 negative_prompt,
                 num_segments,
                 segment_threshold,
                 inject_interval,
+                inject_background,
+                seed,
                 color_guidance_weight,
                 rich_text_input,
+                height,
+                width,
+                steps,
+                guidance_weight,
             ],
             outputs=[plaintext_result, richtext_result, segments, token_map],
             _js=get_js_data

models/region_diffusion.py CHANGED Viewed

@@ -84,13 +84,13 @@ class RegionDiffusion(nn.Module):
         return text_embeddings
     def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5,
-                        latents=None, use_guidance=False, text_format_dict={}, inject_selfattn=0, bg_aug_end=1000):
         if latents is None:
             latents = torch.randn(
                 (1, self.unet.in_channels, height // 8, width // 8), device=self.device)
-        if inject_selfattn > 0:
             latents_reference = latents.clone().detach()
         self.scheduler.set_timesteps(num_inference_steps)
         n_styles = text_embeddings.shape[0]-1
@@ -102,11 +102,12 @@ class RegionDiffusion(nn.Module):
                 with torch.no_grad():
                     # tokens without any attributes
                     feat_inject_step = t > (1-inject_selfattn) * 1000
                     noise_pred_uncond_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[:1],
-                                                      text_format_dict={})['sample']
                     noise_pred_text_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[-1:],
                                                     text_format_dict=text_format_dict)['sample']
-                    if inject_selfattn > 0:
                         noise_pred_uncond_refer = self.unet(latents_reference, t, encoder_hidden_states=text_embeddings[:1],
                                                             text_format_dict={})['sample']
                         self.register_selfattn_hooks(feat_inject_step)
@@ -117,33 +118,18 @@ class RegionDiffusion(nn.Module):
                     noise_pred_text = noise_pred_text_cur * self.masks[-1]
                     # tokens with attributes
                     for style_i, mask in enumerate(self.masks[:-1]):
-                        if t > bg_aug_end:
-                            rand_rgb = torch.rand([1, 3, 1, 1]).cuda()
-                            black_background = torch.ones(
-                                [1, 3, height, width]).cuda()*rand_rgb
-                            black_latent = self.encode_imgs(
-                                black_background)
-                            noise = torch.randn_like(black_latent)
-                            black_latent_noisy = self.scheduler.add_noise(
-                                black_latent, noise, t)
-                            masked_latent = (
-                                mask > 0.001) * latents + (mask < 0.001) * black_latent_noisy
-                            noise_pred_uncond_cur = self.unet(masked_latent, t, encoder_hidden_states=text_embeddings[:1],
-                                                              text_format_dict={})['sample']
-                        else:
-                            masked_latent = latents
                         self.register_replacement_hooks(feat_inject_step)
-                        noise_pred_text_cur = self.unet(masked_latent, t, encoder_hidden_states=text_embeddings[style_i+1:style_i+2],
                                                         text_format_dict={})['sample']
                         self.remove_replacement_hooks()
                         noise_pred_uncond = noise_pred_uncond + noise_pred_uncond_cur*mask
                         noise_pred_text = noise_pred_text + noise_pred_text_cur*mask
                 # perform classifier-free guidance
                 noise_pred = noise_pred_uncond + guidance_scale * \
                     (noise_pred_text - noise_pred_uncond)
-                if inject_selfattn > 0:
                     noise_pred_refer = noise_pred_uncond_refer + guidance_scale * \
                         (noise_pred_text_refer - noise_pred_uncond_refer)
@@ -174,12 +160,15 @@ class RegionDiffusion(nn.Module):
                                 imgs*attn_map[:, 0]).sum(2).sum(2)/attn_map[:, 0].sum()
                             loss = self.color_loss(
                                 avg_rgb, rgb_val[:, :, 0, 0])*100
-                            # print(loss)
                             loss_total += loss
                         loss_total.backward()
                     latents = (
-                        latents - latents.grad * text_format_dict['color_guidance_weight'] * self.masks[0]).detach().clone()
         return latents
     def predict_x0(self, x_t, eps_t, t):
@@ -255,7 +244,7 @@ class RegionDiffusion(nn.Module):
         return latents
     def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50,
-                      guidance_scale=7.5, latents=None, text_format_dict={}, use_guidance=False, inject_selfattn=0, bg_aug_end=1000):
         if isinstance(prompts, str):
             prompts = [prompts]
@@ -271,7 +260,7 @@ class RegionDiffusion(nn.Module):
         latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents,
                                        num_inference_steps=num_inference_steps, guidance_scale=guidance_scale,
                                        use_guidance=use_guidance, text_format_dict=text_format_dict,
-                                       inject_selfattn=inject_selfattn, bg_aug_end=bg_aug_end)  # [1, 4, 64, 64]
         # Img latents -> imgs
         imgs = self.decode_latents(latents)  # [1, 3, 512, 512]
@@ -345,8 +334,6 @@ class RegionDiffusion(nn.Module):
             """
             # out[0] - final output of residual layer
             # out[1] - residual hidden feature
-            # import ipdb
-            # ipdb.set_trace()
             assert out[1].shape[-1] == 16
             activations[name] = out[1].detach()
         attention_dict = collections.defaultdict(list)

         return text_embeddings
     def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5,
+                        latents=None, use_guidance=False, text_format_dict={}, inject_selfattn=0, inject_background=0):
         if latents is None:
             latents = torch.randn(
                 (1, self.unet.in_channels, height // 8, width // 8), device=self.device)
+        if inject_selfattn > 0 or inject_background > 0:
             latents_reference = latents.clone().detach()
         self.scheduler.set_timesteps(num_inference_steps)
         n_styles = text_embeddings.shape[0]-1
                 with torch.no_grad():
                     # tokens without any attributes
                     feat_inject_step = t > (1-inject_selfattn) * 1000
+                    background_inject_step = i == int(inject_background * len(self.scheduler.timesteps)) and inject_background > 0
                     noise_pred_uncond_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[:1],
+                                                    text_format_dict={})['sample']
                     noise_pred_text_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[-1:],
                                                     text_format_dict=text_format_dict)['sample']
+                    if inject_selfattn > 0 or inject_background > 0:
                         noise_pred_uncond_refer = self.unet(latents_reference, t, encoder_hidden_states=text_embeddings[:1],
                                                             text_format_dict={})['sample']
                         self.register_selfattn_hooks(feat_inject_step)
                     noise_pred_text = noise_pred_text_cur * self.masks[-1]
                     # tokens with attributes
                     for style_i, mask in enumerate(self.masks[:-1]):
                         self.register_replacement_hooks(feat_inject_step)
+                        noise_pred_text_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[style_i+1:style_i+2],
                                                         text_format_dict={})['sample']
                         self.remove_replacement_hooks()
                         noise_pred_uncond = noise_pred_uncond + noise_pred_uncond_cur*mask
                         noise_pred_text = noise_pred_text + noise_pred_text_cur*mask
                 # perform classifier-free guidance
                 noise_pred = noise_pred_uncond + guidance_scale * \
                     (noise_pred_text - noise_pred_uncond)
+                if inject_selfattn > 0 or inject_background > 0:
                     noise_pred_refer = noise_pred_uncond_refer + guidance_scale * \
                         (noise_pred_text_refer - noise_pred_uncond_refer)
                                 imgs*attn_map[:, 0]).sum(2).sum(2)/attn_map[:, 0].sum()
                             loss = self.color_loss(
                                 avg_rgb, rgb_val[:, :, 0, 0])*100
                             loss_total += loss
                         loss_total.backward()
                     latents = (
+                        latents - latents.grad * text_format_dict['color_guidance_weight'] * text_format_dict['color_obj_atten_all']).detach().clone()
+                # apply background injection
+                if background_inject_step:
+                    latents = latents_reference * self.masks[-1] + latents * \
+                        (1-self.masks[-1])
         return latents
     def predict_x0(self, x_t, eps_t, t):
         return latents
     def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50,
+                      guidance_scale=7.5, latents=None, text_format_dict={}, use_guidance=False, inject_selfattn=0, inject_background=0):
         if isinstance(prompts, str):
             prompts = [prompts]
         latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents,
                                        num_inference_steps=num_inference_steps, guidance_scale=guidance_scale,
                                        use_guidance=use_guidance, text_format_dict=text_format_dict,
+                                       inject_selfattn=inject_selfattn, inject_background=inject_background)  # [1, 4, 64, 64]
         # Img latents -> imgs
         imgs = self.decode_latents(latents)  # [1, 3, 512, 512]
             """
             # out[0] - final output of residual layer
             # out[1] - residual hidden feature
             assert out[1].shape[-1] == 16
             activations[name] = out[1].detach()
         attention_dict = collections.defaultdict(list)

utils/attention_utils.py CHANGED Viewed

@@ -6,25 +6,26 @@ import seaborn as sns
 import torch
 import torchvision
-from sklearn.cluster import KMeans
 SelfAttentionLayers = [
-    # 'down_blocks.0.attentions.0.transformer_blocks.0.attn1',
-    # 'down_blocks.0.attentions.1.transformer_blocks.0.attn1',
     'down_blocks.1.attentions.0.transformer_blocks.0.attn1',
-    # 'down_blocks.1.attentions.1.transformer_blocks.0.attn1',
     'down_blocks.2.attentions.0.transformer_blocks.0.attn1',
     'down_blocks.2.attentions.1.transformer_blocks.0.attn1',
     'mid_block.attentions.0.transformer_blocks.0.attn1',
     'up_blocks.1.attentions.0.transformer_blocks.0.attn1',
     'up_blocks.1.attentions.1.transformer_blocks.0.attn1',
     'up_blocks.1.attentions.2.transformer_blocks.0.attn1',
-    # 'up_blocks.2.attentions.0.transformer_blocks.0.attn1',
     'up_blocks.2.attentions.1.transformer_blocks.0.attn1',
-    # 'up_blocks.2.attentions.2.transformer_blocks.0.attn1',
-    # 'up_blocks.3.attentions.0.transformer_blocks.0.attn1',
-    # 'up_blocks.3.attentions.1.transformer_blocks.0.attn1',
-    # 'up_blocks.3.attentions.2.transformer_blocks.0.attn1',
 ]
@@ -208,8 +209,8 @@ def get_token_maps_deprecated(attention_maps, save_dir, width, height, obj_token
     return attention_maps_averaged_normalized, token_maps_vis
-def get_token_maps(selfattn_maps, crossattn_maps, n_maps, save_dir, width, height, obj_tokens, kmeans_seed=0, tokens_vis=None,
-                   preprocess=False, segment_threshold=0.30, num_segments=9, return_vis=False):
     r"""Function to visualize attention maps.
     Args:
         save_dir (str): Path to save attention maps
@@ -219,9 +220,11 @@ def get_token_maps(selfattn_maps, crossattn_maps, n_maps, save_dir, width, heigh
     # create the segmentation mask using self-attention maps
     resolution = 32
-    attn_maps_1024 = {8: [], 16: [], 32: []}
     for attn_map in selfattn_maps.values():
         resolution_map = np.sqrt(attn_map.shape[1]).astype(int)
         attn_map = attn_map.reshape(
             1, resolution_map, resolution_map, resolution_map**2).permute([3, 0, 1, 2])
         attn_map = torch.nn.functional.interpolate(attn_map, (resolution, resolution),
@@ -229,10 +232,15 @@ def get_token_maps(selfattn_maps, crossattn_maps, n_maps, save_dir, width, heigh
         attn_maps_1024[resolution_map].append(attn_map.permute([1, 2, 3, 0]).reshape(
             1, resolution**2, resolution_map**2))
     attn_maps_1024 = torch.cat([torch.cat(v).mean(0).cpu()
-                                for v in attn_maps_1024.values()], -1).numpy()
-    kmeans = KMeans(n_clusters=num_segments,
-                    n_init=10).fit(attn_maps_1024)
-    clusters = kmeans.labels_
     clusters = clusters.reshape(resolution, resolution)
     fig = plt.figure()
     plt.imshow(clusters)
@@ -258,6 +266,10 @@ def get_token_maps(selfattn_maps, crossattn_maps, n_maps, save_dir, width, heigh
     cross_attn_maps_1024 = torch.cat(
         cross_attn_maps_1024).mean(0).cpu().numpy()
     normalized_span_maps = []
     for token_ids in obj_tokens:
         span_token_maps = cross_attn_maps_1024[:, :, token_ids.numpy()]
@@ -297,7 +309,7 @@ def get_token_maps(selfattn_maps, crossattn_maps, n_maps, save_dir, width, heigh
     foreground_token_maps = [token_map[None, :, :]
                              for token_map in foreground_token_maps]
     token_maps_vis = plot_attention_maps([foreground_token_maps, resized_token_maps], obj_tokens,
-                                         save_dir, kmeans_seed, tokens_vis)
     resized_token_maps = [token_map.unsqueeze(1).repeat(
         [1, 4, 1, 1]).to(attn_map.dtype).cuda() for token_map in resized_token_maps]
     if return_vis:

 import torch
 import torchvision
+from utils.richtext_utils import seed_everything
+from sklearn.cluster import SpectralClustering
 SelfAttentionLayers = [
+    'down_blocks.0.attentions.0.transformer_blocks.0.attn1',
+    'down_blocks.0.attentions.1.transformer_blocks.0.attn1',
     'down_blocks.1.attentions.0.transformer_blocks.0.attn1',
+    'down_blocks.1.attentions.1.transformer_blocks.0.attn1',
     'down_blocks.2.attentions.0.transformer_blocks.0.attn1',
     'down_blocks.2.attentions.1.transformer_blocks.0.attn1',
     'mid_block.attentions.0.transformer_blocks.0.attn1',
     'up_blocks.1.attentions.0.transformer_blocks.0.attn1',
     'up_blocks.1.attentions.1.transformer_blocks.0.attn1',
     'up_blocks.1.attentions.2.transformer_blocks.0.attn1',
+    'up_blocks.2.attentions.0.transformer_blocks.0.attn1',
     'up_blocks.2.attentions.1.transformer_blocks.0.attn1',
+    'up_blocks.2.attentions.2.transformer_blocks.0.attn1',
+    'up_blocks.3.attentions.0.transformer_blocks.0.attn1',
+    'up_blocks.3.attentions.1.transformer_blocks.0.attn1',
+    'up_blocks.3.attentions.2.transformer_blocks.0.attn1',
 ]
     return attention_maps_averaged_normalized, token_maps_vis
+def get_token_maps(selfattn_maps, crossattn_maps, n_maps, save_dir, width, height, obj_tokens, seed=0, tokens_vis=None,
+                   preprocess=False, segment_threshold=0.3, num_segments=5, return_vis=False, save_attn=False):
     r"""Function to visualize attention maps.
     Args:
         save_dir (str): Path to save attention maps
     # create the segmentation mask using self-attention maps
     resolution = 32
+    attn_maps_1024 = {8: [], 16: [], 32: [], 64: []}
     for attn_map in selfattn_maps.values():
         resolution_map = np.sqrt(attn_map.shape[1]).astype(int)
+        if resolution_map != resolution:
+            continue
         attn_map = attn_map.reshape(
             1, resolution_map, resolution_map, resolution_map**2).permute([3, 0, 1, 2])
         attn_map = torch.nn.functional.interpolate(attn_map, (resolution, resolution),
         attn_maps_1024[resolution_map].append(attn_map.permute([1, 2, 3, 0]).reshape(
             1, resolution**2, resolution_map**2))
     attn_maps_1024 = torch.cat([torch.cat(v).mean(0).cpu()
+                                for v in attn_maps_1024.values() if len(v) > 0], -1).numpy()
+    if save_attn:
+        print('saving self-attention maps...', attn_maps_1024.shape)
+        torch.save(torch.from_numpy(attn_maps_1024),
+                   'results/maps/selfattn_maps.pth')
+    seed_everything(seed)
+    sc = SpectralClustering(num_segments, affinity='precomputed', n_init=100,
+                            assign_labels='kmeans')
+    clusters = sc.fit_predict(attn_maps_1024)
     clusters = clusters.reshape(resolution, resolution)
     fig = plt.figure()
     plt.imshow(clusters)
     cross_attn_maps_1024 = torch.cat(
         cross_attn_maps_1024).mean(0).cpu().numpy()
+    if save_attn:
+        print('saving cross-attention maps...', cross_attn_maps_1024.shape)
+        torch.save(torch.from_numpy(cross_attn_maps_1024),
+                   'results/maps/crossattn_maps.pth')
     normalized_span_maps = []
     for token_ids in obj_tokens:
         span_token_maps = cross_attn_maps_1024[:, :, token_ids.numpy()]
     foreground_token_maps = [token_map[None, :, :]
                              for token_map in foreground_token_maps]
     token_maps_vis = plot_attention_maps([foreground_token_maps, resized_token_maps], obj_tokens,
+                                         save_dir, seed, tokens_vis)
     resized_token_maps = [token_map.unsqueeze(1).repeat(
         [1, 4, 1, 1]).to(attn_map.dtype).cuda() for token_map in resized_token_maps]
     if return_vis: