xilluill committed
Commit 9b891da · Parent: ca71d6b

update attn scale

Files changed (3):
  1. app.py +8 -4
  2. flux/modules/layers.py +4 -2
  3. models/kv_edit.py +38 -0
app.py CHANGED
@@ -30,6 +30,7 @@ class SamplingOptions:
     seed: int = 42
     re_init: bool = False
     attn_mask: bool = False
+    attn_scale_value: float = 0.0
 
 def resize_image(image_array, max_width=512, max_height=512):
     # Convert the numpy array to a PIL image
@@ -96,7 +97,7 @@ def edit(brush_canvas,
          inversion_num_steps, denoise_num_steps,
          skip_step,
          inversion_guidance, denoise_guidance, seed,
-         re_init, attn_mask
+         re_init, attn_mask, attn_scale_value
          ):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     torch.cuda.empty_cache()
@@ -136,7 +137,8 @@ def edit(brush_canvas,
         denoise_guidance=denoise_guidance,
         seed=seed,
         re_init=re_init,
-        attn_mask=attn_mask
+        attn_mask=attn_mask,
+        attn_scale_value=attn_scale_value
     )
@@ -215,7 +217,8 @@ def create_demo(model_name: str):
     3️⃣ Fill in your target prompt, then adjust the hyperparameters. <br>
     4️⃣ Click the "Edit" button to generate your edited image! <br>
 
-    🔔🔔 [<b>Important</b>] We suggest trying less skip steps, "re_init" and "attn_mask" only when the result is too similar to the original content (e.g. removing objects or changing color).<br>
+    🔔🔔 [<b>Important</b>] Fewer skip steps, "re_init" and "attn_mask" all strengthen the edit, aligning the result more closely with your text, but they may produce discontinuous images. <br>
+    We recommend increasing "attn_scale" to strengthen the attention between the masked region and the background.<br>
     """
     article = r"""
     If our work is helpful, please help to ⭐ the <a href='https://github.com/Xilluill/KV-Edit' target='_blank'>Github Repo</a>. Thanks!
@@ -252,6 +255,7 @@ def create_demo(model_name: str):
     skip_step = gr.Slider(0, 30, 0, step=1, label="Number of skip steps")
     inversion_guidance = gr.Slider(1.0, 10.0, 1.5, step=0.1, label="inversion Guidance", interactive=not is_schnell)
     denoise_guidance = gr.Slider(1.0, 10.0, 5.5, step=0.1, label="denoise Guidance", interactive=not is_schnell)
+    attn_scale_value = gr.Slider(0.0, 5.0, 1, step=0.1, label="attn_scale")
     seed = gr.Textbox('0', label="Seed (-1 for random)", visible=True)
     with gr.Row():
         re_init = gr.Checkbox(label="re_init", value=False)
@@ -268,7 +272,7 @@ def create_demo(model_name: str):
             skip_step,
             inversion_guidance,
             denoise_guidance, seed,
-            re_init, attn_mask
+            re_init, attn_mask, attn_scale_value
         ],
         outputs=[output_image]
     )
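
For orientation, the new option takes the same path as the existing re_init and attn_mask flags: the slider value is passed positionally into edit() and ends up on SamplingOptions. A minimal sketch of that flow, with everything except the fields visible in this diff elided (the trimmed edit() signature below is hypothetical; the real one also receives the canvas, prompts, step counts and guidances):

from dataclasses import dataclass

@dataclass
class SamplingOptions:
    seed: int = 42
    re_init: bool = False
    attn_mask: bool = False
    attn_scale_value: float = 0.0  # new field; 0.0 leaves the attention bias disabled

def edit(re_init, attn_mask, attn_scale_value):
    # hypothetical trimmed signature, shown only to trace the new argument
    return SamplingOptions(re_init=re_init, attn_mask=attn_mask,
                           attn_scale_value=attn_scale_value)

opts = edit(re_init=False, attn_mask=False, attn_scale_value=1.0)
assert opts.attn_scale_value == 1.0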
flux/modules/layers.py CHANGED
@@ -300,7 +300,8 @@ class DoubleStreamBlock_kv(DoubleStreamBlock):
         if 'attention_mask' in info:
             attn = attention(q, k, v, pe=pe, attention_mask=info['attention_mask'])
         else:
-            attn = attention(q, k, v, pe=pe)
+            # attn = attention(q, k, v, pe=pe)
+            attn = attention(q, k, v, pe=pe, pe_q=info['pe_mask'], attention_mask=info['attention_scale'])
 
     # elif feature_k_name in info['feature']:
     else:
@@ -377,7 +378,8 @@ class SingleStreamBlock_kv(SingleStreamBlock):
 
         k = torch.cat((txt_k, source_img_k), dim=2)
         v = torch.cat((txt_v, source_img_v), dim=2)
-        attn = attention(q, k, v, pe=pe, pe_q=info['pe_mask'])
+        # attn = attention(q, k, v, pe=pe, pe_q=info['pe_mask'])
+        attn = attention(q, k, v, pe=pe, pe_q=info['pe_mask'], attention_mask=info['attention_scale'])
 
         # compute attention
         # attn = attention(q, k, v, pe=pe)
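
How info['attention_scale'] changes the result depends on the repo's attention helper, which is not shown in this diff. One reading consistent with create_attention_scale producing an additive bfloat16 row vector (zeros for text and mask tokens, scale for background tokens) is that the tensor is added to the pre-softmax logits, uniformly boosting attention toward background keys. A self-contained sketch of that mechanism under that assumption (attention_with_bias is illustrative, not the repo's function; pe and pe_q handling is omitted):

from math import sqrt
import torch
import torch.nn.functional as F

def attention_with_bias(q, k, v, bias=None):
    # q, k, v: (batch, heads, tokens, dim); bias broadcastable to the logit matrix
    logits = q @ k.transpose(-2, -1) / sqrt(q.shape[-1])
    if bias is not None:
        logits = logits + bias  # positive entries raise the weight on those key tokens
    return F.softmax(logits, dim=-1) @ v

q = torch.randn(1, 1, 4, 8)
k = torch.randn(1, 1, 6, 8)
v = torch.randn(1, 1, 6, 8)
bias = torch.zeros(1, 1, 1, 6)
bias[..., 3:] = 1.0  # boost keys 3..5, analogous to boosting background tokens
out = attention_with_bias(q, k, v, bias)  # (1, 1, 4, 8)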
models/kv_edit.py CHANGED
@@ -76,6 +76,37 @@ class only_Flux(torch.nn.Module):  # contains only the initialization functions
         attention_mask[background_token_indices.unsqueeze(1), background_token_indices] = True  # attend to the background region
 
         return attention_mask.unsqueeze(0)
+
+    def create_attention_scale(self, seq_len, mask_indices, text_len=512, device='cuda', scale=0):
+        """
+        Create a per-token attention scale.
+
+        Args:
+            seq_len (int): sequence length.
+            mask_indices (List[int]): indices of the masked region within the image tokens.
+            text_len (int): number of text tokens, 512 by default.
+            device (str): device type, e.g. 'cuda' or 'cpu'.
+
+        Returns:
+            torch.Tensor: attention scale of shape (1, 1, seq_len).
+        """
+        # initialize the scale to all zeros; it broadcasts when added
+        attention_scale = torch.zeros(1, seq_len, dtype=torch.bfloat16, device=device)
+
+        # text token indices
+        text_indices = torch.arange(0, text_len, device=device)
+
+        # token indices of the masked region
+        mask_token_indices = torch.tensor([idx + text_len for idx in mask_indices], device=device)
+
+        # token indices of the background region
+        all_indices = torch.arange(text_len, seq_len, device=device)
+        background_token_indices = torch.tensor([idx for idx in all_indices if idx not in mask_token_indices])
+
+        attention_scale[0, background_token_indices] = scale
+        print(f"attention_scale:{scale}")
+
+        return attention_scale.unsqueeze(0)
 
 class Flux_kv_edit_inf(only_Flux):
     def __init__(self, device, name):
@@ -200,6 +231,13 @@ class Flux_kv_edit(only_Flux):
             inp_target["img"] = zt_noise[:, mask_indices, ...]
         else:
             inp_target["img"] = zt[:, mask_indices, ...]
+
+        if opts.attn_scale_value != 0:
+            attention_scale = self.create_attention_scale(L + 512, mask_indices, device=mask.device, scale=opts.attn_scale_value)
+            info['attention_scale'] = attention_scale
+        else:
+            info['attention_scale'] = None
+
 
         info['inverse'] = False
         x, _ = denoise_kv(self.model, **inp_target, timesteps=denoise_timesteps, guidance=opts.denoise_guidance, inverse=False, info=info)
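
To make the shape and broadcast behavior of the new tensor concrete: with L image tokens, the sequence length passed in is L + 512 (512 text tokens), and the returned tensor is (1, 1, seq_len), so it broadcasts across query positions. Here is the function's core logic run standalone on a toy sequence (CPU, float32 instead of bfloat16, and text_len shrunk from 512 to 4 for readability):

import torch

def toy_attention_scale(seq_len, mask_indices, text_len=4, scale=1.0):
    # zeros everywhere; `scale` is written only at background (non-masked) image tokens
    s = torch.zeros(1, seq_len)
    mask_tokens = [i + text_len for i in mask_indices]
    for i in range(text_len, seq_len):
        if i not in mask_tokens:
            s[0, i] = scale
    return s.unsqueeze(0)  # (1, 1, seq_len): broadcasts over query positions

s = toy_attention_scale(seq_len=10, mask_indices=[0, 1])
print(s.shape)   # torch.Size([1, 1, 10])
print(s[0, 0])   # tensor([0., 0., 0., 0., 0., 0., 1., 1., 1., 1.])

Note the gate in the second hunk: when the slider sits at 0, info['attention_scale'] is set to None and the bias is skipped entirely, so the pipeline's default behavior is unchanged.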