AbstractPhil committed
Commit cae6d82 · Parent: acd9841
Files changed (1):
  1. app.py +32 -9

app.py CHANGED
@@ -111,7 +111,7 @@ def encode_sdxl_prompt(prompt, negative_prompt=""):
     clip_l_embeds = pipe.text_encoder(tokens_l)[0]
     neg_clip_l_embeds = pipe.text_encoder(neg_tokens_l)[0]

-    # CLIP-G embeddings (1280d)
+    # CLIP-G embeddings (1280d) - get the hidden states [0], not pooled [1]
     clip_g_embeds = pipe.text_encoder_2(tokens_g)[0]
     neg_clip_g_embeds = pipe.text_encoder_2(neg_tokens_g)[0]
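For reference, the two SDXL text encoders expose their outputs differently in diffusers, which is what the updated comment is getting at. A minimal sketch, not part of the commit: `text_encoder` is a CLIPTextModel whose first output is the hidden states, while `text_encoder_2` is a CLIPTextModelWithProjection whose first output is the pooled `text_embeds`, so requesting hidden states explicitly is the safer pattern.

```python
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)

tokens_l = pipe.tokenizer("a cat", padding="max_length", max_length=77,
                          truncation=True, return_tensors="pt").input_ids
tokens_g = pipe.tokenizer_2("a cat", padding="max_length", max_length=77,
                            truncation=True, return_tensors="pt").input_ids

# CLIP-L (CLIPTextModel): output [0] is last_hidden_state, shape (1, 77, 768)
hidden_l = pipe.text_encoder(tokens_l)[0]

# CLIP-G (CLIPTextModelWithProjection): output [0] is the pooled text_embeds,
# shape (1, 1280), so ask for the hidden states explicitly
out_g = pipe.text_encoder_2(tokens_g, output_hidden_states=True)
pooled_g = out_g[0]                 # (1, 1280) projected pooled embedding
hidden_g = out_g.hidden_states[-2]  # (1, 77, 1280) penultimate layer, as SDXL uses
```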
@@ -143,14 +143,7 @@ def infer(prompt, negative_prompt, adapter_l_file, adapter_g_file, strength, noi
     pipe.scheduler = SCHEDULERS[scheduler_name].from_config(pipe.scheduler.config)

     # Get T5 embeddings for semantic understanding
-    t5_ids = t5_tok(
-        prompt,
-        return_tensors="pt",
-        padding="max_length",
-        max_length=77,  # Match CLIP's standard length
-        truncation=True
-    ).input_ids.to(device)
-    print(t5_ids.shape)
+    t5_ids = t5_tok(prompt, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
     t5_seq = t5_mod(t5_ids).last_hidden_state

     # Get proper SDXL CLIP embeddings
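The removed call padded every prompt to a fixed 77 tokens; its replacement pads only to the longest prompt in the batch, so the T5 sequence length now varies with the input. A quick sketch of the difference, assuming a generic T5 checkpoint rather than the one the Space actually loads:

```python
from transformers import AutoTokenizer

t5_tok = AutoTokenizer.from_pretrained("google/flan-t5-base")  # stand-in checkpoint

fixed = t5_tok("a cat", return_tensors="pt", padding="max_length",
               max_length=77, truncation=True).input_ids
dynamic = t5_tok("a cat", return_tensors="pt", padding=True,
                 truncation=True).input_ids

print(fixed.shape)    # torch.Size([1, 77]) - always padded to CLIP's length
print(dynamic.shape)  # e.g. torch.Size([1, 3]) - only as long as the prompt
# The variable length is why the commit adds an interpolation step below.
```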
@@ -160,6 +153,19 @@ def infer(prompt, negative_prompt, adapter_l_file, adapter_g_file, strength, noi
     adapter_l = load_adapter(repo_l, adapter_l_file, config_l) if adapter_l_file else None
     adapter_g = load_adapter(repo_g, adapter_g_file, config_g) if adapter_g_file else None

+    # Ensure all embeddings have the same sequence length (77 tokens)
+    seq_len = 77
+
+    # Resize T5 to match CLIP sequence length
+    if t5_seq.size(1) != seq_len:
+        t5_seq = torch.nn.functional.interpolate(
+            t5_seq.transpose(1, 2),
+            size=seq_len,
+            mode="nearest"
+        ).transpose(1, 2)
+
+    print(f"After resize - T5: {t5_seq.shape}, CLIP-L: {clip_embeds['clip_l'].shape}, CLIP-G: {clip_embeds['clip_g'].shape}")
+
     # Apply CLIP-L adapter
     if adapter_l is not None:
         anchor_l, delta_l, log_sigma_l, attn_l1, attn_l2, tau_l, g_pred_l, gate_l = adapter_l(t5_seq, clip_embeds["clip_l"])
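The resize works because torch.nn.functional.interpolate treats a 3-D tensor as (batch, channels, length), so the token axis has to be swapped into the last position, stretched to 77, and swapped back. A self-contained sketch of just that step:

```python
import torch
import torch.nn.functional as F

t5_seq = torch.randn(1, 12, 768)    # (batch, tokens, dim): 12 T5 tokens
resized = F.interpolate(
    t5_seq.transpose(1, 2),         # -> (1, 768, 12): dims become "channels"
    size=77,                        # stretch the token axis to CLIP's 77
    mode="nearest",                 # repeat nearest tokens rather than blending
).transpose(1, 2)                   # -> (1, 77, 768)

print(resized.shape)                # torch.Size([1, 77, 768])
```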
@@ -187,6 +193,23 @@ def infer(prompt, negative_prompt, adapter_l_file, adapter_g_file, strength, noi
             clip_g_mod = clip_g_mod * (1 - gate_g_scaled) + anchor_g * gate_g_scaled
         if noise > 0:
             clip_g_mod += torch.randn_like(clip_g_mod) * noise
+    else:
+        clip_g_mod = clip_embeds["clip_g"]
+        delta_g_final = torch.zeros_like(clip_embeds["clip_g"])
+        gate_g_scaled = torch.zeros_like(clip_embeds["clip_g"])
+        g_pred_g = torch.tensor(0.0)
+        tau_g = torch.tensor(0.0)
+
+    # Apply CLIP-G adapter
+    if adapter_g is not None:
+        # Resize T5 for CLIP-G if needed
+        if t5_seq.size(1) != seq_len:
+            t5_seq_resized = torch.nn.functional.interpolate(
+                t5_seq.transpose(1, 2), size=seq_len, mode="nearest"
+            ).transpose(1, 2)
+        else:
+            t5_seq_resized = t5_seq
+
+        anchor_g, delta_g, log_sigma_g, attn_g1, attn_g2, tau_g, g_pred_g, gate_g = adapter_g(t5_seq_resized, clip_embeds["clip_g"])
+        gate_g_scaled = gate_g * gate_prob
+        delta_g_final = delta_g * strength * gate_g_scaled
+        clip_g_mod = clip_embeds["clip_g"] + delta_g_final
+        if use_anchor:
+            clip_g_mod = clip_g_mod * (1 - gate_g_scaled) + anchor_g * gate_g_scaled
+        if noise > 0:
+            clip_g_mod += torch.randn_like(clip_g_mod) * noise
     else:
         clip_g_mod = clip_embeds["clip_g"]
         delta_g_final = torch.zeros_like(clip_embeds["clip_g"])
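Both adapter branches reduce to the same gated residual update: scale the predicted delta by strength and a gate, add it to the base CLIP embedding, then optionally lerp toward the adapter's anchor where the gate is open. A distilled sketch; the function name and shapes are illustrative, not from app.py:

```python
import torch

def apply_adapter(clip_embeds, delta, gate, anchor,
                  strength=1.0, gate_prob=1.0, use_anchor=True, noise=0.0):
    gate_scaled = gate * gate_prob                 # per-token gate in [0, 1]
    delta_final = delta * strength * gate_scaled   # scaled, gated residual
    out = clip_embeds + delta_final
    if use_anchor:
        # lerp toward the anchor wherever the gate is open
        out = out * (1 - gate_scaled) + anchor * gate_scaled
    if noise > 0:
        out = out + torch.randn_like(out) * noise  # optional exploration noise
    return out

x = torch.randn(1, 77, 1280)                       # CLIP-G-shaped embeddings
y = apply_adapter(x, delta=torch.randn_like(x), gate=torch.rand(1, 77, 1),
                  anchor=torch.randn_like(x), strength=0.5)
print(y.shape)                                     # torch.Size([1, 77, 1280])
```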
 