Spaces: Running on Zero
Update app.py
Browse files

app.py CHANGED
@@ -10,6 +10,7 @@ from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download
 from two_stream_shunt_adapter import TwoStreamShuntAdapter
 from configs import T5_SHUNT_REPOS
+import io
 
 # ─── Global Variables ─────────────────────────────────────────
 t5_tok = None
@@ -33,6 +34,7 @@ config_g = T5_SHUNT_REPOS["clip_g"]["config"]
 
 # ─── Helper Functions ─────────────────────────────────────────
 def load_adapter(repo, filename, config, device):
+    """Load adapter from safetensors file"""
    from safetensors.torch import safe_open
    path = hf_hub_download(repo_id=repo, filename=filename)
 
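Note: the body of `load_adapter` is mostly outside this diff, but the imports show the pattern: download from the Hub, then read tensors with `safe_open`. A minimal sketch of that pattern under those assumptions (`load_state_dict` is a hypothetical helper, not the function in app.py):

```python
# Hedged sketch of a hub-download + safetensors read, mirroring the imports above.
from huggingface_hub import hf_hub_download
from safetensors.torch import safe_open

def load_state_dict(repo, filename, device="cpu"):
    path = hf_hub_download(repo_id=repo, filename=filename)
    tensors = {}
    with safe_open(path, framework="pt", device=device) as f:
        for key in f.keys():
            tensors[key] = f.get_tensor(key)  # lazily reads one tensor at a time
    return tensors
```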
@@ -46,29 +48,42 @@ def load_adapter(repo, filename, config, device):
 
 def plot_heat(mat, title):
     """Create heatmap visualization with proper shape handling"""
-
+    # Handle different input shapes
+    if isinstance(mat, torch.Tensor):
+        mat = mat.detach().cpu().numpy()
 
     # Ensure we have a 2D array for visualization
     if len(mat.shape) == 1:
+        # 1D array - reshape to single row
         mat = mat.reshape(1, -1)
     elif len(mat.shape) == 3:
-
+        # 3D array - average over batch dimension
+        if mat.shape[0] == 1:
+            mat = mat.squeeze(0)
+        else:
+            mat = mat.mean(axis=0)
     elif len(mat.shape) > 3:
+        # Flatten higher dimensions
         mat = mat.reshape(-1, mat.shape[-1])
 
-
-
-
-
-
-    plt.
+    # Create figure with proper DPI
+    plt.figure(figsize=(8, 4), dpi=100)
+    plt.imshow(mat, aspect="auto", cmap="RdBu_r", origin="upper", interpolation='nearest')
+    plt.title(title, fontsize=12, fontweight='bold')
+    plt.xlabel("Token Position")
+    plt.ylabel("Feature Dimension")
+    plt.colorbar(shrink=0.8)
+    plt.tight_layout()
 
+    # Convert to PIL Image
     buf = io.BytesIO()
     plt.savefig(buf, format="png", bbox_inches='tight', dpi=100)
     buf.seek(0)
     pil_image = Image.open(buf)
-    plt.close(
-
+    plt.close()
+
+    # Convert to numpy array for Gradio
+    return np.array(pil_image)
 
 def encode_sdxl_prompt(pipe, prompt, negative_prompt, device):
     """Generate CLIP-L and CLIP-G embeddings using SDXL's text encoders"""
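Note: the reshaping added here is what keeps `plt.imshow` from failing on 1D gate vectors or batched 3D deltas. A self-contained sketch of just that normalization (`normalize_for_heatmap` is a hypothetical helper, not part of app.py):

```python
# Standalone sketch of plot_heat's shape handling; always ends up 2D.
import torch

def normalize_for_heatmap(mat):
    # Tensor -> numpy, then force a 2D array, mirroring the logic above.
    if isinstance(mat, torch.Tensor):
        mat = mat.detach().cpu().numpy()
    if mat.ndim == 1:
        mat = mat.reshape(1, -1)              # single row
    elif mat.ndim == 3:
        mat = mat.squeeze(0) if mat.shape[0] == 1 else mat.mean(axis=0)
    elif mat.ndim > 3:
        mat = mat.reshape(-1, mat.shape[-1])  # flatten leading dims
    return mat

for shape in [(77,), (1, 77, 768), (4, 77, 768), (2, 3, 77, 768)]:
    print(shape, "->", normalize_for_heatmap(torch.zeros(shape)).shape)
```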
@@ -92,15 +107,18 @@ def encode_sdxl_prompt(pipe, prompt, negative_prompt, device):
 
     with torch.no_grad():
         # CLIP-L: [0] = sequence, [1] = pooled
-
-
+        clip_l_output = pipe.text_encoder(tokens_l, output_hidden_states=False)
+        clip_l_embeds = clip_l_output[0]
+
+        neg_clip_l_output = pipe.text_encoder(neg_tokens_l, output_hidden_states=False)
+        neg_clip_l_embeds = neg_clip_l_output[0]
 
-        # CLIP-G: [0] = pooled, [1] = sequence
-        clip_g_output = pipe.text_encoder_2(tokens_g)
+        # CLIP-G: [0] = pooled, [1] = sequence
+        clip_g_output = pipe.text_encoder_2(tokens_g, output_hidden_states=False)
         clip_g_embeds = clip_g_output[1]  # sequence embeddings
         pooled_embeds = clip_g_output[0]  # pooled embeddings
 
-        neg_clip_g_output = pipe.text_encoder_2(neg_tokens_g)
+        neg_clip_g_output = pipe.text_encoder_2(neg_tokens_g, output_hidden_states=False)
         neg_clip_g_embeds = neg_clip_g_output[1]
         neg_pooled_embeds = neg_clip_g_output[0]
 
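Note: the index flip between the two encoders is easy to trip over. In diffusers' SDXL pipeline, `pipe.text_encoder` is a `CLIPTextModel` (output `[0]` = hidden sequence, `[1]` = pooled), while `pipe.text_encoder_2` is a `CLIPTextModelWithProjection` (output `[0]` = pooled projection, `[1]` = hidden sequence). A quick shape check, assuming the standard SDXL base checkpoint and a CUDA device:

```python
# Shape sanity check for the two SDXL text encoders (downloads the model).
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

ids_l = pipe.tokenizer("a cat", return_tensors="pt", padding="max_length",
                       max_length=77, truncation=True).input_ids.to("cuda")
ids_g = pipe.tokenizer_2("a cat", return_tensors="pt", padding="max_length",
                         max_length=77, truncation=True).input_ids.to("cuda")

with torch.no_grad():
    out_l = pipe.text_encoder(ids_l)     # CLIPTextModel
    out_g = pipe.text_encoder_2(ids_g)   # CLIPTextModelWithProjection

print(out_l[0].shape)  # torch.Size([1, 77, 768])   sequence
print(out_g[0].shape)  # torch.Size([1, 1280])      pooled projection
print(out_g[1].shape)  # torch.Size([1, 77, 1280])  sequence
```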
@@ -139,6 +157,9 @@ def infer(prompt, negative_prompt, adapter_l_file, adapter_g_file, strength, noi
     if seed != -1:
         torch.manual_seed(seed)
         np.random.seed(seed)
+        generator = torch.Generator(device=device).manual_seed(seed)
+    else:
+        generator = None
 
     # Set scheduler
     if scheduler_name in SCHEDULERS:
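Note: diffusers draws the initial latents from the `generator` argument when one is supplied, so threading a seeded per-device `torch.Generator` into the `pipe(...)` call (see `generator=generator` further down) pins the initial noise regardless of any other RNG consumption. A small sketch of the pattern, on CPU so it runs anywhere:

```python
# Deterministic-draw sketch: the same seeded generator reproduces the noise.
import torch

def make_generator(seed, device="cpu"):
    if seed != -1:
        return torch.Generator(device=device).manual_seed(seed)
    return None  # diffusers then picks a random seed

a = torch.randn(3, generator=make_generator(42))
b = torch.randn(3, generator=make_generator(42))
print(torch.equal(a, b))  # True
```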
@@ -148,7 +169,9 @@ def infer(prompt, negative_prompt, adapter_l_file, adapter_g_file, strength, noi
     t5_ids = t5_tok(
         prompt, return_tensors="pt", padding="max_length", max_length=77, truncation=True
     ).input_ids.to(device)
-
+
+    with torch.no_grad():
+        t5_seq = t5_mod(t5_ids).last_hidden_state
 
     # Get CLIP embeddings
     clip_embeds = encode_sdxl_prompt(pipe, prompt, negative_prompt, device)
@@ -159,41 +182,83 @@ def infer(prompt, negative_prompt, adapter_l_file, adapter_g_file, strength, noi
 
     # Apply CLIP-L adapter
     if adapter_l is not None:
-
-
-
-
-
-
-
-
-
-
-
+        with torch.no_grad():
+            # Run adapter forward pass
+            adapter_output = adapter_l(t5_seq.float(), clip_embeds["clip_l"].float())
+
+            # Unpack outputs (ensure correct number of outputs)
+            if len(adapter_output) == 8:
+                anchor_l, delta_l, log_sigma_l, attn_l1, attn_l2, tau_l, g_pred_l, gate_l = adapter_output
+            else:
+                # Handle different return formats
+                anchor_l = adapter_output[0]
+                delta_l = adapter_output[1]
+                gate_l = adapter_output[-1] if len(adapter_output) > 2 else torch.ones_like(delta_l)
+                tau_l = adapter_output[-2] if len(adapter_output) > 6 else torch.tensor(1.0)
+                g_pred_l = adapter_output[-3] if len(adapter_output) > 6 else torch.tensor(1.0)
+
+            # Apply gate scaling
+            gate_l_scaled = torch.sigmoid(gate_l) * gate_prob
+
+            # Compute final delta with strength and gate
+            delta_l_final = delta_l * strength * gate_l_scaled
+
+            # Apply delta to embeddings
+            clip_l_mod = clip_embeds["clip_l"] + delta_l_final.to(dtype)
+
+            # Apply anchor mixing if enabled
+            if use_anchor:
+                clip_l_mod = clip_l_mod * (1 - gate_l_scaled.to(dtype)) + anchor_l.to(dtype) * gate_l_scaled.to(dtype)
+
+            # Add noise if specified
+            if noise > 0:
+                clip_l_mod += torch.randn_like(clip_l_mod) * noise
     else:
         clip_l_mod = clip_embeds["clip_l"]
         delta_l_final = torch.zeros_like(clip_embeds["clip_l"])
         gate_l_scaled = torch.zeros_like(clip_embeds["clip_l"])
-        g_pred_l
+        g_pred_l = torch.tensor(0.0)
+        tau_l = torch.tensor(0.0)
 
     # Apply CLIP-G adapter
     if adapter_g is not None:
-
-
-
-
-
-
-
-
-
-
-
+        with torch.no_grad():
+            # Run adapter forward pass
+            adapter_output = adapter_g(t5_seq.float(), clip_embeds["clip_g"].float())
+
+            # Unpack outputs (ensure correct number of outputs)
+            if len(adapter_output) == 8:
+                anchor_g, delta_g, log_sigma_g, attn_g1, attn_g2, tau_g, g_pred_g, gate_g = adapter_output
+            else:
+                # Handle different return formats
+                anchor_g = adapter_output[0]
+                delta_g = adapter_output[1]
+                gate_g = adapter_output[-1] if len(adapter_output) > 2 else torch.ones_like(delta_g)
+                tau_g = adapter_output[-2] if len(adapter_output) > 6 else torch.tensor(1.0)
+                g_pred_g = adapter_output[-3] if len(adapter_output) > 6 else torch.tensor(1.0)
+
+            # Apply gate scaling
+            gate_g_scaled = torch.sigmoid(gate_g) * gate_prob
+
+            # Compute final delta with strength and gate
+            delta_g_final = delta_g * strength * gate_g_scaled
+
+            # Apply delta to embeddings
+            clip_g_mod = clip_embeds["clip_g"] + delta_g_final.to(dtype)
+
+            # Apply anchor mixing if enabled
+            if use_anchor:
+                clip_g_mod = clip_g_mod * (1 - gate_g_scaled.to(dtype)) + anchor_g.to(dtype) * gate_g_scaled.to(dtype)
+
+            # Add noise if specified
+            if noise > 0:
+                clip_g_mod += torch.randn_like(clip_g_mod) * noise
     else:
         clip_g_mod = clip_embeds["clip_g"]
         delta_g_final = torch.zeros_like(clip_embeds["clip_g"])
         gate_g_scaled = torch.zeros_like(clip_embeds["clip_g"])
-        g_pred_g
+        g_pred_g = torch.tensor(0.0)
+        tau_g = torch.tensor(0.0)
 
     # Combine embeddings for SDXL: [CLIP-L(768) + CLIP-G(1280)] = 2048
     prompt_embeds = torch.cat([clip_l_mod, clip_g_mod], dim=-1)
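Note: both adapter branches apply the same modulation: the adapter's delta is scaled by `strength` and a sigmoid gate capped at `gate_prob`, added to the CLIP sequence, then optionally blended toward the adapter's anchor by the same gate. A toy numeric sketch with the adapter mocked out (shapes shrunk; names mirror the diff):

```python
# Toy version of the delta/gate/anchor composition above; no real adapter used.
import torch

B, T, D = 1, 4, 8
clip        = torch.randn(B, T, D)
delta       = torch.randn(B, T, D)   # stands in for the adapter's delta
gate_logits = torch.randn(B, T, D)   # stands in for the adapter's gate
anchor      = torch.randn(B, T, D)   # stands in for the adapter's anchor
strength, gate_prob, use_anchor, noise = 1.0, 0.5, True, 0.0

gate_scaled = torch.sigmoid(gate_logits) * gate_prob   # values in (0, gate_prob)
delta_final = delta * strength * gate_scaled
mod = clip + delta_final
if use_anchor:                                         # gate-weighted blend
    mod = mod * (1 - gate_scaled) + anchor * gate_scaled
if noise > 0:
    mod = mod + torch.randn_like(mod) * noise
print(mod.shape)  # torch.Size([1, 4, 8])
```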
@@ -210,18 +275,18 @@ def infer(prompt, negative_prompt, adapter_l_file, adapter_g_file, strength, noi
         width=width,
         height=height,
         num_images_per_prompt=1,
-        generator=
+        generator=generator
     ).images[0]
 
     # Create visualizations
-    delta_l_viz = plot_heat(delta_l_final.squeeze()
-    gate_l_viz = plot_heat(gate_l_scaled.squeeze().
-    delta_g_viz = plot_heat(delta_g_final.squeeze()
-    gate_g_viz = plot_heat(gate_g_scaled.squeeze().
+    delta_l_viz = plot_heat(delta_l_final.squeeze(), "CLIP-L Delta Values")
+    gate_l_viz = plot_heat(gate_l_scaled.squeeze().mean(dim=-1, keepdim=True), "CLIP-L Gate Activations")
+    delta_g_viz = plot_heat(delta_g_final.squeeze(), "CLIP-G Delta Values")
+    gate_g_viz = plot_heat(gate_g_scaled.squeeze().mean(dim=-1, keepdim=True), "CLIP-G Gate Activations")
 
     # Statistics
-    stats_l = f"g_pred_l: {g_pred_l.mean().item():.3f}, τ_l: {tau_l.mean().item():.3f}"
-    stats_g = f"g_pred_g: {g_pred_g.mean().item():.3f}, τ_g: {tau_g.mean().item():.3f}"
+    stats_l = f"g_pred_l: {float(g_pred_l.mean().item() if hasattr(g_pred_l, 'mean') else g_pred_l):.3f}, τ_l: {float(tau_l.mean().item() if hasattr(tau_l, 'mean') else tau_l):.3f}"
+    stats_g = f"g_pred_g: {float(g_pred_g.mean().item() if hasattr(g_pred_g, 'mean') else g_pred_g):.3f}, τ_g: {float(tau_g.mean().item() if hasattr(tau_g, 'mean') else tau_g):.3f}"
 
     return image, delta_l_viz, gate_l_viz, delta_g_viz, gate_g_viz, stats_l, stats_g
 
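Note: the `hasattr(v, 'mean')` guard makes the stats strings safe whether the fallback path left a plain scalar or the adapter returned a full tensor. The same logic, factored into a helper for illustration (`fmt` is hypothetical, not in app.py):

```python
# Defensive stat formatting: works for tensors and plain floats alike.
import torch

def fmt(name, v):
    return f"{name}: {float(v.mean().item() if hasattr(v, 'mean') else v):.3f}"

print(fmt("g_pred_l", torch.tensor([0.2, 0.4])))  # g_pred_l: 0.300
print(fmt("tau_g", 0.75))                         # tau_g: 0.750
```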
@@ -286,7 +351,7 @@ def create_interface():
                 width = gr.Slider(512, 1536, value=1024, step=64, label="Width")
                 height = gr.Slider(512, 1536, value=1024, step=64, label="Height")
 
-                seed = gr.Number(value=-1, label="Seed (-1 for random)")
+                seed = gr.Number(value=-1, label="Seed (-1 for random)", precision=0)
 
                 generate_btn = gr.Button("🚀 Generate Image", variant="primary", size="lg")
 