Spaces:

AbstractPhil
/

shunt-adapter-testing

Running on Zero

App Files Files Community

AbstractPhil committed 7 days ago

Commit

c22af2e

verified ·

1 Parent(s): 504e98b

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -112

app.py CHANGED Viewed

@@ -39,14 +39,13 @@ config_g = T5_SHUNT_REPOS["clip_g"]["config"]
 # ─── Loader ───────────────────────────────────────────────────
 from safetensors.torch import safe_open
-@spaces.GPU
 def load_adapter(repo, filename, config):
     # Don't initialize device here
     path = hf_hub_download(repo_id=repo, filename=filename)
     model = TwoStreamShuntAdapter(config).eval()
     tensors = {}
-    with safe_open(path, framework="pt", device="cuda") as f:
         for key in f.keys():
             tensors[key] = f.get_tensor(key)
     model.load_state_dict(tensors)
@@ -129,131 +128,128 @@ def encode_sdxl_prompt(prompt, negative_prompt=""):
         "neg_pooled": neg_pooled_embeds
     }
 # ─── Inference ────────────────────────────────────────────
 @spaces.GPU
-def infer(
-    prompt, negative_prompt, adapter_l_file, adapter_g_file,
-    strength, noise, gate_prob, use_anchor,
-    steps, cfg_scale, scheduler_name,
-    width, height, seed
-):
-    import torch
-    import numpy as np
     global t5_tok, t5_mod, pipe
     device = torch.device("cuda")
     dtype = torch.float16
-    with torch.no_grad():
-        # Initialize tokenizer and model
-        if t5_tok is None:
-            t5_tok = T5Tokenizer.from_pretrained("google/flan-t5-base")
-            t5_mod = T5EncoderModel.from_pretrained("google/flan-t5-base").to(device).eval()
-        if pipe is None:
-            pipe = StableDiffusionXLPipeline.from_pretrained(
-                "stabilityai/stable-diffusion-xl-base-1.0",
-                torch_dtype=dtype,
-                variant="fp16",
-                use_safetensors=True
-            ).to(device)
-            pipe.text_encoder = pipe.text_encoder.to(device)
-            pipe.text_encoder_2 = pipe.text_encoder_2.to(device)
-        # Reproducibility
-        if seed != -1:
-            torch.manual_seed(seed)
-            np.random.seed(seed)
-        # Scheduler
-        if scheduler_name in SCHEDULERS:
-            pipe.scheduler = SCHEDULERS[scheduler_name].from_config(pipe.scheduler.config)
-        # T5 embeddings
-        t5_ids = t5_tok(
-            prompt, return_tensors="pt",
-            padding="max_length", max_length=77, truncation=True
-        ).input_ids.to(device)
-        t5_seq = t5_mod(t5_ids).last_hidden_state
-        # CLIP embeddings
-        clip_embeds = encode_sdxl_prompt(prompt, negative_prompt)
-        # Debug shapes
-        print(f"T5 seq shape: {t5_seq.shape}")
-        print(f"CLIP-L shape: {clip_embeds['clip_l'].shape}")
-        print(f"CLIP-G shape: {clip_embeds['clip_g'].shape}")
-        # Load adapters
-        adapter_l = load_adapter(repo_l, adapter_l_file, config_l).to(device) if adapter_l_file else None
-        adapter_g = load_adapter(repo_g, adapter_g_file, config_g).to(device) if adapter_g_file else None
-        # ---- Adapter L ----
-        if adapter_l:
-            anchor_l, delta_l, log_sigma_l, attn_l1, attn_l2, tau_l, g_pred_l, gate_l = adapter_l(t5_seq, clip_embeds["clip_l"])
-            gate_l_scaled = gate_l * gate_prob
-            delta_l_final = delta_l * strength * gate_l_scaled
-            clip_l_mod = clip_embeds["clip_l"] + delta_l_final
-            if use_anchor:
-                clip_l_mod = clip_l_mod * (1 - gate_l_scaled) + anchor_l * gate_l_scaled
-            if noise > 0:
-                clip_l_mod += torch.randn_like(clip_l_mod) * noise
-        else:
-            clip_l_mod = clip_embeds["clip_l"]
-            delta_l_final = torch.zeros_like(clip_l_mod)
-            gate_l_scaled = torch.zeros_like(clip_l_mod)
-            g_pred_l = torch.tensor(0.0)
-            tau_l = torch.tensor(0.0)
-        # ---- Adapter G ----
-        if adapter_g:
-            anchor_g, delta_g, log_sigma_g, attn_g1, attn_g2, tau_g, g_pred_g, gate_g = adapter_g(t5_seq, clip_embeds["clip_g"])
-            gate_g_scaled = gate_g * gate_prob
-            delta_g_final = delta_g * strength * gate_g_scaled
-            clip_g_mod = clip_embeds["clip_g"] + delta_g_final
-            if use_anchor:
-                clip_g_mod = clip_g_mod * (1 - gate_g_scaled) + anchor_g * gate_g_scaled
-            if noise > 0:
-                clip_g_mod += torch.randn_like(clip_g_mod) * noise
-        else:
-            clip_g_mod = clip_embeds["clip_g"]
-            delta_g_final = torch.zeros_like(clip_g_mod)
-            gate_g_scaled = torch.zeros_like(clip_g_mod)
-            g_pred_g = torch.tensor(0.0)
-            tau_g = torch.tensor(0.0)
-        # ---- Combine embeddings ----
-        prompt_embeds = torch.cat([clip_l_mod, clip_g_mod], dim=-1).to(dtype)
-        neg_embeds = torch.cat([clip_embeds["neg_clip_l"], clip_embeds["neg_clip_g"]], dim=-1).to(dtype)
-        # ---- Generate image ----
-        generator = torch.Generator(device=device).manual_seed(seed) if seed != -1 else None
-        image = pipe(
-            prompt_embeds=prompt_embeds,
-            pooled_prompt_embeds=clip_embeds["pooled"],
-            negative_prompt_embeds=neg_embeds,
-            negative_pooled_prompt_embeds=clip_embeds["neg_pooled"],
-            num_inference_steps=steps,
-            guidance_scale=cfg_scale,
-            width=width,
-            height=height,
-            num_images_per_prompt=1,
-            generator=generator,
-        ).images[0]
     return (
         image,
         plot_heat(delta_l_final.squeeze().cpu().numpy(), "Δ CLIP-L"),
-        plot_heat(gate_l_scaled.squeeze().cpu().numpy(), "Gate CLIP-L"),
         plot_heat(delta_g_final.squeeze().cpu().numpy(), "Δ CLIP-G"),
         plot_heat(gate_g_scaled.squeeze().cpu().numpy(), "Gate CLIP-G"),
         f"g_pred_l: {g_pred_l.mean().item():.3f}, τ_l: {tau_l.mean().item():.3f}",
         f"g_pred_g: {g_pred_g.mean().item():.3f}, τ_g: {tau_g.mean().item():.3f}"
     )
 # ─── Gradio Interface ─────────────────────────────────────────
 with gr.Blocks(title="SDXL Dual Shunt Adapter", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🧠 SDXL Dual Shunt Adapter • T5→CLIP Enhancement")

 # ─── Loader ───────────────────────────────────────────────────
 from safetensors.torch import safe_open
 def load_adapter(repo, filename, config):
     # Don't initialize device here
     path = hf_hub_download(repo_id=repo, filename=filename)
     model = TwoStreamShuntAdapter(config).eval()
     tensors = {}
+    with safe_open(path, framework="pt", device="cpu") as f:
         for key in f.keys():
             tensors[key] = f.get_tensor(key)
     model.load_state_dict(tensors)
         "neg_pooled": neg_pooled_embeds
     }
+# ─── Inference ────────────────────────────────────────────────
+@torch.no_grad()
+def infer(prompt, negative_prompt, adapter_l_file, adapter_g_file, strength, noise, gate_prob,
+          use_anchor, steps, cfg_scale, scheduler_name, width, height, seed):
 # ─── Inference ────────────────────────────────────────────
 @spaces.GPU
+@torch.no_grad()
+def infer(prompt, negative_prompt, adapter_l_file, adapter_g_file, strength, noise, gate_prob,
+          use_anchor, steps, cfg_scale, scheduler_name, width, height, seed):
+    # Initialize device and models inside GPU context
     global t5_tok, t5_mod, pipe
     device = torch.device("cuda")
     dtype = torch.float16
+    # Load models if not already loaded
+    if t5_tok is None:
+        t5_tok = T5Tokenizer.from_pretrained("google/flan-t5-base")
+        t5_mod = T5EncoderModel.from_pretrained("google/flan-t5-base").to(device).eval()
+    if pipe is None:
+        pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            torch_dtype=dtype,
+            variant="fp16",
+            use_safetensors=True
+        ).to(device)
+    # Set seed for reproducibility
+    if seed != -1:
+        torch.manual_seed(seed)
+        np.random.seed(seed)
+    # Set scheduler
+    if scheduler_name in SCHEDULERS:
+        pipe.scheduler = SCHEDULERS[scheduler_name].from_config(pipe.scheduler.config)
+    # Get T5 embeddings for semantic understanding - standardize to 77 tokens like CLIP
+    t5_ids = t5_tok(
+        prompt,
+        return_tensors="pt",
+        padding="max_length",
+        max_length=77,
+        truncation=True
+    ).input_ids.to(device)
+    t5_seq = t5_mod(t5_ids).last_hidden_state
+    # Get proper SDXL CLIP embeddings
+    clip_embeds = encode_sdxl_prompt(pipe, prompt, negative_prompt, device)
+    # Debug shapes
+    print(f"T5 seq shape: {t5_seq.shape}")
+    print(f"CLIP-L shape: {clip_embeds['clip_l'].shape}")
+    print(f"CLIP-G shape: {clip_embeds['clip_g'].shape}")
+    # Load adapters
+    adapter_l = load_adapter(repo_l, adapter_l_file, config_l).to(device) if adapter_l_file else None
+    adapter_g = load_adapter(repo_g, adapter_g_file, config_g).to(device) if adapter_g_file else None
+    # Apply CLIP-L adapter
+    if adapter_l is not None:
+        anchor_l, delta_l, log_sigma_l, attn_l1, attn_l2, tau_l, g_pred_l, gate_l = adapter_l(t5_seq, clip_embeds["clip_l"])
+        gate_l_scaled = gate_l * gate_prob
+        delta_l_final = delta_l * strength * gate_l_scaled
+        clip_l_mod = clip_embeds["clip_l"] + delta_l_final
+        if use_anchor:
+            clip_l_mod = clip_l_mod * (1 - gate_l_scaled) + anchor_l * gate_l_scaled
+        if noise > 0:
+            clip_l_mod += torch.randn_like(clip_l_mod) * noise
+    else:
+        clip_l_mod = clip_embeds["clip_l"]
+        delta_l_final = torch.zeros_like(clip_embeds["clip_l"])
+        gate_l_scaled = torch.zeros_like(clip_embeds["clip_l"])
+        g_pred_l = torch.tensor(0.0)
+        tau_l = torch.tensor(0.0)
+    # Apply CLIP-G adapter
+    if adapter_g is not None:
+        anchor_g, delta_g, log_sigma_g, attn_g1, attn_g2, tau_g, g_pred_g, gate_g = adapter_g(t5_seq, clip_embeds["clip_g"])
+        gate_g_scaled = gate_g * gate_prob
+        delta_g_final = delta_g * strength * gate_g_scaled
+        clip_g_mod = clip_embeds["clip_g"] + delta_g_final
+        if use_anchor:
+            clip_g_mod = clip_g_mod * (1 - gate_g_scaled) + anchor_g * gate_g_scaled
+        if noise > 0:
+            clip_g_mod += torch.randn_like(clip_g_mod) * noise
+    else:
+        clip_g_mod = clip_embeds["clip_g"]
+        delta_g_final = torch.zeros_like(clip_embeds["clip_g"])
+        gate_g_scaled = torch.zeros_like(clip_embeds["clip_g"])
+        g_pred_g = torch.tensor(0.0)
+        tau_g = torch.tensor(0.0)
+    # Combine embeddings in SDXL format: [CLIP-L(768) + CLIP-G(1280)] = 2048
+    prompt_embeds = torch.cat([clip_l_mod, clip_g_mod], dim=-1).to(dtype)
+    neg_embeds = torch.cat([clip_embeds["neg_clip_l"], clip_embeds["neg_clip_g"]], dim=-1).to(dtype)
+    # Generate image with proper SDXL parameters
+    image = pipe(
+        prompt_embeds=prompt_embeds,
+        pooled_prompt_embeds=clip_embeds["pooled"],
+        negative_prompt_embeds=neg_embeds,
+        negative_pooled_prompt_embeds=clip_embeds["neg_pooled"],
+        num_inference_steps=steps,
+        guidance_scale=cfg_scale,
+        width=width,
+        height=height,
+        num_images_per_prompt=1,  # Explicitly set this
+        generator=torch.Generator(device=device).manual_seed(seed) if seed != -1 else None
+    ).images[0]
     return (
         image,
         plot_heat(delta_l_final.squeeze().cpu().numpy(), "Δ CLIP-L"),
+        plot_heat(gate_l_scaled.squeeze().cpu().numpy(), "Gate CLIP-L"),
         plot_heat(delta_g_final.squeeze().cpu().numpy(), "Δ CLIP-G"),
         plot_heat(gate_g_scaled.squeeze().cpu().numpy(), "Gate CLIP-G"),
         f"g_pred_l: {g_pred_l.mean().item():.3f}, τ_l: {tau_l.mean().item():.3f}",
         f"g_pred_g: {g_pred_g.mean().item():.3f}, τ_g: {tau_g.mean().item():.3f}"
     )
 # ─── Gradio Interface ─────────────────────────────────────────
 with gr.Blocks(title="SDXL Dual Shunt Adapter", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🧠 SDXL Dual Shunt Adapter • T5→CLIP Enhancement")