Spaces: Running on Zero

AbstractPhil committed · ca066a9 · 1 Parent(s): 11aea4e

initial push for v1

Browse files:
- app.py +121 -142
- configs.py +149 -0
- two_stream_shunt_adapter.py +123 -0
app.py
CHANGED
@@ -1,153 +1,132 @@
+import torch
 import gradio as gr
 import numpy as np
-import …
-
-# import spaces #[uncomment to use ZeroGPU]
+import matplotlib.pyplot as plt
+from transformers import T5Tokenizer, T5EncoderModel
 from diffusers import DiffusionPipeline
-[old lines 7-39: the starter template's remaining imports, constants, and example infer() definition; not recoverable from this capture]
+from safetensors.torch import load_file
+from huggingface_hub import hf_hub_download
+from PIL import Image  # added: plot_heat below returns a PIL image
+from two_stream_shunt_adapter import TwoStreamShuntAdapter  # fixed: was "shunt_adapter", but this commit adds two_stream_shunt_adapter.py
+from configs import T5_SHUNT_REPOS  # fixed: was "adapter_config", but this commit adds configs.py
+
+# ─── Device & Model Setup ─────────────────────────────────────
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+t5_tok = T5Tokenizer.from_pretrained("google/flan-t5-base")
+t5_mod = T5EncoderModel.from_pretrained("google/flan-t5-base").to(device).eval()
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=dtype,
+    variant="fp16" if dtype == torch.float16 else None
+).to(device)
+
+# ─── Adapter Configs ──────────────────────────────────────────
+clip_l_opts = T5_SHUNT_REPOS["clip_l"]["shunts_available"]["shunt_list"]
+clip_g_opts = T5_SHUNT_REPOS["clip_g"]["shunts_available"]["shunt_list"]
+repo_l = T5_SHUNT_REPOS["clip_l"]["repo"]
+repo_g = T5_SHUNT_REPOS["clip_g"]["repo"]
+config_l = T5_SHUNT_REPOS["clip_l"]["config"]
+config_g = T5_SHUNT_REPOS["clip_g"]["config"]
+
+# ─── Loader ───────────────────────────────────────────────────
+def load_adapter(repo, filename, config):
+    path = hf_hub_download(repo_id=repo, filename=filename)
+    model = TwoStreamShuntAdapter(config).to(device).eval()
+    model.load_state_dict(load_file(path, device=str(device)))  # fixed: safetensors expects a device string
+    return model
+
+# ─── Visualization ────────────────────────────────────────────
+def plot_heat(mat, title):
+    import io
+    fig, ax = plt.subplots(figsize=(6, 3), dpi=100)
+    im = ax.imshow(mat, aspect="auto", cmap="bwr", origin="upper")
+    ax.set_title(title)
+    plt.colorbar(im, ax=ax)
+    buf = io.BytesIO()
+    plt.savefig(buf, format="png", bbox_inches="tight")
+    plt.close(fig)  # fixed: close the figure so repeated calls do not leak memory
+    buf.seek(0)
+    return Image.open(buf)  # fixed: gr.Image accepts a PIL image, not a raw BytesIO
+
+# ─── Inference ────────────────────────────────────────────────
+@torch.no_grad()
+def infer(prompt, adapter_l_file, adapter_g_file, strength, noise, gate_prob, use_anchor):
+    t5_ids = t5_tok(prompt, return_tensors="pt").input_ids.to(device)
+    t5_seq = t5_mod(t5_ids).last_hidden_state
+
+    adapter_l = load_adapter(repo_l, adapter_l_file, config_l)
+    adapter_g = load_adapter(repo_g, adapter_g_file, config_g)
+
+    # Random stand-ins for the two CLIP streams; the adapters write gated deltas onto these.
+    clip_l_in = torch.randn(t5_seq.shape[0], 77, 768).to(device)
+    clip_g_in = torch.randn(t5_seq.shape[0], 77, 1280).to(device)
+
+    anchor_l, delta_l, log_sigma_l, attn_l1, attn_l2, tau_l, g_pred_l, gate_l = adapter_l(t5_seq, clip_l_in)
+    gate_l_scaled = gate_l * gate_prob
+    delta_l_final = delta_l * strength * gate_l_scaled
+    clip_l_mod = clip_l_in + delta_l_final
+    if use_anchor:
+        clip_l_mod = clip_l_mod * (1 - gate_l_scaled) + anchor_l * gate_l_scaled
+    if noise > 0:
+        clip_l_mod += torch.randn_like(clip_l_mod) * noise
+
+    anchor_g, delta_g, log_sigma_g, attn_g1, attn_g2, tau_g, g_pred_g, gate_g = adapter_g(t5_seq, clip_g_in)
+    gate_g_scaled = gate_g * gate_prob
+    delta_g_final = delta_g * strength * gate_g_scaled
+    clip_g_mod = clip_g_in + delta_g_final
+    if use_anchor:
+        clip_g_mod = clip_g_mod * (1 - gate_g_scaled) + anchor_g * gate_g_scaled
+    if noise > 0:
+        clip_g_mod += torch.randn_like(clip_g_mod) * noise
+
+    # SDXL expects CLIP-L and CLIP-G features concatenated on the channel axis (768 + 1280 = 2048).
+    prompt_embeds = torch.cat([clip_l_mod, clip_g_mod], dim=-1).to(dtype)
+    neg_embeds = torch.zeros_like(prompt_embeds)
 
     image = pipe(
-        [old lines 42-45: the template's prompt and guidance arguments; not recoverable from this capture]
-        width=width,
-        height=height,
-        generator=generator,
+        prompt_embeds=prompt_embeds,
+        negative_prompt_embeds=neg_embeds,
+        # NOTE: StableDiffusionXLPipeline also requires pooled_prompt_embeds and
+        # negative_pooled_prompt_embeds whenever prompt_embeds is passed directly.
+        num_inference_steps=20,
+        guidance_scale=5.0
     ).images[0]
 
-    return …
-[old lines 52-85: the template's return values, example prompts, and Blocks layout down to the negative-prompt box; not recoverable from this capture]
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
-            )
-
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
-            )
-
-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
-            with gr.Row():
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-
-            with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=0.0,
-                    maximum=10.0,
-                    step=0.1,
-                    value=0.0,  # Replace with defaults that work for your model
-                )
-
-                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
-                    minimum=1,
-                    maximum=50,
-                    step=1,
-                    value=2,  # Replace with defaults that work for your model
-                )
-
-        gr.Examples(examples=examples, inputs=[prompt])
-    gr.on(
-        triggers=[run_button.click, prompt.submit],
+    return (
+        image,
+        plot_heat(delta_l_final.squeeze().cpu().numpy(), "Δ CLIP-L"),
+        plot_heat(gate_l_scaled.squeeze().cpu().numpy(), "Gate CLIP-L"),
+        plot_heat(delta_g_final.squeeze().cpu().numpy(), "Δ CLIP-G"),
+        plot_heat(gate_g_scaled.squeeze().cpu().numpy(), "Gate CLIP-G"),
+        f"g_pred_l: {g_pred_l.mean().item():.3f}, τ_l: {tau_l.mean().item():.3f}",
+        f"g_pred_g: {g_pred_g.mean().item():.3f}, τ_g: {tau_g.mean().item():.3f}"
+    )
+
+# ─── Gradio App ───────────────────────────────────────────────
+with gr.Blocks(title="Dual Adapter T5→CLIP") as demo:
+    gr.Markdown("# 🧠 Dual Shunt Adapter • SDXL Inference")
+
+    with gr.Row():
+        with gr.Column():
+            prompt = gr.Textbox(label="Prompt", value="a futuristic control station")
+            adapter_l = gr.Dropdown(choices=clip_l_opts, label="CLIP-L (768d) Adapter")
+            adapter_g = gr.Dropdown(choices=clip_g_opts, label="CLIP-G (1280d) Adapter")
+            strength = gr.Slider(0.0, 5.0, value=1.0, step=0.1, label="Adapter Strength")
+            noise = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Noise Injection")
+            gate_prob = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Gate Probability")
+            use_anchor = gr.Checkbox(label="Use Anchor", value=True)
+            run_btn = gr.Button("Run")
+
+        with gr.Column():
+            out_img = gr.Image(label="Generated Image")
+            delta_l = gr.Image(label="Δ CLIP-L")
+            gate_l = gr.Image(label="Gate CLIP-L")
+            delta_g = gr.Image(label="Δ CLIP-G")
+            gate_g = gr.Image(label="Gate CLIP-G")
+            stats_l = gr.Textbox(label="CLIP-L Stats")
+            stats_g = gr.Textbox(label="CLIP-G Stats")
+
+    run_btn.click(
         fn=infer,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-        ],
-        outputs=[result, seed],
+        inputs=[prompt, adapter_l, adapter_g, strength, noise, gate_prob, use_anchor],
+        outputs=[out_img, delta_l, gate_l, delta_g, gate_g, stats_l, stats_g]
     )
 
 if __name__ == "__main__":
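The gating arithmetic in infer() reduces to a per-token blend. Below is a minimal sketch, runnable on CPU, with random tensors standing in for the encoder and adapter outputs; the shapes and the blend formula are taken from the diff above, while the tensor values themselves are purely illustrative:

import torch

strength, gate_prob, noise, use_anchor = 1.0, 1.0, 0.0, True
clip_in = torch.randn(1, 77, 768)   # stand-in CLIP-L sequence
delta   = torch.randn(1, 77, 768)   # adapter-predicted residual
anchor  = torch.randn(1, 77, 768)   # adapter-predicted anchor embedding
gate    = torch.rand(1, 77, 1)      # per-token gate in (0, 1)

gate_scaled = gate * gate_prob                       # UI slider rescales the learned gate
clip_mod = clip_in + delta * strength * gate_scaled  # gated residual update
if use_anchor:
    # where the gate is open, pull the embedding toward the anchor
    clip_mod = clip_mod * (1 - gate_scaled) + anchor * gate_scaled
if noise > 0:
    clip_mod = clip_mod + torch.randn_like(clip_mod) * noise
print(clip_mod.shape)  # torch.Size([1, 77, 768])

With gate_prob at 0 the update vanishes and the original CLIP sequence passes through unchanged, which is what the "Gate Probability" slider exploits.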
configs.py
ADDED
@@ -0,0 +1,149 @@
+T5_SHUNT_REPOS = {
+    "clip_g": {
+        "models": ["vit-bigG-14", "flan-t5-base"],
+        "config": {
+            "adapter_id": "003", "name": "DualShuntAdapter-G",
+            "t5": {
+                "model": "google/flan-t5-base",
+                "hidden_size": 768
+            },
+            "clip": {
+                "model": "openai/clip-vit-large-patch14",
+                "hidden_size": 1280
+            },
+            "hidden_size": 1280,  # the adapter's output size
+            "bottleneck": 640, "heads": 20,
+            "tau_init": 0.1, "max_guidance": 10.0,
+            "proj_layers": 2, "layer_norm": True, "dropout": 0.1,
+            "use_dropout": True, "use_proj_stack": True, "assert_input_dims": True,
+            "routing": {"type": "cross_attention", "enable_causal_mask": False, "bidirectional": True},
+            "version": "v0.3.2"
+        },
+        "repo": "AbstractPhil/t5-flan-base-vit-bigG-14-dual-stream-adapter",
+        "shunts_available": {
+            "shunt_type_name": "DualStreamAdapter-G",
+            "config_file_name": "config.json",
+            "shunt_list": [
+                "t5-flan-vit-bigG-14-dual_shunt_caption.safetensors",
+                "t5-flan-vit-bigG-14-dual_shunt_no_caption_e1.safetensors",
+                "t5-flan-vit-bigG-14-dual_shunt_no_caption_e2.safetensors",
+                "t5-flan-vit-bigG-14-dual_shunt_no_caption_e3.safetensors",
+                "t5-flan-vit-bigG-14-dual_shunt_summarize.safetensors",
+                "dual_shunt_omega_no_caption_e1_step_10000.safetensors",
+                "dual_shunt_omega_no_caption_noised_e1_step_1000.safetensors",
+                "dual_shunt_omega_no_caption_noised_e1_step_4000.safetensors",
+                "dual_shunt_omega_no_caption_noised_e1_step_10000.safetensors",
+            ],
+        }
+    },
+    "clip_l": {
+        "models": ["vit-l-14", "flan-t5-base"],
+        "config": {
+            "adapter_id": "002",
+            "name": "DualShuntAdapter",
+            "t5": {"model": "google/flan-t5-base", "hidden_size": 768},
+            "clip": {"model": "openai/clip-vit-large-patch14", "hidden_size": 768},
+            "hidden_size": 768,  # the adapter's output size
+            "bottleneck": 384, "heads": 12,
+            "tau_init": 0.1, "max_guidance": 10.0,
+            "proj_layers": 2, "layer_norm": True, "dropout": 0.1,
+            "use_dropout": True, "use_proj_stack": True, "assert_input_dims": True,
+            "routing": {"type": "cross_attention", "enable_causal_mask": False, "bidirectional": True},
+            "version": "v0.3.2"
+        },
+        "repo": "AbstractPhil/t5-flan-base-vit-l-14-dual-stream-adapter",
+        "shunts_available": {
+            "shunt_type_name": "DualStreamAdapter-L",
+            "config_file_name": "config.json",
+            "shunt_list": [
+                "t5-vit-l-14-dual_shunt_caption.safetensors",
+                "t5-vit-l-14-dual_shunt_no_caption.safetensors",
+                "t5-vit-l-14-dual_shunt_summarize.safetensors",
+            ],
+        },
+    }
+}
+
+# ─── Encoder Configs (BERT & T5) ──────────────────────────────────
+# "use_huggingface" defaults to simple loading from HuggingFace;
+# if False, the loader falls back to repo_name + subfolder.
+
+BERT_CONFIGS = {
+    "mobilebert-base-uncased": {
+        "repo_name": "google/mobilebert-uncased",
+        "use_huggingface": True,
+        "subfolder": "",
+    },
+    "bert-base-uncased": {
+        "repo_name": "bert-base-uncased",
+        "use_huggingface": True,
+    },
+    "bert-large-uncased": {
+        "repo_name": "bert-large-uncased",
+        "use_huggingface": True,
+    },
+    "bert-base-cased": {
+        "repo_name": "bert-base-cased",
+        "use_huggingface": True,
+    }
+}
+
+T5_CONFIGS = {
+    "flan-t5-base": {
+        "repo_name": "google/flan-t5-base",
+        "use_huggingface": True,
+    },
+    "t5-small": {
+        "repo_name": "google-t5/t5-small",
+        "use_huggingface": True,
+    },
+    "t5_small_human_attentive_try2_pass3": {
+        "repo_name": "AbstractPhil/t5_small_human_attentive_try2_pass3",
+        "use_huggingface": True,
+        # The full config is kept here for posterity in case loading from HuggingFace fails.
+        "subfolder": "",
+        "tokenizer": "t5-small",
+        "file_name": "model.safetensors",
+        "config": {
+            "config_file_name": "config.json",
+            "architectures": [
+                "T5ForConditionalGeneration"
+            ],
+            "attention_dropout": 0.3,
+            "classifier_dropout": 0.0,
+            "d_ff": 2048,
+            "d_kv": 64,
+            "d_model": 512,
+            "decoder_start_token_id": 0,
+            "dense_act_fn": "relu",
+            "dropout_rate": 0.0,  # 0.3 during training; disabled for generation
+            "eos_token_id": 1,
+            "feed_forward_proj": "relu",
+            "initializer_factor": 1.0,
+            "is_encoder_decoder": True,
+            "is_gated_act": False,
+            "layer_norm_epsilon": 1e-06,
+            "model_type": "t5",
+            "n_positions": 512,
+            "num_decoder_layers": 6,
+            "num_heads": 8,
+            "num_layers": 6,
+            "output_past": True,
+            "pad_token_id": 0,
+            "relative_attention_max_distance": 128,
+            "relative_attention_num_buckets": 32,
+            "task_specific_params": {
+                "caption": {
+                    "early_stopping": True,
+                    "length_penalty": 1.0,
+                    "max_length": 64,
+                    "num_beams": 4,
+                    "prefix": "caption: "
+                }
+            },
+            "torch_dtype": "float32",
+            "transformers_version": "4.51.3",
+            "use_cache": True,
+            "vocab_size": 32128
+        }
+    }
+}
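For reference, a short sketch of how app.py consumes this registry. hf_hub_download is the same API the loader above uses; picking the first entry of shunt_list is only for illustration:

from huggingface_hub import hf_hub_download
from configs import T5_SHUNT_REPOS

entry = T5_SHUNT_REPOS["clip_l"]
filename = entry["shunts_available"]["shunt_list"][0]              # e.g. the caption shunt
path = hf_hub_download(repo_id=entry["repo"], filename=filename)   # fetches the .safetensors weights
print(path, entry["config"]["hidden_size"])                        # adapter output size: 768 for CLIP-L

The "config" sub-dict is passed straight into TwoStreamShuntAdapter, so every key the adapter reads (bottleneck, heads, tau_init, and so on) lives here rather than in code.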
two_stream_shunt_adapter.py
ADDED
@@ -0,0 +1,123 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# ─── Residual Pocket Block ───────────────────────────────────
+class BottleneckResBlock(nn.Module):
+    def __init__(self, dim, kernel=3, dropout=0.1):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim)
+        self.conv = nn.Conv1d(dim, dim, kernel_size=kernel, padding=kernel // 2, groups=1)
+        self.proj = nn.Sequential(
+            nn.Linear(dim, dim * 2),
+            nn.GELU(),
+            nn.Linear(dim * 2, dim),
+            nn.Dropout(dropout)
+        )
+
+    def forward(self, x):
+        residual = x
+        x = self.norm(x)
+        x = x.transpose(1, 2)             # (B, T, C) -> (B, C, T) for Conv1d
+        x = self.conv(x).transpose(1, 2)  # back to (B, T, C)
+        return residual + self.proj(x)
+
+# ─── Two-Stream Shunt Adapter ────────────────────────────────
+class TwoStreamShuntAdapter(nn.Module):
+    def __init__(self, config: dict):
+        super().__init__()
+        self.config = config
+        self.t5_dim = config["t5"]["hidden_size"]
+        self.clip_dim = config["clip"]["hidden_size"]
+        self.bneck = config["bottleneck"]
+        self.heads = config["heads"]
+        self.tau_init = config["tau_init"]
+        self.max_guidance = config["max_guidance"]
+
+        use_norm = config.get("layer_norm", True)
+        use_do = config.get("use_dropout", True)
+        do_p = config.get("dropout", 0.1)
+        proj_depth = config.get("proj_layers", 2)
+
+        def build_projection(input_dim, output_dim):
+            layers = []
+            last_dim = input_dim
+            if use_norm:
+                layers.append(nn.LayerNorm(last_dim))
+            for i in range(proj_depth):
+                # first layer widens to 2x the bottleneck when stacking more than one layer
+                next_dim = self.bneck * (2 if i == 0 and proj_depth > 1 else 1)
+                layers.append(nn.Linear(last_dim, next_dim))
+                layers.append(nn.GELU())
+                if use_do:
+                    layers.append(nn.Dropout(do_p))
+                last_dim = next_dim
+            layers.append(nn.Linear(last_dim, output_dim))
+            return nn.Sequential(*layers)
+
+        # Projections
+        self.proj_t5 = build_projection(self.t5_dim, self.bneck)
+        self.proj_clip = build_projection(self.clip_dim, self.bneck)
+
+        # Attention
+        self.cross_t2c = nn.MultiheadAttention(self.bneck, self.heads, batch_first=True, dropout=do_p)
+        self.cross_c2t = nn.MultiheadAttention(self.bneck, self.heads, batch_first=True, dropout=do_p)
+        self.tau = nn.Parameter(torch.full((self.heads, 1, 1), self.tau_init))
+
+        # Residual Pocket
+        self.pocket_blocks = nn.Sequential(
+            BottleneckResBlock(self.bneck, dropout=do_p),
+            BottleneckResBlock(self.bneck, dropout=do_p)
+        )
+
+        # Fuse
+        self.fuse = nn.Sequential(
+            nn.LayerNorm(2 * self.bneck),
+            nn.Linear(2 * self.bneck, self.bneck * 2),
+            nn.GELU(),
+            nn.Linear(self.bneck * 2, self.bneck)
+        )
+
+        # Output Projections
+        self.anchor_proj = build_projection(self.bneck, self.clip_dim)
+        self.delta_proj = build_projection(self.bneck, self.clip_dim)
+        self.logsig_proj = build_projection(self.bneck, self.clip_dim)
+
+        self.gate_proj = nn.Sequential(
+            nn.LayerNorm(self.bneck),
+            nn.Linear(self.bneck, self.bneck),
+            nn.GELU(),
+            nn.Linear(self.bneck, 1),
+            nn.Tanh(),     # Tanh followed by Sigmoid bounds the gate to roughly (0.27, 0.73)
+            nn.Sigmoid()
+        )
+
+        self.guidance_proj = nn.Sequential(
+            nn.LayerNorm(self.bneck),
+            nn.Linear(self.bneck, 1),
+            nn.Sigmoid()
+        )
+
+    def forward(self, t5_seq: torch.Tensor, clip_seq: torch.Tensor):
+        if self.config.get("assert_input_dims", True):
+            assert t5_seq.size(-1) == self.t5_dim
+            assert clip_seq.size(-1) == self.clip_dim
+
+        t5_b = self.proj_t5(t5_seq)
+        clip_b = self.proj_clip(clip_seq)
+
+        t2c, attn_t2c = self.cross_t2c(t5_b, clip_b, clip_b, need_weights=True, average_attn_weights=False)
+        c2t, attn_c2t = self.cross_c2t(clip_b, t5_b, t5_b, need_weights=True, average_attn_weights=False)
+
+        pocket = self.pocket_blocks(t2c)
+
+        # pool the T5-side pocket over time and broadcast it along the CLIP sequence
+        pocket_mean = pocket.mean(1, keepdim=True).expand(-1, clip_b.size(1), -1)
+        h = self.fuse(torch.cat([pocket_mean, c2t], dim=-1))
+
+        anchor = self.anchor_proj(h)
+        delta = self.delta_proj(h) * self.gate_proj(h)
+        log_sigma = self.logsig_proj(h)
+
+        g_tok = self.guidance_proj(h).squeeze(-1)
+        g_pred = g_tok.mean(1, keepdim=True) * self.max_guidance
+
+        return anchor, delta, log_sigma, attn_t2c, attn_c2t, self.tau, g_pred, self.gate_proj(h)
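A quick shape check for the adapter, as a sketch: the config dict below mirrors the "clip_l" config from configs.py, and the T5/CLIP sequences are random stand-ins. Note the two streams may have different lengths; the T5 side is pooled before fusion, so the outputs follow the CLIP sequence length:

import torch
from two_stream_shunt_adapter import TwoStreamShuntAdapter

config = {
    "t5": {"hidden_size": 768}, "clip": {"hidden_size": 768},
    "bottleneck": 384, "heads": 12, "tau_init": 0.1, "max_guidance": 10.0,
    "proj_layers": 2, "layer_norm": True, "dropout": 0.1,
    "use_dropout": True, "use_proj_stack": True, "assert_input_dims": True,
}
adapter = TwoStreamShuntAdapter(config).eval()

t5_seq = torch.randn(1, 16, 768)    # T5 tokens: any length works
clip_seq = torch.randn(1, 77, 768)  # CLIP stream: 77 tokens
with torch.no_grad():
    anchor, delta, log_sigma, attn_t2c, attn_c2t, tau, g_pred, gate = adapter(t5_seq, clip_seq)

print(anchor.shape, delta.shape, gate.shape)  # (1, 77, 768) (1, 77, 768) (1, 77, 1)
print(g_pred.shape, tau.shape)                # (1, 1) (12, 1, 1)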