AbstractPhil committed
Commit aa28bbb · verified · 1 Parent(s): 415afa1

Update app.py

Files changed (1): app.py +90 -56
app.py CHANGED
@@ -1,52 +1,54 @@
-# app.py – encoder-only demo for bert-beatrix-2048
-# ------------------------------------------------
-# launch: python app.py → http://localhost:7860
-
-import json, re, sys
-from pathlib import Path, PurePosixPath          # ← PurePosixPath import
 import gradio as gr
 import spaces
 import torch
 from huggingface_hub import snapshot_download

 from bert_handler import create_handler_from_checkpoint


 # ------------------------------------------------------------------
-# 0. Download & patch config.json ---------------------------------
 # ------------------------------------------------------------------
-REPO_ID = "AbstractPhil/bert-beatrix-2048"
-LOCAL_CKPT = "bert-beatrix-2048"          # cache dir name

 snapshot_download(
     repo_id=REPO_ID,
     revision="main",
-    local_dir=LOCAL_CKPT,
     local_dir_use_symlinks=False,
 )

-cfg_path = Path(LOCAL_CKPT) / "config.json"
 with cfg_path.open() as f:
     cfg = json.load(f)

 auto_map = cfg.get("auto_map", {})
 patched = False
 for k, v in auto_map.items():
-    if "--" in v:                         # strip repo--module.Class
         auto_map[k] = PurePosixPath(v.split("--", 1)[1]).as_posix()
         patched = True

 if patched:
-    cfg["auto_map"] = auto_map
     with cfg_path.open("w") as f:
         json.dump(cfg, f, indent=2)
-    print("🛠️ Patched config.json → auto_map now points to local modules")


 # ------------------------------------------------------------------
-# 1. Load model / tokenizer ---------------------------------------
 # ------------------------------------------------------------------
-handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_CKPT)
 full_model = full_model.eval().cuda()

 encoder = full_model.bert.encoder
@@ -56,7 +58,7 @@ emb_drop = full_model.bert.emb_drop


 # ------------------------------------------------------------------
-# 2. Symbolic roles ------------------------------------------------
 # ------------------------------------------------------------------
 SYMBOLIC_ROLES = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
@@ -66,48 +68,67 @@ SYMBOLIC_ROLES = [
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
-
-missing = [t for t in SYMBOLIC_ROLES
-           if tokenizer.convert_tokens_to_ids(t) == tokenizer.unk_token_id]
 if missing:
     sys.exit(f"❌ Tokenizer is missing {missing}")


 # ------------------------------------------------------------------
-# 3. Encoder-only helper ------------------------------------------
 # ------------------------------------------------------------------
 @spaces.GPU
 def encode_and_trace(text: str, selected_roles: list[str]):
     """
-    Returns **exactly three scalars** matching the 3 gradio outputs:
-      1) tokens_str (comma-separated list found)
-      2) norm_str   (mean L2-norm of those embeddings)
-      3) count_int  (# tokens matched)
     """
     with torch.no_grad():
         batch = tokenizer(text, return_tensors="pt").to("cuda")
-        ids, attn = batch.input_ids, batch.attention_mask
-
-        x = emb_drop(emb_ln(embeddings(ids)))
-        ext = full_model.bert.get_extended_attention_mask(attn, x.shape[:-1])
-        enc = encoder(x, attention_mask=ext)               # (1, S, H)
-
-        role_ids = {tokenizer.convert_tokens_to_ids(t) for t in selected_roles}
-        mask = torch.tensor([tid in role_ids for tid in ids[0].tolist()],
-                            device=enc.device, dtype=torch.bool)
-
-        found = [tokenizer.convert_ids_to_tokens([tid])[0]
-                 for tid in ids[0].tolist() if tid in role_ids]
-        tokens_str = ", ".join(found) or "(none)"
-
-        if mask.any():
-            mean_vec = enc[0][mask].mean(0)
-            norm_str = f"{mean_vec.norm().item():.4f}"
         else:
-            norm_str = "0.0000"

-        count_int = int(mask.sum().item())
-        return tokens_str, norm_str, count_int             # ← three outputs!


 # ------------------------------------------------------------------
@@ -116,23 +137,36 @@ def encode_and_trace(text: str, selected_roles: list[str]):
 def build_interface():
     with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
         gr.Markdown(
-            "### 🧠 Symbolic Encoder Inspector\n"
-            "Paste text that includes the `<role>` tokens and inspect their "
-            "hidden-state statistics."
         )

         with gr.Row():
             with gr.Column():
-                txt = gr.Textbox(label="Input", lines=3,
-                                 placeholder="A <subject> wearing <upper_body_clothing> …")
-                chk = gr.CheckboxGroup(SYMBOLIC_ROLES, label="Roles to trace")
-                run = gr.Button("Encode & Trace")
             with gr.Column():
-                out_tok = gr.Textbox(label="Tokens found")
-                out_norm = gr.Textbox(label="Mean norm")
-                out_cnt = gr.Textbox(label="Token count")
-
-        run.click(encode_and_trace, [txt, chk], [out_tok, out_norm, out_cnt])

     return demo
 
+# app.py – encoder-only demo for bert-beatrix-2048 + role-probe
+# ------------------------------------------------------------
+# launch: python app.py
+# (gradio UI appears at http://localhost:7860)
+
+import json, sys
+from pathlib import Path, PurePosixPath

 import gradio as gr
 import spaces
 import torch
+import torch.nn.functional as F
 from huggingface_hub import snapshot_download

 from bert_handler import create_handler_from_checkpoint


 # ------------------------------------------------------------------
+# 0. Download & patch config.json --------------------------------
 # ------------------------------------------------------------------
+REPO_ID = "AbstractPhil/bert-beatrix-2048"
+LOCAL_DIR = "bert-beatrix-2048"          # local cache dir

 snapshot_download(
     repo_id=REPO_ID,
     revision="main",
+    local_dir=LOCAL_DIR,
     local_dir_use_symlinks=False,
 )

+cfg_path = Path(LOCAL_DIR) / "config.json"
 with cfg_path.open() as f:
     cfg = json.load(f)

 auto_map = cfg.get("auto_map", {})
 patched = False
 for k, v in auto_map.items():
+    if "--" in v:                         # e.g. "repo--module.Class"
         auto_map[k] = PurePosixPath(v.split("--", 1)[1]).as_posix()
         patched = True

 if patched:
     with cfg_path.open("w") as f:
         json.dump(cfg, f, indent=2)
+    print("🛠️ Patched config.json → auto_map fixed.")


 # ------------------------------------------------------------------
+# 1. Model / tokenizer -------------------------------------------
 # ------------------------------------------------------------------
+handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_DIR)
 full_model = full_model.eval().cuda()

 encoder = full_model.bert.encoder


 # ------------------------------------------------------------------
+# 2. Symbolic token set ------------------------------------------
 # ------------------------------------------------------------------
 SYMBOLIC_ROLES = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
+ROLE_ID = {tok: tokenizer.convert_tokens_to_ids(tok) for tok in SYMBOLIC_ROLES}
+missing = [tok for tok, tid in ROLE_ID.items() if tid == tokenizer.unk_token_id]
 if missing:
     sys.exit(f"❌ Tokenizer is missing {missing}")


 # ------------------------------------------------------------------
+# 3. Encoder-only + role-similarity probe ------------------------
 # ------------------------------------------------------------------
 @spaces.GPU
 def encode_and_trace(text: str, selected_roles: list[str]):
     """
+    For each *selected* role:
+      find the contextual token whose hidden state is most similar to that
+      role’s own embedding (cosine similarity)
+      return “role → token (sim)”, using tokens even when the prompt
+      contained no <role> markers at all.
+    Also keeps the older diagnostics.
     """
     with torch.no_grad():
         batch = tokenizer(text, return_tensors="pt").to("cuda")
+        ids, mask = batch.input_ids, batch.attention_mask   # (1, S)
+
+        # ---------- encoder ----------
+        x = emb_drop(emb_ln(embeddings(ids)))
+        msk = full_model.bert.get_extended_attention_mask(mask, x.shape[:-1])
+        h = encoder(x, attention_mask=msk).squeeze(0)        # (S, H)
+
+        # L2-normalise hidden states once
+        h_norm = F.normalize(h, dim=-1)                      # (S, H)
+
+        # ---------- probe each selected role -----------------------
+        matches = []
+        for role in selected_roles:
+            role_vec = embeddings.word_embeddings.weight[ROLE_ID[role]].to(h.device)
+            role_vec = F.normalize(role_vec, dim=-1)         # (H)
+
+            sims = (h_norm @ role_vec)                       # (S)
+            best_idx = int(sims.argmax().item())
+            best_sim = float(sims[best_idx])
+
+            match_tok = tokenizer.convert_ids_to_tokens(int(ids[0, best_idx]))
+            matches.append(f"{role} → {match_tok} ({best_sim:.2f})")
+
+        match_str = ", ".join(matches) if matches else "(no roles selected)"
+
+        # ---------- string-match diagnostics -----------------------
+        present = [tok for tok_id, tok in zip(ids[0].tolist(),
+                                              tokenizer.convert_ids_to_tokens(ids[0]))
+                   if tok in selected_roles]
+        present_str = ", ".join(present) or "(none)"
+        count = len(present)
+
+        # ---------- hidden-state norm of *explicit* role tokens ----
+        if count:
+            exp_mask = torch.tensor([tid in ROLE_ID.values() for tid in ids[0]], device=h.device)
+            norm_val = f"{h[exp_mask].mean(0).norm().item():.4f}"
         else:
+            norm_val = "0.0000"

+        return present_str, match_str, norm_val, count


 # ------------------------------------------------------------------
 def build_interface():
     with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
         gr.Markdown(
+            "## 🧠 Symbolic Encoder Inspector \n"
+            "Select one or more symbolic *roles* on the left. "
+            "The tool shows which regular tokens (if any) the model thinks "
+            "best fit each role — even when your text doesn’t contain the "
+            "explicit `<role>` marker."
         )

         with gr.Row():
             with gr.Column():
+                txt = gr.Textbox(
+                    label="Input text",
+                    lines=3,
+                    placeholder="Example: A small child in bright red boots jumps over a muddy puddle…",
+                )
+                roles = gr.CheckboxGroup(
+                    choices=SYMBOLIC_ROLES,
+                    label="Roles to probe",
+                )
+                btn = gr.Button("Run encoder probe")
             with gr.Column():
+                out_present = gr.Textbox(label="Explicit role tokens found")
+                out_match = gr.Textbox(label="Role → Best-Match Token (cos θ)")
+                out_norm = gr.Textbox(label="Mean hidden-state norm (explicit)")
+                out_count = gr.Textbox(label="# explicit role tokens")
+
+        btn.click(
+            encode_and_trace,
+            inputs=[txt, roles],
+            outputs=[out_present, out_match, out_norm, out_count],
+        )

     return demo
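
The probe added in this commit is easy to check in isolation: normalise the hidden states and a role embedding to unit length, take the dot product, and report the argmax position. A minimal sketch of that cosine-similarity matching follows; the token list and random tensors are made-up stand-ins for the encoder output and the "<subject>" embedding row, not code from the repo.

import torch
import torch.nn.functional as F

torch.manual_seed(0)

tokens = ["a", "small", "child", "jumps"]   # hypothetical token strings
hidden = torch.randn(len(tokens), 8)        # (S, H) stand-in for encoder hidden states
role_vec = torch.randn(8)                   # stand-in for the "<subject>" embedding row

h_norm = F.normalize(hidden, dim=-1)        # unit-normalise every hidden state
r_norm = F.normalize(role_vec, dim=-1)      # unit-normalise the role vector

sims = h_norm @ r_norm                      # (S,) cosine similarities
best = int(sims.argmax())
print(f"<subject> → {tokens[best]} ({float(sims[best]):.2f})")

In the app the same dot product runs over every position of the already-normalised hidden-state matrix, so each selected role reports exactly one best-matching token plus its similarity score.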