AbstractPhil committed on
Commit ed080e6 · verified · 1 Parent(s): aa28bbb

Update app.py

Files changed (1)
  1. app.py +88 -84
app.py CHANGED
@@ -1,10 +1,10 @@
-# app.py – encoder-only demo for bert-beatrix-2048 + role-probe
-# ------------------------------------------------------------
+# app.py – encoder-only demo for bert-beatrix-2048
 # launch: python app.py
-# (gradio UI appears at http://localhost:7860)
 
-import json, sys
+import json
+import sys
 from pathlib import Path, PurePosixPath
+from itertools import islice
 
 import gradio as gr
 import spaces
@@ -14,41 +14,40 @@ from huggingface_hub import snapshot_download
 
 from bert_handler import create_handler_from_checkpoint
 
-
 # ------------------------------------------------------------------
 # 0. Download & patch config.json --------------------------------
 # ------------------------------------------------------------------
-REPO_ID = "AbstractPhil/bert-beatrix-2048"
-LOCAL_DIR = "bert-beatrix-2048"  # local cache dir
+REPO_ID = "AbstractPhil/bert-beatrix-2048"
+LOCAL_CKPT = "bert-beatrix-2048"
 
 snapshot_download(
     repo_id=REPO_ID,
     revision="main",
-    local_dir=LOCAL_DIR,
+    local_dir=LOCAL_CKPT,
     local_dir_use_symlinks=False,
 )
 
-cfg_path = Path(LOCAL_DIR) / "config.json"
+cfg_path = Path(LOCAL_CKPT) / "config.json"
 with cfg_path.open() as f:
     cfg = json.load(f)
 
-auto_map = cfg.get("auto_map", {})
-patched = False
+auto_map = cfg.get("auto_map", {})
+patched = False
 for k, v in auto_map.items():
-    if "--" in v:  # e.g. "repo--module.Class"
+    if "--" in v:  # "repo--module.Class"
         auto_map[k] = PurePosixPath(v.split("--", 1)[1]).as_posix()
         patched = True
 
 if patched:
+    cfg["auto_map"] = auto_map
     with cfg_path.open("w") as f:
         json.dump(cfg, f, indent=2)
-    print("🛠️ Patched config.json → auto_map fixed.")
-
+    print("🛠️ Patched config.json → auto_map paths fixed")
 
 # ------------------------------------------------------------------
 # 1. Model / tokenizer -------------------------------------------
 # ------------------------------------------------------------------
-handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_DIR)
+handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_CKPT)
 full_model = full_model.eval().cuda()
 
 encoder = full_model.bert.encoder
@@ -56,7 +55,6 @@ embeddings = full_model.bert.embeddings
 emb_ln = full_model.bert.emb_ln
 emb_drop = full_model.bert.emb_drop
 
-
 # ------------------------------------------------------------------
 # 2. Symbolic token set ------------------------------------------
 # ------------------------------------------------------------------
@@ -68,105 +66,111 @@ SYMBOLIC_ROLES = [
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
-ROLE_ID = {tok: tokenizer.convert_tokens_to_ids(tok) for tok in SYMBOLIC_ROLES}
-missing = [tok for tok, tid in ROLE_ID.items() if tid == tokenizer.unk_token_id]
+
+unk = tokenizer.unk_token_id
+missing = [t for t in SYMBOLIC_ROLES if tokenizer.convert_tokens_to_ids(t) == unk]
 if missing:
     sys.exit(f"❌ Tokenizer is missing {missing}")
 
+# ------------------------------------------------------------------
+# 3. helper: merge lowest + highest until 3 remain ----------------
+# ------------------------------------------------------------------
+def reduce_to_three(table):
+    """
+    table : list of dicts {role, token, score}
+    repeatedly remove lowest and highest,
+    replace with their average,
+    until len(table)==3.
+    """
+    working = table[:]
+    working.sort(key=lambda x: x["score"])
+    while len(working) > 3:
+        low = working.pop(0)
+        high = working.pop(-1)
+        merged = {
+            "role": f"{high['role']}|{low['role']}",
+            "token": f"{high['token']}/{low['token']}",
+            "score": (high["score"] + low["score"]) / 2.0,
+        }
+        working.append(merged)
+        working.sort(key=lambda x: x["score"])
+    # highest first for display
+    working.sort(key=lambda x: x["score"], reverse=True)
+    return working
 
 # ------------------------------------------------------------------
-# 3. Encoder-only + role-similarity probe ------------------------
+# 4. Encoder-only inference util ---------------------------------
 # ------------------------------------------------------------------
 @spaces.GPU
 def encode_and_trace(text: str, selected_roles: list[str]):
-    """
-    For each *selected* role:
-      • find the contextual token whose hidden state is most similar to that
-        role's own embedding (cosine similarity)
-      • return "role → token (sim)", using tokens even when the prompt
-        contained no <role> markers at all.
-    Also keeps the older diagnostics.
-    """
     with torch.no_grad():
+        if not text.strip():
+            return "(no input)","",""
+
        batch = tokenizer(text, return_tensors="pt").to("cuda")
-        ids, mask = batch.input_ids, batch.attention_mask  # (1, S)
+        ids, attn = batch.input_ids, batch.attention_mask
 
-        # ---------- encoder ----------
-        x = emb_drop(emb_ln(embeddings(ids)))
-        msk = full_model.bert.get_extended_attention_mask(mask, x.shape[:-1])
-        h = encoder(x, attention_mask=msk).squeeze(0)  # (S, H)
+        # encoder forward
+        x = emb_drop(emb_ln(embeddings(ids)))
+        ext = full_model.bert.get_extended_attention_mask(attn, x.shape[:-1])
+        hs = encoder(x, attention_mask=ext)  # (1,S,H)
 
-        # L2-normalise hidden states once
-        h_norm = F.normalize(h, dim=-1)  # (S, H)
+        # token-level embeddings (before LN) for similarity calc
+        token_emb = embeddings(ids).squeeze(0)  # (S,H)
 
-        # ---------- probe each selected role -----------------------
-        matches = []
+        rows = []
        for role in selected_roles:
-            role_vec = embeddings.word_embeddings.weight[ROLE_ID[role]].to(h.device)
-            role_vec = F.normalize(role_vec, dim=-1)  # (H)
-
-            sims = (h_norm @ role_vec)  # (S)
-            best_idx = int(sims.argmax().item())
-            best_sim = float(sims[best_idx])
-
-            match_tok = tokenizer.convert_ids_to_tokens(int(ids[0, best_idx]))
-            matches.append(f"{role} → {match_tok} ({best_sim:.2f})")
-
-        match_str = ", ".join(matches) if matches else "(no roles selected)"
-
-        # ---------- string-match diagnostics -----------------------
-        present = [tok for tok_id, tok in zip(ids[0].tolist(),
-                                              tokenizer.convert_ids_to_tokens(ids[0]))
-                   if tok in selected_roles]
-        present_str = ", ".join(present) or "(none)"
-        count = len(present)
-
-        # ---------- hidden-state norm of *explicit* role tokens ----
-        if count:
-            exp_mask = torch.tensor([tid in ROLE_ID.values() for tid in ids[0]], device=h.device)
-            norm_val = f"{h[exp_mask].mean(0).norm().item():.4f}"
-        else:
-            norm_val = "0.0000"
-
-        return present_str, match_str, norm_val, count
-
+            rid = tokenizer.convert_tokens_to_ids(role)
+            rvec = embeddings.weight[rid]  # (H,)
+            # cosine similarity to every *input* token embedding
+            sims = F.cosine_similarity(rvec.unsqueeze(0), token_emb, dim=-1)
+            best = torch.argmax(sims).item()
+            rows.append({
+                "role" : role,
+                "token": tokenizer.convert_ids_to_tokens([ids[0, best].item()])[0],
+                "score": sims[best].item()
+            })
+
+        if not rows:
+            return "(none selected)","",""
+
+        final3 = reduce_to_three(rows)
+        out_strs = [f"{r['role']} ↔ {r['token']} ({r['score']:+.2f})" for r in final3]
+        # pad so we always return 3 strings
+        while len(out_strs) < 3:
+            out_strs.append("")
+        return out_strs[0], out_strs[1], out_strs[2]
 
 # ------------------------------------------------------------------
-# 4. Gradio UI ----------------------------------------------------
+# 5. Gradio UI ----------------------------------------------------
 # ------------------------------------------------------------------
 def build_interface():
     with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
         gr.Markdown(
-            "## 🧠 Symbolic Encoder Inspector \n"
-            "Select one or more symbolic *roles* on the left. "
-            "The tool shows which regular tokens (if any) the model thinks "
-            "best fit each role — even when your text doesn't contain the "
-            "explicit `<role>` marker."
+            "### 🧠 Symbolic Encoder Inspector\n"
+            "Paste text with `<role>` tokens, pick roles to track, then we\n"
+            "• compute role ↔ token cosine scores\n"
+            "• iteratively merge low+high pairs until **3 composite buckets** remain."
        )
 
        with gr.Row():
            with gr.Column():
                txt = gr.Textbox(
-                    label="Input text",
+                    label="Input with Symbolic Tokens",
+                    placeholder="Example: A <subject> wearing <upper_body_clothing> …",
                    lines=3,
-                    placeholder="Example: A small child in bright red boots jumps over a muddy puddle…",
                )
                roles = gr.CheckboxGroup(
                    choices=SYMBOLIC_ROLES,
-                    label="Roles to probe",
+                    label="Trace these symbolic roles",
                )
-                btn = gr.Button("Run encoder probe")
+                btn = gr.Button("Encode & Merge")
            with gr.Column():
-                out_present = gr.Textbox(label="Explicit role tokens found")
-                out_match = gr.Textbox(label="Role → Best-Match Token (cos θ)")
-                out_norm = gr.Textbox(label="Mean hidden-state norm (explicit)")
-                out_count = gr.Textbox(label="# explicit role tokens")
-
-        btn.click(
-            encode_and_trace,
-            inputs=[txt, roles],
-            outputs=[out_present, out_match, out_norm, out_count],
-        )
+                cat1 = gr.Textbox(label="Category 1 (highest)")
+                cat2 = gr.Textbox(label="Category 2")
+                cat3 = gr.Textbox(label="Category 3 (lowest)")
+
+        btn.click(encode_and_trace, [txt, roles], [cat1, cat2, cat3])
 
     return demo
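
The merge step this commit introduces (reduce_to_three) can be exercised on its own, outside the Space. Below is a minimal standalone sketch of that lowest+highest pairing logic on made-up scores; the role, token, and score values are invented for illustration and are not output from the model.

# Standalone sketch of the merge logic added in this commit (toy data only).
def reduce_to_three(table):
    # sort ascending by score, then repeatedly average the two extremes
    working = sorted(table, key=lambda x: x["score"])
    while len(working) > 3:
        low, high = working.pop(0), working.pop(-1)
        working.append({
            "role":  f"{high['role']}|{low['role']}",
            "token": f"{high['token']}/{low['token']}",
            "score": (high["score"] + low["score"]) / 2.0,
        })
        working.sort(key=lambda x: x["score"])
    # highest first for display
    return sorted(working, key=lambda x: x["score"], reverse=True)

rows = [
    {"role": "<subject>",  "token": "child",  "score": 0.81},
    {"role": "<relation>", "token": "over",   "score": 0.55},
    {"role": "<intent>",   "token": "jumps",  "score": 0.47},
    {"role": "<style>",    "token": "bright", "score": 0.34},
    {"role": "<fabric>",   "token": "boots",  "score": 0.12},
]
for r in reduce_to_three(rows):
    # prints three "role ↔ token (score)" lines; two are composite buckets
    print(f"{r['role']} ↔ {r['token']} ({r['score']:+.2f})")

With five selected roles, two merge passes leave one plain row and two composite buckets, which lines up with the three output boxes the updated UI exposes.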