Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,24 +1,21 @@
-# app.py – encoder-only demo
 # launch: python app.py
-
-import json
-import sys
 from pathlib import Path, PurePosixPath
-from itertools import islice

-import
-import spaces
-import torch
-import torch.nn.functional as F
 from huggingface_hub import snapshot_download

 from bert_handler import create_handler_from_checkpoint

 # ------------------------------------------------------------------
-# 0.
 # ------------------------------------------------------------------
-REPO_ID
-LOCAL_CKPT

 snapshot_download(
     repo_id=REPO_ID,
@@ -28,35 +25,32 @@ snapshot_download(
 )

 cfg_path = Path(LOCAL_CKPT) / "config.json"
-
-
-
-auto_map = cfg.get("auto_map", {})
-patched = False
 for k, v in auto_map.items():
-    if "--" in v:
         auto_map[k] = PurePosixPath(v.split("--", 1)[1]).as_posix()
-
-
-
-
-with cfg_path.open("w") as f:
-    json.dump(cfg, f, indent=2)
-print("🛠️ Patched config.json → auto_map paths fixed")

 # ------------------------------------------------------------------
-# 1.
 # ------------------------------------------------------------------
 handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_CKPT)
 full_model = full_model.eval().cuda()

 encoder    = full_model.bert.encoder
 embeddings = full_model.bert.embeddings
 emb_ln     = full_model.bert.emb_ln
 emb_drop   = full_model.bert.emb_drop

 # ------------------------------------------------------------------
-# 2.
 # ------------------------------------------------------------------
 SYMBOLIC_ROLES = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
@@ -66,114 +60,108 @@ SYMBOLIC_ROLES = [
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
-
-
-missing = [t for t in SYMBOLIC_ROLES if tokenizer.convert_tokens_to_ids(t) == unk]
 if missing:
-    sys.exit(f"❌ Tokenizer

-
-
-# ------------------------------------------------------------------
-def reduce_to_three(table):
-    """
-    table : list of dicts {role, token, score}
-    repeatedly remove lowest and highest,
-    replace with their average,
-    until len(table)==3.
-    """
-    working = table[:]
-    working.sort(key=lambda x: x["score"])
-    while len(working) > 3:
-        low  = working.pop(0)
-        high = working.pop(-1)
-        merged = {
-            "role": f"{high['role']}|{low['role']}",
-            "token": f"{high['token']}/{low['token']}",
-            "score": (high["score"] + low["score"]) / 2.0,
-        }
-        working.append(merged)
-        working.sort(key=lambda x: x["score"])
-    # highest first for display
-    working.sort(key=lambda x: x["score"], reverse=True)
-    return working

 # ------------------------------------------------------------------
-#
-
-
-
     with torch.no_grad():
-
-
-
-
-
-
-
-
-        ext = full_model.bert.get_extended_attention_mask(attn, x.shape[:-1])
-        hs  = encoder(x, attention_mask=ext)                 # (1,S,H)
-
-        # token-level embeddings (before LN) for similarity calc
-        token_emb = embeddings(ids).squeeze(0)               # (S,H)
-
-        rows = []
-        for role in selected_roles:
-            rid  = tokenizer.convert_tokens_to_ids(role)
-            rvec = embeddings.word_embeddings.weight[rid]    # (H,)
-            # cosine similarity to every *input* token embedding
-            sims = F.cosine_similarity(rvec.unsqueeze(0), token_emb, dim=-1)
-            best = torch.argmax(sims).item()
-            rows.append({
-                "role" : role,
-                "token": tokenizer.convert_ids_to_tokens([ids[0, best].item()])[0],
-                "score": sims[best].item()
-            })
-
-        if not rows:
-            return "(none selected)", "", ""
-
-        final3   = reduce_to_three(rows)
-        out_strs = [f"{r['role']} ↔ {r['token']} ({r['score']:+.2f})" for r in final3]
-        # pad so we always return 3 strings
-        while len(out_strs) < 3:
-            out_strs.append("")
-        return out_strs[0], out_strs[1], out_strs[2]

 # ------------------------------------------------------------------
-#
 # ------------------------------------------------------------------
-
     with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
         gr.Markdown(
-            "
-            "
-            "• compute role ↔ token cosine scores\n"
-            "• iteratively merge low+high pairs until **3 composite buckets** remain."
         )
-
         with gr.Row():
             with gr.Column():
-                txt = gr.Textbox(
-                    label="Input with Symbolic Tokens",
-                    placeholder="Example: A <subject> wearing <upper_body_clothing> …",
-                    lines=3,
-                )
                 roles = gr.CheckboxGroup(
-
-
                 )
-                btn = gr.Button("
             with gr.Column():
-
-
-
-
-        btn.click(encode_and_trace, [txt, roles], [cat1, cat2, cat3])

     return demo

-
 if __name__ == "__main__":
-
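For reference, the removed reduce_to_three helper collapsed the role/token score table by repeatedly averaging its lowest- and highest-scoring rows until three composite buckets remained; this is the behavior the new pool-and-test routine replaces. A minimal standalone sketch of that merging, using made-up sample rows (illustrative only, not part of the commit):

def reduce_to_three(table):
    """Merge the lowest- and highest-scoring rows (averaging scores) until three remain."""
    working = sorted(table, key=lambda x: x["score"])
    while len(working) > 3:
        low, high = working.pop(0), working.pop(-1)
        working.append({
            "role":  f"{high['role']}|{low['role']}",
            "token": f"{high['token']}/{low['token']}",
            "score": (high["score"] + low["score"]) / 2.0,
        })
        working.sort(key=lambda x: x["score"])
    working.sort(key=lambda x: x["score"], reverse=True)   # highest first for display
    return working

# Made-up rows for illustration:
rows = [
    {"role": "<subject>", "token": "woman", "score": 0.82},
    {"role": "<pose>",    "token": "stand", "score": 0.41},
    {"role": "<emotion>", "token": "smile", "score": 0.67},
    {"role": "<style>",   "token": "photo", "score": 0.12},
    {"role": "<fabric>",  "token": "silk",  "score": 0.55},
]
print(reduce_to_three(rows))
# → three buckets; <subject> (0.82) and <style> (0.12) are merged first into a 0.47 composite.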
+# app.py – encoder-only demo + pool-and-test prototype
+# ----------------------------------------------------
 # launch: python app.py
+# UI: http://localhost:7860
+import json, re, sys, math
 from pathlib import Path, PurePosixPath

+import torch, torch.nn.functional as F
+import gradio as gr, spaces
 from huggingface_hub import snapshot_download

 from bert_handler import create_handler_from_checkpoint

 # ------------------------------------------------------------------
+# 0.  One-time patch of auto_map in config.json
 # ------------------------------------------------------------------
+REPO_ID    = "AbstractPhil/bert-beatrix-2048"
+LOCAL_CKPT = "bert-beatrix-2048"

 snapshot_download(
     repo_id=REPO_ID,

 )

 cfg_path = Path(LOCAL_CKPT) / "config.json"
+cfg      = json.loads(cfg_path.read_text())
+auto_map = cfg.get("auto_map", {})
+changed  = False
 for k, v in auto_map.items():
+    if "--" in v:                                    # strip “repo--”
         auto_map[k] = PurePosixPath(v.split("--", 1)[1]).as_posix()
+        changed = True
+if changed:
+    cfg_path.write_text(json.dumps(cfg, indent=2))
+    print("🛠️ Patched config.json → auto_map points to local modules")

 # ------------------------------------------------------------------
+# 1.  Load model + tokenizer with BERTHandler
 # ------------------------------------------------------------------
 handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_CKPT)
 full_model = full_model.eval().cuda()

+# pull encoder & embedding stack
 encoder    = full_model.bert.encoder
 embeddings = full_model.bert.embeddings
+emb_weight = embeddings.word_embeddings.weight       # <- correct tensor
 emb_ln     = full_model.bert.emb_ln
 emb_drop   = full_model.bert.emb_drop

 # ------------------------------------------------------------------
+# 2.  Symbolic roles
 # ------------------------------------------------------------------
 SYMBOLIC_ROLES = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",

     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
+missing = [t for t in SYMBOLIC_ROLES
+           if tokenizer.convert_tokens_to_ids(t) == tokenizer.unk_token_id]
 if missing:
+    sys.exit(f"❌ Tokenizer missing {missing}")

+MASK_ID  = tokenizer.mask_token_id
+MASK_TOK = tokenizer.mask_token

 # ------------------------------------------------------------------
+# helpers -----------------------------------------------------------
+def contextual_vectors(ids, mask):
+    """run through embedding→encoder, return (S,H) hidden states"""
+    x = emb_drop(emb_ln(embeddings(ids)))             # (1,S,H)
+    ext = full_model.bert.get_extended_attention_mask(mask, x.shape[:-1])
+    return encoder(x, attention_mask=ext).squeeze(0)  # (S,H)
+
+def pool_accuracy(ids, mask, pool_positions):
+    """mask positions in pool, predict, calc accuracy"""
+    masked = ids.clone()
+    masked[0, pool_positions] = MASK_ID
     with torch.no_grad():
+        logits = full_model(masked, attention_mask=mask).logits[0]
+    preds   = logits.argmax(-1)
+    gold    = ids.squeeze(0)
+    correct = (preds[pool_positions] == gold[pool_positions]).sum().item()
+    return correct / len(pool_positions) if pool_positions else 0.0
+
+# cosine utility
+def cos(a, b): return F.cosine_similarity(a, b, dim=-1, eps=1e-8).item()

 # ------------------------------------------------------------------
+# 3.  Core routine ---------------------------------------------------
+@spaces.GPU
+def encode_and_trace(text: str, picked_roles: list[str]):
+    # -------- tokenise ----------
+    batch = tokenizer(text, return_tensors="pt").to("cuda")
+    ids, attn = batch.input_ids, batch.attention_mask
+    hid = contextual_vectors(ids, attn)               # (S,H)
+
+    # -------- decide which roles we analyse ----------
+    present = {tid: pos for pos, tid in enumerate(ids[0].tolist())
+               if tid in {tokenizer.convert_tokens_to_ids(r) for r in SYMBOLIC_ROLES}}
+    if picked_roles:
+        present = {tid: pos for tid, pos in present.items()
+                   if tokenizer.convert_ids_to_tokens([tid])[0] in picked_roles}
+    if not present:
+        return "No symbolic tokens in sentence", "", ""
+
+    # -------- similarity scores ----------
+    sims = []
+    for tid, pos in present.items():
+        rvec = emb_weight[tid]                        # static embedding
+        cvec = hid[pos]                               # contextual
+        sims.append((cos(cvec, rvec), tid, pos))
+    sims.sort()                                       # low → high
+    # pools: bottom-2, top-2 (expand later)
+    low_pool, high_pool = sims[:2], sims[-2:]
+    accepted = []
+
+    for grow in range(1 + math.ceil(len(sims)/2)):    # ≤26 shots
+        for tag, pool in [("low", low_pool), ("high", high_pool)]:
+            pool_pos = [p for _, _, p in pool]
+            acc = pool_accuracy(ids, attn, pool_pos)
+            if acc >= 0.5:                            # category accepted
+                roles = [tokenizer.convert_ids_to_tokens([tid])[0] for _, tid, _ in pool]
+                accepted.append(f"{tag}:{roles} (acc {acc:.2f})")
+        if accepted: break                            # stop once something passed
+        # grow pools by two (if any left)
+        next_lo = sims[2+grow*2 : 4+grow*2]
+        next_hi = sims[-4-grow*2 : -2-grow*2] if 4+grow*2 <= len(sims) else []
+        low_pool  += next_lo
+        high_pool += next_hi
+
+    if not accepted:
+        accepted = ["(none hit 50 %)"]
+
+    return ", ".join(accepted), f"{len(present)} roles analysed", f"{text[:80]}…"
+
 # ------------------------------------------------------------------
+# 4.  UI -------------------------------------------------------------
+def build_ui():
     with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
         gr.Markdown(
+            "## 🧠 Symbolic Encoder Inspector \n"
+            "Select roles, paste text, and watch the pool-and-test prototype work."
         )
         with gr.Row():
             with gr.Column():
+                txt   = gr.Textbox(lines=3, label="Input")
                 roles = gr.CheckboxGroup(
+                    SYMBOLIC_ROLES,
+                    value=SYMBOLIC_ROLES,
+                    label="Roles to consider (else all present)"
                 )
+                btn = gr.Button("Run")
             with gr.Column():
+                out_cat     = gr.Textbox(label="Accepted categories")
+                out_info    = gr.Textbox(label="Debug")
+                out_excerpt = gr.Textbox(label="Excerpt")

+        btn.click(encode_and_trace, [txt, roles], [out_cat, out_info, out_excerpt])
     return demo

 if __name__ == "__main__":
+    build_ui().launch()
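The heart of the new encode_and_trace is the pool-and-test loop: seed a bottom-2 and a top-2 pool from the similarity list sorted low to high, mask each pool's positions, accept a pool once the masked-LM recovers at least 50 % of its tokens, and otherwise grow both pools by two and retry. A model-free sketch of just that loop follows; the sims tuples and pool_accuracy_stub are made-up stand-ins for the encoder outputs and the real pool_accuracy, so only the slicing and acceptance logic mirror the code above.

import math

# Illustrative stand-ins: (similarity, token_id, position) tuples sorted low → high.
sims = [(0.12, 101, 3), (0.25, 102, 5), (0.40, 103, 7),
        (0.55, 104, 8), (0.70, 105, 11), (0.83, 106, 13)]

def pool_accuracy_stub(pool_positions):
    # Pretend the masked-LM only recovers the tokens at positions 7 and 8.
    if not pool_positions:
        return 0.0
    return sum(p in {7, 8} for p in pool_positions) / len(pool_positions)

low_pool, high_pool = sims[:2], sims[-2:]           # bottom-2 / top-2 seed pools
accepted = []
for grow in range(1 + math.ceil(len(sims) / 2)):
    for tag, pool in [("low", low_pool), ("high", high_pool)]:
        pool_pos = [p for _, _, p in pool]
        acc = pool_accuracy_stub(pool_pos)
        if acc >= 0.5:                              # category accepted at ≥50 % recovery
            accepted.append(f"{tag}: positions {pool_pos} (acc {acc:.2f})")
    if accepted:
        break
    # grow each pool by the next two candidates, mirroring the slicing in app.py
    low_pool  += sims[2 + grow * 2 : 4 + grow * 2]
    high_pool += sims[-4 - grow * 2 : -2 - grow * 2] if 4 + grow * 2 <= len(sims) else []

print(accepted or ["(none hit 50 %)"])
# → ['low: positions [3, 5, 7, 8] (acc 0.50)', 'high: positions [11, 13, 7, 8] (acc 0.50)']

In this sketch both seed pools miss on the first pass, each grows by the next two candidates, and both then clear the 0.5 threshold on the second pass.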