Spaces:

AbstractPhil
/

bert-beatrix-2048-testing

Running on Zero

App Files Files Community

AbstractPhil committed 19 days ago

Commit

6cdb3e9

verified ·

1 Parent(s): e6ff6d3

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -59

app.py CHANGED Viewed

@@ -83,89 +83,96 @@ def pool_accuracy(ids, logits, pool_mask):
     return (preds==gold).float().mean().item()
 @spaces.GPU
 def encode_and_trace(text, selected_roles):
-    # if user unchecked everything we treat as "all"
     if not selected_roles:
         selected_roles = SYMBOLIC_ROLES
-    sel_ids = {tokenizer.convert_tokens_to_ids(t) for t in selected_roles}
-    # ---- Tokenise & encode once ----
     batch = tokenizer(text, return_tensors="pt").to("cuda")
-    ids, att = batch.input_ids, batch.attention_mask
-    x = emb_drop(emb_ln(embeddings(ids)))
-    ext = full_model.bert.get_extended_attention_mask(att, x.shape[:-1])
-    enc = encoder(x, attention_mask=ext)[0, :, :]      # (S,H)
-    # ---- compute max-cos per token (F-0/F-1) ----
-    role_mat = embeddings.word_embeddings(
-        torch.tensor(sorted(sel_ids), device=enc.device)
-    )                          # (R,H)
-    cos = cosine(enc.unsqueeze(1), role_mat.unsqueeze(0))  # (S,R)
-    maxcos, argrole = cos.max(-1)                          # (S,)
-    # ---- split tokens into High / Low half (F-2) ----
-    S = len(ids[0])
     sort_idx = maxcos.argsort(descending=True)
-    hi_idx   = sort_idx[: S//2]
-    lo_idx   = sort_idx[S//2:]
-    # container for summary text
-    report_lines = []
-    # ───────────────────────────────────────────────────────────────
-    # 3.  Encoder-only inference util  (FIXED)                       │
-    # ───────────────────────────────────────────────────────────────
-    MASK_ID = tokenizer.mask_token_id or tokenizer.convert_tokens_to_ids("[MASK]")  # <- NEW
-    def greedy_pool(idx_order, tag):
-        """
-        idx_order : tensor of token-indices sorted hi→lo  or  lo→hi
-        tag       : "high" | "low"   (for the debug print)
-        returns   : (best_pool_indices  ,  best_accuracy)
-        """
         best_pool, best_acc = [], 0.0
         ptr = 0
         while ptr < len(idx_order):
-            cand = idx_order[ptr : ptr + 2]               # 2-at-a-time
-            pool = best_pool + cand.tolist()              # grow pool
             ptr += 2
-            # --- build *mask* for “everything NOT in pool” ----------
             mask_flags = torch.zeros_like(ids, dtype=torch.bool)
-            mask_flags[0, pool] = True                   # keep these un-masked
-            masked_ids = ids.where(mask_flags, MASK_ID)  # <-  uses the constant
-            # re-encode & score
-            enc_m   = encode(masked_ids, mask)           # helper already defined
-            logits  = mlm_head(enc_m).logits[0]          # (S, V)
-            preds   = logits.argmax(-1)
-            acc     = (preds[~mask_flags] == ids[0][~mask_flags]).float().mean().item()
-            if acc > best_acc:                           # accept pool only on gain
                 best_pool, best_acc = pool, acc
-                if acc >= 0.50:                          # early-stop rule
                     break
-        print(f"{tag:>4s}-pool  {best_pool}   acc={best_acc:.3f}")
         return best_pool, best_acc
-    pool_lo, acc_lo = greedy_pool(lo_idx, "low")
-    pool_hi, acc_hi = greedy_pool(hi_idx, "high")
-    # ---- package textual result ----
     res_json = {
         "Low-pool tokens": tokenizer.decode(ids[0, pool_lo]),
-        "Low accuracy":    f"{acc_lo:.2f}",
-        "High-pool tokens":tokenizer.decode(ids[0, pool_hi]),
-        "High accuracy":   f"{acc_hi:.2f}",
-        "Trace": "\n".join(report_lines)
     }
-    # three outputs expected by UI
     return json.dumps(res_json, indent=2), f"{maxcos.max():.4f}", len(selected_roles)
 # ------------------------------------------------------------------
 # 4.  Gradio UI -----------------------------------------------------
 def build_interface():

     return (preds==gold).float().mean().item()
+@spaces.GPU
 @spaces.GPU
 def encode_and_trace(text, selected_roles):
     if not selected_roles:
         selected_roles = SYMBOLIC_ROLES
+    sel_ids = [tokenizer.convert_tokens_to_ids(t) for t in selected_roles]
+    sel_ids_tensor = torch.tensor(sel_ids, device="cuda")
+    # ========== Tokenize & Embed ==========
     batch = tokenizer(text, return_tensors="pt").to("cuda")
+    ids, attn = batch.input_ids, batch.attention_mask
+    S = ids.shape[1]
+    def encode(input_ids, attn_mask):
+        x = embeddings(input_ids)
+        if emb_ln: x = emb_ln(x)
+        if emb_drop: x = emb_drop(x)
+        ext = full_model.bert.get_extended_attention_mask(attn_mask, x.shape[:-1])
+        return encoder(x, attention_mask=ext)[0]
+    # Full unmasked encoding pass
+    encoded = encode(ids, attn)
+    # ========== Cosine Similarity ==========
+    symbolic_embeds = embeddings(sel_ids_tensor)  # (R, H)
+    sim = cosine(encoded.unsqueeze(1), symbolic_embeds.unsqueeze(0))  # (S, R)
+    maxcos, argrole = sim.max(-1)  # (S,)
+    top_roles = [selected_roles[i] for i in argrole.tolist()]
+    # ========== Sorting into High / Low Alignment Pools ==========
     sort_idx = maxcos.argsort(descending=True)
+    hi_idx = sort_idx[:S // 2]
+    lo_idx = sort_idx[S // 2:]
+    # ========== Greedy Pool Testing ==========
+    MASK_ID = tokenizer.mask_token_id or tokenizer.convert_tokens_to_ids("[MASK]")
+    def evaluate_pool(idx_order, label):
         best_pool, best_acc = [], 0.0
         ptr = 0
         while ptr < len(idx_order):
+            cand = idx_order[ptr:ptr + 2]
+            pool = best_pool + cand.tolist()
             ptr += 2
             mask_flags = torch.zeros_like(ids, dtype=torch.bool)
+            mask_flags[0, pool] = True
+            masked_input = ids.where(mask_flags, MASK_ID)
+            encoded_m = encode(masked_input, attn)
+            logits = mlm_head(encoded_m).logits[0]
+            preds = logits.argmax(-1)
+            masked_positions = (~mask_flags[0]).nonzero(as_tuple=False).squeeze(-1)
+            if masked_positions.numel() == 0:
+                continue
+            correct = (preds[masked_positions] == ids[0][masked_positions]).float()
+            acc = correct.mean().item()
+            if acc > best_acc:
                 best_pool, best_acc = pool, acc
+                if acc >= 0.5:
                     break
         return best_pool, best_acc
+    pool_hi, acc_hi = evaluate_pool(hi_idx, "high")
+    pool_lo, acc_lo = evaluate_pool(lo_idx, "low")
+    # ========== Per-token Symbolic Role Trace ==========
+    decoded_tokens = tokenizer.convert_ids_to_tokens(ids[0])
+    role_trace = [
+        f"{tok:<15} → {role}   cos={score:.4f}"
+        for tok, role, score in zip(decoded_tokens, top_roles, maxcos.tolist())
+    ]
+    # ========== JSON Result ==========
     res_json = {
+        "High-pool tokens": tokenizer.decode(ids[0, pool_hi]),
+        "High accuracy": f"{acc_hi:.3f}",
         "Low-pool tokens": tokenizer.decode(ids[0, pool_lo]),
+        "Low accuracy": f"{acc_lo:.3f}",
+        "Token–Symbolic Role Alignment": role_trace
     }
     return json.dumps(res_json, indent=2), f"{maxcos.max():.4f}", len(selected_roles)
 # ------------------------------------------------------------------
 # 4.  Gradio UI -----------------------------------------------------
 def build_interface():