Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -90,11 +90,12 @@ def encode_and_trace(text, selected_roles):
     sel_ids = [tokenizer.convert_tokens_to_ids(t) for t in selected_roles]
     sel_ids_tensor = torch.tensor(sel_ids, device="cuda")
 
-    #
+    # Tokenize input
     batch = tokenizer(text, return_tensors="pt").to("cuda")
     ids, attn = batch.input_ids, batch.attention_mask
     S = ids.shape[1]
 
+    # Safe encoder forward
     def encode(input_ids, attn_mask):
         x = embeddings(input_ids)
         if emb_ln: x = emb_ln(x)
@@ -102,25 +103,21 @@ def encode_and_trace(text, selected_roles):
         ext = full_model.bert.get_extended_attention_mask(attn_mask, x.shape[:-1])
         return encoder(x, attention_mask=ext)[0]
 
-    # Full unmasked encoding pass
     encoded = encode(ids, attn)
 
-    #
-    symbolic_embeds = embeddings.word_embeddings(sel_ids_tensor)
-
+    # Get raw symbolic token embeddings directly
+    symbolic_embeds = embeddings.word_embeddings(sel_ids_tensor)  # ✅ FIXED
     sim = cosine(encoded.unsqueeze(1), symbolic_embeds.unsqueeze(0))  # (S, R)
     maxcos, argrole = sim.max(-1)  # (S,)
     top_roles = [selected_roles[i] for i in argrole.tolist()]
-
-    # ========== Sorting into High / Low Alignment Pools ==========
     sort_idx = maxcos.argsort(descending=True)
     hi_idx = sort_idx[:S // 2]
     lo_idx = sort_idx[S // 2:]
 
-    # ========== Greedy Pool Testing ==========
     MASK_ID = tokenizer.mask_token_id or tokenizer.convert_tokens_to_ids("[MASK]")
 
-    def evaluate_pool(idx_order, label):
+    # 🔧 Pass ids into this function
+    def evaluate_pool(idx_order, label, ids):
         best_pool, best_acc = [], 0.0
         ptr = 0
         while ptr < len(idx_order):
@@ -133,15 +130,16 @@ def encode_and_trace(text, selected_roles):
             masked_input = ids.where(mask_flags, MASK_ID)
 
             encoded_m = encode(masked_input, attn)
-            logits = mlm_head(encoded_m)[0]  #
-
+            logits = mlm_head(encoded_m)[0]  # ✅ FIXED → direct tensor
             preds = logits.argmax(-1)
 
             masked_positions = (~mask_flags[0]).nonzero(as_tuple=False).squeeze(-1)
             if masked_positions.numel() == 0:
                 continue
 
-
+            # ✅ FIXED: indexing from explicitly passed ids
+            gold = ids[0][masked_positions]
+            correct = (preds[masked_positions] == gold).float()
             acc = correct.mean().item()
 
             if acc > best_acc:
@@ -151,17 +149,18 @@ def encode_and_trace(text, selected_roles):
 
         return best_pool, best_acc
 
-    pool_hi, acc_hi = evaluate_pool(hi_idx, "high")
-    pool_lo, acc_lo = evaluate_pool(lo_idx, "low")
+    # Run both pool evaluations
+    pool_hi, acc_hi = evaluate_pool(hi_idx, "high", ids)
+    pool_lo, acc_lo = evaluate_pool(lo_idx, "low", ids)
 
-    #
+    # Per-token symbolic trace
     decoded_tokens = tokenizer.convert_ids_to_tokens(ids[0])
     role_trace = [
         f"{tok:<15} → {role} cos={score:.4f}"
         for tok, role, score in zip(decoded_tokens, top_roles, maxcos.tolist())
     ]
 
-    #
+    # Output JSON
     res_json = {
         "High-pool tokens": tokenizer.decode(ids[0, pool_hi]),
         "High accuracy": f"{acc_hi:.3f}",
@@ -174,6 +173,7 @@ def encode_and_trace(text, selected_roles):
 
 
 
+
 # ------------------------------------------------------------------
 # 4. Gradio UI -----------------------------------------------------
 def build_interface():