AbstractPhil committed (verified) · Commit b235205 · 1 Parent(s): 45ddfa3

Update app.py

Files changed (1)
  1. app.py +38 -74
app.py CHANGED
@@ -87,93 +87,57 @@ def pool_accuracy(ids, logits, pool_mask):
 def encode_and_trace(text, selected_roles):
     if not selected_roles:
         selected_roles = SYMBOLIC_ROLES
     sel_ids = [tokenizer.convert_tokens_to_ids(t) for t in selected_roles]
-    sel_ids_tensor = torch.tensor(sel_ids, device="cuda")

-    # Tokenize
     batch = tokenizer(text, return_tensors="pt").to("cuda")
-    ids, attn = batch.input_ids, batch.attention_mask
-    S = ids.shape[1]

-    # Encode helper
     def encode(input_ids, attn_mask):
-        x = embeddings(input_ids)
         if emb_ln: x = emb_ln(x)
         if emb_drop: x = emb_drop(x)
-        ext = full_model.bert.get_extended_attention_mask(attn_mask, x.shape[:-1])
-        return encoder(x, attention_mask=ext)[0]  # shape: (1, S, H)
-
-    encoded = encode(ids, attn)
-
-    # Project symbolic token embeddings
-    symbolic_embeds = embeddings.word_embeddings(sel_ids_tensor)  # shape: (R, H)
-    sim = cosine(encoded[0].unsqueeze(1), symbolic_embeds.unsqueeze(0))  # (S, R)
-    maxcos, argrole = sim.max(-1)  # (S,)
-    top_roles = [selected_roles[i] for i in argrole.tolist()]
-    sort_idx = maxcos.argsort(descending=True)
-    hi_idx = sort_idx[:S // 2]
-    lo_idx = sort_idx[S // 2:]
-
-    MASK_ID = tokenizer.mask_token_id or tokenizer.convert_tokens_to_ids("[MASK]")
-
-    # Final pool evaluator
-    def evaluate_pool(idx_order, label, ids):
-        best_pool, best_acc = [], 0.0
-        ptr = 0
-        while ptr < len(idx_order):
-            cand = idx_order[ptr:ptr + 2]
-            pool = best_pool + cand.tolist()
-            ptr += 2
-
-            mask_flags = torch.zeros_like(ids, dtype=torch.bool)
-            mask_flags[0, pool] = True
-            masked_input = ids.where(mask_flags, MASK_ID)
-
-            encoded_m = encode(masked_input, attn)
-            logits = mlm_head(encoded_m)  # (1, S, V)
-            preds = logits.argmax(-1)  # (1, S)
-
-            masked_positions = (~mask_flags[0]).nonzero(as_tuple=True)[0]  # 1D tensor
-            if masked_positions.numel() == 0:
-                continue
-
-            # Extract both predicted and gold tokens
-            pred_tokens = preds[0, masked_positions]
-            gold_tokens = ids[0, masked_positions]
-            correct = (pred_tokens == gold_tokens).float()
-            acc = correct.mean().item()
-
-            if acc > best_acc:
-                best_pool, best_acc = pool, acc
-            if acc >= 0.5:
-                break
-
-        return best_pool, best_acc
-
-    # Run both pools
-    pool_hi, acc_hi = evaluate_pool(hi_idx, "high", ids)
-    pool_lo, acc_lo = evaluate_pool(lo_idx, "low", ids)
-
-    # Alignment trace
-    decoded_tokens = tokenizer.convert_ids_to_tokens(ids[0])
     role_trace = [
-        f"{tok:<15} → {role} cos={score:.4f}"
-        for tok, role, score in zip(decoded_tokens, top_roles, maxcos.tolist())
     ]

-    # Return results
     res_json = {
-        "High-pool tokens": tokenizer.decode(ids[0, pool_hi]),
-        "High accuracy": f"{acc_hi:.3f}",
-        "Low-pool tokens": tokenizer.decode(ids[0, pool_lo]),
-        "Low accuracy": f"{acc_lo:.3f}",
-        "Token–Symbolic Role Alignment": role_trace
     }

-    return json.dumps(res_json, indent=2), f"{maxcos.max():.4f}", len(selected_roles)
-
-
-


 # ------------------------------------------------------------------

 def encode_and_trace(text, selected_roles):
     if not selected_roles:
         selected_roles = SYMBOLIC_ROLES
+
+    # Convert symbolic role tokens to IDs
     sel_ids = [tokenizer.convert_tokens_to_ids(t) for t in selected_roles]
+    sel_ids_tensor = torch.tensor(sel_ids, device="cuda").unsqueeze(0)  # shape: (1, R)

+    # Tokenize user prompt
     batch = tokenizer(text, return_tensors="pt").to("cuda")
+    input_ids, attention_mask = batch.input_ids, batch.attention_mask
+    S = input_ids.shape[1]

+    # === Shared encoder logic with RoPE ===
     def encode(input_ids, attn_mask):
+        x = embeddings(input_ids)  # (B, S, H)
         if emb_ln: x = emb_ln(x)
         if emb_drop: x = emb_drop(x)
+        ext = full_model.bert.get_extended_attention_mask(attn_mask, input_ids.shape)
+        return encoder(x, attention_mask=ext)[0]  # (B, S, H)
+
+    # Encode prompt
+    encoded_prompt = encode(input_ids, attention_mask)[0]  # (S, H)
+
+    # Encode symbolic roles through same pipeline
+    symbolic_attn = torch.ones_like(sel_ids_tensor)
+    encoded_roles = encode(sel_ids_tensor, symbolic_attn)[0]  # (R, H)
+
+    # === Symbolic classification via cosine similarity ===
+    # Compare each token to each symbolic role → shape: (S, R)
+    token_exp = encoded_prompt.unsqueeze(1).expand(-1, encoded_roles.size(0), -1)  # (S, R, H)
+    role_exp = encoded_roles.unsqueeze(0).expand(encoded_prompt.size(0), -1, -1)   # (S, R, H)
+    sim = F.cosine_similarity(token_exp, role_exp, dim=-1)  # → (S, R)
+
+    argmax_ids = sim.argmax(dim=-1)       # (S,)
+    max_scores = sim.max(dim=-1).values   # (S,)
+    predicted_roles = [selected_roles[i] for i in argmax_ids.tolist()]
+    decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
+
+    # === Build readable trace
     role_trace = [
+        f"{tok:<15} → {role:<22} score={score:.4f}"
+        for tok, role, score in zip(decoded_tokens, predicted_roles, max_scores.tolist())
     ]

+    # === Final output
     res_json = {
+        "Prompt": text,
+        "Predicted symbolic roles": predicted_roles,
+        "Max alignment score": f"{max_scores.max().item():.4f}",
+        "Per-token classification": role_trace
     }

+    return json.dumps(res_json, indent=2), f"{max_scores.max().item():.4f}", len(selected_roles)


 # ------------------------------------------------------------------
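
Below is a minimal, self-contained sketch of the per-token cosine classification that the updated encode_and_trace performs. The sizes S, R, H and the random tensors are placeholders standing in for the real encoder outputs (encoded_prompt and encoded_roles); only the expand + F.cosine_similarity + argmax pattern mirrors the code above.

import torch
import torch.nn.functional as F

# Placeholder sizes: S prompt tokens, R symbolic roles, hidden width H (illustrative only).
S, R, H = 6, 4, 32
encoded_prompt = torch.randn(S, H)   # stands in for encode(input_ids, attention_mask)[0]
encoded_roles = torch.randn(R, H)    # stands in for encode(sel_ids_tensor, symbolic_attn)[0]

# Same pattern as the commit: compare every token against every role -> (S, R).
token_exp = encoded_prompt.unsqueeze(1).expand(-1, R, -1)   # (S, R, H)
role_exp = encoded_roles.unsqueeze(0).expand(S, -1, -1)     # (S, R, H)
sim = F.cosine_similarity(token_exp, role_exp, dim=-1)      # (S, R)

best_role = sim.argmax(dim=-1)        # (S,) index of the closest role per token
best_score = sim.max(dim=-1).values   # (S,) its cosine score

# F.cosine_similarity also broadcasts, so the explicit expand() calls are optional:
sim_b = F.cosine_similarity(encoded_prompt[:, None, :], encoded_roles[None, :, :], dim=-1)
assert torch.allclose(sim, sim_b)

In the app itself, both operands come from the same encode() helper, so prompt tokens and symbolic roles are compared in the encoder's output space rather than against raw word_embeddings as in the removed version.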