AbstractPhil committed
Commit 096fe3a · verified · 1 Parent(s): ec302cb

Update app.py

Files changed (1):
  app.py  +102 -54
app.py CHANGED
@@ -1,55 +1,36 @@
-# Updating the app to use only the encoder from the model, ensuring symbolic support
 
-import spaces
-from bert_handler import create_handler_from_checkpoint
 import torch
 import gradio as gr
-import re
-from pathlib import Path
-from huggingface_hub import snapshot_download
-
-# Load checkpoint using BERTHandler (loads tokenizer and full model)
-checkpoint_path = snapshot_download(
-    repo_id="AbstractPhil/bert-beatrix-2048",
-    revision="main",
-    local_dir="bert-beatrix-2048",
-    local_dir_use_symlinks=False
-)
-handler, model, tokenizer = create_handler_from_checkpoint(checkpoint_path)
-model = model.eval().cuda()
-
-# Extract encoder only (NomicBertModel -> encoder)
-encoder = model.bert.encoder
-embeddings = model.bert.embeddings
-emb_ln = model.bert.emb_ln
-emb_drop = model.bert.emb_drop
 
-@spaces.GPU
-def encode_and_predict(text: str, selected_roles: list[str]):
-    with torch.no_grad():
-        inputs = tokenizer(text, return_tensors="pt").to("cuda")
-        input_ids = inputs.input_ids
-        attention_mask = inputs.attention_mask
-
-        # Run embedding + encoder pipeline
-        x = embeddings(input_ids)
-        x = emb_ln(x)
-        x = emb_drop(x)
-        encoded = encoder(x, attention_mask=attention_mask.bool())
-
-        symbolic_ids = [tokenizer.convert_tokens_to_ids(tok) for tok in selected_roles]
-        symbolic_mask = torch.isin(input_ids, torch.tensor(symbolic_ids, device=input_ids.device))
-
-        masked_tokens = [tokenizer.convert_ids_to_tokens([tid])[0] for tid in input_ids[0] if tid in symbolic_ids]
-        role_reprs = encoded[symbolic_mask].mean(dim=0) if symbolic_mask.any() else torch.zeros_like(encoded[0, 0])
-
-    return {
-        "Symbolic Tokens": masked_tokens,
-        "Embedding Norm": f"{role_reprs.norm().item():.4f}",
-        "Symbolic Token Count": symbolic_mask.sum().item(),
-    }
-
-symbolic_roles = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
     "<surface>", "<lighting>", "<material>", "<accessory>", "<footwear>",
     "<upper_body_clothing>", "<hair_style>", "<hair_length>", "<headwear>",
@@ -58,26 +39,93 @@ symbolic_roles = [
     "<fabric>", "<jewelry>"
 ]
 
 def build_interface():
-    with gr.Blocks() as demo:
-        gr.Markdown("## 🧠 Symbolic Encoder Inspector")
         with gr.Row():
             with gr.Column():
-                input_text = gr.Textbox(label="Input with Symbolic Tokens", lines=3)
-                selected_roles = gr.CheckboxGroup(
-                    choices=symbolic_roles,
-                    label="Which symbolic tokens should be traced?"
                 )
                 run_btn = gr.Button("Encode & Trace")
             with gr.Column():
-                symbolic_tokens = gr.Textbox(label="Symbolic Tokens Found")
-                embedding_norm = gr.Textbox(label="Mean Norm of Symbolic Embeddings")
-                token_count = gr.Textbox(label="Count of Symbolic Tokens")
 
-        run_btn.click(fn=encode_and_predict, inputs=[input_text, selected_roles], outputs=[symbolic_tokens, embedding_norm, token_count])
 
     return demo
 
 if __name__ == "__main__":
     demo = build_interface()
     demo.launch()
 
+# app.py encoder-only demo for bert-beatrix-2048
+# -----------------------------------------------
+# launch: python app.py
+# (gradio UI appears at http://localhost:7860)
+
 import torch
 import gradio as gr
+import spaces
+from bert_handler import create_handler_from_checkpoint
+
+# ------------------------------------------------------------------
+# 1. Model / tokenizer
+# ------------------------------------------------------------------
+#
+# We load the repo *once*, via its canonical name.
+# BERTHandler handles the VRAM-safe cleanup and guarantees that the
+# tokenizer already contains all special tokens saved in the checkpoint.
+
+REPO_ID = "AbstractPhil/bert-beatrix-2048"
+
+handler, full_model, tokenizer = create_handler_from_checkpoint(REPO_ID)
+full_model = full_model.eval().cuda()
+
+# Grab the encoder + embedding stack only
+encoder = full_model.bert.encoder
+embeddings = full_model.bert.embeddings
+emb_ln = full_model.bert.emb_ln
+emb_drop = full_model.bert.emb_drop
+
+# ------------------------------------------------------------------
+# 2. Symbolic token set
+# ------------------------------------------------------------------
+SYMBOLIC_ROLES = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
     "<surface>", "<lighting>", "<material>", "<accessory>", "<footwear>",
     "<upper_body_clothing>", "<hair_style>", "<hair_length>", "<headwear>",
     "<fabric>", "<jewelry>"
 ]
 
+# Quick sanity check – these should *never* resolve to unk
+# (convert_tokens_to_ids falls back to unk_token_id for unknown tokens)
+missing = [tok for tok in SYMBOLIC_ROLES
+           if tokenizer.convert_tokens_to_ids(tok) == tokenizer.unk_token_id]
+if missing:
+    raise RuntimeError(f"Tokenizer is missing {len(missing)} special tokens: {missing}")
+
+# ------------------------------------------------------------------
+# 3. Encoder-only inference util
+# ------------------------------------------------------------------
+@spaces.GPU
+def encode_and_trace(text: str, selected_roles: list[str]):
+    """
+    • encodes `text`
+    • pulls out the hidden states for any of the `selected_roles`
+    • returns some quick stats so we can verify everything is wired up
+    """
+    with torch.no_grad():
+        batch = tokenizer(text, return_tensors="pt").to("cuda")
+        inp_ids, attn_mask = batch.input_ids, batch.attention_mask
+
+        # --- embedding + LayerNorm/dropout ---
+        x = embeddings(inp_ids)
+        x = emb_drop(emb_ln(x))
+
+        # --- proper *additive* attention mask ---
+        ext_mask = full_model.bert.get_extended_attention_mask(
+            attn_mask, x.shape[:-1]
+        )
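+        # (get_extended_attention_mask comes from HF's ModuleUtilsMixin: it
+        #  broadcasts the (B, S) padding mask to (B, 1, 1, S) and converts it
+        #  to additive form, 0.0 where attention is allowed and a large
+        #  negative value where it is not, ready to add to attention scores.)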
+
+        encoded = encoder(x, attention_mask=ext_mask)  # (B, S, H)
+
+        # --- pick out the positions that match selected_roles ---
+        sel_ids = {tokenizer.convert_tokens_to_ids(t) for t in selected_roles}
+        ids_list = inp_ids.squeeze(0).tolist()  # python ints
+        keep_mask = torch.tensor([tid in sel_ids for tid in ids_list],
+                                 device=encoded.device)
+
+        tokens_found = [tokenizer.convert_ids_to_tokens([tid])[0]
+                        for tid in ids_list if tid in sel_ids]
+        if keep_mask.any():
+            repr_vec = encoded.squeeze(0)[keep_mask].mean(0)
+            norm_val = f"{repr_vec.norm().item():.4f}"
+        else:
+            norm_val = "0.0000"
+
+        # Gradio assigns multiple outputs by position, so return a tuple in the
+        # same order as the click() outputs (a plain dict keyed by strings
+        # would not be routed to the three Textboxes).
+        return (
+            ", ".join(tokens_found) or "(none)",
+            norm_val,
+            int(keep_mask.sum().item()),
+        )
+
+# ------------------------------------------------------------------
+# 4. Gradio UI
+# ------------------------------------------------------------------
 def build_interface():
+    with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
+
+        gr.Markdown("## 🧠 Symbolic Encoder Inspector\n"
+                    "Paste some text containing the special `<role>` tokens and "
+                    "inspect their encoder representations.")
+
         with gr.Row():
             with gr.Column():
+                input_text = gr.Textbox(
+                    label="Input with Symbolic Tokens",
+                    placeholder="Example: A <subject> wearing <upper_body_clothing> …",
+                    lines=3,
+                )
+                role_selector = gr.CheckboxGroup(
+                    choices=SYMBOLIC_ROLES,
+                    label="Trace these symbolic roles"
                 )
                 run_btn = gr.Button("Encode & Trace")
             with gr.Column():
+                out_tokens = gr.Textbox(label="Symbolic Tokens Found")
+                out_norm = gr.Textbox(label="Mean Norm")
+                out_count = gr.Textbox(label="Token Count")
 
+        run_btn.click(
+            fn=encode_and_trace,
+            inputs=[input_text, role_selector],
+            outputs=[out_tokens, out_norm, out_count],
+        )
 
     return demo
 
+
 if __name__ == "__main__":
     demo = build_interface()
     demo.launch()
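
A quick sanity run of the new encoder-only path, outside the UI (a sketch, not part of this commit: it assumes `bert_handler` is importable from the working directory, that a CUDA device is available, and that `@spaces.GPU` acts as a pass-through when run outside a Space; `smoke_test.py` is a made-up name):

    # smoke_test.py (hypothetical) - calls encode_and_trace directly.
    # Importing app triggers the checkpoint load and .cuda() at import time.
    from app import encode_and_trace

    sample = "A <subject> wearing <upper_body_clothing> in soft <lighting>"
    tokens, norm, count = encode_and_trace(
        sample, ["<subject>", "<upper_body_clothing>", "<lighting>"]
    )
    print(tokens)  # comma-separated role tokens found, or "(none)"
    print(norm)    # norm of the mean hidden state over those positions
    print(count)   # how many token positions matched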