Spaces:

pyvene
/

AxBench-ReFT-r1-16K

Running on Zero

App Files Files Community

frankaging committed on Jan 27

Commit

36edf66

1 Parent(s): 7962ddb

autosteer

Browse files

Files changed (2) hide show

app.py +100 -27
style.css +0 -19

app.py CHANGED Viewed

@@ -2,12 +2,13 @@ import os, json, random
 import torch
 import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from huggingface_hub import login, hf_hub_download
 import pyreft
 import pyvene as pv
 from threading import Thread
 from typing import Iterator
 HF_TOKEN = os.environ.get("HF_TOKEN")
 login(token=HF_TOKEN)
@@ -16,6 +17,18 @@ MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 256  # smaller default to save memory
 MAX_INPUT_TOKEN_LENGTH = 4096
 def load_jsonl(jsonl_path):
     jsonl_data = []
     with open(jsonl_path, 'r') as f:
@@ -29,19 +42,44 @@ class Steer(pv.SourcelessIntervention):
     def __init__(self, **kwargs):
         super().__init__(**kwargs, keep_last_dim=True)
         self.proj = torch.nn.Linear(
-            self.embed_dim, kwargs["latent_dim"], bias=False
-        )
     def forward(self, base, source=None, subspaces=None):
-        if subspaces is None:
             return base
-        steering_vec = []
-        avg_mag = sum(subspaces["mag"]) / len(subspaces["mag"])
-        for idx, mag in zip(subspaces["idx"], subspaces["mag"]):
-            steering_vec.append(self.proj.weight[idx].unsqueeze(dim=0))
-        steering_vec = torch.cat(steering_vec, dim=0).mean(dim=0)
-        steering_vec = avg_mag * steering_vec
         return base + steering_vec
 # Check GPU
 if not torch.cuda.is_available():
     print("Warning: Running on CPU, may be slow.")
@@ -73,7 +111,23 @@ if torch.cuda.is_available():
         concept_id_map[item["concept"]] = concept_reindex
         concept_reindex += 1
-    steer = Steer(embed_dim=params.shape[0], latent_dim=params.shape[1])
     steer.proj.weight.data = params.float()
     pv_model = pv.IntervenableModel({
@@ -117,8 +171,10 @@ def generate(
         "intervene_on_prompt": True,
         "subspaces": [
             {
-                "idx": [int(sl["idx"]) for sl in subspaces_list],
-                "mag": [int(sl["internal_mag"]) for sl in subspaces_list]
             }
         ] if subspaces_list else None,
         "streamer": streamer,
@@ -133,9 +189,6 @@ def generate(
         partial_text.append(token_str)
         yield "".join(partial_text)
-def _build_remove_choices(subspaces):
-    return [f"(+{x['display_mag']:.1f}*) {x['text']}" for x in subspaces]
 def filter_concepts(search_text: str):
     if not search_text.strip():
         return concept_list[:500]
@@ -144,15 +197,21 @@ def filter_concepts(search_text: str):
 def add_concept_to_list(selected_concept, user_slider_val, current_list):
     if not selected_concept:
-        return current_list, gr.update(choices=_build_remove_choices(current_list))
-    idx = concept_id_map[selected_concept]
     internal_mag = user_slider_val * 50
     new_entry = {
         "text": selected_concept,
         "idx": idx,
         "display_mag": user_slider_val,
         "internal_mag": internal_mag,
     }
     # Add to the beginning of the list
     current_list = [new_entry]
@@ -160,16 +219,23 @@ def add_concept_to_list(selected_concept, user_slider_val, current_list):
 def update_dropdown_choices(search_text):
     filtered = filter_concepts(search_text)
-    if not filtered:
-        return gr.update(choices=[], value=None, interactive=True)
     # Automatically select the first matching concept
     return gr.update(
         choices=filtered,
         value=filtered[0],  # Select the first match
-        interactive=True
-    )
-with gr.Blocks(fill_height=True) as demo:
     # Remove default subspaces
     selected_subspaces = gr.State([])
@@ -179,7 +245,7 @@ with gr.Blocks(fill_height=True) as demo:
             chat_interface = gr.ChatInterface(
                 fn=generate,
                 title="Chat with a Concept Steering Model",
-                description="Steer responses by selecting concepts on the right →",
                 type="messages",
                 additional_inputs=[selected_subspaces],
                 fill_height=True
@@ -188,7 +254,7 @@ with gr.Blocks(fill_height=True) as demo:
         # Right side: concept management
         with gr.Column(scale=4):
             gr.Markdown("## Steer Model Responses")
-            gr.Markdown("Search and then select a concept to steer. The closest match will be automatically selected.")
             # Concept Search and Selection
             with gr.Group():
                 search_box = gr.Textbox(
@@ -196,6 +262,7 @@ with gr.Blocks(fill_height=True) as demo:
                     placeholder="Find concepts to steer the model (e.g. 'time travel')",
                     lines=2,
                 )
                 concept_dropdown = gr.Dropdown(
                     label="Select a concept to steer the model (Click to see more!)",
                     interactive=True,
@@ -211,10 +278,10 @@ with gr.Blocks(fill_height=True) as demo:
     # Wire up events
     # When search box changes, update dropdown AND trigger concept selection
-    search_box.change(
         update_dropdown_choices,
         [search_box],
-        [concept_dropdown]
     ).then(  # Chain the events to automatically add the concept
         add_concept_to_list,
         [concept_dropdown, concept_magnitude, selected_subspaces],
@@ -227,6 +294,12 @@ with gr.Blocks(fill_height=True) as demo:
         [selected_subspaces]
     )
     concept_magnitude.input(
         add_concept_to_list,
         [concept_dropdown, concept_magnitude, selected_subspaces],

 import torch
 import gradio as gr
 import spaces
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from huggingface_hub import login, hf_hub_download
 import pyreft
 import pyvene as pv
 from threading import Thread
 from typing import Iterator
+import torch.nn.functional as F
 HF_TOKEN = os.environ.get("HF_TOKEN")
 login(token=HF_TOKEN)
 DEFAULT_MAX_NEW_TOKENS = 256  # smaller default to save memory
 MAX_INPUT_TOKEN_LENGTH = 4096
+css = """
+#alert-message textarea {
+    background-color: #e8f4ff;
+    border: 1px solid #cce5ff;
+    color: #084298;
+    font-size: 1.1em;
+    padding: 12px;
+    border-radius: 4px;
+    font-weight: 500;
+}
+"""
 def load_jsonl(jsonl_path):
     jsonl_data = []
     with open(jsonl_path, 'r') as f:
     def __init__(self, **kwargs):
         super().__init__(**kwargs, keep_last_dim=True)
         self.proj = torch.nn.Linear(
+                self.embed_dim, kwargs["latent_dim"], bias=False)
+        self.subspace_generator = kwargs["subspace_generator"]
     def forward(self, base, source=None, subspaces=None):
+        if subspaces == None:
             return base
+        if subspaces["subspace_gen_inputs"] is not None:
+            # we call our subspace generator to generate the subspace on-the-fly.
+            raw_steering_vec = self.subspace_generator(
+                subspaces["subspace_gen_inputs"]["input_ids"],
+                subspaces["subspace_gen_inputs"]["attention_mask"],
+            )[0]
+            steering_vec = torch.tensor(subspaces["mag"]) * \
+                raw_steering_vec.unsqueeze(dim=0)
+            return base + steering_vec
+        else:
+            steering_vec = torch.tensor(subspaces["mag"]) * \
+                self.proj.weight[subspaces["idx"]].unsqueeze(dim=0)
         return base + steering_vec
+class RegressionWrapper(torch.nn.Module):
+    def __init__(self, base_model, hidden_size, output_dim):
+        super().__init__()
+        self.base_model = base_model
+        self.regression_head = torch.nn.Linear(hidden_size, output_dim)
+    def forward(self, input_ids, attention_mask):
+        outputs = self.base_model.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+            return_dict=True
+        )
+        last_hiddens = outputs.hidden_states[-1]
+        last_token_representations = last_hiddens[:, -1]
+        preds = self.regression_head(last_token_representations)
+        preds = F.normalize(preds, p=2, dim=-1)
+        return preds
 # Check GPU
 if not torch.cuda.is_available():
     print("Warning: Running on CPU, may be slow.")
         concept_id_map[item["concept"]] = concept_reindex
         concept_reindex += 1
+    # load subspace generator.
+    base_tokenizer = AutoTokenizer.from_pretrained(
+        f"google/gemma-2-2b", model_max_length=512)
+    config = AutoConfig.from_pretrained("google/gemma-2-2b")
+    base_model = AutoModelForCausalLM.from_config(config)
+    subspace_generator_weight_path = hf_hub_download(repo_id="pyvene/gemma-reft-2b-it-res-generator", filename="l20/weight.pt")
+    hidden_size = base_model.config.hidden_size
+    subspace_generator = RegressionWrapper(
+        base_model, hidden_size, hidden_size).bfloat16().to("cuda")
+    subspace_generator.load_state_dict(torch.load(subspace_generator_weight_path))
+    print(f"Loading model from saved file {subspace_generator_weight_path}")
+    _ = subspace_generator.eval()
+    steer = Steer(
+        embed_dim=params.shape[0], latent_dim=params.shape[1],
+        subspace_generator=subspace_generator)
     steer.proj.weight.data = params.float()
     pv_model = pv.IntervenableModel({
         "intervene_on_prompt": True,
         "subspaces": [
             {
+                "idx": int(subspaces_list[0]["idx"]),
+                "mag": int(subspaces_list[0]["internal_mag"]),
+                "subspace_gen_inputs": base_tokenizer(subspaces_list[0]["subspace_gen_text"], return_tensors="pt").to("cuda") \
+                    if subspaces_list[0]["subspace_gen_text"] is not None else None
             }
         ] if subspaces_list else None,
         "streamer": streamer,
         partial_text.append(token_str)
         yield "".join(partial_text)
 def filter_concepts(search_text: str):
     if not search_text.strip():
         return concept_list[:500]
 def add_concept_to_list(selected_concept, user_slider_val, current_list):
     if not selected_concept:
+        return current_list
+    selected_concept_text = None
+    if selected_concept.startswith("[New] "):
+        selected_concept_text = selected_concept[6:]
+        idx = 0
+    else:
+        idx = concept_id_map[selected_concept]
     internal_mag = user_slider_val * 50
     new_entry = {
         "text": selected_concept,
         "idx": idx,
         "display_mag": user_slider_val,
         "internal_mag": internal_mag,
+        "subspace_gen_text": selected_concept_text
     }
     # Add to the beginning of the list
     current_list = [new_entry]
 def update_dropdown_choices(search_text):
     filtered = filter_concepts(search_text)
+    if not filtered or len(filtered) == 0:
+        return gr.update(choices=[f"[New] {search_text}"], value=f"[New] {search_text}", interactive=True), gr.Textbox(
+        label="No matching existing concepts were found!",
+        value="Good news! Based on the concept you provided, we will automatically generate a steering vector. Try it out by starting a chat!",
+        lines=3,
+        interactive=False,
+        visible=True,
+        elem_id="alert-message"
+    )
     # Automatically select the first matching concept
     return gr.update(
         choices=filtered,
         value=filtered[0],  # Select the first match
+        interactive=True, visible=True
+    ), gr.Textbox(visible=False)
+with gr.Blocks(css=css, fill_height=True) as demo:
     # Remove default subspaces
     selected_subspaces = gr.State([])
             chat_interface = gr.ChatInterface(
                 fn=generate,
                 title="Chat with a Concept Steering Model",
+                description="""Steer responses by selecting concepts on the right →\n\nWe are using Gemma-2-2B-it with steering vectors added to the residual stream at layer 20. Our auto-steer steering vector generated is a finetuned Gemma-2-2B model.""",
                 type="messages",
                 additional_inputs=[selected_subspaces],
                 fill_height=True
         # Right side: concept management
         with gr.Column(scale=4):
             gr.Markdown("## Steer Model Responses")
+            gr.Markdown("Search and then select a concept to steer. The closest match will be automatically selected. If there is no match, we will use our steering vector generator to auto-steer for you!")
             # Concept Search and Selection
             with gr.Group():
                 search_box = gr.Textbox(
                     placeholder="Find concepts to steer the model (e.g. 'time travel')",
                     lines=2,
                 )
+                msg = gr.TextArea(visible=False)
                 concept_dropdown = gr.Dropdown(
                     label="Select a concept to steer the model (Click to see more!)",
                     interactive=True,
     # Wire up events
     # When search box changes, update dropdown AND trigger concept selection
+    search_box.input(
         update_dropdown_choices,
         [search_box],
+        [concept_dropdown, msg]
     ).then(  # Chain the events to automatically add the concept
         add_concept_to_list,
         [concept_dropdown, concept_magnitude, selected_subspaces],
         [selected_subspaces]
     )
+    concept_dropdown.change(
+        add_concept_to_list,
+        [concept_dropdown, concept_magnitude, selected_subspaces],
+        [selected_subspaces]
+    )
     concept_magnitude.input(
         add_concept_to_list,
         [concept_dropdown, concept_magnitude, selected_subspaces],

style.css DELETED Viewed

@@ -1,19 +0,0 @@
-#alert-message label {
-    font-weight: 700;
-    background-color: #fff3cd;
-    padding: 8px;
-    border-radius: 4px;
-    color: #664d03;
-    display: inline-block;
-    margin-bottom: 8px;
-}
-#alert-message textarea {
-    background-color: #e8f4ff;
-    border: 1px solid #cce5ff;
-    color: #084298;
-    font-size: 1.1em;
-    padding: 12px;
-    border-radius: 4px;
-    font-weight: 500;
-}