RLHFlow
/

Decision-Tree-Reward-Llama-3.1-8B

@@ -1,7 +1,10 @@
 {
   "_name_or_path": "RLHFlow/Decision-Tree-Reward-Llama-3.1-8B",
   "architectures": [
-    "LlamaForSequenceClassification"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,

 {
   "_name_or_path": "RLHFlow/Decision-Tree-Reward-Llama-3.1-8B",
+  "auto_map": {
+    "AutoModelForSequenceClassification": "modeling_decision_tree_reward_model.LlamaForDecisionTreeRewardModel"
+  },
   "architectures": [
+    "LlamaForDecisionTreeRewardModel"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,

decision_tree.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5433ef4e775535b16490d8b2e9693a4f46b2f637b0749a089f097c94159814c5
 size 2388

 version https://git-lfs.github.com/spec/v1
+oid sha256:83f0139429fff38e775af9a281ba5600a46ff852967f6c310667e61710b5bf40
 size 2388

modeling_decision_tree_reward_model.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import torch
+import torch.nn as nn
+from transformers.models.llama.modeling_llama import LlamaForSequenceClassification
+from sklearn.tree import DecisionTreeClassifier
+import os
+import pickle
+import json
+from huggingface_hub import hf_hub_download
+from typing import List, Dict, Union
+import numpy as np
def convert_to_chat_format(prompt: str, response: Union[str, None] = None) -> List[Dict[str, str]]:
    """Turn a raw prompt (and an optional response) into a list of chat messages.

    A prompt without the ``<extra_id_1>`` marker becomes a single user message.
    A prompt containing it is treated as a HelpSteer2-style multi-turn
    conversation: the text before the first marker is the first user message,
    and every later segment is expected to look like ``"<Role>\\n<content>"``.
    A segment whose role line is exactly ``"Assistant"`` becomes an assistant
    message; any other role line becomes a user message.

    Args:
        prompt: Raw prompt text, optionally containing ``<extra_id_1>`` turn markers.
        response: If given, appended as a final assistant message.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dictionaries.
    """
    if "<extra_id_1>" not in prompt:
        conversation = [{"role": "user", "content": prompt}]
    else:
        first_turn, *later_turns = prompt.split("<extra_id_1>")
        conversation = [{"role": "user", "content": first_turn}]
        for turn in later_turns:
            speaker, newline, content = turn.partition("\n")
            if not newline:
                # A bare role marker (e.g. a trailing "<extra_id_1>Assistant")
                # has no body; skip it instead of failing on the missing part.
                continue
            conversation.append({
                "role": "assistant" if speaker == "Assistant" else "user",
                "content": content,
            })
    if response is not None:
        conversation.append({"role": "assistant", "content": response})
    return conversation
def process_conversation(conversation):
    """Return a copy of the conversation with trailing newlines removed from each message.

    Each message is copied before its ``"content"`` is stripped, so the
    dictionaries passed in by the caller are left untouched. Only trailing
    ``"\\n"`` characters are removed; other whitespace and interior newlines
    are kept.

    Args:
        conversation: Iterable of message dictionaries, each with a string
            ``"content"`` entry.

    Returns:
        A new list of new message dictionaries, in the same order.
    """
    return [
        {**message, "content": message["content"].rstrip("\n")}
        for message in conversation
    ]
class LlamaForDecisionTreeRewardModel(LlamaForSequenceClassification):
    """Llama sequence classifier combined with a decision tree for pairwise preference.

    The linear ``score`` head maps the last hidden state of a conversation to
    one reward per attribute. ``compare`` scores two candidate responses to the
    same prompt and feeds the difference of their reward vectors to a
    scikit-learn ``DecisionTreeClassifier`` loaded via ``load_decision_tree``.
    """

    def __init__(self, config):
        """Build the model; the decision tree itself is loaded separately.

        Args:
            config: Model configuration; ``config.hidden_size`` sizes the
                input of the scoring head.
        """
        super().__init__(config)
        # Re-create the scoring head after the parent constructor so it carries a bias term.
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=True)
        # Filled in by load_decision_tree(); compare() refuses to run while this is None.
        self.tree = None
        # Default attribute names (from HelpSteer2); load_decision_tree() replaces
        # them with the label names found in the repository's config.json.
        self.attributes = ['helpfulness', 'correctness', 'coherence', 'complexity', 'verbosity']
        print("Initialized LlamaForDecisionTreeRewardModel")

    def load_decision_tree(self, repo_id, filename="decision_tree.pkl"):
        """Download the pickled decision tree and the attribute names from the Hub.

        The tree and the attribute list are only stored on the model once both
        files have been fetched and the tree has passed its type check, so a
        failure leaves the previous state intact.

        Args:
            repo_id: Hugging Face Hub repository to download from.
            filename: Name of the pickled tree file inside that repository.

        Raises:
            AssertionError: If the unpickled object is not a ``DecisionTreeClassifier``.
            KeyError: If the repository's ``config.json`` has no ``"label2id"`` entry.
        """
        tree_path = hf_hub_download(repo_id=repo_id, filename=filename)
        # SECURITY: pickle.load can execute arbitrary code embedded in the file.
        # Only call this with repositories you trust.
        with open(tree_path, "rb") as f:
            tree = pickle.load(f)
        if not isinstance(tree, DecisionTreeClassifier):
            # Raised explicitly (rather than via `assert`) so the check survives `python -O`.
            raise AssertionError(f"The tree is not a DecisionTreeClassifier. It is a {type(tree)}")

        config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
        with open(config_path, "r", encoding="utf-8") as f:
            label2id_map = json.load(f)["label2id"]

        self.tree = tree
        # Attribute names ordered by their label id.
        self.attributes = [
            label for label, _ in sorted(label2id_map.items(), key=lambda item: item[1])
        ]

    @torch.no_grad()
    def _attribute_rewards(self, conversation, tokenizer, device, weight, bias):
        """Score one conversation: last-position hidden state times the score head, as NumPy."""
        token_ids = tokenizer.apply_chat_template(
            process_conversation(conversation), tokenize=True, return_tensors="pt"
        ).to(device)
        outputs = self.forward(token_ids, output_hidden_states=True)
        # Final layer, last sequence position.
        embedding = outputs.hidden_states[-1][:, -1].float().cpu().numpy()
        return embedding @ weight.T + bias

    @torch.no_grad()
    def compare(self, prompt: Union[str, List[Dict[str, str]]], response_1: str, response_2: str, tokenizer, device):
        """Score two responses to the same prompt and predict a preference between them.

        Args:
            prompt: Either a raw prompt string (converted with
                ``convert_to_chat_format``) or a list of chat messages whose
                last message has role ``"user"``. A list passed in is not modified.
            response_1: First candidate assistant response.
            response_2: Second candidate assistant response.
            tokenizer: Tokenizer providing ``apply_chat_template``.
            device: Device the tokenized inputs are moved to before the forward pass.

        Returns:
            A dictionary with:
              * ``"preference"``: first element of the tree's prediction for
                ``rewards_2 - rewards_1``; the meaning of the label is
                whatever the fitted tree was trained with.
              * ``"rewards"``: ``np.stack([rewards_1, rewards_2])``.
              * ``"attributes"``: the attribute names currently stored on the model.

        Raises:
            AssertionError: If the decision tree is not loaded, the
                conversation is empty, or its last message is not from the user.
            ValueError: If ``prompt`` is neither a string nor a list.
        """
        if self.tree is None:
            raise AssertionError("The decision tree is not loaded. Please call load_decision_tree(repo_id, filename) first.")

        if isinstance(prompt, str):
            conversation = convert_to_chat_format(prompt)
        elif isinstance(prompt, list):
            # Copy every message so the newline stripping done before
            # tokenization can never edit the caller's dictionaries.
            conversation = [dict(message) for message in prompt]
        else:
            raise ValueError(f"The prompt must be a string or a list of dictionaries, but got {type(prompt)}")

        if len(conversation) < 1:
            raise AssertionError("The conversation must have at least one message (as prompt)")
        if conversation[-1]["role"] != "user":
            raise AssertionError("The last message in the conversation must be from the user")

        # detach() is required: for a float32 model on CPU, .float().cpu() returns the
        # parameter itself, and numpy() refuses tensors that require grad.
        weight = self.score.weight.detach().float().cpu().numpy()
        bias = self.score.bias.detach().float().cpu().numpy()

        rewards_1 = self._attribute_rewards(
            conversation + [{"role": "assistant", "content": response_1}],
            tokenizer, device, weight, bias,
        )
        rewards_2 = self._attribute_rewards(
            conversation + [{"role": "assistant", "content": response_2}],
            tokenizer, device, weight, bias,
        )
        rewards_diff = rewards_2 - rewards_1

        return {
            "preference": self.tree.predict(rewards_diff)[0],
            "rewards": np.stack([rewards_1, rewards_2]),
            "attributes": self.attributes,
        }