Riddhi Bhagwat committed on
Commit 67312ac · Parent: 85e41fb

evaluation pipeline updates
ml/dataset_transformer.py ADDED
@@ -0,0 +1,80 @@
+import pandas as pd
+import numpy as np
+
+# NOTE: names of preset cols may be different based on dataset, this is just a generalized pipeline
+
+CHOSEN_COLUMN = 'chosen'  # name of col with chosen responses
+REJECTED_COLUMN = 'rejected'  # name of col with rejected responses
+COLUMNS_TO_DROP = ['metadata', 'timestamp', 'id']  # cols to remove
+
+def transform_rlhf_dataset(df, chosen_col=CHOSEN_COLUMN, rejected_col=REJECTED_COLUMN, drop_cols=COLUMNS_TO_DROP):
+    """
+    Parameters:
+        df (pandas.DataFrame): Input dataframe with chosen and rejected columns
+        chosen_col (str): Name of column containing chosen responses
+        rejected_col (str): Name of column containing rejected responses
+        drop_cols (list): List of column names to drop from the dataset
+
+    Returns:
+        pandas.DataFrame: Transformed dataset with 'text' and 'label' columns
+    """
+
+    df = df.copy()
+
+    existing_cols_to_drop = [col for col in drop_cols if col in df.columns]
+    if existing_cols_to_drop:
+        df = df.drop(columns=existing_cols_to_drop)
+
+    preserved_cols = [col for col in df.columns if col not in [chosen_col, rejected_col]]
+
+    # two separate dataframes for liked and disliked
+    liked_df = df[[chosen_col]].copy()
+    liked_df.columns = ['text']
+    liked_df['label'] = 'liked'
+
+    disliked_df = df[[rejected_col]].copy()
+    disliked_df.columns = ['text']
+    disliked_df['label'] = 'disliked'
+
+    for col in preserved_cols:
+        liked_df[col] = df[col]
+    for col in preserved_cols:
+        disliked_df[col] = df[col]
+
+    # combine + shuffle
+    transformed_df = pd.concat([liked_df, disliked_df], ignore_index=True)
+    transformed_df = transformed_df.dropna(subset=['text'])
+    transformed_df = transformed_df.sample(frac=1).reset_index(drop=True)
+
+    # reordering
+    column_order = ['text', 'label'] + preserved_cols
+    transformed_df = transformed_df[column_order]
+
+    return transformed_df
+
+def test_example():
+    example_data = {
+        'chosen': ['This is a good response', 'Another good one'],
+        'rejected': ['This is a bad response', 'Another bad one'],
+        'metadata': ['meta1', 'meta2'],
+        'timestamp': ['2024-01-01', '2024-01-02'],
+        'id': [1, 2]
+    }
+
+    df = pd.DataFrame(example_data)
+    transformed_df = transform_rlhf_dataset(
+        df,
+        chosen_col='chosen',
+        rejected_col='rejected',
+        drop_cols=['metadata', 'id']
+    )
+
+    print("Original shape:", df.shape)
+    print("\nTransformed shape:", transformed_df.shape)
+    print("\nTransformation sample:")
+    print(transformed_df.head())
+    print("\nLabel distribution:")
+    print(transformed_df['label'].value_counts())
+
+if __name__ == "__main__":
+    test_example()
ml/dpo_pipeline.py ADDED
@@ -0,0 +1,44 @@
+from transformers import Trainer, TrainingArguments
+from datasets import Dataset
+import torch
+
+def train_dpo_model(model, dataset, learning_rate=5e-5, num_train_epochs=3, per_device_train_batch_size=16):
+    """
+    Trains a model using Direct Preference Optimization (DPO).
+
+    Args:
+        model: The language model to be trained.
+        dataset: The dataset used for training, should be in Hugging Face Dataset format.
+        learning_rate: Learning rate for the optimizer.
+        num_train_epochs: Number of epochs to train.
+        per_device_train_batch_size: Batch size per device during training.
+    """
+    model.train()
+
+    training_args = TrainingArguments(
+        output_dir="./dpo_model",
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        learning_rate=learning_rate,
+        per_device_train_batch_size=per_device_train_batch_size,
+        per_device_eval_batch_size=per_device_train_batch_size,
+        num_train_epochs=num_train_epochs,
+        weight_decay=0.01,
+        logging_dir="./logs",
+        logging_steps=100,
+        save_total_limit=2,
+        push_to_hub=False,
+        load_best_model_at_end=True,
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset.get("validation", None),
+    )
+
+    trainer.train()
+
+    return model
+
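Note that `train_dpo_model` above wires the data into a plain Hugging Face `Trainer`, which optimizes the standard language-modeling loss rather than a preference objective. For comparison, here is a minimal sketch of an actual DPO setup using trl's `DPOTrainer`; the argument names (`DPOConfig`, `processing_class` vs. `tokenizer`) vary across trl releases, and the hyperparameter values are illustrative assumptions, not values taken from this repository.

# Hedged sketch: assumes a recent trl release where DPOConfig/DPOTrainer are available.
from trl import DPOConfig, DPOTrainer

def train_with_dpo(model, ref_model, tokenizer, dataset):
    # DPO expects a preference dataset with "prompt", "chosen", and "rejected" columns.
    config = DPOConfig(
        output_dir="./dpo_model",
        beta=0.1,                    # strength of the implicit KL penalty toward the reference policy
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        num_train_epochs=1,
    )
    trainer = DPOTrainer(
        model=model,
        ref_model=ref_model,         # frozen reference policy; pass None to let trl clone the model
        args=config,
        train_dataset=dataset["train"],
        processing_class=tokenizer,  # older trl versions call this argument `tokenizer`
    )
    trainer.train()
    return trainer.model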
ml/eval/.reward_eval.py.swp ADDED
Binary file (20.5 kB)
 
ml/eval/alpaca.py CHANGED
@@ -0,0 +1,44 @@
+import openai
+import os
+from alpaca_eval import run_evaluation
+
+def judge_responses(response1, response2, prompt):
+    """
+    Use the OpenAI GPT-4 API to judge two model responses.
+    Returns: "a" if response1 is better, "b" if response2 is better, or "tie".
+    """
+    openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    prompt_text = f"""
+    Given the user prompt: "{prompt}"
+
+    Response A: "{response1}"
+    Response B: "{response2}"
+
+    Which response is better? Reply with 'A', 'B', or 'tie'.
+    """
+
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-4",
+            messages=[{"role": "system", "content": "You are an expert evaluator."},
+                      {"role": "user", "content": prompt_text}],
+            max_tokens=5
+        )
+        result = response["choices"][0]["message"]["content"].strip().lower()
+        return result if result in ["a", "b", "tie"] else "tie"
+    except Exception as e:
+        print(f"Error in OpenAI API call: {e}")
+        return "tie"
+
+
+def alpaca_evaluator(model_name, model_path, num_samples=200):
+    results = run_evaluation(
+        model=model_name,
+        model_path=model_path,
+        num_samples=num_samples,      # fewer samples for quick testing
+        reference_model="gpt-4",      # Compare against GPT-4 (optional)
+    )
+    return results
+
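A quick, hypothetical way to exercise `judge_responses` (assumes `OPENAI_API_KEY` is set in the environment; the prompt and responses below are made-up examples, not data from this repository):

if __name__ == "__main__":
    verdict = judge_responses(
        response1="Paris is the capital of France.",
        response2="It might be Lyon.",
        prompt="What is the capital of France?",
    )
    print(f"Judge verdict: {verdict}")  # one of "a", "b", or "tie"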
ml/eval/bt.py CHANGED
@@ -11,9 +11,9 @@ class ScriptArguments:
     """
     Arguments for the Bradley-Terry evaluation script.
     """
-    sft_generations_file: str = '/raid/lingo/jen_ben/HF-RLHF/eval/test/gen_examples_idan_mini.json'
-    kto_generations_file: str = '/raid/lingo/jen_ben/HF-RLHF/eval/test/gen_examples_idan_mini.json'
-    output_file: str = 'bt_results_test_mini.json'
+    old_generations_file: str
+    new_generations_file: str
+    output_file: str = 'bt_results.json'
 
 
 ####################################
@@ -34,63 +34,63 @@ def load_rewards(file_path):
         return json.load(f)
 
 
-def bradley_terry_comparison(sft_rewards, kto_rewards):
+def bradley_terry_comparison(old_rewards, new_rewards):
     """
     Perform Bradley-Terry comparison between two sets of model generations.
 
     Args:
-        sft_rewards (list): List of dictionaries for the SFT model's generations and rewards.
-        kto_rewards (list): List of dictionaries for the KTO model's generations and rewards.
+        old_rewards (list): List of dictionaries for the OLD model's generations and rewards.
+        new_rewards (list): List of dictionaries for the NEW model's generations and rewards.
 
     Returns:
         list: Comparison results including preferred outputs and probabilities.
         dict: Metrics summary including percentage preferred and average probabilities.
     """
     results = []
-    kto_preferred_count = 0
-    sft_preferred_count = 0
+    new_preferred_count = 0
+    old_preferred_count = 0
     probabilities = []
 
-    for ix in range(len(sft_rewards)):
-        sft = sft_rewards[ix]
-        kto = kto_rewards[ix]
+    for ix in range(len(old_rewards)):
+        old = old_rewards[ix]
+        new = new_rewards[ix]
 
         # Ensure prompts match
-        assert sft['prompt'] == kto['prompt'], f"ERROR: Prompts at index {ix} do not match."
+        assert old['prompt'] == new['prompt'], f"ERROR: Prompts at index {ix} do not match."
 
         # Compute Bradley-Terry probability
-        kto_reward = torch.tensor(kto['reward'], dtype=torch.float32)
-        sft_reward = torch.tensor(sft['reward'], dtype=torch.float32)
-        prob_kto_preferred = torch.sigmoid(kto_reward - sft_reward).item()
+        new_reward = torch.tensor(new['reward'], dtype=torch.float32)
+        old_reward = torch.tensor(old['reward'], dtype=torch.float32)
+        prob_new_preferred = torch.sigmoid(new_reward - old_reward).item()
 
-        probabilities.append(prob_kto_preferred)
-        preferred_model = 'kto' if prob_kto_preferred > 0.5 else 'sft'
+        probabilities.append(prob_new_preferred)
+        preferred_model = 'new' if prob_new_preferred > 0.5 else 'old'
 
         # Count preferences
-        if preferred_model == 'kto':
-            kto_preferred_count += 1
+        if preferred_model == 'new':
+            new_preferred_count += 1
         else:
-            sft_preferred_count += 1
+            old_preferred_count += 1
 
         # Log results
         bt_result = {
-            'prompt': sft['prompt'],
-            'sft_output': sft['output'],
-            'kto_output': kto['output'],
-            'sft_reward': sft['reward'],
-            'kto_reward': kto['reward'],
+            'prompt': old['prompt'],
+            'old_output': old['output'],
+            'new_output': new['output'],
+            'old_reward': old['reward'],
+            'new_reward': new['reward'],
             'preferred': preferred_model,
-            'prob_kto_preferred': prob_kto_preferred
+            'prob_new_preferred': prob_new_preferred
         }
         results.append(bt_result)
 
     # Calculate metrics
-    total_examples = len(sft_rewards)
+    total_examples = len(old_rewards)
     metrics = {
         'total_examples': total_examples,
-        'kto_preferred_percentage': 100 * kto_preferred_count / total_examples,
-        'sft_preferred_percentage': 100 * sft_preferred_count / total_examples,
-        'avg_probability_kto_preferred': sum(probabilities) / total_examples
+        'new_preferred_percentage': 100 * new_preferred_count / total_examples,
+        'old_preferred_percentage': 100 * old_preferred_count / total_examples,
+        'avg_probability_new_preferred': sum(probabilities) / total_examples
     }
 
     return results, metrics
@@ -118,9 +118,9 @@ def print_metrics(metrics):
     """
     print("\nEVALUATION METRICS:")
    print(f"Total examples: {metrics['total_examples']}")
-    print(f"Percentage preferred - KTO model: {metrics['kto_preferred_percentage']:.2f}%")
-    print(f"Percentage preferred - SFT model: {metrics['sft_preferred_percentage']:.2f}%")
-    print(f"Average probability of KTO model being preferred: {metrics['avg_probability_kto_preferred']:.4f}")
+    print(f"Percentage preferred - new model: {metrics['new_preferred_percentage']:.2f}%")
+    print(f"Percentage preferred - old model: {metrics['old_preferred_percentage']:.2f}%")
+    print(f"Average probability of new model being preferred: {metrics['avg_probability_new_preferred']:.4f}")
 
 
 ####################################
@@ -133,12 +133,12 @@ def main():
 
     # Load data
     print("Loading data...")
-    sft_rewards = load_rewards(args.sft_generations_file)
-    kto_rewards = load_rewards(args.kto_generations_file)
+    old_rewards = load_rewards(args.old_generations_file)
+    new_rewards = load_rewards(args.new_generations_file)
 
     # Perform Bradley-Terry comparison
     print("Performing Bradley-Terry comparison...")
-    results, metrics = bradley_terry_comparison(sft_rewards, kto_rewards)
+    results, metrics = bradley_terry_comparison(old_rewards, new_rewards)
 
     # Save results
     save_results(results, args.output_file)
@@ -152,55 +152,3 @@ if __name__ == "__main__":
 
 
 
-# import json
-# import torch
-
-# output_file_path = 'bt_results.json'
-# ref_generations_rewards_file_path = 'ref_models_generations_reward_trl-libqwen1.5-1.8b-sft.json'
-# finetuned_generations_rewards_file_path = 'finetuned_models_generations_reward_trl-libqwen1.5-1.8b-sft.json'
-
-# # Open and read JSON files
-# with open(ref_generations_rewards_file_path, 'r') as f:
-#     ref_rewards = json.load(f)
-
-# with open(finetuned_generations_rewards_file_path, 'r') as g:
-#     finetuned_rewards = json.load(g)
-
-# # assert len(ref_rewards) != len(finetuned_rewards), 'ERROR: files are not with the same length.'
-
-# results = []
-# finetuned_preffered = 0
-# for ix in range(len(ref_rewards)):
-#     ref = ref_rewards[ix]
-#     finetuned = finetuned_rewards[ix]
-#     assert ref['prompt'] == finetuned['prompt'], 'ERROR: ref and finetuned prompt are not the same.'
-
-#     # Bradely Terry
-#     finetuned_reward = torch.tensor(finetuned['reward'], dtype=torch.float32)
-#     ref_reward = torch.tensor(ref['reward'], dtype=torch.float32)
-#     prob_finetuned_preferred = torch.sigmoid(finetuned_reward - ref_reward)
-
-
-#     if prob_finetuned_preferred > 0.5:
-#         finetuned_preffered +=1
-#         print(f'example {ix}: finetuned preffered')
-#     else:
-#         print(f'example {ix}: ref preffered')
-
-#     # log results
-#     bt_result = {}
-#     bt_result['prompt'] = ref['prompt']
-#     bt_result['ref_output'] = ref['output']
-#     bt_result['finetuned_output'] = finetuned['output']
-#     bt_result['ref_reward'] = ref['output']
-#     bt_result['finetuned_reward'] = finetuned['output']
-#     bt_result['preffered'] = 'finetuned' if prob_finetuned_preferred > 0.5 else 'ref'
-#     results.append(bt_result)
-
-
-# # save results in json files
-
-# with open(output_file_path, "w") as f:
-#     json.dump(results, f, indent=4)
-
-# print('BT EVALUATION COMPLETED.')
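For reference, `load_rewards` expects each generations file to be a JSON list of per-example records. Based on the fields accessed in `bradley_terry_comparison` (`prompt`, `output`, `reward`), a minimal input file could be produced as sketched below; the prompts, outputs, and reward values are purely illustrative:

import json

example_records = [
    {"prompt": "What is the capital of France?",
     "output": "Paris is the capital of France.",
     "reward": 1.42},
    {"prompt": "Name a prime number.",
     "output": "Seven is a prime number.",
     "reward": 0.87},
]

# Files of this shape (one per model) are what old_generations_file / new_generations_file point to.
with open("old_generations.json", "w") as f:
    json.dump(example_records, f, indent=4)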
ml/eval/evaluate.py DELETED
@@ -1,185 +0,0 @@
-import sys
-import os
-from typing import Any, Dict, List
-
-import torch
-import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
-from accelerate import Accelerator
-from trl import KTOConfig, KTOTrainer, ModelConfig, get_peft_config, maybe_unpair_preference_dataset, setup_chat_format
-from tqdm import tqdm
-
-# Add script directory to system path for importing local modules
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(os.path.dirname(SCRIPT_DIR))
-
-from eval.utils import jload, jdump
-from eval.evaluate_arguments import EvalArguments
-
-
-# set `device` to "cuda" if a GPU is available. otherwise, defaults to CPU
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-def create_model():
-    # loads a specified reward model and sets it to use the GPU ("cuda")
-    # CHANGE FUNCTION DEPENDING OF THE MODEL YOU LOAD
-    model = AutoModelForSequenceClassification.from_pretrained("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", num_labels=1).to("cuda")
-    return model
-
-
-def create_tokenizer():
-    # loads the tokenizer that pairs with the model for encoding the text data
-    tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", use_auth_token=True)
-    return tokenizer
-
-
-def MyAccelerator(mixed_precision):
-    # wrap `Accelerator` to set up model handling with mixed-precision (to save memory)
-    accelerator = Accelerator(mixed_precision=mixed_precision)
-    return accelerator
-
-
-#####################################
-# Idan's script from here
-#####################################
-
-
-def main():
-
-    # Parse evaluation arguments from `EvalArguments`
-    parser = transformers.HfArgumentParser((EvalArguments, ))
-    args, = parser.parse_args_into_dataclasses()
-
-    # set `mixed_precision` based on `args.bfloat16` (if true use bf16, otherwise fp16)
-    mixed_precision = 'bf16' if args.bfloat16 else 'fp16'
-    args.mixed_precision = mixed_precision
-
-    # initialize `MyAccelerator` with the chosen mixed precision setting
-    accelerator = MyAccelerator(
-        mixed_precision=mixed_precision,
-    )
-
-
-    # load model and tokenizer
-    model = create_model()
-    if 't5' not in args.model_name_or_path:
-        # t5 models where trained with fp32
-        model = accelerator.prepare(model)
-    model.eval()
-
-    tokenizer = create_tokenizer()
-
-    print("Output file path:", args.output_filepath)
-
-    # load LM generations data from `args.output_filepath` + handles cases where it's a single file or directory.
-    filenames = []
-    eval_data_list_dict = []
-    if os.path.isfile(args.output_filepath):
-        print(f'Loading data from {args.output_filepath}...')
-        eval_data_list_dict.append(jload(args.output_filepath))
-        filenames.append(args.output_filepath)
-    elif os.path.isdir(args.output_filepath):
-        print(f'Loading data from {args.output_filepath}...')
-        for filename in os.listdir(args.output_filepath):
-            if filename.endswith('.json'):
-                print(f'Loaded file {filename}')
-                eval_data_list_dict.append(jload(os.path.join(args.output_filepath, filename)))
-                filenames.append(os.path.join(args.output_filepath, filename))
-    else:
-        raise Exception('Output file(s) not found!')
-
-
-    # process each file and call `evaluate_data()` to calculate reward scores
-    for filename, eval_data_dict in zip(filenames, eval_data_list_dict):
-        eval_data = evaluate_data(args, model, tokenizer, eval_data_dict)
-
-        if args.result_filename is None:
-            path_to_result = os.path.basename(filename).split('.json')[0] + f"_reward_{args.model_name_or_path.replace('/', '')}.json"
-        else:
-            path_to_result = args.result_filename
-
-        print(f'Saving results to file {path_to_result}...')
-        jdump(eval_data, path_to_result)
-
-
-def get_reward_output_fn(reward_output_fmt: str, apply_sigmoid_to_reward: bool):
-    # defines the reward output function format based on `reward_output_fmt`
-    if reward_output_fmt is None:
-        reward_output_fn = lambda x: x.squeeze().cpu().detach().numpy().tolist()
-    elif reward_output_fmt == '0':
-        reward_output_fn = lambda x: x.squeeze().cpu().detach().softmax(dim=-1).numpy()[0].tolist()
-    elif reward_output_fmt == '1':
-        reward_output_fn = lambda x: x.squeeze().cpu().detach().softmax(dim=-1).numpy()[1].tolist()
-    elif reward_output_fmt == '1-0':
-        reward_output_fn = lambda x: (x.squeeze().cpu().detach().softmax(dim=-1).numpy()[1] - x.squeeze().cpu().detach().softmax(dim=-1).numpy()[0]).tolist()
-    else:
-        raise NotImplementedError(f'Unsupported reward output format: {reward_output_fmt}')
-
-    # Apply sigmoid transformation if `apply_sigmoid_to_reward` is true
-    if apply_sigmoid_to_reward:
-        reward_output_fn = lambda x: torch.sigmoid(torch.tensor(x)).numpy().tolist()
-
-    return reward_output_fn
-
-
-@torch.inference_mode()
-def evaluate_data(args: EvalArguments, model, tokenizer, eval_data_list_dict) -> List[Dict[str, Any]]:
-    """Given a generated dataset, evaluate it using the reward model
-
-    args: argparse.Namespace, the arguments to use
-    reward_model: reward_model_module.RewardModel, the reward model to use
-    eval_data_list_dict: List[Dict[str, Any]], the generated data to evaluate
-    """
-
-    pbar = tqdm(total=len(eval_data_list_dict), desc="eval")
-    rewards_list = []
-    reward_output_fn = get_reward_output_fn(args.reward_output_fmt, args.apply_sigmoid_to_reward)
-
-    print('Evaluating reward scores...')
-
-    # Split `eval_data_list_dict` into batches for processing
-    for idx in range(0, len(eval_data_list_dict), args.per_device_batch_size):
-        if len(eval_data_list_dict) > (idx + args.per_device_batch_size):
-            batch_list_dict = eval_data_list_dict[idx:idx+args.per_device_batch_size]
-        else:
-            batch_list_dict = eval_data_list_dict[idx:]
-
-        # create formatted text from prompts and outputs for tokenization
-        if 'prompt' in batch_list_dict[0]:
-            batch_full_outputs = [l['prompt'] + ' ' + l['output'] for l in batch_list_dict]
-        else:
-            print('Overriding with custom prompt format')
-            prompt_fmt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: {output}"
-            for l in batch_list_dict:
-                l['output'] = l['output'].split('.')[0] + '.'
-            batch_full_outputs = [prompt_fmt.format_map(l) for l in batch_list_dict]
-
-        # tokenize and send the batched text to the model's device
-        encoded_full_responses = tokenizer(batch_full_outputs, return_tensors="pt", padding=True, truncation=True)
-        encoded_full_responses = encoded_full_responses.to(model.device)  # i added this
-
-        # generate reward scores and stores them in `rewards_list`
-        reward_outputs = model(**encoded_full_responses)
-        rewards = reward_output_fn(reward_outputs.logits)
-        rewards_list.extend(rewards if isinstance(rewards, list) else [rewards])
-
-        # update progress bar after each batch is processed
-        pbar.update(len(batch_list_dict))
-
-    print('Combining reward outputs into outputs...')
-
-    # add calculated rewards to each item in `eval_data_list_dict`
-    for j in range(len(eval_data_list_dict)):
-        eval_data_list_dict[j]['reward'] = rewards_list[j]
-        eval_data_list_dict[j]['reward_model'] = args.model_name_or_path + args.model_pretrained_lora_weights if args.model_pretrained_lora_weights is not None else args.model_name_or_path
-
-    print('Finished evaluating reward scores!')
-
-    print('Mean reward score: ', sum(rewards_list) / len(rewards_list))
-    print('Std reward score: ', torch.tensor(rewards_list).std().item())
-
-    return eval_data_list_dict
-
-
-if __name__ == '__main__':
-    main()
ml/eval/evaluation_pipeline.py ADDED
@@ -0,0 +1,44 @@
+###########
+# IMPORTS #
+###########
+from types import SimpleNamespace
+
+from reward_eval import process_evaluation
+from generate import generate_files
+from alpaca import alpaca_evaluator
+from bt import bradley_terry_comparison, save_results, print_metrics
+
+##################
+# M-REWARD BENCH #
+##################
+
+
+
+#############
+# EVALUATOR #
+#############
+'''
+Evaluation Pipeline
+
+Parameters:
+    eval_dataset: list of dictionaries that contain the prompt and response in the same form as below:
+        [{"prompt": "How are you?", "output": "I'm doing great!"}, {"prompt": "What's your name?", "output": "Assistant"}]
+    reward_output_filepath: string (must end in .json) that represents the path of the output of the reward score evaluation
+    model: base model that is being evaluated (defaults to starter base model - Aya-23-8B)
+'''
+def evaluator_master_fn(eval_dataset: list[dict],
+                        reward_output_filepath: str,
+                        model="CohereForAI/aya-23-8B"):
+
+    # 1. Reward score evaluation:
+    # process_evaluation reads these settings via attribute access, so wrap them in a
+    # namespace rather than a plain dict.
+    args = SimpleNamespace(
+        bfloat16=False,
+        reward_output_fmt='1-0',
+        apply_sigmoid_to_reward=False,
+        per_device_batch_size=8,
+        output_filepath=reward_output_filepath,  # already ends in .json
+        result_filename=None,
+    )
+    process_evaluation(args, model_name=model, eval_data_list_dict=eval_dataset)
+
+    # 2.
+
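A minimal sketch of driving the pipeline end to end; the toy dataset mirrors the format given in the docstring above, and the output filename is an arbitrary placeholder rather than a path used by the repository:

if __name__ == "__main__":
    toy_eval_set = [
        {"prompt": "How are you?", "output": "I'm doing great!"},
        {"prompt": "What's your name?", "output": "Assistant"},
    ]
    # Step 1 of the pipeline scores each prompt/output pair with the reward model
    # and writes the results to a JSON file next to this script.
    evaluator_master_fn(
        eval_dataset=toy_eval_set,
        reward_output_filepath="toy_rewards.json",
        model="CohereForAI/aya-23-8B",
    )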
ml/eval/generate.py CHANGED
@@ -141,7 +141,7 @@ def save_results(results, output_file):
 # MAIN SCRIPT
 ####################################
 
-def main():
+def generate_files():
     # Load model and tokenizer
     print("Loading kto fine-tuned model...")
     kto_model, kto_tokenizer = load_model_and_tokenizer(script_args.kto_model_path, use_auth_token=True)
@@ -166,4 +166,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    generate_files()
ml/eval/reward_eval.py ADDED
@@ -0,0 +1,125 @@
+import sys
+import os
+from typing import Any, Dict, List
+from types import SimpleNamespace
+import json
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
+from accelerate import Accelerator
+from trl import KTOConfig, KTOTrainer, ModelConfig, get_peft_config, maybe_unpair_preference_dataset, setup_chat_format
+from tqdm import tqdm
+
+# Add script directory to system path for importing local modules
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.dirname(SCRIPT_DIR))
+
+from eval.utils import jload, jdump
+from eval.evaluate_arguments import EvalArguments
+
+
+# set `device` to "cuda" if a GPU is available, otherwise default to CPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+def create_model(model_name: str):
+    """
+    Loads a pre-trained reward model and moves it onto the device.
+    """
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", num_labels=1).to("cuda")
+    return model
+
+
+def create_tokenizer(model_name):
+    # loads the tokenizer that pairs with the model for encoding the text data
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
+    return tokenizer
+
+
+def MyAccelerator(mixed_precision: str):
+    """
+    Accelerator initialization (wrapper) for handling mixed precision.
+    """
+    return Accelerator(mixed_precision=mixed_precision)
+
+def get_reward_output_fn(reward_output_format: str, sigmoid: bool):
+    def default(x):
+        return x.squeeze().cpu().detach().numpy().tolist()
+    reward_fn_map = {
+        '0': lambda x: x.squeeze().cpu().detach().softmax(dim=-1).numpy()[0].tolist(),
+        '1': lambda x: x.squeeze().cpu().detach().softmax(dim=-1).numpy()[1].tolist(),
+        '1-0': lambda x: (x.squeeze().cpu().detach().softmax(dim=-1).numpy()[1] - x.squeeze().cpu().detach().softmax(dim=-1).numpy()[0]).tolist()
+    }
+    reward_output_fn = reward_fn_map.get(reward_output_format, default)
+    if sigmoid:
+        return lambda x: torch.sigmoid(torch.tensor(x)).numpy().tolist()
+    return reward_output_fn
+
+def evaluate_data(args, model, tokenizer, eval_data_list_dict) -> List[Dict[str, Any]]:
+    """
+    Evaluate the dataset using the reward model.
+    """
+    reward_output_fn = get_reward_output_fn(args.reward_output_fmt, args.apply_sigmoid_to_reward)
+    pbar = tqdm(total=len(eval_data_list_dict), desc="Evaluating Rewards")
+    rewards_list = []
+
+    for idx in range(0, len(eval_data_list_dict), args.per_device_batch_size):
+        batch_list_dict = eval_data_list_dict[idx:idx+args.per_device_batch_size]
+
+        # Create prompt-response pairs
+        batch_full_outputs = [
+            f"{l['prompt']} {l['output']}" for l in batch_list_dict
+        ] if 'prompt' in batch_list_dict[0] else [f"Below is an instruction: {l['instruction']} Response: {l['output']}" for l in batch_list_dict]
+
+        # Tokenize response and send to device
+        encoded_full_responses = tokenizer(batch_full_outputs, return_tensors="pt", padding=True, truncation=True)
+        encoded_full_responses = encoded_full_responses.to(model.device)
+
+        # Generate rewards
+        with torch.inference_mode():
+            reward_outputs = model(**encoded_full_responses)
+            rewards = reward_output_fn(reward_outputs.logits)
+            # a single-example batch yields a scalar, so normalize to a list before extending
+            rewards_list.extend(rewards if isinstance(rewards, list) else [rewards])
+
+        pbar.update(len(batch_list_dict))
+
+    # Adding reward scores to original data
+    for i, data in enumerate(eval_data_list_dict):
+        data['reward'] = rewards_list[i]
+
+    return eval_data_list_dict
+
+def process_evaluation(args, model_name: str, eval_data_list_dict) -> List[Dict[str, Any]]:
+    """
+    Main function for processing evaluation, takes model name as input.
+    """
+    mixed_precision = 'bf16' if args.bfloat16 else 'fp16'
+
+    # Initialize accelerator and model
+    accelerator = MyAccelerator(mixed_precision)
+    model = create_model(model_name)
+    tokenizer = create_tokenizer(model_name)
+
+    model.eval()
+
+    eval_data = evaluate_data(args, model, tokenizer, eval_data_list_dict)
+
+    result_filename = args.result_filename or f"{os.path.basename(args.output_filepath).split('.')[0]}_reward_results.json"
+    with open(result_filename, "w") as f:
+        json.dump(eval_data, f)
+
+    return eval_data
+
+
+# ONLY FOR TESTING:
+if __name__ == '__main__':
+    # evaluate_data/process_evaluation use attribute access, so build a namespace, not a dict
+    args = SimpleNamespace(
+        bfloat16=False,
+        reward_output_fmt='1-0',
+        apply_sigmoid_to_reward=False,
+        per_device_batch_size=8,
+        output_filepath='/path/to/your/data.json',
+        result_filename=None,
+    )
+
+    eval_data_list_dict = [{"prompt": "How are you?", "output": "I'm doing great!"}, {"prompt": "What's your name?", "output": "Assistant"}]
+
+    process_evaluation(args, model_name="CohereForAI/aya-23-8B", eval_data_list_dict=eval_data_list_dict)