Riddhi Bhagwat committed on
Commit b79d85c · 1 Parent(s): 4b82d89

refined evaluation master function and ensured functionality

Files changed (3):
  1. ml/eval/alpaca.py +1 -2
  2. ml/eval/bt.py +13 -18
  3. ml/eval/evaluation_pipeline.py +27 -14
ml/eval/alpaca.py CHANGED
@@ -33,10 +33,9 @@ def judge_responses(response1, response2, prompt):
 
 
 
-def alpaca_evaluator(model_name, model_path, num_samples=200):
+def alpaca_evaluator(model_name, num_samples=200):
     results = run_evaluation(
         model=model_name,
-        model_path=model_path,
         num_samples=num_samples,  # fewer samples for quick testing
         reference_model="gpt-4",  # Compare against GPT-4 (optional)
     )
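
For context, a minimal sketch of how the slimmed-down evaluator might be called after this change (the model name below is a placeholder, and the exact behavior depends on the `run_evaluation` wrapper this module already imports):

```python
# Hypothetical call site: the model is now identified by name only,
# since the separate model_path argument was removed.
results = alpaca_evaluator(
    "CohereForAI/aya-expanse-8b",  # placeholder model name
    num_samples=50,                # small run for a quick smoke test
)
```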
ml/eval/bt.py CHANGED
@@ -52,19 +52,19 @@ def bradley_terry_comparison(old_rewards, new_rewards):
     probabilities = []
 
     for ix in range(len(old_rewards)):
-        old = sft_rewards[ix]
-        new = kto_rewards[ix]
+        old = old_rewards[ix]
+        new = new_rewards[ix]
 
         # Ensure prompts match
         assert old['prompt'] == new['prompt'], f"ERROR: Prompts at index {ix} do not match."
 
         # Compute Bradley-Terry probability
-        new_reward = torch.tensor(kto['reward'], dtype=torch.float32)
-        old_reward = torch.tensor(sft['reward'], dtype=torch.float32)
-        prob_new_preferred = torch.sigmoid(kto_reward - old_reward).item()
+        new_reward = torch.tensor(new['reward'], dtype=torch.float32)
+        old_reward = torch.tensor(old['reward'], dtype=torch.float32)
+        prob_new_preferred = torch.sigmoid(new_reward - old_reward).item()
 
         probabilities.append(prob_new_preferred)
-        preferred_model = 'new' if prob_kto_preferred > 0.5 else 'old'
+        preferred_model = 'new' if prob_new_preferred > 0.5 else 'old'
 
         # Count preferences
         if preferred_model == 'new':
@@ -75,12 +75,12 @@ def bradley_terry_comparison(old_rewards, new_rewards):
         # Log results
         bt_result = {
             'prompt': old['prompt'],
-            'old_output': sft['output'],
-            'new_output': kto['output'],
-            'old_reward': sft['reward'],
-            'new_reward': kto['reward'],
+            'old_output': old['output'],
+            'new_output': new['output'],
+            'old_reward': old['reward'],
+            'new_reward': new['reward'],
             'preferred': preferred_model,
-            'prob_new_preferred': prob_kto_preferred
+            'prob_new_preferred': prob_new_preferred
         }
         results.append(bt_result)
 
@@ -88,8 +88,8 @@ def bradley_terry_comparison(old_rewards, new_rewards):
     total_examples = len(old_rewards)
     metrics = {
         'total_examples': total_examples,
-        'new_preferred_percentage': 100 * kto_preferred_count / total_examples,
-        'old_preferred_percentage': 100 * sft_preferred_count / total_examples,
+        'new_preferred_percentage': 100 * new_preferred_count / total_examples,
+        'old_preferred_percentage': 100 * old_preferred_count / total_examples,
         'avg_probability_new_preferred': sum(probabilities) / total_examples
     }
 
@@ -128,10 +128,8 @@ def print_metrics(metrics):
 ####################################
 
 def main():
-    # Initialize script arguments
     args = ScriptArguments()
 
-    # Load data
     print("Loading data...")
     old_rewards = load_rewards(args.sft_generations_file)
     new_rewards = load_rewards(args.kto_generations_file)
@@ -140,10 +138,7 @@ def main():
     print("Performing Bradley-Terry comparison...")
     results, metrics = bradley_terry_comparison(old_rewards, new_rewards)
 
-    # Save results
     save_results(results, args.output_file)
-
-    # Print metrics
     print_metrics(metrics)
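The quantity computed in this loop is the Bradley-Terry preference probability: P(new preferred over old) = sigmoid(r_new - r_old), so equal rewards give 0.5 and a higher new reward pushes the probability toward 1. A toy sketch with made-up reward values:

```python
import torch

# Illustrative only: two made-up scalar rewards for the same prompt.
old_reward = torch.tensor(1.2, dtype=torch.float32)
new_reward = torch.tensor(2.0, dtype=torch.float32)

# Bradley-Terry: P(new preferred) = sigmoid(r_new - r_old)
prob_new_preferred = torch.sigmoid(new_reward - old_reward).item()
print(f"P(new preferred) = {prob_new_preferred:.3f}")  # sigmoid(0.8) ~= 0.690
```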
ml/eval/evaluation_pipeline.py CHANGED
@@ -3,16 +3,11 @@
 ###########
 from reward_eval import process_evaluation
 from generate import generate_files
-from alpaca import alpaca_evaluator
-from bt import bradley_terry_comparison, save_results, print_metrics
+from alpaca import alpaca_evaluator, judge_responses
+from bt import bradley_terry_comparison, load_rewards
 from evaluate_arguments import EvalArguments
-
-
-##################
-# M-REWARD BENCH #
-##################
-
-
+import pandas as pd
+import numpy as np
 
 #############
 # EVALUATOR #
@@ -25,21 +20,39 @@
 [{"prompt": "How are you?", "output": "I'm doing great!"}, {"prompt": "What's your name?", "output": "Assistant"}]
 reward_output_filepath: string (must end in .json) that represents the path of the output of the reward score evaluation
 model: base model that is being evaluated (defaults to starter base model - Aya-23-8B)
+all_responses: path to a CSV file that holds all of the model's responses and their corresponding prompts, in the
+format: response1 -> col 1, response2 -> col 2, prompt -> col 3
+language: the language being used for this model (needs to be a valid FeeLLanguage object once the FeeLLanguage class is updated)
 '''
 def evaluator_master_fn(eval_dataset: list[dict],
                         reward_output_filepath: str,
-                        model="CohereForAI/aya-23-8B"):
+                        all_responses: str,
+                        language: str,
+                        new_model,
+                        old_model="CohereForAI/aya-expanse-8b"):
+    # language is a string for now; it will become an object once the FeeLLanguage
+    # class is defined, with language-specific functionality (it will also store the latest model)
 
     # 1. Reward score evaluation:
     args = EvalArguments(bfloat16=True,
                          reward_output_fmt='1-0',
                          apply_sigmoid_to_reward=False,
                          per_device_batch_size=8,
-                         output_filepath='/path/to/your/data.json',
+                         output_filepath="new_evaluation",
                          result_filename=None,
-                         model_name_or_path="CohereForAI/aya-expanse-8b")
-    process_evaluation(args, model_name=model, eval_data_list_dict=eval_dataset)
-
-    # 2.
+                         model_name_or_path=new_model)
+    reward_score_result = process_evaluation(args, model_name=new_model, eval_data_list_dict=eval_dataset)
+
+    # 2. Alpaca Eval - judging responses
+    judge_df = pd.read_csv(all_responses)
+    judge_df["winner"] = judge_df.apply(lambda r: judge_responses(r["response1"], r["response2"], r["prompt"]), axis=1)  # axis=1 applies the judge row by row
+
+    # 3. Alpaca Eval - model comparison
+    alpaca_results = alpaca_evaluator(new_model, num_samples=200)  # adjust num_samples as needed, potentially based on language
+
+    # 4. Bradley-Terry evaluation
+    bt_results = bradley_terry_comparison(load_rewards(old_model), load_rewards(new_model))
+
+    return reward_score_result, judge_df, alpaca_results, bt_results
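
Taken together, a hypothetical end-to-end invocation of the refined master function might look like the sketch below (the dataset, CSV path, and model id are placeholders; the CSV is expected to carry response1, response2, and prompt columns for the judging step, and the four return values follow the order in the diff):

```python
# Hypothetical usage; all file paths and model names are placeholders.
eval_dataset = [
    {"prompt": "How are you?", "output": "I'm doing great!"},
    {"prompt": "What's your name?", "output": "Assistant"},
]

reward_scores, judged_df, alpaca_results, bt_results = evaluator_master_fn(
    eval_dataset=eval_dataset,
    reward_output_filepath="rewards.json",  # must end in .json
    all_responses="responses.csv",          # columns: response1, response2, prompt
    language="en",                          # plain string until FeeLLanguage lands
    new_model="my-org/my-finetuned-model",  # placeholder model id
)
```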