rohansampath committed
Commit a3cb7ba · verified · 1 Parent(s): 0e843f9

Update mmlu_pro_eval_adapted.py

Files changed (1)
  1. mmlu_pro_eval_adapted.py  +111 -38
mmlu_pro_eval_adapted.py CHANGED
@@ -15,7 +15,7 @@ import logging
 import sys
 from datasets import load_dataset
 import pandas as pd
-import numpy as mnp
+import numpy as np
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -46,6 +46,11 @@ def load_mmlu_pro():
     test_df, val_df = dataset["test"], dataset["validation"]
     test_df = preprocess(test_df)
     val_df = preprocess(val_df)
+
+    # Convert to DataFrames right after loading and preprocessing
+    test_df = pd.DataFrame(test_df)
+    val_df = pd.DataFrame(val_df)
+
     return test_df, val_df
 
 
@@ -62,6 +67,10 @@ def load_model(model_name, gpu_utilization=0.8):
 
 
 def format_cot_example(example, including_answer=True):
+    # Handle both Series and dict inputs
+    if isinstance(example, pd.Series):
+        example = example.to_dict()
+
     prompt = "Question:\n"
     question = example["question"]
     options = example["options"]
@@ -79,15 +88,34 @@ def format_cot_example(example, including_answer=True):
 
 
 def generate_cot_prompt(val_df, curr, k):
+    """
+    Generate prompt with examples from val_df matching curr's category.
+
+    Args:
+        val_df: DataFrame containing validation examples
+        curr: Series or dict representing current example
+        k: Number of examples to include
+    """
     prompt = initial_prompt
-    subject = curr["category"]
-    # Assert that all rows in val_df have 'category' equal to subject
-    assert (val_df["category"] == subject).all(), "Not all rows in val_df have the correct category"
-    val_df = val_df[: k]
+
+    # Handle both Series and dict inputs for curr
+    if isinstance(curr, pd.Series):
+        subject = curr["category"]
+    else:
+        subject = curr["category"]
+
+    # Filter validation examples by category
+    filtered_val_df = val_df[val_df["category"] == subject].head(k)
+
     prompt = prompt.replace("{$}", subject) + "\n"
-    for example in val_df:
+
+    # Add each example to the prompt
+    for _, example in filtered_val_df.iterrows():
         prompt += format_cot_example(example, including_answer=True)
+
+    # Add the current example
     prompt += format_cot_example(curr, including_answer=False)
+
     return prompt
 
 
@@ -121,7 +149,7 @@ def extract_final(text):
 def batch_inference(llm, sampling_params, inference_batch):
     start = time.time()
     outputs = llm.generate(inference_batch, sampling_params)
-    logging.info(str(len(inference_batch)) + "size batch costing time: " + str(time.time() - start))
+    logging.info(str(len(inference_batch)) + " size batch costing time: " + str(time.time() - start))
     response_batch = []
     pred_batch = []
     for output in outputs:
@@ -139,15 +167,17 @@ def calculate_accuracy(res):
     along with the overall accuracy.
     """
     correctness = []
-    for each in res:
-        if not each["pred"]:
+
+    # Process predictions and compute correctness
+    for i, row in res.iterrows():
+        if not row["pred"]:
             # If prediction is None, use random choice with fixed seed
-            # This ensures reproducibility when handling missing predictions
             random.seed(12345)
-            x = random.randint(0, len(each["options"]) - 1)
-            is_correct = 1 if x == each["answer_index"] else 0
+            options_len = len(row["options"]) if isinstance(row["options"], list) else 4
+            x = random.randint(0, options_len - 1)
+            is_correct = 1 if x == row["answer_index"] else 0
         else:
-            is_correct = 1 if each["pred"] == each["answer"] else 0
+            is_correct = 1 if row["pred"] == row["answer"] else 0
         correctness.append(is_correct)
 
     # Calculate accuracy from correctness array
@@ -157,77 +187,119 @@ def calculate_accuracy(res):
     accuracy = sum(correctness) / len(correctness)
     return correctness, accuracy
 
+
 @torch.no_grad()
 def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
+    """
+    Evaluate model using chain-of-thought prompting.
+
+    Args:
+        subject: Subject category being evaluated
+        model: Tuple of (llm, sampling_params)
+        tokenizer: Model tokenizer
+        val_df: DataFrame with validation examples
+        test_df: DataFrame with test examples
+        num_shots: Number of examples to include in prompt
+    """
     llm, sampling_params = model
     global choices
     logging.info("evaluating " + subject)
     inference_batches = []
-    k = num_shots
-
-    for i in tqdm(range(len(test_df))):
-        curr = test_df[i]
+
+    # Process each test example
+    for i in range(len(test_df)):
+        curr = test_df.iloc[i]
+        k = num_shots  # Reset k for each example
+
+        # Find prompt that fits within token limit
         prompt_length_ok = False
        prompt = None
-        while not prompt_length_ok:
+        while not prompt_length_ok and k > 0:
             prompt = generate_cot_prompt(val_df, curr, k)
             inputs = tokenizer(prompt, return_tensors="pt")
             inputs = {key: value.cuda() for key, value in inputs.items()}
             length = len(inputs["input_ids"][0])
             if length < max_model_length - max_new_tokens:
                 prompt_length_ok = True
-            k -= 1
+            else:
+                k -= 1
+
+        if not prompt_length_ok:
+            # If we couldn't fit any examples, use just the test question
+            prompt = generate_cot_prompt(val_df.head(0), curr, 0)
+
         inference_batches.append(prompt)
 
+    # Get model predictions
     pred_batch, response_batch = batch_inference(llm, sampling_params, inference_batches)
-    results = []
-    for j, curr in enumerate(test_df):
-        curr["pred"] = pred_batch[j]
-        curr["model_outputs"] = response_batch[j]
-        results.append(curr)
-
-    # Get array of correctness and overall accuracy
-    correctness, accuracy = calculate_accuracy(results)
+
+    # Add predictions to test DataFrame
+    results_df = test_df.copy()
+    results_df["pred"] = pred_batch
+    results_df["model_outputs"] = response_batch
+
+    # Calculate accuracy
+    correctness, accuracy = calculate_accuracy(results_df)
     logging.info("This batch accuracy is: {}, correct samples: {}/{}\n".format(
         str(accuracy), str(sum(correctness)), str(len(correctness))))
 
     return correctness, accuracy
 
+
 @spaces.GPU(duration=240) # Extended to 3 minutes for larger evaluations
 def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
-    print ("IS CUDA AVAILABLE: ", torch.cuda.is_available())
+    """
+    Main evaluation function for MMLU-Pro benchmark.
 
-    model, tokenizer = load_model(model_name, gpu_utilization=0.8)
+    Args:
+        model_name: Name/path of model to evaluate
+        num_subjects: Number of subjects to test (-1 for all)
+        num_questions: Number of questions per subject
+        num_shots: Number of examples to include in prompts
+    """
+    print("IS CUDA AVAILABLE: ", torch.cuda.is_available())
 
+    # Load model and data
+    model, tokenizer = load_model(model_name, gpu_utilization=0.8)
     test_df, val_df = load_mmlu_pro()
-
-    test_df = pd.DataFrame(test_df)
-    val_df = pd.DataFrame(val_df) # Fixed: was 'val_def'
+
+    # Sort DataFrames
     test_df = test_df.sort_values(['category', 'question_id'])
-    val_df = val_df.sort_values(['category', 'question_id']) # Fixed: was 'dev_df'
+    val_df = val_df.sort_values(['category', 'question_id'])
 
-    # Get all unique subjects
+    # Get unique subjects
     all_subjects = sorted(test_df['category'].unique())
-    selected_subjects = []
 
     # Select subjects based on num_subjects parameter
     if num_subjects == -1 or num_subjects >= len(all_subjects):
         selected_subjects = all_subjects
     else:
-        # Take the first num_subjects subjects
         selected_subjects = all_subjects[:num_subjects]
 
     logging.info("selected subjects:\n" + "\n".join(selected_subjects))
 
+    # Prepare results tracking
     results = {}
     all_correctness = []
     results_table = []
 
+    # Process each subject
     for subject in tqdm(selected_subjects, desc="Processing Selected Categories"):
+        # Filter data for current subject
         test_samples = test_df[test_df['category'] == subject].head(num_questions)
         val_samples = val_df[val_df['category'] == subject].head(num_shots)
 
-        correctness, acc = eval_cot(subject, model, tokenizer, val_df=val_samples, test_df=test_samples, num_shots=num_shots)
+        # Run evaluation
+        correctness, acc = eval_cot(
+            subject,
+            model,
+            tokenizer,
+            val_df=val_samples,
+            test_df=test_samples,
+            num_shots=num_shots
+        )
+
+        # Store results
         results[subject] = acc
         all_correctness.extend(correctness)
         results_table.append({
@@ -237,11 +309,12 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
             'Accuracy': acc
         })
 
+    # Calculate overall metrics
     weighted_acc = np.mean(all_correctness)
-
    min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
     max_acc_subject = max(results.items(), key=lambda x: x[1])[0]
 
+    # Return results summary
     return {
         "overall_accuracy": weighted_acc,
         "min_accuracy_subject": (min_acc_subject, results[min_acc_subject]),