rohansampath committed on
Commit
ea8fa3f
verified · 1 Parent(s): 397d798

Update mmlu_pro_eval_adapted.py

Files changed (1)
  1. mmlu_pro_eval_adapted.py +70 -6
mmlu_pro_eval_adapted.py CHANGED
@@ -149,7 +149,7 @@ def extract_final(text):
149
  def batch_inference(llm, sampling_params, inference_batch):
150
  start = time.time()
151
  outputs = llm.generate(inference_batch, sampling_params)
152
- logging.info(str(len(inference_batch)) + " size batch costing time: " + str(time.time() - start))
153
  response_batch = []
154
  pred_batch = []
155
  for output in outputs:
@@ -157,9 +157,72 @@ def batch_inference(llm, sampling_params, inference_batch):
157
  response_batch.append(generated_text)
158
  pred = extract_answer(generated_text)
159
  pred_batch.append(pred)
160
- logging.info("PRED BATCH: %s, RESPONSE BATCH: %s", pred_batch, response_batch)
161
  return pred_batch, response_batch
162
163
 
164
  def calculate_accuracy(res):
165
  """
@@ -190,7 +253,7 @@ def calculate_accuracy(res):
190
 
191
 
192
  @torch.no_grad()
193
- def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
194
  """
195
  Evaluate model using chain-of-thought prompting.
196
 
@@ -231,8 +294,9 @@ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
231
 
232
  inference_batches.append(prompt)
233
 
234
- # Get model predictions
235
- pred_batch, response_batch = batch_inference(llm, sampling_params, inference_batches)
 
236
 
237
  # Add predictions to test DataFrame
238
  results_df = test_df.copy()
@@ -247,7 +311,7 @@ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
247
  return correctness, accuracy
248
 
249
 
250
- @spaces.GPU(duration=240) # Extended to 3 minutes for larger evaluations
251
  def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
252
  """
253
  Main evaluation function for MMLU-Pro benchmark.
 
149
  def batch_inference(llm, sampling_params, inference_batch):
150
  start = time.time()
151
  outputs = llm.generate(inference_batch, sampling_params)
152
+ logging.info("Batch of size: ", str(len(inference_batch)) + ". Time taken: " + str(time.time() - start))
153
  response_batch = []
154
  pred_batch = []
155
  for output in outputs:
 
157
  response_batch.append(generated_text)
158
  pred = extract_answer(generated_text)
159
  pred_batch.append(pred)
 
160
  return pred_batch, response_batch
161
 
162
+ def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer):
163
+ start = time.time()
164
+ outputs = llm.generate(inference_batch, sampling_params)
165
+ logging.info("Batch of size %d took %.2f s", len(inference_batch), time.time() - start)
166
+ response_batch = []
167
+ pred_batch = []
168
+ input_token_counts = []
169
+ output_token_counts = []
170
+
171
+ for i, output in enumerate(outputs):
172
+ generated_text = output.outputs[0].text
173
+ response_batch.append(generated_text)
174
+ pred = extract_answer(generated_text)
175
+ pred_batch.append(pred)
176
+
177
+ # Proper token count using tokenizer
178
+ input_tokens = len(tokenizer.encode(inference_batch[i]))
179
+ output_tokens = len(tokenizer.encode(generated_text))
180
+
181
+ input_token_counts.append(input_tokens)
182
+ output_token_counts.append(output_tokens)
183
+
184
+ logging.info("PRED BATCH: %s", pred_batch)
185
+ logging.info("RESPONSE BATCH: %s", response_batch)
186
+
187
+ # Convert to DataFrame for logging (handle cases with fewer than 40 requests)
188
+ num_samples = min(40, len(inference_batch))
189
+ summary_df = pd.DataFrame({
190
+ 'Input': inference_batch[:num_samples],
191
+ 'Response': response_batch[:num_samples]
192
+ })
193
+ logging.info("\nSummary of first %d requests and responses:\n%s", num_samples, summary_df.to_string())
194
+
195
+ # Total and average input/output token statistics
196
+ total_input_tokens = sum(input_token_counts)
197
+ total_output_tokens = sum(output_token_counts)
198
+ avg_input_tokens = total_input_tokens / len(input_token_counts)
199
+ avg_output_tokens = total_output_tokens / len(output_token_counts)
200
+
201
+ max_input_idx = np.argmax(input_token_counts)
202
+ max_output_idx = np.argmax(output_token_counts)
203
+ min_input_idx = np.argmin(input_token_counts)
204
+ min_output_idx = np.argmin(output_token_counts)
205
+
206
+ logging.info("\nTotal input tokens: %d", total_input_tokens)
207
+ logging.info("Total output tokens: %d", total_output_tokens)
208
+ logging.info("Average input tokens: %.2f", avg_input_tokens)
209
+ logging.info("Average output tokens: %.2f", avg_output_tokens)
210
+
211
+ logging.info("\nRequest with max input tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
212
+ max_input_idx, input_token_counts[max_input_idx], inference_batch[max_input_idx], response_batch[max_input_idx])
213
+
214
+ logging.info("\nRequest with max output tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
215
+ max_output_idx, output_token_counts[max_output_idx], inference_batch[max_output_idx], response_batch[max_output_idx])
216
+
217
+ logging.info("\nRequest with min input tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
218
+ min_input_idx, input_token_counts[min_input_idx], inference_batch[min_input_idx], response_batch[min_input_idx])
219
+
220
+ logging.info("\nRequest with min output tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
221
+ min_output_idx, output_token_counts[min_output_idx], inference_batch[min_output_idx], response_batch[min_output_idx])
222
+
223
+ return pred_batch, response_batch
224
+
225
+
226
 
227
  def calculate_accuracy(res):
228
  """
 
253
 
254
 
255
  @torch.no_grad()
256
+ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode=True):
257
  """
258
  Evaluate model using chain-of-thought prompting.
259
 
 
294
 
295
  inference_batches.append(prompt)
296
 
297
+
298
+ batch_fn = (lambda *args: batch_inference_debug_mode(*args, tokenizer)) if debug_mode else batch_inference  # debug variant also needs the tokenizer
299
+ pred_batch, response_batch = batch_fn(llm, sampling_params, inference_batches)
300
 
301
  # Add predictions to test DataFrame
302
  results_df = test_df.copy()
 
311
  return correctness, accuracy
312
 
313
 
314
+ @spaces.GPU(duration=240) # Extended to 4 minutes for larger evaluations
315
  def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
316
  """
317
  Main evaluation function for MMLU-Pro benchmark.
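
For reference, a minimal standalone sketch of the token-accounting idea behind the new batch_inference_debug_mode helper. The whitespace count_tokens stand-in, example prompts, and responses below are illustrative placeholders, not part of this commit; the real code counts tokens with the model's Hugging Face tokenizer.

import numpy as np
import pandas as pd

def count_tokens(text):
    # Stand-in for tokenizer.encode(...); the real helper uses the model's tokenizer.
    return len(text.split())

inference_batch = ["Question: What is 2 + 2? Answer:", "Question: Which planet is largest? Answer:"]
response_batch = ["The answer is (D) 4.", "Jupiter. The answer is (A)."]

input_token_counts = [count_tokens(p) for p in inference_batch]
output_token_counts = [count_tokens(r) for r in response_batch]

# Summary table of the first few requests, analogous to what the debug helper logs.
summary_df = pd.DataFrame({"Input": inference_batch, "Response": response_batch})
print(summary_df.to_string())

# Aggregate statistics, mirroring the totals/averages/extremes logged in debug mode.
print("total input tokens:", sum(input_token_counts))
print("avg output tokens: %.2f" % (sum(output_token_counts) / len(output_token_counts)))
print("longest prompt index:", int(np.argmax(input_token_counts)))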