rohansampath committed
Commit 49b1770 · verified · 1 parent: d99c3bf

Update mmlu_pro_eval_adapted.py

Files changed (1):
  mmlu_pro_eval_adapted.py (+10 -10)
mmlu_pro_eval_adapted.py CHANGED
@@ -181,8 +181,8 @@ def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer)
     input_token_counts.append(input_tokens)
     output_token_counts.append(output_tokens)
 
-    logging.info("PRED BATCH: %s", pred_batch)
-    logging.info("RESPONSE BATCH: %s", response_batch)
+    logging.info("\n----------- PRED BATCH -----------\n%s", pred_batch)
+    logging.info("\n----------- RESPONSE BATCH -----------\n%s", response_batch)
 
     # Convert to DataFrame for logging (handle cases with fewer than 40 requests)
     num_samples = min(40, len(inference_batch))
@@ -190,7 +190,7 @@ def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer)
         'Input': inference_batch[:num_samples],
         'Response': response_batch[:num_samples]
     })
-    logging.info("\nSummary of first %d requests and responses:\n%s", num_samples, summary_df.to_string())
+    logging.info("\n----------- Summary of first %d requests and responses -----------\n%s", num_samples, summary_df.to_string())
 
     # Total and average input/output token statistics
     total_input_tokens = sum(input_token_counts)
@@ -203,27 +203,27 @@ def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer)
     min_input_idx = np.argmin(input_token_counts)
     min_output_idx = np.argmin(output_token_counts)
 
-    logging.info("\nTotal input tokens: %d", total_input_tokens)
+    logging.info("\n----------- Token Statistics -----------")
+    logging.info("Total input tokens: %d", total_input_tokens)
     logging.info("Total output tokens: %d", total_output_tokens)
     logging.info("Average input tokens: %.2f", avg_input_tokens)
     logging.info("Average output tokens: %.2f", avg_output_tokens)
 
-    logging.info("\nRequest with max input tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+    logging.info("\n----------- Request with max input tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                  max_input_idx, input_token_counts[max_input_idx], inference_batch[max_input_idx], response_batch[max_input_idx])
 
-    logging.info("\nRequest with max output tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+    logging.info("\n----------- Request with max output tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                  max_output_idx, output_token_counts[max_output_idx], inference_batch[max_output_idx], response_batch[max_output_idx])
 
-    logging.info("\nRequest with min input tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+    logging.info("\n----------- Request with min input tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                  min_input_idx, input_token_counts[min_input_idx], inference_batch[min_input_idx], response_batch[min_input_idx])
 
-    logging.info("\nRequest with min output tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+    logging.info("\n----------- Request with min output tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                  min_output_idx, output_token_counts[min_output_idx], inference_batch[min_output_idx], response_batch[min_output_idx])
-
+
     return pred_batch, response_batch
 
 
-
 def calculate_accuracy(res):
     """
     Calculate accuracy and return an array of correctness (1 if correct, 0 if wrong)
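For context, here is a minimal, self-contained sketch of the token-statistics logging pattern the last hunk implements (totals, averages, and the extreme requests via numpy argmin/argmax). The helper name log_token_statistics and the toy data are illustrative only and are not part of mmlu_pro_eval_adapted.py.

import logging

import numpy as np

logging.basicConfig(level=logging.INFO, format="%(message)s")


def log_token_statistics(input_token_counts, output_token_counts, inference_batch, response_batch):
    """Log totals, averages, and the requests with extreme input token counts."""
    total_input_tokens = sum(input_token_counts)
    total_output_tokens = sum(output_token_counts)
    avg_input_tokens = total_input_tokens / len(input_token_counts)
    avg_output_tokens = total_output_tokens / len(output_token_counts)

    # Indices of the requests with the largest and smallest prompts
    max_input_idx = np.argmax(input_token_counts)
    min_input_idx = np.argmin(input_token_counts)

    logging.info("\n----------- Token Statistics -----------")
    logging.info("Total input tokens: %d", total_input_tokens)
    logging.info("Total output tokens: %d", total_output_tokens)
    logging.info("Average input tokens: %.2f", avg_input_tokens)
    logging.info("Average output tokens: %.2f", avg_output_tokens)
    logging.info("\n----------- Request with max input tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                 max_input_idx, input_token_counts[max_input_idx],
                 inference_batch[max_input_idx], response_batch[max_input_idx])
    logging.info("\n----------- Request with min input tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                 min_input_idx, input_token_counts[min_input_idx],
                 inference_batch[min_input_idx], response_batch[min_input_idx])


# Example usage with toy data (values are illustrative):
log_token_statistics([120, 95, 140], [30, 12, 55],
                     ["prompt A", "prompt B", "prompt C"],
                     ["answer A", "answer B", "answer C"])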