Update mmlu_pro_eval_adapted.py
mmlu_pro_eval_adapted.py  CHANGED  +10 -10
@@ -181,8 +181,8 @@ def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer)
     input_token_counts.append(input_tokens)
     output_token_counts.append(output_tokens)

-    logging.info("PRED BATCH …
-    logging.info("RESPONSE BATCH …
+    logging.info("\n----------- PRED BATCH -----------\n%s", pred_batch)
+    logging.info("\n----------- RESPONSE BATCH -----------\n%s", response_batch)

     # Convert to DataFrame for logging (handle cases with fewer than 40 requests)
     num_samples = min(40, len(inference_batch))
@@ -190,7 +190,7 @@ def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer)
         'Input': inference_batch[:num_samples],
         'Response': response_batch[:num_samples]
     })
-    logging.info("\…
+    logging.info("\n----------- Summary of first %d requests and responses -----------\n%s", num_samples, summary_df.to_string())

     # Total and average input/output token statistics
     total_input_tokens = sum(input_token_counts)
@@ -203,27 +203,27 @@ def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer)
     min_input_idx = np.argmin(input_token_counts)
     min_output_idx = np.argmin(output_token_counts)

-    logging.info("\…
+    logging.info("\n----------- Token Statistics -----------")
+    logging.info("Total input tokens: %d", total_input_tokens)
     logging.info("Total output tokens: %d", total_output_tokens)
     logging.info("Average input tokens: %.2f", avg_input_tokens)
     logging.info("Average output tokens: %.2f", avg_output_tokens)

-    logging.info("\…
+    logging.info("\n----------- Request with max input tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                  max_input_idx, input_token_counts[max_input_idx], inference_batch[max_input_idx], response_batch[max_input_idx])

-    logging.info("\…
+    logging.info("\n----------- Request with max output tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                  max_output_idx, output_token_counts[max_output_idx], inference_batch[max_output_idx], response_batch[max_output_idx])

-    logging.info("\…
+    logging.info("\n----------- Request with min input tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                  min_input_idx, input_token_counts[min_input_idx], inference_batch[min_input_idx], response_batch[min_input_idx])

-    logging.info("\…
+    logging.info("\n----------- Request with min output tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                  min_output_idx, output_token_counts[min_output_idx], inference_batch[min_output_idx], response_batch[min_output_idx])
-
+
     return pred_batch, response_batch


-
 def calculate_accuracy(res):
     """
     Calculate accuracy and return an array of correctness (1 if correct, 0 if wrong)
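For context, below is a minimal, self-contained sketch of the debug-logging pattern this commit settles on: tabulate the first few request/response pairs, log total and average token counts, then dump the requests with the most extreme token usage. The helper name log_batch_token_stats and the dummy data are illustrative only; in the Space this logic lives inside batch_inference_debug_mode and runs on the real inference batch.

# Illustrative sketch of the token-statistics logging added in this commit.
# The function name and the dummy inputs below are hypothetical, not part of the Space.
import logging

import numpy as np
import pandas as pd

logging.basicConfig(level=logging.INFO)


def log_batch_token_stats(inference_batch, response_batch, input_token_counts, output_token_counts):
    # Tabulate the first few request/response pairs (the Space caps this at 40).
    num_samples = min(40, len(inference_batch))
    summary_df = pd.DataFrame({
        'Input': inference_batch[:num_samples],
        'Response': response_batch[:num_samples]
    })
    logging.info("\n----------- Summary of first %d requests and responses -----------\n%s",
                 num_samples, summary_df.to_string())

    # Aggregate token statistics, mirroring the %d / %.2f format specifiers in the diff.
    total_input_tokens = sum(input_token_counts)
    total_output_tokens = sum(output_token_counts)
    avg_input_tokens = total_input_tokens / len(input_token_counts)
    avg_output_tokens = total_output_tokens / len(output_token_counts)
    logging.info("\n----------- Token Statistics -----------")
    logging.info("Total input tokens: %d", total_input_tokens)
    logging.info("Total output tokens: %d", total_output_tokens)
    logging.info("Average input tokens: %.2f", avg_input_tokens)
    logging.info("Average output tokens: %.2f", avg_output_tokens)

    # Locate the requests with extreme token usage and log them in full.
    max_input_idx = np.argmax(input_token_counts)
    min_input_idx = np.argmin(input_token_counts)
    logging.info("\n----------- Request with max input tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                 max_input_idx, input_token_counts[max_input_idx],
                 inference_batch[max_input_idx], response_batch[max_input_idx])
    logging.info("\n----------- Request with min input tokens -----------\nIndex: %d (Tokens: %d)\nInput: %s\nOutput: %s",
                 min_input_idx, input_token_counts[min_input_idx],
                 inference_batch[min_input_idx], response_batch[min_input_idx])


# Dummy usage with made-up token counts, purely to show the log output shape.
log_batch_token_stats(
    inference_batch=["Question 1 ...", "Question 2 ..."],
    response_batch=["Answer 1", "Answer 2, somewhat longer"],
    input_token_counts=[12, 7],
    output_token_counts=[3, 9],
)

Passing the values as lazy %-style arguments to logging.info (rather than pre-formatting with f-strings) keeps the formatting cost out of the hot path when the log level filters the message, which matches the style used throughout the diff.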