Update mmlu_pro_eval_adapted.py
mmlu_pro_eval_adapted.py  (CHANGED, +70 -6)
@@ -149,7 +149,7 @@ def extract_final(text):
 def batch_inference(llm, sampling_params, inference_batch):
     start = time.time()
     outputs = llm.generate(inference_batch, sampling_params)
-    logging.info(str(len(inference_batch)) + " size batch costing time: " + str(time.time() - start))
+    logging.info("Batch of size: %d. Time taken: %.2f s", len(inference_batch), time.time() - start)
     response_batch = []
     pred_batch = []
     for output in outputs:
@@ -157,9 +157,72 @@ def batch_inference(llm, sampling_params, inference_batch):
         response_batch.append(generated_text)
         pred = extract_answer(generated_text)
         pred_batch.append(pred)
-    logging.info("PRED BATCH: %s, RESPONSE BATCH: %s", pred_batch, response_batch)
     return pred_batch, response_batch
 
+def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer):
+    start = time.time()
+    outputs = llm.generate(inference_batch, sampling_params)
+    logging.info(str(len(inference_batch)) + " size batch costing time: " + str(time.time() - start))
+    response_batch = []
+    pred_batch = []
+    input_token_counts = []
+    output_token_counts = []
+
+    for i, output in enumerate(outputs):
+        generated_text = output.outputs[0].text
+        response_batch.append(generated_text)
+        pred = extract_answer(generated_text)
+        pred_batch.append(pred)
+
+        # Proper token count using tokenizer
+        input_tokens = len(tokenizer.encode(inference_batch[i]))
+        output_tokens = len(tokenizer.encode(generated_text))
+
+        input_token_counts.append(input_tokens)
+        output_token_counts.append(output_tokens)
+
+    logging.info("PRED BATCH: %s", pred_batch)
+    logging.info("RESPONSE BATCH: %s", response_batch)
+
+    # Convert to DataFrame for logging (handle cases with fewer than 40 requests)
+    num_samples = min(40, len(inference_batch))
+    summary_df = pd.DataFrame({
+        'Input': inference_batch[:num_samples],
+        'Response': response_batch[:num_samples]
+    })
+    logging.info("\nSummary of first %d requests and responses:\n%s", num_samples, summary_df.to_string())
+
+    # Total and average input/output token statistics
+    total_input_tokens = sum(input_token_counts)
+    total_output_tokens = sum(output_token_counts)
+    avg_input_tokens = total_input_tokens / len(input_token_counts)
+    avg_output_tokens = total_output_tokens / len(output_token_counts)
+
+    max_input_idx = np.argmax(input_token_counts)
+    max_output_idx = np.argmax(output_token_counts)
+    min_input_idx = np.argmin(input_token_counts)
+    min_output_idx = np.argmin(output_token_counts)
+
+    logging.info("\nTotal input tokens: %d", total_input_tokens)
+    logging.info("Total output tokens: %d", total_output_tokens)
+    logging.info("Average input tokens: %.2f", avg_input_tokens)
+    logging.info("Average output tokens: %.2f", avg_output_tokens)
+
+    logging.info("\nRequest with max input tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+                 max_input_idx, input_token_counts[max_input_idx], inference_batch[max_input_idx], response_batch[max_input_idx])
+
+    logging.info("\nRequest with max output tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+                 max_output_idx, output_token_counts[max_output_idx], inference_batch[max_output_idx], response_batch[max_output_idx])
+
+    logging.info("\nRequest with min input tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+                 min_input_idx, input_token_counts[min_input_idx], inference_batch[min_input_idx], response_batch[min_input_idx])
+
+    logging.info("\nRequest with min output tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+                 min_output_idx, output_token_counts[min_output_idx], inference_batch[min_output_idx], response_batch[min_output_idx])
+
+    return pred_batch, response_batch
+
+
 
 def calculate_accuracy(res):
     """
@@ -190,7 +253,7 @@ def calculate_accuracy(res):
 
 
 @torch.no_grad()
-def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
+def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode=True):
     """
     Evaluate model using chain-of-thought prompting.
 
@@ -231,8 +294,9 @@ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
 
         inference_batches.append(prompt)
 
-    pred_batch, response_batch = batch_inference(llm, sampling_params, inference_batches)
-
+
+    batch_fn = batch_inference_debug_mode if debug_mode else batch_inference  # debug variant also expects the tokenizer
+    pred_batch, response_batch = batch_fn(llm, sampling_params, inference_batches, *((tokenizer,) if debug_mode else ()))
 
     # Add predictions to test DataFrame
     results_df = test_df.copy()
@@ -247,7 +311,7 @@ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
     return correctness, accuracy
 
 
-@spaces.GPU(duration=240)  # Extended to
+@spaces.GPU(duration=240)  # Extended to 4 minutes for larger evaluations
 def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
     """
     Main evaluation function for MMLU-Pro benchmark.