Spaces:
Sleeping
Sleeping
Update mmlu_pro_eval_adapted.py
Browse files- mmlu_pro_eval_adapted.py +3 -3
mmlu_pro_eval_adapted.py
CHANGED
@@ -14,12 +14,13 @@ import logging
|
|
14 |
import sys
|
15 |
from datasets import load_dataset
|
16 |
import pandas as pd
|
|
|
17 |
|
18 |
logging.basicConfig(level=logging.INFO)
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
21 |
# Can be found at https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/cot_prompt_lib/initial_prompt.txt
|
22 |
-
initial_prompt = "The following are multiple choice questions (with answers) about {$}. Think step by step and then finish your answer with "the answer is (X)" where X is the correct letter choice."
|
23 |
|
24 |
choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]
|
25 |
max_model_length = 4096
|
@@ -127,7 +128,7 @@ def batch_inference(llm, sampling_params, inference_batch):
|
|
127 |
response_batch.append(generated_text)
|
128 |
pred = extract_answer(generated_text)
|
129 |
pred_batch.append(pred)
|
130 |
-
logging.info("PRED BATCH:
|
131 |
return pred_batch, response_batch
|
132 |
|
133 |
|
@@ -235,7 +236,6 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
|
|
235 |
'Accuracy': acc
|
236 |
})
|
237 |
|
238 |
-
import numpy as np # Added: missing import
|
239 |
weighted_acc = np.mean(all_correctness)
|
240 |
|
241 |
min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
|
|
|
14 |
import sys
|
15 |
from datasets import load_dataset
|
16 |
import pandas as pd
|
17 |
+
import numpy as np
|
18 |
|
19 |
logging.basicConfig(level=logging.INFO)
|
20 |
logger = logging.getLogger(__name__)
|
21 |
|
22 |
# Can be found at https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/cot_prompt_lib/initial_prompt.txt
|
23 |
+
initial_prompt = "The following are multiple choice questions (with answers) about {$}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
|
24 |
|
25 |
choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]
|
26 |
max_model_length = 4096
|
|
|
128 |
response_batch.append(generated_text)
|
129 |
pred = extract_answer(generated_text)
|
130 |
pred_batch.append(pred)
|
131 |
+
logging.info("PRED BATCH: %s, RESPONSE BATCH: %s", pred_batch, response_batch)
|
132 |
return pred_batch, response_batch
|
133 |
|
134 |
|
|
|
236 |
'Accuracy': acc
|
237 |
})
|
238 |
|
|
|
239 |
weighted_acc = np.mean(all_correctness)
|
240 |
|
241 |
min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
|