Update mmlu_eval_original.py

mmlu_eval_original.py  CHANGED  (+28 -9)
@@ -12,7 +12,7 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 accuracy_metric = evaluate.load("accuracy")
-
+option_letters = ["A", "B", "C", "D"]
 MAX_CONTEXT_WINDOW = 4096 #Hard-coded for the moment, will be replaced later to be an input from the Model.
 
 def load_dataset_from_hf(verbose=False):
@@ -53,13 +53,31 @@ def format_subject(subject):
 
 
 def format_example(df, idx, include_answer=True):
+    """
+    Format a single example for the prompt based on the actual dataset structure:
+    - Column 0: question text
+    - Column 1: subject
+    - Column 2: choices as a list of strings
+    - Column 3: answer as a numeric index (0-3)
+    """
+    # Get the question text
     prompt = df.iloc[idx, 0]
-
-
-
+
+    # Get the choices from the dataframe
+    options_list = df.iloc[idx, 2]
+    assert(isinstance(options_list, list))
+
+
+    for j, option in enumerate(options_list):
+        prompt += f"\n{option_letters[j]}. {option}"
+
     prompt += "\nAnswer:"
     if include_answer:
-
+        # Convert numeric answer to letter
+        answer_num = df.iloc[idx, 3]
+        answer_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[answer_num]
+        prompt += f" {answer_letter}\n\n"
+
     return prompt
 
 
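Note: to make the new formatting concrete, here is a minimal, self-contained sketch of what format_example now produces. The DataFrame row is invented for illustration; the function body mirrors the new code above (using option_letters for the answer as well, which is equivalent to the numeric-to-letter dict):

    import pandas as pd

    option_letters = ["A", "B", "C", "D"]

    def format_example(df, idx, include_answer=True):
        # Question text, then lettered choices, then the answer letter.
        prompt = df.iloc[idx, 0]
        for j, option in enumerate(df.iloc[idx, 2]):
            prompt += f"\n{option_letters[j]}. {option}"
        prompt += "\nAnswer:"
        if include_answer:
            prompt += f" {option_letters[df.iloc[idx, 3]]}\n\n"
        return prompt

    # Hypothetical row in the question/subject/choices/answer layout:
    df = pd.DataFrame([["What is 2 + 2?", "math", ["3", "4", "5", "6"], 1]])
    print(format_example(df, 0))
    # What is 2 + 2?
    # A. 3
    # B. 4
    # C. 5
    # D. 6
    # Answer: B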
@@ -70,7 +88,7 @@ def gen_prompt(df, subject, k=-1):
     if k == -1:
         k = df.shape[0]
     for i in range(k):
-        prompt += format_example(df, i)
+        prompt += format_example(df, i, include_answer=True)
     return prompt
 
 
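With include_answer passed explicitly, a k-shot prompt is just k answered dev examples followed by one unanswered test question. Continuing the sketch above (dev_df and test_df are hypothetical stand-ins):

    dev_df = pd.DataFrame([["What is 3 + 3?", "math", ["5", "6", "7", "8"], 1]])
    test_df = pd.DataFrame([["What is 2 + 2?", "math", ["3", "4", "5", "6"], 1]])

    few_shot = "".join(format_example(dev_df, i, include_answer=True)
                       for i in range(len(dev_df)))
    question = format_example(test_df, 0, include_answer=False)
    prompt = few_shot + question  # ends with "Answer:", so the next token is scored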
@@ -107,7 +125,8 @@ def eval (subject, model, tokenizer, dev_df, test_df, num_questions_per_subject=
         logger.info (f"Sample: {i}")
 
 
-        label = test_df.iloc[i,
+        label = test_df.iloc[i, 3]
+        label_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[label]
 
         logits = model(input_ids=input_ids).logits[0, -1]
 
@@ -130,10 +149,10 @@ def eval (subject, model, tokenizer, dev_df, test_df, num_questions_per_subject=
         )
         pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
 
-        cor = pred ==
+        cor = pred == label_letter
         if (i == 0):
             logger.info (f"Prompt: {prompt}")
-            logger.info(f"
+            logger.info(f"Label_Letter: {label_letter}")
             logger.info(f"Logits: {logits}")
             logger.info(f"Probabilities: {probs}")
             logger.info(f"Prediction: {pred}")
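The probs computation itself is outside this diff, but the surrounding lines imply the pattern: take the next-token logits at the last position, restrict them to the four option letters, softmax, and map the argmax back to a letter, which is then compared against label_letter. A rough sketch of that step, assuming a Hugging Face causal LM whose option letters encode to single tokens (predict_letter and the token-id lookup are illustrative, not code from this file):

    import numpy as np
    import torch

    def predict_letter(model, tokenizer, prompt):
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        logits = model(input_ids=input_ids).logits[0, -1]  # next-token logits
        # Assumption: each " A"/" B"/" C"/" D" encodes to a single trailing token.
        option_ids = [tokenizer(f" {letter}", add_special_tokens=False).input_ids[-1]
                      for letter in ["A", "B", "C", "D"]]
        probs = torch.softmax(logits[option_ids], dim=0).detach().numpy()
        return {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]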