rohansampath committed (verified)
Commit: 84c9e35 · Parent(s): 77d0f13

Update mmlu_eval_original.py

Files changed (1): mmlu_eval_original.py (+28/-9)
mmlu_eval_original.py CHANGED
@@ -12,7 +12,7 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 accuracy_metric = evaluate.load("accuracy")
-choices = ["A", "B", "C", "D"]
+option_letters = ["A", "B", "C", "D"]
 MAX_CONTEXT_WINDOW = 4096 #Hard-coded for the moment, will be replaced later to be an input from the Model.
 
 def load_dataset_from_hf(verbose=False):
@@ -53,13 +53,31 @@ def format_subject(subject):
 
 
 def format_example(df, idx, include_answer=True):
+    """
+    Format a single example for the prompt based on the actual dataset structure:
+    - Column 0: question text
+    - Column 1: subject
+    - Column 2: choices as a list of strings
+    - Column 3: answer as a numeric index (0-3)
+    """
+    # Get the question text
     prompt = df.iloc[idx, 0]
-    k = df.shape[1] - 2
-    for j in range(k):
-        prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
+
+    # Get the choices from the dataframe
+    options_list = df.iloc[idx, 2]
+    assert(isinstance(options_list, list))
+
+
+    for j, option in enumerate(options_list):
+        prompt += f"\n{option_letters[j]}. {option}"
+
     prompt += "\nAnswer:"
     if include_answer:
-        prompt += " {}\n\n".format(df.iloc[idx, k + 1])
+        # Convert numeric answer to letter
+        answer_num = df.iloc[idx, 3]
+        answer_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[answer_num]
+        prompt += f" {answer_letter}\n\n"
+
     return prompt
 
 
@@ -70,7 +88,7 @@ def gen_prompt(df, subject, k=-1):
     if k == -1:
         k = df.shape[0]
     for i in range(k):
-        prompt += format_example(df, i)
+        prompt += format_example(df, i, include_answer=True)
     return prompt
 
 
@@ -107,7 +125,8 @@ def eval (subject, model, tokenizer, dev_df, test_df, num_questions_per_subject=
        logger.info (f"Sample: {i}")
 
 
-       label = test_df.iloc[i, test_df.shape[1] - 1]
+       label = test_df.iloc[i, 3]
+       label_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[label]
 
        logits = model(input_ids=input_ids).logits[0, -1]
 
@@ -130,10 +149,10 @@
        )
        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
 
-       cor = pred == label
+       cor = pred == label_letter
       if (i == 0):
            logger.info (f"Prompt: {prompt}")
-           logger.info(f"Label: {label}")
+           logger.info(f"Label_Letter: {label_letter}")
            logger.info(f"Logits: {logits}")
            logger.info(f"Probabilities: {probs}")
            logger.info(f"Prediction: {pred}")