Update mmlu_eval_original.py

mmlu_eval_original.py  CHANGED  (+28 -9)
@@ -12,7 +12,7 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 accuracy_metric = evaluate.load("accuracy")
-
+option_letters = ["A", "B", "C", "D"]
 MAX_CONTEXT_WINDOW = 4096 #Hard-coded for the moment, will be replaced later to be an input from the Model.
 
 def load_dataset_from_hf(verbose=False):
@@ -53,13 +53,31 @@ def format_subject(subject):
 
 
 def format_example(df, idx, include_answer=True):
+    """
+    Format a single example for the prompt based on the actual dataset structure:
+    - Column 0: question text
+    - Column 1: subject
+    - Column 2: choices as a list of strings
+    - Column 3: answer as a numeric index (0-3)
+    """
+    # Get the question text
     prompt = df.iloc[idx, 0]
-
-
-
+
+    # Get the choices from the dataframe
+    options_list = df.iloc[idx, 2]
+    assert(isinstance(options_list, list))
+
+
+    for j, option in enumerate(options_list):
+        prompt += f"\n{option_letters[j]}. {option}"
+
     prompt += "\nAnswer:"
     if include_answer:
-
+        # Convert numeric answer to letter
+        answer_num = df.iloc[idx, 3]
+        answer_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[answer_num]
+        prompt += f" {answer_letter}\n\n"
+
     return prompt
 
 
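Note: to make the new formatting concrete, here is a minimal, self-contained sketch of what format_example now produces. The DataFrame row is invented for illustration; the function body mirrors the new code above (using option_letters for the answer as well, which is equivalent to the numeric-to-letter dict):

    import pandas as pd

    option_letters = ["A", "B", "C", "D"]

    def format_example(df, idx, include_answer=True):
        # Question text, then lettered choices, then the answer letter.
        prompt = df.iloc[idx, 0]
        for j, option in enumerate(df.iloc[idx, 2]):
            prompt += f"\n{option_letters[j]}. {option}"
        prompt += "\nAnswer:"
        if include_answer:
            prompt += f" {option_letters[df.iloc[idx, 3]]}\n\n"
        return prompt

    # Hypothetical row in the question/subject/choices/answer layout:
    df = pd.DataFrame([["What is 2 + 2?", "math", ["3", "4", "5", "6"], 1]])
    print(format_example(df, 0))
    # What is 2 + 2?
    # A. 3
    # B. 4
    # C. 5
    # D. 6
    # Answer: B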
@@ -70,7 +88,7 @@ def gen_prompt(df, subject, k=-1):
     if k == -1:
         k = df.shape[0]
     for i in range(k):
-        prompt += format_example(df, i)
+        prompt += format_example(df, i, include_answer=True)
     return prompt
 
 
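With include_answer passed explicitly, a k-shot prompt is just k answered dev examples followed by one unanswered test question. Continuing the sketch above (dev_df and test_df are hypothetical stand-ins):

    dev_df = pd.DataFrame([["What is 3 + 3?", "math", ["5", "6", "7", "8"], 1]])
    test_df = pd.DataFrame([["What is 2 + 2?", "math", ["3", "4", "5", "6"], 1]])

    few_shot = "".join(format_example(dev_df, i, include_answer=True)
                       for i in range(len(dev_df)))
    question = format_example(test_df, 0, include_answer=False)
    prompt = few_shot + question  # ends with "Answer:", so the next token is scored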
@@ -107,7 +125,8 @@ def eval (subject, model, tokenizer, dev_df, test_df, num_questions_per_subject=
         logger.info (f"Sample: {i}")
 
 
-        label = test_df.iloc[i,
+        label = test_df.iloc[i, 3]
+        label_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[label]
 
         logits = model(input_ids=input_ids).logits[0, -1]
 
@@ -130,10 +149,10 @@ def eval (subject, model, tokenizer, dev_df, test_df, num_questions_per_subject=
         )
         pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
 
-        cor = pred ==
+        cor = pred == label_letter
         if (i == 0):
             logger.info (f"Prompt: {prompt}")
-            logger.info(f"
+            logger.info(f"Label_Letter: {label_letter}")
             logger.info(f"Logits: {logits}")
             logger.info(f"Probabilities: {probs}")
             logger.info(f"Prediction: {pred}")
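The probs computation itself is outside this diff, but the surrounding lines imply the pattern: take the next-token logits at the last position, restrict them to the four option letters, softmax, and map the argmax back to a letter, which is then compared against label_letter. A rough sketch of that step, assuming a Hugging Face causal LM whose option letters encode to single tokens (predict_letter and the token-id lookup are illustrative, not code from this file):

    import numpy as np
    import torch

    def predict_letter(model, tokenizer, prompt):
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        logits = model(input_ids=input_ids).logits[0, -1]  # next-token logits
        # Assumption: each " A"/" B"/" C"/" D" encodes to a single trailing token.
        option_ids = [tokenizer(f" {letter}", add_special_tokens=False).input_ids[-1]
                      for letter in ["A", "B", "C", "D"]]
        probs = torch.softmax(logits[option_ids], dim=0).detach().numpy()
        return {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]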