Update mmlu_eval.py

mmlu_eval.py  CHANGED  +18 -7
@@ -35,30 +35,39 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
     - Overall accuracy
     - Min accuracy task
     - Max accuracy task
+    - Two correct examples
+    - Two incorrect examples
     """
     results = {}
-
+    correct_examples = []
+    incorrect_examples = []
+
     for task_name in mmlu_dataset.keys():
         dataset = mmlu_dataset[task_name]
         sampled_questions = random.sample(list(dataset), min(num_questions_per_task, len(dataset)))
-
+
         predictions = []
         references = []
-
+
         for sample in sampled_questions:
             question = sample["question"]
-            correct_answer = sample["answer"]
-
-            model_output = generate_answer(model, tokenizer, question)
+            correct_answer = str(sample["answer"]).strip().lower()
+            model_output = generate_answer(model, tokenizer, question).strip().lower()

             predictions.append(model_output)
             references.append(correct_answer)

+            # Store examples
+            if model_output == correct_answer and len(correct_examples) < 2:
+                correct_examples.append((task_name, question, model_output, correct_answer))
+            elif model_output != correct_answer and len(incorrect_examples) < 2:
+                incorrect_examples.append((task_name, question, model_output, correct_answer))
+
         # Compute accuracy for the task
         norm_preds = [str(p).lower().strip() for p in predictions]
         norm_refs = [str(r).lower().strip() for r in references]
         task_accuracy = accuracy_metric.compute(predictions=norm_preds, references=norm_refs)["accuracy"]
-
+
         results[task_name] = task_accuracy

     # Compute overall statistics
@@ -70,4 +79,6 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
         "overall_accuracy": overall_accuracy,
         "min_accuracy_task": (min_task, results[min_task]),
         "max_accuracy_task": (max_task, results[max_task]),
+        "correct_examples": correct_examples,
+        "incorrect_examples": incorrect_examples,
     }
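For orientation only, a minimal usage sketch of the updated evaluate_mmlu follows. It assumes the rest of mmlu_eval.py already provides model, tokenizer, mmlu_dataset, generate_answer, and accuracy_metric, as the diff implies; the calling code and print formatting below are illustrative and not part of this commit.

# Usage sketch (not part of the commit): assumes model and tokenizer are
# already loaded earlier in mmlu_eval.py, as the function signature implies.
results = evaluate_mmlu(model, tokenizer, num_questions_per_task=5)

print(f"Overall accuracy: {results['overall_accuracy']:.3f}")

worst_task, worst_acc = results["min_accuracy_task"]
best_task, best_acc = results["max_accuracy_task"]
print(f"Weakest task:   {worst_task} ({worst_acc:.3f})")
print(f"Strongest task: {best_task} ({best_acc:.3f})")

# New in this commit: up to two correct and two incorrect examples, each a
# (task_name, question, model_output, correct_answer) tuple.
for task_name, question, model_output, correct_answer in results["correct_examples"]:
    print(f"[correct]   {task_name}: {question!r} -> {model_output!r}")
for task_name, question, model_output, correct_answer in results["incorrect_examples"]:
    print(f"[incorrect] {task_name}: {question!r} -> {model_output!r} (expected {correct_answer!r})")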