Update mmlu_eval.py
mmlu_eval.py  (+2, -0)
@@ -43,6 +43,7 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
     incorrect_examples = []
 
     for task_name in mmlu_dataset.keys():
+        console.log ("TASK NAME", task_name)
         dataset = mmlu_dataset[task_name]
         sampled_questions = random.sample(list(dataset), min(num_questions_per_task, len(dataset)))
 
@@ -50,6 +51,7 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
         references = []
 
         for sample in sampled_questions:
+            console.log ("SAMPLE", sample)
            question = sample["question"]
            correct_answer = str(sample["answer"]).strip().lower()
            model_output = generate_answer(model, tokenizer, question).strip().lower()
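For context, below is a minimal sketch of how the touched section of evaluate_mmlu might fit together. Only the lines that appear in the diff come from this commit; everything else is an assumption: the per-subject mmlu_dataset dict, the generate_answer helper, and the accuracy bookkeeping are hypothetical reconstructions, and console is assumed to be a rich Console instance (in plain Python, console.log would otherwise raise a NameError).

import random

from datasets import load_dataset
from rich.console import Console

console = Console()  # assumption: `console` in the diff is a rich Console

# Hypothetical setup: a dict mapping each MMLU subject to its test split.
# Two subjects are loaded here purely for illustration.
mmlu_dataset = {
    subject: load_dataset("cais/mmlu", subject, split="test")
    for subject in ["abstract_algebra", "anatomy"]
}

def generate_answer(model, tokenizer, question):
    # Hypothetical helper (defined elsewhere in mmlu_eval.py): prompt the model
    # with the raw question and decode a short free-form answer.
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=16)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
    correct = 0
    total = 0
    incorrect_examples = []

    for task_name in mmlu_dataset.keys():
        console.log("TASK NAME", task_name)  # line added by this commit
        dataset = mmlu_dataset[task_name]
        sampled_questions = random.sample(list(dataset), min(num_questions_per_task, len(dataset)))

        predictions = []
        references = []

        for sample in sampled_questions:
            console.log("SAMPLE", sample)  # line added by this commit
            question = sample["question"]
            correct_answer = str(sample["answer"]).strip().lower()
            model_output = generate_answer(model, tokenizer, question).strip().lower()

            # Hypothetical bookkeeping: exact string match against the reference.
            predictions.append(model_output)
            references.append(correct_answer)
            total += 1
            if model_output == correct_answer:
                correct += 1
            else:
                incorrect_examples.append(
                    {"task": task_name, "question": question,
                     "expected": correct_answer, "got": model_output}
                )

    accuracy = correct / total if total else 0.0
    return accuracy, incorrect_examples

Since rich is an extra dependency, the same debug output could also come from print() or the logging module; console.log only works here if a Console instance named console already exists in the module.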