Update mmlu_eval.py
mmlu_eval.py  +2 -0
@@ -43,6 +43,7 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
     incorrect_examples = []

     for task_name in mmlu_dataset.keys():
+        print("TASK NAME", task_name)
         dataset = mmlu_dataset[task_name]
         sampled_questions = random.sample(list(dataset), min(num_questions_per_task, len(dataset)))

@@ -50,6 +51,7 @@ def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
         references = []

         for sample in sampled_questions:
+            print("SAMPLE", sample)
             question = sample["question"]
             correct_answer = str(sample["answer"]).strip().lower()
             model_output = generate_answer(model, tokenizer, question).strip().lower()
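For context, the evaluation loop that this commit instruments might look roughly like the sketch below. It is a guess reconstructed from the visible context lines only: how mmlu_dataset is loaded, what generate_answer does internally, the predictions list, and the accuracy bookkeeping are not shown in the diff and are assumptions, not the Space's actual code.

# Rough sketch of evaluate_mmlu as suggested by the context lines above.
# mmlu_dataset (task name -> iterable of {"question", "answer"} samples) and
# generate_answer(model, tokenizer, question) -> str are assumed to be defined
# elsewhere in mmlu_eval.py; they are placeholders here, not the real code.
import random

def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
    incorrect_examples = []
    correct, total = 0, 0

    for task_name in mmlu_dataset.keys():
        print("TASK NAME", task_name)  # debug output added by this commit
        dataset = mmlu_dataset[task_name]
        sampled_questions = random.sample(list(dataset), min(num_questions_per_task, len(dataset)))

        predictions = []
        references = []

        for sample in sampled_questions:
            print("SAMPLE", sample)  # debug output added by this commit
            question = sample["question"]
            correct_answer = str(sample["answer"]).strip().lower()
            model_output = generate_answer(model, tokenizer, question).strip().lower()

            predictions.append(model_output)
            references.append(correct_answer)
            total += 1
            if model_output == correct_answer:
                correct += 1
            else:
                incorrect_examples.append({
                    "task": task_name,
                    "question": question,
                    "expected": correct_answer,
                    "predicted": model_output,
                })

    accuracy = correct / total if total else 0.0
    return accuracy, incorrect_examples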