import torch
import random
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces

# Load the exact-match metric: evaluate's "accuracy" metric expects integer
# labels, while the comparisons below are free-text strings.
exact_match_metric = evaluate.load("exact_match")

# Load the MMLU dataset ("all" config: every subject in a single split)
mmlu_dataset = load_dataset("cais/mmlu", "all")

@spaces.GPU
def generate_answer(model, tokenizer, question):
"""
Generates an answer using Mistral's instruction format.
"""
prompt = f"<s>[INST] {question}. Provide only the correct answer. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=50,
temperature=0.0,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id
)
return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
def evaluate_mmlu(model, tokenizer, num_questions_per_task=5):
"""
Evaluates the model on MMLU across all 57 tasks.
Returns:
- Overall accuracy
- Min accuracy task
- Max accuracy task
- Two correct examples
- Two incorrect examples
"""
    results = {}
    correct_examples = []
    incorrect_examples = []

    # The "all" config keeps every subject in one split, so group the test
    # split by subject to score each of the 57 MMLU tasks separately.
    tasks = {}
    for sample in mmlu_dataset["test"]:
        tasks.setdefault(sample["subject"], []).append(sample)

    for task_name, task_samples in tasks.items():
        print("TASK NAME:", task_name)
        sampled_questions = random.sample(task_samples, min(num_questions_per_task, len(task_samples)))
        predictions = []
        references = []
        for sample in sampled_questions:
            print("SAMPLE:", sample)
            # MMLU stores the answer as an index into `choices`; show the model
            # the choices and compare against the text of the correct one.
            choices = sample["choices"]
            question = f"{sample['question']} Choices: {', '.join(choices)}."
            correct_answer = str(choices[sample["answer"]]).strip().lower()
            model_output = generate_answer(model, tokenizer, question).strip().lower()
            predictions.append(model_output)
            references.append(correct_answer)
            # Store a couple of correct and incorrect examples for inspection
            if model_output == correct_answer and len(correct_examples) < 2:
                correct_examples.append((task_name, question, model_output, correct_answer))
            elif model_output != correct_answer and len(incorrect_examples) < 2:
                incorrect_examples.append((task_name, question, model_output, correct_answer))
        # Compute exact-match accuracy for the task
        norm_preds = [str(p).lower().strip() for p in predictions]
        norm_refs = [str(r).lower().strip() for r in references]
        task_accuracy = exact_match_metric.compute(predictions=norm_preds, references=norm_refs)["exact_match"]
        results[task_name] = task_accuracy
    # Compute overall statistics
    overall_accuracy = sum(results.values()) / len(results)
    min_task = min(results, key=results.get)
    max_task = max(results, key=results.get)

    return {
        "overall_accuracy": overall_accuracy,
        "min_accuracy_task": (min_task, results[min_task]),
        "max_accuracy_task": (max_task, results[max_task]),
        "correct_examples": correct_examples,
        "incorrect_examples": incorrect_examples,
    }
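

# Minimal usage sketch. Assumptions (not shown elsewhere in this file): the
# Space evaluates a Mistral instruction-tuned checkpoint; the model name below
# is a hypothetical choice for illustration, not necessarily the one deployed.
if __name__ == "__main__":
    model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # assumed checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")

    report = evaluate_mmlu(model, tokenizer, num_questions_per_task=5)
    print("Overall accuracy:", report["overall_accuracy"])
    print("Lowest-accuracy task:", report["min_accuracy_task"])
    print("Highest-accuracy task:", report["max_accuracy_task"])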