Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -195,17 +195,34 @@ def run_evaluation():
|
|
| 195 |
def run_mmlu_evaluation(num_questions):
|
| 196 |
"""
|
| 197 |
Runs the MMLU evaluation with the specified number of questions per task.
|
|
|
|
| 198 |
"""
|
| 199 |
results = evaluate_mmlu(model, tokenizer, num_questions)
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
report = (
|
| 202 |
-
f"Overall Accuracy: {
|
| 203 |
-
f"Min Accuracy
|
| 204 |
-
f"Max Accuracy
|
|
|
|
|
|
|
|
|
|
| 205 |
)
|
| 206 |
-
|
| 207 |
-
return report
|
| 208 |
|
|
|
|
| 209 |
|
| 210 |
# ---------------------------------------------------------------------------
|
| 211 |
# 6. Gradio Interface
|
|
|
|
| 195 |
def run_mmlu_evaluation(num_questions):
    """
    Run the MMLU evaluation and return a Markdown-formatted report.

    Calls ``evaluate_mmlu(model, tokenizer, num_questions)`` and summarises
    the overall accuracy, the lowest- and highest-accuracy tasks, and the
    correct / incorrect examples contained in the evaluation results.

    Args:
        num_questions: Number of questions per task, passed through to
            ``evaluate_mmlu``.

    Returns:
        A single string holding the formatted report.
    """
    outcome = evaluate_mmlu(model, tokenizer, num_questions)

    # Read every field up front so a missing key fails before any formatting.
    accuracy = outcome["overall_accuracy"]
    worst_task, worst_acc = outcome["min_accuracy_task"]
    best_task, best_acc = outcome["max_accuracy_task"]
    hits = outcome["correct_examples"]
    misses = outcome["incorrect_examples"]

    def describe(example):
        # Each example is a 4-item sequence; the trailing "" yields the
        # final newline after the last field.
        subject, prompt, predicted, expected = example
        fields = (
            f"**Task:** {subject}",
            f"**Question:** {prompt}",
            f"**Model Output:** {predicted}",
            f"**Correct Answer:** {expected}",
            "",
        )
        return "\n".join(fields)

    hits_text = "\n\n".join(map(describe, hits))
    misses_text = "\n\n".join(map(describe, misses))

    # Assemble the report piece by piece; a placeholder sentence stands in
    # for an empty example list.
    sections = [
        f"### Overall Accuracy: {accuracy:.2f}\n",
        f"**Min Accuracy:** {worst_acc:.2f} on `{worst_task}`\n",
        f"**Max Accuracy:** {best_acc:.2f} on `{best_task}`\n\n",
        "---\n\n",
        "### ✅ Correct Examples\n",
        hits_text if hits else "No correct examples available.",
        "\n\n",
        "### ❌ Incorrect Examples\n",
        misses_text if misses else "No incorrect examples available.",
    ]
    return "".join(sections)
|
| 226 |
|
| 227 |
# ---------------------------------------------------------------------------
|
| 228 |
# 6. Gradio Interface
|