Update run_evaluation.py

run_evaluation.py  (+17 -0)  CHANGED
@@ -115,6 +115,23 @@ def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects,
             'Difference': abs(overall_diff),
             'Winner': overall_winner
         })
+
+    report = (
+        f"### Head-to-Head Comparison Results\n\n"
+        f"#### Model 1: {model1_config['name']}\n"
+        f"* Overall Accuracy: {model1_overall_acc:.3f}\n"
+        f"* Best Performance: {model1_max_subject} ({model1_max_acc:.3f})\n"
+        f"* Worst Performance: {model1_min_subject} ({model1_min_acc:.3f})\n"
+        f"* Evaluation completed in {model1_elapsed_time:.2f} seconds\n\n"
+        f"#### Model 2: {model2_config['name']}\n"
+        f"* Overall Accuracy: {model2_overall_acc:.3f}\n"
+        f"* Best Performance: {model2_max_subject} ({model2_max_acc:.3f})\n"
+        f"* Worst Performance: {model2_min_subject} ({model2_min_acc:.3f})\n"
+        f"* Evaluation completed in {model2_elapsed_time:.2f} seconds\n\n"
+        f"#### Overall Winner: {overall_winner}\n"
+        f"* Margin: {abs(overall_diff):.3f}\n"
+    )
+

     comparison_df = pd.DataFrame(comparison_data)

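For context, the added lines assemble a Markdown string summarizing the head-to-head MMLU comparison from per-model aggregates (overall accuracy, best/worst subject, elapsed time). Below is a minimal, self-contained sketch of the same report-building pattern. The `build_comparison_report` function, the `stats` dictionaries, and the demo values are illustrative assumptions; only the field names mirrored from the hunk (`model*_overall_acc`, `model*_max_subject`, etc.) come from the diff itself, and the rest of `run_evaluation.py` is not shown here.

```python
def build_comparison_report(model1_config, model2_config, stats1, stats2):
    """Assemble a Markdown head-to-head summary for two evaluated models.

    Each stats dict is assumed to hold: overall_acc, max_subject, max_acc,
    min_subject, min_acc, elapsed_time (names chosen to mirror the hunk).
    """
    overall_diff = stats1["overall_acc"] - stats2["overall_acc"]
    # Ties go to Model 1 here; the real script may break ties differently.
    overall_winner = model1_config["name"] if overall_diff >= 0 else model2_config["name"]

    sections = []
    for label, cfg, s in (("Model 1", model1_config, stats1),
                          ("Model 2", model2_config, stats2)):
        sections.append(
            f"#### {label}: {cfg['name']}\n"
            f"* Overall Accuracy: {s['overall_acc']:.3f}\n"
            f"* Best Performance: {s['max_subject']} ({s['max_acc']:.3f})\n"
            f"* Worst Performance: {s['min_subject']} ({s['min_acc']:.3f})\n"
            f"* Evaluation completed in {s['elapsed_time']:.2f} seconds\n"
        )

    return (
        "### Head-to-Head Comparison Results\n\n"
        + "\n".join(sections) + "\n"
        + f"#### Overall Winner: {overall_winner}\n"
        + f"* Margin: {abs(overall_diff):.3f}\n"
    )


if __name__ == "__main__":
    # Placeholder numbers purely for demonstration; not real evaluation results.
    demo = build_comparison_report(
        {"name": "model-a"}, {"name": "model-b"},
        {"overall_acc": 0.71, "max_subject": "astronomy", "max_acc": 0.85,
         "min_subject": "virology", "min_acc": 0.52, "elapsed_time": 120.4},
        {"overall_acc": 0.68, "max_subject": "marketing", "max_acc": 0.83,
         "min_subject": "abstract_algebra", "min_acc": 0.40, "elapsed_time": 98.7},
    )
    print(demo)
```

Since the report string is plain Markdown, it can be rendered directly by whatever UI the Space uses (for example a Markdown output component), or simply printed or written to a file alongside the `comparison_df` DataFrame.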