rohansampath committed on
Commit
1d13019
·
verified ·
1 Parent(s): cec8405

Update run_evaluation.py

Browse files
Files changed (1) hide show
  1. run_evaluation.py +17 -0
run_evaluation.py CHANGED
@@ -115,6 +115,23 @@ def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects,
115
  'Difference': abs(overall_diff),
116
  'Winner': overall_winner
117
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  comparison_df = pd.DataFrame(comparison_data)
120
 
 
115
  'Difference': abs(overall_diff),
116
  'Winner': overall_winner
117
  })
118
+
119
+ report = (
120
+ f"### Head-to-Head Comparison Results\n\n"
121
+ f"#### Model 1: {model1_config['name']}\n"
122
+ f"* Overall Accuracy: {model1_overall_acc:.3f}\n"
123
+ f"* Best Performance: {model1_max_subject} ({model1_max_acc:.3f})\n"
124
+ f"* Worst Performance: {model1_min_subject} ({model1_min_acc:.3f})\n"
125
+ f"* Evaluation completed in {model1_elapsed_time:.2f} seconds\n\n"
126
+ f"#### Model 2: {model2_config['name']}\n"
127
+ f"* Overall Accuracy: {model2_overall_acc:.3f}\n"
128
+ f"* Best Performance: {model2_max_subject} ({model2_max_acc:.3f})\n"
129
+ f"* Worst Performance: {model2_min_subject} ({model2_min_acc:.3f})\n"
130
+ f"* Evaluation completed in {model2_elapsed_time:.2f} seconds\n\n"
131
+ f"#### Overall Winner: {overall_winner}\n"
132
+ f"* Margin: {abs(overall_diff):.3f}\n"
133
+ )
134
+
135
 
136
  comparison_df = pd.DataFrame(comparison_data)
137