rohansampath committed · verified
Commit f049604 · 1 Parent(s): e53ddd8

Create run_evaluation.py

Files changed (1):
  1. run_evaluation.py  +156 -0

run_evaluation.py ADDED
@@ -0,0 +1,156 @@
+import time
+import traceback
+import pandas as pd
+import gradio as gr
+import spaces
+from mmlu_pro_eval_adapted import evaluate_mmlu_pro
+from configs.dataset_config import get_subject_mode_param, get_subject_names
+
+@spaces.GPU(duration=240)
+def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects,
+                        all_questions, num_questions, model_configs, progress=gr.Progress()):
+    """
+    Runs the MMLU evaluation with the specified parameters.
+
+    Args:
+        subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
+        num_subjects (int): Number of subjects to evaluate (1-14)
+        selected_subjects (list): List of specific subjects to evaluate
+        all_questions (bool): Whether to evaluate all questions per subject
+        num_questions (int): Number of examples per subject (1-100 or all)
+        model_configs (dict): Configuration for both models
+        progress (gr.Progress): Progress indicator
+    """
+    try:
+        # Convert parameters if needed
+        if subject_selection_mode == "all":
+            num_subjects = -1
+            selected_subjects = []
+        elif subject_selection_mode == "specific":
+            num_subjects = len(selected_subjects) if selected_subjects else -1
+
+        if all_questions:
+            num_questions = -1
+
+        # Extract model configurations
+        model1_config = model_configs["model1"]
+        model2_config = model_configs["model2"]
+
+        # Run evaluation for Model 1
+        start_time_model1 = time.time()
+        model1_results = evaluate_mmlu_pro(
+            model1_config["name"],
+            num_subjects=num_subjects,
+            num_questions=num_questions,
+            num_shots=model1_config["shots"],
+            specific_subjects=selected_subjects if subject_selection_mode == "specific" else None,
+            flash_attention=model1_config["flash_attention"],
+            regex_pattern=model1_config["regex"] if model1_config["regex"] else None
+        )
+        model1_elapsed_time = time.time() - start_time_model1
+
+        # Run evaluation for Model 2
+        start_time_model2 = time.time()
+        model2_results = evaluate_mmlu_pro(
+            model2_config["name"],
+            num_subjects=num_subjects,
+            num_questions=num_questions,
+            num_shots=model2_config["shots"],
+            specific_subjects=selected_subjects if subject_selection_mode == "specific" else None,
+            flash_attention=model2_config["flash_attention"],
+            regex_pattern=model2_config["regex"] if model2_config["regex"] else None
+        )
+        model2_elapsed_time = time.time() - start_time_model2
+
+        # Format summary results
+        model1_overall_acc = model1_results["overall_accuracy"]
+        model1_min_subject, model1_min_acc = model1_results["min_accuracy_subject"]
+        model1_max_subject, model1_max_acc = model1_results["max_accuracy_subject"]
+
+        model2_overall_acc = model2_results["overall_accuracy"]
+        model2_min_subject, model2_min_acc = model2_results["min_accuracy_subject"]
+        model2_max_subject, model2_max_acc = model2_results["max_accuracy_subject"]
+
+        # Create merged results DataFrame
+        results_df1 = pd.DataFrame(model1_results["full_accuracy_table"])
+        results_df2 = pd.DataFrame(model2_results["full_accuracy_table"])
+
+        # Ensure both dataframes have the same subjects
+        subjects = sorted(set(results_df1['Subject'].tolist() + results_df2['Subject'].tolist()))
+
+        # Create comparison DataFrame
+        comparison_data = []
+
+        for subject in subjects:
+            model1_row = results_df1[results_df1['Subject'] == subject]
+            model2_row = results_df2[results_df2['Subject'] == subject]
+
+            model1_acc = model1_row['Accuracy'].iloc[0] if not model1_row.empty else 0
+            model2_acc = model2_row['Accuracy'].iloc[0] if not model2_row.empty else 0
+
+            # Calculate the difference and determine the winner
+            diff = model1_acc - model2_acc
+            winner = "Model 1" if diff > 0 else ("Model 2" if diff < 0 else "Tie")
+
+            comparison_data.append({
+                'Subject': subject,
+                'Model 1 Accuracy': model1_acc,
+                'Model 2 Accuracy': model2_acc,
+                'Difference': abs(diff),
+                'Winner': winner
+            })
+
+        # Add overall row
+        model1_total_samples = results_df1['Num_samples'].sum()
+        model1_total_correct = results_df1['Num_correct'].sum()
+        model2_total_samples = results_df2['Num_samples'].sum()
+        model2_total_correct = results_df2['Num_correct'].sum()
+
+        overall_diff = model1_overall_acc - model2_overall_acc
+        overall_winner = "Model 1" if overall_diff > 0 else ("Model 2" if overall_diff < 0 else "Tie")
+
+        comparison_data.insert(0, {
+            'Subject': '**Overall**',
+            'Model 1 Accuracy': model1_overall_acc,
+            'Model 2 Accuracy': model2_overall_acc,
+            'Difference': abs(overall_diff),
+            'Winner': overall_winner
+        })
+
+        comparison_df = pd.DataFrame(comparison_data)
+
+        # Format the report
+        report = (
+            f"### Head-to-Head Comparison Results\n\n"
+            f"#### Model 1: {model1_config['name']}\n"
+            f"* Overall Accuracy: {model1_overall_acc:.3f}\n"
+            f"* Best Performance: {model1_max_subject} ({model1_max_acc:.3f})\n"
+            f"* Worst Performance: {model1_min_subject} ({model1_min_acc:.3f})\n"
+            f"* Evaluation completed in {model1_elapsed_time:.2f} seconds\n\n"
+            f"#### Model 2: {model2_config['name']}\n"
+            f"* Overall Accuracy: {model2_overall_acc:.3f}\n"
+            f"* Best Performance: {model2_max_subject} ({model2_max_acc:.3f})\n"
+            f"* Worst Performance: {model2_min_subject} ({model2_min_acc:.3f})\n"
+            f"* Evaluation completed in {model2_elapsed_time:.2f} seconds\n\n"
+            f"#### Overall Winner: {overall_winner}\n"
+            f"* Margin: {abs(overall_diff):.3f}\n"
+        )
+
+        # Return values that re-enable UI components after completion
+        return {
+            'report': report,
+            'comparison_df': comparison_df,
+            'success': True
+        }
+
+    except Exception as e:
+        # Handle errors gracefully
+        error_trace = traceback.format_exc()
+        error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
+
+        # Return error information
+        return {
+            'report': error_message,
+            'comparison_df': None,
+            'success': False
+        }
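For orientation, below is a minimal sketch of how `run_mmlu_evaluation` might be wired into a Gradio Blocks app. It is not part of this commit: the component layout, default values, hard-coded shot counts, and the `on_run` callback are assumptions for illustration only; the pieces that mirror the file above are the `model_configs` keys (`name`, `shots`, `flash_attention`, `regex`) the function reads and the `report` / `comparison_df` / `success` fields it returns.

```python
# Illustrative wiring only -- component names and defaults are hypothetical,
# and the @spaces.GPU decorator assumes this runs inside a Hugging Face Space.
import gradio as gr
from run_evaluation import run_mmlu_evaluation


def build_demo():
    def on_run(mode, n_subjects, subjects, all_q, n_questions, model1_name, model2_name):
        # model_configs must expose the keys run_mmlu_evaluation reads:
        # "name", "shots", "flash_attention", "regex". Shot counts here are placeholders.
        model_configs = {
            "model1": {"name": model1_name, "shots": 5, "flash_attention": False, "regex": ""},
            "model2": {"name": model2_name, "shots": 5, "flash_attention": False, "regex": ""},
        }
        result = run_mmlu_evaluation(mode, n_subjects, subjects, all_q,
                                     n_questions, model_configs)
        # On failure, result["comparison_df"] is None and the report carries the traceback.
        return result["report"], result["comparison_df"]

    with gr.Blocks() as demo:
        mode = gr.Radio(["all", "number", "specific"], value="all",
                        label="Subject selection mode")
        n_subjects = gr.Slider(1, 14, value=14, step=1, label="Number of subjects")
        # Choices would presumably be populated from configs.dataset_config.get_subject_names().
        subjects = gr.Dropdown(choices=[], multiselect=True, label="Specific subjects")
        all_q = gr.Checkbox(value=True, label="Evaluate all questions per subject")
        n_questions = gr.Slider(1, 100, value=10, step=1, label="Questions per subject")
        model1_name = gr.Textbox(label="Model 1 name")
        model2_name = gr.Textbox(label="Model 2 name")
        run_btn = gr.Button("Run evaluation")
        report_md = gr.Markdown()
        comparison = gr.Dataframe()

        run_btn.click(
            on_run,
            inputs=[mode, n_subjects, subjects, all_q, n_questions, model1_name, model2_name],
            outputs=[report_md, comparison],
        )
    return demo


if __name__ == "__main__":
    build_demo().launch()
```

The single dict return value keeps the callback simple: the caller unpacks `report` into a Markdown component and `comparison_df` into a Dataframe, and can branch on `success` if it wants to re-enable or reset other UI controls.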