Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -61,7 +61,7 @@ def run_toy_evaluation():
|
|
61 |
# 3. MMLU Evaluation call
|
62 |
# ---------------------------------------------------------------------------
|
63 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
64 |
-
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots,
|
65 |
"""
|
66 |
Runs the MMLU evaluation with the specified parameters.
|
67 |
|
@@ -69,7 +69,8 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
|
|
69 |
all_subjects (bool): Whether to evaluate all subjects
|
70 |
num_subjects (int): Number of subjects to evaluate (1-57)
|
71 |
num_shots (int): Number of few-shot examples (0-5)
|
72 |
-
|
|
|
73 |
"""
|
74 |
|
75 |
if not model_loaded:
|
@@ -81,13 +82,17 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
|
|
81 |
# Convert num_subjects to -1 if all_subjects is True
|
82 |
if all_subjects:
|
83 |
num_subjects = -1
|
|
|
|
|
|
|
|
|
84 |
|
85 |
# Run evaluation
|
86 |
results = evaluate_mmlu(
|
87 |
model,
|
88 |
tokenizer,
|
89 |
num_subjects=num_subjects,
|
90 |
-
num_questions=
|
91 |
num_shots=num_shots
|
92 |
)
|
93 |
|
@@ -138,13 +143,13 @@ with gr.Blocks() as demo:
|
|
138 |
with gr.Row():
|
139 |
all_subjects_checkbox = gr.Checkbox(
|
140 |
label="Evaluate All Subjects",
|
141 |
-
value=
|
142 |
info="When checked, evaluates all 57 MMLU subjects"
|
143 |
)
|
144 |
num_subjects_slider = gr.Slider(
|
145 |
minimum=1,
|
146 |
maximum=57,
|
147 |
-
value=
|
148 |
step=1,
|
149 |
label="Number of Subjects",
|
150 |
info="Number of subjects to evaluate (1-57). They will be loaded in alphabetical order.",
|
@@ -155,18 +160,26 @@ with gr.Blocks() as demo:
|
|
155 |
num_shots_slider = gr.Slider(
|
156 |
minimum=0,
|
157 |
maximum=5,
|
158 |
-
value=5,
|
159 |
step=1,
|
160 |
label="Number of Few-shot Examples",
|
161 |
info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."
|
162 |
)
|
163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
minimum=1,
|
165 |
-
maximum=
|
166 |
-
value=
|
167 |
step=1,
|
168 |
-
label="
|
169 |
-
info="
|
|
|
170 |
)
|
171 |
|
172 |
with gr.Row():
|
@@ -184,12 +197,31 @@ with gr.Blocks() as demo:
|
|
184 |
)
|
185 |
|
186 |
# Update num_subjects_slider interactivity based on all_subjects checkbox
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
all_subjects_checkbox.change(
|
188 |
-
fn=
|
189 |
inputs=[all_subjects_checkbox],
|
190 |
outputs=[num_subjects_slider]
|
191 |
)
|
192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
# Connect MMLU evaluation button
|
194 |
eval_mmlu_button.click(
|
195 |
fn=run_mmlu_evaluation,
|
@@ -197,9 +229,10 @@ with gr.Blocks() as demo:
|
|
197 |
all_subjects_checkbox,
|
198 |
num_subjects_slider,
|
199 |
num_shots_slider,
|
200 |
-
|
|
|
201 |
],
|
202 |
outputs=results_output
|
203 |
)
|
204 |
|
205 |
-
demo.launch()
|
|
|
61 |
# 3. MMLU Evaluation call
|
62 |
# ---------------------------------------------------------------------------
|
63 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
64 |
+
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions):
|
65 |
"""
|
66 |
Runs the MMLU evaluation with the specified parameters.
|
67 |
|
|
|
69 |
all_subjects (bool): Whether to evaluate all subjects
|
70 |
num_subjects (int): Number of subjects to evaluate (1-57)
|
71 |
num_shots (int): Number of few-shot examples (0-5)
|
72 |
+
all_questions (bool): Whether to evaluate all questions per subject
|
73 |
+
num_questions (int): Number of examples per subject (1-20 or -1 for all)
|
74 |
"""
|
75 |
|
76 |
if not model_loaded:
|
|
|
82 |
# Convert num_subjects to -1 if all_subjects is True
|
83 |
if all_subjects:
|
84 |
num_subjects = -1
|
85 |
+
|
86 |
+
# Convert num_questions to -1 if all_questions is True
|
87 |
+
if all_questions:
|
88 |
+
num_questions = -1
|
89 |
|
90 |
# Run evaluation
|
91 |
results = evaluate_mmlu(
|
92 |
model,
|
93 |
tokenizer,
|
94 |
num_subjects=num_subjects,
|
95 |
+
num_questions=num_questions,
|
96 |
num_shots=num_shots
|
97 |
)
|
98 |
|
|
|
143 |
with gr.Row():
|
144 |
all_subjects_checkbox = gr.Checkbox(
|
145 |
label="Evaluate All Subjects",
|
146 |
+
value=False, # Default is unchecked
|
147 |
info="When checked, evaluates all 57 MMLU subjects"
|
148 |
)
|
149 |
num_subjects_slider = gr.Slider(
|
150 |
minimum=1,
|
151 |
maximum=57,
|
152 |
+
value=10, # Default is 10 subjects
|
153 |
step=1,
|
154 |
label="Number of Subjects",
|
155 |
info="Number of subjects to evaluate (1-57). They will be loaded in alphabetical order.",
|
|
|
160 |
num_shots_slider = gr.Slider(
|
161 |
minimum=0,
|
162 |
maximum=5,
|
163 |
+
value=5, # Default is 5 few-shot examples
|
164 |
step=1,
|
165 |
label="Number of Few-shot Examples",
|
166 |
info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."
|
167 |
)
|
168 |
+
|
169 |
+
with gr.Row():
|
170 |
+
all_questions_checkbox = gr.Checkbox(
|
171 |
+
label="Evaluate All Questions",
|
172 |
+
value=False, # Default is unchecked
|
173 |
+
info="When checked, evaluates all available questions for each subject"
|
174 |
+
)
|
175 |
+
num_questions_slider = gr.Slider(
|
176 |
minimum=1,
|
177 |
+
maximum=20,
|
178 |
+
value=10, # Default is 10 questions
|
179 |
step=1,
|
180 |
+
label="Questions per Subject",
|
181 |
+
info="Choose a subset of questions (1-20), or click the checkbox for All Questions",
|
182 |
+
interactive=True
|
183 |
)
|
184 |
|
185 |
with gr.Row():
|
|
|
197 |
)
|
198 |
|
199 |
# Update num_subjects_slider interactivity based on all_subjects checkbox
|
200 |
+
def update_subjects_slider(checked):
    """Keep the subjects slider in step with the "Evaluate All Subjects" checkbox.

    Args:
        checked: Current state of the checkbox.

    Returns:
        A ``gr.update`` payload. When the box is ticked the slider is pinned
        to 57 and made non-interactive; otherwise it is only re-enabled and
        its current value is left untouched.
    """
    if not checked:
        return gr.update(interactive=True)
    # Show the maximum on the locked slider so the UI reflects "all subjects".
    return gr.update(value=57, interactive=False)
|
205 |
+
|
206 |
# Toggling the checkbox re-evaluates the slider's value/interactivity.
all_subjects_checkbox.change(
    fn=update_subjects_slider, inputs=[all_subjects_checkbox], outputs=[num_subjects_slider]
)
|
211 |
|
212 |
+
# Update num_questions_slider interactivity based on all_questions checkbox
|
213 |
+
def update_questions_slider(checked):
    """Enable or disable the questions slider from the "Evaluate All Questions" checkbox.

    Args:
        checked: Current state of the checkbox.

    Returns:
        A ``gr.update`` payload that makes the slider non-interactive while
        the box is ticked and interactive otherwise. The slider value is not
        modified.
    """
    return gr.update(interactive=not checked)
|
218 |
+
|
219 |
+
# Toggling the checkbox locks/unlocks the per-subject question slider.
all_questions_checkbox.change(
    fn=update_questions_slider, inputs=[all_questions_checkbox], outputs=[num_questions_slider]
)
|
224 |
+
|
225 |
# Connect MMLU evaluation button
|
226 |
eval_mmlu_button.click(
|
227 |
fn=run_mmlu_evaluation,
|
|
|
229 |
all_subjects_checkbox,
|
230 |
num_subjects_slider,
|
231 |
num_shots_slider,
|
232 |
+
all_questions_checkbox,
|
233 |
+
num_questions_slider
|
234 |
],
|
235 |
outputs=results_output
|
236 |
)
|
237 |
|
238 |
+
demo.launch()
|