Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
4 |
import os
|
5 |
from huggingface_hub import login
|
6 |
from toy_dataset_eval import evaluate_toy_dataset
|
7 |
-
from
|
8 |
import spaces
|
9 |
import pandas as pd
|
10 |
import time # Added for timing functionality
|
@@ -23,49 +23,9 @@ model_name = "mistralai/Mistral-7B-v0.1"
|
|
23 |
tokenizer = None
|
24 |
model = None
|
25 |
model_loaded = False
|
26 |
-
|
27 |
-
@spaces.GPU
|
28 |
-
def load_model():
|
29 |
-
"""Loads the Mistral model and tokenizer and updates the load status."""
|
30 |
-
global tokenizer, model, model_loaded
|
31 |
-
start_time = time.time() # Start timing
|
32 |
-
try:
|
33 |
-
if tokenizer is None:
|
34 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
|
35 |
-
if model is None:
|
36 |
-
model = AutoModelForCausalLM.from_pretrained(
|
37 |
-
model_name,
|
38 |
-
token=hf_token,
|
39 |
-
torch_dtype=torch.float16
|
40 |
-
)
|
41 |
-
model.to('cuda')
|
42 |
-
model_loaded = True
|
43 |
-
elapsed_time = time.time() - start_time # Calculate elapsed time
|
44 |
-
return f"✅ Model Loaded in {elapsed_time:.2f} seconds!"
|
45 |
-
except Exception as e:
|
46 |
-
model_loaded = False
|
47 |
-
return f"❌ Model Load Failed: {str(e)}"
|
48 |
-
# ---------------------------------------------------------------------------
|
49 |
-
# 2. Toy Evaluation
|
50 |
-
# ---------------------------------------------------------------------------
|
51 |
-
@spaces.GPU(duration=120)
|
52 |
-
def run_toy_evaluation():
|
53 |
-
"""Runs the toy dataset evaluation."""
|
54 |
-
if not model_loaded:
|
55 |
-
load_model()
|
56 |
-
|
57 |
-
if not model_loaded:
|
58 |
-
return "⚠️ Model not loaded. Please load the model first."
|
59 |
-
|
60 |
-
start_time = time.time() # Start timing
|
61 |
-
results = evaluate_toy_dataset(model, tokenizer)
|
62 |
-
elapsed_time = time.time() - start_time # Calculate elapsed time
|
63 |
-
|
64 |
-
return f"{results}\n\nEvaluation completed in {elapsed_time:.2f} seconds.", \
|
65 |
-
f"<div>Time taken: {elapsed_time:.2f} seconds</div>" # Return timing info
|
66 |
|
67 |
# ---------------------------------------------------------------------------
|
68 |
-
#
|
69 |
# ---------------------------------------------------------------------------
|
70 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
71 |
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
|
@@ -80,17 +40,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
|
|
80 |
num_questions (int): Number of examples per subject (1-20 or -1 for all)
|
81 |
progress (gr.Progress): Progress indicator
|
82 |
"""
|
83 |
-
|
84 |
-
if not model_loaded:
|
85 |
-
load_model()
|
86 |
-
|
87 |
-
if not model_loaded:
|
88 |
-
return ("⚠️ Model not loaded. Please load the model first.", None,
|
89 |
-
gr.update(interactive=True), gr.update(visible=False),
|
90 |
-
gr.update(interactive=True), gr.update(interactive=True),
|
91 |
-
gr.update(interactive=True), gr.update(interactive=True),
|
92 |
-
gr.update(interactive=True))
|
93 |
-
|
94 |
# Convert num_subjects to -1 if all_subjects is True
|
95 |
if all_subjects:
|
96 |
num_subjects = -1
|
@@ -101,14 +51,11 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
|
|
101 |
|
102 |
# Run evaluation with timing
|
103 |
start_time = time.time() # Start timing
|
104 |
-
results =
|
105 |
-
|
106 |
-
tokenizer,
|
107 |
num_subjects=num_subjects,
|
108 |
num_questions=num_questions,
|
109 |
num_shots=num_shots,
|
110 |
-
batch_size=32,
|
111 |
-
auto_batch_size=True
|
112 |
)
|
113 |
elapsed_time = time.time() - start_time # Calculate elapsed time
|
114 |
|
@@ -159,23 +106,11 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
|
|
159 |
# 4. Gradio Interface
|
160 |
# ---------------------------------------------------------------------------
|
161 |
with gr.Blocks() as demo:
|
162 |
-
gr.Markdown("# Mistral-7B on MMLU
|
163 |
gr.Markdown("""
|
164 |
-
This demo evaluates Mistral-7B on the MMLU Dataset.
|
165 |
""")
|
166 |
|
167 |
-
# Load Model Section
|
168 |
-
with gr.Row():
|
169 |
-
load_button = gr.Button("Load Model", variant="primary")
|
170 |
-
load_status = gr.Textbox(label="Model Status", interactive=False)
|
171 |
-
|
172 |
-
# Toy Dataset Evaluation Section
|
173 |
-
gr.Markdown("### Toy Dataset Evaluation")
|
174 |
-
with gr.Row():
|
175 |
-
eval_toy_button = gr.Button("Run Toy Evaluation", variant="primary")
|
176 |
-
toy_output = gr.Textbox(label="Results")
|
177 |
-
toy_plot = gr.HTML(label="Visualization and Details")
|
178 |
-
|
179 |
# MMLU Evaluation Section
|
180 |
gr.Markdown("### MMLU Evaluation")
|
181 |
|
@@ -183,15 +118,15 @@ with gr.Blocks() as demo:
|
|
183 |
all_subjects_checkbox = gr.Checkbox(
|
184 |
label="Evaluate All Subjects",
|
185 |
value=False, # Default is unchecked
|
186 |
-
info="When checked, evaluates all
|
187 |
)
|
188 |
num_subjects_slider = gr.Slider(
|
189 |
minimum=1,
|
190 |
-
maximum=
|
191 |
-
value=
|
192 |
step=1,
|
193 |
label="Number of Subjects",
|
194 |
-
info="Number of subjects to evaluate (1-
|
195 |
interactive=True
|
196 |
)
|
197 |
|
@@ -211,7 +146,7 @@ with gr.Blocks() as demo:
|
|
211 |
value=False, # Default is unchecked
|
212 |
info="When checked, evaluates all available questions for each subject"
|
213 |
)
|
214 |
-
questions_info_text = gr.Markdown(visible=False, value="**All
|
215 |
|
216 |
with gr.Row(elem_id="questions_selection_row"):
|
217 |
questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
|
@@ -220,37 +155,27 @@ with gr.Blocks() as demo:
|
|
220 |
with questions_container:
|
221 |
num_questions_slider = gr.Slider(
|
222 |
minimum=1,
|
223 |
-
maximum=
|
224 |
-
value=
|
225 |
step=1,
|
226 |
label="Questions per Subject",
|
227 |
-
info="Choose a subset of questions (1-
|
228 |
interactive=True
|
229 |
)
|
230 |
|
231 |
with gr.Row():
|
232 |
with gr.Column(scale=1):
|
233 |
-
eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary", interactive=True)
|
234 |
-
cancel_mmlu_button = gr.Button("Cancel MMLU Evaluation", variant="stop", visible=False)
|
235 |
results_output = gr.Markdown(label="Evaluation Results")
|
236 |
|
237 |
with gr.Row():
|
238 |
results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)", visible=True)
|
239 |
-
|
240 |
-
# Connect components
|
241 |
-
load_button.click(fn=load_model, inputs=None, outputs=load_status)
|
242 |
-
|
243 |
-
# Connect toy evaluation
|
244 |
-
eval_toy_button.click(
|
245 |
-
fn=run_toy_evaluation,
|
246 |
-
inputs=None,
|
247 |
-
outputs=[toy_output, toy_plot]
|
248 |
-
)
|
249 |
-
|
250 |
# Update num_subjects_slider interactivity based on all_subjects checkbox
|
251 |
def update_subjects_slider(checked):
|
252 |
if checked:
|
253 |
-
return gr.update(value=
|
254 |
else:
|
255 |
return gr.update(interactive=True)
|
256 |
|
@@ -290,11 +215,11 @@ with gr.Blocks() as demo:
|
|
290 |
# This doesn't actually cancel the GPU job (which would require more backend support)
|
291 |
# But it does reset the UI state to be interactive again
|
292 |
return [
|
293 |
-
gr.update(interactive=True, info="When checked, evaluates all
|
294 |
-
gr.update(interactive=True, info="Number of subjects to evaluate (1-
|
295 |
gr.update(interactive=True, info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."), # num_shots_slider
|
296 |
gr.update(interactive=True, info="When checked, evaluates all available questions for each subject"), # all_questions_checkbox
|
297 |
-
gr.update(interactive=True, info="Choose a subset of questions (1-
|
298 |
gr.update(interactive=True), # eval_mmlu_button
|
299 |
gr.update(visible=False), # cancel_mmlu_button
|
300 |
"⚠️ Evaluation canceled by user", # results_output
|
|
|
4 |
import os
|
5 |
from huggingface_hub import login
|
6 |
from toy_dataset_eval import evaluate_toy_dataset
|
7 |
+
from mmlu_pro_eval_adapted import evaluate_mmlu_pro
|
8 |
import spaces
|
9 |
import pandas as pd
|
10 |
import time # Added for timing functionality
|
|
|
23 |
tokenizer = None
|
24 |
model = None
|
25 |
model_loaded = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
# ---------------------------------------------------------------------------
|
28 |
+
# 1. MMLU-Pro Evaluation call
|
29 |
# ---------------------------------------------------------------------------
|
30 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
31 |
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
|
|
|
40 |
num_questions (int): Number of questions per subject (1-40 or -1 for all)
|
41 |
progress (gr.Progress): Progress indicator
|
42 |
"""
|
43 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
# Convert num_subjects to -1 if all_subjects is True
|
45 |
if all_subjects:
|
46 |
num_subjects = -1
|
|
|
51 |
|
52 |
# Run evaluation with timing
|
53 |
start_time = time.time() # Start timing
|
54 |
+
results = evaluate_mmlu_pro(
|
55 |
+
model_name,
|
|
|
56 |
num_subjects=num_subjects,
|
57 |
num_questions=num_questions,
|
58 |
num_shots=num_shots,
|
|
|
|
|
59 |
)
|
60 |
elapsed_time = time.time() - start_time # Calculate elapsed time
|
61 |
|
|
|
106 |
# 4. Gradio Interface
|
107 |
# ---------------------------------------------------------------------------
|
108 |
with gr.Blocks() as demo:
|
109 |
+
gr.Markdown("# Mistral-7B on MMLU-Pro Evaluation Demo")
|
110 |
gr.Markdown("""
|
111 |
+
This demo evaluates Mistral-7B-v0.1 on the MMLU-Pro Dataset (available here: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro).
|
112 |
""")
|
113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
# MMLU-Pro Evaluation Section
|
115 |
gr.Markdown("### MMLU Evaluation")
|
116 |
|
|
|
118 |
all_subjects_checkbox = gr.Checkbox(
|
119 |
label="Evaluate All Subjects",
|
120 |
value=False, # Default is unchecked
|
121 |
+
info="When checked, evaluates all 14 MMLU-Pro subjects"
|
122 |
)
|
123 |
num_subjects_slider = gr.Slider(
|
124 |
minimum=1,
|
125 |
+
maximum=14,
|
126 |
+
value=14, # Default is all subjects
|
127 |
step=1,
|
128 |
label="Number of Subjects",
|
129 |
+
info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order.",
|
130 |
interactive=True
|
131 |
)
|
132 |
|
|
|
146 |
value=False, # Default is unchecked
|
147 |
info="When checked, evaluates all available questions for each subject"
|
148 |
)
|
149 |
+
questions_info_text = gr.Markdown(visible=False, value="**All 12,032 questions across all subjects will be evaluated**")
|
150 |
|
151 |
with gr.Row(elem_id="questions_selection_row"):
|
152 |
questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
|
|
|
155 |
with questions_container:
|
156 |
num_questions_slider = gr.Slider(
|
157 |
minimum=1,
|
158 |
+
maximum=40,
|
159 |
+
value=20, # Default is 20 questions
|
160 |
step=1,
|
161 |
label="Questions per Subject",
|
162 |
+
info="Choose a subset of questions (1-40) per subject. They will be loaded in order of question_id for reproducibility. ",
|
163 |
interactive=True
|
164 |
)
|
165 |
|
166 |
with gr.Row():
|
167 |
with gr.Column(scale=1):
|
168 |
+
eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
|
169 |
+
cancel_mmlu_button = gr.Button("Cancel MMLU-Pro Evaluation", variant="stop", visible=False)
|
170 |
results_output = gr.Markdown(label="Evaluation Results")
|
171 |
|
172 |
with gr.Row():
|
173 |
results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)", visible=True)
|
174 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
# Update num_subjects_slider interactivity based on all_subjects checkbox
|
176 |
def update_subjects_slider(checked):
|
177 |
if checked:
|
178 |
+
return gr.update(value=14, interactive=False)
|
179 |
else:
|
180 |
return gr.update(interactive=True)
|
181 |
|
|
|
215 |
# This doesn't actually cancel the GPU job (which would require more backend support)
|
216 |
# But it does reset the UI state to be interactive again
|
217 |
return [
|
218 |
+
gr.update(interactive=True, info="When checked, evaluates all 14 MMLU-Pro subjects"), # all_subjects_checkbox
|
219 |
+
gr.update(interactive=True, info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order."), # num_subjects_slider
|
220 |
gr.update(interactive=True, info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."), # num_shots_slider
|
221 |
gr.update(interactive=True, info="When checked, evaluates all available questions for each subject"), # all_questions_checkbox
|
222 |
+
gr.update(interactive=True, info="Choose a subset of questions (1-40) per subject. They will be loaded in order of question_id for reproducibility."), # num_questions_slider
|
223 |
gr.update(interactive=True), # eval_mmlu_button
|
224 |
gr.update(visible=False), # cancel_mmlu_button
|
225 |
"⚠️ Evaluation canceled by user", # results_output
|