rohansampath committed on
Commit 8c4aa75 · verified · 1 parent: cbd1959

Update app.py

Files changed (1): app.py (+22 -97)
app.py CHANGED
@@ -4,7 +4,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
  import os
  from huggingface_hub import login
  from toy_dataset_eval import evaluate_toy_dataset
- from mmlu_eval_original import evaluate_mmlu_batched
+ from mmlu_pro_eval_adapted import evaluate_mmlu_pro
  import spaces
  import pandas as pd
  import time # Added for timing functionality
@@ -23,49 +23,9 @@ model_name = "mistralai/Mistral-7B-v0.1"
  tokenizer = None
  model = None
  model_loaded = False
-
- @spaces.GPU
- def load_model():
-     """Loads the Mistral model and tokenizer and updates the load status."""
-     global tokenizer, model, model_loaded
-     start_time = time.time() # Start timing
-     try:
-         if tokenizer is None:
-             tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-         if model is None:
-             model = AutoModelForCausalLM.from_pretrained(
-                 model_name,
-                 token=hf_token,
-                 torch_dtype=torch.float16
-             )
-             model.to('cuda')
-         model_loaded = True
-         elapsed_time = time.time() - start_time # Calculate elapsed time
-         return f"✅ Model Loaded in {elapsed_time:.2f} seconds!"
-     except Exception as e:
-         model_loaded = False
-         return f"❌ Model Load Failed: {str(e)}"
- # ---------------------------------------------------------------------------
- # 2. Toy Evaluation
- # ---------------------------------------------------------------------------
- @spaces.GPU(duration=120)
- def run_toy_evaluation():
-     """Runs the toy dataset evaluation."""
-     if not model_loaded:
-         load_model()
-
-     if not model_loaded:
-         return "⚠️ Model not loaded. Please load the model first."
-
-     start_time = time.time() # Start timing
-     results = evaluate_toy_dataset(model, tokenizer)
-     elapsed_time = time.time() - start_time # Calculate elapsed time
-
-     return f"{results}\n\nEvaluation completed in {elapsed_time:.2f} seconds.", \
-            f"<div>Time taken: {elapsed_time:.2f} seconds</div>" # Return timing info
 
  # ---------------------------------------------------------------------------
- # 3. MMLU Evaluation call
+ # 1. MMLU-Pro Evaluation call
  # ---------------------------------------------------------------------------
  @spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
  def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
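With `load_model` and the toy-evaluation path removed, GPU access in this Space now flows solely through the `@spaces.GPU` decorator on `run_mmlu_evaluation`. For reference, a minimal ZeroGPU sketch of that pattern; the function and prompt below are illustrative placeholders, not code from this repo:

    import spaces
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "mistralai/Mistral-7B-v0.1"

    @spaces.GPU(duration=120)  # request a GPU worker for up to 120 s per call
    def generate_once(prompt: str) -> str:
        # Loading inside the decorated function keeps CUDA work on the GPU worker.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.float16
        ).to("cuda")
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        output = model.generate(**inputs, max_new_tokens=32)
        return tokenizer.decode(output[0], skip_special_tokens=True)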
@@ -80,17 +40,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
          num_questions (int): Number of examples per subject (1-20 or -1 for all)
          progress (gr.Progress): Progress indicator
      """
-
-     if not model_loaded:
-         load_model()
-
-     if not model_loaded:
-         return ("⚠️ Model not loaded. Please load the model first.", None,
-                 gr.update(interactive=True), gr.update(visible=False),
-                 gr.update(interactive=True), gr.update(interactive=True),
-                 gr.update(interactive=True), gr.update(interactive=True),
-                 gr.update(interactive=True))
-
+
      # Convert num_subjects to -1 if all_subjects is True
      if all_subjects:
          num_subjects = -1
@@ -101,14 +51,11 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
 
      # Run evaluation with timing
      start_time = time.time() # Start timing
-     results = evaluate_mmlu_batched(
-         model,
-         tokenizer,
+     results = evaluate_mmlu_pro(
+         model_name,
          num_subjects=num_subjects,
          num_questions=num_questions,
          num_shots=num_shots,
-         batch_size=32,
-         auto_batch_size=True
      )
      elapsed_time = time.time() - start_time # Calculate elapsed time
 
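Since the new call passes `model_name` rather than the live `model`/`tokenizer` objects, `mmlu_pro_eval_adapted` evidently handles model loading itself, and the `batch_size`/`auto_batch_size` knobs disappear from the call site. The adapter's real signature is not shown in this commit; inferred from the call site alone, it would be compatible with something like:

    # Hypothetical signature, inferred only from the call site above;
    # the actual mmlu_pro_eval_adapted module may differ.
    def evaluate_mmlu_pro(model_name: str,
                          num_subjects: int = -1,   # -1 = all 14 subjects
                          num_questions: int = 20,  # per subject; -1 = all
                          num_shots: int = 5):
        """Load `model_name`, evaluate it on MMLU-Pro, and return results."""
        ...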
@@ -159,23 +106,11 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
  # 4. Gradio Interface
  # ---------------------------------------------------------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("# Mistral-7B on MMLU - Evaluation Demo")
+     gr.Markdown("# Mistral-7B on MMLU-Pro Evaluation Demo")
      gr.Markdown("""
-     This demo evaluates Mistral-7B on the MMLU Dataset.
+     This demo evaluates Mistral-7B-v0.1 on the MMLU-Pro Dataset (available here: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro).
      """)
 
-     # Load Model Section
-     with gr.Row():
-         load_button = gr.Button("Load Model", variant="primary")
-         load_status = gr.Textbox(label="Model Status", interactive=False)
-
-     # Toy Dataset Evaluation Section
-     gr.Markdown("### Toy Dataset Evaluation")
-     with gr.Row():
-         eval_toy_button = gr.Button("Run Toy Evaluation", variant="primary")
-         toy_output = gr.Textbox(label="Results")
-     toy_plot = gr.HTML(label="Visualization and Details")
-
      # MMLU Evaluation Section
      gr.Markdown("### MMLU Evaluation")
 
@@ -183,15 +118,15 @@ with gr.Blocks() as demo:
          all_subjects_checkbox = gr.Checkbox(
              label="Evaluate All Subjects",
              value=False, # Default is unchecked
-             info="When checked, evaluates all 57 MMLU subjects"
+             info="When checked, evaluates all 14 MMLU-Pro subjects"
          )
          num_subjects_slider = gr.Slider(
              minimum=1,
-             maximum=57,
-             value=10, # Default is 10 subjects
+             maximum=14,
+             value=14, # Default is all subjects
              step=1,
              label="Number of Subjects",
-             info="Number of subjects to evaluate (1-57). They will be loaded in alphabetical order.",
+             info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order.",
              interactive=True
          )
@@ -211,7 +146,7 @@ with gr.Blocks() as demo:
              value=False, # Default is unchecked
              info="When checked, evaluates all available questions for each subject"
          )
-         questions_info_text = gr.Markdown(visible=False, value="**All 14,042 questions across all subjects will be evaluated**")
+         questions_info_text = gr.Markdown(visible=False, value="**All 12,032 questions across all subjects will be evaluated**")
 
      with gr.Row(elem_id="questions_selection_row"):
          questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
@@ -220,37 +155,27 @@ with gr.Blocks() as demo:
          with questions_container:
              num_questions_slider = gr.Slider(
                  minimum=1,
-                 maximum=20,
-                 value=10, # Default is 10 questions
+                 maximum=40,
+                 value=20, # Default is 20 questions
                  step=1,
                  label="Questions per Subject",
-                 info="Choose a subset of questions (1-20)",
+                 info="Choose a subset of questions (1-40) per subject. They will be loaded in order of question_id for reproducibility.",
                  interactive=True
              )
 
      with gr.Row():
          with gr.Column(scale=1):
-             eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary", interactive=True)
-             cancel_mmlu_button = gr.Button("Cancel MMLU Evaluation", variant="stop", visible=False)
+             eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
+             cancel_mmlu_button = gr.Button("Cancel MMLU-Pro Evaluation", variant="stop", visible=False)
      results_output = gr.Markdown(label="Evaluation Results")
 
      with gr.Row():
          results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)", visible=True)
-
-     # Connect components
-     load_button.click(fn=load_model, inputs=None, outputs=load_status)
-
-     # Connect toy evaluation
-     eval_toy_button.click(
-         fn=run_toy_evaluation,
-         inputs=None,
-         outputs=[toy_output, toy_plot]
-     )
-
+
      # Update num_subjects_slider interactivity based on all_subjects checkbox
      def update_subjects_slider(checked):
          if checked:
-             return gr.update(value=57, interactive=False)
+             return gr.update(value=14, interactive=False)
          else:
              return gr.update(interactive=True)
 
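`update_subjects_slider` only takes effect once it is bound to the checkbox; that binding sits in an unchanged part of app.py and is not visible in this hunk. In Gradio it would be wired roughly like this, a sketch using the component names defined above:

    all_subjects_checkbox.change(
        fn=update_subjects_slider,
        inputs=all_subjects_checkbox,
        outputs=num_subjects_slider,
    )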
@@ -290,11 +215,11 @@ with gr.Blocks() as demo:
      # This doesn't actually cancel the GPU job (which would require more backend support)
      # But it does reset the UI state to be interactive again
      return [
-         gr.update(interactive=True, info="When checked, evaluates all 57 MMLU subjects"), # all_subjects_checkbox
-         gr.update(interactive=True, info="Number of subjects to evaluate (1-57). They will be loaded in alphabetical order."), # num_subjects_slider
+         gr.update(interactive=True, info="When checked, evaluates all 14 MMLU-Pro subjects"), # all_subjects_checkbox
+         gr.update(interactive=True, info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order."), # num_subjects_slider
          gr.update(interactive=True, info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."), # num_shots_slider
          gr.update(interactive=True, info="When checked, evaluates all available questions for each subject"), # all_questions_checkbox
-         gr.update(interactive=True, info="Choose a subset of questions (1-20)"), # num_questions_slider
+         gr.update(interactive=True, info="Choose a subset of questions (1-40) per subject. They will be loaded in order of question_id for reproducibility."), # num_questions_slider
          gr.update(interactive=True), # eval_mmlu_button
          gr.update(visible=False), # cancel_mmlu_button
          "⚠️ Evaluation canceled by user", # results_output