rohansampath committed
Commit ee60006 · verified · 1 parent: b0fd62c

Update app.py

Files changed (1): app.py +29 -1
app.py CHANGED
@@ -11,6 +11,7 @@ import base64
 import os
 from huggingface_hub import login
 import spaces
+from mmlu_eval import evaluate_mmlu
 
 # Read token and login
 hf_token = os.getenv("HF_TOKEN_READ_WRITE")
@@ -188,13 +189,31 @@ def run_evaluation():
 
     return f"Accuracy: {accuracy:.2f}", full_html
 
+# ---------------------------------------------------------------------------
+# 5. MMLU Evaluation call
+# ---------------------------------------------------------------------------
+def run_mmlu_evaluation(num_questions):
+    """
+    Runs the MMLU evaluation with the specified number of questions per task.
+    """
+    results = evaluate_mmlu(model, tokenizer, num_questions)
+
+    report = (
+        f"Overall Accuracy: {results['overall_accuracy']:.2f}\n"
+        f"Min Accuracy: {results['min_accuracy_task'][1]:.2f} on {results['min_accuracy_task'][0]}\n"
+        f"Max Accuracy: {results['max_accuracy_task'][1]:.2f} on {results['max_accuracy_task'][0]}"
+    )
+
+    return report
+
+
 # ---------------------------------------------------------------------------
 # 6. Gradio Interface
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Mistral-7B Math Evaluation Demo")
     gr.Markdown("""
-    This demo evaluates Mistral-7B on basic math problems.
+    This demo evaluates Mistral-7B on three very simple math problems to get started.
     Press the button below to run the evaluation.
     """)
 
@@ -208,4 +227,13 @@ with gr.Blocks() as demo:
         outputs=[output_text, output_plot]
     )
 
+    gr.Markdown("### MMLU Evaluation")
+    num_questions_input = gr.Number(label="Questions per Task (there are 57 total Tasks)", value=5, precision=0)
+    eval_mmlu_button = gr.Button("Run MMLU Evaluation")
+    mmlu_output = gr.Textbox(label="MMLU Evaluation Results")
+
+    eval_mmlu_button.click(fn=run_mmlu_evaluation, inputs=[num_questions_input], outputs=[mmlu_output])
+
+
+
 demo.launch()
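
Note: `mmlu_eval` is a separate module in the Space and is not part of this diff. From the call site in `run_mmlu_evaluation`, `evaluate_mmlu(model, tokenizer, num_questions)` is expected to return a dict with an `overall_accuracy` float plus `min_accuracy_task` and `max_accuracy_task` as `(task_name, accuracy)` pairs. A minimal runnable sketch of that assumed contract follows; the stub task names and accuracies are placeholders, not the module's real implementation:

```python
# Sketch of the evaluate_mmlu contract assumed by run_mmlu_evaluation above.
# mmlu_eval is not shown in this diff; everything here is illustrative.

def evaluate_mmlu(model, tokenizer, num_questions):
    """Assumed interface: score `num_questions` questions per MMLU task and
    return the overall accuracy plus the weakest and strongest tasks, each as
    a (task_name, accuracy) tuple -- the shape run_mmlu_evaluation indexes."""
    # Placeholder per-task accuracies standing in for real model inference.
    per_task = {
        "abstract_algebra": 0.40,
        "astronomy": 0.80,
        "high_school_mathematics": 0.60,
    }
    min_task = min(per_task.items(), key=lambda kv: kv[1])
    max_task = max(per_task.items(), key=lambda kv: kv[1])
    return {
        "overall_accuracy": sum(per_task.values()) / len(per_task),
        "min_accuracy_task": min_task,  # e.g. ("abstract_algebra", 0.40)
        "max_accuracy_task": max_task,  # e.g. ("astronomy", 0.80)
    }

# The real app passes the loaded model and tokenizer; this stub ignores them.
results = evaluate_mmlu(model=None, tokenizer=None, num_questions=5)
print(f"Overall Accuracy: {results['overall_accuracy']:.2f}")
print(f"Min Accuracy: {results['min_accuracy_task'][1]:.2f} on {results['min_accuracy_task'][0]}")
print(f"Max Accuracy: {results['max_accuracy_task'][1]:.2f} on {results['max_accuracy_task'][0]}")
```

Printing the three fields with the same format strings as `run_mmlu_evaluation` reproduces the report string shown in the `mmlu_output` Textbox.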