H2H-eval-comparator

Sleeping

App Files Files Community

rohansampath committed on Feb 18

Commit

ed9a008

verified ·

1 Parent(s): 84c9e35

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -14

app.py CHANGED Viewed

@@ -61,7 +61,7 @@ def run_toy_evaluation():
 # 3. MMLU Evaluation call
 # ---------------------------------------------------------------------------
 @spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
-def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
     """
     Runs the MMLU evaluation with the specified parameters.
@@ -69,7 +69,8 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
         all_subjects (bool): Whether to evaluate all subjects
         num_subjects (int): Number of subjects to evaluate (1-57)
         num_shots (int): Number of few-shot examples (0-5)
-        num_examples (int): Number of examples per subject (1-10 or -1 for all)
     """
     if not model_loaded:
@@ -81,13 +82,17 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
     # Convert num_subjects to -1 if all_subjects is True
     if all_subjects:
         num_subjects = -1
     # Run evaluation
     results = evaluate_mmlu(
         model,
         tokenizer,
         num_subjects=num_subjects,
-        num_questions=num_examples,
         num_shots=num_shots
     )
@@ -138,13 +143,13 @@ with gr.Blocks() as demo:
     with gr.Row():
         all_subjects_checkbox = gr.Checkbox(
             label="Evaluate All Subjects",
-            value=True,
             info="When checked, evaluates all 57 MMLU subjects"
         )
         num_subjects_slider = gr.Slider(
             minimum=1,
             maximum=57,
-            value=57,
             step=1,
             label="Number of Subjects",
             info="Number of subjects to evaluate (1-57). They will be loaded in alphabetical order.",
@@ -155,18 +160,26 @@ with gr.Blocks() as demo:
         num_shots_slider = gr.Slider(
             minimum=0,
             maximum=5,
-            value=5,
             step=1,
             label="Number of Few-shot Examples",
             info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."
         )
-        num_examples_slider = gr.Slider(
             minimum=1,
-            maximum=10,
-            value=5,
             step=1,
-            label="Examples per Subject",
-            info="Number of test examples per subject (1-10). They will be loaded in alphabetical order."
         )
     with gr.Row():
@@ -184,12 +197,31 @@ with gr.Blocks() as demo:
     )
     # Update num_subjects_slider interactivity based on all_subjects checkbox
     all_subjects_checkbox.change(
-        fn=lambda x: gr.update(interactive=not x),
         inputs=[all_subjects_checkbox],
         outputs=[num_subjects_slider]
     )
     # Connect MMLU evaluation button
     eval_mmlu_button.click(
         fn=run_mmlu_evaluation,
@@ -197,9 +229,10 @@ with gr.Blocks() as demo:
             all_subjects_checkbox,
             num_subjects_slider,
             num_shots_slider,
-            num_examples_slider
         ],
         outputs=results_output
     )
-demo.launch()

 # 3. MMLU Evaluation call
 # ---------------------------------------------------------------------------
 @spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
+def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions):
     """
     Runs the MMLU evaluation with the specified parameters.
         all_subjects (bool): Whether to evaluate all subjects
         num_subjects (int): Number of subjects to evaluate (1-57)
         num_shots (int): Number of few-shot examples (0-5)
+        all_questions (bool): Whether to evaluate all questions per subject
+        num_questions (int): Number of examples per subject (1-20 or -1 for all)
     """
     if not model_loaded:
     # Convert num_subjects to -1 if all_subjects is True
     if all_subjects:
         num_subjects = -1
+    # Convert num_questions to -1 if all_questions is True
+    if all_questions:
+        num_questions = -1
     # Run evaluation
     results = evaluate_mmlu(
         model,
         tokenizer,
         num_subjects=num_subjects,
+        num_questions=num_questions,
         num_shots=num_shots
     )
     with gr.Row():
         all_subjects_checkbox = gr.Checkbox(
             label="Evaluate All Subjects",
+            value=False,  # Default is unchecked
             info="When checked, evaluates all 57 MMLU subjects"
         )
         num_subjects_slider = gr.Slider(
             minimum=1,
             maximum=57,
+            value=10,  # Default is 10 subjects
             step=1,
             label="Number of Subjects",
             info="Number of subjects to evaluate (1-57). They will be loaded in alphabetical order.",
         num_shots_slider = gr.Slider(
             minimum=0,
             maximum=5,
+            value=5,  # Default is 5 few-shot examples
             step=1,
             label="Number of Few-shot Examples",
             info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."
         )
+    with gr.Row():
+        all_questions_checkbox = gr.Checkbox(
+            label="Evaluate All Questions",
+            value=False,  # Default is unchecked
+            info="When checked, evaluates all available questions for each subject"
+        )
+        num_questions_slider = gr.Slider(
             minimum=1,
+            maximum=20,
+            value=10,  # Default is 10 questions
             step=1,
+            label="Questions per Subject",
+            info="Choose a subset of questions (1-20), or click the checkbox for All Questions",
+            interactive=True
         )
     with gr.Row():
     )
     # Update num_subjects_slider interactivity based on all_subjects checkbox
+    def update_subjects_slider(checked):
+        if checked:
+            return gr.update(value=57, interactive=False)
+        else:
+            return gr.update(interactive=True)
     all_subjects_checkbox.change(
+        fn=update_subjects_slider,
         inputs=[all_subjects_checkbox],
         outputs=[num_subjects_slider]
     )
+    # Update num_questions_slider interactivity based on all_questions checkbox
+    def update_questions_slider(checked):
+        if checked:
+            return gr.update(interactive=False)
+        else:
+            return gr.update(interactive=True)
+    all_questions_checkbox.change(
+        fn=update_questions_slider,
+        inputs=[all_questions_checkbox],
+        outputs=[num_questions_slider]
+    )
     # Connect MMLU evaluation button
     eval_mmlu_button.click(
         fn=run_mmlu_evaluation,
             all_subjects_checkbox,
             num_subjects_slider,
             num_shots_slider,
+            all_questions_checkbox,
+            num_questions_slider
         ],
         outputs=results_output
     )
+demo.launch()