rohansampath committed df31ae3 · verified · 1 parent: da41998

Update app.py

Files changed (1):
  app.py  +119 -49
app.py CHANGED
@@ -60,71 +60,141 @@ def run_toy_evaluation():
 # 3. MMLU Evaluation call
 # ---------------------------------------------------------------------------
 @spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
-def run_mmlu_evaluation(num_questions):
-    if not model_loaded:
-        load_model()
-
-    if not model_loaded:
-        return "⚠️ Model not loaded. Please load the model first."
+def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
     """
-    Runs the MMLU evaluation with the specified number of questions per task.
-    Also displays two correct and two incorrect examples.
+    Runs the MMLU evaluation with the specified parameters.
+
+    Args:
+        all_subjects (bool): Whether to evaluate all subjects
+        num_subjects (int): Number of subjects to evaluate (1-57)
+        num_shots (int): Number of few-shot examples (0-5)
+        num_examples (int): Number of examples per subject (1-10 or -1 for all)
     """
-    results = evaluate_mmlu(model, tokenizer, num_questions)
-
-    overall_accuracy = results["overall_accuracy"]
-    min_task, min_acc = results["min_accuracy_task"]
-    max_task, max_acc = results["max_accuracy_task"]
-    correct_examples = results["correct_examples"]
-    incorrect_examples = results["incorrect_examples"]
-
-    # Format examples for readability
-    def format_example(example):
-        task, question, model_output, correct_answer = example
-        return f"**Task:** {task}\n**Question:** {question}\n**Model Output:** {model_output}\n**Correct Answer:** {correct_answer}\n"
+    if not model_loaded:
+        return "⚠️ Model not loaded. Please load the model first."
 
-    correct_text = "\n\n".join(format_example(ex) for ex in correct_examples)
-    incorrect_text = "\n\n".join(format_example(ex) for ex in incorrect_examples)
+    # Convert num_subjects to -1 if all_subjects is True
+    if all_subjects:
+        num_subjects = -1
+
+    # Run evaluation
+    results = evaluate_mmlu(
+        model,
+        tokenizer,
+        num_subjects=num_subjects,
+        num_questions=num_examples,
+        num_shots=num_shots
+    )
 
+    # Format results
+    overall_acc = results["overall_accuracy"]
+    min_subject, min_acc = results["min_accuracy_subject"]
+    max_subject, max_acc = results["max_accuracy_subject"]
+
+    # Create DataFrame from results table
+    results_df = pd.DataFrame(results["full_accuracy_table"])
+
+    # Format the report
     report = (
-        f"### Overall Accuracy: {overall_accuracy:.2f}\n"
-        f"**Min Accuracy:** {min_acc:.2f} on `{min_task}`\n"
-        f"**Max Accuracy:** {max_acc:.2f} on `{max_task}`\n\n"
-        f"---\n\n"
-        f"### Correct Examples\n{correct_text if correct_examples else 'No correct examples available.'}\n\n"
-        f"### ❌ Incorrect Examples\n{incorrect_text if incorrect_examples else 'No incorrect examples available.'}"
+        f"### Overall Results\n"
+        f"* Overall Accuracy: {overall_acc:.3f}\n"
+        f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
+        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n\n"
+        f"### Detailed Results Table\n"
+        f"{results_df.to_markdown()}\n"
     )
 
     return report
-
+
 # ---------------------------------------------------------------------------
 # 4. Gradio Interface
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# Mistral-7B Math Evaluation Demo")
+    gr.Markdown("# Mistral-7B on MMLU - Evaluation Demo")
     gr.Markdown("""
-    This demo evaluates Mistral-7B on Various Datasets.
+    This demo evaluates Mistral-7B on the MMLU Dataset.
     """)
 
-    # Load Model Button
-    load_button = gr.Button("Load Model", variant="primary")
-    load_status = gr.Textbox(label="Model Status", interactive=False)
-    load_button.click(fn=load_model, inputs=None, outputs=load_status)
+    # Load Model Section
+    with gr.Row():
+        load_button = gr.Button("Load Model", variant="primary")
+        load_status = gr.Textbox(label="Model Status", interactive=False)
 
-    # Toy Dataset Evaluation
+    # Toy Dataset Evaluation Section
    gr.Markdown("### Toy Dataset Evaluation")
-    eval_button = gr.Button("Run Evaluation", variant="primary")
-    output_text = gr.Textbox(label="Results")
-    output_plot = gr.HTML(label="Visualization and Details")
-
-    eval_button.click(fn=run_toy_evaluation, inputs=None, outputs=[output_text, output_plot])
-
-    # MMLU Evaluation
+    with gr.Row():
+        eval_toy_button = gr.Button("Run Toy Evaluation", variant="primary")
+        toy_output = gr.Textbox(label="Results")
+        toy_plot = gr.HTML(label="Visualization and Details")
+
+    # MMLU Evaluation Section
    gr.Markdown("### MMLU Evaluation")
-    num_questions_input = gr.Number(label="Questions per Task (Total of 57 tasks)", value=5, precision=0)
-    eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
-    mmlu_output = gr.Textbox(label="MMLU Evaluation Results")
-
-    eval_mmlu_button.click(fn=run_mmlu_evaluation, inputs=[num_questions_input], outputs=[mmlu_output])
+
+    with gr.Row():
+        all_subjects_checkbox = gr.Checkbox(
+            label="Evaluate All Subjects",
+            value=True,
+            info="When checked, evaluates all 57 MMLU subjects"
+        )
+        num_subjects_slider = gr.Slider(
+            minimum=1,
+            maximum=57,
+            value=57,
+            step=1,
+            label="Number of Subjects",
+            info="Number of subjects to evaluate (1-57)",
+            interactive=True
+        )
+
+    with gr.Row():
+        num_shots_slider = gr.Slider(
+            minimum=0,
+            maximum=5,
+            value=5,
+            step=1,
+            label="Number of Few-shot Examples",
+            info="Number of examples to use for few-shot learning (0-5)"
+        )
+        num_examples_slider = gr.Slider(
+            minimum=1,
+            maximum=10,
+            value=5,
+            step=1,
+            label="Examples per Subject",
+            info="Number of test examples per subject (1-10)"
+        )
+
+    with gr.Row():
+        eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
+        results_output = gr.Markdown(label="Evaluation Results")
+
+    # Connect components
+    load_button.click(fn=load_model, inputs=None, outputs=load_status)
+
+    # Connect toy evaluation
+    eval_toy_button.click(
+        fn=run_toy_evaluation,
+        inputs=None,
+        outputs=[toy_output, toy_plot]
+    )
+
+    # Update num_subjects_slider interactivity based on all_subjects checkbox
+    all_subjects_checkbox.change(
+        fn=lambda x: gr.update(interactive=not x),
+        inputs=[all_subjects_checkbox],
+        outputs=[num_subjects_slider]
+    )
+
+    # Connect MMLU evaluation button
+    eval_mmlu_button.click(
+        fn=run_mmlu_evaluation,
+        inputs=[
+            all_subjects_checkbox,
+            num_subjects_slider,
+            num_shots_slider,
+            num_examples_slider
+        ],
+        outputs=results_output
+    )
 
-demo.launch()
+demo.launch()
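
For reference, the updated `run_mmlu_evaluation` builds its report from a handful of keys on the `evaluate_mmlu` result. Below is a minimal sketch of that contract, inferred only from the call sites in this diff; `evaluate_mmlu` itself is defined outside this file, and the row fields inside `full_accuracy_table` are hypothetical placeholders, not the real implementation.

```python
# Sketch of the result shape run_mmlu_evaluation expects from evaluate_mmlu.
# Dict keys come from the diff above; the per-subject row fields are assumed.
from typing import Any

def evaluate_mmlu(model: Any, tokenizer: Any,
                  num_subjects: int = -1,   # -1 = all 57 subjects
                  num_questions: int = 5,   # test examples scored per subject
                  num_shots: int = 5) -> dict:
    # ... run the actual evaluation here ...
    return {
        "overall_accuracy": 0.0,                         # float in [0, 1]
        "min_accuracy_subject": ("worst_subject", 0.0),  # (name, accuracy)
        "max_accuracy_subject": ("best_subject", 0.0),   # (name, accuracy)
        # One record per subject; pd.DataFrame(...) consumes this directly.
        "full_accuracy_table": [
            {"subject": "abstract_algebra", "accuracy": 0.0},
        ],
    }
```

One practical note: `results_df.to_markdown()` relies on pandas' optional `tabulate` dependency, so the new report path assumes both `pandas` (imported as `pd`) and `tabulate` are available in the Space's environment.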