Update app.py
app.py  CHANGED
@@ -60,71 +60,141 @@ def run_toy_evaluation():

Old version (file lines 60-130):

 # 3. MMLU Evaluation call
 # ---------------------------------------------------------------------------
 @spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
-def run_mmlu_evaluation(
-    if not model_loaded:
-        load_model()
-
-    if not model_loaded:
-        return "⚠️ Model not loaded. Please load the model first."
     """
-    Runs the MMLU evaluation with the specified
-
     """
-
-
-    overall_accuracy = results["overall_accuracy"]
-    min_task, min_acc = results["min_accuracy_task"]
-    max_task, max_acc = results["max_accuracy_task"]
-    correct_examples = results["correct_examples"]
-    incorrect_examples = results["incorrect_examples"]
-
-    # Format examples for readability
-    def format_example(example):
-        task, question, model_output, correct_answer = example
-        return f"**Task:** {task}\n**Question:** {question}\n**Model Output:** {model_output}\n**Correct Answer:** {correct_answer}\n"

-
-

     report = (
-        f"### Overall
-        f"
-        f"
-        f"
-        f"###
-        f"
     )

     return report
-
 # ---------------------------------------------------------------------------
 # 4. Gradio Interface
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# Mistral-7B
     gr.Markdown("""
-    This demo evaluates Mistral-7B on
     """)

-    # Load Model
-
-
-

-    # Toy Dataset Evaluation
     gr.Markdown("### Toy Dataset Evaluation")
-
-
-
-
-
-
-    # MMLU Evaluation
     gr.Markdown("### MMLU Evaluation")
-
-
-
-
-

-    demo.launch()

New version (file lines 60-200):

 # 3. MMLU Evaluation call
 # ---------------------------------------------------------------------------
 @spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
+def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
     """
+    Runs the MMLU evaluation with the specified parameters.
+
+    Args:
+        all_subjects (bool): Whether to evaluate all subjects
+        num_subjects (int): Number of subjects to evaluate (1-57)
+        num_shots (int): Number of few-shot examples (0-5)
+        num_examples (int): Number of examples per subject (1-10 or -1 for all)
     """
+    if not model_loaded:
+        return "⚠️ Model not loaded. Please load the model first."

+    # Convert num_subjects to -1 if all_subjects is True
+    if all_subjects:
+        num_subjects = -1
+
+    # Run evaluation
+    results = evaluate_mmlu(
+        model,
+        tokenizer,
+        num_subjects=num_subjects,
+        num_questions=num_examples,
+        num_shots=num_shots
+    )

+    # Format results
+    overall_acc = results["overall_accuracy"]
+    min_subject, min_acc = results["min_accuracy_subject"]
+    max_subject, max_acc = results["max_accuracy_subject"]
+
+    # Create DataFrame from results table
+    results_df = pd.DataFrame(results["full_accuracy_table"])
+
+    # Format the report
     report = (
+        f"### Overall Results\n"
+        f"* Overall Accuracy: {overall_acc:.3f}\n"
+        f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
+        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n\n"
+        f"### Detailed Results Table\n"
+        f"{results_df.to_markdown()}\n"
     )

     return report
+
 # ---------------------------------------------------------------------------
 # 4. Gradio Interface
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# Mistral-7B on MMLU - Evaluation Demo")
     gr.Markdown("""
+    This demo evaluates Mistral-7B on the MMLU Dataset.
     """)

+    # Load Model Section
+    with gr.Row():
+        load_button = gr.Button("Load Model", variant="primary")
+        load_status = gr.Textbox(label="Model Status", interactive=False)

+    # Toy Dataset Evaluation Section
     gr.Markdown("### Toy Dataset Evaluation")
+    with gr.Row():
+        eval_toy_button = gr.Button("Run Toy Evaluation", variant="primary")
+        toy_output = gr.Textbox(label="Results")
+        toy_plot = gr.HTML(label="Visualization and Details")
+
+    # MMLU Evaluation Section
     gr.Markdown("### MMLU Evaluation")
+
+    with gr.Row():
+        all_subjects_checkbox = gr.Checkbox(
+            label="Evaluate All Subjects",
+            value=True,
+            info="When checked, evaluates all 57 MMLU subjects"
+        )
+        num_subjects_slider = gr.Slider(
+            minimum=1,
+            maximum=57,
+            value=57,
+            step=1,
+            label="Number of Subjects",
+            info="Number of subjects to evaluate (1-57)",
+            interactive=True
+        )
+
+    with gr.Row():
+        num_shots_slider = gr.Slider(
+            minimum=0,
+            maximum=5,
+            value=5,
+            step=1,
+            label="Number of Few-shot Examples",
+            info="Number of examples to use for few-shot learning (0-5)"
+        )
+        num_examples_slider = gr.Slider(
+            minimum=1,
+            maximum=10,
+            value=5,
+            step=1,
+            label="Examples per Subject",
+            info="Number of test examples per subject (1-10)"
+        )
+
+    with gr.Row():
+        eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
+        results_output = gr.Markdown(label="Evaluation Results")
+
+    # Connect components
+    load_button.click(fn=load_model, inputs=None, outputs=load_status)
+
+    # Connect toy evaluation
+    eval_toy_button.click(
+        fn=run_toy_evaluation,
+        inputs=None,
+        outputs=[toy_output, toy_plot]
+    )
+
+    # Update num_subjects_slider interactivity based on all_subjects checkbox
+    all_subjects_checkbox.change(
+        fn=lambda x: gr.update(interactive=not x),
+        inputs=[all_subjects_checkbox],
+        outputs=[num_subjects_slider]
+    )
+
+    # Connect MMLU evaluation button
+    eval_mmlu_button.click(
+        fn=run_mmlu_evaluation,
+        inputs=[
+            all_subjects_checkbox,
+            num_subjects_slider,
+            num_shots_slider,
+            num_examples_slider
+        ],
+        outputs=results_output
+    )

+demo.launch()
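
The hunk above references several names that are defined elsewhere in app.py and do not appear in this diff: model, tokenizer, model_loaded, load_model, run_toy_evaluation, evaluate_mmlu, and the pd alias for pandas. The sketch below only illustrates the contract the new run_mmlu_evaluation appears to assume, reconstructed from its call sites; the model checkpoint, function signatures, and return keys are assumptions, not code from the Space.

# Hypothetical sketch, not the Space's actual code: the interfaces that the
# new run_mmlu_evaluation and the Gradio wiring above appear to rely on.
# app.py would also import gradio as gr, pandas as pd, and spaces at the top.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "mistralai/Mistral-7B-v0.1"  # assumption; the diff only says "Mistral-7B"

model = None
tokenizer = None
model_loaded = False

def load_model():
    """Load the model once and return a status string for the Model Status textbox."""
    global model, tokenizer, model_loaded
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype=torch.float16, device_map="auto"
    )
    model_loaded = True
    return "✅ Model loaded."

def evaluate_mmlu(model, tokenizer, num_subjects=-1, num_questions=5, num_shots=5):
    """Assumed return shape: a dict with exactly the keys read in run_mmlu_evaluation."""
    return {
        "overall_accuracy": 0.0,
        "min_accuracy_subject": ("placeholder_subject", 0.0),
        "max_accuracy_subject": ("placeholder_subject", 0.0),
        "full_accuracy_table": [],  # per-subject rows fed to pd.DataFrame
    }

One practical note on the new report formatting: results_df.to_markdown() calls into the tabulate package, so the Space's requirements.txt would need tabulate (and pandas) alongside gradio, torch, and transformers for that line to work.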