Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -36,7 +36,8 @@ def extract_choice_letter(output):
     match = re.search(r"\b([ABCD])\b", output.strip())
     return match.group(1) if match else None
 
-def evaluate(model_id, sample_count, config_name):
+# Added progress parameter to the evaluate function
+def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
     if config_name == "ALL":
         subjects = [
             "abstract_algebra", "anatomy", "astronomy", "business_ethics", "college_biology",
@@ -57,11 +58,13 @@ def evaluate(model_id, sample_count, config_name):
         total_correct = 0
         total_samples = 0
         all_results = []
-        for subject in subjects:
+        # Use progress for subject iteration
+        for i, subject in enumerate(progress.tqdm(subjects, desc="Evaluating subjects")):
             dataset = load_dataset("cais/mmlu", subject, token=HF_TOKEN)["test"]
             dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
             correct = 0
-            for item in dataset:
+            # Use progress for sample iteration within each subject
+            for j, item in enumerate(progress.tqdm(dataset, desc=f"Processing {subject} samples")):
                 prompt, answer = format_prompt(item)
                 output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
                 output_letter = extract_choice_letter(output)
@@ -70,7 +73,7 @@ def evaluate(model_id, sample_count, config_name):
             accuracy = correct / len(dataset) * 100
             record = {"model_id": model_id, "subject": subject, "accuracy": accuracy}
             with open("eval.jsonl", "a") as f:
-                f.write(json.dumps(record) + "\n")
+                f.write(json.dumps(record) + "\n")
             total_correct += correct
             total_samples += len(dataset)
         avg_accuracy = total_correct / total_samples * 100
@@ -82,7 +85,8 @@ def evaluate(model_id, sample_count, config_name):
         correct = 0
         results = []
 
-        for item in dataset:
+        # Use progress for sample iteration
+        for i, item in enumerate(progress.tqdm(dataset, desc=f"Processing {config_name} samples")):
             prompt, answer = format_prompt(item)
             output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
             output_letter = extract_choice_letter(output)
@@ -93,8 +97,9 @@ def evaluate(model_id, sample_count, config_name):
         accuracy = correct / len(dataset) * 100
         return f"Accuracy: {accuracy:.2f}%, out of {len(dataset)} samples", results
 
-def run(model_id, sample_count, config_name):
-    score, details = evaluate(model_id, sample_count, config_name)
+# Pass progress to evaluate function
+def run(model_id, sample_count, config_name, progress=gr.Progress()):
+    score, details = evaluate(model_id, sample_count, config_name, progress)
     formatted = "\n\n".join([
         f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
         for q, o, a, g, c in details
@@ -102,7 +107,7 @@ def run(model_id, sample_count, config_name):
     accuracy_value = float(score.split()[1][:-1])
     record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
     with open("eval.jsonl", "a") as f:
-        f.write(json.dumps(record) + "\n")
+        f.write(json.dumps(record) + "\n")
     return score, formatted
 
 def save_text(text):
@@ -133,6 +138,7 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
     detail_output = gr.Textbox(label="Evaluation Details", lines=20, interactive=False)
     download_button = gr.Button("📥 Download Full Evaluation")
 
+    # Pass progress to the run function
    run_button.click(run, inputs=[model_id, sample_count, config_name], outputs=[acc_output, detail_output])
    download_button.click(save_text, inputs=detail_output, outputs=gr.File())
 
@@ -157,8 +163,8 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
            return fig, df_sorted
        except Exception as e:
            # Handle the case where eval.jsonl might not exist yet
-            return plt.figure(), pd.DataFrame(columns=["model_id", "average_accuracy"])
+            return plt.figure(), pd.DataFrame(columns=["model_id", "average_accuracy"])
 
    demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])
 
-demo.launch()
+demo.launch()
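The whole change hinges on Gradio's progress tracking: giving an event handler a parameter whose default is gr.Progress() makes Gradio inject a tracker at call time, and wrapping an iterable in progress.tqdm(...) advances the bar once per step. Below is a minimal, self-contained sketch of that pattern, not the Space's actual app.py; the function name slow_count, the step count, and the sleep are illustrative stand-ins for the evaluate()/run() loops in the diff.

import time

import gradio as gr


def slow_count(n, progress=gr.Progress()):
    # Gradio injects a Progress tracker for any handler parameter whose
    # default value is gr.Progress(); callers never pass it explicitly.
    total = 0
    # progress.tqdm wraps an iterable and advances the bar on every step,
    # mirroring the loops over subjects and samples in evaluate().
    for i in progress.tqdm(range(int(n)), desc="Counting"):
        time.sleep(0.1)  # stand-in for one model generation
        total += i
    return f"Sum of 0..{int(n) - 1} = {total}"


with gr.Blocks() as demo:
    n = gr.Number(value=20, label="Steps")
    out = gr.Textbox(label="Result")
    # While the handler runs, the progress bar is shown on the output
    # component, as it would be on the Space's output textboxes.
    gr.Button("Run").click(slow_count, inputs=n, outputs=out)

demo.launch()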