Update app.py
app.py CHANGED
@@ -36,7 +36,7 @@ def extract_choice_letter(output):
     match = re.search(r"\b([ABCD])\b", output.strip())
     return match.group(1) if match else None
 
-#
+# Modified evaluate function to return accuracy as a float directly
 def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
     if config_name == "ALL":
         subjects = [
@@ -58,26 +58,22 @@ def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
         total_correct = 0
         total_samples = 0
         all_results = []
-        # Use progress for subject iteration
         for i, subject in enumerate(progress.tqdm(subjects, desc="Evaluating subjects")):
             dataset = load_dataset("cais/mmlu", subject, token=HF_TOKEN)["test"]
             dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
             correct = 0
-            # Use progress for sample iteration within each subject
             for j, item in enumerate(progress.tqdm(dataset, desc=f"Processing {subject} samples")):
                 prompt, answer = format_prompt(item)
                 output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
                 output_letter = extract_choice_letter(output)
                 correct += output_letter == answer
                 all_results.append((prompt, output.strip(), answer, output_letter, output_letter == answer))
-
-            record = {"model_id": model_id, "subject": subject, "accuracy": accuracy}
-            with open("eval.jsonl", "a") as f:
-                f.write(json.dumps(record) + "\n")
+            # No need to write subject-level record here, only aggregate
             total_correct += correct
             total_samples += len(dataset)
         avg_accuracy = total_correct / total_samples * 100
-
+        # Return the float accuracy value
+        return avg_accuracy, all_results
     gen = load_model(model_id)
     dataset = load_dataset("cais/mmlu", config_name, token=HF_TOKEN)["test"]
     dataset = dataset.shuffle(seed=42).select(range(min(sample_count, len(dataset))))
@@ -85,7 +81,6 @@ def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
     correct = 0
     results = []
 
-    # Use progress for sample iteration
     for i, item in enumerate(progress.tqdm(dataset, desc=f"Processing {config_name} samples")):
         prompt, answer = format_prompt(item)
         output = gen(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
@@ -95,20 +90,30 @@ def evaluate(model_id, sample_count, config_name, progress=gr.Progress()):
         results.append((prompt, output.strip(), answer, output_letter, is_correct))
 
     accuracy = correct / len(dataset) * 100
-
+    # Return the float accuracy value
+    return accuracy, results
 
 # Pass progress to evaluate function
 def run(model_id, sample_count, config_name, progress=gr.Progress()):
-
+    # Receive accuracy_value directly as a float
+    accuracy_value, details = evaluate(model_id, sample_count, config_name, progress)
+
     formatted = "\n\n".join([
         f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
         for q, o, a, g, c in details
     ])
-
+
+    # Format the score string based on config_name
+    if config_name == "ALL":
+        score_string = f"Average Accuracy: {accuracy_value:.2f}% across all subjects"
+    else:
+        # Assuming len(details) corresponds to the number of samples processed for a single subject
+        score_string = f"Accuracy: {accuracy_value:.2f}%, out of {len(details)} samples"
+
     record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
     with open("eval.jsonl", "a") as f:
         f.write(json.dumps(record) + "\n")
-    return
+    return score_string, formatted # Return the formatted string and details
 
 def save_text(text):
     return "evaluation_results.txt", text
@@ -167,4 +172,4 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
 
     demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])
 
-demo.launch()
+demo.launch()
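With this change, evaluate() returns a plain float accuracy together with the per-sample details, and run() returns (score_string, formatted), so the UI needs exactly two output components for the run callback. The Blocks layout itself is outside the hunks shown above, so the component names below (model_box, sample_slider, config_dropdown, run_button, score_output, details_output) are placeholders, not taken from app.py; this is only a minimal wiring sketch that assumes app.py's run() is in scope.

# Hypothetical wiring sketch -- component names are assumptions, not from app.py.
import gradio as gr

with gr.Blocks() as demo:
    model_box = gr.Textbox(label="Model ID")
    sample_slider = gr.Slider(1, 100, value=10, step=1, label="Samples per subject")
    config_dropdown = gr.Dropdown(choices=["ALL", "abstract_algebra", "anatomy"], label="MMLU config")
    run_button = gr.Button("Evaluate")
    score_output = gr.Textbox(label="Score")  # receives score_string
    details_output = gr.Markdown()            # receives the formatted per-question report

    # run() now returns (score_string, formatted), so it maps onto exactly two outputs.
    run_button.click(
        run,
        inputs=[model_box, sample_slider, config_dropdown],
        outputs=[score_output, details_output],
    )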
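Each call to run() also appends one JSON line to eval.jsonl with the fields model_id, subject, and accuracy, and demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table]) refreshes the leaderboard from that file on page load. load_leaderboard itself is not part of this diff, so the following is only a sketch of a compatible reader; it assumes a pandas DataFrame for the table and a matplotlib figure for the plot, which may differ from what app.py actually builds.

# Sketch only: a load_leaderboard-compatible reader for eval.jsonl (assumed, not from the diff).
import json
import pandas as pd
import matplotlib.pyplot as plt

def load_leaderboard_sketch():
    # One record per evaluation run: {"model_id": ..., "subject": ..., "accuracy": ...}
    try:
        with open("eval.jsonl") as f:
            records = [json.loads(line) for line in f if line.strip()]
    except FileNotFoundError:
        records = []  # no evaluations recorded yet

    df = pd.DataFrame(records, columns=["model_id", "subject", "accuracy"])

    # Bar chart of mean accuracy per model across whatever subjects were evaluated.
    fig, ax = plt.subplots()
    if not df.empty:
        df.groupby("model_id")["accuracy"].mean().sort_values().plot.barh(ax=ax)
        ax.set_xlabel("Accuracy (%)")
    return fig, df  # shapes match outputs=[leaderboard_plot, leaderboard_table]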