Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -70,8 +70,7 @@ def evaluate(model_id, sample_count, config_name):
         accuracy = correct / len(dataset) * 100
         record = {"model_id": model_id, "subject": subject, "accuracy": accuracy}
         with open("eval.jsonl", "a") as f:
-            f.write(json.dumps(record) + "
-")
+            f.write(json.dumps(record) + "\n") # Fixed: added closing double quote and newline
         total_correct += correct
         total_samples += len(dataset)
     avg_accuracy = total_correct / total_samples * 100
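Note on the hunk above: the removed pair of lines split the string literal across a physical line break, which is invalid Python and would also break the one-record-per-line layout of eval.jsonl. A minimal sketch of the intended round trip, with hypothetical record values rather than the app's real ones:

import json
import pandas as pd

# Hypothetical record; the field names match the diff, the values are made up.
record = {"model_id": "demo/model", "subject": "astronomy", "accuracy": 87.5}

with open("eval.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")   # exactly one JSON object per line

df = pd.read_json("eval.jsonl", lines=True)  # reads the line-delimited file back into a DataFrame
print(df.tail(1))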
@@ -96,23 +95,14 @@ def evaluate(model_id, sample_count, config_name):
 
 def run(model_id, sample_count, config_name):
     score, details = evaluate(model_id, sample_count, config_name)
-    formatted = "
-
-".join([
-        f"### Question:
-{q}
-
-**Model Answer:** {o}
-**Expected:** {a}
-**Predicted:** {g}
-**Correct:** {c}"
+    formatted = "\n\n".join([
+        f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
         for q, o, a, g, c in details
     ])
     accuracy_value = float(score.split()[1][:-1])
     record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
     with open("eval.jsonl", "a") as f:
-        f.write(json.dumps(record) + "
-")
+        f.write(json.dumps(record) + "\n") # Fixed: added closing double quote and newline
     return score, formatted
 
 def save_text(text):
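Note on run() above: accuracy_value = float(score.split()[1][:-1]) takes the second whitespace-separated token of the score string and strips its last character, so it assumes a format along the lines of "Accuracy: 85.0%"; the exact string returned by evaluate() is not shown in this diff. A small sketch under that assumption, with a made-up details entry:

# Assumed score format ("Accuracy: 85.0%") -- not confirmed by this diff.
score = "Accuracy: 85.0%"
accuracy_value = float(score.split()[1][:-1])  # token 1 is "85.0%", [:-1] drops the "%"
print(accuracy_value)  # 85.0

# Hypothetical details entry; the diff only shows that each item unpacks into q, o, a, g, c.
details = [("What is 2 + 2?", "The answer is 4.", "4", "4", True)]
formatted = "\n\n".join(
    f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
    for q, o, a, g, c in details
)
print(formatted)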
@@ -151,25 +141,24 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
     leaderboard_table = gr.Dataframe(headers=["Model ID", "Average Accuracy"], interactive=False, datatype=["str", "number"], row_count=20, col_count=2)
 
     def load_leaderboard():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        except Exception as e:
-            return plt.figure(), pd.DataFrame(columns=["model_id", "average_accuracy"])
+        try:
+            df = pd.read_json("eval.jsonl", lines=True)
+            df_avg = df.groupby("model_id")["accuracy"].mean().reset_index()
+            df_avg.columns = ["model_id", "average_accuracy"]
+            df_sorted = df_avg.sort_values(by="average_accuracy", ascending=False)
+            top10 = df_sorted.head(10)
+
+            fig, ax = plt.subplots()
+            ax.barh(top10['model_id'], top10['average_accuracy'])
+            ax.set_xlabel("Average Accuracy")
+            ax.set_ylabel("Model")
+            ax.set_title("Top 10 Models by Average Accuracy")
+
+            return fig, df_sorted
         except Exception as e:
-
+            # Handle the case where eval.jsonl might not exist yet
+            return plt.figure(), pd.DataFrame(columns=["model_id", "average_accuracy"]) # Corrected columns
 
     demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])
 
-demo.launch()
+demo.launch()
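Note on the rebuilt load_leaderboard() above: it averages every logged accuracy per model_id and plots the top 10. A self-contained check of that aggregation, using made-up records and a separate demo file so the app's eval.jsonl is untouched:

import json
import pandas as pd
import matplotlib.pyplot as plt

# Made-up records with hypothetical model IDs.
records = [
    {"model_id": "org/model-a", "subject": "physics", "accuracy": 80.0},
    {"model_id": "org/model-a", "subject": "history", "accuracy": 60.0},
    {"model_id": "org/model-b", "subject": "physics", "accuracy": 75.0},
]
with open("eval_demo.jsonl", "w") as f:
    for r in records:
        f.write(json.dumps(r) + "\n")

# Same aggregation as load_leaderboard(), pointed at the demo file.
df = pd.read_json("eval_demo.jsonl", lines=True)
df_avg = df.groupby("model_id")["accuracy"].mean().reset_index()
df_avg.columns = ["model_id", "average_accuracy"]
df_sorted = df_avg.sort_values(by="average_accuracy", ascending=False)
print(df_sorted)  # model-b (75.0) ranks above model-a (average 70.0)

fig, ax = plt.subplots()
ax.barh(df_sorted.head(10)["model_id"], df_sorted.head(10)["average_accuracy"])
ax.set_xlabel("Average Accuracy")
ax.set_title("Top 10 Models by Average Accuracy")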