Spaces:
Running
Running
- metrics.py +0 -37
metrics.py
CHANGED
@@ -100,11 +100,6 @@ def load_results_sample_one_only():
|
|
100 |
WHERE rn = 1;
|
101 |
"""
|
102 |
conn.execute(query).fetchall()
|
103 |
-
# #print how many rows are in the table
|
104 |
-
# print(conn.execute("SELECT COUNT(*) FROM sampled").fetchall())
|
105 |
-
# #describe the sampled table
|
106 |
-
# print(conn.execute("DESCRIBE sampled").fetchall())
|
107 |
-
|
108 |
conn.execute("""
|
109 |
CREATE TABLE challenges AS
|
110 |
SELECT * FROM 'puzzles_cleaned.csv'
|
@@ -218,38 +213,6 @@ def accuracy_by_model(conn):
|
|
218 |
AnswerCheck
|
219 |
""")
|
220 |
|
221 |
-
def accuracy_by_model_only_one(conn):
|
222 |
-
query = """
|
223 |
-
WITH FirstResponses AS (
|
224 |
-
SELECT
|
225 |
-
parent_dir AS model,
|
226 |
-
prompt_id,
|
227 |
-
completion,
|
228 |
-
count,
|
229 |
-
ROW_NUMBER() OVER (PARTITION BY parent_dir, prompt_id) AS rn
|
230 |
-
FROM results.completions
|
231 |
-
WHERE parent_dir = 'completions-r1_cursor_hosted' -- Only consider rows where parent_dir is 'completions-r1_cursor_hosted'
|
232 |
-
),
|
233 |
-
AnswerCheck AS (
|
234 |
-
SELECT
|
235 |
-
fr.model,
|
236 |
-
SUM(fr.count) AS total,
|
237 |
-
SUM(fr.count * CAST(check_answer(fr.completion, c.answer) AS INTEGER)) AS correct
|
238 |
-
FROM FirstResponses fr
|
239 |
-
JOIN challenges c ON fr.prompt_id = c.ID
|
240 |
-
WHERE fr.rn = 1 -- Select only the first response per model per prompt
|
241 |
-
GROUP BY fr.model
|
242 |
-
)
|
243 |
-
SELECT
|
244 |
-
model,
|
245 |
-
total,
|
246 |
-
correct,
|
247 |
-
ROUND(correct / total, 2) AS accuracy
|
248 |
-
FROM AnswerCheck;
|
249 |
-
"""
|
250 |
-
return conn.sql(query)
|
251 |
-
|
252 |
-
|
253 |
def main():
|
254 |
parser = argparse.ArgumentParser()
|
255 |
parser.add_argument("--by-model-and-time", action="store_true")
|
|
|
100 |
WHERE rn = 1;
|
101 |
"""
|
102 |
conn.execute(query).fetchall()
|
|
|
|
|
|
|
|
|
|
|
103 |
conn.execute("""
|
104 |
CREATE TABLE challenges AS
|
105 |
SELECT * FROM 'puzzles_cleaned.csv'
|
|
|
213 |
AnswerCheck
|
214 |
""")
|
215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
def main():
|
217 |
parser = argparse.ArgumentParser()
|
218 |
parser.add_argument("--by-model-and-time", action="store_true")
|