Spaces:
Running
Running
update metrics
Browse files
- answer_cache/cache.db +0 -3
- metrics.py +17 -20
- results.duckdb +2 -2
answer_cache/cache.db
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:23e1a6adfdf2de2f7718c1284bf810dc64d191ee415a0975949b01c6954be1b2
|
3 |
-
size 196784128
|
|
|
|
|
|
|
|
metrics.py
CHANGED
@@ -4,12 +4,6 @@ import textwrap
|
|
4 |
from typing import List, Tuple
|
5 |
import argparse
|
6 |
import unicodedata
|
7 |
-
import re
|
8 |
-
|
9 |
-
|
10 |
-
import diskcache as dc
|
11 |
-
|
12 |
-
cache = dc.Cache("answer_cache")
|
13 |
|
14 |
def normalize_text(text: str) -> str:
|
15 |
"""Normalize text to remove accents, convert to lowercase, and strip spaces."""
|
@@ -42,22 +36,25 @@ def _parse_answer(text: str) -> List[List[str]]:
|
|
42 |
return result
|
43 |
|
44 |
def _answer_without_thoughts(completion: str) -> str:
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
50 |
|
51 |
def _check_answer(completion: str, answer: str) -> bool:
|
52 |
"""
|
53 |
Check that all the phrases that must appear in the answer appear in the
|
54 |
completion. We ignore "thoughts", capitalization, and punctuation.
|
55 |
"""
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
61 |
completion = completion.replace("**","")
|
62 |
completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
|
63 |
completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
|
@@ -66,9 +63,7 @@ def _check_answer(completion: str, answer: str) -> bool:
|
|
66 |
for answer_phrases in alternative_answers:
|
67 |
# if all(phrase in completion for phrase in answer_phrases):
|
68 |
if all(re.search(rf'\b{re.escape(phrase)}\b', completion) for phrase in answer_phrases):
|
69 |
-
cache[key] = True
|
70 |
return True
|
71 |
-
cache[key] = False
|
72 |
return False
|
73 |
|
74 |
|
@@ -81,7 +76,6 @@ def _wrap_text(text: str, width: int) -> str:
|
|
81 |
def load_results():
|
82 |
conn = duckdb.connect(":memory:")
|
83 |
conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
|
84 |
-
# conn.execute("CREATE TABLE challenges as SELECT * FROM 'puzzles_cleaned.csv'")
|
85 |
conn.execute("""
|
86 |
CREATE TABLE challenges AS
|
87 |
SELECT * FROM 'puzzles_cleaned.csv'
|
@@ -92,6 +86,7 @@ def load_results():
|
|
92 |
conn.create_function("wrap_text", _wrap_text)
|
93 |
return conn
|
94 |
|
|
|
95 |
def load_results_sample_one_only():
|
96 |
conn = duckdb.connect(":memory:")
|
97 |
conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
|
@@ -118,6 +113,7 @@ def load_results_sample_one_only():
|
|
118 |
conn.create_function("wrap_text", _wrap_text)
|
119 |
return conn
|
120 |
|
|
|
121 |
def r1_accuracy_by_completion_length(conn,model_name):
|
122 |
"""
|
123 |
For the responses from the completions-r1 model:
|
@@ -219,6 +215,8 @@ def accuracy_by_model(conn):
|
|
219 |
ROUND(correct / total, 2) AS accuracy
|
220 |
FROM
|
221 |
AnswerCheck
|
|
|
|
|
222 |
""")
|
223 |
|
224 |
def main():
|
@@ -226,7 +224,6 @@ def main():
|
|
226 |
parser.add_argument("--by-model-and-time", action="store_true")
|
227 |
args = parser.parse_args()
|
228 |
conn = load_results()
|
229 |
-
|
230 |
if args.by_model_and_time:
|
231 |
print(accuracy_by_model_and_time(conn))
|
232 |
else:
|
|
|
4 |
from typing import List, Tuple
|
5 |
import argparse
|
6 |
import unicodedata
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
def normalize_text(text: str) -> str:
|
9 |
"""Normalize text to remove accents, convert to lowercase, and strip spaces."""
|
|
|
36 |
return result
|
37 |
|
38 |
def _answer_without_thoughts(completion: str) -> str:
|
39 |
+
end_think_index = completion.find("</think>")
|
40 |
+
if end_think_index == -1:
|
41 |
+
if "<think>" in completion:
|
42 |
+
return ""
|
43 |
+
return completion
|
44 |
+
else:
|
45 |
+
return completion[end_think_index + len("</think>"):]
|
46 |
|
47 |
def _check_answer(completion: str, answer: str) -> bool:
|
48 |
"""
|
49 |
Check that all the phrases that must appear in the answer appear in the
|
50 |
completion. We ignore "thoughts", capitalization, and punctuation.
|
51 |
"""
|
52 |
+
completion = _answer_without_thoughts(completion)
|
53 |
+
|
54 |
+
if len(completion) > 3.14 * 5_300:
|
55 |
+
return False
|
56 |
+
|
57 |
+
completion = completion.lower()
|
58 |
completion = completion.replace("**","")
|
59 |
completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
|
60 |
completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
|
|
|
63 |
for answer_phrases in alternative_answers:
|
64 |
# if all(phrase in completion for phrase in answer_phrases):
|
65 |
if all(re.search(rf'\b{re.escape(phrase)}\b', completion) for phrase in answer_phrases):
|
|
|
66 |
return True
|
|
|
67 |
return False
|
68 |
|
69 |
|
|
|
76 |
def load_results():
|
77 |
conn = duckdb.connect(":memory:")
|
78 |
conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
|
|
|
79 |
conn.execute("""
|
80 |
CREATE TABLE challenges AS
|
81 |
SELECT * FROM 'puzzles_cleaned.csv'
|
|
|
86 |
conn.create_function("wrap_text", _wrap_text)
|
87 |
return conn
|
88 |
|
89 |
+
|
90 |
def load_results_sample_one_only():
|
91 |
conn = duckdb.connect(":memory:")
|
92 |
conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
|
|
|
113 |
conn.create_function("wrap_text", _wrap_text)
|
114 |
return conn
|
115 |
|
116 |
+
|
117 |
def r1_accuracy_by_completion_length(conn,model_name):
|
118 |
"""
|
119 |
For the responses from the completions-r1 model:
|
|
|
215 |
ROUND(correct / total, 2) AS accuracy
|
216 |
FROM
|
217 |
AnswerCheck
|
218 |
+
ORDER BY
|
219 |
+
model
|
220 |
""")
|
221 |
|
222 |
def main():
|
|
|
224 |
parser.add_argument("--by-model-and-time", action="store_true")
|
225 |
args = parser.parse_args()
|
226 |
conn = load_results()
|
|
|
227 |
if args.by_model_and_time:
|
228 |
print(accuracy_by_model_and_time(conn))
|
229 |
else:
|
results.duckdb
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:29bc4620908391a0cdfba5740eae731755e2a81afddd9f7941532acb16f3465e
|
3 |
+
size 103034880
|