update metrics
- answer_cache/cache.db  +0 -3
- metrics.py  +17 -20
- results.duckdb  +2 -2
answer_cache/cache.db  DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:23e1a6adfdf2de2f7718c1284bf810dc64d191ee415a0975949b01c6954be1b2
-size 196784128
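The deleted cache.db is the on-disk store behind the diskcache answer cache that this commit also removes from metrics.py (see the dropped import diskcache as dc, cache = dc.Cache("answer_cache"), and cache[key] = ... lines in the diff below). For context, here is a minimal sketch of that memoization pattern; the key construction and the stand-in check are assumptions, since the old code is not fully rendered on the page:

import diskcache as dc

cache = dc.Cache("answer_cache")  # persists entries in answer_cache/cache.db

def check_answer_cached(completion: str, answer: str) -> bool:
    key = (completion, answer)  # hypothetical key; the real one is not shown in the diff
    if key in cache:            # hit: reuse the stored True/False verdict
        return cache[key]
    result = answer.lower() in completion.lower()  # stand-in for the real phrase check
    cache[key] = result         # store the verdict for next time
    return result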
metrics.py  CHANGED

@@ -4,12 +4,6 @@ import textwrap
 from typing import List, Tuple
 import argparse
 import unicodedata
-import re
-
-
-import diskcache as dc
-
-cache = dc.Cache("answer_cache")

 def normalize_text(text: str) -> str:
     """Normalize text to remove accents, convert to lowercase, and strip spaces."""

@@ -42,22 +36,25 @@ def _parse_answer(text: str) -> List[List[str]]:
     return result

 def _answer_without_thoughts(completion: str) -> str:
-    [5 lines removed: content not rendered on the commit page]
+    end_think_index = completion.find("</think>")
+    if end_think_index == -1:
+        if "<think>" in completion:
+            return ""
+        return completion
+    else:
+        return completion[end_think_index + len("</think>"):]

 def _check_answer(completion: str, answer: str) -> bool:
     """
     Check that all the phrases that must appear in the answer appear in the
     completion. We ignore "thoughts", capitalization, and punctuation.
     """
-    [5 lines removed: content not rendered on the commit page]
+    completion = _answer_without_thoughts(completion)
+
+    if len(completion) > 3.14 * 5_300:
+        return False
+
+    completion = completion.lower()
     completion = completion.replace("**","")
     completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
     completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer

@@ -66,9 +63,7 @@ def _check_answer(completion: str, answer: str) -> bool:
     for answer_phrases in alternative_answers:
         # if all(phrase in completion for phrase in answer_phrases):
         if all(re.search(rf'\b{re.escape(phrase)}\b', completion) for phrase in answer_phrases):
-            cache[key] = True
             return True
-    cache[key] = False
     return False


@@ -81,7 +76,6 @@ def _wrap_text(text: str, width: int) -> str:
 def load_results():
     conn = duckdb.connect(":memory:")
     conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
-    # conn.execute("CREATE TABLE challenges as SELECT * FROM 'puzzles_cleaned.csv'")
     conn.execute("""
         CREATE TABLE challenges AS
         SELECT * FROM 'puzzles_cleaned.csv'

@@ -92,6 +86,7 @@ def load_results():
     conn.create_function("wrap_text", _wrap_text)
     return conn

+
 def load_results_sample_one_only():
     conn = duckdb.connect(":memory:")
     conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")

@@ -118,6 +113,7 @@ def load_results_sample_one_only():
     conn.create_function("wrap_text", _wrap_text)
     return conn

+
 def r1_accuracy_by_completion_length(conn,model_name):
     """
     For the responses from the completions-r1 model:

@@ -219,6 +215,8 @@ def accuracy_by_model(conn):
             ROUND(correct / total, 2) AS accuracy
         FROM
             AnswerCheck
+        ORDER BY
+            model
         """)

 def main():

@@ -226,7 +224,6 @@ def main():
     parser.add_argument("--by-model-and-time", action="store_true")
     args = parser.parse_args()
     conn = load_results()
-
     if args.by_model_and_time:
         print(accuracy_by_model_and_time(conn))
     else:
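To make the new behaviour concrete, here is a small, illustrative usage sketch of the rewritten _answer_without_thoughts; the example strings are invented, and the commented outputs follow directly from the function body shown above. Note also that _check_answer now short-circuits to False when the de-thought completion is longer than 3.14 * 5_300 (about 16,600) characters.

# Example strings are made up; expected outputs follow the new function above.
print(repr(_answer_without_thoughts("<think>some reasoning</think> The answer is Paris.")))
# ' The answer is Paris.'   (everything after the closing </think> tag)
print(repr(_answer_without_thoughts("The answer is Paris.")))
# 'The answer is Paris.'    (no think tags: returned unchanged)
print(repr(_answer_without_thoughts("<think>reasoning that never closes")))
# ''                        (opened but unclosed <think> yields an empty answer)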
results.duckdb  CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:29bc4620908391a0cdfba5740eae731755e2a81afddd9f7941532acb16f3465e
+size 103034880