update metrics
- answer_cache/cache.db  +0 -3
- metrics.py  +17 -20
- results.duckdb  +2 -2
answer_cache/cache.db  DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:23e1a6adfdf2de2f7718c1284bf810dc64d191ee415a0975949b01c6954be1b2
-size 196784128
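The deleted cache.db is the on-disk store behind the diskcache answer cache that this commit also removes from metrics.py (see the dropped import diskcache as dc, cache = dc.Cache("answer_cache"), and cache[key] = ... lines in the diff below). For context, here is a minimal sketch of that memoization pattern; the key construction and the stand-in check are assumptions, since the old code is not fully rendered on the page:

import diskcache as dc

cache = dc.Cache("answer_cache")  # persists entries in answer_cache/cache.db

def check_answer_cached(completion: str, answer: str) -> bool:
    key = (completion, answer)  # hypothetical key; the real one is not shown in the diff
    if key in cache:            # hit: reuse the stored True/False verdict
        return cache[key]
    result = answer.lower() in completion.lower()  # stand-in for the real phrase check
    cache[key] = result         # store the verdict for next time
    return result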
metrics.py  CHANGED

@@ -4,12 +4,6 @@ import textwrap
 from typing import List, Tuple
 import argparse
 import unicodedata
-import re
-
-
-import diskcache as dc
-
-cache = dc.Cache("answer_cache")

 def normalize_text(text: str) -> str:
     """Normalize text to remove accents, convert to lowercase, and strip spaces."""

@@ -42,22 +36,25 @@ def _parse_answer(text: str) -> List[List[str]]:
     return result

 def _answer_without_thoughts(completion: str) -> str:
-    [5 lines removed: content not rendered on the commit page]
+    end_think_index = completion.find("</think>")
+    if end_think_index == -1:
+        if "<think>" in completion:
+            return ""
+        return completion
+    else:
+        return completion[end_think_index + len("</think>"):]

 def _check_answer(completion: str, answer: str) -> bool:
     """
     Check that all the phrases that must appear in the answer appear in the
     completion. We ignore "thoughts", capitalization, and punctuation.
     """
-    [5 lines removed: content not rendered on the commit page]
+    completion = _answer_without_thoughts(completion)
+
+    if len(completion) > 3.14 * 5_300:
+        return False
+
+    completion = completion.lower()
     completion = completion.replace("**","")
     completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
     completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer

@@ -66,9 +63,7 @@ def _check_answer(completion: str, answer: str) -> bool:
     for answer_phrases in alternative_answers:
         # if all(phrase in completion for phrase in answer_phrases):
         if all(re.search(rf'\b{re.escape(phrase)}\b', completion) for phrase in answer_phrases):
-            cache[key] = True
             return True
-    cache[key] = False
     return False


@@ -81,7 +76,6 @@ def _wrap_text(text: str, width: int) -> str:
 def load_results():
     conn = duckdb.connect(":memory:")
     conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
-    # conn.execute("CREATE TABLE challenges as SELECT * FROM 'puzzles_cleaned.csv'")
     conn.execute("""
         CREATE TABLE challenges AS
         SELECT * FROM 'puzzles_cleaned.csv'

@@ -92,6 +86,7 @@ def load_results():
     conn.create_function("wrap_text", _wrap_text)
     return conn

+
 def load_results_sample_one_only():
     conn = duckdb.connect(":memory:")
     conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")

@@ -118,6 +113,7 @@ def load_results_sample_one_only():
     conn.create_function("wrap_text", _wrap_text)
     return conn

+
 def r1_accuracy_by_completion_length(conn,model_name):
     """
     For the responses from the completions-r1 model:

@@ -219,6 +215,8 @@ def accuracy_by_model(conn):
             ROUND(correct / total, 2) AS accuracy
         FROM
             AnswerCheck
+        ORDER BY
+            model
         """)

 def main():

@@ -226,7 +224,6 @@ def main():
     parser.add_argument("--by-model-and-time", action="store_true")
     args = parser.parse_args()
     conn = load_results()
-
     if args.by_model_and_time:
         print(accuracy_by_model_and_time(conn))
     else:
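To make the new behaviour concrete, here is a small, illustrative usage sketch of the rewritten _answer_without_thoughts; the example strings are invented, and the commented outputs follow directly from the function body shown above. Note also that _check_answer now short-circuits to False when the de-thought completion is longer than 3.14 * 5_300 (about 16,600) characters.

# Example strings are made up; expected outputs follow the new function above.
print(repr(_answer_without_thoughts("<think>some reasoning</think> The answer is Paris.")))
# ' The answer is Paris.'   (everything after the closing </think> tag)
print(repr(_answer_without_thoughts("The answer is Paris.")))
# 'The answer is Paris.'    (no think tags: returned unchanged)
print(repr(_answer_without_thoughts("<think>reasoning that never closes")))
# ''                        (opened but unclosed <think> yields an empty answer)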
results.duckdb  CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:29bc4620908391a0cdfba5740eae731755e2a81afddd9f7941532acb16f3465e
+size 103034880