Spaces:

nuprl
/

reasoning-weekly

Running

App Files Files

Aryarya committed on Mar 20

Commit

c073751

1 Parent(s): 099c250

update

Browse files

Files changed (4) hide show

app.py +8 -5
metrics.py +8 -21
puzzles_cleaned.csv +2 -2
results.duckdb +2 -2

app.py CHANGED Viewed

@@ -135,13 +135,16 @@ def summary_view():
     )
-def r1_accuracy_by_completion_length():
     r1_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-r1').to_df()
     gemini2_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-gemini2').to_df()
     r1_completions["model"] = "R1"
     gemini2_completions["model"] = "Gemini2"
-    r1_completions = pd.concat([r1_completions, gemini2_completions])
     r1_completions["length"] = r1_completions["length"] / 3.2
@@ -249,8 +252,8 @@ def create_interface():
                 all_challenges_view()
             with gr.TabItem("Accuracy Over Time"):
                 summary_view()
-            with gr.TabItem("DeepSeek R1 Analysis"):
-                r1_accuracy_by_completion_length()
     demo.launch()
 if __name__ == "__main__":

     )
+def accuracy_by_completion_length():
     r1_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-r1').to_df()
     gemini2_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-gemini2').to_df()
+    qwq_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-qwen32b').to_df()
+    sonnetET_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-claude-3-7-sonnet-20250219').to_df()
     r1_completions["model"] = "R1"
     gemini2_completions["model"] = "Gemini2"
+    qwq_completions["model"] = "QWQ 32B"
+    sonnetET_completions["model"] = "Sonnet 3.7 ET"
+    r1_completions = pd.concat([r1_completions, gemini2_completions, qwq_completions, sonnetET_completions])
     r1_completions["length"] = r1_completions["length"] / 3.2
                 all_challenges_view()
             with gr.TabItem("Accuracy Over Time"):
                 summary_view()
+            with gr.TabItem("Reasoning Length Analysis"):
+                accuracy_by_completion_length()
     demo.launch()
 if __name__ == "__main__":

metrics.py CHANGED Viewed

@@ -3,8 +3,6 @@ import duckdb
 import textwrap
 from typing import List, Tuple
 import argparse
-import unicodedata
 import unicodedata
 import re
@@ -29,6 +27,7 @@ def _parse_answer(text: str) -> List[List[str]]:
     distinct phrases that may be present in any order. All other characters
     are dropped.
     """
     text = normalize_text(text)
     alternatives = re.split(r';', text)
     result = [ ]
@@ -38,14 +37,10 @@ def _parse_answer(text: str) -> List[List[str]]:
     return result
 def _answer_without_thoughts(completion: str) -> str:
-    if "<think>" not in completion[:200]:
-        return completion
-    chunks = completion.split("</think>")
-    if len(chunks) <= 1:
-        return ""
-    return chunks[-1].strip()
 def _check_answer(completion: str, answer: str) -> bool:
     """
@@ -53,6 +48,7 @@ def _check_answer(completion: str, answer: str) -> bool:
     completion. We ignore "thoughts", capitalization, and punctuation.
     """
     completion = _answer_without_thoughts(completion).lower()
     completion  = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
     completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
     completion = normalize_text(completion)
@@ -63,7 +59,6 @@ def _check_answer(completion: str, answer: str) -> bool:
             return True
     return False
 def _clip_text(text: str, width: int) -> str:
     return text if len(text) <= width else text[:width] + "..."
@@ -217,16 +212,8 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--by-model-and-time", action="store_true")
     args = parser.parse_args()
-    conn = load_results_sample_one_only()
-    query = """
-    SELECT parent_dir, prompt_id, COUNT(DISTINCT completion) AS completion_count
-    FROM sampled
-    GROUP BY parent_dir, prompt_id
-    HAVING COUNT(DISTINCT completion) == 1;
-    """
-    wrongones = conn.execute(query).fetchall()
-    assert not wrongones, f"Found {len(wrongones)} prompts with not just one completion"
     if args.by_model_and_time:
         print(accuracy_by_model_and_time(conn))
     else:

 import textwrap
 from typing import List, Tuple
 import argparse
 import unicodedata
 import re
     distinct phrases that may be present in any order. All other characters
     are dropped.
     """
+    text = text.lower()
     text = normalize_text(text)
     alternatives = re.split(r';', text)
     result = [ ]
     return result
 def _answer_without_thoughts(completion: str) -> str:
+    completion = re.sub(r"(<think>)?[^<]*<\/think>", "", completion).strip()
+    completion = re.sub(r".*</think>", "", completion).strip() #because qwen sometimes misses <think>
+    return completion
 def _check_answer(completion: str, answer: str) -> bool:
     """
     completion. We ignore "thoughts", capitalization, and punctuation.
     """
     completion = _answer_without_thoughts(completion).lower()
+    completion = completion.replace("**","")
     completion  = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
     completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
     completion = normalize_text(completion)
             return True
     return False
 def _clip_text(text: str, width: int) -> str:
     return text if len(text) <= width else text[:width] + "..."
     parser = argparse.ArgumentParser()
     parser.add_argument("--by-model-and-time", action="store_true")
     args = parser.parse_args()
+    conn = load_results()
     if args.by_model_and_time:
         print(accuracy_by_model_and_time(conn))
     else:

puzzles_cleaned.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dd7800331f13023c80208d235108234f2bff94ca42e7e6f7d1909f03c4d3f75d
-size 1120687

 version https://git-lfs.github.com/spec/v1
+oid sha256:5b37392e635231db3aebcff00d322dfac5f0ceec8b922d581e2c7b1bef4075ba
+size 1123093

results.duckdb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:569fd5c2d797ed702e5d2551bc6a9d4d54df78481ee016bc9ee07767361525b8
-size 137637888

 version https://git-lfs.github.com/spec/v1
+oid sha256:56425a7e1731d6709f5df26384b8b355f78e9511d31b7d8c9faa46af7370ba7a
+size 96743424