Aryarya committed on
Commit
c073751
·
1 Parent(s): 099c250
Files changed (4) hide show
  1. app.py +8 -5
  2. metrics.py +8 -21
  3. puzzles_cleaned.csv +2 -2
  4. results.duckdb +2 -2
app.py CHANGED
@@ -135,13 +135,16 @@ def summary_view():
135
  )
136
 
137
 
138
- def r1_accuracy_by_completion_length():
139
  r1_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-r1').to_df()
140
  gemini2_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-gemini2').to_df()
141
-
 
142
  r1_completions["model"] = "R1"
143
  gemini2_completions["model"] = "Gemini2"
144
- r1_completions = pd.concat([r1_completions, gemini2_completions])
 
 
145
 
146
  r1_completions["length"] = r1_completions["length"] / 3.2
147
 
@@ -249,8 +252,8 @@ def create_interface():
249
  all_challenges_view()
250
  with gr.TabItem("Accuracy Over Time"):
251
  summary_view()
252
- with gr.TabItem("DeepSeek R1 Analysis"):
253
- r1_accuracy_by_completion_length()
254
  demo.launch()
255
 
256
  if __name__ == "__main__":
 
135
  )
136
 
137
 
138
+ def accuracy_by_completion_length():
139
  r1_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-r1').to_df()
140
  gemini2_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-gemini2').to_df()
141
+ qwq_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-qwen32b').to_df()
142
+ sonnetET_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-claude-3-7-sonnet-20250219').to_df()
143
  r1_completions["model"] = "R1"
144
  gemini2_completions["model"] = "Gemini2"
145
+ qwq_completions["model"] = "QWQ 32B"
146
+ sonnetET_completions["model"] = "Sonnet 3.7 ET"
147
+ r1_completions = pd.concat([r1_completions, gemini2_completions, qwq_completions, sonnetET_completions])
148
 
149
  r1_completions["length"] = r1_completions["length"] / 3.2
150
 
 
252
  all_challenges_view()
253
  with gr.TabItem("Accuracy Over Time"):
254
  summary_view()
255
+ with gr.TabItem("Reasoning Length Analysis"):
256
+ accuracy_by_completion_length()
257
  demo.launch()
258
 
259
  if __name__ == "__main__":
metrics.py CHANGED
@@ -3,8 +3,6 @@ import duckdb
3
  import textwrap
4
  from typing import List, Tuple
5
  import argparse
6
- import unicodedata
7
-
8
  import unicodedata
9
  import re
10
 
@@ -29,6 +27,7 @@ def _parse_answer(text: str) -> List[List[str]]:
29
  distinct phrases that may be present in any order. All other characters
30
  are dropped.
31
  """
 
32
  text = normalize_text(text)
33
  alternatives = re.split(r';', text)
34
  result = [ ]
@@ -38,14 +37,10 @@ def _parse_answer(text: str) -> List[List[str]]:
38
  return result
39
 
40
  def _answer_without_thoughts(completion: str) -> str:
41
- if "<think>" not in completion[:200]:
42
- return completion
43
-
44
- chunks = completion.split("</think>")
45
- if len(chunks) <= 1:
46
- return ""
47
-
48
- return chunks[-1].strip()
49
 
50
  def _check_answer(completion: str, answer: str) -> bool:
51
  """
@@ -53,6 +48,7 @@ def _check_answer(completion: str, answer: str) -> bool:
53
  completion. We ignore "thoughts", capitalization, and punctuation.
54
  """
55
  completion = _answer_without_thoughts(completion).lower()
 
56
  completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
57
  completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
58
  completion = normalize_text(completion)
@@ -63,7 +59,6 @@ def _check_answer(completion: str, answer: str) -> bool:
63
  return True
64
  return False
65
 
66
-
67
  def _clip_text(text: str, width: int) -> str:
68
  return text if len(text) <= width else text[:width] + "..."
69
 
@@ -217,16 +212,8 @@ def main():
217
  parser = argparse.ArgumentParser()
218
  parser.add_argument("--by-model-and-time", action="store_true")
219
  args = parser.parse_args()
220
- conn = load_results_sample_one_only()
221
- query = """
222
- SELECT parent_dir, prompt_id, COUNT(DISTINCT completion) AS completion_count
223
- FROM sampled
224
- GROUP BY parent_dir, prompt_id
225
- HAVING COUNT(DISTINCT completion) == 1;
226
- """
227
- wrongones = conn.execute(query).fetchall()
228
- assert not wrongones, f"Found {len(wrongones)} prompts with not just one completion"
229
-
230
  if args.by_model_and_time:
231
  print(accuracy_by_model_and_time(conn))
232
  else:
 
3
  import textwrap
4
  from typing import List, Tuple
5
  import argparse
 
 
6
  import unicodedata
7
  import re
8
 
 
27
  distinct phrases that may be present in any order. All other characters
28
  are dropped.
29
  """
30
+ text = text.lower()
31
  text = normalize_text(text)
32
  alternatives = re.split(r';', text)
33
  result = [ ]
 
37
  return result
38
 
39
  def _answer_without_thoughts(completion: str) -> str:
40
+ completion = re.sub(r"(<think>)?[^<]*<\/think>", "", completion).strip()
41
+ completion = re.sub(r".*</think>", "", completion).strip() #because qwen sometimes misses <think>
42
+ return completion
43
+
 
 
 
 
44
 
45
  def _check_answer(completion: str, answer: str) -> bool:
46
  """
 
48
  completion. We ignore "thoughts", capitalization, and punctuation.
49
  """
50
  completion = _answer_without_thoughts(completion).lower()
51
+ completion = completion.replace("**","")
52
  completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
53
  completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
54
  completion = normalize_text(completion)
 
59
  return True
60
  return False
61
 
 
62
  def _clip_text(text: str, width: int) -> str:
63
  return text if len(text) <= width else text[:width] + "..."
64
 
 
212
  parser = argparse.ArgumentParser()
213
  parser.add_argument("--by-model-and-time", action="store_true")
214
  args = parser.parse_args()
215
+ conn = load_results()
216
+
 
 
 
 
 
 
 
 
217
  if args.by_model_and_time:
218
  print(accuracy_by_model_and_time(conn))
219
  else:
puzzles_cleaned.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd7800331f13023c80208d235108234f2bff94ca42e7e6f7d1909f03c4d3f75d
3
- size 1120687
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b37392e635231db3aebcff00d322dfac5f0ceec8b922d581e2c7b1bef4075ba
3
+ size 1123093
results.duckdb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:569fd5c2d797ed702e5d2551bc6a9d4d54df78481ee016bc9ee07767361525b8
3
- size 137637888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56425a7e1731d6709f5df26384b8b355f78e9511d31b7d8c9faa46af7370ba7a
3
+ size 96743424