Spaces:
Running
Running
update
Browse files- app.py +8 -5
- metrics.py +8 -21
- puzzles_cleaned.csv +2 -2
- results.duckdb +2 -2
app.py
CHANGED
@@ -135,13 +135,16 @@ def summary_view():
|
|
135 |
)
|
136 |
|
137 |
|
138 |
-
def
|
139 |
r1_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-r1').to_df()
|
140 |
gemini2_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-gemini2').to_df()
|
141 |
-
|
|
|
142 |
r1_completions["model"] = "R1"
|
143 |
gemini2_completions["model"] = "Gemini2"
|
144 |
-
|
|
|
|
|
145 |
|
146 |
r1_completions["length"] = r1_completions["length"] / 3.2
|
147 |
|
@@ -249,8 +252,8 @@ def create_interface():
|
|
249 |
all_challenges_view()
|
250 |
with gr.TabItem("Accuracy Over Time"):
|
251 |
summary_view()
|
252 |
-
with gr.TabItem("
|
253 |
-
|
254 |
demo.launch()
|
255 |
|
256 |
if __name__ == "__main__":
|
|
|
135 |
)
|
136 |
|
137 |
|
138 |
+
def accuracy_by_completion_length():
|
139 |
r1_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-r1').to_df()
|
140 |
gemini2_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-gemini2').to_df()
|
141 |
+
qwq_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-qwen32b').to_df()
|
142 |
+
sonnetET_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-claude-3-7-sonnet-20250219').to_df()
|
143 |
r1_completions["model"] = "R1"
|
144 |
gemini2_completions["model"] = "Gemini2"
|
145 |
+
qwq_completions["model"] = "QWQ 32B"
|
146 |
+
sonnetET_completions["model"] = "Sonnet 3.7 ET"
|
147 |
+
r1_completions = pd.concat([r1_completions, gemini2_completions, qwq_completions, sonnetET_completions])
|
148 |
|
149 |
r1_completions["length"] = r1_completions["length"] / 3.2
|
150 |
|
|
|
252 |
all_challenges_view()
|
253 |
with gr.TabItem("Accuracy Over Time"):
|
254 |
summary_view()
|
255 |
+
with gr.TabItem("Reasoning Length Analysis"):
|
256 |
+
accuracy_by_completion_length()
|
257 |
demo.launch()
|
258 |
|
259 |
if __name__ == "__main__":
|
metrics.py
CHANGED
@@ -3,8 +3,6 @@ import duckdb
|
|
3 |
import textwrap
|
4 |
from typing import List, Tuple
|
5 |
import argparse
|
6 |
-
import unicodedata
|
7 |
-
|
8 |
import unicodedata
|
9 |
import re
|
10 |
|
@@ -29,6 +27,7 @@ def _parse_answer(text: str) -> List[List[str]]:
|
|
29 |
distinct phrases that may be present in any order. All other characters
|
30 |
are dropped.
|
31 |
"""
|
|
|
32 |
text = normalize_text(text)
|
33 |
alternatives = re.split(r';', text)
|
34 |
result = [ ]
|
@@ -38,14 +37,10 @@ def _parse_answer(text: str) -> List[List[str]]:
|
|
38 |
return result
|
39 |
|
40 |
def _answer_without_thoughts(completion: str) -> str:
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
if len(chunks) <= 1:
|
46 |
-
return ""
|
47 |
-
|
48 |
-
return chunks[-1].strip()
|
49 |
|
50 |
def _check_answer(completion: str, answer: str) -> bool:
|
51 |
"""
|
@@ -53,6 +48,7 @@ def _check_answer(completion: str, answer: str) -> bool:
|
|
53 |
completion. We ignore "thoughts", capitalization, and punctuation.
|
54 |
"""
|
55 |
completion = _answer_without_thoughts(completion).lower()
|
|
|
56 |
completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
|
57 |
completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
|
58 |
completion = normalize_text(completion)
|
@@ -63,7 +59,6 @@ def _check_answer(completion: str, answer: str) -> bool:
|
|
63 |
return True
|
64 |
return False
|
65 |
|
66 |
-
|
67 |
def _clip_text(text: str, width: int) -> str:
|
68 |
return text if len(text) <= width else text[:width] + "..."
|
69 |
|
@@ -217,16 +212,8 @@ def main():
|
|
217 |
parser = argparse.ArgumentParser()
|
218 |
parser.add_argument("--by-model-and-time", action="store_true")
|
219 |
args = parser.parse_args()
|
220 |
-
conn =
|
221 |
-
|
222 |
-
SELECT parent_dir, prompt_id, COUNT(DISTINCT completion) AS completion_count
|
223 |
-
FROM sampled
|
224 |
-
GROUP BY parent_dir, prompt_id
|
225 |
-
HAVING COUNT(DISTINCT completion) == 1;
|
226 |
-
"""
|
227 |
-
wrongones = conn.execute(query).fetchall()
|
228 |
-
assert not wrongones, f"Found {len(wrongones)} prompts with not just one completion"
|
229 |
-
|
230 |
if args.by_model_and_time:
|
231 |
print(accuracy_by_model_and_time(conn))
|
232 |
else:
|
|
|
3 |
import textwrap
|
4 |
from typing import List, Tuple
|
5 |
import argparse
|
|
|
|
|
6 |
import unicodedata
|
7 |
import re
|
8 |
|
|
|
27 |
distinct phrases that may be present in any order. All other characters
|
28 |
are dropped.
|
29 |
"""
|
30 |
+
text = text.lower()
|
31 |
text = normalize_text(text)
|
32 |
alternatives = re.split(r';', text)
|
33 |
result = [ ]
|
|
|
37 |
return result
|
38 |
|
39 |
def _answer_without_thoughts(completion: str) -> str:
|
40 |
+
completion = re.sub(r"(<think>)?[^<]*<\/think>", "", completion).strip()
|
41 |
+
completion = re.sub(r".*</think>", "", completion).strip() #because qwen sometimes misses <think>
|
42 |
+
return completion
|
43 |
+
|
|
|
|
|
|
|
|
|
44 |
|
45 |
def _check_answer(completion: str, answer: str) -> bool:
|
46 |
"""
|
|
|
48 |
completion. We ignore "thoughts", capitalization, and punctuation.
|
49 |
"""
|
50 |
completion = _answer_without_thoughts(completion).lower()
|
51 |
+
completion = completion.replace("**","")
|
52 |
completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
|
53 |
completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
|
54 |
completion = normalize_text(completion)
|
|
|
59 |
return True
|
60 |
return False
|
61 |
|
|
|
62 |
def _clip_text(text: str, width: int) -> str:
|
63 |
return text if len(text) <= width else text[:width] + "..."
|
64 |
|
|
|
212 |
parser = argparse.ArgumentParser()
|
213 |
parser.add_argument("--by-model-and-time", action="store_true")
|
214 |
args = parser.parse_args()
|
215 |
+
conn = load_results()
|
216 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
if args.by_model_and_time:
|
218 |
print(accuracy_by_model_and_time(conn))
|
219 |
else:
|
puzzles_cleaned.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b37392e635231db3aebcff00d322dfac5f0ceec8b922d581e2c7b1bef4075ba
|
3 |
+
size 1123093
|
results.duckdb
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:56425a7e1731d6709f5df26384b8b355f78e9511d31b7d8c9faa46af7370ba7a
|
3 |
+
size 96743424
|