Aryarya committed on
Commit
1526739
·
1 Parent(s): 43abcec

update metrics

Browse files
Files changed (3) hide show
  1. answer_cache/cache.db +0 -3
  2. metrics.py +17 -20
  3. results.duckdb +2 -2
answer_cache/cache.db DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:23e1a6adfdf2de2f7718c1284bf810dc64d191ee415a0975949b01c6954be1b2
3
- size 196784128
 
 
 
 
metrics.py CHANGED
@@ -4,12 +4,6 @@ import textwrap
4
  from typing import List, Tuple
5
  import argparse
6
  import unicodedata
7
- import re
8
-
9
-
10
- import diskcache as dc
11
-
12
- cache = dc.Cache("answer_cache")
13
 
14
  def normalize_text(text: str) -> str:
15
  """Normalize text to remove accents, convert to lowercase, and strip spaces."""
@@ -42,22 +36,25 @@ def _parse_answer(text: str) -> List[List[str]]:
42
  return result
43
 
44
  def _answer_without_thoughts(completion: str) -> str:
45
- completion = re.sub(r"(<think>)?[^<]*<\/think>", "", completion).strip()
46
- completion = re.sub(r".*</think>", "", completion).strip() #because qwen sometimes misses <think>
47
- return completion
48
-
49
-
 
 
50
 
51
  def _check_answer(completion: str, answer: str) -> bool:
52
  """
53
  Check that all the phrases that must appear in the answer appear in the
54
  completion. We ignore "thoughts", capitalization, and punctuation.
55
  """
56
- key = (completion, answer)
57
- if key in cache:
58
- return cache[key]
59
-
60
- completion = _answer_without_thoughts(completion).lower()
 
61
  completion = completion.replace("**","")
62
  completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
63
  completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
@@ -66,9 +63,7 @@ def _check_answer(completion: str, answer: str) -> bool:
66
  for answer_phrases in alternative_answers:
67
  # if all(phrase in completion for phrase in answer_phrases):
68
  if all(re.search(rf'\b{re.escape(phrase)}\b', completion) for phrase in answer_phrases):
69
- cache[key] = True
70
  return True
71
- cache[key] = False
72
  return False
73
 
74
 
@@ -81,7 +76,6 @@ def _wrap_text(text: str, width: int) -> str:
81
  def load_results():
82
  conn = duckdb.connect(":memory:")
83
  conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
84
- # conn.execute("CREATE TABLE challenges as SELECT * FROM 'puzzles_cleaned.csv'")
85
  conn.execute("""
86
  CREATE TABLE challenges AS
87
  SELECT * FROM 'puzzles_cleaned.csv'
@@ -92,6 +86,7 @@ def load_results():
92
  conn.create_function("wrap_text", _wrap_text)
93
  return conn
94
 
 
95
  def load_results_sample_one_only():
96
  conn = duckdb.connect(":memory:")
97
  conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
@@ -118,6 +113,7 @@ def load_results_sample_one_only():
118
  conn.create_function("wrap_text", _wrap_text)
119
  return conn
120
 
 
121
  def r1_accuracy_by_completion_length(conn,model_name):
122
  """
123
  For the responses from the completions-r1 model:
@@ -219,6 +215,8 @@ def accuracy_by_model(conn):
219
  ROUND(correct / total, 2) AS accuracy
220
  FROM
221
  AnswerCheck
 
 
222
  """)
223
 
224
  def main():
@@ -226,7 +224,6 @@ def main():
226
  parser.add_argument("--by-model-and-time", action="store_true")
227
  args = parser.parse_args()
228
  conn = load_results()
229
-
230
  if args.by_model_and_time:
231
  print(accuracy_by_model_and_time(conn))
232
  else:
 
4
  from typing import List, Tuple
5
  import argparse
6
  import unicodedata
 
 
 
 
 
 
7
 
8
  def normalize_text(text: str) -> str:
9
  """Normalize text to remove accents, convert to lowercase, and strip spaces."""
 
36
  return result
37
 
38
  def _answer_without_thoughts(completion: str) -> str:
39
+ end_think_index = completion.find("</think>")
40
+ if end_think_index == -1:
41
+ if "<think>" in completion:
42
+ return ""
43
+ return completion
44
+ else:
45
+ return completion[end_think_index + len("</think>"):]
46
 
47
  def _check_answer(completion: str, answer: str) -> bool:
48
  """
49
  Check that all the phrases that must appear in the answer appear in the
50
  completion. We ignore "thoughts", capitalization, and punctuation.
51
  """
52
+ completion = _answer_without_thoughts(completion)
53
+
54
+ if len(completion) > 3.14 * 5_300:
55
+ return False
56
+
57
+ completion = completion.lower()
58
  completion = completion.replace("**","")
59
  completion = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
60
  completion = re.sub(r'\s+', ' ', completion) # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
 
63
  for answer_phrases in alternative_answers:
64
  # if all(phrase in completion for phrase in answer_phrases):
65
  if all(re.search(rf'\b{re.escape(phrase)}\b', completion) for phrase in answer_phrases):
 
66
  return True
 
67
  return False
68
 
69
 
 
76
  def load_results():
77
  conn = duckdb.connect(":memory:")
78
  conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
 
79
  conn.execute("""
80
  CREATE TABLE challenges AS
81
  SELECT * FROM 'puzzles_cleaned.csv'
 
86
  conn.create_function("wrap_text", _wrap_text)
87
  return conn
88
 
89
+
90
  def load_results_sample_one_only():
91
  conn = duckdb.connect(":memory:")
92
  conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
 
113
  conn.create_function("wrap_text", _wrap_text)
114
  return conn
115
 
116
+
117
  def r1_accuracy_by_completion_length(conn,model_name):
118
  """
119
  For the responses from the completions-r1 model:
 
215
  ROUND(correct / total, 2) AS accuracy
216
  FROM
217
  AnswerCheck
218
+ ORDER BY
219
+ model
220
  """)
221
 
222
  def main():
 
224
  parser.add_argument("--by-model-and-time", action="store_true")
225
  args = parser.parse_args()
226
  conn = load_results()
 
227
  if args.by_model_and_time:
228
  print(accuracy_by_model_and_time(conn))
229
  else:
results.duckdb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56425a7e1731d6709f5df26384b8b355f78e9511d31b7d8c9faa46af7370ba7a
3
- size 96743424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29bc4620908391a0cdfba5740eae731755e2a81afddd9f7941532acb16f3465e
3
+ size 103034880