Aryarya committed on
Commit 479b4ac · 1 Parent(s): 0136438

update sampled

Files changed (2):
  1. app.py +5 -5
  2. metrics.py +84 -4
app.py CHANGED
@@ -21,13 +21,13 @@ Note that not every model has a response for every puzzle.
 import gradio as gr
 import pandas as pd
 import numpy as np
-from metrics import load_results, accuracy_by_model_and_time
+from metrics import load_results_sample_one_only, accuracy_by_model_and_time
 import metrics
 from pathlib import Path
 
 def get_model_response(prompt_id, model_name):
     query = f"""
-    SELECT completion FROM results.completions
+    SELECT completion FROM sampled
     WHERE prompt_id = {prompt_id} AND parent_dir = '{model_name}'
     """
     response = conn.sql(query).fetchone()
@@ -56,10 +56,10 @@ def display_model_response(puzzle_id, model_name, show_thoughts):
     return response.strip()
 
 
-conn = load_results()
+conn = load_results_sample_one_only()
 
 # Get all unique model names
-model_names = [item[0] for item in conn.sql("SELECT DISTINCT parent_dir FROM results.completions").fetchall()]
+model_names = [item[0] for item in conn.sql("SELECT DISTINCT parent_dir FROM sampled").fetchall()]
 model_names.sort()
 # Just for display.
 cleaned_model_names = [name.replace("completions-", "") for name in model_names]
@@ -84,7 +84,7 @@ def build_table():
     query += """
         clip_text(c.challenge, 40) as challenge_clipped,
     FROM challenges c
-    LEFT JOIN results.completions r
+    LEFT JOIN sampled r
         ON c.ID = r.prompt_id
     GROUP BY c.ID, c.challenge, c.answer
     """
metrics.py CHANGED
@@ -3,6 +3,16 @@ import duckdb
 import textwrap
 from typing import List, Tuple
 import argparse
+import unicodedata
+
+
+def normalize_text(text: str) -> str:
+    """Normalize text to remove accents, convert to lowercase, and strip spaces."""
+    text = unicodedata.normalize("NFKD", text)  # decompose accented letters (e.g., é → e + combining accent)
+    text = "".join([c for c in text if not unicodedata.combining(c)])  # drop the combining marks (diacritics)
+    text = text.lower().strip()  # lowercase and trim surrounding whitespace
+    return text
+
 
 def _parse_answer(text: str) -> List[List[str]]:
     """
@@ -17,7 +27,7 @@ def _parse_answer(text: str) -> List[List[str]]:
     distinct phrases that may be present in any order. All other characters
     are dropped.
     """
-    text = text.lower()
+    text = normalize_text(text)
     alternatives = re.split(r';', text)
     result = [ ]
     for alternative in alternatives:
@@ -43,6 +53,7 @@ def _check_answer(completion: str, answer: str) -> bool:
     completion = _answer_without_thoughts(completion).lower()
     completion = re.sub(r'[^\w\s]', ' ', completion)  # replace punctuation with spaces, aligning with _parse_answer's ' '.join
     completion = re.sub(r'\s+', ' ', completion)  # normalize consecutive (Unicode) spaces to finish aligning with _parse_answer
+    completion = normalize_text(completion)
     alternative_answers = _parse_answer(answer)
     for answer_phrases in alternative_answers:
         # if all(phrase in completion for phrase in answer_phrases):
@@ -71,6 +82,34 @@ def load_results():
     conn.create_function("wrap_text", _wrap_text)
     return conn
 
+def load_results_sample_one_only():
+    conn = duckdb.connect(":memory:")
+    conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
+
+    # Keep exactly one completion per (parent_dir, prompt) pair.
+    query = """
+    CREATE TABLE sampled AS
+    WITH numbered AS (
+        SELECT *,
+            ROW_NUMBER() OVER (PARTITION BY parent_dir, prompt ORDER BY prompt_id) AS rn
+        FROM results.completions
+    )
+    SELECT prompt_id, parent_dir, prompt, completion
+    FROM numbered
+    WHERE rn = 1;
+    """
+    conn.execute(query)
+
+    conn.execute("""
+        CREATE TABLE challenges AS
+        SELECT * FROM 'puzzles_cleaned.csv'
+        WHERE Warnings IS NULL OR Warnings NOT LIKE '%(E)%'
+    """)
+    conn.create_function("check_answer", _check_answer)
+    conn.create_function("clip_text", _clip_text)
+    conn.create_function("wrap_text", _wrap_text)
+    return conn
+
 def r1_accuracy_by_completion_length(conn,model_name):
     """
     For the responses from the completions-r1 model:
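
The normalization added above makes answer matching accent- and case-insensitive on both the gold-answer and completion sides. A quick sketch of its behavior (the example strings are hypothetical):

# Sketch: effect of normalize_text (example strings are made up)
from metrics import normalize_text

print(normalize_text("  Éléphant Café "))  # -> "elephant cafe"
# NFKD splits é into e plus a combining accent; the combining marks are then
# dropped, so accented gold answers match unaccented completions.
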
 
@@ -154,8 +193,8 @@ def accuracy_by_model(conn):
     WITH AnswerCheck AS (
         SELECT
             results.parent_dir AS model,
-            COUNT(*) AS total,
-            SUM(CAST(check_answer(results.completion, challenges.answer) AS INTEGER)) AS correct
+            SUM(results.count) AS total,
+            SUM(results.count * CAST(check_answer(results.completion, challenges.answer) AS INTEGER)) AS correct
     FROM
         results.completions results
     JOIN
@@ -174,11 +213,52 @@
         AnswerCheck
     """)
 
+def accuracy_by_model_only_one(conn):
+    query = """
+    WITH FirstResponses AS (
+        SELECT
+            parent_dir AS model,
+            prompt_id,
+            completion,
+            count,
+            ROW_NUMBER() OVER (PARTITION BY parent_dir, prompt_id) AS rn
+        FROM results.completions
+        WHERE parent_dir = 'completions-r1_cursor_hosted' -- only consider the r1_cursor_hosted completions
+    ),
+    AnswerCheck AS (
+        SELECT
+            fr.model,
+            SUM(fr.count) AS total,
+            SUM(fr.count * CAST(check_answer(fr.completion, c.answer) AS INTEGER)) AS correct
+        FROM FirstResponses fr
+        JOIN challenges c ON fr.prompt_id = c.ID
+        WHERE fr.rn = 1 -- keep only the first response per model per prompt
+        GROUP BY fr.model
+    )
+    SELECT
+        model,
+        total,
+        correct,
+        ROUND(correct / total, 2) AS accuracy
+    FROM AnswerCheck;
+    """
+    return conn.sql(query)
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--by-model-and-time", action="store_true")
     args = parser.parse_args()
-    conn = load_results()
+    conn = load_results_sample_one_only()
+    query = """
+    SELECT parent_dir, prompt_id, COUNT(*) AS completion_count
+    FROM sampled
+    GROUP BY parent_dir, prompt_id
+    HAVING COUNT(*) > 1;
+    """
+    wrongones = conn.execute(query).fetchall()
+    assert not wrongones, f"Found {len(wrongones)} (model, prompt) pairs with more than one completion"
+
     if args.by_model_and_time:
         print(accuracy_by_model_and_time(conn))
     else:
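
Taken together, a sketch of how the new pieces compose (note that accuracy_by_model_only_one is hard-coded to the completions-r1_cursor_hosted results by the WHERE clause above):

# Sketch: wiring the new loader and the single-response accuracy query together
from metrics import load_results_sample_one_only, accuracy_by_model_only_one

conn = load_results_sample_one_only()    # attaches results.duckdb, builds sampled + challenges
print(accuracy_by_model_only_one(conn))  # accuracy over one response per prompt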