martinigoyanes committed on
Commit
f0189a6
·
1 Parent(s): 42f179a

fix: evaluation and scorer test

Browse files
dabstep_benchmark/evaluation/scorer.py CHANGED
@@ -4,8 +4,25 @@ import math
4
  from difflib import SequenceMatcher
5
 
6
  def is_numeric_with_commas(value: str) -> bool:
7
- # Check if the string is a number with comma separators
8
- return bool(re.match(r'^\$?(\d{1,3}(,\d{3})*(\.\d+)?|\.\d+)$', value.strip()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def question_scorer(input1: str, input2: str) -> bool:
11
  # Remove leading/trailing whitespace and convert to lowercase
@@ -67,7 +84,7 @@ def compare_numeric(num1: float, num2: float) -> bool:
67
 
68
  # For percentages and small numbers, use a more lenient comparison
69
  if num1 < 1 and num2 < 1:
70
- return math.isclose(num1, num2, rel_tol=1e-2, abs_tol=1e-4)
71
 
72
  # For larger numbers, use the original comparison method
73
  dec_places1 = len(str(num1).split('.')[-1]) if '.' in str(num1) else 0
@@ -79,7 +96,7 @@ def compare_numeric(num1: float, num2: float) -> bool:
79
  if rounded1 == rounded2:
80
  return True
81
 
82
- return math.isclose(num1, num2, rel_tol=1e-2, abs_tol=1e-2)
83
 
84
  def compare_strings(str1: str, str2: str) -> bool:
85
  # Remove all whitespace and punctuation
 
4
  from difflib import SequenceMatcher
5
 
6
def is_numeric_with_commas(value: str) -> bool:
    """
    Report whether *value* looks like a comma-grouped or decimal number.

    Accepted forms (an optional leading "$" is allowed in both):
      * comma thousands-grouped numbers with at least one comma and an
        optional dot-decimal tail, e.g. "1,000" or "12,345.67"
      * plain decimals (no separators) using either "." or "," as the
        decimal mark, e.g. "0.99" or "0,99"

    Bare integers with no comma (e.g. "64") return False.
    Leading/trailing whitespace is ignored.
    """
    candidate = value.strip()
    # Two alternatives: comma-grouped (>=1 comma group) or plain decimal.
    grouped = r'\d{1,3}(?:,\d{3})+(?:\.\d+)?'   # e.g. "12,345.67"
    decimal = r'\d+[.,]\d+'                     # e.g. "0.99" / "0,99"
    return re.fullmatch(rf'\$?(?:{grouped}|{decimal})', candidate) is not None
26
 
27
  def question_scorer(input1: str, input2: str) -> bool:
28
  # Remove leading/trailing whitespace and convert to lowercase
 
84
 
85
  # For percentages and small numbers, use a more lenient comparison
86
  if num1 < 1 and num2 < 1:
87
+ return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
88
 
89
  # For larger numbers, use the original comparison method
90
  dec_places1 = len(str(num1).split('.')[-1]) if '.' in str(num1) else 0
 
96
  if rounded1 == rounded2:
97
  return True
98
 
99
+ return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
100
 
101
  def compare_strings(str1: str, str2: str) -> bool:
102
  # Remove all whitespace and punctuation
dabstep_benchmark/leaderboard.py CHANGED
@@ -311,7 +311,11 @@ def generate_leaderboard_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
311
  leaderboard_df["Agent"] = leaderboard_df["Agent"].apply(lambda x: f"**{x}**")
312
 
313
  # sort-by best score
314
- leaderboard_df.sort_values(by="Hard Level Accuracy (%)", ascending=False, inplace=True)
 
 
 
 
315
 
316
  validated_lb = leaderboard_df[leaderboard_df["validated"] == True].drop(columns=["validated"])
317
  unvalidated_lb = leaderboard_df[leaderboard_df["validated"] == False].drop(columns=["validated"])
 
311
  leaderboard_df["Agent"] = leaderboard_df["Agent"].apply(lambda x: f"**{x}**")
312
 
313
  # sort-by best score
314
+ leaderboard_df.sort_values(
315
+ by=["Hard Level Accuracy (%)", "Easy Level Accuracy (%)"],
316
+ ascending=[False, False],
317
+ inplace=True
318
+ )
319
 
320
  validated_lb = leaderboard_df[leaderboard_df["validated"] == True].drop(columns=["validated"])
321
  unvalidated_lb = leaderboard_df[leaderboard_df["validated"] == False].drop(columns=["validated"])
dabstep_benchmark/tests/test_scorer.py CHANGED
@@ -51,12 +51,13 @@ def test_list_match(input1, input2, expected):
51
  @pytest.mark.parametrize("input1, input2, expected", [
52
  ("42, hello", "42, hello", True),
53
  ("42, world", "42, hello", False),
 
54
  ])
55
  def test_mixed_list_match(input1, input2, expected):
56
  assert question_scorer(input1, input2) == expected
57
 
58
  @pytest.mark.parametrize("input1, input2, expected", [
59
- ("3.14", "3.1483", True),
60
  ("3.14", "3.20", False),
61
  ("1", "1.0", True),
62
  ("1.0", "1", True),
@@ -66,7 +67,9 @@ def test_mixed_list_match(input1, input2, expected):
66
  ("$0.10", "$0.10 per retry", True),
67
  ("D", "D) Apples", True),
68
  ("D", "A) Oranges", False),
69
- ("25.0", "0.250", False) #input is not a percentage
 
 
70
  ])
71
  def test_approximate_numeric_match(input1, input2, expected):
72
  assert question_scorer(input1, input2) == expected
@@ -74,7 +77,7 @@ def test_approximate_numeric_match(input1, input2, expected):
74
  @pytest.mark.parametrize("input1, input2, expected", [
75
  ("73.15%", "73.1495", True),
76
  ("42%", "42", True),
77
- ("30%", "30.1", True),
78
  ("25", "25%", True),
79
  ("100%", "100", True),
80
  ("0.1%", "0.1", True),
 
51
  @pytest.mark.parametrize("input1, input2, expected", [
52
  ("42, hello", "42, hello", True),
53
  ("42, world", "42, hello", False),
54
+ ("64", "64, 53, 454, 231, 473, 381", False)
55
  ])
56
  def test_mixed_list_match(input1, input2, expected):
57
  assert question_scorer(input1, input2) == expected
58
 
59
  @pytest.mark.parametrize("input1, input2, expected", [
60
+ ("3.14", "3.1483", False),
61
  ("3.14", "3.20", False),
62
  ("1", "1.0", True),
63
  ("1.0", "1", True),
 
67
  ("$0.10", "$0.10 per retry", True),
68
  ("D", "D) Apples", True),
69
  ("D", "A) Oranges", False),
70
+ ("25.0", "0.250", False), # input is not a percentage
71
+ ("5.760000", "5.715872", False),
72
+ ("8.68000000000000", "8.66999999999916", False)
73
  ])
74
  def test_approximate_numeric_match(input1, input2, expected):
75
  assert question_scorer(input1, input2) == expected
 
77
  @pytest.mark.parametrize("input1, input2, expected", [
78
  ("73.15%", "73.1495", True),
79
  ("42%", "42", True),
80
+ ("30%", "30.1", False),
81
  ("25", "25%", True),
82
  ("100%", "100", True),
83
  ("0.1%", "0.1", True),