Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
f0189a6
1
Parent(s):
42f179a
fix: evaluation and scorer test
Browse files
dabstep_benchmark/evaluation/scorer.py
CHANGED
@@ -4,8 +4,25 @@ import math
|
|
4 |
from difflib import SequenceMatcher
|
5 |
|
6 |
def is_numeric_with_commas(value: str) -> bool:
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
def question_scorer(input1: str, input2: str) -> bool:
|
11 |
# Remove leading/trailing whitespace and convert to lowercase
|
@@ -67,7 +84,7 @@ def compare_numeric(num1: float, num2: float) -> bool:
|
|
67 |
|
68 |
# For percentages and small numbers, use a more lenient comparison
|
69 |
if num1 < 1 and num2 < 1:
|
70 |
-
return math.isclose(num1, num2, rel_tol=1e-
|
71 |
|
72 |
# For larger numbers, use the original comparison method
|
73 |
dec_places1 = len(str(num1).split('.')[-1]) if '.' in str(num1) else 0
|
@@ -79,7 +96,7 @@ def compare_numeric(num1: float, num2: float) -> bool:
|
|
79 |
if rounded1 == rounded2:
|
80 |
return True
|
81 |
|
82 |
-
return math.isclose(num1, num2, rel_tol=1e-
|
83 |
|
84 |
def compare_strings(str1: str, str2: str) -> bool:
|
85 |
# Remove all whitespace and punctuation
|
|
|
4 |
from difflib import SequenceMatcher
|
5 |
|
6 |
def is_numeric_with_commas(value: str) -> bool:
    """
    Decide whether *value* looks like a comma-grouped or decimal number.

    Accepts (with an optional leading "$"):
      * numbers using comma thousands-separators (at least one comma),
        with an optional dot-decimal tail, e.g. "1,000" or "12,345.67"
      * bare decimals (no separators) using either "." or "," as the
        decimal mark, e.g. "0.99" or "0,99"

    Plain integers without commas (e.g. "64") are rejected.
    """
    stripped = value.strip()
    numeric_re = r'''
        ^\$?                               # optional dollar sign
        (?:                                # two alternate groups:
            \d{1,3}(?:,\d{3})+(?:\.\d+)?   # 1) at least one comma-group + optional .decimal
          | \d+[.,]\d+                     # 2) or plain decimal with . or ,
        )
        $                                  # end of string
    '''
    return re.match(numeric_re, stripped, re.VERBOSE) is not None
|
26 |
|
27 |
def question_scorer(input1: str, input2: str) -> bool:
|
28 |
# Remove leading/trailing whitespace and convert to lowercase
|
|
|
84 |
|
85 |
# For percentages and small numbers, use a more lenient comparison
|
86 |
if num1 < 1 and num2 < 1:
|
87 |
+
return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
|
88 |
|
89 |
# For larger numbers, use the original comparison method
|
90 |
dec_places1 = len(str(num1).split('.')[-1]) if '.' in str(num1) else 0
|
|
|
96 |
if rounded1 == rounded2:
|
97 |
return True
|
98 |
|
99 |
+
return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
|
100 |
|
101 |
def compare_strings(str1: str, str2: str) -> bool:
|
102 |
# Remove all whitespace and punctuation
|
dabstep_benchmark/leaderboard.py
CHANGED
@@ -311,7 +311,11 @@ def generate_leaderboard_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
311 |
leaderboard_df["Agent"] = leaderboard_df["Agent"].apply(lambda x: f"**{x}**")
|
312 |
|
313 |
# sort-by best score
|
314 |
-
leaderboard_df.sort_values(
|
|
|
|
|
|
|
|
|
315 |
|
316 |
validated_lb = leaderboard_df[leaderboard_df["validated"] == True].drop(columns=["validated"])
|
317 |
unvalidated_lb = leaderboard_df[leaderboard_df["validated"] == False].drop(columns=["validated"])
|
|
|
311 |
leaderboard_df["Agent"] = leaderboard_df["Agent"].apply(lambda x: f"**{x}**")
|
312 |
|
313 |
# sort-by best score
|
314 |
+
leaderboard_df.sort_values(
|
315 |
+
by=["Hard Level Accuracy (%)", "Easy Level Accuracy (%)"],
|
316 |
+
ascending=[False, False],
|
317 |
+
inplace=True
|
318 |
+
)
|
319 |
|
320 |
validated_lb = leaderboard_df[leaderboard_df["validated"] == True].drop(columns=["validated"])
|
321 |
unvalidated_lb = leaderboard_df[leaderboard_df["validated"] == False].drop(columns=["validated"])
|
dabstep_benchmark/tests/test_scorer.py
CHANGED
@@ -51,12 +51,13 @@ def test_list_match(input1, input2, expected):
|
|
51 |
@pytest.mark.parametrize("input1, input2, expected", [
|
52 |
("42, hello", "42, hello", True),
|
53 |
("42, world", "42, hello", False),
|
|
|
54 |
])
|
55 |
def test_mixed_list_match(input1, input2, expected):
|
56 |
assert question_scorer(input1, input2) == expected
|
57 |
|
58 |
@pytest.mark.parametrize("input1, input2, expected", [
|
59 |
-
("3.14", "3.1483",
|
60 |
("3.14", "3.20", False),
|
61 |
("1", "1.0", True),
|
62 |
("1.0", "1", True),
|
@@ -66,7 +67,9 @@ def test_mixed_list_match(input1, input2, expected):
|
|
66 |
("$0.10", "$0.10 per retry", True),
|
67 |
("D", "D) Apples", True),
|
68 |
("D", "A) Oranges", False),
|
69 |
-
("25.0", "0.250", False) #input is not a percentage
|
|
|
|
|
70 |
])
|
71 |
def test_approximate_numeric_match(input1, input2, expected):
|
72 |
assert question_scorer(input1, input2) == expected
|
@@ -74,7 +77,7 @@ def test_approximate_numeric_match(input1, input2, expected):
|
|
74 |
@pytest.mark.parametrize("input1, input2, expected", [
|
75 |
("73.15%", "73.1495", True),
|
76 |
("42%", "42", True),
|
77 |
-
("30%", "30.1",
|
78 |
("25", "25%", True),
|
79 |
("100%", "100", True),
|
80 |
("0.1%", "0.1", True),
|
|
|
51 |
@pytest.mark.parametrize("input1, input2, expected", [
    ("42, hello", "42, hello", True),
    ("42, world", "42, hello", False),
    ("64", "64, 53, 454, 231, 473, 381", False)
])
def test_mixed_list_match(input1, input2, expected):
    # Mixed numeric/string lists must agree element-wise; a single value
    # must not be scored as matching a longer list that contains it.
    outcome = question_scorer(input1, input2)
    assert outcome == expected
|
58 |
|
59 |
@pytest.mark.parametrize("input1, input2, expected", [
|
60 |
+
("3.14", "3.1483", False),
|
61 |
("3.14", "3.20", False),
|
62 |
("1", "1.0", True),
|
63 |
("1.0", "1", True),
|
|
|
67 |
("$0.10", "$0.10 per retry", True),
|
68 |
("D", "D) Apples", True),
|
69 |
("D", "A) Oranges", False),
|
70 |
+
("25.0", "0.250", False), #input is not a percentage,
|
71 |
+
("5.760000", "5.715872", False),
|
72 |
+
("8.68000000000000", "8.66999999999916", False)
|
73 |
])
|
74 |
def test_approximate_numeric_match(input1, input2, expected):
|
75 |
assert question_scorer(input1, input2) == expected
|
|
|
77 |
@pytest.mark.parametrize("input1, input2, expected", [
|
78 |
("73.15%", "73.1495", True),
|
79 |
("42%", "42", True),
|
80 |
+
("30%", "30.1", False),
|
81 |
("25", "25%", True),
|
82 |
("100%", "100", True),
|
83 |
("0.1%", "0.1", True),
|