Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
f0189a6
1
Parent(s):
42f179a
fix: evaluation and scorer test
Browse files
dabstep_benchmark/evaluation/scorer.py
CHANGED
@@ -4,8 +4,25 @@ import math
|
|
4 |
from difflib import SequenceMatcher
|
5 |
|
6 |
def is_numeric_with_commas(value: str) -> bool:
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
def question_scorer(input1: str, input2: str) -> bool:
|
11 |
# Remove leading/trailing whitespace and convert to lowercase
|
@@ -67,7 +84,7 @@ def compare_numeric(num1: float, num2: float) -> bool:
|
|
67 |
|
68 |
# For percentages and small numbers, use a more lenient comparison
|
69 |
if num1 < 1 and num2 < 1:
|
70 |
-
return math.isclose(num1, num2, rel_tol=1e-
|
71 |
|
72 |
# For larger numbers, use the original comparison method
|
73 |
dec_places1 = len(str(num1).split('.')[-1]) if '.' in str(num1) else 0
|
@@ -79,7 +96,7 @@ def compare_numeric(num1: float, num2: float) -> bool:
|
|
79 |
if rounded1 == rounded2:
|
80 |
return True
|
81 |
|
82 |
-
return math.isclose(num1, num2, rel_tol=1e-
|
83 |
|
84 |
def compare_strings(str1: str, str2: str) -> bool:
|
85 |
# Remove all whitespace and punctuation
|
|
|
4 |
from difflib import SequenceMatcher
|
5 |
|
6 |
def is_numeric_with_commas(value: str) -> bool:
    """
    Decide whether *value* looks like a comma-grouped or decimal number.

    Accepts (with an optional leading "$"):
      * numbers using comma thousands-separators (at least one comma),
        with an optional dot-decimal tail, e.g. "1,000" or "12,345.67"
      * bare decimals (no separators) using either "." or "," as the
        decimal mark, e.g. "0.99" or "0,99"

    Plain integers without commas (e.g. "64") are rejected.
    """
    stripped = value.strip()
    numeric_re = r'''
        ^\$?                               # optional dollar sign
        (?:                                # two alternate groups:
            \d{1,3}(?:,\d{3})+(?:\.\d+)?   # 1) at least one comma-group + optional .decimal
          | \d+[.,]\d+                     # 2) or plain decimal with . or ,
        )
        $                                  # end of string
    '''
    return re.match(numeric_re, stripped, re.VERBOSE) is not None
|
26 |
|
27 |
def question_scorer(input1: str, input2: str) -> bool:
|
28 |
# Remove leading/trailing whitespace and convert to lowercase
|
|
|
84 |
|
85 |
# For percentages and small numbers, use a more lenient comparison
|
86 |
if num1 < 1 and num2 < 1:
|
87 |
+
return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
|
88 |
|
89 |
# For larger numbers, use the original comparison method
|
90 |
dec_places1 = len(str(num1).split('.')[-1]) if '.' in str(num1) else 0
|
|
|
96 |
if rounded1 == rounded2:
|
97 |
return True
|
98 |
|
99 |
+
return math.isclose(num1, num2, rel_tol=1e-4, abs_tol=1e-4)
|
100 |
|
101 |
def compare_strings(str1: str, str2: str) -> bool:
|
102 |
# Remove all whitespace and punctuation
|
dabstep_benchmark/leaderboard.py
CHANGED
@@ -311,7 +311,11 @@ def generate_leaderboard_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
311 |
leaderboard_df["Agent"] = leaderboard_df["Agent"].apply(lambda x: f"**{x}**")
|
312 |
|
313 |
# sort-by best score
|
314 |
-
leaderboard_df.sort_values(
|
|
|
|
|
|
|
|
|
315 |
|
316 |
validated_lb = leaderboard_df[leaderboard_df["validated"] == True].drop(columns=["validated"])
|
317 |
unvalidated_lb = leaderboard_df[leaderboard_df["validated"] == False].drop(columns=["validated"])
|
|
|
311 |
leaderboard_df["Agent"] = leaderboard_df["Agent"].apply(lambda x: f"**{x}**")
|
312 |
|
313 |
# sort-by best score
|
314 |
+
leaderboard_df.sort_values(
|
315 |
+
by=["Hard Level Accuracy (%)", "Easy Level Accuracy (%)"],
|
316 |
+
ascending=[False, False],
|
317 |
+
inplace=True
|
318 |
+
)
|
319 |
|
320 |
validated_lb = leaderboard_df[leaderboard_df["validated"] == True].drop(columns=["validated"])
|
321 |
unvalidated_lb = leaderboard_df[leaderboard_df["validated"] == False].drop(columns=["validated"])
|
dabstep_benchmark/tests/test_scorer.py
CHANGED
@@ -51,12 +51,13 @@ def test_list_match(input1, input2, expected):
|
|
51 |
@pytest.mark.parametrize("input1, input2, expected", [
|
52 |
("42, hello", "42, hello", True),
|
53 |
("42, world", "42, hello", False),
|
|
|
54 |
])
|
55 |
def test_mixed_list_match(input1, input2, expected):
|
56 |
assert question_scorer(input1, input2) == expected
|
57 |
|
58 |
@pytest.mark.parametrize("input1, input2, expected", [
|
59 |
-
("3.14", "3.1483",
|
60 |
("3.14", "3.20", False),
|
61 |
("1", "1.0", True),
|
62 |
("1.0", "1", True),
|
@@ -66,7 +67,9 @@ def test_mixed_list_match(input1, input2, expected):
|
|
66 |
("$0.10", "$0.10 per retry", True),
|
67 |
("D", "D) Apples", True),
|
68 |
("D", "A) Oranges", False),
|
69 |
-
("25.0", "0.250", False) #input is not a percentage
|
|
|
|
|
70 |
])
|
71 |
def test_approximate_numeric_match(input1, input2, expected):
|
72 |
assert question_scorer(input1, input2) == expected
|
@@ -74,7 +77,7 @@ def test_approximate_numeric_match(input1, input2, expected):
|
|
74 |
@pytest.mark.parametrize("input1, input2, expected", [
|
75 |
("73.15%", "73.1495", True),
|
76 |
("42%", "42", True),
|
77 |
-
("30%", "30.1",
|
78 |
("25", "25%", True),
|
79 |
("100%", "100", True),
|
80 |
("0.1%", "0.1", True),
|
|
|
51 |
@pytest.mark.parametrize("input1, input2, expected", [
    ("42, hello", "42, hello", True),
    ("42, world", "42, hello", False),
    ("64", "64, 53, 454, 231, 473, 381", False)
])
def test_mixed_list_match(input1, input2, expected):
    # Mixed numeric/string lists must agree element-wise; a single value
    # must not be scored as matching a longer list that contains it.
    outcome = question_scorer(input1, input2)
    assert outcome == expected
|
58 |
|
59 |
@pytest.mark.parametrize("input1, input2, expected", [
|
60 |
+
("3.14", "3.1483", False),
|
61 |
("3.14", "3.20", False),
|
62 |
("1", "1.0", True),
|
63 |
("1.0", "1", True),
|
|
|
67 |
("$0.10", "$0.10 per retry", True),
|
68 |
("D", "D) Apples", True),
|
69 |
("D", "A) Oranges", False),
|
70 |
+
("25.0", "0.250", False), #input is not a percentage,
|
71 |
+
("5.760000", "5.715872", False),
|
72 |
+
("8.68000000000000", "8.66999999999916", False)
|
73 |
])
|
74 |
def test_approximate_numeric_match(input1, input2, expected):
|
75 |
assert question_scorer(input1, input2) == expected
|
|
|
77 |
@pytest.mark.parametrize("input1, input2, expected", [
|
78 |
("73.15%", "73.1495", True),
|
79 |
("42%", "42", True),
|
80 |
+
("30%", "30.1", False),
|
81 |
("25", "25%", True),
|
82 |
("100%", "100", True),
|
83 |
("0.1%", "0.1", True),
|