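"""Tests for BBHDatasetParser and BBHParseEntry from llmdataparser.bbh_parser."""
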
import pytest

from llmdataparser.bbh_parser import BBHDatasetParser, BBHParseEntry


@pytest.fixture
def bbh_parser():
    """Create a BBH parser instance for testing."""
    return BBHDatasetParser()


@pytest.fixture
def loaded_bbh_parser(bbh_parser):
    """Create and load a BBH parser instance for testing."""
    bbh_parser.load(task_name="reasoning_about_colored_objects", split="test")
    return bbh_parser


@pytest.fixture
def sample_row():
    """Create a sample BBH data row for testing."""
    return {
        "input": "What color is the sky on a clear day?\nA) Blue\nB) Green\nC) Red\nD) Yellow",
        "target": "(A)",
    }


def test_bbh_parse_entry_creation_valid():
    """Test valid creation of BBHParseEntry."""
    entry = BBHParseEntry.create(
        prompt="Test prompt",
        answer="A",
        raw_question="Test question",
        raw_answer="(A)",
        task_name="reasoning_about_colored_objects",
    )
    assert isinstance(entry, BBHParseEntry)
    assert entry.prompt == "Test prompt"
    assert entry.answer == "A"
    assert entry.raw_question == "Test question"
    assert entry.raw_answer == "(A)"
    assert entry.task_name == "reasoning_about_colored_objects"


def test_bbh_parser_initialization(bbh_parser):
    """Test BBH parser initialization."""
    assert bbh_parser._data_source == "lukaemon/bbh"
    assert bbh_parser._default_task == "reasoning_about_colored_objects"
    assert "boolean_expressions" in bbh_parser._task_names
    assert "word_sorting" in bbh_parser._task_names
    assert (
        bbh_parser.get_huggingface_link
        == "https://huggingface.co/datasets/lukaemon/bbh"
    )


def test_load_dataset(loaded_bbh_parser):
    """Test loading the dataset."""
    assert loaded_bbh_parser.raw_data is not None
    assert loaded_bbh_parser.split_names == ["test"]
    assert loaded_bbh_parser._current_task == "reasoning_about_colored_objects"


@pytest.mark.integration
def test_full_parse_workflow(loaded_bbh_parser):
    """Test the complete workflow of loading and parsing data."""
    # Parse the test split
    loaded_bbh_parser.parse(split_names="test", force=True)
    parsed_data = loaded_bbh_parser.get_parsed_data

    # Basic checks
    assert len(parsed_data) > 0

    # Check first entry structure
    first_entry = parsed_data[0]
    assert isinstance(first_entry, BBHParseEntry)
    assert first_entry.task_name == "reasoning_about_colored_objects"
    assert first_entry.answer.strip("()").isalpha()  # Alphabetic choice label, e.g. "A"
    assert first_entry.prompt.startswith(loaded_bbh_parser._system_prompt)


def test_process_entry(bbh_parser, sample_row):
    """Test processing of a single BBH entry."""
    entry = bbh_parser.process_entry(
        sample_row, task_name="reasoning_about_colored_objects"
    )

    assert isinstance(entry, BBHParseEntry)
    assert entry.answer == "A"  # Stripped from "(A)"
    assert "What color is the sky" in entry.raw_question
    assert entry.raw_answer == "(A)"
    assert bbh_parser._system_prompt in entry.prompt
    assert entry.task_name == "reasoning_about_colored_objects"


@pytest.mark.parametrize("split_name", ["invalid_split", "wrong_split"])
def test_parse_with_invalid_split(bbh_parser, split_name):
    """Test parsing with invalid split names."""
    bbh_parser.raw_data = {"train": [], "test": []}  # Mock data

    with pytest.raises(
        ValueError, match=f"Split '{split_name}' not found in the dataset"
    ):
        bbh_parser.parse(split_name)


def test_parse_without_loaded_data(bbh_parser):
    """Test parsing without loading data first."""
    with pytest.raises(
        ValueError, match="No data loaded. Please load the dataset first"
    ):
        bbh_parser.parse()


@pytest.mark.parametrize(
    "test_case",
    [
        {"input": "Test question", "target": "(A)"},
        {"input": "Test question", "target": "(B)"},
        {"input": "Test question", "target": "(C)"},
    ],
)
def test_answer_stripping(bbh_parser, test_case):
    """Test stripping of parentheses from answers."""
    entry = bbh_parser.process_entry(
        test_case, task_name="reasoning_about_colored_objects"
    )
    assert entry.answer == test_case["target"].strip("()")
    assert entry.raw_answer == test_case["target"]


def test_parser_properties(bbh_parser):
    """Test parser property getters."""
    assert len(bbh_parser.task_names) > 0
    assert bbh_parser.total_tasks == len(bbh_parser._task_names)
    assert all(isinstance(task, str) for task in bbh_parser.task_names)


def test_parser_string_representation(loaded_bbh_parser):
    """Test string representation of parser."""
    repr_str = str(loaded_bbh_parser)
    assert "BBHDatasetParser" in repr_str
    assert "lukaemon/bbh" in repr_str
    assert "reasoning_about_colored_objects" in repr_str
    assert "loaded" in repr_str


@pytest.mark.integration
@pytest.mark.parametrize(
    "task_name", ["boolean_expressions", "causal_judgement", "date_understanding"]
)
def test_different_tasks_parsing(bbh_parser, task_name):
    """Test parsing different tasks of the dataset."""
    bbh_parser.load(task_name=task_name, split="test")
    bbh_parser.parse(split_names="test", force=True)
    parsed_data = bbh_parser.get_parsed_data

    assert len(parsed_data) > 0
    assert all(entry.task_name == task_name for entry in parsed_data)
    assert all(isinstance(entry.answer, str) for entry in parsed_data)


def test_get_evaluation_metrics(bbh_parser):
    """Test evaluation metrics structure and content."""
    metrics = bbh_parser.get_evaluation_metrics()

    # Check basic structure
    assert isinstance(metrics, list)
    assert len(metrics) > 0

    # Check each metric has required fields
    required_fields = ["name", "type", "description", "implementation", "primary"]
    for metric in metrics:
        for field in required_fields:
            assert field in metric, f"Missing field {field} in metric {metric['name']}"

        # Check field types
        assert isinstance(metric["name"], str)
        assert isinstance(metric["type"], str)
        assert isinstance(metric["description"], str)
        assert isinstance(metric["implementation"], str)
        assert isinstance(metric["primary"], bool)

    # Check specific metrics exist
    metric_names = {m["name"] for m in metrics}
    expected_metrics = {
        "accuracy",
        "human_eval_delta",
        "per_task_accuracy",
        "exact_match",
    }
    assert expected_metrics.issubset(metric_names)

    # Check primary metrics
    primary_metrics = {m["name"] for m in metrics if m["primary"]}
    assert "accuracy" in primary_metrics
    assert "human_eval_delta" in primary_metrics


def test_dataset_description_citation_format(bbh_parser):
    """Test that the citation in dataset description is properly formatted."""
    description = bbh_parser.get_dataset_description()
    citation = description["citation"]

    # Check citation structure
    assert citation.startswith("@article{")
    assert "title=" in citation
    assert "author=" in citation
    assert "journal=" in citation
    assert "year=" in citation

    # Check specific author formatting
    assert "Suzgun, Mirac" in citation
    assert "Wei, Jason" in citation
    assert "and Wei, Jason" in citation  # Should be last author
    assert "and and" not in citation  # No double "and"


def test_evaluation_metrics_implementations(bbh_parser):
    """Test that evaluation metric implementations are properly specified."""
    metrics = bbh_parser.get_evaluation_metrics()

    for metric in metrics:
        impl = metric["implementation"]

        if "evaluate.load" in impl:
            # Check standard metric format
            assert impl.startswith("evaluate.load('")
            assert impl.endswith("')")
        elif "custom_" in impl:
            # Check custom metric format
            assert impl.startswith("custom_")
            assert len(impl) > 7  # More than just "custom_"
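

# A minimal sketch of how this suite is typically run, assuming the file lives
# at tests/test_bbh_parser.py and the "integration" marker is registered in the
# pytest configuration (pytest.ini or pyproject.toml):
#
#   pytest tests/test_bbh_parser.py                       # run everything
#   pytest tests/test_bbh_parser.py -m "not integration"  # skip network-heavy tests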