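"""Tests for BBHDatasetParser and BBHParseEntry from llmdataparser.bbh_parser."""
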
import pytest

from llmdataparser.bbh_parser import BBHDatasetParser, BBHParseEntry


@pytest.fixture
def bbh_parser():
    """Create a BBH parser instance for testing."""
    return BBHDatasetParser()


@pytest.fixture
def loaded_bbh_parser(bbh_parser):
    """Create and load a BBH parser instance for testing."""
    bbh_parser.load(task_name="reasoning_about_colored_objects", split="test")
    return bbh_parser


@pytest.fixture
def sample_row():
    """Create a sample BBH data row for testing."""
    return {
        "input": "What color is the sky on a clear day?\nA) Blue\nB) Green\nC) Red\nD) Yellow",
        "target": "(A)",
    }


def test_bbh_parse_entry_creation_valid():
    """Test valid creation of BBHParseEntry."""
    entry = BBHParseEntry.create(
        prompt="Test prompt",
        answer="A",
        raw_question="Test question",
        raw_answer="(A)",
        task_name="reasoning_about_colored_objects",
    )
    assert isinstance(entry, BBHParseEntry)
    assert entry.prompt == "Test prompt"
    assert entry.answer == "A"
    assert entry.raw_question == "Test question"
    assert entry.raw_answer == "(A)"
    assert entry.task_name == "reasoning_about_colored_objects"


def test_bbh_parser_initialization(bbh_parser):
    """Test BBH parser initialization."""
    assert bbh_parser._data_source == "lukaemon/bbh"
    assert bbh_parser._default_task == "reasoning_about_colored_objects"
    assert "boolean_expressions" in bbh_parser._task_names
    assert "word_sorting" in bbh_parser._task_names
    assert (
        bbh_parser.get_huggingface_link
        == "https://huggingface.co/datasets/lukaemon/bbh"
    )


def test_load_dataset(loaded_bbh_parser):
    """Test loading the dataset."""
    assert loaded_bbh_parser.raw_data is not None
    assert loaded_bbh_parser.split_names == ["test"]
    assert loaded_bbh_parser._current_task == "reasoning_about_colored_objects"


@pytest.mark.integration
def test_full_parse_workflow(loaded_bbh_parser):
    """Test the complete workflow of loading and parsing data."""
    # Parse the test split
    loaded_bbh_parser.parse(split_names="test", force=True)
    parsed_data = loaded_bbh_parser.get_parsed_data

    # Basic checks
    assert len(parsed_data) > 0

    # Check first entry structure
    first_entry = parsed_data[0]
    assert isinstance(first_entry, BBHParseEntry)
    assert first_entry.task_name == "reasoning_about_colored_objects"
    assert first_entry.answer.strip("()").isalpha()  # Alphabetic choice label, e.g. "A"
    assert first_entry.prompt.startswith(loaded_bbh_parser._system_prompt)


def test_process_entry(bbh_parser, sample_row):
    """Test processing of a single BBH entry."""
    entry = bbh_parser.process_entry(
        sample_row, task_name="reasoning_about_colored_objects"
    )

    assert isinstance(entry, BBHParseEntry)
    assert entry.answer == "A"  # Stripped from "(A)"
    assert "What color is the sky" in entry.raw_question
    assert entry.raw_answer == "(A)"
    assert bbh_parser._system_prompt in entry.prompt
    assert entry.task_name == "reasoning_about_colored_objects"


@pytest.mark.parametrize("split_name", ["invalid_split", "wrong_split"])
def test_parse_with_invalid_split(bbh_parser, split_name):
    """Test parsing with invalid split names."""
    bbh_parser.raw_data = {"train": [], "test": []}  # Mock data

    with pytest.raises(
        ValueError, match=f"Split '{split_name}' not found in the dataset"
    ):
        bbh_parser.parse(split_name)


def test_parse_without_loaded_data(bbh_parser):
    """Test parsing without loading data first."""
    with pytest.raises(
        ValueError, match="No data loaded. Please load the dataset first"
    ):
        bbh_parser.parse()


@pytest.mark.parametrize(
    "test_case",
    [
        {"input": "Test question", "target": "(A)"},
        {"input": "Test question", "target": "(B)"},
        {"input": "Test question", "target": "(C)"},
    ],
)
def test_answer_stripping(bbh_parser, test_case):
    """Test stripping of parentheses from answers."""
    entry = bbh_parser.process_entry(
        test_case, task_name="reasoning_about_colored_objects"
    )
    assert entry.answer == test_case["target"].strip("()")
    assert entry.raw_answer == test_case["target"]


def test_parser_properties(bbh_parser):
    """Test parser property getters."""
    assert len(bbh_parser.task_names) > 0
    assert bbh_parser.total_tasks == len(bbh_parser._task_names)
    assert all(isinstance(task, str) for task in bbh_parser.task_names)


def test_parser_string_representation(loaded_bbh_parser):
    """Test string representation of parser."""
    repr_str = str(loaded_bbh_parser)
    assert "BBHDatasetParser" in repr_str
    assert "lukaemon/bbh" in repr_str
    assert "reasoning_about_colored_objects" in repr_str
    assert "loaded" in repr_str


@pytest.mark.integration
@pytest.mark.parametrize(
    "task_name", ["boolean_expressions", "causal_judgement", "date_understanding"]
)
def test_different_tasks_parsing(bbh_parser, task_name):
    """Test parsing different tasks of the dataset."""
    bbh_parser.load(task_name=task_name, split="test")
    bbh_parser.parse(split_names="test", force=True)
    parsed_data = bbh_parser.get_parsed_data

    assert len(parsed_data) > 0
    assert all(entry.task_name == task_name for entry in parsed_data)
    assert all(isinstance(entry.answer, str) for entry in parsed_data)


def test_get_evaluation_metrics(bbh_parser):
    """Test evaluation metrics structure and content."""
    metrics = bbh_parser.get_evaluation_metrics()

    # Check basic structure
    assert isinstance(metrics, list)
    assert len(metrics) > 0

    # Check each metric has required fields
    required_fields = ["name", "type", "description", "implementation", "primary"]
    for metric in metrics:
        for field in required_fields:
            assert field in metric, f"Missing field {field} in metric {metric['name']}"

        # Check field types
        assert isinstance(metric["name"], str)
        assert isinstance(metric["type"], str)
        assert isinstance(metric["description"], str)
        assert isinstance(metric["implementation"], str)
        assert isinstance(metric["primary"], bool)

    # Check specific metrics exist
    metric_names = {m["name"] for m in metrics}
    expected_metrics = {
        "accuracy",
        "human_eval_delta",
        "per_task_accuracy",
        "exact_match",
    }
    assert expected_metrics.issubset(metric_names)

    # Check primary metrics
    primary_metrics = {m["name"] for m in metrics if m["primary"]}
    assert "accuracy" in primary_metrics
    assert "human_eval_delta" in primary_metrics


def test_dataset_description_citation_format(bbh_parser):
    """Test that the citation in dataset description is properly formatted."""
    description = bbh_parser.get_dataset_description()
    citation = description["citation"]

    # Check citation structure
    assert citation.startswith("@article{")
    assert "title=" in citation
    assert "author=" in citation
    assert "journal=" in citation
    assert "year=" in citation

    # Check specific author formatting
    assert "Suzgun, Mirac" in citation
    assert "Wei, Jason" in citation
    assert "and Wei, Jason" in citation  # Should be last author
    assert "and and" not in citation  # No double "and"


def test_evaluation_metrics_implementations(bbh_parser):
    """Test that evaluation metric implementations are properly specified."""
    metrics = bbh_parser.get_evaluation_metrics()

    for metric in metrics:
        impl = metric["implementation"]

        if "evaluate.load" in impl:
            # Check standard metric format
            assert impl.startswith("evaluate.load('")
            assert impl.endswith("')")
        elif "custom_" in impl:
            # Check custom metric format
            assert impl.startswith("custom_")
            assert len(impl) > 7  # More than just "custom_"
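

# A minimal sketch of how this suite is typically run, assuming the file lives
# at tests/test_bbh_parser.py and the "integration" marker is registered in the
# pytest configuration (pytest.ini or pyproject.toml):
#
#   pytest tests/test_bbh_parser.py                       # run everything
#   pytest tests/test_bbh_parser.py -m "not integration"  # skip network-heavy tests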