# File size: 2,981 Bytes

# 289c905

from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
from llmdataparser.prompts import IFEVAL_SYSTEM_PROMPT  # You'll need to create this


@dataclass(frozen=True, kw_only=True, slots=True)
class IFEvalParseEntry(HuggingFaceParseEntry):
    """Parsed IFEval record.

    Adds three IFEval-specific fields (``key``, ``instruction_id_list`` and
    ``kwargs``) on top of whatever the base entry class declares. Instances
    are frozen and must be constructed with keyword arguments.
    """

    key: int
    instruction_id_list: List[str]
    kwargs: dict[str, Any]

    @classmethod
    def create(
        cls,
        prompt: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        key: int,
        instruction_id_list: List[str],
        kwargs: dict[str, Any],
        task_name: str,
    ) -> "IFEvalParseEntry":
        """Build an entry from explicit field values.

        Each argument is forwarded unchanged, under its own name, to the
        dataclass constructor; no validation or conversion happens here.

        Returns:
            A new ``IFEvalParseEntry`` (or subclass, when called on one).
        """
        # Gather the values under their field names, then hand them to the
        # keyword-only constructor in a single unpacking call.
        field_values: dict[str, Any] = {
            "prompt": prompt,
            "answer": answer,
            "raw_question": raw_question,
            "raw_answer": raw_answer,
            "key": key,
            "instruction_id_list": instruction_id_list,
            "kwargs": kwargs,
            "task_name": task_name,
        }
        return cls(**field_values)


class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
    """Dataset parser that converts IFEval rows into ``IFEvalParseEntry`` objects."""

    # Class-level configuration: dataset identifier, the single task it
    # exposes, and the system prompt used when building prompts.
    _data_source: ClassVar[str] = "google/IFEval"
    _default_task: ClassVar[str] = "default"
    _task_names: ClassVar[list[str]] = ["default"]
    _default_system_prompt: ClassVar[str] = IFEVAL_SYSTEM_PROMPT

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> IFEvalParseEntry:
        """Convert one dataset row into an ``IFEvalParseEntry``.

        Args:
            row: Mapping that must contain the keys ``"key"``, ``"prompt"``,
                ``"instruction_id_list"`` and ``"kwargs"``.
            task_name: Task label to store on the entry. When falsy, the
                label comes from ``self._get_current_task(row)`` instead.
            **kwargs: Accepted but not used by this method.

        Returns:
            An entry whose ``prompt`` is the system prompt, a blank line, and
            the row's prompt text; ``answer`` and ``raw_answer`` are empty
            strings.

        Raises:
            KeyError: If ``row`` lacks one of the required keys.
        """
        # Pull the columns this parser needs out of the row.
        entry_key = row["key"]
        instruction_text = row["prompt"]  # doubles as the raw question
        instruction_ids = row["instruction_id_list"]
        instruction_kwargs = row["kwargs"]

        # The full prompt is the system prompt followed by the instruction.
        full_prompt = f"{self._system_prompt}\n\n{instruction_text}"

        # An explicitly supplied task name wins; otherwise ask the parser.
        if task_name:
            resolved_task = task_name
        else:
            resolved_task = self._get_current_task(row)

        # No answer is taken from the row, so both answer fields are set to
        # empty-string placeholders.
        return IFEvalParseEntry.create(
            prompt=full_prompt,
            answer="",
            raw_question=instruction_text,
            raw_answer="",
            key=entry_key,
            instruction_id_list=instruction_ids,
            kwargs=instruction_kwargs,
            task_name=resolved_task,
        )


if __name__ == "__main__":
    # Demo run: load the dataset, parse it, then show the first parsed entry.
    ifeval_parser = IFEvalDatasetParser()
    ifeval_parser.load()
    ifeval_parser.parse()

    # NOTE(review): get_parsed_data is read as an attribute, not called —
    # presumably a property on the base parser; confirm.
    entries = ifeval_parser.get_parsed_data
    if entries:
        first_entry = entries[0]
        print("\nExample parsed entry:")
        print(f"Key: {first_entry.key}")
        print(f"Prompt: {first_entry.prompt}")
        print(f"Instruction IDs: {first_entry.instruction_id_list}")
        print(f"kwargs: {first_entry.kwargs}")