JeffYang52415 committed on
Commit
18bf871
·
unverified ·
1 Parent(s): cddf75e

feat: add mbpp parser

Browse files
llmdataparser/__init__.py CHANGED
@@ -2,7 +2,19 @@
2
  from typing import Type
3
 
4
  from .base_parser import DatasetParser
5
- from .mmlu_parser import MMLUDatasetParser
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
 
8
  class ParserRegistry:
@@ -31,3 +43,14 @@ class ParserRegistry:
31
 
32
  # Register parsers
33
  ParserRegistry.register_parser("mmlu", MMLUDatasetParser)
 
 
 
 
 
 
 
 
 
 
 
 
2
  from typing import Type
3
 
4
  from .base_parser import DatasetParser
5
+ from .bbh_parser import BBHDatasetParser
6
+ from .gsm8k_parser import GSM8KDatasetParser
7
+ from .humaneval_parser import HumanEvalDatasetParser, HumanEvalDatasetPlusParser
8
+ from .ifeval_parser import IFEvalDatasetParser
9
+ from .math_parser import MATHDatasetParser
10
+ from .mbpp_parser import MBPPDatasetParser
11
+ from .mgsm_parser import MGSMDatasetParser
12
+ from .mmlu_parser import (
13
+ MMLUDatasetParser,
14
+ MMLUProDatasetParser,
15
+ MMLUReduxDatasetParser,
16
+ TMMLUPlusDatasetParser,
17
+ )
18
 
19
 
20
  class ParserRegistry:
 
43
 
44
  # Register parsers
45
  ParserRegistry.register_parser("mmlu", MMLUDatasetParser)
46
+ ParserRegistry.register_parser("mmlupro", MMLUProDatasetParser)
47
+ ParserRegistry.register_parser("mmluredux", MMLUReduxDatasetParser)
48
+ ParserRegistry.register_parser("tmmluplus", TMMLUPlusDatasetParser)
49
+ ParserRegistry.register_parser("gsm8k", GSM8KDatasetParser)
50
+ ParserRegistry.register_parser("math", MATHDatasetParser)
51
+ ParserRegistry.register_parser("mgsm", MGSMDatasetParser)
52
+ ParserRegistry.register_parser("humaneval", HumanEvalDatasetParser)
53
+ ParserRegistry.register_parser("humanevalplus", HumanEvalDatasetPlusParser)
54
+ ParserRegistry.register_parser("bbh", BBHDatasetParser)
55
+ ParserRegistry.register_parser("mbpp", MBPPDatasetParser)
56
+ ParserRegistry.register_parser("ifeval", IFEvalDatasetParser)
llmdataparser/base_parser.py CHANGED
@@ -80,6 +80,8 @@ class HuggingFaceDatasetParser(DatasetParser[T]):
80
  _default_task: ClassVar[str]
81
  # _default_system_prompt is the default system prompt to use if no system prompt is specified
82
  _default_system_prompt: ClassVar[str]
 
 
83
 
84
  def __init__(self, system_prompt: str | None = None, **kwargs):
85
  """
 
80
  _default_task: ClassVar[str]
81
  # _default_system_prompt is the default system prompt to use if no system prompt is specified
82
  _default_system_prompt: ClassVar[str]
83
+ # _hidden_task_names is the list of task names that are hidden in the dataset, e.g. ["math", "physics", "chemistry"]
84
+ _hidden_task_names: ClassVar[list[str]] = []
85
 
86
  def __init__(self, system_prompt: str | None = None, **kwargs):
87
  """
llmdataparser/mbpp_parser.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, ClassVar
3
+
4
+ from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
5
+ from llmdataparser.prompts import MBPP_SYSTEM_PROMPT
6
+
7
+
8
@dataclass(frozen=True, kw_only=True, slots=True)
class MBPPParseEntry(HuggingFaceParseEntry):
    """Parsed MBPP record: the base entry fields plus MBPP-specific metadata."""

    # Identifier of the MBPP task; `create` rejects non-int values.
    task_id: int
    # Test case strings attached to the task.
    test_list: list[str]
    # Setup code associated with the tests.
    test_setup_code: str
    # Additional "challenge" test case strings.
    challenge_test_list: list[str]
    # Source file recorded for the task.
    source_file: str

    @classmethod
    def create(
        cls,
        prompt: str,
        answer: str,
        raw_question: str,
        task_id: int,
        test_list: list[str],
        test_setup_code: str,
        challenge_test_list: list[str],
        task_name: str,
        source_file: str,
    ) -> "MBPPParseEntry":
        """Validate ``task_id`` and build an entry from the given values.

        ``answer`` is stored twice: as ``answer`` and as ``raw_answer``,
        because in MBPP the code solution is the raw answer.

        Raises:
            ValueError: If ``task_id`` is not an ``int``.
        """
        if isinstance(task_id, int):
            mbpp_fields = {
                "task_id": task_id,
                "test_list": test_list,
                "test_setup_code": test_setup_code,
                "challenge_test_list": challenge_test_list,
                "source_file": source_file,
            }
            return cls(
                prompt=prompt,
                answer=answer,
                raw_question=raw_question,
                raw_answer=answer,
                task_name=task_name,
                **mbpp_fields,
            )
        raise ValueError("Task ID must be an integer")
46
+
47
+
48
class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
    """Parser for the MBPP (Mostly Basic Python Programming) dataset."""

    _data_source: ClassVar[str] = "google-research-datasets/mbpp"
    _default_task: ClassVar[str] = "full"  # Can be 'full' or 'sanitized'
    _task_names: ClassVar[list[str]] = ["full", "sanitized"]
    _default_system_prompt: ClassVar[str] = MBPP_SYSTEM_PROMPT

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> MBPPParseEntry:
        """Convert one raw MBPP row into an ``MBPPParseEntry``.

        Args:
            row: Raw dataset row. ``code``, ``task_id`` and ``test_list`` are
                required; the task description is read from ``text`` and,
                when that key is absent, from ``prompt``. ``test_setup_code``,
                ``challenge_test_list`` and ``source_file`` are optional.
            task_name: Task name to record on the entry. When falsy, the
                value of ``self._get_current_task(row)`` is used instead.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The parsed entry, whose prompt is the system prompt followed by
            the task description.

        Raises:
            KeyError: If a required field is missing, including the case
                where the row carries no task description at all.
            ValueError: If ``task_id`` is not an integer (raised by
                ``MBPPParseEntry.create``).
        """
        raw_question = row.get("text", row.get("prompt"))
        if raw_question is None:
            # Fail loudly instead of emitting a prompt that reads "Task: None".
            raise KeyError(
                "MBPP row is missing a task description ('text' or 'prompt')"
            )

        answer = row["code"]
        task_id = row["task_id"]
        test_list = row["test_list"]
        test_setup_code = row.get("test_setup_code", "")
        challenge_test_list = row.get("challenge_test_list", [])
        source_file = row.get("source_file", "")

        # Combine system prompt with the task description
        prompt = f"{self._system_prompt}\n\nTask: {raw_question}"

        # Use task_name if provided, otherwise ask the parser for the current task
        task = task_name or self._get_current_task(row)

        return MBPPParseEntry.create(
            prompt=prompt,
            answer=answer,
            raw_question=raw_question,
            task_id=task_id,
            test_list=test_list,
            test_setup_code=test_setup_code,
            challenge_test_list=challenge_test_list,
            task_name=task,
            source_file=source_file,
        )
85
+
86
+
87
if __name__ == "__main__":
    # Demo: load MBPP with default settings, parse it, and show one entry.
    parser = MBPPDatasetParser()
    parser.load()
    parser.parse()

    # get_parsed_data is read as an attribute here, not called.
    parsed_data = parser.get_parsed_data

    if parsed_data:
        # Show the first parsed entry as a sample.
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task ID: {example.task_id}")
        print(f"Task: {example.raw_question}")
        print(f"Solution:\n{example.answer}")
        print(f"Test Cases:\n{example.test_list}")
llmdataparser/mmlu_parser.py CHANGED
@@ -339,7 +339,8 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
339
 
340
  _data_source = "TIGER-Lab/MMLU-Pro"
341
  _default_task = "default"
342
- _task_names = [
 
343
  "math",
344
  "physics",
345
  "chemistry",
 
339
 
340
  _data_source = "TIGER-Lab/MMLU-Pro"
341
  _default_task = "default"
342
+ _task_names = ["default"]
343
+ _hidden_task_names = [
344
  "math",
345
  "physics",
346
  "chemistry",
llmdataparser/prompts.py CHANGED
@@ -121,3 +121,20 @@ BBH_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
121
  6. Respond with ONLY the letter (A, B, C, etc.) or "True"/"False" or "Yes"/"No" - no explanations or additional text
122
  """
123
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  6. Respond with ONLY the letter (A, B, C, etc.) or "True"/"False" or "Yes"/"No" - no explanations or additional text
122
  """
123
  )
124
+
125
+ MBPP_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
126
+ """\
127
+ You are an expert Python programmer tasked with solving basic programming problems. Your goal is to write clean, efficient, and well-tested Python code that solves the given task.
128
+
129
+ Instructions:
130
+ 1. Read the task description carefully
131
+ 2. Write a complete Python solution that solves the problem
132
+ 3. Follow Python best practices and PEP 8 style guidelines
133
+ 4. Write clear, readable code with descriptive variable names
134
+ 5. Handle edge cases and input validation appropriately
135
+ 6. Include docstrings or comments to explain complex logic
136
+ 7. Focus on fundamental programming concepts and standard library usage
137
+ 8. Optimize for readability and maintainability
138
+ 9. Return only the implementation code, no additional text
139
+ """
140
+ )
tests/test_mbpp_parser.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from llmdataparser.mbpp_parser import MBPPDatasetParser, MBPPParseEntry
4
+
5
+
6
@pytest.fixture
def sample_entry():
    """Return a minimal raw MBPP row (no ``source_file`` key) for the tests."""
    return dict(
        text="Write a function to find the sum of numbers in a list.",
        code="def sum_list(lst):\n return sum(lst)",
        task_id=42,
        test_list=["assert sum_list([1, 2, 3]) == 6"],
        test_setup_code="",
        challenge_test_list=["assert sum_list([4, 5, 6]) == 15"],
    )
16
+
17
+
18
@pytest.fixture
def parser():
    """Provide a fresh MBPPDatasetParser built with default arguments."""
    mbpp_parser = MBPPDatasetParser()
    return mbpp_parser
21
+
22
+
23
def test_mbpp_parse_entry_creation():
    """MBPPParseEntry.create stores every argument on the resulting entry."""
    entry = MBPPParseEntry.create(
        prompt="test prompt",
        answer="test answer",
        raw_question="raw question",
        task_id=42,
        test_list=["test1", "test2"],
        test_setup_code="setup code",
        challenge_test_list=["challenge1"],
        task_name="full",
        source_file="test.pdf",
    )

    assert entry.prompt == "test prompt"
    assert entry.answer == "test answer"
    assert entry.raw_question == "raw question"
    # create() mirrors the answer into raw_answer.
    assert entry.raw_answer == "test answer"
    assert entry.task_id == 42
    assert entry.test_list == ["test1", "test2"]
    assert entry.test_setup_code == "setup code"
    assert entry.challenge_test_list == ["challenge1"]
    assert entry.task_name == "full"
    # source_file was passed in above but previously never verified.
    assert entry.source_file == "test.pdf"
46
+
47
+
48
def test_mbpp_parse_entry_validation():
    """create() rejects a task_id that is not an integer."""
    bad_arguments = {
        "prompt": "test",
        "answer": "test",
        "raw_question": "test",
        "task_id": "not_an_int",  # Invalid task_id type
        "test_list": [],
        "test_setup_code": "",
        "challenge_test_list": [],
        "task_name": "full",
        "source_file": "test.pdf",
    }
    with pytest.raises(ValueError, match="Task ID must be an integer"):
        MBPPParseEntry.create(**bad_arguments)
62
+
63
+
64
def test_process_entry(parser, sample_entry):
    """process_entry maps a raw row onto an MBPPParseEntry."""
    result = parser.process_entry(sample_entry, task_name="full")

    assert isinstance(result, MBPPParseEntry)
    assert result.task_id == 42
    assert result.raw_question == sample_entry["text"]
    assert result.answer == sample_entry["code"]
    assert result.test_list == sample_entry["test_list"]
    assert result.challenge_test_list == sample_entry["challenge_test_list"]
    # Optional fields: one is passed through, the other falls back to "".
    assert result.test_setup_code == sample_entry["test_setup_code"]
    assert result.source_file == sample_entry.get("source_file", "")
    expected_prompt = f"{parser._system_prompt}\n\nTask: {sample_entry['text']}"
    assert result.prompt == expected_prompt
    assert result.task_name == "full"
77
+
78
+
79
def test_parser_initialization(parser):
    """The parser exposes the expected source, tasks and dataset link."""
    expected_link = "https://huggingface.co/datasets/google-research-datasets/mbpp"

    assert parser._data_source == "google-research-datasets/mbpp"
    assert parser._default_task == "full"
    assert parser._task_names == ["full", "sanitized"]
    assert parser.get_huggingface_link == expected_link
88
+
89
+
90
@pytest.mark.integration
@pytest.mark.skip(reason="Requires access to HuggingFace MBPP dataset")
def test_parser_load_and_parse(parser):
    """Integration: loading the train split and parsing it yields entries."""
    parser.load(split="train")
    parser.parse(force=True)
    entries = parser.get_parsed_data

    assert len(entries) > 0
    assert all(isinstance(item, MBPPParseEntry) for item in entries)
100
+
101
+
102
def test_get_current_task(parser, sample_entry):
    """_get_current_task returns the default task for the sample row."""
    assert parser._get_current_task(sample_entry) == parser._default_task
106
+
107
+
108
@pytest.mark.parametrize("task_name", ["full", "sanitized"])
@pytest.mark.skip(reason="Requires access to HuggingFace MBPP dataset")
def test_different_tasks_loading(parser, task_name):
    """Loading a given task records it as the parser's current task."""
    parser.load(split="train", task_name=task_name)
    assert parser._current_task == task_name
114
+
115
+
116
def test_parser_string_representation(parser):
    """str(parser) names the class, the data source and the unloaded state."""
    text = str(parser)
    assert "MBPPDatasetParser" in text
    assert "google-research-datasets/mbpp" in text
    assert "not loaded" in text
122
+
123
+
124
def test_parse_without_loaded_data(parser):
    """Calling parse() before load() raises a ValueError."""
    expected_message = "No data loaded. Please load the dataset first"
    with pytest.raises(ValueError, match=expected_message):
        parser.parse()
130
+
131
+
132
@pytest.mark.integration
@pytest.mark.skip(reason="Requires access to HuggingFace MBPP dataset")
def test_full_workflow_with_different_splits(parser):
    """Integration: the load/parse workflow on the train split."""
    parser.load(split="train")
    parser.parse(force=True)
    train_entries = parser.get_parsed_data

    assert len(train_entries) > 0
    assert all(isinstance(item, MBPPParseEntry) for item in train_entries)
    assert all(item.task_name == "full" for item in train_entries)
143
+
144
+
145
def test_custom_system_prompt():
    """A system prompt passed to the constructor is stored on the parser."""
    custom_prompt = "Custom system prompt"
    assert MBPPDatasetParser(system_prompt=custom_prompt)._system_prompt == custom_prompt
150
+
151
+
152
def test_default_system_prompt(parser):
    """Without an explicit prompt the parser uses its class default."""
    active_prompt = parser._system_prompt
    assert active_prompt == parser._default_system_prompt
tests/test_mmlu_parser.py CHANGED
@@ -112,7 +112,7 @@ def test_process_entry_base(base_parser, sample_mmlu_entries):
112
  assert "D. Madrid" in entry.prompt
113
  assert entry.raw_question == "What is the capital of France?"
114
  assert entry.raw_choices == ["London", "Paris", "Berlin", "Madrid"]
115
- assert entry.raw_answer == 1
116
  assert entry.task_name == "geography"
117
 
118
 
@@ -169,7 +169,7 @@ def test_tmmlu_process_entry(tmmlu_parser):
169
  ("base_parser", 57, "cais/mmlu"),
170
  ("redux_parser", 30, "edinburgh-dawg/mmlu-redux"),
171
  ("tmmlu_parser", 66, "ikala/tmmluplus"),
172
- ("mmlu_pro_parser", 14, "TIGER-Lab/MMLU-Pro"),
173
  ],
174
  )
175
  def test_parser_initialization(
 
112
  assert "D. Madrid" in entry.prompt
113
  assert entry.raw_question == "What is the capital of France?"
114
  assert entry.raw_choices == ["London", "Paris", "Berlin", "Madrid"]
115
+ assert entry.raw_answer == "1"
116
  assert entry.task_name == "geography"
117
 
118
 
 
169
  ("base_parser", 57, "cais/mmlu"),
170
  ("redux_parser", 30, "edinburgh-dawg/mmlu-redux"),
171
  ("tmmlu_parser", 66, "ikala/tmmluplus"),
172
+ ("mmlu_pro_parser", 1, "TIGER-Lab/MMLU-Pro"),
173
  ],
174
  )
175
  def test_parser_initialization(