JeffYang52415 committed
Commit 0772011 · unverified · 1 Parent(s): 8f24676

refactor: mbpp parser

llmdataparser/mbpp_parser.py CHANGED
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, ClassVar

-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import MBPP_SYSTEM_PROMPT


@@ -83,6 +88,70 @@ class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
             source_file=source_file,
         )

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns a description of the MBPP dataset."""
+        return DatasetDescription.create(
+            name="Mostly Basic Python Problems (MBPP)",
+            purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
+            source="https://github.com/google-research/google-research/tree/master/mbpp",
+            language="English and Python",
+            format="Task descriptions in English with corresponding Python solutions and automated test cases",
+            characteristics=(
+                "Contains approximately 1,000 crowd-sourced Python programming problems "
+                "designed for entry-level programmers. Problems cover programming fundamentals "
+                "and standard library functionality. Each problem includes a task description, "
+                "code solution, and 3 automated test cases. A subset of the data has been "
+                "hand-verified by the authors."
+            ),
+            citation=(
+                "@article{austin2021program,\n"
+                " title={Program Synthesis with Large Language Models},\n"
+                " author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n"
+                " journal={arXiv preprint arXiv:2108.07732},\n"
+                " year={2021}\n"
+                "}"
+            ),
+            additional_info={
+                "size": "~1,000 programming problems",
+                "splits": "Available in full or sanitized versions",
+                "test_coverage": "Each problem includes 3 automated test cases",
+                "verification": "Subset of data has been hand-verified by authors",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for MBPP dataset."""
+        return [
+            EvaluationMetric.create(
+                name="pass@k",
+                type="code_evaluation",
+                description="Percentage of problems where at least one solution in k generations passes all test cases",
+                implementation="custom_pass_at_k",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="test_case_success_rate",
+                type="code_evaluation",
+                description="Percentage of test cases passed across all problems",
+                implementation="custom_test_success_rate",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="syntax_validity",
+                type="code_evaluation",
+                description="Verifies that generated code is syntactically valid Python",
+                implementation="custom_syntax_check",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="code_similarity",
+                type="similarity",
+                description="Similarity between generated code and reference solution",
+                implementation="evaluate.load('code_eval')",
+                primary=False,
+            ),
+        ]
+

 if __name__ == "__main__":
     # Example usage
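
The two new methods give downstream code a structured way to inspect the benchmark. Below is a minimal usage sketch, assuming MBPPDatasetParser can be instantiated with no arguments (the constructor signature is not part of this diff); the attributes printed mirror the ones exercised in the tests that follow.

from llmdataparser.mbpp_parser import MBPPDatasetParser

parser = MBPPDatasetParser()  # assumption: default construction works, as the tests' fixture suggests

description = parser.get_dataset_description()
print(description.name)    # "Mostly Basic Python Problems (MBPP)"
print(description.source)  # google-research MBPP repository URL

for metric in parser.get_evaluation_metrics():
    # Exactly one metric (pass@k) is marked primary; the rest are secondary checks.
    print(f"{metric.name}: {metric.type} (primary={metric.primary})")
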
tests/test_mbpp_parser.py CHANGED
@@ -152,3 +152,74 @@ def test_custom_system_prompt():
 def test_default_system_prompt(parser):
     """Test parser uses default system prompt when none provided"""
     assert parser._system_prompt == parser._default_system_prompt
+
+
+def test_get_dataset_description(parser):
+    """Test dataset description generation."""
+    description = parser.get_dataset_description()
+
+    assert description.name == "Mostly Basic Python Problems (MBPP)"
+    assert "code generation" in description.purpose.lower()
+    assert "google-research" in description.source
+    assert description.language == "English and Python"
+    assert "task descriptions" in description.format.lower()
+    assert "python solutions" in description.format.lower()
+    assert "1,000" in description.characteristics
+    assert "entry-level programmers" in description.characteristics.lower()
+    assert "3 automated test cases" in description.characteristics
+    assert "hand-verified" in description.characteristics
+    assert "austin2021program" in description.citation
+    assert "Program Synthesis" in description.citation
+
+    # Check additional info
+    assert description.additional_info is not None
+    assert description.additional_info["size"] == "~1,000 programming problems"
+    assert (
+        description.additional_info["splits"]
+        == "Available in full or sanitized versions"
+    )
+    assert (
+        description.additional_info["test_coverage"]
+        == "Each problem includes 3 automated test cases"
+    )
+    assert (
+        description.additional_info["verification"]
+        == "Subset of data has been hand-verified by authors"
+    )
+
+
+def test_get_evaluation_metrics(parser):
+    """Test evaluation metrics generation."""
+    metrics = parser.get_evaluation_metrics()
+
+    # Check total number of metrics
+    assert len(metrics) == 4
+
+    # Check primary metrics
+    primary_metrics = [m for m in metrics if m.primary]
+    assert len(primary_metrics) == 1
+
+    # Verify specific metrics exist with correct properties
+    metric_names = {m.name for m in metrics}
+    assert "pass@k" in metric_names
+    assert "test_case_success_rate" in metric_names
+    assert "syntax_validity" in metric_names
+
+    # Check specific metric properties
+    pass_k_metric = next(m for m in metrics if m.name == "pass@k")
+    assert pass_k_metric.type == "code_evaluation"
+    assert pass_k_metric.primary is True
+    assert "k generations" in pass_k_metric.description.lower()
+    assert "custom_pass_at_k" in pass_k_metric.implementation
+
+    test_case_metric = next(m for m in metrics if m.name == "test_case_success_rate")
+    assert test_case_metric.type == "code_evaluation"
+    assert test_case_metric.primary is False
+    assert "test cases" in test_case_metric.description.lower()
+    assert "custom_test_success_rate" in test_case_metric.implementation
+
+    syntax_metric = next(m for m in metrics if m.name == "syntax_validity")
+    assert syntax_metric.type == "code_evaluation"
+    assert syntax_metric.primary is False
+    assert "syntactically valid" in syntax_metric.description.lower()
+    assert "custom_syntax_check" in syntax_metric.implementation