refactor: citations
Browse files
- llmdataparser/gsm8k_parser.py +5 -5
- llmdataparser/humaneval_parser.py +13 -13
- llmdataparser/ifeval_parser.py +8 -8
- llmdataparser/math_parser.py +5 -5
- llmdataparser/mgsm_parser.py +11 -11
- llmdataparser/mmlu_parser.py +27 -27
- llmdataparser/tmlu_parser.py +13 -13
llmdataparser/gsm8k_parser.py
CHANGED
@@ -95,11 +95,11 @@ class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
|
|
95 |
"and cover basic arithmetic, word problems, and elementary algebra"
|
96 |
),
|
97 |
citation="""@article{cobbe2021gsm8k,
|
98 |
-
title={Training Verifiers to Solve Math Word Problems},
|
99 |
-
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
|
100 |
-
journal={arXiv preprint arXiv:2110.14168},
|
101 |
-
year={2021}
|
102 |
-
}""",
|
103 |
)
|
104 |
|
105 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
|
|
95 |
"and cover basic arithmetic, word problems, and elementary algebra"
|
96 |
),
|
97 |
citation="""@article{cobbe2021gsm8k,
|
98 |
+
title={Training Verifiers to Solve Math Word Problems},
|
99 |
+
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
|
100 |
+
journal={arXiv preprint arXiv:2110.14168},
|
101 |
+
year={2021}
|
102 |
+
}""",
|
103 |
)
|
104 |
|
105 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
llmdataparser/humaneval_parser.py
CHANGED
@@ -94,13 +94,13 @@ class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
|
|
94 |
"tests. Problems test basic programming, algorithms, and data structure skills"
|
95 |
),
|
96 |
citation="""@article{chen2021codex,
|
97 |
-
title={Evaluating Large Language Models Trained on Code},
|
98 |
-
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
|
99 |
-
year={2021},
|
100 |
-
eprint={2107.03374},
|
101 |
-
archivePrefix={arXiv},
|
102 |
-
primaryClass={cs.LG}
|
103 |
-
}""",
|
104 |
)
|
105 |
|
106 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
@@ -194,12 +194,12 @@ class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
|
|
194 |
"and ensure production-quality code generation."
|
195 |
),
|
196 |
citation="""@inproceedings{evalplus,
|
197 |
-
title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
|
198 |
-
author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
|
199 |
-
booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
|
200 |
-
year = {2023},
|
201 |
-
url = {https://openreview.net/forum?id=1qvx610Cu7},
|
202 |
-
}""",
|
203 |
)
|
204 |
|
205 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
|
|
94 |
"tests. Problems test basic programming, algorithms, and data structure skills"
|
95 |
),
|
96 |
citation="""@article{chen2021codex,
|
97 |
+
title={Evaluating Large Language Models Trained on Code},
|
98 |
+
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
|
99 |
+
year={2021},
|
100 |
+
eprint={2107.03374},
|
101 |
+
archivePrefix={arXiv},
|
102 |
+
primaryClass={cs.LG}
|
103 |
+
}""",
|
104 |
)
|
105 |
|
106 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
|
|
194 |
"and ensure production-quality code generation."
|
195 |
),
|
196 |
citation="""@inproceedings{evalplus,
|
197 |
+
title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
|
198 |
+
author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
|
199 |
+
booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
|
200 |
+
year = {2023},
|
201 |
+
url = {https://openreview.net/forum?id=1qvx610Cu7},
|
202 |
+
}""",
|
203 |
)
|
204 |
|
205 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
llmdataparser/ifeval_parser.py
CHANGED
@@ -99,14 +99,14 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
|
|
99 |
"for evaluating chat or instruction fine-tuned language models."
|
100 |
),
|
101 |
citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
|
102 |
-
title={Instruction-Following Evaluation for Large Language Models},
|
103 |
-
author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
|
104 |
-
year={2023},
|
105 |
-
eprint={2311.07911},
|
106 |
-
archivePrefix={arXiv},
|
107 |
-
primaryClass={cs.CL},
|
108 |
-
url={https://arxiv.org/abs/2311.07911}
|
109 |
-
}""",
|
110 |
)
|
111 |
|
112 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
|
|
99 |
"for evaluating chat or instruction fine-tuned language models."
|
100 |
),
|
101 |
citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
|
102 |
+
title={Instruction-Following Evaluation for Large Language Models},
|
103 |
+
author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
|
104 |
+
year={2023},
|
105 |
+
eprint={2311.07911},
|
106 |
+
archivePrefix={arXiv},
|
107 |
+
primaryClass={cs.CL},
|
108 |
+
url={https://arxiv.org/abs/2311.07911}
|
109 |
+
}""",
|
110 |
)
|
111 |
|
112 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
llmdataparser/math_parser.py
CHANGED
@@ -104,11 +104,11 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
|
|
104 |
"Problems are categorized by subject area and difficulty level (1-5)."
|
105 |
),
|
106 |
citation="""@article{hendrycksmath2021,
|
107 |
-
title={Measuring Mathematical Problem Solving With the MATH Dataset},
|
108 |
-
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
|
109 |
-
journal={NeurIPS},
|
110 |
-
year={2021}
|
111 |
-
}""",
|
112 |
additional_info={
|
113 |
"difficulty_levels": "1-5",
|
114 |
"topics": [
|
|
|
104 |
"Problems are categorized by subject area and difficulty level (1-5)."
|
105 |
),
|
106 |
citation="""@article{hendrycksmath2021,
|
107 |
+
title={Measuring Mathematical Problem Solving With the MATH Dataset},
|
108 |
+
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
|
109 |
+
journal={NeurIPS},
|
110 |
+
year={2021}
|
111 |
+
}""",
|
112 |
additional_info={
|
113 |
"difficulty_levels": "1-5",
|
114 |
"topics": [
|
llmdataparser/mgsm_parser.py
CHANGED
@@ -113,18 +113,18 @@ class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
|
|
113 |
"language models' ability to perform mathematical reasoning across different languages."
|
114 |
),
|
115 |
citation="""@misc{shi2022language,
|
116 |
-
title={Language Models are Multilingual Chain-of-Thought Reasoners},
|
117 |
-
author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
|
118 |
-
year={2022},
|
119 |
-
eprint={2210.03057},
|
120 |
-
archivePrefix={arXiv},
|
121 |
-
primaryClass={cs.CL}
|
122 |
-
}
|
123 |
@article{cobbe2021gsm8k,
|
124 |
-
title={Training Verifiers to Solve Math Word Problems},
|
125 |
-
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
|
126 |
-
journal={arXiv preprint arXiv:2110.14168},
|
127 |
-
year={2021}
|
128 |
}""",
|
129 |
additional_info={
|
130 |
"languages": [
|
|
|
113 |
"language models' ability to perform mathematical reasoning across different languages."
|
114 |
),
|
115 |
citation="""@misc{shi2022language,
|
116 |
+
title={Language Models are Multilingual Chain-of-Thought Reasoners},
|
117 |
+
author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
|
118 |
+
year={2022},
|
119 |
+
eprint={2210.03057},
|
120 |
+
archivePrefix={arXiv},
|
121 |
+
primaryClass={cs.CL}
|
122 |
+
}
|
123 |
@article{cobbe2021gsm8k,
|
124 |
+
title={Training Verifiers to Solve Math Word Problems},
|
125 |
+
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
|
126 |
+
journal={arXiv preprint arXiv:2110.14168},
|
127 |
+
year={2021}
|
128 |
}""",
|
129 |
additional_info={
|
130 |
"languages": [
|
llmdataparser/mmlu_parser.py
CHANGED
@@ -220,17 +220,17 @@ class BaseMMLUDatasetParser(MMLUDatasetParser):
|
|
220 |
"requires both extensive world knowledge and strong problem-solving capabilities."
|
221 |
),
|
222 |
citation="""@article{hendryckstest2021,
|
223 |
-
title={Measuring Massive Multitask Language Understanding},
|
224 |
-
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
225 |
-
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
|
226 |
-
year={2021}
|
227 |
-
}
|
228 |
@article{hendrycks2021ethics,
|
229 |
-
title={Aligning AI With Shared Human Values},
|
230 |
-
author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
|
231 |
-
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
|
232 |
-
year={2021}
|
233 |
-
}""",
|
234 |
additional_info={
|
235 |
"subjects": "57 tasks/subjects",
|
236 |
"categories": [
|
@@ -340,13 +340,13 @@ class MMLUReduxDatasetParser(MMLUDatasetParser):
|
|
340 |
"against credible sources."
|
341 |
),
|
342 |
citation="""@misc{gema2024mmlu,
|
343 |
-
title={Are We Done with MMLU?},
|
344 |
-
author={Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and Mohammad Reza Ghasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and Jean Kaddour and Emile van Krieken and Pasquale Minervini},
|
345 |
-
year={2024},
|
346 |
-
eprint={2406.04127},
|
347 |
-
archivePrefix={arXiv},
|
348 |
-
primaryClass={cs.CL}
|
349 |
-
}""",
|
350 |
additional_info={
|
351 |
"size": "3,000 questions (100 per subject)",
|
352 |
"subjects": "30 MMLU subjects",
|
@@ -503,11 +503,11 @@ class TMMLUPlusDatasetParser(MMLUDatasetParser):
|
|
503 |
"parameters ranging from 1.8B to 72B."
|
504 |
),
|
505 |
citation="""@article{ikala2024improved,
|
506 |
-
title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
|
507 |
-
author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
|
508 |
-
journal={arXiv preprint arXiv:2403.01858},
|
509 |
-
year={2024}
|
510 |
-
}""",
|
511 |
additional_info={
|
512 |
"subjects": "66 diverse subjects",
|
513 |
"difficulty_levels": ["Elementary", "Secondary", "Professional"],
|
@@ -630,11 +630,11 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
|
|
630 |
"with expert review and GPT-4 assisted distractor generation."
|
631 |
),
|
632 |
citation="""@article{wang2024mmlu,
|
633 |
-
title={Mmlu-pro: A more robust and challenging multi-task language understanding benchmark},
|
634 |
-
author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others},
|
635 |
-
journal={arXiv preprint arXiv:2406.01574},
|
636 |
-
year={2024}
|
637 |
-
}""",
|
638 |
additional_info={
|
639 |
"size": "12,000 complex questions",
|
640 |
"options": "Up to 10 choices per question",
|
|
|
220 |
"requires both extensive world knowledge and strong problem-solving capabilities."
|
221 |
),
|
222 |
citation="""@article{hendryckstest2021,
|
223 |
+
title={Measuring Massive Multitask Language Understanding},
|
224 |
+
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
225 |
+
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
|
226 |
+
year={2021}
|
227 |
+
}
|
228 |
@article{hendrycks2021ethics,
|
229 |
+
title={Aligning AI With Shared Human Values},
|
230 |
+
author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
|
231 |
+
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
|
232 |
+
year={2021}
|
233 |
+
}""",
|
234 |
additional_info={
|
235 |
"subjects": "57 tasks/subjects",
|
236 |
"categories": [
|
|
|
340 |
"against credible sources."
|
341 |
),
|
342 |
citation="""@misc{gema2024mmlu,
|
343 |
+
title={Are We Done with MMLU?},
|
344 |
+
author={Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and Mohammad Reza Ghasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and Jean Kaddour and Emile van Krieken and Pasquale Minervini},
|
345 |
+
year={2024},
|
346 |
+
eprint={2406.04127},
|
347 |
+
archivePrefix={arXiv},
|
348 |
+
primaryClass={cs.CL}
|
349 |
+
}""",
|
350 |
additional_info={
|
351 |
"size": "3,000 questions (100 per subject)",
|
352 |
"subjects": "30 MMLU subjects",
|
|
|
503 |
"parameters ranging from 1.8B to 72B."
|
504 |
),
|
505 |
citation="""@article{ikala2024improved,
|
506 |
+
title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
|
507 |
+
author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
|
508 |
+
journal={arXiv preprint arXiv:2403.01858},
|
509 |
+
year={2024}
|
510 |
+
}""",
|
511 |
additional_info={
|
512 |
"subjects": "66 diverse subjects",
|
513 |
"difficulty_levels": ["Elementary", "Secondary", "Professional"],
|
|
|
630 |
"with expert review and GPT-4 assisted distractor generation."
|
631 |
),
|
632 |
citation="""@article{wang2024mmlu,
|
633 |
+
title={Mmlu-pro: A more robust and challenging multi-task language understanding benchmark},
|
634 |
+
author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others},
|
635 |
+
journal={arXiv preprint arXiv:2406.01574},
|
636 |
+
year={2024}
|
637 |
+
}""",
|
638 |
additional_info={
|
639 |
"size": "12,000 complex questions",
|
640 |
"options": "Up to 10 choices per question",
|
llmdataparser/tmlu_parser.py
CHANGED
@@ -137,19 +137,19 @@ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
|
|
137 |
"and professional certifications"
|
138 |
),
|
139 |
citation="""@article{DBLP:journals/corr/abs-2403-20180,
|
140 |
-
author = {Po-Heng Chen and Sijia Cheng and Wei-Lin Chen and Yen-Ting Lin and Yun-Nung Chen},
|
141 |
-
title = {Measuring Taiwanese Mandarin Language Understanding},
|
142 |
-
journal = {CoRR},
|
143 |
-
volume = {abs/2403.20180},
|
144 |
-
year = {2024},
|
145 |
-
url = {https://doi.org/10.48550/arXiv.2403.20180},
|
146 |
-
doi = {10.48550/ARXIV.2403.20180},
|
147 |
-
eprinttype = {arXiv},
|
148 |
-
eprint = {2403.20180},
|
149 |
-
timestamp = {Wed, 10 Apr 2024 17:37:45 +0200},
|
150 |
-
biburl = {https://dblp.org/rec/journals/corr/abs-2403-20180.bib},
|
151 |
-
bibsource = {dblp computer science bibliography, https://dblp.org}
|
152 |
-
}""",
|
153 |
)
|
154 |
|
155 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
|
|
137 |
"and professional certifications"
|
138 |
),
|
139 |
citation="""@article{DBLP:journals/corr/abs-2403-20180,
|
140 |
+
author = {Po-Heng Chen and Sijia Cheng and Wei-Lin Chen and Yen-Ting Lin and Yun-Nung Chen},
|
141 |
+
title = {Measuring Taiwanese Mandarin Language Understanding},
|
142 |
+
journal = {CoRR},
|
143 |
+
volume = {abs/2403.20180},
|
144 |
+
year = {2024},
|
145 |
+
url = {https://doi.org/10.48550/arXiv.2403.20180},
|
146 |
+
doi = {10.48550/ARXIV.2403.20180},
|
147 |
+
eprinttype = {arXiv},
|
148 |
+
eprint = {2403.20180},
|
149 |
+
timestamp = {Wed, 10 Apr 2024 17:37:45 +0200},
|
150 |
+
biburl = {https://dblp.org/rec/journals/corr/abs-2403-20180.bib},
|
151 |
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
152 |
+
}""",
|
153 |
)
|
154 |
|
155 |
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|