Commit 27ff91e (unverified) · JeffYang52415 committed · Parent(s): 1223129

refactor: citations

llmdataparser/gsm8k_parser.py CHANGED
@@ -95,11 +95,11 @@ class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
  "and cover basic arithmetic, word problems, and elementary algebra"
  ),
  citation="""@article{cobbe2021gsm8k,
- title={Training Verifiers to Solve Math Word Problems},
- author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
- journal={arXiv preprint arXiv:2110.14168},
- year={2021}
- }""",
+ title={Training Verifiers to Solve Math Word Problems},
+ author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
+ journal={arXiv preprint arXiv:2110.14168},
+ year={2021}
+ }""",
  )

  def get_evaluation_metrics(self) -> list[EvaluationMetric]:
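
The hunk above only re-indents the triple-quoted citation string, which is worth a note: Python triple-quoted literals keep the indentation of the surrounding class body, so the stored BibTeX carries leading spaces. A minimal sketch of flattening such a string for display; the stripping step is our illustration, not part of this commit:

import textwrap

# A citation defined inside an indented class body embeds the code's
# indentation in the string itself.
citation = """@article{cobbe2021gsm8k,
            title={Training Verifiers to Solve Math Word Problems},
            journal={arXiv preprint arXiv:2110.14168},
            year={2021}
            }"""

# textwrap.dedent removes only whitespace common to *all* lines, and the
# first line starts at column 0, so it is a no-op here. Stripping each
# line individually yields a flat BibTeX block.
flat = "\n".join(line.strip() for line in citation.splitlines())
print(textwrap.dedent(citation) == citation)  # True: dedent changes nothing
print(flat)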
llmdataparser/humaneval_parser.py CHANGED
@@ -94,13 +94,13 @@ class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
  "tests. Problems test basic programming, algorithms, and data structure skills"
  ),
  citation="""@article{chen2021codex,
- title={Evaluating Large Language Models Trained on Code},
- author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
- year={2021},
- eprint={2107.03374},
- archivePrefix={arXiv},
- primaryClass={cs.LG}
- }""",
+ title={Evaluating Large Language Models Trained on Code},
+ author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
+ year={2021},
+ eprint={2107.03374},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG}
+ }""",
  )

  def get_evaluation_metrics(self) -> list[EvaluationMetric]:
@@ -194,12 +194,12 @@ class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
  "and ensure production-quality code generation."
  ),
  citation="""@inproceedings{evalplus,
- title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
- author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
- booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
- year = {2023},
- url = {https://openreview.net/forum?id=1qvx610Cu7},
- }""",
+ title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
+ author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
+ booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
+ year = {2023},
+ url = {https://openreview.net/forum?id=1qvx610Cu7},
+ }""",
  )

  def get_evaluation_metrics(self) -> list[EvaluationMetric]:
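
The evalplus entry above nests braces inside its title ({Chat{GPT}}), which is easy to unbalance when re-indenting citation strings by hand. A small sanity check one could run over any of these fields; the helper is our sketch, not code from this repository:

def braces_balanced(bibtex: str) -> bool:
    """Return True if every '{' has a matching '}' and depth never goes negative."""
    depth = 0
    for ch in bibtex:
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth < 0:  # a '}' closed a brace that was never opened
                return False
    return depth == 0

# The nested {GPT} braces from the evalplus title stay balanced.
assert braces_balanced("@inproceedings{evalplus, title={Chat{GPT} Really Correct?}}")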
llmdataparser/ifeval_parser.py CHANGED
@@ -99,14 +99,14 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
  "for evaluating chat or instruction fine-tuned language models."
  ),
  citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
- title={Instruction-Following Evaluation for Large Language Models},
- author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
- year={2023},
- eprint={2311.07911},
- archivePrefix={arXiv},
- primaryClass={cs.CL},
- url={https://arxiv.org/abs/2311.07911}
- }""",
+ title={Instruction-Following Evaluation for Large Language Models},
+ author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
+ year={2023},
+ eprint={2311.07911},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2311.07911}
+ }""",
  )

  def get_evaluation_metrics(self) -> list[EvaluationMetric]:
llmdataparser/math_parser.py CHANGED
@@ -104,11 +104,11 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
  "Problems are categorized by subject area and difficulty level (1-5)."
  ),
  citation="""@article{hendrycksmath2021,
- title={Measuring Mathematical Problem Solving With the MATH Dataset},
- author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
- journal={NeurIPS},
- year={2021}
- }""",
+ title={Measuring Mathematical Problem Solving With the MATH Dataset},
+ author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
+ journal={NeurIPS},
+ year={2021}
+ }""",
  additional_info={
  "difficulty_levels": "1-5",
  "topics": [
llmdataparser/mgsm_parser.py CHANGED
@@ -113,18 +113,18 @@ class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
  "language models' ability to perform mathematical reasoning across different languages."
  ),
  citation="""@misc{shi2022language,
- title={Language Models are Multilingual Chain-of-Thought Reasoners},
- author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
- year={2022},
- eprint={2210.03057},
- archivePrefix={arXiv},
- primaryClass={cs.CL}
- }
+ title={Language Models are Multilingual Chain-of-Thought Reasoners},
+ author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
+ year={2022},
+ eprint={2210.03057},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }
  @article{cobbe2021gsm8k,
- title={Training Verifiers to Solve Math Word Problems},
- author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
- journal={arXiv preprint arXiv:2110.14168},
- year={2021}
+ title={Training Verifiers to Solve Math Word Problems},
+ author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
+ journal={arXiv preprint arXiv:2110.14168},
+ year={2021}
  }""",
  additional_info={
  "languages": [
llmdataparser/mmlu_parser.py CHANGED
@@ -220,17 +220,17 @@ class BaseMMLUDatasetParser(MMLUDatasetParser):
  "requires both extensive world knowledge and strong problem-solving capabilities."
  ),
  citation="""@article{hendryckstest2021,
- title={Measuring Massive Multitask Language Understanding},
- author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
- journal={Proceedings of the International Conference on Learning Representations (ICLR)},
- year={2021}
- }
+ title={Measuring Massive Multitask Language Understanding},
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+ journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+ year={2021}
+ }
  @article{hendrycks2021ethics,
- title={Aligning AI With Shared Human Values},
- author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
- journal={Proceedings of the International Conference on Learning Representations (ICLR)},
- year={2021}
- }""",
+ title={Aligning AI With Shared Human Values},
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
+ journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+ year={2021}
+ }""",
  additional_info={
  "subjects": "57 tasks/subjects",
  "categories": [
@@ -340,13 +340,13 @@ class MMLUReduxDatasetParser(MMLUDatasetParser):
  "against credible sources."
  ),
  citation="""@misc{gema2024mmlu,
- title={Are We Done with MMLU?},
- author={Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and Mohammad Reza Ghasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and Jean Kaddour and Emile van Krieken and Pasquale Minervini},
- year={2024},
- eprint={2406.04127},
- archivePrefix={arXiv},
- primaryClass={cs.CL}
- }""",
+ title={Are We Done with MMLU?},
+ author={Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and Mohammad Reza Ghasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and Jean Kaddour and Emile van Krieken and Pasquale Minervini},
+ year={2024},
+ eprint={2406.04127},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }""",
  additional_info={
  "size": "3,000 questions (100 per subject)",
  "subjects": "30 MMLU subjects",
@@ -503,11 +503,11 @@ class TMMLUPlusDatasetParser(MMLUDatasetParser):
  "parameters ranging from 1.8B to 72B."
  ),
  citation="""@article{ikala2024improved,
- title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
- author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
- journal={arXiv preprint arXiv:2403.01858},
- year={2024}
- }""",
+ title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
+ author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
+ journal={arXiv preprint arXiv:2403.01858},
+ year={2024}
+ }""",
  additional_info={
  "subjects": "66 diverse subjects",
  "difficulty_levels": ["Elementary", "Secondary", "Professional"],
@@ -630,11 +630,11 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
  "with expert review and GPT-4 assisted distractor generation."
  ),
  citation="""@article{wang2024mmlu,
- title={Mmlu-pro: A more robust and challenging multi-task language understanding benchmark},
- author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others},
- journal={arXiv preprint arXiv:2406.01574},
- year={2024}
- }""",
+ title={Mmlu-pro: A more robust and challenging multi-task language understanding benchmark},
+ author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others},
+ journal={arXiv preprint arXiv:2406.01574},
+ year={2024}
+ }""",
  additional_info={
  "size": "12,000 complex questions",
  "options": "Up to 10 choices per question",
llmdataparser/tmlu_parser.py CHANGED
@@ -137,19 +137,19 @@ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
  "and professional certifications"
  ),
  citation="""@article{DBLP:journals/corr/abs-2403-20180,
- author = {Po-Heng Chen and Sijia Cheng and Wei-Lin Chen and Yen-Ting Lin and Yun-Nung Chen},
- title = {Measuring Taiwanese Mandarin Language Understanding},
- journal = {CoRR},
- volume = {abs/2403.20180},
- year = {2024},
- url = {https://doi.org/10.48550/arXiv.2403.20180},
- doi = {10.48550/ARXIV.2403.20180},
- eprinttype = {arXiv},
- eprint = {2403.20180},
- timestamp = {Wed, 10 Apr 2024 17:37:45 +0200},
- biburl = {https://dblp.org/rec/journals/corr/abs-2403-20180.bib},
- bibsource = {dblp computer science bibliography, https://dblp.org}
- }""",
+ author = {Po-Heng Chen and Sijia Cheng and Wei-Lin Chen and Yen-Ting Lin and Yun-Nung Chen},
+ title = {Measuring Taiwanese Mandarin Language Understanding},
+ journal = {CoRR},
+ volume = {abs/2403.20180},
+ year = {2024},
+ url = {https://doi.org/10.48550/arXiv.2403.20180},
+ doi = {10.48550/ARXIV.2403.20180},
+ eprinttype = {arXiv},
+ eprint = {2403.20180},
+ timestamp = {Wed, 10 Apr 2024 17:37:45 +0200},
+ biburl = {https://dblp.org/rec/journals/corr/abs-2403-20180.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+ }""",
  )

  def get_evaluation_metrics(self) -> list[EvaluationMetric]:
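
The DBLP-style entry above carries a dozen key = {value} fields. For flat values without nested braces, a rough regex extractor is enough; this sketch is purely illustrative and a real BibTeX parser would be needed for titles like {Chat{GPT}}:

import re

entry = """@article{DBLP:journals/corr/abs-2403-20180,
    author = {Po-Heng Chen and Sijia Cheng and Wei-Lin Chen and Yen-Ting Lin and Yun-Nung Chen},
    title = {Measuring Taiwanese Mandarin Language Understanding},
    journal = {CoRR},
    year = {2024},
    doi = {10.48550/ARXIV.2403.20180}
}"""

# Capture "key = {value}" pairs whose value contains no nested braces.
fields = dict(re.findall(r"(\w+)\s*=\s*\{([^{}]*)\}", entry))
print(fields["title"])    # Measuring Taiwanese Mandarin Language Understanding
print(fields["journal"])  # CoRR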