lewtun HF Staff commited on
Commit
b9de37a
·
1 Parent(s): 33b1934
eval_results/lewtun/gemma-7b-sft-full-longest-1k/main/arc/results_2024-02-28T15-54-03.932261.json DELETED
@@ -1,88 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 359706.246646181,
9
- "end_time": 359831.155219081,
10
- "total_evaluation_time_secondes": "124.90857289999258",
11
- "model_name": "lewtun/gemma-7b-sft-full-longest-1k",
12
- "model_sha": "5b354486c4322c02994885f68f00d66a75275dd1",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "16.4 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|arc:challenge|25": {
19
- "acc": 0.39078498293515357,
20
- "acc_stderr": 0.014258563880513773,
21
- "acc_norm": 0.31569965870307165,
22
- "acc_norm_stderr": 0.013582571095815291
23
- }
24
- },
25
- "versions": {
26
- "lighteval|arc:challenge|25": 0
27
- },
28
- "config_tasks": {
29
- "lighteval|arc:challenge": {
30
- "name": "arc:challenge",
31
- "prompt_function": "arc",
32
- "hf_repo": "ai2_arc",
33
- "hf_subset": "ARC-Challenge",
34
- "metric": [
35
- "loglikelihood_acc",
36
- "loglikelihood_acc_norm_nospace"
37
- ],
38
- "hf_avail_splits": [
39
- "train",
40
- "test"
41
- ],
42
- "evaluation_splits": [
43
- "test"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": "random_sampling_from_train",
47
- "generation_size": 1,
48
- "stop_sequence": [
49
- "\n"
50
- ],
51
- "output_regex": null,
52
- "frozen": false,
53
- "suite": [
54
- "lighteval",
55
- "arc"
56
- ]
57
- }
58
- },
59
- "summary_tasks": {
60
- "lighteval|arc:challenge|25": {
61
- "hashes": {
62
- "hash_examples": "17b0cae357c0259e",
63
- "hash_full_prompts": "66498cebc864f4ca",
64
- "hash_input_tokens": "a57cf2c416de063f",
65
- "hash_cont_tokens": "1b3b6d7b2c7f1084"
66
- },
67
- "truncated": 0,
68
- "non_truncated": 1172,
69
- "padded": 4664,
70
- "non_padded": 23,
71
- "effective_few_shots": 25.0,
72
- "num_truncated_few_shots": 0
73
- }
74
- },
75
- "summary_general": {
76
- "hashes": {
77
- "hash_examples": "aaa6929c6d3771fb",
78
- "hash_full_prompts": "848db321e3b0d4cd",
79
- "hash_input_tokens": "2b59bc2f2de9c037",
80
- "hash_cont_tokens": "9255ea5037675476"
81
- },
82
- "truncated": 0,
83
- "non_truncated": 1172,
84
- "padded": 4664,
85
- "non_padded": 23,
86
- "num_truncated_few_shots": 0
87
- }
88
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/lewtun/gemma-7b-sft-full-longest-1k/main/gsm8k/results_2024-02-28T16-13-26.432022.json DELETED
@@ -1,86 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 6137973.644333813,
9
- "end_time": 6139261.405040871,
10
- "total_evaluation_time_secondes": "1287.7607070580125",
11
- "model_name": "lewtun/gemma-7b-sft-full-longest-1k",
12
- "model_sha": "5b354486c4322c02994885f68f00d66a75275dd1",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "16.4 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|gsm8k|5": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- }
22
- },
23
- "versions": {
24
- "lighteval|gsm8k|5": 0
25
- },
26
- "config_tasks": {
27
- "lighteval|gsm8k": {
28
- "name": "gsm8k",
29
- "prompt_function": "gsm8k",
30
- "hf_repo": "gsm8k",
31
- "hf_subset": "main",
32
- "metric": [
33
- "quasi_exact_match_gsm8k"
34
- ],
35
- "hf_avail_splits": [
36
- "train",
37
- "test"
38
- ],
39
- "evaluation_splits": [
40
- "test"
41
- ],
42
- "few_shots_split": null,
43
- "few_shots_select": "random_sampling_from_train",
44
- "generation_size": 256,
45
- "stop_sequence": [
46
- ":",
47
- "Question:",
48
- "Question"
49
- ],
50
- "output_regex": null,
51
- "frozen": false,
52
- "suite": [
53
- "lighteval"
54
- ]
55
- }
56
- },
57
- "summary_tasks": {
58
- "lighteval|gsm8k|5": {
59
- "hashes": {
60
- "hash_examples": "0ed016e24e7512fd",
61
- "hash_full_prompts": "a9c6515c2ecd2d07",
62
- "hash_input_tokens": "405769622b290f70",
63
- "hash_cont_tokens": "87c6baf31dde0857"
64
- },
65
- "truncated": 0,
66
- "non_truncated": 1319,
67
- "padded": 0,
68
- "non_padded": 1319,
69
- "effective_few_shots": 5.0,
70
- "num_truncated_few_shots": 0
71
- }
72
- },
73
- "summary_general": {
74
- "hashes": {
75
- "hash_examples": "bc71463e88551d0e",
76
- "hash_full_prompts": "f49691ec139bfeac",
77
- "hash_input_tokens": "c58b1910c1aa8688",
78
- "hash_cont_tokens": "2b4ff6b36558de9c"
79
- },
80
- "truncated": 0,
81
- "non_truncated": 1319,
82
- "padded": 0,
83
- "non_padded": 1319,
84
- "num_truncated_few_shots": 0
85
- }
86
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/lewtun/gemma-7b-sft-full-longest-1k/main/hellaswag/results_2024-02-28T16-02-23.662862.json DELETED
@@ -1,88 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 504726.626899214,
9
- "end_time": 505233.592411743,
10
- "total_evaluation_time_secondes": "506.96551252901554",
11
- "model_name": "lewtun/gemma-7b-sft-full-longest-1k",
12
- "model_sha": "5b354486c4322c02994885f68f00d66a75275dd1",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "16.4 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|hellaswag|10": {
19
- "acc": 0.58743278231428,
20
- "acc_stderr": 0.0049129004503708365,
21
- "acc_norm": 0.36825333598884685,
22
- "acc_norm_stderr": 0.004813448615404436
23
- }
24
- },
25
- "versions": {
26
- "lighteval|hellaswag|10": 0
27
- },
28
- "config_tasks": {
29
- "lighteval|hellaswag": {
30
- "name": "hellaswag",
31
- "prompt_function": "hellaswag_harness",
32
- "hf_repo": "hellaswag",
33
- "hf_subset": "default",
34
- "metric": [
35
- "loglikelihood_acc",
36
- "loglikelihood_acc_norm"
37
- ],
38
- "hf_avail_splits": [
39
- "train",
40
- "test",
41
- "validation"
42
- ],
43
- "evaluation_splits": [
44
- "validation"
45
- ],
46
- "few_shots_split": null,
47
- "few_shots_select": "random_sampling_from_train",
48
- "generation_size": -1,
49
- "stop_sequence": [
50
- "\n"
51
- ],
52
- "output_regex": null,
53
- "frozen": false,
54
- "suite": [
55
- "lighteval"
56
- ]
57
- }
58
- },
59
- "summary_tasks": {
60
- "lighteval|hellaswag|10": {
61
- "hashes": {
62
- "hash_examples": "31985c805c3a737e",
63
- "hash_full_prompts": "451e22e8c3aaeeb6",
64
- "hash_input_tokens": "3d82c80e65600713",
65
- "hash_cont_tokens": "c311c0b0860cae95"
66
- },
67
- "truncated": 0,
68
- "non_truncated": 10042,
69
- "padded": 39905,
70
- "non_padded": 263,
71
- "effective_few_shots": 10.0,
72
- "num_truncated_few_shots": 0
73
- }
74
- },
75
- "summary_general": {
76
- "hashes": {
77
- "hash_examples": "63bc2cf8bae03fbc",
78
- "hash_full_prompts": "b2452b7405d0326a",
79
- "hash_input_tokens": "55ce21941662c730",
80
- "hash_cont_tokens": "6b2a595d078ce69e"
81
- },
82
- "truncated": 0,
83
- "non_truncated": 10042,
84
- "padded": 39905,
85
- "non_padded": 263,
86
- "num_truncated_few_shots": 0
87
- }
88
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/lewtun/gemma-7b-sft-full-longest-1k/main/mmlu/results_2024-02-28T16-07-20.656683.json DELETED
@@ -1,2835 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 360593.602233905,
9
- "end_time": 361515.620827363,
10
- "total_evaluation_time_secondes": "922.0185934579931",
11
- "model_name": "lewtun/gemma-7b-sft-full-longest-1k",
12
- "model_sha": "5b354486c4322c02994885f68f00d66a75275dd1",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "16.4 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|mmlu:abstract_algebra|5": {
19
- "acc": 0.22,
20
- "acc_stderr": 0.04163331998932268
21
- },
22
- "lighteval|mmlu:anatomy|5": {
23
- "acc": 0.18518518518518517,
24
- "acc_stderr": 0.03355677216313142
25
- },
26
- "lighteval|mmlu:astronomy|5": {
27
- "acc": 0.17763157894736842,
28
- "acc_stderr": 0.031103182383123398
29
- },
30
- "lighteval|mmlu:business_ethics|5": {
31
- "acc": 0.3,
32
- "acc_stderr": 0.046056618647183814
33
- },
34
- "lighteval|mmlu:clinical_knowledge|5": {
35
- "acc": 0.21509433962264152,
36
- "acc_stderr": 0.02528839450289137
37
- },
38
- "lighteval|mmlu:college_biology|5": {
39
- "acc": 0.2569444444444444,
40
- "acc_stderr": 0.03653946969442099
41
- },
42
- "lighteval|mmlu:college_chemistry|5": {
43
- "acc": 0.2,
44
- "acc_stderr": 0.04020151261036845
45
- },
46
- "lighteval|mmlu:college_computer_science|5": {
47
- "acc": 0.26,
48
- "acc_stderr": 0.0440844002276808
49
- },
50
- "lighteval|mmlu:college_mathematics|5": {
51
- "acc": 0.21,
52
- "acc_stderr": 0.040936018074033256
53
- },
54
- "lighteval|mmlu:college_medicine|5": {
55
- "acc": 0.20809248554913296,
56
- "acc_stderr": 0.030952890217749874
57
- },
58
- "lighteval|mmlu:college_physics|5": {
59
- "acc": 0.21568627450980393,
60
- "acc_stderr": 0.04092563958237654
61
- },
62
- "lighteval|mmlu:computer_security|5": {
63
- "acc": 0.28,
64
- "acc_stderr": 0.045126085985421276
65
- },
66
- "lighteval|mmlu:conceptual_physics|5": {
67
- "acc": 0.26382978723404255,
68
- "acc_stderr": 0.028809989854102973
69
- },
70
- "lighteval|mmlu:econometrics|5": {
71
- "acc": 0.23684210526315788,
72
- "acc_stderr": 0.039994238792813365
73
- },
74
- "lighteval|mmlu:electrical_engineering|5": {
75
- "acc": 0.2413793103448276,
76
- "acc_stderr": 0.03565998174135302
77
- },
78
- "lighteval|mmlu:elementary_mathematics|5": {
79
- "acc": 0.20899470899470898,
80
- "acc_stderr": 0.02094048156533486
81
- },
82
- "lighteval|mmlu:formal_logic|5": {
83
- "acc": 0.2857142857142857,
84
- "acc_stderr": 0.04040610178208841
85
- },
86
- "lighteval|mmlu:global_facts|5": {
87
- "acc": 0.18,
88
- "acc_stderr": 0.038612291966536934
89
- },
90
- "lighteval|mmlu:high_school_biology|5": {
91
- "acc": 0.1774193548387097,
92
- "acc_stderr": 0.02173254068932927
93
- },
94
- "lighteval|mmlu:high_school_chemistry|5": {
95
- "acc": 0.15270935960591134,
96
- "acc_stderr": 0.02530890453938063
97
- },
98
- "lighteval|mmlu:high_school_computer_science|5": {
99
- "acc": 0.25,
100
- "acc_stderr": 0.04351941398892446
101
- },
102
- "lighteval|mmlu:high_school_european_history|5": {
103
- "acc": 0.21818181818181817,
104
- "acc_stderr": 0.03225078108306289
105
- },
106
- "lighteval|mmlu:high_school_geography|5": {
107
- "acc": 0.17676767676767677,
108
- "acc_stderr": 0.027178752639044915
109
- },
110
- "lighteval|mmlu:high_school_government_and_politics|5": {
111
- "acc": 0.19689119170984457,
112
- "acc_stderr": 0.028697873971860664
113
- },
114
- "lighteval|mmlu:high_school_macroeconomics|5": {
115
- "acc": 0.20256410256410257,
116
- "acc_stderr": 0.020377660970371372
117
- },
118
- "lighteval|mmlu:high_school_mathematics|5": {
119
- "acc": 0.2111111111111111,
120
- "acc_stderr": 0.024882116857655075
121
- },
122
- "lighteval|mmlu:high_school_microeconomics|5": {
123
- "acc": 0.21008403361344538,
124
- "acc_stderr": 0.026461398717471874
125
- },
126
- "lighteval|mmlu:high_school_physics|5": {
127
- "acc": 0.1986754966887417,
128
- "acc_stderr": 0.03257847384436776
129
- },
130
- "lighteval|mmlu:high_school_psychology|5": {
131
- "acc": 0.1926605504587156,
132
- "acc_stderr": 0.016909276884936094
133
- },
134
- "lighteval|mmlu:high_school_statistics|5": {
135
- "acc": 0.1527777777777778,
136
- "acc_stderr": 0.024536326026134224
137
- },
138
- "lighteval|mmlu:high_school_us_history|5": {
139
- "acc": 0.25,
140
- "acc_stderr": 0.03039153369274154
141
- },
142
- "lighteval|mmlu:high_school_world_history|5": {
143
- "acc": 0.270042194092827,
144
- "acc_stderr": 0.028900721906293426
145
- },
146
- "lighteval|mmlu:human_aging|5": {
147
- "acc": 0.31390134529147984,
148
- "acc_stderr": 0.031146796482972465
149
- },
150
- "lighteval|mmlu:human_sexuality|5": {
151
- "acc": 0.2595419847328244,
152
- "acc_stderr": 0.03844876139785271
153
- },
154
- "lighteval|mmlu:international_law|5": {
155
- "acc": 0.2396694214876033,
156
- "acc_stderr": 0.03896878985070417
157
- },
158
- "lighteval|mmlu:jurisprudence|5": {
159
- "acc": 0.25925925925925924,
160
- "acc_stderr": 0.042365112580946336
161
- },
162
- "lighteval|mmlu:logical_fallacies|5": {
163
- "acc": 0.22085889570552147,
164
- "acc_stderr": 0.032591773927421776
165
- },
166
- "lighteval|mmlu:machine_learning|5": {
167
- "acc": 0.3125,
168
- "acc_stderr": 0.043994650575715215
169
- },
170
- "lighteval|mmlu:management|5": {
171
- "acc": 0.17475728155339806,
172
- "acc_stderr": 0.037601780060266224
173
- },
174
- "lighteval|mmlu:marketing|5": {
175
- "acc": 0.2905982905982906,
176
- "acc_stderr": 0.02974504857267404
177
- },
178
- "lighteval|mmlu:medical_genetics|5": {
179
- "acc": 0.3,
180
- "acc_stderr": 0.046056618647183814
181
- },
182
- "lighteval|mmlu:miscellaneous|5": {
183
- "acc": 0.23754789272030652,
184
- "acc_stderr": 0.015218733046150193
185
- },
186
- "lighteval|mmlu:moral_disputes|5": {
187
- "acc": 0.24855491329479767,
188
- "acc_stderr": 0.023267528432100174
189
- },
190
- "lighteval|mmlu:moral_scenarios|5": {
191
- "acc": 0.23798882681564246,
192
- "acc_stderr": 0.014242630070574915
193
- },
194
- "lighteval|mmlu:nutrition|5": {
195
- "acc": 0.22549019607843138,
196
- "acc_stderr": 0.023929155517351284
197
- },
198
- "lighteval|mmlu:philosophy|5": {
199
- "acc": 0.1864951768488746,
200
- "acc_stderr": 0.02212243977248077
201
- },
202
- "lighteval|mmlu:prehistory|5": {
203
- "acc": 0.21604938271604937,
204
- "acc_stderr": 0.022899162918445806
205
- },
206
- "lighteval|mmlu:professional_accounting|5": {
207
- "acc": 0.23404255319148937,
208
- "acc_stderr": 0.025257861359432417
209
- },
210
- "lighteval|mmlu:professional_law|5": {
211
- "acc": 0.2457627118644068,
212
- "acc_stderr": 0.010996156635142692
213
- },
214
- "lighteval|mmlu:professional_medicine|5": {
215
- "acc": 0.18382352941176472,
216
- "acc_stderr": 0.023529242185193106
217
- },
218
- "lighteval|mmlu:professional_psychology|5": {
219
- "acc": 0.25,
220
- "acc_stderr": 0.01751781884501444
221
- },
222
- "lighteval|mmlu:public_relations|5": {
223
- "acc": 0.21818181818181817,
224
- "acc_stderr": 0.03955932861795833
225
- },
226
- "lighteval|mmlu:security_studies|5": {
227
- "acc": 0.19183673469387755,
228
- "acc_stderr": 0.025206963154225395
229
- },
230
- "lighteval|mmlu:sociology|5": {
231
- "acc": 0.24378109452736318,
232
- "acc_stderr": 0.03036049015401465
233
- },
234
- "lighteval|mmlu:us_foreign_policy|5": {
235
- "acc": 0.28,
236
- "acc_stderr": 0.04512608598542128
237
- },
238
- "lighteval|mmlu:virology|5": {
239
- "acc": 0.28313253012048195,
240
- "acc_stderr": 0.03507295431370518
241
- },
242
- "lighteval|mmlu:world_religions|5": {
243
- "acc": 0.3216374269005848,
244
- "acc_stderr": 0.03582529442573122
245
- },
246
- "lighteval|mmlu:_average|5": {
247
- "acc": 0.2312401831441149,
248
- "acc_stderr": 0.031501830581055885
249
- }
250
- },
251
- "versions": {
252
- "lighteval|mmlu:abstract_algebra|5": 0,
253
- "lighteval|mmlu:anatomy|5": 0,
254
- "lighteval|mmlu:astronomy|5": 0,
255
- "lighteval|mmlu:business_ethics|5": 0,
256
- "lighteval|mmlu:clinical_knowledge|5": 0,
257
- "lighteval|mmlu:college_biology|5": 0,
258
- "lighteval|mmlu:college_chemistry|5": 0,
259
- "lighteval|mmlu:college_computer_science|5": 0,
260
- "lighteval|mmlu:college_mathematics|5": 0,
261
- "lighteval|mmlu:college_medicine|5": 0,
262
- "lighteval|mmlu:college_physics|5": 0,
263
- "lighteval|mmlu:computer_security|5": 0,
264
- "lighteval|mmlu:conceptual_physics|5": 0,
265
- "lighteval|mmlu:econometrics|5": 0,
266
- "lighteval|mmlu:electrical_engineering|5": 0,
267
- "lighteval|mmlu:elementary_mathematics|5": 0,
268
- "lighteval|mmlu:formal_logic|5": 0,
269
- "lighteval|mmlu:global_facts|5": 0,
270
- "lighteval|mmlu:high_school_biology|5": 0,
271
- "lighteval|mmlu:high_school_chemistry|5": 0,
272
- "lighteval|mmlu:high_school_computer_science|5": 0,
273
- "lighteval|mmlu:high_school_european_history|5": 0,
274
- "lighteval|mmlu:high_school_geography|5": 0,
275
- "lighteval|mmlu:high_school_government_and_politics|5": 0,
276
- "lighteval|mmlu:high_school_macroeconomics|5": 0,
277
- "lighteval|mmlu:high_school_mathematics|5": 0,
278
- "lighteval|mmlu:high_school_microeconomics|5": 0,
279
- "lighteval|mmlu:high_school_physics|5": 0,
280
- "lighteval|mmlu:high_school_psychology|5": 0,
281
- "lighteval|mmlu:high_school_statistics|5": 0,
282
- "lighteval|mmlu:high_school_us_history|5": 0,
283
- "lighteval|mmlu:high_school_world_history|5": 0,
284
- "lighteval|mmlu:human_aging|5": 0,
285
- "lighteval|mmlu:human_sexuality|5": 0,
286
- "lighteval|mmlu:international_law|5": 0,
287
- "lighteval|mmlu:jurisprudence|5": 0,
288
- "lighteval|mmlu:logical_fallacies|5": 0,
289
- "lighteval|mmlu:machine_learning|5": 0,
290
- "lighteval|mmlu:management|5": 0,
291
- "lighteval|mmlu:marketing|5": 0,
292
- "lighteval|mmlu:medical_genetics|5": 0,
293
- "lighteval|mmlu:miscellaneous|5": 0,
294
- "lighteval|mmlu:moral_disputes|5": 0,
295
- "lighteval|mmlu:moral_scenarios|5": 0,
296
- "lighteval|mmlu:nutrition|5": 0,
297
- "lighteval|mmlu:philosophy|5": 0,
298
- "lighteval|mmlu:prehistory|5": 0,
299
- "lighteval|mmlu:professional_accounting|5": 0,
300
- "lighteval|mmlu:professional_law|5": 0,
301
- "lighteval|mmlu:professional_medicine|5": 0,
302
- "lighteval|mmlu:professional_psychology|5": 0,
303
- "lighteval|mmlu:public_relations|5": 0,
304
- "lighteval|mmlu:security_studies|5": 0,
305
- "lighteval|mmlu:sociology|5": 0,
306
- "lighteval|mmlu:us_foreign_policy|5": 0,
307
- "lighteval|mmlu:virology|5": 0,
308
- "lighteval|mmlu:world_religions|5": 0
309
- },
310
- "config_tasks": {
311
- "lighteval|mmlu:abstract_algebra": {
312
- "name": "mmlu:abstract_algebra",
313
- "prompt_function": "mmlu_harness",
314
- "hf_repo": "lighteval/mmlu",
315
- "hf_subset": "abstract_algebra",
316
- "metric": [
317
- "loglikelihood_acc"
318
- ],
319
- "hf_avail_splits": [
320
- "auxiliary_train",
321
- "test",
322
- "validation",
323
- "dev"
324
- ],
325
- "evaluation_splits": [
326
- "test"
327
- ],
328
- "few_shots_split": "dev",
329
- "few_shots_select": "sequential",
330
- "generation_size": 1,
331
- "stop_sequence": [
332
- "\n"
333
- ],
334
- "output_regex": null,
335
- "frozen": false,
336
- "suite": [
337
- "lighteval",
338
- "mmlu"
339
- ]
340
- },
341
- "lighteval|mmlu:anatomy": {
342
- "name": "mmlu:anatomy",
343
- "prompt_function": "mmlu_harness",
344
- "hf_repo": "lighteval/mmlu",
345
- "hf_subset": "anatomy",
346
- "metric": [
347
- "loglikelihood_acc"
348
- ],
349
- "hf_avail_splits": [
350
- "auxiliary_train",
351
- "test",
352
- "validation",
353
- "dev"
354
- ],
355
- "evaluation_splits": [
356
- "test"
357
- ],
358
- "few_shots_split": "dev",
359
- "few_shots_select": "sequential",
360
- "generation_size": 1,
361
- "stop_sequence": [
362
- "\n"
363
- ],
364
- "output_regex": null,
365
- "frozen": false,
366
- "suite": [
367
- "lighteval",
368
- "mmlu"
369
- ]
370
- },
371
- "lighteval|mmlu:astronomy": {
372
- "name": "mmlu:astronomy",
373
- "prompt_function": "mmlu_harness",
374
- "hf_repo": "lighteval/mmlu",
375
- "hf_subset": "astronomy",
376
- "metric": [
377
- "loglikelihood_acc"
378
- ],
379
- "hf_avail_splits": [
380
- "auxiliary_train",
381
- "test",
382
- "validation",
383
- "dev"
384
- ],
385
- "evaluation_splits": [
386
- "test"
387
- ],
388
- "few_shots_split": "dev",
389
- "few_shots_select": "sequential",
390
- "generation_size": 1,
391
- "stop_sequence": [
392
- "\n"
393
- ],
394
- "output_regex": null,
395
- "frozen": false,
396
- "suite": [
397
- "lighteval",
398
- "mmlu"
399
- ]
400
- },
401
- "lighteval|mmlu:business_ethics": {
402
- "name": "mmlu:business_ethics",
403
- "prompt_function": "mmlu_harness",
404
- "hf_repo": "lighteval/mmlu",
405
- "hf_subset": "business_ethics",
406
- "metric": [
407
- "loglikelihood_acc"
408
- ],
409
- "hf_avail_splits": [
410
- "auxiliary_train",
411
- "test",
412
- "validation",
413
- "dev"
414
- ],
415
- "evaluation_splits": [
416
- "test"
417
- ],
418
- "few_shots_split": "dev",
419
- "few_shots_select": "sequential",
420
- "generation_size": 1,
421
- "stop_sequence": [
422
- "\n"
423
- ],
424
- "output_regex": null,
425
- "frozen": false,
426
- "suite": [
427
- "lighteval",
428
- "mmlu"
429
- ]
430
- },
431
- "lighteval|mmlu:clinical_knowledge": {
432
- "name": "mmlu:clinical_knowledge",
433
- "prompt_function": "mmlu_harness",
434
- "hf_repo": "lighteval/mmlu",
435
- "hf_subset": "clinical_knowledge",
436
- "metric": [
437
- "loglikelihood_acc"
438
- ],
439
- "hf_avail_splits": [
440
- "auxiliary_train",
441
- "test",
442
- "validation",
443
- "dev"
444
- ],
445
- "evaluation_splits": [
446
- "test"
447
- ],
448
- "few_shots_split": "dev",
449
- "few_shots_select": "sequential",
450
- "generation_size": 1,
451
- "stop_sequence": [
452
- "\n"
453
- ],
454
- "output_regex": null,
455
- "frozen": false,
456
- "suite": [
457
- "lighteval",
458
- "mmlu"
459
- ]
460
- },
461
- "lighteval|mmlu:college_biology": {
462
- "name": "mmlu:college_biology",
463
- "prompt_function": "mmlu_harness",
464
- "hf_repo": "lighteval/mmlu",
465
- "hf_subset": "college_biology",
466
- "metric": [
467
- "loglikelihood_acc"
468
- ],
469
- "hf_avail_splits": [
470
- "auxiliary_train",
471
- "test",
472
- "validation",
473
- "dev"
474
- ],
475
- "evaluation_splits": [
476
- "test"
477
- ],
478
- "few_shots_split": "dev",
479
- "few_shots_select": "sequential",
480
- "generation_size": 1,
481
- "stop_sequence": [
482
- "\n"
483
- ],
484
- "output_regex": null,
485
- "frozen": false,
486
- "suite": [
487
- "lighteval",
488
- "mmlu"
489
- ]
490
- },
491
- "lighteval|mmlu:college_chemistry": {
492
- "name": "mmlu:college_chemistry",
493
- "prompt_function": "mmlu_harness",
494
- "hf_repo": "lighteval/mmlu",
495
- "hf_subset": "college_chemistry",
496
- "metric": [
497
- "loglikelihood_acc"
498
- ],
499
- "hf_avail_splits": [
500
- "auxiliary_train",
501
- "test",
502
- "validation",
503
- "dev"
504
- ],
505
- "evaluation_splits": [
506
- "test"
507
- ],
508
- "few_shots_split": "dev",
509
- "few_shots_select": "sequential",
510
- "generation_size": 1,
511
- "stop_sequence": [
512
- "\n"
513
- ],
514
- "output_regex": null,
515
- "frozen": false,
516
- "suite": [
517
- "lighteval",
518
- "mmlu"
519
- ]
520
- },
521
- "lighteval|mmlu:college_computer_science": {
522
- "name": "mmlu:college_computer_science",
523
- "prompt_function": "mmlu_harness",
524
- "hf_repo": "lighteval/mmlu",
525
- "hf_subset": "college_computer_science",
526
- "metric": [
527
- "loglikelihood_acc"
528
- ],
529
- "hf_avail_splits": [
530
- "auxiliary_train",
531
- "test",
532
- "validation",
533
- "dev"
534
- ],
535
- "evaluation_splits": [
536
- "test"
537
- ],
538
- "few_shots_split": "dev",
539
- "few_shots_select": "sequential",
540
- "generation_size": 1,
541
- "stop_sequence": [
542
- "\n"
543
- ],
544
- "output_regex": null,
545
- "frozen": false,
546
- "suite": [
547
- "lighteval",
548
- "mmlu"
549
- ]
550
- },
551
- "lighteval|mmlu:college_mathematics": {
552
- "name": "mmlu:college_mathematics",
553
- "prompt_function": "mmlu_harness",
554
- "hf_repo": "lighteval/mmlu",
555
- "hf_subset": "college_mathematics",
556
- "metric": [
557
- "loglikelihood_acc"
558
- ],
559
- "hf_avail_splits": [
560
- "auxiliary_train",
561
- "test",
562
- "validation",
563
- "dev"
564
- ],
565
- "evaluation_splits": [
566
- "test"
567
- ],
568
- "few_shots_split": "dev",
569
- "few_shots_select": "sequential",
570
- "generation_size": 1,
571
- "stop_sequence": [
572
- "\n"
573
- ],
574
- "output_regex": null,
575
- "frozen": false,
576
- "suite": [
577
- "lighteval",
578
- "mmlu"
579
- ]
580
- },
581
- "lighteval|mmlu:college_medicine": {
582
- "name": "mmlu:college_medicine",
583
- "prompt_function": "mmlu_harness",
584
- "hf_repo": "lighteval/mmlu",
585
- "hf_subset": "college_medicine",
586
- "metric": [
587
- "loglikelihood_acc"
588
- ],
589
- "hf_avail_splits": [
590
- "auxiliary_train",
591
- "test",
592
- "validation",
593
- "dev"
594
- ],
595
- "evaluation_splits": [
596
- "test"
597
- ],
598
- "few_shots_split": "dev",
599
- "few_shots_select": "sequential",
600
- "generation_size": 1,
601
- "stop_sequence": [
602
- "\n"
603
- ],
604
- "output_regex": null,
605
- "frozen": false,
606
- "suite": [
607
- "lighteval",
608
- "mmlu"
609
- ]
610
- },
611
- "lighteval|mmlu:college_physics": {
612
- "name": "mmlu:college_physics",
613
- "prompt_function": "mmlu_harness",
614
- "hf_repo": "lighteval/mmlu",
615
- "hf_subset": "college_physics",
616
- "metric": [
617
- "loglikelihood_acc"
618
- ],
619
- "hf_avail_splits": [
620
- "auxiliary_train",
621
- "test",
622
- "validation",
623
- "dev"
624
- ],
625
- "evaluation_splits": [
626
- "test"
627
- ],
628
- "few_shots_split": "dev",
629
- "few_shots_select": "sequential",
630
- "generation_size": 1,
631
- "stop_sequence": [
632
- "\n"
633
- ],
634
- "output_regex": null,
635
- "frozen": false,
636
- "suite": [
637
- "lighteval",
638
- "mmlu"
639
- ]
640
- },
641
- "lighteval|mmlu:computer_security": {
642
- "name": "mmlu:computer_security",
643
- "prompt_function": "mmlu_harness",
644
- "hf_repo": "lighteval/mmlu",
645
- "hf_subset": "computer_security",
646
- "metric": [
647
- "loglikelihood_acc"
648
- ],
649
- "hf_avail_splits": [
650
- "auxiliary_train",
651
- "test",
652
- "validation",
653
- "dev"
654
- ],
655
- "evaluation_splits": [
656
- "test"
657
- ],
658
- "few_shots_split": "dev",
659
- "few_shots_select": "sequential",
660
- "generation_size": 1,
661
- "stop_sequence": [
662
- "\n"
663
- ],
664
- "output_regex": null,
665
- "frozen": false,
666
- "suite": [
667
- "lighteval",
668
- "mmlu"
669
- ]
670
- },
671
- "lighteval|mmlu:conceptual_physics": {
672
- "name": "mmlu:conceptual_physics",
673
- "prompt_function": "mmlu_harness",
674
- "hf_repo": "lighteval/mmlu",
675
- "hf_subset": "conceptual_physics",
676
- "metric": [
677
- "loglikelihood_acc"
678
- ],
679
- "hf_avail_splits": [
680
- "auxiliary_train",
681
- "test",
682
- "validation",
683
- "dev"
684
- ],
685
- "evaluation_splits": [
686
- "test"
687
- ],
688
- "few_shots_split": "dev",
689
- "few_shots_select": "sequential",
690
- "generation_size": 1,
691
- "stop_sequence": [
692
- "\n"
693
- ],
694
- "output_regex": null,
695
- "frozen": false,
696
- "suite": [
697
- "lighteval",
698
- "mmlu"
699
- ]
700
- },
701
- "lighteval|mmlu:econometrics": {
702
- "name": "mmlu:econometrics",
703
- "prompt_function": "mmlu_harness",
704
- "hf_repo": "lighteval/mmlu",
705
- "hf_subset": "econometrics",
706
- "metric": [
707
- "loglikelihood_acc"
708
- ],
709
- "hf_avail_splits": [
710
- "auxiliary_train",
711
- "test",
712
- "validation",
713
- "dev"
714
- ],
715
- "evaluation_splits": [
716
- "test"
717
- ],
718
- "few_shots_split": "dev",
719
- "few_shots_select": "sequential",
720
- "generation_size": 1,
721
- "stop_sequence": [
722
- "\n"
723
- ],
724
- "output_regex": null,
725
- "frozen": false,
726
- "suite": [
727
- "lighteval",
728
- "mmlu"
729
- ]
730
- },
731
- "lighteval|mmlu:electrical_engineering": {
732
- "name": "mmlu:electrical_engineering",
733
- "prompt_function": "mmlu_harness",
734
- "hf_repo": "lighteval/mmlu",
735
- "hf_subset": "electrical_engineering",
736
- "metric": [
737
- "loglikelihood_acc"
738
- ],
739
- "hf_avail_splits": [
740
- "auxiliary_train",
741
- "test",
742
- "validation",
743
- "dev"
744
- ],
745
- "evaluation_splits": [
746
- "test"
747
- ],
748
- "few_shots_split": "dev",
749
- "few_shots_select": "sequential",
750
- "generation_size": 1,
751
- "stop_sequence": [
752
- "\n"
753
- ],
754
- "output_regex": null,
755
- "frozen": false,
756
- "suite": [
757
- "lighteval",
758
- "mmlu"
759
- ]
760
- },
761
- "lighteval|mmlu:elementary_mathematics": {
762
- "name": "mmlu:elementary_mathematics",
763
- "prompt_function": "mmlu_harness",
764
- "hf_repo": "lighteval/mmlu",
765
- "hf_subset": "elementary_mathematics",
766
- "metric": [
767
- "loglikelihood_acc"
768
- ],
769
- "hf_avail_splits": [
770
- "auxiliary_train",
771
- "test",
772
- "validation",
773
- "dev"
774
- ],
775
- "evaluation_splits": [
776
- "test"
777
- ],
778
- "few_shots_split": "dev",
779
- "few_shots_select": "sequential",
780
- "generation_size": 1,
781
- "stop_sequence": [
782
- "\n"
783
- ],
784
- "output_regex": null,
785
- "frozen": false,
786
- "suite": [
787
- "lighteval",
788
- "mmlu"
789
- ]
790
- },
791
- "lighteval|mmlu:formal_logic": {
792
- "name": "mmlu:formal_logic",
793
- "prompt_function": "mmlu_harness",
794
- "hf_repo": "lighteval/mmlu",
795
- "hf_subset": "formal_logic",
796
- "metric": [
797
- "loglikelihood_acc"
798
- ],
799
- "hf_avail_splits": [
800
- "auxiliary_train",
801
- "test",
802
- "validation",
803
- "dev"
804
- ],
805
- "evaluation_splits": [
806
- "test"
807
- ],
808
- "few_shots_split": "dev",
809
- "few_shots_select": "sequential",
810
- "generation_size": 1,
811
- "stop_sequence": [
812
- "\n"
813
- ],
814
- "output_regex": null,
815
- "frozen": false,
816
- "suite": [
817
- "lighteval",
818
- "mmlu"
819
- ]
820
- },
821
- "lighteval|mmlu:global_facts": {
822
- "name": "mmlu:global_facts",
823
- "prompt_function": "mmlu_harness",
824
- "hf_repo": "lighteval/mmlu",
825
- "hf_subset": "global_facts",
826
- "metric": [
827
- "loglikelihood_acc"
828
- ],
829
- "hf_avail_splits": [
830
- "auxiliary_train",
831
- "test",
832
- "validation",
833
- "dev"
834
- ],
835
- "evaluation_splits": [
836
- "test"
837
- ],
838
- "few_shots_split": "dev",
839
- "few_shots_select": "sequential",
840
- "generation_size": 1,
841
- "stop_sequence": [
842
- "\n"
843
- ],
844
- "output_regex": null,
845
- "frozen": false,
846
- "suite": [
847
- "lighteval",
848
- "mmlu"
849
- ]
850
- },
851
- "lighteval|mmlu:high_school_biology": {
852
- "name": "mmlu:high_school_biology",
853
- "prompt_function": "mmlu_harness",
854
- "hf_repo": "lighteval/mmlu",
855
- "hf_subset": "high_school_biology",
856
- "metric": [
857
- "loglikelihood_acc"
858
- ],
859
- "hf_avail_splits": [
860
- "auxiliary_train",
861
- "test",
862
- "validation",
863
- "dev"
864
- ],
865
- "evaluation_splits": [
866
- "test"
867
- ],
868
- "few_shots_split": "dev",
869
- "few_shots_select": "sequential",
870
- "generation_size": 1,
871
- "stop_sequence": [
872
- "\n"
873
- ],
874
- "output_regex": null,
875
- "frozen": false,
876
- "suite": [
877
- "lighteval",
878
- "mmlu"
879
- ]
880
- },
881
- "lighteval|mmlu:high_school_chemistry": {
882
- "name": "mmlu:high_school_chemistry",
883
- "prompt_function": "mmlu_harness",
884
- "hf_repo": "lighteval/mmlu",
885
- "hf_subset": "high_school_chemistry",
886
- "metric": [
887
- "loglikelihood_acc"
888
- ],
889
- "hf_avail_splits": [
890
- "auxiliary_train",
891
- "test",
892
- "validation",
893
- "dev"
894
- ],
895
- "evaluation_splits": [
896
- "test"
897
- ],
898
- "few_shots_split": "dev",
899
- "few_shots_select": "sequential",
900
- "generation_size": 1,
901
- "stop_sequence": [
902
- "\n"
903
- ],
904
- "output_regex": null,
905
- "frozen": false,
906
- "suite": [
907
- "lighteval",
908
- "mmlu"
909
- ]
910
- },
911
- "lighteval|mmlu:high_school_computer_science": {
912
- "name": "mmlu:high_school_computer_science",
913
- "prompt_function": "mmlu_harness",
914
- "hf_repo": "lighteval/mmlu",
915
- "hf_subset": "high_school_computer_science",
916
- "metric": [
917
- "loglikelihood_acc"
918
- ],
919
- "hf_avail_splits": [
920
- "auxiliary_train",
921
- "test",
922
- "validation",
923
- "dev"
924
- ],
925
- "evaluation_splits": [
926
- "test"
927
- ],
928
- "few_shots_split": "dev",
929
- "few_shots_select": "sequential",
930
- "generation_size": 1,
931
- "stop_sequence": [
932
- "\n"
933
- ],
934
- "output_regex": null,
935
- "frozen": false,
936
- "suite": [
937
- "lighteval",
938
- "mmlu"
939
- ]
940
- },
941
- "lighteval|mmlu:high_school_european_history": {
942
- "name": "mmlu:high_school_european_history",
943
- "prompt_function": "mmlu_harness",
944
- "hf_repo": "lighteval/mmlu",
945
- "hf_subset": "high_school_european_history",
946
- "metric": [
947
- "loglikelihood_acc"
948
- ],
949
- "hf_avail_splits": [
950
- "auxiliary_train",
951
- "test",
952
- "validation",
953
- "dev"
954
- ],
955
- "evaluation_splits": [
956
- "test"
957
- ],
958
- "few_shots_split": "dev",
959
- "few_shots_select": "sequential",
960
- "generation_size": 1,
961
- "stop_sequence": [
962
- "\n"
963
- ],
964
- "output_regex": null,
965
- "frozen": false,
966
- "suite": [
967
- "lighteval",
968
- "mmlu"
969
- ]
970
- },
971
- "lighteval|mmlu:high_school_geography": {
972
- "name": "mmlu:high_school_geography",
973
- "prompt_function": "mmlu_harness",
974
- "hf_repo": "lighteval/mmlu",
975
- "hf_subset": "high_school_geography",
976
- "metric": [
977
- "loglikelihood_acc"
978
- ],
979
- "hf_avail_splits": [
980
- "auxiliary_train",
981
- "test",
982
- "validation",
983
- "dev"
984
- ],
985
- "evaluation_splits": [
986
- "test"
987
- ],
988
- "few_shots_split": "dev",
989
- "few_shots_select": "sequential",
990
- "generation_size": 1,
991
- "stop_sequence": [
992
- "\n"
993
- ],
994
- "output_regex": null,
995
- "frozen": false,
996
- "suite": [
997
- "lighteval",
998
- "mmlu"
999
- ]
1000
- },
1001
- "lighteval|mmlu:high_school_government_and_politics": {
1002
- "name": "mmlu:high_school_government_and_politics",
1003
- "prompt_function": "mmlu_harness",
1004
- "hf_repo": "lighteval/mmlu",
1005
- "hf_subset": "high_school_government_and_politics",
1006
- "metric": [
1007
- "loglikelihood_acc"
1008
- ],
1009
- "hf_avail_splits": [
1010
- "auxiliary_train",
1011
- "test",
1012
- "validation",
1013
- "dev"
1014
- ],
1015
- "evaluation_splits": [
1016
- "test"
1017
- ],
1018
- "few_shots_split": "dev",
1019
- "few_shots_select": "sequential",
1020
- "generation_size": 1,
1021
- "stop_sequence": [
1022
- "\n"
1023
- ],
1024
- "output_regex": null,
1025
- "frozen": false,
1026
- "suite": [
1027
- "lighteval",
1028
- "mmlu"
1029
- ]
1030
- },
1031
- "lighteval|mmlu:high_school_macroeconomics": {
1032
- "name": "mmlu:high_school_macroeconomics",
1033
- "prompt_function": "mmlu_harness",
1034
- "hf_repo": "lighteval/mmlu",
1035
- "hf_subset": "high_school_macroeconomics",
1036
- "metric": [
1037
- "loglikelihood_acc"
1038
- ],
1039
- "hf_avail_splits": [
1040
- "auxiliary_train",
1041
- "test",
1042
- "validation",
1043
- "dev"
1044
- ],
1045
- "evaluation_splits": [
1046
- "test"
1047
- ],
1048
- "few_shots_split": "dev",
1049
- "few_shots_select": "sequential",
1050
- "generation_size": 1,
1051
- "stop_sequence": [
1052
- "\n"
1053
- ],
1054
- "output_regex": null,
1055
- "frozen": false,
1056
- "suite": [
1057
- "lighteval",
1058
- "mmlu"
1059
- ]
1060
- },
1061
- "lighteval|mmlu:high_school_mathematics": {
1062
- "name": "mmlu:high_school_mathematics",
1063
- "prompt_function": "mmlu_harness",
1064
- "hf_repo": "lighteval/mmlu",
1065
- "hf_subset": "high_school_mathematics",
1066
- "metric": [
1067
- "loglikelihood_acc"
1068
- ],
1069
- "hf_avail_splits": [
1070
- "auxiliary_train",
1071
- "test",
1072
- "validation",
1073
- "dev"
1074
- ],
1075
- "evaluation_splits": [
1076
- "test"
1077
- ],
1078
- "few_shots_split": "dev",
1079
- "few_shots_select": "sequential",
1080
- "generation_size": 1,
1081
- "stop_sequence": [
1082
- "\n"
1083
- ],
1084
- "output_regex": null,
1085
- "frozen": false,
1086
- "suite": [
1087
- "lighteval",
1088
- "mmlu"
1089
- ]
1090
- },
1091
- "lighteval|mmlu:high_school_microeconomics": {
1092
- "name": "mmlu:high_school_microeconomics",
1093
- "prompt_function": "mmlu_harness",
1094
- "hf_repo": "lighteval/mmlu",
1095
- "hf_subset": "high_school_microeconomics",
1096
- "metric": [
1097
- "loglikelihood_acc"
1098
- ],
1099
- "hf_avail_splits": [
1100
- "auxiliary_train",
1101
- "test",
1102
- "validation",
1103
- "dev"
1104
- ],
1105
- "evaluation_splits": [
1106
- "test"
1107
- ],
1108
- "few_shots_split": "dev",
1109
- "few_shots_select": "sequential",
1110
- "generation_size": 1,
1111
- "stop_sequence": [
1112
- "\n"
1113
- ],
1114
- "output_regex": null,
1115
- "frozen": false,
1116
- "suite": [
1117
- "lighteval",
1118
- "mmlu"
1119
- ]
1120
- },
1121
- "lighteval|mmlu:high_school_physics": {
1122
- "name": "mmlu:high_school_physics",
1123
- "prompt_function": "mmlu_harness",
1124
- "hf_repo": "lighteval/mmlu",
1125
- "hf_subset": "high_school_physics",
1126
- "metric": [
1127
- "loglikelihood_acc"
1128
- ],
1129
- "hf_avail_splits": [
1130
- "auxiliary_train",
1131
- "test",
1132
- "validation",
1133
- "dev"
1134
- ],
1135
- "evaluation_splits": [
1136
- "test"
1137
- ],
1138
- "few_shots_split": "dev",
1139
- "few_shots_select": "sequential",
1140
- "generation_size": 1,
1141
- "stop_sequence": [
1142
- "\n"
1143
- ],
1144
- "output_regex": null,
1145
- "frozen": false,
1146
- "suite": [
1147
- "lighteval",
1148
- "mmlu"
1149
- ]
1150
- },
1151
- "lighteval|mmlu:high_school_psychology": {
1152
- "name": "mmlu:high_school_psychology",
1153
- "prompt_function": "mmlu_harness",
1154
- "hf_repo": "lighteval/mmlu",
1155
- "hf_subset": "high_school_psychology",
1156
- "metric": [
1157
- "loglikelihood_acc"
1158
- ],
1159
- "hf_avail_splits": [
1160
- "auxiliary_train",
1161
- "test",
1162
- "validation",
1163
- "dev"
1164
- ],
1165
- "evaluation_splits": [
1166
- "test"
1167
- ],
1168
- "few_shots_split": "dev",
1169
- "few_shots_select": "sequential",
1170
- "generation_size": 1,
1171
- "stop_sequence": [
1172
- "\n"
1173
- ],
1174
- "output_regex": null,
1175
- "frozen": false,
1176
- "suite": [
1177
- "lighteval",
1178
- "mmlu"
1179
- ]
1180
- },
1181
- "lighteval|mmlu:high_school_statistics": {
1182
- "name": "mmlu:high_school_statistics",
1183
- "prompt_function": "mmlu_harness",
1184
- "hf_repo": "lighteval/mmlu",
1185
- "hf_subset": "high_school_statistics",
1186
- "metric": [
1187
- "loglikelihood_acc"
1188
- ],
1189
- "hf_avail_splits": [
1190
- "auxiliary_train",
1191
- "test",
1192
- "validation",
1193
- "dev"
1194
- ],
1195
- "evaluation_splits": [
1196
- "test"
1197
- ],
1198
- "few_shots_split": "dev",
1199
- "few_shots_select": "sequential",
1200
- "generation_size": 1,
1201
- "stop_sequence": [
1202
- "\n"
1203
- ],
1204
- "output_regex": null,
1205
- "frozen": false,
1206
- "suite": [
1207
- "lighteval",
1208
- "mmlu"
1209
- ]
1210
- },
1211
- "lighteval|mmlu:high_school_us_history": {
1212
- "name": "mmlu:high_school_us_history",
1213
- "prompt_function": "mmlu_harness",
1214
- "hf_repo": "lighteval/mmlu",
1215
- "hf_subset": "high_school_us_history",
1216
- "metric": [
1217
- "loglikelihood_acc"
1218
- ],
1219
- "hf_avail_splits": [
1220
- "auxiliary_train",
1221
- "test",
1222
- "validation",
1223
- "dev"
1224
- ],
1225
- "evaluation_splits": [
1226
- "test"
1227
- ],
1228
- "few_shots_split": "dev",
1229
- "few_shots_select": "sequential",
1230
- "generation_size": 1,
1231
- "stop_sequence": [
1232
- "\n"
1233
- ],
1234
- "output_regex": null,
1235
- "frozen": false,
1236
- "suite": [
1237
- "lighteval",
1238
- "mmlu"
1239
- ]
1240
- },
1241
- "lighteval|mmlu:high_school_world_history": {
1242
- "name": "mmlu:high_school_world_history",
1243
- "prompt_function": "mmlu_harness",
1244
- "hf_repo": "lighteval/mmlu",
1245
- "hf_subset": "high_school_world_history",
1246
- "metric": [
1247
- "loglikelihood_acc"
1248
- ],
1249
- "hf_avail_splits": [
1250
- "auxiliary_train",
1251
- "test",
1252
- "validation",
1253
- "dev"
1254
- ],
1255
- "evaluation_splits": [
1256
- "test"
1257
- ],
1258
- "few_shots_split": "dev",
1259
- "few_shots_select": "sequential",
1260
- "generation_size": 1,
1261
- "stop_sequence": [
1262
- "\n"
1263
- ],
1264
- "output_regex": null,
1265
- "frozen": false,
1266
- "suite": [
1267
- "lighteval",
1268
- "mmlu"
1269
- ]
1270
- },
1271
- "lighteval|mmlu:human_aging": {
1272
- "name": "mmlu:human_aging",
1273
- "prompt_function": "mmlu_harness",
1274
- "hf_repo": "lighteval/mmlu",
1275
- "hf_subset": "human_aging",
1276
- "metric": [
1277
- "loglikelihood_acc"
1278
- ],
1279
- "hf_avail_splits": [
1280
- "auxiliary_train",
1281
- "test",
1282
- "validation",
1283
- "dev"
1284
- ],
1285
- "evaluation_splits": [
1286
- "test"
1287
- ],
1288
- "few_shots_split": "dev",
1289
- "few_shots_select": "sequential",
1290
- "generation_size": 1,
1291
- "stop_sequence": [
1292
- "\n"
1293
- ],
1294
- "output_regex": null,
1295
- "frozen": false,
1296
- "suite": [
1297
- "lighteval",
1298
- "mmlu"
1299
- ]
1300
- },
1301
- "lighteval|mmlu:human_sexuality": {
1302
- "name": "mmlu:human_sexuality",
1303
- "prompt_function": "mmlu_harness",
1304
- "hf_repo": "lighteval/mmlu",
1305
- "hf_subset": "human_sexuality",
1306
- "metric": [
1307
- "loglikelihood_acc"
1308
- ],
1309
- "hf_avail_splits": [
1310
- "auxiliary_train",
1311
- "test",
1312
- "validation",
1313
- "dev"
1314
- ],
1315
- "evaluation_splits": [
1316
- "test"
1317
- ],
1318
- "few_shots_split": "dev",
1319
- "few_shots_select": "sequential",
1320
- "generation_size": 1,
1321
- "stop_sequence": [
1322
- "\n"
1323
- ],
1324
- "output_regex": null,
1325
- "frozen": false,
1326
- "suite": [
1327
- "lighteval",
1328
- "mmlu"
1329
- ]
1330
- },
1331
- "lighteval|mmlu:international_law": {
1332
- "name": "mmlu:international_law",
1333
- "prompt_function": "mmlu_harness",
1334
- "hf_repo": "lighteval/mmlu",
1335
- "hf_subset": "international_law",
1336
- "metric": [
1337
- "loglikelihood_acc"
1338
- ],
1339
- "hf_avail_splits": [
1340
- "auxiliary_train",
1341
- "test",
1342
- "validation",
1343
- "dev"
1344
- ],
1345
- "evaluation_splits": [
1346
- "test"
1347
- ],
1348
- "few_shots_split": "dev",
1349
- "few_shots_select": "sequential",
1350
- "generation_size": 1,
1351
- "stop_sequence": [
1352
- "\n"
1353
- ],
1354
- "output_regex": null,
1355
- "frozen": false,
1356
- "suite": [
1357
- "lighteval",
1358
- "mmlu"
1359
- ]
1360
- },
1361
- "lighteval|mmlu:jurisprudence": {
1362
- "name": "mmlu:jurisprudence",
1363
- "prompt_function": "mmlu_harness",
1364
- "hf_repo": "lighteval/mmlu",
1365
- "hf_subset": "jurisprudence",
1366
- "metric": [
1367
- "loglikelihood_acc"
1368
- ],
1369
- "hf_avail_splits": [
1370
- "auxiliary_train",
1371
- "test",
1372
- "validation",
1373
- "dev"
1374
- ],
1375
- "evaluation_splits": [
1376
- "test"
1377
- ],
1378
- "few_shots_split": "dev",
1379
- "few_shots_select": "sequential",
1380
- "generation_size": 1,
1381
- "stop_sequence": [
1382
- "\n"
1383
- ],
1384
- "output_regex": null,
1385
- "frozen": false,
1386
- "suite": [
1387
- "lighteval",
1388
- "mmlu"
1389
- ]
1390
- },
1391
- "lighteval|mmlu:logical_fallacies": {
1392
- "name": "mmlu:logical_fallacies",
1393
- "prompt_function": "mmlu_harness",
1394
- "hf_repo": "lighteval/mmlu",
1395
- "hf_subset": "logical_fallacies",
1396
- "metric": [
1397
- "loglikelihood_acc"
1398
- ],
1399
- "hf_avail_splits": [
1400
- "auxiliary_train",
1401
- "test",
1402
- "validation",
1403
- "dev"
1404
- ],
1405
- "evaluation_splits": [
1406
- "test"
1407
- ],
1408
- "few_shots_split": "dev",
1409
- "few_shots_select": "sequential",
1410
- "generation_size": 1,
1411
- "stop_sequence": [
1412
- "\n"
1413
- ],
1414
- "output_regex": null,
1415
- "frozen": false,
1416
- "suite": [
1417
- "lighteval",
1418
- "mmlu"
1419
- ]
1420
- },
1421
- "lighteval|mmlu:machine_learning": {
1422
- "name": "mmlu:machine_learning",
1423
- "prompt_function": "mmlu_harness",
1424
- "hf_repo": "lighteval/mmlu",
1425
- "hf_subset": "machine_learning",
1426
- "metric": [
1427
- "loglikelihood_acc"
1428
- ],
1429
- "hf_avail_splits": [
1430
- "auxiliary_train",
1431
- "test",
1432
- "validation",
1433
- "dev"
1434
- ],
1435
- "evaluation_splits": [
1436
- "test"
1437
- ],
1438
- "few_shots_split": "dev",
1439
- "few_shots_select": "sequential",
1440
- "generation_size": 1,
1441
- "stop_sequence": [
1442
- "\n"
1443
- ],
1444
- "output_regex": null,
1445
- "frozen": false,
1446
- "suite": [
1447
- "lighteval",
1448
- "mmlu"
1449
- ]
1450
- },
1451
- "lighteval|mmlu:management": {
1452
- "name": "mmlu:management",
1453
- "prompt_function": "mmlu_harness",
1454
- "hf_repo": "lighteval/mmlu",
1455
- "hf_subset": "management",
1456
- "metric": [
1457
- "loglikelihood_acc"
1458
- ],
1459
- "hf_avail_splits": [
1460
- "auxiliary_train",
1461
- "test",
1462
- "validation",
1463
- "dev"
1464
- ],
1465
- "evaluation_splits": [
1466
- "test"
1467
- ],
1468
- "few_shots_split": "dev",
1469
- "few_shots_select": "sequential",
1470
- "generation_size": 1,
1471
- "stop_sequence": [
1472
- "\n"
1473
- ],
1474
- "output_regex": null,
1475
- "frozen": false,
1476
- "suite": [
1477
- "lighteval",
1478
- "mmlu"
1479
- ]
1480
- },
1481
- "lighteval|mmlu:marketing": {
1482
- "name": "mmlu:marketing",
1483
- "prompt_function": "mmlu_harness",
1484
- "hf_repo": "lighteval/mmlu",
1485
- "hf_subset": "marketing",
1486
- "metric": [
1487
- "loglikelihood_acc"
1488
- ],
1489
- "hf_avail_splits": [
1490
- "auxiliary_train",
1491
- "test",
1492
- "validation",
1493
- "dev"
1494
- ],
1495
- "evaluation_splits": [
1496
- "test"
1497
- ],
1498
- "few_shots_split": "dev",
1499
- "few_shots_select": "sequential",
1500
- "generation_size": 1,
1501
- "stop_sequence": [
1502
- "\n"
1503
- ],
1504
- "output_regex": null,
1505
- "frozen": false,
1506
- "suite": [
1507
- "lighteval",
1508
- "mmlu"
1509
- ]
1510
- },
1511
- "lighteval|mmlu:medical_genetics": {
1512
- "name": "mmlu:medical_genetics",
1513
- "prompt_function": "mmlu_harness",
1514
- "hf_repo": "lighteval/mmlu",
1515
- "hf_subset": "medical_genetics",
1516
- "metric": [
1517
- "loglikelihood_acc"
1518
- ],
1519
- "hf_avail_splits": [
1520
- "auxiliary_train",
1521
- "test",
1522
- "validation",
1523
- "dev"
1524
- ],
1525
- "evaluation_splits": [
1526
- "test"
1527
- ],
1528
- "few_shots_split": "dev",
1529
- "few_shots_select": "sequential",
1530
- "generation_size": 1,
1531
- "stop_sequence": [
1532
- "\n"
1533
- ],
1534
- "output_regex": null,
1535
- "frozen": false,
1536
- "suite": [
1537
- "lighteval",
1538
- "mmlu"
1539
- ]
1540
- },
1541
- "lighteval|mmlu:miscellaneous": {
1542
- "name": "mmlu:miscellaneous",
1543
- "prompt_function": "mmlu_harness",
1544
- "hf_repo": "lighteval/mmlu",
1545
- "hf_subset": "miscellaneous",
1546
- "metric": [
1547
- "loglikelihood_acc"
1548
- ],
1549
- "hf_avail_splits": [
1550
- "auxiliary_train",
1551
- "test",
1552
- "validation",
1553
- "dev"
1554
- ],
1555
- "evaluation_splits": [
1556
- "test"
1557
- ],
1558
- "few_shots_split": "dev",
1559
- "few_shots_select": "sequential",
1560
- "generation_size": 1,
1561
- "stop_sequence": [
1562
- "\n"
1563
- ],
1564
- "output_regex": null,
1565
- "frozen": false,
1566
- "suite": [
1567
- "lighteval",
1568
- "mmlu"
1569
- ]
1570
- },
1571
- "lighteval|mmlu:moral_disputes": {
1572
- "name": "mmlu:moral_disputes",
1573
- "prompt_function": "mmlu_harness",
1574
- "hf_repo": "lighteval/mmlu",
1575
- "hf_subset": "moral_disputes",
1576
- "metric": [
1577
- "loglikelihood_acc"
1578
- ],
1579
- "hf_avail_splits": [
1580
- "auxiliary_train",
1581
- "test",
1582
- "validation",
1583
- "dev"
1584
- ],
1585
- "evaluation_splits": [
1586
- "test"
1587
- ],
1588
- "few_shots_split": "dev",
1589
- "few_shots_select": "sequential",
1590
- "generation_size": 1,
1591
- "stop_sequence": [
1592
- "\n"
1593
- ],
1594
- "output_regex": null,
1595
- "frozen": false,
1596
- "suite": [
1597
- "lighteval",
1598
- "mmlu"
1599
- ]
1600
- },
1601
- "lighteval|mmlu:moral_scenarios": {
1602
- "name": "mmlu:moral_scenarios",
1603
- "prompt_function": "mmlu_harness",
1604
- "hf_repo": "lighteval/mmlu",
1605
- "hf_subset": "moral_scenarios",
1606
- "metric": [
1607
- "loglikelihood_acc"
1608
- ],
1609
- "hf_avail_splits": [
1610
- "auxiliary_train",
1611
- "test",
1612
- "validation",
1613
- "dev"
1614
- ],
1615
- "evaluation_splits": [
1616
- "test"
1617
- ],
1618
- "few_shots_split": "dev",
1619
- "few_shots_select": "sequential",
1620
- "generation_size": 1,
1621
- "stop_sequence": [
1622
- "\n"
1623
- ],
1624
- "output_regex": null,
1625
- "frozen": false,
1626
- "suite": [
1627
- "lighteval",
1628
- "mmlu"
1629
- ]
1630
- },
1631
- "lighteval|mmlu:nutrition": {
1632
- "name": "mmlu:nutrition",
1633
- "prompt_function": "mmlu_harness",
1634
- "hf_repo": "lighteval/mmlu",
1635
- "hf_subset": "nutrition",
1636
- "metric": [
1637
- "loglikelihood_acc"
1638
- ],
1639
- "hf_avail_splits": [
1640
- "auxiliary_train",
1641
- "test",
1642
- "validation",
1643
- "dev"
1644
- ],
1645
- "evaluation_splits": [
1646
- "test"
1647
- ],
1648
- "few_shots_split": "dev",
1649
- "few_shots_select": "sequential",
1650
- "generation_size": 1,
1651
- "stop_sequence": [
1652
- "\n"
1653
- ],
1654
- "output_regex": null,
1655
- "frozen": false,
1656
- "suite": [
1657
- "lighteval",
1658
- "mmlu"
1659
- ]
1660
- },
1661
- "lighteval|mmlu:philosophy": {
1662
- "name": "mmlu:philosophy",
1663
- "prompt_function": "mmlu_harness",
1664
- "hf_repo": "lighteval/mmlu",
1665
- "hf_subset": "philosophy",
1666
- "metric": [
1667
- "loglikelihood_acc"
1668
- ],
1669
- "hf_avail_splits": [
1670
- "auxiliary_train",
1671
- "test",
1672
- "validation",
1673
- "dev"
1674
- ],
1675
- "evaluation_splits": [
1676
- "test"
1677
- ],
1678
- "few_shots_split": "dev",
1679
- "few_shots_select": "sequential",
1680
- "generation_size": 1,
1681
- "stop_sequence": [
1682
- "\n"
1683
- ],
1684
- "output_regex": null,
1685
- "frozen": false,
1686
- "suite": [
1687
- "lighteval",
1688
- "mmlu"
1689
- ]
1690
- },
1691
- "lighteval|mmlu:prehistory": {
1692
- "name": "mmlu:prehistory",
1693
- "prompt_function": "mmlu_harness",
1694
- "hf_repo": "lighteval/mmlu",
1695
- "hf_subset": "prehistory",
1696
- "metric": [
1697
- "loglikelihood_acc"
1698
- ],
1699
- "hf_avail_splits": [
1700
- "auxiliary_train",
1701
- "test",
1702
- "validation",
1703
- "dev"
1704
- ],
1705
- "evaluation_splits": [
1706
- "test"
1707
- ],
1708
- "few_shots_split": "dev",
1709
- "few_shots_select": "sequential",
1710
- "generation_size": 1,
1711
- "stop_sequence": [
1712
- "\n"
1713
- ],
1714
- "output_regex": null,
1715
- "frozen": false,
1716
- "suite": [
1717
- "lighteval",
1718
- "mmlu"
1719
- ]
1720
- },
1721
- "lighteval|mmlu:professional_accounting": {
1722
- "name": "mmlu:professional_accounting",
1723
- "prompt_function": "mmlu_harness",
1724
- "hf_repo": "lighteval/mmlu",
1725
- "hf_subset": "professional_accounting",
1726
- "metric": [
1727
- "loglikelihood_acc"
1728
- ],
1729
- "hf_avail_splits": [
1730
- "auxiliary_train",
1731
- "test",
1732
- "validation",
1733
- "dev"
1734
- ],
1735
- "evaluation_splits": [
1736
- "test"
1737
- ],
1738
- "few_shots_split": "dev",
1739
- "few_shots_select": "sequential",
1740
- "generation_size": 1,
1741
- "stop_sequence": [
1742
- "\n"
1743
- ],
1744
- "output_regex": null,
1745
- "frozen": false,
1746
- "suite": [
1747
- "lighteval",
1748
- "mmlu"
1749
- ]
1750
- },
1751
- "lighteval|mmlu:professional_law": {
1752
- "name": "mmlu:professional_law",
1753
- "prompt_function": "mmlu_harness",
1754
- "hf_repo": "lighteval/mmlu",
1755
- "hf_subset": "professional_law",
1756
- "metric": [
1757
- "loglikelihood_acc"
1758
- ],
1759
- "hf_avail_splits": [
1760
- "auxiliary_train",
1761
- "test",
1762
- "validation",
1763
- "dev"
1764
- ],
1765
- "evaluation_splits": [
1766
- "test"
1767
- ],
1768
- "few_shots_split": "dev",
1769
- "few_shots_select": "sequential",
1770
- "generation_size": 1,
1771
- "stop_sequence": [
1772
- "\n"
1773
- ],
1774
- "output_regex": null,
1775
- "frozen": false,
1776
- "suite": [
1777
- "lighteval",
1778
- "mmlu"
1779
- ]
1780
- },
1781
- "lighteval|mmlu:professional_medicine": {
1782
- "name": "mmlu:professional_medicine",
1783
- "prompt_function": "mmlu_harness",
1784
- "hf_repo": "lighteval/mmlu",
1785
- "hf_subset": "professional_medicine",
1786
- "metric": [
1787
- "loglikelihood_acc"
1788
- ],
1789
- "hf_avail_splits": [
1790
- "auxiliary_train",
1791
- "test",
1792
- "validation",
1793
- "dev"
1794
- ],
1795
- "evaluation_splits": [
1796
- "test"
1797
- ],
1798
- "few_shots_split": "dev",
1799
- "few_shots_select": "sequential",
1800
- "generation_size": 1,
1801
- "stop_sequence": [
1802
- "\n"
1803
- ],
1804
- "output_regex": null,
1805
- "frozen": false,
1806
- "suite": [
1807
- "lighteval",
1808
- "mmlu"
1809
- ]
1810
- },
1811
- "lighteval|mmlu:professional_psychology": {
1812
- "name": "mmlu:professional_psychology",
1813
- "prompt_function": "mmlu_harness",
1814
- "hf_repo": "lighteval/mmlu",
1815
- "hf_subset": "professional_psychology",
1816
- "metric": [
1817
- "loglikelihood_acc"
1818
- ],
1819
- "hf_avail_splits": [
1820
- "auxiliary_train",
1821
- "test",
1822
- "validation",
1823
- "dev"
1824
- ],
1825
- "evaluation_splits": [
1826
- "test"
1827
- ],
1828
- "few_shots_split": "dev",
1829
- "few_shots_select": "sequential",
1830
- "generation_size": 1,
1831
- "stop_sequence": [
1832
- "\n"
1833
- ],
1834
- "output_regex": null,
1835
- "frozen": false,
1836
- "suite": [
1837
- "lighteval",
1838
- "mmlu"
1839
- ]
1840
- },
1841
- "lighteval|mmlu:public_relations": {
1842
- "name": "mmlu:public_relations",
1843
- "prompt_function": "mmlu_harness",
1844
- "hf_repo": "lighteval/mmlu",
1845
- "hf_subset": "public_relations",
1846
- "metric": [
1847
- "loglikelihood_acc"
1848
- ],
1849
- "hf_avail_splits": [
1850
- "auxiliary_train",
1851
- "test",
1852
- "validation",
1853
- "dev"
1854
- ],
1855
- "evaluation_splits": [
1856
- "test"
1857
- ],
1858
- "few_shots_split": "dev",
1859
- "few_shots_select": "sequential",
1860
- "generation_size": 1,
1861
- "stop_sequence": [
1862
- "\n"
1863
- ],
1864
- "output_regex": null,
1865
- "frozen": false,
1866
- "suite": [
1867
- "lighteval",
1868
- "mmlu"
1869
- ]
1870
- },
1871
- "lighteval|mmlu:security_studies": {
1872
- "name": "mmlu:security_studies",
1873
- "prompt_function": "mmlu_harness",
1874
- "hf_repo": "lighteval/mmlu",
1875
- "hf_subset": "security_studies",
1876
- "metric": [
1877
- "loglikelihood_acc"
1878
- ],
1879
- "hf_avail_splits": [
1880
- "auxiliary_train",
1881
- "test",
1882
- "validation",
1883
- "dev"
1884
- ],
1885
- "evaluation_splits": [
1886
- "test"
1887
- ],
1888
- "few_shots_split": "dev",
1889
- "few_shots_select": "sequential",
1890
- "generation_size": 1,
1891
- "stop_sequence": [
1892
- "\n"
1893
- ],
1894
- "output_regex": null,
1895
- "frozen": false,
1896
- "suite": [
1897
- "lighteval",
1898
- "mmlu"
1899
- ]
1900
- },
1901
- "lighteval|mmlu:sociology": {
1902
- "name": "mmlu:sociology",
1903
- "prompt_function": "mmlu_harness",
1904
- "hf_repo": "lighteval/mmlu",
1905
- "hf_subset": "sociology",
1906
- "metric": [
1907
- "loglikelihood_acc"
1908
- ],
1909
- "hf_avail_splits": [
1910
- "auxiliary_train",
1911
- "test",
1912
- "validation",
1913
- "dev"
1914
- ],
1915
- "evaluation_splits": [
1916
- "test"
1917
- ],
1918
- "few_shots_split": "dev",
1919
- "few_shots_select": "sequential",
1920
- "generation_size": 1,
1921
- "stop_sequence": [
1922
- "\n"
1923
- ],
1924
- "output_regex": null,
1925
- "frozen": false,
1926
- "suite": [
1927
- "lighteval",
1928
- "mmlu"
1929
- ]
1930
- },
1931
- "lighteval|mmlu:us_foreign_policy": {
1932
- "name": "mmlu:us_foreign_policy",
1933
- "prompt_function": "mmlu_harness",
1934
- "hf_repo": "lighteval/mmlu",
1935
- "hf_subset": "us_foreign_policy",
1936
- "metric": [
1937
- "loglikelihood_acc"
1938
- ],
1939
- "hf_avail_splits": [
1940
- "auxiliary_train",
1941
- "test",
1942
- "validation",
1943
- "dev"
1944
- ],
1945
- "evaluation_splits": [
1946
- "test"
1947
- ],
1948
- "few_shots_split": "dev",
1949
- "few_shots_select": "sequential",
1950
- "generation_size": 1,
1951
- "stop_sequence": [
1952
- "\n"
1953
- ],
1954
- "output_regex": null,
1955
- "frozen": false,
1956
- "suite": [
1957
- "lighteval",
1958
- "mmlu"
1959
- ]
1960
- },
1961
- "lighteval|mmlu:virology": {
1962
- "name": "mmlu:virology",
1963
- "prompt_function": "mmlu_harness",
1964
- "hf_repo": "lighteval/mmlu",
1965
- "hf_subset": "virology",
1966
- "metric": [
1967
- "loglikelihood_acc"
1968
- ],
1969
- "hf_avail_splits": [
1970
- "auxiliary_train",
1971
- "test",
1972
- "validation",
1973
- "dev"
1974
- ],
1975
- "evaluation_splits": [
1976
- "test"
1977
- ],
1978
- "few_shots_split": "dev",
1979
- "few_shots_select": "sequential",
1980
- "generation_size": 1,
1981
- "stop_sequence": [
1982
- "\n"
1983
- ],
1984
- "output_regex": null,
1985
- "frozen": false,
1986
- "suite": [
1987
- "lighteval",
1988
- "mmlu"
1989
- ]
1990
- },
1991
- "lighteval|mmlu:world_religions": {
1992
- "name": "mmlu:world_religions",
1993
- "prompt_function": "mmlu_harness",
1994
- "hf_repo": "lighteval/mmlu",
1995
- "hf_subset": "world_religions",
1996
- "metric": [
1997
- "loglikelihood_acc"
1998
- ],
1999
- "hf_avail_splits": [
2000
- "auxiliary_train",
2001
- "test",
2002
- "validation",
2003
- "dev"
2004
- ],
2005
- "evaluation_splits": [
2006
- "test"
2007
- ],
2008
- "few_shots_split": "dev",
2009
- "few_shots_select": "sequential",
2010
- "generation_size": 1,
2011
- "stop_sequence": [
2012
- "\n"
2013
- ],
2014
- "output_regex": null,
2015
- "frozen": false,
2016
- "suite": [
2017
- "lighteval",
2018
- "mmlu"
2019
- ]
2020
- }
2021
- },
2022
- "summary_tasks": {
2023
- "lighteval|mmlu:abstract_algebra|5": {
2024
- "hashes": {
2025
- "hash_examples": "4c76229e00c9c0e9",
2026
- "hash_full_prompts": "a45d01c3409c889c",
2027
- "hash_input_tokens": "4948b2c6cf57057c",
2028
- "hash_cont_tokens": "ca6635f013682116"
2029
- },
2030
- "truncated": 0,
2031
- "non_truncated": 100,
2032
- "padded": 400,
2033
- "non_padded": 0,
2034
- "effective_few_shots": 5.0,
2035
- "num_truncated_few_shots": 0
2036
- },
2037
- "lighteval|mmlu:anatomy|5": {
2038
- "hashes": {
2039
- "hash_examples": "6a1f8104dccbd33b",
2040
- "hash_full_prompts": "e245c6600e03cc32",
2041
- "hash_input_tokens": "ccae0b047572b80f",
2042
- "hash_cont_tokens": "e1ba0772a6068b5f"
2043
- },
2044
- "truncated": 0,
2045
- "non_truncated": 135,
2046
- "padded": 540,
2047
- "non_padded": 0,
2048
- "effective_few_shots": 5.0,
2049
- "num_truncated_few_shots": 0
2050
- },
2051
- "lighteval|mmlu:astronomy|5": {
2052
- "hashes": {
2053
- "hash_examples": "1302effa3a76ce4c",
2054
- "hash_full_prompts": "390f9bddf857ad04",
2055
- "hash_input_tokens": "95e99849ae58bc29",
2056
- "hash_cont_tokens": "5ceb0e5afafe79b5"
2057
- },
2058
- "truncated": 0,
2059
- "non_truncated": 152,
2060
- "padded": 608,
2061
- "non_padded": 0,
2062
- "effective_few_shots": 5.0,
2063
- "num_truncated_few_shots": 0
2064
- },
2065
- "lighteval|mmlu:business_ethics|5": {
2066
- "hashes": {
2067
- "hash_examples": "03cb8bce5336419a",
2068
- "hash_full_prompts": "5504f893bc4f2fa1",
2069
- "hash_input_tokens": "b497f7a5f8bbb8f3",
2070
- "hash_cont_tokens": "ca6635f013682116"
2071
- },
2072
- "truncated": 0,
2073
- "non_truncated": 100,
2074
- "padded": 400,
2075
- "non_padded": 0,
2076
- "effective_few_shots": 5.0,
2077
- "num_truncated_few_shots": 0
2078
- },
2079
- "lighteval|mmlu:clinical_knowledge|5": {
2080
- "hashes": {
2081
- "hash_examples": "ffbb9c7b2be257f9",
2082
- "hash_full_prompts": "106ad0bab4b90b78",
2083
- "hash_input_tokens": "22ddadb0674e1859",
2084
- "hash_cont_tokens": "aed310f2c7712a91"
2085
- },
2086
- "truncated": 0,
2087
- "non_truncated": 265,
2088
- "padded": 1060,
2089
- "non_padded": 0,
2090
- "effective_few_shots": 5.0,
2091
- "num_truncated_few_shots": 0
2092
- },
2093
- "lighteval|mmlu:college_biology|5": {
2094
- "hashes": {
2095
- "hash_examples": "3ee77f176f38eb8e",
2096
- "hash_full_prompts": "59f9bdf2695cb226",
2097
- "hash_input_tokens": "d4dee762441c2914",
2098
- "hash_cont_tokens": "0dadd21454ffb16b"
2099
- },
2100
- "truncated": 0,
2101
- "non_truncated": 144,
2102
- "padded": 576,
2103
- "non_padded": 0,
2104
- "effective_few_shots": 5.0,
2105
- "num_truncated_few_shots": 0
2106
- },
2107
- "lighteval|mmlu:college_chemistry|5": {
2108
- "hashes": {
2109
- "hash_examples": "ce61a69c46d47aeb",
2110
- "hash_full_prompts": "3cac9b759fcff7a0",
2111
- "hash_input_tokens": "d8a83002fd2891fc",
2112
- "hash_cont_tokens": "ca6635f013682116"
2113
- },
2114
- "truncated": 0,
2115
- "non_truncated": 100,
2116
- "padded": 400,
2117
- "non_padded": 0,
2118
- "effective_few_shots": 5.0,
2119
- "num_truncated_few_shots": 0
2120
- },
2121
- "lighteval|mmlu:college_computer_science|5": {
2122
- "hashes": {
2123
- "hash_examples": "32805b52d7d5daab",
2124
- "hash_full_prompts": "010b0cca35070130",
2125
- "hash_input_tokens": "bf24575e01b75368",
2126
- "hash_cont_tokens": "ca6635f013682116"
2127
- },
2128
- "truncated": 0,
2129
- "non_truncated": 100,
2130
- "padded": 400,
2131
- "non_padded": 0,
2132
- "effective_few_shots": 5.0,
2133
- "num_truncated_few_shots": 0
2134
- },
2135
- "lighteval|mmlu:college_mathematics|5": {
2136
- "hashes": {
2137
- "hash_examples": "55da1a0a0bd33722",
2138
- "hash_full_prompts": "511422eb9eefc773",
2139
- "hash_input_tokens": "958ae747d6c39df7",
2140
- "hash_cont_tokens": "ca6635f013682116"
2141
- },
2142
- "truncated": 0,
2143
- "non_truncated": 100,
2144
- "padded": 400,
2145
- "non_padded": 0,
2146
- "effective_few_shots": 5.0,
2147
- "num_truncated_few_shots": 0
2148
- },
2149
- "lighteval|mmlu:college_medicine|5": {
2150
- "hashes": {
2151
- "hash_examples": "c33e143163049176",
2152
- "hash_full_prompts": "c8cc1a82a51a046e",
2153
- "hash_input_tokens": "726811436fcc0abd",
2154
- "hash_cont_tokens": "b4dea139dbc832db"
2155
- },
2156
- "truncated": 0,
2157
- "non_truncated": 173,
2158
- "padded": 692,
2159
- "non_padded": 0,
2160
- "effective_few_shots": 5.0,
2161
- "num_truncated_few_shots": 0
2162
- },
2163
- "lighteval|mmlu:college_physics|5": {
2164
- "hashes": {
2165
- "hash_examples": "ebdab1cdb7e555df",
2166
- "hash_full_prompts": "e40721b5059c5818",
2167
- "hash_input_tokens": "5e5caeee24119b1f",
2168
- "hash_cont_tokens": "f5a25833e1dae922"
2169
- },
2170
- "truncated": 0,
2171
- "non_truncated": 102,
2172
- "padded": 408,
2173
- "non_padded": 0,
2174
- "effective_few_shots": 5.0,
2175
- "num_truncated_few_shots": 0
2176
- },
2177
- "lighteval|mmlu:computer_security|5": {
2178
- "hashes": {
2179
- "hash_examples": "a24fd7d08a560921",
2180
- "hash_full_prompts": "946c9be5964ac44a",
2181
- "hash_input_tokens": "f0fe150445434938",
2182
- "hash_cont_tokens": "ca6635f013682116"
2183
- },
2184
- "truncated": 0,
2185
- "non_truncated": 100,
2186
- "padded": 400,
2187
- "non_padded": 0,
2188
- "effective_few_shots": 5.0,
2189
- "num_truncated_few_shots": 0
2190
- },
2191
- "lighteval|mmlu:conceptual_physics|5": {
2192
- "hashes": {
2193
- "hash_examples": "8300977a79386993",
2194
- "hash_full_prompts": "506a4f6094cc40c9",
2195
- "hash_input_tokens": "1f0efe59b0409eb6",
2196
- "hash_cont_tokens": "3aaa2f9b51df2bc9"
2197
- },
2198
- "truncated": 0,
2199
- "non_truncated": 235,
2200
- "padded": 940,
2201
- "non_padded": 0,
2202
- "effective_few_shots": 5.0,
2203
- "num_truncated_few_shots": 0
2204
- },
2205
- "lighteval|mmlu:econometrics|5": {
2206
- "hashes": {
2207
- "hash_examples": "ddde36788a04a46f",
2208
- "hash_full_prompts": "4ed2703f27f1ed05",
2209
- "hash_input_tokens": "0fbb5ec4fd743fd9",
2210
- "hash_cont_tokens": "40eac819ba437eec"
2211
- },
2212
- "truncated": 0,
2213
- "non_truncated": 114,
2214
- "padded": 456,
2215
- "non_padded": 0,
2216
- "effective_few_shots": 5.0,
2217
- "num_truncated_few_shots": 0
2218
- },
2219
- "lighteval|mmlu:electrical_engineering|5": {
2220
- "hashes": {
2221
- "hash_examples": "acbc5def98c19b3f",
2222
- "hash_full_prompts": "d8f4b3e11c23653c",
2223
- "hash_input_tokens": "abc466e84eca1d48",
2224
- "hash_cont_tokens": "9ad54070b8c8d481"
2225
- },
2226
- "truncated": 0,
2227
- "non_truncated": 145,
2228
- "padded": 580,
2229
- "non_padded": 0,
2230
- "effective_few_shots": 5.0,
2231
- "num_truncated_few_shots": 0
2232
- },
2233
- "lighteval|mmlu:elementary_mathematics|5": {
2234
- "hashes": {
2235
- "hash_examples": "146e61d07497a9bd",
2236
- "hash_full_prompts": "256d111bd15647ff",
2237
- "hash_input_tokens": "0050e6543b33ab8e",
2238
- "hash_cont_tokens": "1737a64affbf2372"
2239
- },
2240
- "truncated": 0,
2241
- "non_truncated": 378,
2242
- "padded": 1512,
2243
- "non_padded": 0,
2244
- "effective_few_shots": 5.0,
2245
- "num_truncated_few_shots": 0
2246
- },
2247
- "lighteval|mmlu:formal_logic|5": {
2248
- "hashes": {
2249
- "hash_examples": "8635216e1909a03f",
2250
- "hash_full_prompts": "1171d04f3b1a11f5",
2251
- "hash_input_tokens": "084c9357045f3417",
2252
- "hash_cont_tokens": "bbf4e8421cceccf3"
2253
- },
2254
- "truncated": 0,
2255
- "non_truncated": 126,
2256
- "padded": 504,
2257
- "non_padded": 0,
2258
- "effective_few_shots": 5.0,
2259
- "num_truncated_few_shots": 0
2260
- },
2261
- "lighteval|mmlu:global_facts|5": {
2262
- "hashes": {
2263
- "hash_examples": "30b315aa6353ee47",
2264
- "hash_full_prompts": "a7e56dbc074c7529",
2265
- "hash_input_tokens": "29026e1886290a72",
2266
- "hash_cont_tokens": "ca6635f013682116"
2267
- },
2268
- "truncated": 0,
2269
- "non_truncated": 100,
2270
- "padded": 400,
2271
- "non_padded": 0,
2272
- "effective_few_shots": 5.0,
2273
- "num_truncated_few_shots": 0
2274
- },
2275
- "lighteval|mmlu:high_school_biology|5": {
2276
- "hashes": {
2277
- "hash_examples": "c9136373af2180de",
2278
- "hash_full_prompts": "ad6e859ed978e04a",
2279
- "hash_input_tokens": "3a30f81c4a1b993c",
2280
- "hash_cont_tokens": "3b147e76117feda3"
2281
- },
2282
- "truncated": 0,
2283
- "non_truncated": 310,
2284
- "padded": 1240,
2285
- "non_padded": 0,
2286
- "effective_few_shots": 5.0,
2287
- "num_truncated_few_shots": 0
2288
- },
2289
- "lighteval|mmlu:high_school_chemistry|5": {
2290
- "hashes": {
2291
- "hash_examples": "b0661bfa1add6404",
2292
- "hash_full_prompts": "6eb9c04bcc8a8f2a",
2293
- "hash_input_tokens": "3455612f2649f340",
2294
- "hash_cont_tokens": "308b43940e65e09e"
2295
- },
2296
- "truncated": 0,
2297
- "non_truncated": 203,
2298
- "padded": 812,
2299
- "non_padded": 0,
2300
- "effective_few_shots": 5.0,
2301
- "num_truncated_few_shots": 0
2302
- },
2303
- "lighteval|mmlu:high_school_computer_science|5": {
2304
- "hashes": {
2305
- "hash_examples": "80fc1d623a3d665f",
2306
- "hash_full_prompts": "8e51bc91c81cf8dd",
2307
- "hash_input_tokens": "1fea81f3fa06cea5",
2308
- "hash_cont_tokens": "ca6635f013682116"
2309
- },
2310
- "truncated": 0,
2311
- "non_truncated": 100,
2312
- "padded": 400,
2313
- "non_padded": 0,
2314
- "effective_few_shots": 5.0,
2315
- "num_truncated_few_shots": 0
2316
- },
2317
- "lighteval|mmlu:high_school_european_history|5": {
2318
- "hashes": {
2319
- "hash_examples": "854da6e5af0fe1a1",
2320
- "hash_full_prompts": "664a1f16c9f3195c",
2321
- "hash_input_tokens": "916c95a0536e1685",
2322
- "hash_cont_tokens": "8f181ada6dc3c1c0"
2323
- },
2324
- "truncated": 0,
2325
- "non_truncated": 165,
2326
- "padded": 656,
2327
- "non_padded": 4,
2328
- "effective_few_shots": 5.0,
2329
- "num_truncated_few_shots": 0
2330
- },
2331
- "lighteval|mmlu:high_school_geography|5": {
2332
- "hashes": {
2333
- "hash_examples": "7dc963c7acd19ad8",
2334
- "hash_full_prompts": "f3acf911f4023c8a",
2335
- "hash_input_tokens": "e78105d5c89747c6",
2336
- "hash_cont_tokens": "30d5bbb69894c7eb"
2337
- },
2338
- "truncated": 0,
2339
- "non_truncated": 198,
2340
- "padded": 792,
2341
- "non_padded": 0,
2342
- "effective_few_shots": 5.0,
2343
- "num_truncated_few_shots": 0
2344
- },
2345
- "lighteval|mmlu:high_school_government_and_politics|5": {
2346
- "hashes": {
2347
- "hash_examples": "1f675dcdebc9758f",
2348
- "hash_full_prompts": "066254feaa3158ae",
2349
- "hash_input_tokens": "ddce941d18e493a4",
2350
- "hash_cont_tokens": "a56929e6c0ce3449"
2351
- },
2352
- "truncated": 0,
2353
- "non_truncated": 193,
2354
- "padded": 772,
2355
- "non_padded": 0,
2356
- "effective_few_shots": 5.0,
2357
- "num_truncated_few_shots": 0
2358
- },
2359
- "lighteval|mmlu:high_school_macroeconomics|5": {
2360
- "hashes": {
2361
- "hash_examples": "2fb32cf2d80f0b35",
2362
- "hash_full_prompts": "19a7fa502aa85c95",
2363
- "hash_input_tokens": "837b51169b038fe0",
2364
- "hash_cont_tokens": "2fa537b929a4262f"
2365
- },
2366
- "truncated": 0,
2367
- "non_truncated": 390,
2368
- "padded": 1560,
2369
- "non_padded": 0,
2370
- "effective_few_shots": 5.0,
2371
- "num_truncated_few_shots": 0
2372
- },
2373
- "lighteval|mmlu:high_school_mathematics|5": {
2374
- "hashes": {
2375
- "hash_examples": "fd6646fdb5d58a1f",
2376
- "hash_full_prompts": "4f704e369778b5b0",
2377
- "hash_input_tokens": "f7c0c9b774e1a823",
2378
- "hash_cont_tokens": "d458ef4757b4f677"
2379
- },
2380
- "truncated": 0,
2381
- "non_truncated": 270,
2382
- "padded": 1078,
2383
- "non_padded": 2,
2384
- "effective_few_shots": 5.0,
2385
- "num_truncated_few_shots": 0
2386
- },
2387
- "lighteval|mmlu:high_school_microeconomics|5": {
2388
- "hashes": {
2389
- "hash_examples": "2118f21f71d87d84",
2390
- "hash_full_prompts": "4350f9e2240f8010",
2391
- "hash_input_tokens": "a1ee4fdb893cdebc",
2392
- "hash_cont_tokens": "4b3b9037dff8007e"
2393
- },
2394
- "truncated": 0,
2395
- "non_truncated": 238,
2396
- "padded": 952,
2397
- "non_padded": 0,
2398
- "effective_few_shots": 5.0,
2399
- "num_truncated_few_shots": 0
2400
- },
2401
- "lighteval|mmlu:high_school_physics|5": {
2402
- "hashes": {
2403
- "hash_examples": "dc3ce06378548565",
2404
- "hash_full_prompts": "5dc0d6831b66188f",
2405
- "hash_input_tokens": "724f77ba2a9aff43",
2406
- "hash_cont_tokens": "7501b169c0c12798"
2407
- },
2408
- "truncated": 0,
2409
- "non_truncated": 151,
2410
- "padded": 596,
2411
- "non_padded": 8,
2412
- "effective_few_shots": 5.0,
2413
- "num_truncated_few_shots": 0
2414
- },
2415
- "lighteval|mmlu:high_school_psychology|5": {
2416
- "hashes": {
2417
- "hash_examples": "c8d1d98a40e11f2f",
2418
- "hash_full_prompts": "af2b097da6d50365",
2419
- "hash_input_tokens": "64c9ef6542c4ebc5",
2420
- "hash_cont_tokens": "c1a9bff96c9b870b"
2421
- },
2422
- "truncated": 0,
2423
- "non_truncated": 545,
2424
- "padded": 2168,
2425
- "non_padded": 12,
2426
- "effective_few_shots": 5.0,
2427
- "num_truncated_few_shots": 0
2428
- },
2429
- "lighteval|mmlu:high_school_statistics|5": {
2430
- "hashes": {
2431
- "hash_examples": "666c8759b98ee4ff",
2432
- "hash_full_prompts": "c757694421d6d68d",
2433
- "hash_input_tokens": "6ace2cb29f29b1ad",
2434
- "hash_cont_tokens": "c8ba890b377f366c"
2435
- },
2436
- "truncated": 0,
2437
- "non_truncated": 216,
2438
- "padded": 864,
2439
- "non_padded": 0,
2440
- "effective_few_shots": 5.0,
2441
- "num_truncated_few_shots": 0
2442
- },
2443
- "lighteval|mmlu:high_school_us_history|5": {
2444
- "hashes": {
2445
- "hash_examples": "95fef1c4b7d3f81e",
2446
- "hash_full_prompts": "e34a028d0ddeec5e",
2447
- "hash_input_tokens": "6eea808bcfba866a",
2448
- "hash_cont_tokens": "a5583b10be513397"
2449
- },
2450
- "truncated": 0,
2451
- "non_truncated": 204,
2452
- "padded": 816,
2453
- "non_padded": 0,
2454
- "effective_few_shots": 5.0,
2455
- "num_truncated_few_shots": 0
2456
- },
2457
- "lighteval|mmlu:high_school_world_history|5": {
2458
- "hashes": {
2459
- "hash_examples": "7e5085b6184b0322",
2460
- "hash_full_prompts": "1fa3d51392765601",
2461
- "hash_input_tokens": "88d139decc63d147",
2462
- "hash_cont_tokens": "bc02ede066873d68"
2463
- },
2464
- "truncated": 0,
2465
- "non_truncated": 237,
2466
- "padded": 948,
2467
- "non_padded": 0,
2468
- "effective_few_shots": 5.0,
2469
- "num_truncated_few_shots": 0
2470
- },
2471
- "lighteval|mmlu:human_aging|5": {
2472
- "hashes": {
2473
- "hash_examples": "c17333e7c7c10797",
2474
- "hash_full_prompts": "cac900721f9a1a94",
2475
- "hash_input_tokens": "3b3004225023b6a1",
2476
- "hash_cont_tokens": "46bca90878b86814"
2477
- },
2478
- "truncated": 0,
2479
- "non_truncated": 223,
2480
- "padded": 892,
2481
- "non_padded": 0,
2482
- "effective_few_shots": 5.0,
2483
- "num_truncated_few_shots": 0
2484
- },
2485
- "lighteval|mmlu:human_sexuality|5": {
2486
- "hashes": {
2487
- "hash_examples": "4edd1e9045df5e3d",
2488
- "hash_full_prompts": "0d6567bafee0a13c",
2489
- "hash_input_tokens": "0fbac491ec8244cf",
2490
- "hash_cont_tokens": "83e31fa74449548b"
2491
- },
2492
- "truncated": 0,
2493
- "non_truncated": 131,
2494
- "padded": 524,
2495
- "non_padded": 0,
2496
- "effective_few_shots": 5.0,
2497
- "num_truncated_few_shots": 0
2498
- },
2499
- "lighteval|mmlu:international_law|5": {
2500
- "hashes": {
2501
- "hash_examples": "db2fa00d771a062a",
2502
- "hash_full_prompts": "d018f9116479795e",
2503
- "hash_input_tokens": "ccc8a79b71202284",
2504
- "hash_cont_tokens": "41c0cd517e147f5d"
2505
- },
2506
- "truncated": 0,
2507
- "non_truncated": 121,
2508
- "padded": 484,
2509
- "non_padded": 0,
2510
- "effective_few_shots": 5.0,
2511
- "num_truncated_few_shots": 0
2512
- },
2513
- "lighteval|mmlu:jurisprudence|5": {
2514
- "hashes": {
2515
- "hash_examples": "e956f86b124076fe",
2516
- "hash_full_prompts": "1487e89a10ec58b7",
2517
- "hash_input_tokens": "295854ba53c38e1b",
2518
- "hash_cont_tokens": "be0171360f69d6f2"
2519
- },
2520
- "truncated": 0,
2521
- "non_truncated": 108,
2522
- "padded": 432,
2523
- "non_padded": 0,
2524
- "effective_few_shots": 5.0,
2525
- "num_truncated_few_shots": 0
2526
- },
2527
- "lighteval|mmlu:logical_fallacies|5": {
2528
- "hashes": {
2529
- "hash_examples": "956e0e6365ab79f1",
2530
- "hash_full_prompts": "677785b2181f9243",
2531
- "hash_input_tokens": "9ad3e188efd1d33c",
2532
- "hash_cont_tokens": "6b024648afae9ee5"
2533
- },
2534
- "truncated": 0,
2535
- "non_truncated": 163,
2536
- "padded": 652,
2537
- "non_padded": 0,
2538
- "effective_few_shots": 5.0,
2539
- "num_truncated_few_shots": 0
2540
- },
2541
- "lighteval|mmlu:machine_learning|5": {
2542
- "hashes": {
2543
- "hash_examples": "397997cc6f4d581e",
2544
- "hash_full_prompts": "769ee14a2aea49bb",
2545
- "hash_input_tokens": "8968240944bf5437",
2546
- "hash_cont_tokens": "1e08788dc7e95ea0"
2547
- },
2548
- "truncated": 0,
2549
- "non_truncated": 112,
2550
- "padded": 448,
2551
- "non_padded": 0,
2552
- "effective_few_shots": 5.0,
2553
- "num_truncated_few_shots": 0
2554
- },
2555
- "lighteval|mmlu:management|5": {
2556
- "hashes": {
2557
- "hash_examples": "2bcbe6f6ca63d740",
2558
- "hash_full_prompts": "cb1ff9dac9582144",
2559
- "hash_input_tokens": "38100f64fb7e1fd7",
2560
- "hash_cont_tokens": "2e4e36b1749ca046"
2561
- },
2562
- "truncated": 0,
2563
- "non_truncated": 103,
2564
- "padded": 412,
2565
- "non_padded": 0,
2566
- "effective_few_shots": 5.0,
2567
- "num_truncated_few_shots": 0
2568
- },
2569
- "lighteval|mmlu:marketing|5": {
2570
- "hashes": {
2571
- "hash_examples": "8ddb20d964a1b065",
2572
- "hash_full_prompts": "9fc2114a187ad9a2",
2573
- "hash_input_tokens": "b4c521f4c53f8e08",
2574
- "hash_cont_tokens": "31cb7d1a07654a4c"
2575
- },
2576
- "truncated": 0,
2577
- "non_truncated": 234,
2578
- "padded": 936,
2579
- "non_padded": 0,
2580
- "effective_few_shots": 5.0,
2581
- "num_truncated_few_shots": 0
2582
- },
2583
- "lighteval|mmlu:medical_genetics|5": {
2584
- "hashes": {
2585
- "hash_examples": "182a71f4763d2cea",
2586
- "hash_full_prompts": "46a616fa51878959",
2587
- "hash_input_tokens": "dc1faf1dfb1362fd",
2588
- "hash_cont_tokens": "ca6635f013682116"
2589
- },
2590
- "truncated": 0,
2591
- "non_truncated": 100,
2592
- "padded": 400,
2593
- "non_padded": 0,
2594
- "effective_few_shots": 5.0,
2595
- "num_truncated_few_shots": 0
2596
- },
2597
- "lighteval|mmlu:miscellaneous|5": {
2598
- "hashes": {
2599
- "hash_examples": "4c404fdbb4ca57fc",
2600
- "hash_full_prompts": "0813e1be36dbaae1",
2601
- "hash_input_tokens": "9a92e9f634e3f086",
2602
- "hash_cont_tokens": "6c84ccca2d1eb1c8"
2603
- },
2604
- "truncated": 0,
2605
- "non_truncated": 783,
2606
- "padded": 3132,
2607
- "non_padded": 0,
2608
- "effective_few_shots": 5.0,
2609
- "num_truncated_few_shots": 0
2610
- },
2611
- "lighteval|mmlu:moral_disputes|5": {
2612
- "hashes": {
2613
- "hash_examples": "60cbd2baa3fea5c9",
2614
- "hash_full_prompts": "1d14adebb9b62519",
2615
- "hash_input_tokens": "0f9cc303e7a5371d",
2616
- "hash_cont_tokens": "1d61e904d7d686a1"
2617
- },
2618
- "truncated": 0,
2619
- "non_truncated": 346,
2620
- "padded": 1384,
2621
- "non_padded": 0,
2622
- "effective_few_shots": 5.0,
2623
- "num_truncated_few_shots": 0
2624
- },
2625
- "lighteval|mmlu:moral_scenarios|5": {
2626
- "hashes": {
2627
- "hash_examples": "fd8b0431fbdd75ef",
2628
- "hash_full_prompts": "b80d3d236165e3de",
2629
- "hash_input_tokens": "63e22f3bedd6b4ec",
2630
- "hash_cont_tokens": "7eb03b430f07003e"
2631
- },
2632
- "truncated": 0,
2633
- "non_truncated": 895,
2634
- "padded": 3551,
2635
- "non_padded": 29,
2636
- "effective_few_shots": 5.0,
2637
- "num_truncated_few_shots": 0
2638
- },
2639
- "lighteval|mmlu:nutrition|5": {
2640
- "hashes": {
2641
- "hash_examples": "71e55e2b829b6528",
2642
- "hash_full_prompts": "2bfb18e5fab8dea7",
2643
- "hash_input_tokens": "59301dd373fad06a",
2644
- "hash_cont_tokens": "7a0a57f1342b71d6"
2645
- },
2646
- "truncated": 0,
2647
- "non_truncated": 306,
2648
- "padded": 1224,
2649
- "non_padded": 0,
2650
- "effective_few_shots": 5.0,
2651
- "num_truncated_few_shots": 0
2652
- },
2653
- "lighteval|mmlu:philosophy|5": {
2654
- "hashes": {
2655
- "hash_examples": "a6d489a8d208fa4b",
2656
- "hash_full_prompts": "e8c0d5b6dae3ccc8",
2657
- "hash_input_tokens": "b0eb1522d655fe53",
2658
- "hash_cont_tokens": "0c977bed864888bd"
2659
- },
2660
- "truncated": 0,
2661
- "non_truncated": 311,
2662
- "padded": 1244,
2663
- "non_padded": 0,
2664
- "effective_few_shots": 5.0,
2665
- "num_truncated_few_shots": 0
2666
- },
2667
- "lighteval|mmlu:prehistory|5": {
2668
- "hashes": {
2669
- "hash_examples": "6cc50f032a19acaa",
2670
- "hash_full_prompts": "4a6a1d3ab1bf28e4",
2671
- "hash_input_tokens": "a39bd9702da6996f",
2672
- "hash_cont_tokens": "4e57a4b0d66f0736"
2673
- },
2674
- "truncated": 0,
2675
- "non_truncated": 324,
2676
- "padded": 1268,
2677
- "non_padded": 28,
2678
- "effective_few_shots": 5.0,
2679
- "num_truncated_few_shots": 0
2680
- },
2681
- "lighteval|mmlu:professional_accounting|5": {
2682
- "hashes": {
2683
- "hash_examples": "50f57ab32f5f6cea",
2684
- "hash_full_prompts": "e60129bd2d82ffc6",
2685
- "hash_input_tokens": "440a4a6e8c648413",
2686
- "hash_cont_tokens": "d01eb63ed6c749e0"
2687
- },
2688
- "truncated": 0,
2689
- "non_truncated": 282,
2690
- "padded": 1120,
2691
- "non_padded": 8,
2692
- "effective_few_shots": 5.0,
2693
- "num_truncated_few_shots": 0
2694
- },
2695
- "lighteval|mmlu:professional_law|5": {
2696
- "hashes": {
2697
- "hash_examples": "a8fdc85c64f4b215",
2698
- "hash_full_prompts": "0dbb1d9b72dcea03",
2699
- "hash_input_tokens": "135d3c6befbf7b8e",
2700
- "hash_cont_tokens": "04ed4a3308eb17e7"
2701
- },
2702
- "truncated": 0,
2703
- "non_truncated": 1534,
2704
- "padded": 6136,
2705
- "non_padded": 0,
2706
- "effective_few_shots": 5.0,
2707
- "num_truncated_few_shots": 0
2708
- },
2709
- "lighteval|mmlu:professional_medicine|5": {
2710
- "hashes": {
2711
- "hash_examples": "c373a28a3050a73a",
2712
- "hash_full_prompts": "5e040f9ca68b089e",
2713
- "hash_input_tokens": "bb56142819292718",
2714
- "hash_cont_tokens": "7d29b56ef2b26e26"
2715
- },
2716
- "truncated": 0,
2717
- "non_truncated": 272,
2718
- "padded": 1088,
2719
- "non_padded": 0,
2720
- "effective_few_shots": 5.0,
2721
- "num_truncated_few_shots": 0
2722
- },
2723
- "lighteval|mmlu:professional_psychology|5": {
2724
- "hashes": {
2725
- "hash_examples": "bf5254fe818356af",
2726
- "hash_full_prompts": "b386ecda8b87150e",
2727
- "hash_input_tokens": "69c9d6c20011b3c8",
2728
- "hash_cont_tokens": "05a71f9a9871b8f8"
2729
- },
2730
- "truncated": 0,
2731
- "non_truncated": 612,
2732
- "padded": 2448,
2733
- "non_padded": 0,
2734
- "effective_few_shots": 5.0,
2735
- "num_truncated_few_shots": 0
2736
- },
2737
- "lighteval|mmlu:public_relations|5": {
2738
- "hashes": {
2739
- "hash_examples": "b66d52e28e7d14e0",
2740
- "hash_full_prompts": "fe43562263e25677",
2741
- "hash_input_tokens": "a7ad0cf2c3f2b991",
2742
- "hash_cont_tokens": "54b160d9a82d8c27"
2743
- },
2744
- "truncated": 0,
2745
- "non_truncated": 110,
2746
- "padded": 440,
2747
- "non_padded": 0,
2748
- "effective_few_shots": 5.0,
2749
- "num_truncated_few_shots": 0
2750
- },
2751
- "lighteval|mmlu:security_studies|5": {
2752
- "hashes": {
2753
- "hash_examples": "514c14feaf000ad9",
2754
- "hash_full_prompts": "27d4a2ac541ef4b9",
2755
- "hash_input_tokens": "6cae3c98c2406b72",
2756
- "hash_cont_tokens": "a243156a0dfe5f0f"
2757
- },
2758
- "truncated": 0,
2759
- "non_truncated": 245,
2760
- "padded": 980,
2761
- "non_padded": 0,
2762
- "effective_few_shots": 5.0,
2763
- "num_truncated_few_shots": 0
2764
- },
2765
- "lighteval|mmlu:sociology|5": {
2766
- "hashes": {
2767
- "hash_examples": "f6c9bc9d18c80870",
2768
- "hash_full_prompts": "c072ea7d1a1524f2",
2769
- "hash_input_tokens": "73f8d9b6efbc989c",
2770
- "hash_cont_tokens": "bc76cf135cbda520"
2771
- },
2772
- "truncated": 0,
2773
- "non_truncated": 201,
2774
- "padded": 804,
2775
- "non_padded": 0,
2776
- "effective_few_shots": 5.0,
2777
- "num_truncated_few_shots": 0
2778
- },
2779
- "lighteval|mmlu:us_foreign_policy|5": {
2780
- "hashes": {
2781
- "hash_examples": "ed7b78629db6678f",
2782
- "hash_full_prompts": "341a97ca3e4d699d",
2783
- "hash_input_tokens": "3d5111b05caedd5c",
2784
- "hash_cont_tokens": "ca6635f013682116"
2785
- },
2786
- "truncated": 0,
2787
- "non_truncated": 100,
2788
- "padded": 397,
2789
- "non_padded": 3,
2790
- "effective_few_shots": 5.0,
2791
- "num_truncated_few_shots": 0
2792
- },
2793
- "lighteval|mmlu:virology|5": {
2794
- "hashes": {
2795
- "hash_examples": "bc52ffdc3f9b994a",
2796
- "hash_full_prompts": "651d471e2eb8b5e9",
2797
- "hash_input_tokens": "e451260208a4d06a",
2798
- "hash_cont_tokens": "3d8151c061cd8307"
2799
- },
2800
- "truncated": 0,
2801
- "non_truncated": 166,
2802
- "padded": 664,
2803
- "non_padded": 0,
2804
- "effective_few_shots": 5.0,
2805
- "num_truncated_few_shots": 0
2806
- },
2807
- "lighteval|mmlu:world_religions|5": {
2808
- "hashes": {
2809
- "hash_examples": "ecdb4a4f94f62930",
2810
- "hash_full_prompts": "3773f03542ce44a3",
2811
- "hash_input_tokens": "3cc09567a0d77652",
2812
- "hash_cont_tokens": "bcdec9004b1a5e7d"
2813
- },
2814
- "truncated": 0,
2815
- "non_truncated": 171,
2816
- "padded": 684,
2817
- "non_padded": 0,
2818
- "effective_few_shots": 5.0,
2819
- "num_truncated_few_shots": 0
2820
- }
2821
- },
2822
- "summary_general": {
2823
- "hashes": {
2824
- "hash_examples": "341a076d0beb7048",
2825
- "hash_full_prompts": "a5c8f2b7ff4f5ae2",
2826
- "hash_input_tokens": "aa0c1e704c27cb14",
2827
- "hash_cont_tokens": "c7e380df958b1906"
2828
- },
2829
- "truncated": 0,
2830
- "non_truncated": 14042,
2831
- "padded": 56074,
2832
- "non_padded": 94,
2833
- "num_truncated_few_shots": 0
2834
- }
2835
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/lewtun/gemma-7b-sft-full-longest-1k/main/mmlu/results_2024-02-28T16-09-28.251016.json DELETED
@@ -1,2835 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 445389.058996206,
9
- "end_time": 446312.18205434,
10
- "total_evaluation_time_secondes": "923.1230581340496",
11
- "model_name": "lewtun/gemma-7b-sft-full-longest-1k",
12
- "model_sha": "5b354486c4322c02994885f68f00d66a75275dd1",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "16.4 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|mmlu:abstract_algebra|5": {
19
- "acc": 0.22,
20
- "acc_stderr": 0.04163331998932268
21
- },
22
- "lighteval|mmlu:anatomy|5": {
23
- "acc": 0.18518518518518517,
24
- "acc_stderr": 0.03355677216313142
25
- },
26
- "lighteval|mmlu:astronomy|5": {
27
- "acc": 0.17763157894736842,
28
- "acc_stderr": 0.031103182383123398
29
- },
30
- "lighteval|mmlu:business_ethics|5": {
31
- "acc": 0.3,
32
- "acc_stderr": 0.046056618647183814
33
- },
34
- "lighteval|mmlu:clinical_knowledge|5": {
35
- "acc": 0.21509433962264152,
36
- "acc_stderr": 0.02528839450289137
37
- },
38
- "lighteval|mmlu:college_biology|5": {
39
- "acc": 0.2569444444444444,
40
- "acc_stderr": 0.03653946969442099
41
- },
42
- "lighteval|mmlu:college_chemistry|5": {
43
- "acc": 0.2,
44
- "acc_stderr": 0.04020151261036845
45
- },
46
- "lighteval|mmlu:college_computer_science|5": {
47
- "acc": 0.26,
48
- "acc_stderr": 0.0440844002276808
49
- },
50
- "lighteval|mmlu:college_mathematics|5": {
51
- "acc": 0.21,
52
- "acc_stderr": 0.040936018074033256
53
- },
54
- "lighteval|mmlu:college_medicine|5": {
55
- "acc": 0.20809248554913296,
56
- "acc_stderr": 0.030952890217749874
57
- },
58
- "lighteval|mmlu:college_physics|5": {
59
- "acc": 0.21568627450980393,
60
- "acc_stderr": 0.04092563958237654
61
- },
62
- "lighteval|mmlu:computer_security|5": {
63
- "acc": 0.28,
64
- "acc_stderr": 0.045126085985421276
65
- },
66
- "lighteval|mmlu:conceptual_physics|5": {
67
- "acc": 0.26382978723404255,
68
- "acc_stderr": 0.028809989854102973
69
- },
70
- "lighteval|mmlu:econometrics|5": {
71
- "acc": 0.23684210526315788,
72
- "acc_stderr": 0.039994238792813365
73
- },
74
- "lighteval|mmlu:electrical_engineering|5": {
75
- "acc": 0.2413793103448276,
76
- "acc_stderr": 0.03565998174135302
77
- },
78
- "lighteval|mmlu:elementary_mathematics|5": {
79
- "acc": 0.20899470899470898,
80
- "acc_stderr": 0.02094048156533486
81
- },
82
- "lighteval|mmlu:formal_logic|5": {
83
- "acc": 0.2857142857142857,
84
- "acc_stderr": 0.04040610178208841
85
- },
86
- "lighteval|mmlu:global_facts|5": {
87
- "acc": 0.18,
88
- "acc_stderr": 0.038612291966536934
89
- },
90
- "lighteval|mmlu:high_school_biology|5": {
91
- "acc": 0.1774193548387097,
92
- "acc_stderr": 0.02173254068932927
93
- },
94
- "lighteval|mmlu:high_school_chemistry|5": {
95
- "acc": 0.15270935960591134,
96
- "acc_stderr": 0.02530890453938063
97
- },
98
- "lighteval|mmlu:high_school_computer_science|5": {
99
- "acc": 0.25,
100
- "acc_stderr": 0.04351941398892446
101
- },
102
- "lighteval|mmlu:high_school_european_history|5": {
103
- "acc": 0.21818181818181817,
104
- "acc_stderr": 0.03225078108306289
105
- },
106
- "lighteval|mmlu:high_school_geography|5": {
107
- "acc": 0.17676767676767677,
108
- "acc_stderr": 0.027178752639044915
109
- },
110
- "lighteval|mmlu:high_school_government_and_politics|5": {
111
- "acc": 0.19689119170984457,
112
- "acc_stderr": 0.028697873971860664
113
- },
114
- "lighteval|mmlu:high_school_macroeconomics|5": {
115
- "acc": 0.20256410256410257,
116
- "acc_stderr": 0.020377660970371372
117
- },
118
- "lighteval|mmlu:high_school_mathematics|5": {
119
- "acc": 0.2111111111111111,
120
- "acc_stderr": 0.024882116857655075
121
- },
122
- "lighteval|mmlu:high_school_microeconomics|5": {
123
- "acc": 0.21008403361344538,
124
- "acc_stderr": 0.026461398717471874
125
- },
126
- "lighteval|mmlu:high_school_physics|5": {
127
- "acc": 0.1986754966887417,
128
- "acc_stderr": 0.03257847384436776
129
- },
130
- "lighteval|mmlu:high_school_psychology|5": {
131
- "acc": 0.1926605504587156,
132
- "acc_stderr": 0.016909276884936094
133
- },
134
- "lighteval|mmlu:high_school_statistics|5": {
135
- "acc": 0.1527777777777778,
136
- "acc_stderr": 0.024536326026134224
137
- },
138
- "lighteval|mmlu:high_school_us_history|5": {
139
- "acc": 0.25,
140
- "acc_stderr": 0.03039153369274154
141
- },
142
- "lighteval|mmlu:high_school_world_history|5": {
143
- "acc": 0.270042194092827,
144
- "acc_stderr": 0.028900721906293426
145
- },
146
- "lighteval|mmlu:human_aging|5": {
147
- "acc": 0.31390134529147984,
148
- "acc_stderr": 0.031146796482972465
149
- },
150
- "lighteval|mmlu:human_sexuality|5": {
151
- "acc": 0.2595419847328244,
152
- "acc_stderr": 0.03844876139785271
153
- },
154
- "lighteval|mmlu:international_law|5": {
155
- "acc": 0.2396694214876033,
156
- "acc_stderr": 0.03896878985070417
157
- },
158
- "lighteval|mmlu:jurisprudence|5": {
159
- "acc": 0.25925925925925924,
160
- "acc_stderr": 0.042365112580946336
161
- },
162
- "lighteval|mmlu:logical_fallacies|5": {
163
- "acc": 0.22085889570552147,
164
- "acc_stderr": 0.032591773927421776
165
- },
166
- "lighteval|mmlu:machine_learning|5": {
167
- "acc": 0.3125,
168
- "acc_stderr": 0.043994650575715215
169
- },
170
- "lighteval|mmlu:management|5": {
171
- "acc": 0.17475728155339806,
172
- "acc_stderr": 0.037601780060266224
173
- },
174
- "lighteval|mmlu:marketing|5": {
175
- "acc": 0.2905982905982906,
176
- "acc_stderr": 0.02974504857267404
177
- },
178
- "lighteval|mmlu:medical_genetics|5": {
179
- "acc": 0.3,
180
- "acc_stderr": 0.046056618647183814
181
- },
182
- "lighteval|mmlu:miscellaneous|5": {
183
- "acc": 0.23754789272030652,
184
- "acc_stderr": 0.015218733046150193
185
- },
186
- "lighteval|mmlu:moral_disputes|5": {
187
- "acc": 0.24855491329479767,
188
- "acc_stderr": 0.023267528432100174
189
- },
190
- "lighteval|mmlu:moral_scenarios|5": {
191
- "acc": 0.23798882681564246,
192
- "acc_stderr": 0.014242630070574915
193
- },
194
- "lighteval|mmlu:nutrition|5": {
195
- "acc": 0.22549019607843138,
196
- "acc_stderr": 0.023929155517351284
197
- },
198
- "lighteval|mmlu:philosophy|5": {
199
- "acc": 0.1864951768488746,
200
- "acc_stderr": 0.02212243977248077
201
- },
202
- "lighteval|mmlu:prehistory|5": {
203
- "acc": 0.21604938271604937,
204
- "acc_stderr": 0.022899162918445806
205
- },
206
- "lighteval|mmlu:professional_accounting|5": {
207
- "acc": 0.23404255319148937,
208
- "acc_stderr": 0.025257861359432417
209
- },
210
- "lighteval|mmlu:professional_law|5": {
211
- "acc": 0.2457627118644068,
212
- "acc_stderr": 0.010996156635142692
213
- },
214
- "lighteval|mmlu:professional_medicine|5": {
215
- "acc": 0.18382352941176472,
216
- "acc_stderr": 0.023529242185193106
217
- },
218
- "lighteval|mmlu:professional_psychology|5": {
219
- "acc": 0.25,
220
- "acc_stderr": 0.01751781884501444
221
- },
222
- "lighteval|mmlu:public_relations|5": {
223
- "acc": 0.21818181818181817,
224
- "acc_stderr": 0.03955932861795833
225
- },
226
- "lighteval|mmlu:security_studies|5": {
227
- "acc": 0.19183673469387755,
228
- "acc_stderr": 0.025206963154225395
229
- },
230
- "lighteval|mmlu:sociology|5": {
231
- "acc": 0.24378109452736318,
232
- "acc_stderr": 0.03036049015401465
233
- },
234
- "lighteval|mmlu:us_foreign_policy|5": {
235
- "acc": 0.28,
236
- "acc_stderr": 0.04512608598542128
237
- },
238
- "lighteval|mmlu:virology|5": {
239
- "acc": 0.28313253012048195,
240
- "acc_stderr": 0.03507295431370518
241
- },
242
- "lighteval|mmlu:world_religions|5": {
243
- "acc": 0.3216374269005848,
244
- "acc_stderr": 0.03582529442573122
245
- },
246
- "lighteval|mmlu:_average|5": {
247
- "acc": 0.2312401831441149,
248
- "acc_stderr": 0.031501830581055885
249
- }
250
- },
251
- "versions": {
252
- "lighteval|mmlu:abstract_algebra|5": 0,
253
- "lighteval|mmlu:anatomy|5": 0,
254
- "lighteval|mmlu:astronomy|5": 0,
255
- "lighteval|mmlu:business_ethics|5": 0,
256
- "lighteval|mmlu:clinical_knowledge|5": 0,
257
- "lighteval|mmlu:college_biology|5": 0,
258
- "lighteval|mmlu:college_chemistry|5": 0,
259
- "lighteval|mmlu:college_computer_science|5": 0,
260
- "lighteval|mmlu:college_mathematics|5": 0,
261
- "lighteval|mmlu:college_medicine|5": 0,
262
- "lighteval|mmlu:college_physics|5": 0,
263
- "lighteval|mmlu:computer_security|5": 0,
264
- "lighteval|mmlu:conceptual_physics|5": 0,
265
- "lighteval|mmlu:econometrics|5": 0,
266
- "lighteval|mmlu:electrical_engineering|5": 0,
267
- "lighteval|mmlu:elementary_mathematics|5": 0,
268
- "lighteval|mmlu:formal_logic|5": 0,
269
- "lighteval|mmlu:global_facts|5": 0,
270
- "lighteval|mmlu:high_school_biology|5": 0,
271
- "lighteval|mmlu:high_school_chemistry|5": 0,
272
- "lighteval|mmlu:high_school_computer_science|5": 0,
273
- "lighteval|mmlu:high_school_european_history|5": 0,
274
- "lighteval|mmlu:high_school_geography|5": 0,
275
- "lighteval|mmlu:high_school_government_and_politics|5": 0,
276
- "lighteval|mmlu:high_school_macroeconomics|5": 0,
277
- "lighteval|mmlu:high_school_mathematics|5": 0,
278
- "lighteval|mmlu:high_school_microeconomics|5": 0,
279
- "lighteval|mmlu:high_school_physics|5": 0,
280
- "lighteval|mmlu:high_school_psychology|5": 0,
281
- "lighteval|mmlu:high_school_statistics|5": 0,
282
- "lighteval|mmlu:high_school_us_history|5": 0,
283
- "lighteval|mmlu:high_school_world_history|5": 0,
284
- "lighteval|mmlu:human_aging|5": 0,
285
- "lighteval|mmlu:human_sexuality|5": 0,
286
- "lighteval|mmlu:international_law|5": 0,
287
- "lighteval|mmlu:jurisprudence|5": 0,
288
- "lighteval|mmlu:logical_fallacies|5": 0,
289
- "lighteval|mmlu:machine_learning|5": 0,
290
- "lighteval|mmlu:management|5": 0,
291
- "lighteval|mmlu:marketing|5": 0,
292
- "lighteval|mmlu:medical_genetics|5": 0,
293
- "lighteval|mmlu:miscellaneous|5": 0,
294
- "lighteval|mmlu:moral_disputes|5": 0,
295
- "lighteval|mmlu:moral_scenarios|5": 0,
296
- "lighteval|mmlu:nutrition|5": 0,
297
- "lighteval|mmlu:philosophy|5": 0,
298
- "lighteval|mmlu:prehistory|5": 0,
299
- "lighteval|mmlu:professional_accounting|5": 0,
300
- "lighteval|mmlu:professional_law|5": 0,
301
- "lighteval|mmlu:professional_medicine|5": 0,
302
- "lighteval|mmlu:professional_psychology|5": 0,
303
- "lighteval|mmlu:public_relations|5": 0,
304
- "lighteval|mmlu:security_studies|5": 0,
305
- "lighteval|mmlu:sociology|5": 0,
306
- "lighteval|mmlu:us_foreign_policy|5": 0,
307
- "lighteval|mmlu:virology|5": 0,
308
- "lighteval|mmlu:world_religions|5": 0
309
- },
310
- "config_tasks": {
311
- "lighteval|mmlu:abstract_algebra": {
312
- "name": "mmlu:abstract_algebra",
313
- "prompt_function": "mmlu_harness",
314
- "hf_repo": "lighteval/mmlu",
315
- "hf_subset": "abstract_algebra",
316
- "metric": [
317
- "loglikelihood_acc"
318
- ],
319
- "hf_avail_splits": [
320
- "auxiliary_train",
321
- "test",
322
- "validation",
323
- "dev"
324
- ],
325
- "evaluation_splits": [
326
- "test"
327
- ],
328
- "few_shots_split": "dev",
329
- "few_shots_select": "sequential",
330
- "generation_size": 1,
331
- "stop_sequence": [
332
- "\n"
333
- ],
334
- "output_regex": null,
335
- "frozen": false,
336
- "suite": [
337
- "lighteval",
338
- "mmlu"
339
- ]
340
- },
341
- "lighteval|mmlu:anatomy": {
342
- "name": "mmlu:anatomy",
343
- "prompt_function": "mmlu_harness",
344
- "hf_repo": "lighteval/mmlu",
345
- "hf_subset": "anatomy",
346
- "metric": [
347
- "loglikelihood_acc"
348
- ],
349
- "hf_avail_splits": [
350
- "auxiliary_train",
351
- "test",
352
- "validation",
353
- "dev"
354
- ],
355
- "evaluation_splits": [
356
- "test"
357
- ],
358
- "few_shots_split": "dev",
359
- "few_shots_select": "sequential",
360
- "generation_size": 1,
361
- "stop_sequence": [
362
- "\n"
363
- ],
364
- "output_regex": null,
365
- "frozen": false,
366
- "suite": [
367
- "lighteval",
368
- "mmlu"
369
- ]
370
- },
371
- "lighteval|mmlu:astronomy": {
372
- "name": "mmlu:astronomy",
373
- "prompt_function": "mmlu_harness",
374
- "hf_repo": "lighteval/mmlu",
375
- "hf_subset": "astronomy",
376
- "metric": [
377
- "loglikelihood_acc"
378
- ],
379
- "hf_avail_splits": [
380
- "auxiliary_train",
381
- "test",
382
- "validation",
383
- "dev"
384
- ],
385
- "evaluation_splits": [
386
- "test"
387
- ],
388
- "few_shots_split": "dev",
389
- "few_shots_select": "sequential",
390
- "generation_size": 1,
391
- "stop_sequence": [
392
- "\n"
393
- ],
394
- "output_regex": null,
395
- "frozen": false,
396
- "suite": [
397
- "lighteval",
398
- "mmlu"
399
- ]
400
- },
401
- "lighteval|mmlu:business_ethics": {
402
- "name": "mmlu:business_ethics",
403
- "prompt_function": "mmlu_harness",
404
- "hf_repo": "lighteval/mmlu",
405
- "hf_subset": "business_ethics",
406
- "metric": [
407
- "loglikelihood_acc"
408
- ],
409
- "hf_avail_splits": [
410
- "auxiliary_train",
411
- "test",
412
- "validation",
413
- "dev"
414
- ],
415
- "evaluation_splits": [
416
- "test"
417
- ],
418
- "few_shots_split": "dev",
419
- "few_shots_select": "sequential",
420
- "generation_size": 1,
421
- "stop_sequence": [
422
- "\n"
423
- ],
424
- "output_regex": null,
425
- "frozen": false,
426
- "suite": [
427
- "lighteval",
428
- "mmlu"
429
- ]
430
- },
431
- "lighteval|mmlu:clinical_knowledge": {
432
- "name": "mmlu:clinical_knowledge",
433
- "prompt_function": "mmlu_harness",
434
- "hf_repo": "lighteval/mmlu",
435
- "hf_subset": "clinical_knowledge",
436
- "metric": [
437
- "loglikelihood_acc"
438
- ],
439
- "hf_avail_splits": [
440
- "auxiliary_train",
441
- "test",
442
- "validation",
443
- "dev"
444
- ],
445
- "evaluation_splits": [
446
- "test"
447
- ],
448
- "few_shots_split": "dev",
449
- "few_shots_select": "sequential",
450
- "generation_size": 1,
451
- "stop_sequence": [
452
- "\n"
453
- ],
454
- "output_regex": null,
455
- "frozen": false,
456
- "suite": [
457
- "lighteval",
458
- "mmlu"
459
- ]
460
- },
461
- "lighteval|mmlu:college_biology": {
462
- "name": "mmlu:college_biology",
463
- "prompt_function": "mmlu_harness",
464
- "hf_repo": "lighteval/mmlu",
465
- "hf_subset": "college_biology",
466
- "metric": [
467
- "loglikelihood_acc"
468
- ],
469
- "hf_avail_splits": [
470
- "auxiliary_train",
471
- "test",
472
- "validation",
473
- "dev"
474
- ],
475
- "evaluation_splits": [
476
- "test"
477
- ],
478
- "few_shots_split": "dev",
479
- "few_shots_select": "sequential",
480
- "generation_size": 1,
481
- "stop_sequence": [
482
- "\n"
483
- ],
484
- "output_regex": null,
485
- "frozen": false,
486
- "suite": [
487
- "lighteval",
488
- "mmlu"
489
- ]
490
- },
491
- "lighteval|mmlu:college_chemistry": {
492
- "name": "mmlu:college_chemistry",
493
- "prompt_function": "mmlu_harness",
494
- "hf_repo": "lighteval/mmlu",
495
- "hf_subset": "college_chemistry",
496
- "metric": [
497
- "loglikelihood_acc"
498
- ],
499
- "hf_avail_splits": [
500
- "auxiliary_train",
501
- "test",
502
- "validation",
503
- "dev"
504
- ],
505
- "evaluation_splits": [
506
- "test"
507
- ],
508
- "few_shots_split": "dev",
509
- "few_shots_select": "sequential",
510
- "generation_size": 1,
511
- "stop_sequence": [
512
- "\n"
513
- ],
514
- "output_regex": null,
515
- "frozen": false,
516
- "suite": [
517
- "lighteval",
518
- "mmlu"
519
- ]
520
- },
521
- "lighteval|mmlu:college_computer_science": {
522
- "name": "mmlu:college_computer_science",
523
- "prompt_function": "mmlu_harness",
524
- "hf_repo": "lighteval/mmlu",
525
- "hf_subset": "college_computer_science",
526
- "metric": [
527
- "loglikelihood_acc"
528
- ],
529
- "hf_avail_splits": [
530
- "auxiliary_train",
531
- "test",
532
- "validation",
533
- "dev"
534
- ],
535
- "evaluation_splits": [
536
- "test"
537
- ],
538
- "few_shots_split": "dev",
539
- "few_shots_select": "sequential",
540
- "generation_size": 1,
541
- "stop_sequence": [
542
- "\n"
543
- ],
544
- "output_regex": null,
545
- "frozen": false,
546
- "suite": [
547
- "lighteval",
548
- "mmlu"
549
- ]
550
- },
551
- "lighteval|mmlu:college_mathematics": {
552
- "name": "mmlu:college_mathematics",
553
- "prompt_function": "mmlu_harness",
554
- "hf_repo": "lighteval/mmlu",
555
- "hf_subset": "college_mathematics",
556
- "metric": [
557
- "loglikelihood_acc"
558
- ],
559
- "hf_avail_splits": [
560
- "auxiliary_train",
561
- "test",
562
- "validation",
563
- "dev"
564
- ],
565
- "evaluation_splits": [
566
- "test"
567
- ],
568
- "few_shots_split": "dev",
569
- "few_shots_select": "sequential",
570
- "generation_size": 1,
571
- "stop_sequence": [
572
- "\n"
573
- ],
574
- "output_regex": null,
575
- "frozen": false,
576
- "suite": [
577
- "lighteval",
578
- "mmlu"
579
- ]
580
- },
581
- "lighteval|mmlu:college_medicine": {
582
- "name": "mmlu:college_medicine",
583
- "prompt_function": "mmlu_harness",
584
- "hf_repo": "lighteval/mmlu",
585
- "hf_subset": "college_medicine",
586
- "metric": [
587
- "loglikelihood_acc"
588
- ],
589
- "hf_avail_splits": [
590
- "auxiliary_train",
591
- "test",
592
- "validation",
593
- "dev"
594
- ],
595
- "evaluation_splits": [
596
- "test"
597
- ],
598
- "few_shots_split": "dev",
599
- "few_shots_select": "sequential",
600
- "generation_size": 1,
601
- "stop_sequence": [
602
- "\n"
603
- ],
604
- "output_regex": null,
605
- "frozen": false,
606
- "suite": [
607
- "lighteval",
608
- "mmlu"
609
- ]
610
- },
611
- "lighteval|mmlu:college_physics": {
612
- "name": "mmlu:college_physics",
613
- "prompt_function": "mmlu_harness",
614
- "hf_repo": "lighteval/mmlu",
615
- "hf_subset": "college_physics",
616
- "metric": [
617
- "loglikelihood_acc"
618
- ],
619
- "hf_avail_splits": [
620
- "auxiliary_train",
621
- "test",
622
- "validation",
623
- "dev"
624
- ],
625
- "evaluation_splits": [
626
- "test"
627
- ],
628
- "few_shots_split": "dev",
629
- "few_shots_select": "sequential",
630
- "generation_size": 1,
631
- "stop_sequence": [
632
- "\n"
633
- ],
634
- "output_regex": null,
635
- "frozen": false,
636
- "suite": [
637
- "lighteval",
638
- "mmlu"
639
- ]
640
- },
641
- "lighteval|mmlu:computer_security": {
642
- "name": "mmlu:computer_security",
643
- "prompt_function": "mmlu_harness",
644
- "hf_repo": "lighteval/mmlu",
645
- "hf_subset": "computer_security",
646
- "metric": [
647
- "loglikelihood_acc"
648
- ],
649
- "hf_avail_splits": [
650
- "auxiliary_train",
651
- "test",
652
- "validation",
653
- "dev"
654
- ],
655
- "evaluation_splits": [
656
- "test"
657
- ],
658
- "few_shots_split": "dev",
659
- "few_shots_select": "sequential",
660
- "generation_size": 1,
661
- "stop_sequence": [
662
- "\n"
663
- ],
664
- "output_regex": null,
665
- "frozen": false,
666
- "suite": [
667
- "lighteval",
668
- "mmlu"
669
- ]
670
- },
671
- "lighteval|mmlu:conceptual_physics": {
672
- "name": "mmlu:conceptual_physics",
673
- "prompt_function": "mmlu_harness",
674
- "hf_repo": "lighteval/mmlu",
675
- "hf_subset": "conceptual_physics",
676
- "metric": [
677
- "loglikelihood_acc"
678
- ],
679
- "hf_avail_splits": [
680
- "auxiliary_train",
681
- "test",
682
- "validation",
683
- "dev"
684
- ],
685
- "evaluation_splits": [
686
- "test"
687
- ],
688
- "few_shots_split": "dev",
689
- "few_shots_select": "sequential",
690
- "generation_size": 1,
691
- "stop_sequence": [
692
- "\n"
693
- ],
694
- "output_regex": null,
695
- "frozen": false,
696
- "suite": [
697
- "lighteval",
698
- "mmlu"
699
- ]
700
- },
701
- "lighteval|mmlu:econometrics": {
702
- "name": "mmlu:econometrics",
703
- "prompt_function": "mmlu_harness",
704
- "hf_repo": "lighteval/mmlu",
705
- "hf_subset": "econometrics",
706
- "metric": [
707
- "loglikelihood_acc"
708
- ],
709
- "hf_avail_splits": [
710
- "auxiliary_train",
711
- "test",
712
- "validation",
713
- "dev"
714
- ],
715
- "evaluation_splits": [
716
- "test"
717
- ],
718
- "few_shots_split": "dev",
719
- "few_shots_select": "sequential",
720
- "generation_size": 1,
721
- "stop_sequence": [
722
- "\n"
723
- ],
724
- "output_regex": null,
725
- "frozen": false,
726
- "suite": [
727
- "lighteval",
728
- "mmlu"
729
- ]
730
- },
731
- "lighteval|mmlu:electrical_engineering": {
732
- "name": "mmlu:electrical_engineering",
733
- "prompt_function": "mmlu_harness",
734
- "hf_repo": "lighteval/mmlu",
735
- "hf_subset": "electrical_engineering",
736
- "metric": [
737
- "loglikelihood_acc"
738
- ],
739
- "hf_avail_splits": [
740
- "auxiliary_train",
741
- "test",
742
- "validation",
743
- "dev"
744
- ],
745
- "evaluation_splits": [
746
- "test"
747
- ],
748
- "few_shots_split": "dev",
749
- "few_shots_select": "sequential",
750
- "generation_size": 1,
751
- "stop_sequence": [
752
- "\n"
753
- ],
754
- "output_regex": null,
755
- "frozen": false,
756
- "suite": [
757
- "lighteval",
758
- "mmlu"
759
- ]
760
- },
761
- "lighteval|mmlu:elementary_mathematics": {
762
- "name": "mmlu:elementary_mathematics",
763
- "prompt_function": "mmlu_harness",
764
- "hf_repo": "lighteval/mmlu",
765
- "hf_subset": "elementary_mathematics",
766
- "metric": [
767
- "loglikelihood_acc"
768
- ],
769
- "hf_avail_splits": [
770
- "auxiliary_train",
771
- "test",
772
- "validation",
773
- "dev"
774
- ],
775
- "evaluation_splits": [
776
- "test"
777
- ],
778
- "few_shots_split": "dev",
779
- "few_shots_select": "sequential",
780
- "generation_size": 1,
781
- "stop_sequence": [
782
- "\n"
783
- ],
784
- "output_regex": null,
785
- "frozen": false,
786
- "suite": [
787
- "lighteval",
788
- "mmlu"
789
- ]
790
- },
791
- "lighteval|mmlu:formal_logic": {
792
- "name": "mmlu:formal_logic",
793
- "prompt_function": "mmlu_harness",
794
- "hf_repo": "lighteval/mmlu",
795
- "hf_subset": "formal_logic",
796
- "metric": [
797
- "loglikelihood_acc"
798
- ],
799
- "hf_avail_splits": [
800
- "auxiliary_train",
801
- "test",
802
- "validation",
803
- "dev"
804
- ],
805
- "evaluation_splits": [
806
- "test"
807
- ],
808
- "few_shots_split": "dev",
809
- "few_shots_select": "sequential",
810
- "generation_size": 1,
811
- "stop_sequence": [
812
- "\n"
813
- ],
814
- "output_regex": null,
815
- "frozen": false,
816
- "suite": [
817
- "lighteval",
818
- "mmlu"
819
- ]
820
- },
821
- "lighteval|mmlu:global_facts": {
822
- "name": "mmlu:global_facts",
823
- "prompt_function": "mmlu_harness",
824
- "hf_repo": "lighteval/mmlu",
825
- "hf_subset": "global_facts",
826
- "metric": [
827
- "loglikelihood_acc"
828
- ],
829
- "hf_avail_splits": [
830
- "auxiliary_train",
831
- "test",
832
- "validation",
833
- "dev"
834
- ],
835
- "evaluation_splits": [
836
- "test"
837
- ],
838
- "few_shots_split": "dev",
839
- "few_shots_select": "sequential",
840
- "generation_size": 1,
841
- "stop_sequence": [
842
- "\n"
843
- ],
844
- "output_regex": null,
845
- "frozen": false,
846
- "suite": [
847
- "lighteval",
848
- "mmlu"
849
- ]
850
- },
851
- "lighteval|mmlu:high_school_biology": {
852
- "name": "mmlu:high_school_biology",
853
- "prompt_function": "mmlu_harness",
854
- "hf_repo": "lighteval/mmlu",
855
- "hf_subset": "high_school_biology",
856
- "metric": [
857
- "loglikelihood_acc"
858
- ],
859
- "hf_avail_splits": [
860
- "auxiliary_train",
861
- "test",
862
- "validation",
863
- "dev"
864
- ],
865
- "evaluation_splits": [
866
- "test"
867
- ],
868
- "few_shots_split": "dev",
869
- "few_shots_select": "sequential",
870
- "generation_size": 1,
871
- "stop_sequence": [
872
- "\n"
873
- ],
874
- "output_regex": null,
875
- "frozen": false,
876
- "suite": [
877
- "lighteval",
878
- "mmlu"
879
- ]
880
- },
881
- "lighteval|mmlu:high_school_chemistry": {
882
- "name": "mmlu:high_school_chemistry",
883
- "prompt_function": "mmlu_harness",
884
- "hf_repo": "lighteval/mmlu",
885
- "hf_subset": "high_school_chemistry",
886
- "metric": [
887
- "loglikelihood_acc"
888
- ],
889
- "hf_avail_splits": [
890
- "auxiliary_train",
891
- "test",
892
- "validation",
893
- "dev"
894
- ],
895
- "evaluation_splits": [
896
- "test"
897
- ],
898
- "few_shots_split": "dev",
899
- "few_shots_select": "sequential",
900
- "generation_size": 1,
901
- "stop_sequence": [
902
- "\n"
903
- ],
904
- "output_regex": null,
905
- "frozen": false,
906
- "suite": [
907
- "lighteval",
908
- "mmlu"
909
- ]
910
- },
911
- "lighteval|mmlu:high_school_computer_science": {
912
- "name": "mmlu:high_school_computer_science",
913
- "prompt_function": "mmlu_harness",
914
- "hf_repo": "lighteval/mmlu",
915
- "hf_subset": "high_school_computer_science",
916
- "metric": [
917
- "loglikelihood_acc"
918
- ],
919
- "hf_avail_splits": [
920
- "auxiliary_train",
921
- "test",
922
- "validation",
923
- "dev"
924
- ],
925
- "evaluation_splits": [
926
- "test"
927
- ],
928
- "few_shots_split": "dev",
929
- "few_shots_select": "sequential",
930
- "generation_size": 1,
931
- "stop_sequence": [
932
- "\n"
933
- ],
934
- "output_regex": null,
935
- "frozen": false,
936
- "suite": [
937
- "lighteval",
938
- "mmlu"
939
- ]
940
- },
941
- "lighteval|mmlu:high_school_european_history": {
942
- "name": "mmlu:high_school_european_history",
943
- "prompt_function": "mmlu_harness",
944
- "hf_repo": "lighteval/mmlu",
945
- "hf_subset": "high_school_european_history",
946
- "metric": [
947
- "loglikelihood_acc"
948
- ],
949
- "hf_avail_splits": [
950
- "auxiliary_train",
951
- "test",
952
- "validation",
953
- "dev"
954
- ],
955
- "evaluation_splits": [
956
- "test"
957
- ],
958
- "few_shots_split": "dev",
959
- "few_shots_select": "sequential",
960
- "generation_size": 1,
961
- "stop_sequence": [
962
- "\n"
963
- ],
964
- "output_regex": null,
965
- "frozen": false,
966
- "suite": [
967
- "lighteval",
968
- "mmlu"
969
- ]
970
- },
971
- "lighteval|mmlu:high_school_geography": {
972
- "name": "mmlu:high_school_geography",
973
- "prompt_function": "mmlu_harness",
974
- "hf_repo": "lighteval/mmlu",
975
- "hf_subset": "high_school_geography",
976
- "metric": [
977
- "loglikelihood_acc"
978
- ],
979
- "hf_avail_splits": [
980
- "auxiliary_train",
981
- "test",
982
- "validation",
983
- "dev"
984
- ],
985
- "evaluation_splits": [
986
- "test"
987
- ],
988
- "few_shots_split": "dev",
989
- "few_shots_select": "sequential",
990
- "generation_size": 1,
991
- "stop_sequence": [
992
- "\n"
993
- ],
994
- "output_regex": null,
995
- "frozen": false,
996
- "suite": [
997
- "lighteval",
998
- "mmlu"
999
- ]
1000
- },
1001
- "lighteval|mmlu:high_school_government_and_politics": {
1002
- "name": "mmlu:high_school_government_and_politics",
1003
- "prompt_function": "mmlu_harness",
1004
- "hf_repo": "lighteval/mmlu",
1005
- "hf_subset": "high_school_government_and_politics",
1006
- "metric": [
1007
- "loglikelihood_acc"
1008
- ],
1009
- "hf_avail_splits": [
1010
- "auxiliary_train",
1011
- "test",
1012
- "validation",
1013
- "dev"
1014
- ],
1015
- "evaluation_splits": [
1016
- "test"
1017
- ],
1018
- "few_shots_split": "dev",
1019
- "few_shots_select": "sequential",
1020
- "generation_size": 1,
1021
- "stop_sequence": [
1022
- "\n"
1023
- ],
1024
- "output_regex": null,
1025
- "frozen": false,
1026
- "suite": [
1027
- "lighteval",
1028
- "mmlu"
1029
- ]
1030
- },
1031
- "lighteval|mmlu:high_school_macroeconomics": {
1032
- "name": "mmlu:high_school_macroeconomics",
1033
- "prompt_function": "mmlu_harness",
1034
- "hf_repo": "lighteval/mmlu",
1035
- "hf_subset": "high_school_macroeconomics",
1036
- "metric": [
1037
- "loglikelihood_acc"
1038
- ],
1039
- "hf_avail_splits": [
1040
- "auxiliary_train",
1041
- "test",
1042
- "validation",
1043
- "dev"
1044
- ],
1045
- "evaluation_splits": [
1046
- "test"
1047
- ],
1048
- "few_shots_split": "dev",
1049
- "few_shots_select": "sequential",
1050
- "generation_size": 1,
1051
- "stop_sequence": [
1052
- "\n"
1053
- ],
1054
- "output_regex": null,
1055
- "frozen": false,
1056
- "suite": [
1057
- "lighteval",
1058
- "mmlu"
1059
- ]
1060
- },
1061
- "lighteval|mmlu:high_school_mathematics": {
1062
- "name": "mmlu:high_school_mathematics",
1063
- "prompt_function": "mmlu_harness",
1064
- "hf_repo": "lighteval/mmlu",
1065
- "hf_subset": "high_school_mathematics",
1066
- "metric": [
1067
- "loglikelihood_acc"
1068
- ],
1069
- "hf_avail_splits": [
1070
- "auxiliary_train",
1071
- "test",
1072
- "validation",
1073
- "dev"
1074
- ],
1075
- "evaluation_splits": [
1076
- "test"
1077
- ],
1078
- "few_shots_split": "dev",
1079
- "few_shots_select": "sequential",
1080
- "generation_size": 1,
1081
- "stop_sequence": [
1082
- "\n"
1083
- ],
1084
- "output_regex": null,
1085
- "frozen": false,
1086
- "suite": [
1087
- "lighteval",
1088
- "mmlu"
1089
- ]
1090
- },
1091
- "lighteval|mmlu:high_school_microeconomics": {
1092
- "name": "mmlu:high_school_microeconomics",
1093
- "prompt_function": "mmlu_harness",
1094
- "hf_repo": "lighteval/mmlu",
1095
- "hf_subset": "high_school_microeconomics",
1096
- "metric": [
1097
- "loglikelihood_acc"
1098
- ],
1099
- "hf_avail_splits": [
1100
- "auxiliary_train",
1101
- "test",
1102
- "validation",
1103
- "dev"
1104
- ],
1105
- "evaluation_splits": [
1106
- "test"
1107
- ],
1108
- "few_shots_split": "dev",
1109
- "few_shots_select": "sequential",
1110
- "generation_size": 1,
1111
- "stop_sequence": [
1112
- "\n"
1113
- ],
1114
- "output_regex": null,
1115
- "frozen": false,
1116
- "suite": [
1117
- "lighteval",
1118
- "mmlu"
1119
- ]
1120
- },
1121
- "lighteval|mmlu:high_school_physics": {
1122
- "name": "mmlu:high_school_physics",
1123
- "prompt_function": "mmlu_harness",
1124
- "hf_repo": "lighteval/mmlu",
1125
- "hf_subset": "high_school_physics",
1126
- "metric": [
1127
- "loglikelihood_acc"
1128
- ],
1129
- "hf_avail_splits": [
1130
- "auxiliary_train",
1131
- "test",
1132
- "validation",
1133
- "dev"
1134
- ],
1135
- "evaluation_splits": [
1136
- "test"
1137
- ],
1138
- "few_shots_split": "dev",
1139
- "few_shots_select": "sequential",
1140
- "generation_size": 1,
1141
- "stop_sequence": [
1142
- "\n"
1143
- ],
1144
- "output_regex": null,
1145
- "frozen": false,
1146
- "suite": [
1147
- "lighteval",
1148
- "mmlu"
1149
- ]
1150
- },
1151
- "lighteval|mmlu:high_school_psychology": {
1152
- "name": "mmlu:high_school_psychology",
1153
- "prompt_function": "mmlu_harness",
1154
- "hf_repo": "lighteval/mmlu",
1155
- "hf_subset": "high_school_psychology",
1156
- "metric": [
1157
- "loglikelihood_acc"
1158
- ],
1159
- "hf_avail_splits": [
1160
- "auxiliary_train",
1161
- "test",
1162
- "validation",
1163
- "dev"
1164
- ],
1165
- "evaluation_splits": [
1166
- "test"
1167
- ],
1168
- "few_shots_split": "dev",
1169
- "few_shots_select": "sequential",
1170
- "generation_size": 1,
1171
- "stop_sequence": [
1172
- "\n"
1173
- ],
1174
- "output_regex": null,
1175
- "frozen": false,
1176
- "suite": [
1177
- "lighteval",
1178
- "mmlu"
1179
- ]
1180
- },
1181
- "lighteval|mmlu:high_school_statistics": {
1182
- "name": "mmlu:high_school_statistics",
1183
- "prompt_function": "mmlu_harness",
1184
- "hf_repo": "lighteval/mmlu",
1185
- "hf_subset": "high_school_statistics",
1186
- "metric": [
1187
- "loglikelihood_acc"
1188
- ],
1189
- "hf_avail_splits": [
1190
- "auxiliary_train",
1191
- "test",
1192
- "validation",
1193
- "dev"
1194
- ],
1195
- "evaluation_splits": [
1196
- "test"
1197
- ],
1198
- "few_shots_split": "dev",
1199
- "few_shots_select": "sequential",
1200
- "generation_size": 1,
1201
- "stop_sequence": [
1202
- "\n"
1203
- ],
1204
- "output_regex": null,
1205
- "frozen": false,
1206
- "suite": [
1207
- "lighteval",
1208
- "mmlu"
1209
- ]
1210
- },
1211
- "lighteval|mmlu:high_school_us_history": {
1212
- "name": "mmlu:high_school_us_history",
1213
- "prompt_function": "mmlu_harness",
1214
- "hf_repo": "lighteval/mmlu",
1215
- "hf_subset": "high_school_us_history",
1216
- "metric": [
1217
- "loglikelihood_acc"
1218
- ],
1219
- "hf_avail_splits": [
1220
- "auxiliary_train",
1221
- "test",
1222
- "validation",
1223
- "dev"
1224
- ],
1225
- "evaluation_splits": [
1226
- "test"
1227
- ],
1228
- "few_shots_split": "dev",
1229
- "few_shots_select": "sequential",
1230
- "generation_size": 1,
1231
- "stop_sequence": [
1232
- "\n"
1233
- ],
1234
- "output_regex": null,
1235
- "frozen": false,
1236
- "suite": [
1237
- "lighteval",
1238
- "mmlu"
1239
- ]
1240
- },
1241
- "lighteval|mmlu:high_school_world_history": {
1242
- "name": "mmlu:high_school_world_history",
1243
- "prompt_function": "mmlu_harness",
1244
- "hf_repo": "lighteval/mmlu",
1245
- "hf_subset": "high_school_world_history",
1246
- "metric": [
1247
- "loglikelihood_acc"
1248
- ],
1249
- "hf_avail_splits": [
1250
- "auxiliary_train",
1251
- "test",
1252
- "validation",
1253
- "dev"
1254
- ],
1255
- "evaluation_splits": [
1256
- "test"
1257
- ],
1258
- "few_shots_split": "dev",
1259
- "few_shots_select": "sequential",
1260
- "generation_size": 1,
1261
- "stop_sequence": [
1262
- "\n"
1263
- ],
1264
- "output_regex": null,
1265
- "frozen": false,
1266
- "suite": [
1267
- "lighteval",
1268
- "mmlu"
1269
- ]
1270
- },
1271
- "lighteval|mmlu:human_aging": {
1272
- "name": "mmlu:human_aging",
1273
- "prompt_function": "mmlu_harness",
1274
- "hf_repo": "lighteval/mmlu",
1275
- "hf_subset": "human_aging",
1276
- "metric": [
1277
- "loglikelihood_acc"
1278
- ],
1279
- "hf_avail_splits": [
1280
- "auxiliary_train",
1281
- "test",
1282
- "validation",
1283
- "dev"
1284
- ],
1285
- "evaluation_splits": [
1286
- "test"
1287
- ],
1288
- "few_shots_split": "dev",
1289
- "few_shots_select": "sequential",
1290
- "generation_size": 1,
1291
- "stop_sequence": [
1292
- "\n"
1293
- ],
1294
- "output_regex": null,
1295
- "frozen": false,
1296
- "suite": [
1297
- "lighteval",
1298
- "mmlu"
1299
- ]
1300
- },
1301
- "lighteval|mmlu:human_sexuality": {
1302
- "name": "mmlu:human_sexuality",
1303
- "prompt_function": "mmlu_harness",
1304
- "hf_repo": "lighteval/mmlu",
1305
- "hf_subset": "human_sexuality",
1306
- "metric": [
1307
- "loglikelihood_acc"
1308
- ],
1309
- "hf_avail_splits": [
1310
- "auxiliary_train",
1311
- "test",
1312
- "validation",
1313
- "dev"
1314
- ],
1315
- "evaluation_splits": [
1316
- "test"
1317
- ],
1318
- "few_shots_split": "dev",
1319
- "few_shots_select": "sequential",
1320
- "generation_size": 1,
1321
- "stop_sequence": [
1322
- "\n"
1323
- ],
1324
- "output_regex": null,
1325
- "frozen": false,
1326
- "suite": [
1327
- "lighteval",
1328
- "mmlu"
1329
- ]
1330
- },
1331
- "lighteval|mmlu:international_law": {
1332
- "name": "mmlu:international_law",
1333
- "prompt_function": "mmlu_harness",
1334
- "hf_repo": "lighteval/mmlu",
1335
- "hf_subset": "international_law",
1336
- "metric": [
1337
- "loglikelihood_acc"
1338
- ],
1339
- "hf_avail_splits": [
1340
- "auxiliary_train",
1341
- "test",
1342
- "validation",
1343
- "dev"
1344
- ],
1345
- "evaluation_splits": [
1346
- "test"
1347
- ],
1348
- "few_shots_split": "dev",
1349
- "few_shots_select": "sequential",
1350
- "generation_size": 1,
1351
- "stop_sequence": [
1352
- "\n"
1353
- ],
1354
- "output_regex": null,
1355
- "frozen": false,
1356
- "suite": [
1357
- "lighteval",
1358
- "mmlu"
1359
- ]
1360
- },
1361
- "lighteval|mmlu:jurisprudence": {
1362
- "name": "mmlu:jurisprudence",
1363
- "prompt_function": "mmlu_harness",
1364
- "hf_repo": "lighteval/mmlu",
1365
- "hf_subset": "jurisprudence",
1366
- "metric": [
1367
- "loglikelihood_acc"
1368
- ],
1369
- "hf_avail_splits": [
1370
- "auxiliary_train",
1371
- "test",
1372
- "validation",
1373
- "dev"
1374
- ],
1375
- "evaluation_splits": [
1376
- "test"
1377
- ],
1378
- "few_shots_split": "dev",
1379
- "few_shots_select": "sequential",
1380
- "generation_size": 1,
1381
- "stop_sequence": [
1382
- "\n"
1383
- ],
1384
- "output_regex": null,
1385
- "frozen": false,
1386
- "suite": [
1387
- "lighteval",
1388
- "mmlu"
1389
- ]
1390
- },
1391
- "lighteval|mmlu:logical_fallacies": {
1392
- "name": "mmlu:logical_fallacies",
1393
- "prompt_function": "mmlu_harness",
1394
- "hf_repo": "lighteval/mmlu",
1395
- "hf_subset": "logical_fallacies",
1396
- "metric": [
1397
- "loglikelihood_acc"
1398
- ],
1399
- "hf_avail_splits": [
1400
- "auxiliary_train",
1401
- "test",
1402
- "validation",
1403
- "dev"
1404
- ],
1405
- "evaluation_splits": [
1406
- "test"
1407
- ],
1408
- "few_shots_split": "dev",
1409
- "few_shots_select": "sequential",
1410
- "generation_size": 1,
1411
- "stop_sequence": [
1412
- "\n"
1413
- ],
1414
- "output_regex": null,
1415
- "frozen": false,
1416
- "suite": [
1417
- "lighteval",
1418
- "mmlu"
1419
- ]
1420
- },
1421
- "lighteval|mmlu:machine_learning": {
1422
- "name": "mmlu:machine_learning",
1423
- "prompt_function": "mmlu_harness",
1424
- "hf_repo": "lighteval/mmlu",
1425
- "hf_subset": "machine_learning",
1426
- "metric": [
1427
- "loglikelihood_acc"
1428
- ],
1429
- "hf_avail_splits": [
1430
- "auxiliary_train",
1431
- "test",
1432
- "validation",
1433
- "dev"
1434
- ],
1435
- "evaluation_splits": [
1436
- "test"
1437
- ],
1438
- "few_shots_split": "dev",
1439
- "few_shots_select": "sequential",
1440
- "generation_size": 1,
1441
- "stop_sequence": [
1442
- "\n"
1443
- ],
1444
- "output_regex": null,
1445
- "frozen": false,
1446
- "suite": [
1447
- "lighteval",
1448
- "mmlu"
1449
- ]
1450
- },
1451
- "lighteval|mmlu:management": {
1452
- "name": "mmlu:management",
1453
- "prompt_function": "mmlu_harness",
1454
- "hf_repo": "lighteval/mmlu",
1455
- "hf_subset": "management",
1456
- "metric": [
1457
- "loglikelihood_acc"
1458
- ],
1459
- "hf_avail_splits": [
1460
- "auxiliary_train",
1461
- "test",
1462
- "validation",
1463
- "dev"
1464
- ],
1465
- "evaluation_splits": [
1466
- "test"
1467
- ],
1468
- "few_shots_split": "dev",
1469
- "few_shots_select": "sequential",
1470
- "generation_size": 1,
1471
- "stop_sequence": [
1472
- "\n"
1473
- ],
1474
- "output_regex": null,
1475
- "frozen": false,
1476
- "suite": [
1477
- "lighteval",
1478
- "mmlu"
1479
- ]
1480
- },
1481
- "lighteval|mmlu:marketing": {
1482
- "name": "mmlu:marketing",
1483
- "prompt_function": "mmlu_harness",
1484
- "hf_repo": "lighteval/mmlu",
1485
- "hf_subset": "marketing",
1486
- "metric": [
1487
- "loglikelihood_acc"
1488
- ],
1489
- "hf_avail_splits": [
1490
- "auxiliary_train",
1491
- "test",
1492
- "validation",
1493
- "dev"
1494
- ],
1495
- "evaluation_splits": [
1496
- "test"
1497
- ],
1498
- "few_shots_split": "dev",
1499
- "few_shots_select": "sequential",
1500
- "generation_size": 1,
1501
- "stop_sequence": [
1502
- "\n"
1503
- ],
1504
- "output_regex": null,
1505
- "frozen": false,
1506
- "suite": [
1507
- "lighteval",
1508
- "mmlu"
1509
- ]
1510
- },
1511
- "lighteval|mmlu:medical_genetics": {
1512
- "name": "mmlu:medical_genetics",
1513
- "prompt_function": "mmlu_harness",
1514
- "hf_repo": "lighteval/mmlu",
1515
- "hf_subset": "medical_genetics",
1516
- "metric": [
1517
- "loglikelihood_acc"
1518
- ],
1519
- "hf_avail_splits": [
1520
- "auxiliary_train",
1521
- "test",
1522
- "validation",
1523
- "dev"
1524
- ],
1525
- "evaluation_splits": [
1526
- "test"
1527
- ],
1528
- "few_shots_split": "dev",
1529
- "few_shots_select": "sequential",
1530
- "generation_size": 1,
1531
- "stop_sequence": [
1532
- "\n"
1533
- ],
1534
- "output_regex": null,
1535
- "frozen": false,
1536
- "suite": [
1537
- "lighteval",
1538
- "mmlu"
1539
- ]
1540
- },
1541
- "lighteval|mmlu:miscellaneous": {
1542
- "name": "mmlu:miscellaneous",
1543
- "prompt_function": "mmlu_harness",
1544
- "hf_repo": "lighteval/mmlu",
1545
- "hf_subset": "miscellaneous",
1546
- "metric": [
1547
- "loglikelihood_acc"
1548
- ],
1549
- "hf_avail_splits": [
1550
- "auxiliary_train",
1551
- "test",
1552
- "validation",
1553
- "dev"
1554
- ],
1555
- "evaluation_splits": [
1556
- "test"
1557
- ],
1558
- "few_shots_split": "dev",
1559
- "few_shots_select": "sequential",
1560
- "generation_size": 1,
1561
- "stop_sequence": [
1562
- "\n"
1563
- ],
1564
- "output_regex": null,
1565
- "frozen": false,
1566
- "suite": [
1567
- "lighteval",
1568
- "mmlu"
1569
- ]
1570
- },
1571
- "lighteval|mmlu:moral_disputes": {
1572
- "name": "mmlu:moral_disputes",
1573
- "prompt_function": "mmlu_harness",
1574
- "hf_repo": "lighteval/mmlu",
1575
- "hf_subset": "moral_disputes",
1576
- "metric": [
1577
- "loglikelihood_acc"
1578
- ],
1579
- "hf_avail_splits": [
1580
- "auxiliary_train",
1581
- "test",
1582
- "validation",
1583
- "dev"
1584
- ],
1585
- "evaluation_splits": [
1586
- "test"
1587
- ],
1588
- "few_shots_split": "dev",
1589
- "few_shots_select": "sequential",
1590
- "generation_size": 1,
1591
- "stop_sequence": [
1592
- "\n"
1593
- ],
1594
- "output_regex": null,
1595
- "frozen": false,
1596
- "suite": [
1597
- "lighteval",
1598
- "mmlu"
1599
- ]
1600
- },
1601
- "lighteval|mmlu:moral_scenarios": {
1602
- "name": "mmlu:moral_scenarios",
1603
- "prompt_function": "mmlu_harness",
1604
- "hf_repo": "lighteval/mmlu",
1605
- "hf_subset": "moral_scenarios",
1606
- "metric": [
1607
- "loglikelihood_acc"
1608
- ],
1609
- "hf_avail_splits": [
1610
- "auxiliary_train",
1611
- "test",
1612
- "validation",
1613
- "dev"
1614
- ],
1615
- "evaluation_splits": [
1616
- "test"
1617
- ],
1618
- "few_shots_split": "dev",
1619
- "few_shots_select": "sequential",
1620
- "generation_size": 1,
1621
- "stop_sequence": [
1622
- "\n"
1623
- ],
1624
- "output_regex": null,
1625
- "frozen": false,
1626
- "suite": [
1627
- "lighteval",
1628
- "mmlu"
1629
- ]
1630
- },
1631
- "lighteval|mmlu:nutrition": {
1632
- "name": "mmlu:nutrition",
1633
- "prompt_function": "mmlu_harness",
1634
- "hf_repo": "lighteval/mmlu",
1635
- "hf_subset": "nutrition",
1636
- "metric": [
1637
- "loglikelihood_acc"
1638
- ],
1639
- "hf_avail_splits": [
1640
- "auxiliary_train",
1641
- "test",
1642
- "validation",
1643
- "dev"
1644
- ],
1645
- "evaluation_splits": [
1646
- "test"
1647
- ],
1648
- "few_shots_split": "dev",
1649
- "few_shots_select": "sequential",
1650
- "generation_size": 1,
1651
- "stop_sequence": [
1652
- "\n"
1653
- ],
1654
- "output_regex": null,
1655
- "frozen": false,
1656
- "suite": [
1657
- "lighteval",
1658
- "mmlu"
1659
- ]
1660
- },
1661
- "lighteval|mmlu:philosophy": {
1662
- "name": "mmlu:philosophy",
1663
- "prompt_function": "mmlu_harness",
1664
- "hf_repo": "lighteval/mmlu",
1665
- "hf_subset": "philosophy",
1666
- "metric": [
1667
- "loglikelihood_acc"
1668
- ],
1669
- "hf_avail_splits": [
1670
- "auxiliary_train",
1671
- "test",
1672
- "validation",
1673
- "dev"
1674
- ],
1675
- "evaluation_splits": [
1676
- "test"
1677
- ],
1678
- "few_shots_split": "dev",
1679
- "few_shots_select": "sequential",
1680
- "generation_size": 1,
1681
- "stop_sequence": [
1682
- "\n"
1683
- ],
1684
- "output_regex": null,
1685
- "frozen": false,
1686
- "suite": [
1687
- "lighteval",
1688
- "mmlu"
1689
- ]
1690
- },
1691
- "lighteval|mmlu:prehistory": {
1692
- "name": "mmlu:prehistory",
1693
- "prompt_function": "mmlu_harness",
1694
- "hf_repo": "lighteval/mmlu",
1695
- "hf_subset": "prehistory",
1696
- "metric": [
1697
- "loglikelihood_acc"
1698
- ],
1699
- "hf_avail_splits": [
1700
- "auxiliary_train",
1701
- "test",
1702
- "validation",
1703
- "dev"
1704
- ],
1705
- "evaluation_splits": [
1706
- "test"
1707
- ],
1708
- "few_shots_split": "dev",
1709
- "few_shots_select": "sequential",
1710
- "generation_size": 1,
1711
- "stop_sequence": [
1712
- "\n"
1713
- ],
1714
- "output_regex": null,
1715
- "frozen": false,
1716
- "suite": [
1717
- "lighteval",
1718
- "mmlu"
1719
- ]
1720
- },
1721
- "lighteval|mmlu:professional_accounting": {
1722
- "name": "mmlu:professional_accounting",
1723
- "prompt_function": "mmlu_harness",
1724
- "hf_repo": "lighteval/mmlu",
1725
- "hf_subset": "professional_accounting",
1726
- "metric": [
1727
- "loglikelihood_acc"
1728
- ],
1729
- "hf_avail_splits": [
1730
- "auxiliary_train",
1731
- "test",
1732
- "validation",
1733
- "dev"
1734
- ],
1735
- "evaluation_splits": [
1736
- "test"
1737
- ],
1738
- "few_shots_split": "dev",
1739
- "few_shots_select": "sequential",
1740
- "generation_size": 1,
1741
- "stop_sequence": [
1742
- "\n"
1743
- ],
1744
- "output_regex": null,
1745
- "frozen": false,
1746
- "suite": [
1747
- "lighteval",
1748
- "mmlu"
1749
- ]
1750
- },
1751
- "lighteval|mmlu:professional_law": {
1752
- "name": "mmlu:professional_law",
1753
- "prompt_function": "mmlu_harness",
1754
- "hf_repo": "lighteval/mmlu",
1755
- "hf_subset": "professional_law",
1756
- "metric": [
1757
- "loglikelihood_acc"
1758
- ],
1759
- "hf_avail_splits": [
1760
- "auxiliary_train",
1761
- "test",
1762
- "validation",
1763
- "dev"
1764
- ],
1765
- "evaluation_splits": [
1766
- "test"
1767
- ],
1768
- "few_shots_split": "dev",
1769
- "few_shots_select": "sequential",
1770
- "generation_size": 1,
1771
- "stop_sequence": [
1772
- "\n"
1773
- ],
1774
- "output_regex": null,
1775
- "frozen": false,
1776
- "suite": [
1777
- "lighteval",
1778
- "mmlu"
1779
- ]
1780
- },
1781
- "lighteval|mmlu:professional_medicine": {
1782
- "name": "mmlu:professional_medicine",
1783
- "prompt_function": "mmlu_harness",
1784
- "hf_repo": "lighteval/mmlu",
1785
- "hf_subset": "professional_medicine",
1786
- "metric": [
1787
- "loglikelihood_acc"
1788
- ],
1789
- "hf_avail_splits": [
1790
- "auxiliary_train",
1791
- "test",
1792
- "validation",
1793
- "dev"
1794
- ],
1795
- "evaluation_splits": [
1796
- "test"
1797
- ],
1798
- "few_shots_split": "dev",
1799
- "few_shots_select": "sequential",
1800
- "generation_size": 1,
1801
- "stop_sequence": [
1802
- "\n"
1803
- ],
1804
- "output_regex": null,
1805
- "frozen": false,
1806
- "suite": [
1807
- "lighteval",
1808
- "mmlu"
1809
- ]
1810
- },
1811
- "lighteval|mmlu:professional_psychology": {
1812
- "name": "mmlu:professional_psychology",
1813
- "prompt_function": "mmlu_harness",
1814
- "hf_repo": "lighteval/mmlu",
1815
- "hf_subset": "professional_psychology",
1816
- "metric": [
1817
- "loglikelihood_acc"
1818
- ],
1819
- "hf_avail_splits": [
1820
- "auxiliary_train",
1821
- "test",
1822
- "validation",
1823
- "dev"
1824
- ],
1825
- "evaluation_splits": [
1826
- "test"
1827
- ],
1828
- "few_shots_split": "dev",
1829
- "few_shots_select": "sequential",
1830
- "generation_size": 1,
1831
- "stop_sequence": [
1832
- "\n"
1833
- ],
1834
- "output_regex": null,
1835
- "frozen": false,
1836
- "suite": [
1837
- "lighteval",
1838
- "mmlu"
1839
- ]
1840
- },
1841
- "lighteval|mmlu:public_relations": {
1842
- "name": "mmlu:public_relations",
1843
- "prompt_function": "mmlu_harness",
1844
- "hf_repo": "lighteval/mmlu",
1845
- "hf_subset": "public_relations",
1846
- "metric": [
1847
- "loglikelihood_acc"
1848
- ],
1849
- "hf_avail_splits": [
1850
- "auxiliary_train",
1851
- "test",
1852
- "validation",
1853
- "dev"
1854
- ],
1855
- "evaluation_splits": [
1856
- "test"
1857
- ],
1858
- "few_shots_split": "dev",
1859
- "few_shots_select": "sequential",
1860
- "generation_size": 1,
1861
- "stop_sequence": [
1862
- "\n"
1863
- ],
1864
- "output_regex": null,
1865
- "frozen": false,
1866
- "suite": [
1867
- "lighteval",
1868
- "mmlu"
1869
- ]
1870
- },
1871
- "lighteval|mmlu:security_studies": {
1872
- "name": "mmlu:security_studies",
1873
- "prompt_function": "mmlu_harness",
1874
- "hf_repo": "lighteval/mmlu",
1875
- "hf_subset": "security_studies",
1876
- "metric": [
1877
- "loglikelihood_acc"
1878
- ],
1879
- "hf_avail_splits": [
1880
- "auxiliary_train",
1881
- "test",
1882
- "validation",
1883
- "dev"
1884
- ],
1885
- "evaluation_splits": [
1886
- "test"
1887
- ],
1888
- "few_shots_split": "dev",
1889
- "few_shots_select": "sequential",
1890
- "generation_size": 1,
1891
- "stop_sequence": [
1892
- "\n"
1893
- ],
1894
- "output_regex": null,
1895
- "frozen": false,
1896
- "suite": [
1897
- "lighteval",
1898
- "mmlu"
1899
- ]
1900
- },
1901
- "lighteval|mmlu:sociology": {
1902
- "name": "mmlu:sociology",
1903
- "prompt_function": "mmlu_harness",
1904
- "hf_repo": "lighteval/mmlu",
1905
- "hf_subset": "sociology",
1906
- "metric": [
1907
- "loglikelihood_acc"
1908
- ],
1909
- "hf_avail_splits": [
1910
- "auxiliary_train",
1911
- "test",
1912
- "validation",
1913
- "dev"
1914
- ],
1915
- "evaluation_splits": [
1916
- "test"
1917
- ],
1918
- "few_shots_split": "dev",
1919
- "few_shots_select": "sequential",
1920
- "generation_size": 1,
1921
- "stop_sequence": [
1922
- "\n"
1923
- ],
1924
- "output_regex": null,
1925
- "frozen": false,
1926
- "suite": [
1927
- "lighteval",
1928
- "mmlu"
1929
- ]
1930
- },
1931
- "lighteval|mmlu:us_foreign_policy": {
1932
- "name": "mmlu:us_foreign_policy",
1933
- "prompt_function": "mmlu_harness",
1934
- "hf_repo": "lighteval/mmlu",
1935
- "hf_subset": "us_foreign_policy",
1936
- "metric": [
1937
- "loglikelihood_acc"
1938
- ],
1939
- "hf_avail_splits": [
1940
- "auxiliary_train",
1941
- "test",
1942
- "validation",
1943
- "dev"
1944
- ],
1945
- "evaluation_splits": [
1946
- "test"
1947
- ],
1948
- "few_shots_split": "dev",
1949
- "few_shots_select": "sequential",
1950
- "generation_size": 1,
1951
- "stop_sequence": [
1952
- "\n"
1953
- ],
1954
- "output_regex": null,
1955
- "frozen": false,
1956
- "suite": [
1957
- "lighteval",
1958
- "mmlu"
1959
- ]
1960
- },
1961
- "lighteval|mmlu:virology": {
1962
- "name": "mmlu:virology",
1963
- "prompt_function": "mmlu_harness",
1964
- "hf_repo": "lighteval/mmlu",
1965
- "hf_subset": "virology",
1966
- "metric": [
1967
- "loglikelihood_acc"
1968
- ],
1969
- "hf_avail_splits": [
1970
- "auxiliary_train",
1971
- "test",
1972
- "validation",
1973
- "dev"
1974
- ],
1975
- "evaluation_splits": [
1976
- "test"
1977
- ],
1978
- "few_shots_split": "dev",
1979
- "few_shots_select": "sequential",
1980
- "generation_size": 1,
1981
- "stop_sequence": [
1982
- "\n"
1983
- ],
1984
- "output_regex": null,
1985
- "frozen": false,
1986
- "suite": [
1987
- "lighteval",
1988
- "mmlu"
1989
- ]
1990
- },
1991
- "lighteval|mmlu:world_religions": {
1992
- "name": "mmlu:world_religions",
1993
- "prompt_function": "mmlu_harness",
1994
- "hf_repo": "lighteval/mmlu",
1995
- "hf_subset": "world_religions",
1996
- "metric": [
1997
- "loglikelihood_acc"
1998
- ],
1999
- "hf_avail_splits": [
2000
- "auxiliary_train",
2001
- "test",
2002
- "validation",
2003
- "dev"
2004
- ],
2005
- "evaluation_splits": [
2006
- "test"
2007
- ],
2008
- "few_shots_split": "dev",
2009
- "few_shots_select": "sequential",
2010
- "generation_size": 1,
2011
- "stop_sequence": [
2012
- "\n"
2013
- ],
2014
- "output_regex": null,
2015
- "frozen": false,
2016
- "suite": [
2017
- "lighteval",
2018
- "mmlu"
2019
- ]
2020
- }
2021
- },
2022
- "summary_tasks": {
2023
- "lighteval|mmlu:abstract_algebra|5": {
2024
- "hashes": {
2025
- "hash_examples": "4c76229e00c9c0e9",
2026
- "hash_full_prompts": "a45d01c3409c889c",
2027
- "hash_input_tokens": "4948b2c6cf57057c",
2028
- "hash_cont_tokens": "ca6635f013682116"
2029
- },
2030
- "truncated": 0,
2031
- "non_truncated": 100,
2032
- "padded": 400,
2033
- "non_padded": 0,
2034
- "effective_few_shots": 5.0,
2035
- "num_truncated_few_shots": 0
2036
- },
2037
- "lighteval|mmlu:anatomy|5": {
2038
- "hashes": {
2039
- "hash_examples": "6a1f8104dccbd33b",
2040
- "hash_full_prompts": "e245c6600e03cc32",
2041
- "hash_input_tokens": "ccae0b047572b80f",
2042
- "hash_cont_tokens": "e1ba0772a6068b5f"
2043
- },
2044
- "truncated": 0,
2045
- "non_truncated": 135,
2046
- "padded": 540,
2047
- "non_padded": 0,
2048
- "effective_few_shots": 5.0,
2049
- "num_truncated_few_shots": 0
2050
- },
2051
- "lighteval|mmlu:astronomy|5": {
2052
- "hashes": {
2053
- "hash_examples": "1302effa3a76ce4c",
2054
- "hash_full_prompts": "390f9bddf857ad04",
2055
- "hash_input_tokens": "95e99849ae58bc29",
2056
- "hash_cont_tokens": "5ceb0e5afafe79b5"
2057
- },
2058
- "truncated": 0,
2059
- "non_truncated": 152,
2060
- "padded": 608,
2061
- "non_padded": 0,
2062
- "effective_few_shots": 5.0,
2063
- "num_truncated_few_shots": 0
2064
- },
2065
- "lighteval|mmlu:business_ethics|5": {
2066
- "hashes": {
2067
- "hash_examples": "03cb8bce5336419a",
2068
- "hash_full_prompts": "5504f893bc4f2fa1",
2069
- "hash_input_tokens": "b497f7a5f8bbb8f3",
2070
- "hash_cont_tokens": "ca6635f013682116"
2071
- },
2072
- "truncated": 0,
2073
- "non_truncated": 100,
2074
- "padded": 400,
2075
- "non_padded": 0,
2076
- "effective_few_shots": 5.0,
2077
- "num_truncated_few_shots": 0
2078
- },
2079
- "lighteval|mmlu:clinical_knowledge|5": {
2080
- "hashes": {
2081
- "hash_examples": "ffbb9c7b2be257f9",
2082
- "hash_full_prompts": "106ad0bab4b90b78",
2083
- "hash_input_tokens": "22ddadb0674e1859",
2084
- "hash_cont_tokens": "aed310f2c7712a91"
2085
- },
2086
- "truncated": 0,
2087
- "non_truncated": 265,
2088
- "padded": 1060,
2089
- "non_padded": 0,
2090
- "effective_few_shots": 5.0,
2091
- "num_truncated_few_shots": 0
2092
- },
2093
- "lighteval|mmlu:college_biology|5": {
2094
- "hashes": {
2095
- "hash_examples": "3ee77f176f38eb8e",
2096
- "hash_full_prompts": "59f9bdf2695cb226",
2097
- "hash_input_tokens": "d4dee762441c2914",
2098
- "hash_cont_tokens": "0dadd21454ffb16b"
2099
- },
2100
- "truncated": 0,
2101
- "non_truncated": 144,
2102
- "padded": 576,
2103
- "non_padded": 0,
2104
- "effective_few_shots": 5.0,
2105
- "num_truncated_few_shots": 0
2106
- },
2107
- "lighteval|mmlu:college_chemistry|5": {
2108
- "hashes": {
2109
- "hash_examples": "ce61a69c46d47aeb",
2110
- "hash_full_prompts": "3cac9b759fcff7a0",
2111
- "hash_input_tokens": "d8a83002fd2891fc",
2112
- "hash_cont_tokens": "ca6635f013682116"
2113
- },
2114
- "truncated": 0,
2115
- "non_truncated": 100,
2116
- "padded": 400,
2117
- "non_padded": 0,
2118
- "effective_few_shots": 5.0,
2119
- "num_truncated_few_shots": 0
2120
- },
2121
- "lighteval|mmlu:college_computer_science|5": {
2122
- "hashes": {
2123
- "hash_examples": "32805b52d7d5daab",
2124
- "hash_full_prompts": "010b0cca35070130",
2125
- "hash_input_tokens": "bf24575e01b75368",
2126
- "hash_cont_tokens": "ca6635f013682116"
2127
- },
2128
- "truncated": 0,
2129
- "non_truncated": 100,
2130
- "padded": 400,
2131
- "non_padded": 0,
2132
- "effective_few_shots": 5.0,
2133
- "num_truncated_few_shots": 0
2134
- },
2135
- "lighteval|mmlu:college_mathematics|5": {
2136
- "hashes": {
2137
- "hash_examples": "55da1a0a0bd33722",
2138
- "hash_full_prompts": "511422eb9eefc773",
2139
- "hash_input_tokens": "958ae747d6c39df7",
2140
- "hash_cont_tokens": "ca6635f013682116"
2141
- },
2142
- "truncated": 0,
2143
- "non_truncated": 100,
2144
- "padded": 400,
2145
- "non_padded": 0,
2146
- "effective_few_shots": 5.0,
2147
- "num_truncated_few_shots": 0
2148
- },
2149
- "lighteval|mmlu:college_medicine|5": {
2150
- "hashes": {
2151
- "hash_examples": "c33e143163049176",
2152
- "hash_full_prompts": "c8cc1a82a51a046e",
2153
- "hash_input_tokens": "726811436fcc0abd",
2154
- "hash_cont_tokens": "b4dea139dbc832db"
2155
- },
2156
- "truncated": 0,
2157
- "non_truncated": 173,
2158
- "padded": 692,
2159
- "non_padded": 0,
2160
- "effective_few_shots": 5.0,
2161
- "num_truncated_few_shots": 0
2162
- },
2163
- "lighteval|mmlu:college_physics|5": {
2164
- "hashes": {
2165
- "hash_examples": "ebdab1cdb7e555df",
2166
- "hash_full_prompts": "e40721b5059c5818",
2167
- "hash_input_tokens": "5e5caeee24119b1f",
2168
- "hash_cont_tokens": "f5a25833e1dae922"
2169
- },
2170
- "truncated": 0,
2171
- "non_truncated": 102,
2172
- "padded": 408,
2173
- "non_padded": 0,
2174
- "effective_few_shots": 5.0,
2175
- "num_truncated_few_shots": 0
2176
- },
2177
- "lighteval|mmlu:computer_security|5": {
2178
- "hashes": {
2179
- "hash_examples": "a24fd7d08a560921",
2180
- "hash_full_prompts": "946c9be5964ac44a",
2181
- "hash_input_tokens": "f0fe150445434938",
2182
- "hash_cont_tokens": "ca6635f013682116"
2183
- },
2184
- "truncated": 0,
2185
- "non_truncated": 100,
2186
- "padded": 400,
2187
- "non_padded": 0,
2188
- "effective_few_shots": 5.0,
2189
- "num_truncated_few_shots": 0
2190
- },
2191
- "lighteval|mmlu:conceptual_physics|5": {
2192
- "hashes": {
2193
- "hash_examples": "8300977a79386993",
2194
- "hash_full_prompts": "506a4f6094cc40c9",
2195
- "hash_input_tokens": "1f0efe59b0409eb6",
2196
- "hash_cont_tokens": "3aaa2f9b51df2bc9"
2197
- },
2198
- "truncated": 0,
2199
- "non_truncated": 235,
2200
- "padded": 940,
2201
- "non_padded": 0,
2202
- "effective_few_shots": 5.0,
2203
- "num_truncated_few_shots": 0
2204
- },
2205
- "lighteval|mmlu:econometrics|5": {
2206
- "hashes": {
2207
- "hash_examples": "ddde36788a04a46f",
2208
- "hash_full_prompts": "4ed2703f27f1ed05",
2209
- "hash_input_tokens": "0fbb5ec4fd743fd9",
2210
- "hash_cont_tokens": "40eac819ba437eec"
2211
- },
2212
- "truncated": 0,
2213
- "non_truncated": 114,
2214
- "padded": 456,
2215
- "non_padded": 0,
2216
- "effective_few_shots": 5.0,
2217
- "num_truncated_few_shots": 0
2218
- },
2219
- "lighteval|mmlu:electrical_engineering|5": {
2220
- "hashes": {
2221
- "hash_examples": "acbc5def98c19b3f",
2222
- "hash_full_prompts": "d8f4b3e11c23653c",
2223
- "hash_input_tokens": "abc466e84eca1d48",
2224
- "hash_cont_tokens": "9ad54070b8c8d481"
2225
- },
2226
- "truncated": 0,
2227
- "non_truncated": 145,
2228
- "padded": 580,
2229
- "non_padded": 0,
2230
- "effective_few_shots": 5.0,
2231
- "num_truncated_few_shots": 0
2232
- },
2233
- "lighteval|mmlu:elementary_mathematics|5": {
2234
- "hashes": {
2235
- "hash_examples": "146e61d07497a9bd",
2236
- "hash_full_prompts": "256d111bd15647ff",
2237
- "hash_input_tokens": "0050e6543b33ab8e",
2238
- "hash_cont_tokens": "1737a64affbf2372"
2239
- },
2240
- "truncated": 0,
2241
- "non_truncated": 378,
2242
- "padded": 1512,
2243
- "non_padded": 0,
2244
- "effective_few_shots": 5.0,
2245
- "num_truncated_few_shots": 0
2246
- },
2247
- "lighteval|mmlu:formal_logic|5": {
2248
- "hashes": {
2249
- "hash_examples": "8635216e1909a03f",
2250
- "hash_full_prompts": "1171d04f3b1a11f5",
2251
- "hash_input_tokens": "084c9357045f3417",
2252
- "hash_cont_tokens": "bbf4e8421cceccf3"
2253
- },
2254
- "truncated": 0,
2255
- "non_truncated": 126,
2256
- "padded": 504,
2257
- "non_padded": 0,
2258
- "effective_few_shots": 5.0,
2259
- "num_truncated_few_shots": 0
2260
- },
2261
- "lighteval|mmlu:global_facts|5": {
2262
- "hashes": {
2263
- "hash_examples": "30b315aa6353ee47",
2264
- "hash_full_prompts": "a7e56dbc074c7529",
2265
- "hash_input_tokens": "29026e1886290a72",
2266
- "hash_cont_tokens": "ca6635f013682116"
2267
- },
2268
- "truncated": 0,
2269
- "non_truncated": 100,
2270
- "padded": 400,
2271
- "non_padded": 0,
2272
- "effective_few_shots": 5.0,
2273
- "num_truncated_few_shots": 0
2274
- },
2275
- "lighteval|mmlu:high_school_biology|5": {
2276
- "hashes": {
2277
- "hash_examples": "c9136373af2180de",
2278
- "hash_full_prompts": "ad6e859ed978e04a",
2279
- "hash_input_tokens": "3a30f81c4a1b993c",
2280
- "hash_cont_tokens": "3b147e76117feda3"
2281
- },
2282
- "truncated": 0,
2283
- "non_truncated": 310,
2284
- "padded": 1240,
2285
- "non_padded": 0,
2286
- "effective_few_shots": 5.0,
2287
- "num_truncated_few_shots": 0
2288
- },
2289
- "lighteval|mmlu:high_school_chemistry|5": {
2290
- "hashes": {
2291
- "hash_examples": "b0661bfa1add6404",
2292
- "hash_full_prompts": "6eb9c04bcc8a8f2a",
2293
- "hash_input_tokens": "3455612f2649f340",
2294
- "hash_cont_tokens": "308b43940e65e09e"
2295
- },
2296
- "truncated": 0,
2297
- "non_truncated": 203,
2298
- "padded": 812,
2299
- "non_padded": 0,
2300
- "effective_few_shots": 5.0,
2301
- "num_truncated_few_shots": 0
2302
- },
2303
- "lighteval|mmlu:high_school_computer_science|5": {
2304
- "hashes": {
2305
- "hash_examples": "80fc1d623a3d665f",
2306
- "hash_full_prompts": "8e51bc91c81cf8dd",
2307
- "hash_input_tokens": "1fea81f3fa06cea5",
2308
- "hash_cont_tokens": "ca6635f013682116"
2309
- },
2310
- "truncated": 0,
2311
- "non_truncated": 100,
2312
- "padded": 400,
2313
- "non_padded": 0,
2314
- "effective_few_shots": 5.0,
2315
- "num_truncated_few_shots": 0
2316
- },
2317
- "lighteval|mmlu:high_school_european_history|5": {
2318
- "hashes": {
2319
- "hash_examples": "854da6e5af0fe1a1",
2320
- "hash_full_prompts": "664a1f16c9f3195c",
2321
- "hash_input_tokens": "916c95a0536e1685",
2322
- "hash_cont_tokens": "8f181ada6dc3c1c0"
2323
- },
2324
- "truncated": 0,
2325
- "non_truncated": 165,
2326
- "padded": 656,
2327
- "non_padded": 4,
2328
- "effective_few_shots": 5.0,
2329
- "num_truncated_few_shots": 0
2330
- },
2331
- "lighteval|mmlu:high_school_geography|5": {
2332
- "hashes": {
2333
- "hash_examples": "7dc963c7acd19ad8",
2334
- "hash_full_prompts": "f3acf911f4023c8a",
2335
- "hash_input_tokens": "e78105d5c89747c6",
2336
- "hash_cont_tokens": "30d5bbb69894c7eb"
2337
- },
2338
- "truncated": 0,
2339
- "non_truncated": 198,
2340
- "padded": 792,
2341
- "non_padded": 0,
2342
- "effective_few_shots": 5.0,
2343
- "num_truncated_few_shots": 0
2344
- },
2345
- "lighteval|mmlu:high_school_government_and_politics|5": {
2346
- "hashes": {
2347
- "hash_examples": "1f675dcdebc9758f",
2348
- "hash_full_prompts": "066254feaa3158ae",
2349
- "hash_input_tokens": "ddce941d18e493a4",
2350
- "hash_cont_tokens": "a56929e6c0ce3449"
2351
- },
2352
- "truncated": 0,
2353
- "non_truncated": 193,
2354
- "padded": 772,
2355
- "non_padded": 0,
2356
- "effective_few_shots": 5.0,
2357
- "num_truncated_few_shots": 0
2358
- },
2359
- "lighteval|mmlu:high_school_macroeconomics|5": {
2360
- "hashes": {
2361
- "hash_examples": "2fb32cf2d80f0b35",
2362
- "hash_full_prompts": "19a7fa502aa85c95",
2363
- "hash_input_tokens": "837b51169b038fe0",
2364
- "hash_cont_tokens": "2fa537b929a4262f"
2365
- },
2366
- "truncated": 0,
2367
- "non_truncated": 390,
2368
- "padded": 1560,
2369
- "non_padded": 0,
2370
- "effective_few_shots": 5.0,
2371
- "num_truncated_few_shots": 0
2372
- },
2373
- "lighteval|mmlu:high_school_mathematics|5": {
2374
- "hashes": {
2375
- "hash_examples": "fd6646fdb5d58a1f",
2376
- "hash_full_prompts": "4f704e369778b5b0",
2377
- "hash_input_tokens": "f7c0c9b774e1a823",
2378
- "hash_cont_tokens": "d458ef4757b4f677"
2379
- },
2380
- "truncated": 0,
2381
- "non_truncated": 270,
2382
- "padded": 1078,
2383
- "non_padded": 2,
2384
- "effective_few_shots": 5.0,
2385
- "num_truncated_few_shots": 0
2386
- },
2387
- "lighteval|mmlu:high_school_microeconomics|5": {
2388
- "hashes": {
2389
- "hash_examples": "2118f21f71d87d84",
2390
- "hash_full_prompts": "4350f9e2240f8010",
2391
- "hash_input_tokens": "a1ee4fdb893cdebc",
2392
- "hash_cont_tokens": "4b3b9037dff8007e"
2393
- },
2394
- "truncated": 0,
2395
- "non_truncated": 238,
2396
- "padded": 952,
2397
- "non_padded": 0,
2398
- "effective_few_shots": 5.0,
2399
- "num_truncated_few_shots": 0
2400
- },
2401
- "lighteval|mmlu:high_school_physics|5": {
2402
- "hashes": {
2403
- "hash_examples": "dc3ce06378548565",
2404
- "hash_full_prompts": "5dc0d6831b66188f",
2405
- "hash_input_tokens": "724f77ba2a9aff43",
2406
- "hash_cont_tokens": "7501b169c0c12798"
2407
- },
2408
- "truncated": 0,
2409
- "non_truncated": 151,
2410
- "padded": 596,
2411
- "non_padded": 8,
2412
- "effective_few_shots": 5.0,
2413
- "num_truncated_few_shots": 0
2414
- },
2415
- "lighteval|mmlu:high_school_psychology|5": {
2416
- "hashes": {
2417
- "hash_examples": "c8d1d98a40e11f2f",
2418
- "hash_full_prompts": "af2b097da6d50365",
2419
- "hash_input_tokens": "64c9ef6542c4ebc5",
2420
- "hash_cont_tokens": "c1a9bff96c9b870b"
2421
- },
2422
- "truncated": 0,
2423
- "non_truncated": 545,
2424
- "padded": 2168,
2425
- "non_padded": 12,
2426
- "effective_few_shots": 5.0,
2427
- "num_truncated_few_shots": 0
2428
- },
2429
- "lighteval|mmlu:high_school_statistics|5": {
2430
- "hashes": {
2431
- "hash_examples": "666c8759b98ee4ff",
2432
- "hash_full_prompts": "c757694421d6d68d",
2433
- "hash_input_tokens": "6ace2cb29f29b1ad",
2434
- "hash_cont_tokens": "c8ba890b377f366c"
2435
- },
2436
- "truncated": 0,
2437
- "non_truncated": 216,
2438
- "padded": 864,
2439
- "non_padded": 0,
2440
- "effective_few_shots": 5.0,
2441
- "num_truncated_few_shots": 0
2442
- },
2443
- "lighteval|mmlu:high_school_us_history|5": {
2444
- "hashes": {
2445
- "hash_examples": "95fef1c4b7d3f81e",
2446
- "hash_full_prompts": "e34a028d0ddeec5e",
2447
- "hash_input_tokens": "6eea808bcfba866a",
2448
- "hash_cont_tokens": "a5583b10be513397"
2449
- },
2450
- "truncated": 0,
2451
- "non_truncated": 204,
2452
- "padded": 816,
2453
- "non_padded": 0,
2454
- "effective_few_shots": 5.0,
2455
- "num_truncated_few_shots": 0
2456
- },
2457
- "lighteval|mmlu:high_school_world_history|5": {
2458
- "hashes": {
2459
- "hash_examples": "7e5085b6184b0322",
2460
- "hash_full_prompts": "1fa3d51392765601",
2461
- "hash_input_tokens": "88d139decc63d147",
2462
- "hash_cont_tokens": "bc02ede066873d68"
2463
- },
2464
- "truncated": 0,
2465
- "non_truncated": 237,
2466
- "padded": 948,
2467
- "non_padded": 0,
2468
- "effective_few_shots": 5.0,
2469
- "num_truncated_few_shots": 0
2470
- },
2471
- "lighteval|mmlu:human_aging|5": {
2472
- "hashes": {
2473
- "hash_examples": "c17333e7c7c10797",
2474
- "hash_full_prompts": "cac900721f9a1a94",
2475
- "hash_input_tokens": "3b3004225023b6a1",
2476
- "hash_cont_tokens": "46bca90878b86814"
2477
- },
2478
- "truncated": 0,
2479
- "non_truncated": 223,
2480
- "padded": 892,
2481
- "non_padded": 0,
2482
- "effective_few_shots": 5.0,
2483
- "num_truncated_few_shots": 0
2484
- },
2485
- "lighteval|mmlu:human_sexuality|5": {
2486
- "hashes": {
2487
- "hash_examples": "4edd1e9045df5e3d",
2488
- "hash_full_prompts": "0d6567bafee0a13c",
2489
- "hash_input_tokens": "0fbac491ec8244cf",
2490
- "hash_cont_tokens": "83e31fa74449548b"
2491
- },
2492
- "truncated": 0,
2493
- "non_truncated": 131,
2494
- "padded": 524,
2495
- "non_padded": 0,
2496
- "effective_few_shots": 5.0,
2497
- "num_truncated_few_shots": 0
2498
- },
2499
- "lighteval|mmlu:international_law|5": {
2500
- "hashes": {
2501
- "hash_examples": "db2fa00d771a062a",
2502
- "hash_full_prompts": "d018f9116479795e",
2503
- "hash_input_tokens": "ccc8a79b71202284",
2504
- "hash_cont_tokens": "41c0cd517e147f5d"
2505
- },
2506
- "truncated": 0,
2507
- "non_truncated": 121,
2508
- "padded": 484,
2509
- "non_padded": 0,
2510
- "effective_few_shots": 5.0,
2511
- "num_truncated_few_shots": 0
2512
- },
2513
- "lighteval|mmlu:jurisprudence|5": {
2514
- "hashes": {
2515
- "hash_examples": "e956f86b124076fe",
2516
- "hash_full_prompts": "1487e89a10ec58b7",
2517
- "hash_input_tokens": "295854ba53c38e1b",
2518
- "hash_cont_tokens": "be0171360f69d6f2"
2519
- },
2520
- "truncated": 0,
2521
- "non_truncated": 108,
2522
- "padded": 432,
2523
- "non_padded": 0,
2524
- "effective_few_shots": 5.0,
2525
- "num_truncated_few_shots": 0
2526
- },
2527
- "lighteval|mmlu:logical_fallacies|5": {
2528
- "hashes": {
2529
- "hash_examples": "956e0e6365ab79f1",
2530
- "hash_full_prompts": "677785b2181f9243",
2531
- "hash_input_tokens": "9ad3e188efd1d33c",
2532
- "hash_cont_tokens": "6b024648afae9ee5"
2533
- },
2534
- "truncated": 0,
2535
- "non_truncated": 163,
2536
- "padded": 652,
2537
- "non_padded": 0,
2538
- "effective_few_shots": 5.0,
2539
- "num_truncated_few_shots": 0
2540
- },
2541
- "lighteval|mmlu:machine_learning|5": {
2542
- "hashes": {
2543
- "hash_examples": "397997cc6f4d581e",
2544
- "hash_full_prompts": "769ee14a2aea49bb",
2545
- "hash_input_tokens": "8968240944bf5437",
2546
- "hash_cont_tokens": "1e08788dc7e95ea0"
2547
- },
2548
- "truncated": 0,
2549
- "non_truncated": 112,
2550
- "padded": 448,
2551
- "non_padded": 0,
2552
- "effective_few_shots": 5.0,
2553
- "num_truncated_few_shots": 0
2554
- },
2555
- "lighteval|mmlu:management|5": {
2556
- "hashes": {
2557
- "hash_examples": "2bcbe6f6ca63d740",
2558
- "hash_full_prompts": "cb1ff9dac9582144",
2559
- "hash_input_tokens": "38100f64fb7e1fd7",
2560
- "hash_cont_tokens": "2e4e36b1749ca046"
2561
- },
2562
- "truncated": 0,
2563
- "non_truncated": 103,
2564
- "padded": 412,
2565
- "non_padded": 0,
2566
- "effective_few_shots": 5.0,
2567
- "num_truncated_few_shots": 0
2568
- },
2569
- "lighteval|mmlu:marketing|5": {
2570
- "hashes": {
2571
- "hash_examples": "8ddb20d964a1b065",
2572
- "hash_full_prompts": "9fc2114a187ad9a2",
2573
- "hash_input_tokens": "b4c521f4c53f8e08",
2574
- "hash_cont_tokens": "31cb7d1a07654a4c"
2575
- },
2576
- "truncated": 0,
2577
- "non_truncated": 234,
2578
- "padded": 936,
2579
- "non_padded": 0,
2580
- "effective_few_shots": 5.0,
2581
- "num_truncated_few_shots": 0
2582
- },
2583
- "lighteval|mmlu:medical_genetics|5": {
2584
- "hashes": {
2585
- "hash_examples": "182a71f4763d2cea",
2586
- "hash_full_prompts": "46a616fa51878959",
2587
- "hash_input_tokens": "dc1faf1dfb1362fd",
2588
- "hash_cont_tokens": "ca6635f013682116"
2589
- },
2590
- "truncated": 0,
2591
- "non_truncated": 100,
2592
- "padded": 400,
2593
- "non_padded": 0,
2594
- "effective_few_shots": 5.0,
2595
- "num_truncated_few_shots": 0
2596
- },
2597
- "lighteval|mmlu:miscellaneous|5": {
2598
- "hashes": {
2599
- "hash_examples": "4c404fdbb4ca57fc",
2600
- "hash_full_prompts": "0813e1be36dbaae1",
2601
- "hash_input_tokens": "9a92e9f634e3f086",
2602
- "hash_cont_tokens": "6c84ccca2d1eb1c8"
2603
- },
2604
- "truncated": 0,
2605
- "non_truncated": 783,
2606
- "padded": 3132,
2607
- "non_padded": 0,
2608
- "effective_few_shots": 5.0,
2609
- "num_truncated_few_shots": 0
2610
- },
2611
- "lighteval|mmlu:moral_disputes|5": {
2612
- "hashes": {
2613
- "hash_examples": "60cbd2baa3fea5c9",
2614
- "hash_full_prompts": "1d14adebb9b62519",
2615
- "hash_input_tokens": "0f9cc303e7a5371d",
2616
- "hash_cont_tokens": "1d61e904d7d686a1"
2617
- },
2618
- "truncated": 0,
2619
- "non_truncated": 346,
2620
- "padded": 1384,
2621
- "non_padded": 0,
2622
- "effective_few_shots": 5.0,
2623
- "num_truncated_few_shots": 0
2624
- },
2625
- "lighteval|mmlu:moral_scenarios|5": {
2626
- "hashes": {
2627
- "hash_examples": "fd8b0431fbdd75ef",
2628
- "hash_full_prompts": "b80d3d236165e3de",
2629
- "hash_input_tokens": "63e22f3bedd6b4ec",
2630
- "hash_cont_tokens": "7eb03b430f07003e"
2631
- },
2632
- "truncated": 0,
2633
- "non_truncated": 895,
2634
- "padded": 3551,
2635
- "non_padded": 29,
2636
- "effective_few_shots": 5.0,
2637
- "num_truncated_few_shots": 0
2638
- },
2639
- "lighteval|mmlu:nutrition|5": {
2640
- "hashes": {
2641
- "hash_examples": "71e55e2b829b6528",
2642
- "hash_full_prompts": "2bfb18e5fab8dea7",
2643
- "hash_input_tokens": "59301dd373fad06a",
2644
- "hash_cont_tokens": "7a0a57f1342b71d6"
2645
- },
2646
- "truncated": 0,
2647
- "non_truncated": 306,
2648
- "padded": 1224,
2649
- "non_padded": 0,
2650
- "effective_few_shots": 5.0,
2651
- "num_truncated_few_shots": 0
2652
- },
2653
- "lighteval|mmlu:philosophy|5": {
2654
- "hashes": {
2655
- "hash_examples": "a6d489a8d208fa4b",
2656
- "hash_full_prompts": "e8c0d5b6dae3ccc8",
2657
- "hash_input_tokens": "b0eb1522d655fe53",
2658
- "hash_cont_tokens": "0c977bed864888bd"
2659
- },
2660
- "truncated": 0,
2661
- "non_truncated": 311,
2662
- "padded": 1244,
2663
- "non_padded": 0,
2664
- "effective_few_shots": 5.0,
2665
- "num_truncated_few_shots": 0
2666
- },
2667
- "lighteval|mmlu:prehistory|5": {
2668
- "hashes": {
2669
- "hash_examples": "6cc50f032a19acaa",
2670
- "hash_full_prompts": "4a6a1d3ab1bf28e4",
2671
- "hash_input_tokens": "a39bd9702da6996f",
2672
- "hash_cont_tokens": "4e57a4b0d66f0736"
2673
- },
2674
- "truncated": 0,
2675
- "non_truncated": 324,
2676
- "padded": 1268,
2677
- "non_padded": 28,
2678
- "effective_few_shots": 5.0,
2679
- "num_truncated_few_shots": 0
2680
- },
2681
- "lighteval|mmlu:professional_accounting|5": {
2682
- "hashes": {
2683
- "hash_examples": "50f57ab32f5f6cea",
2684
- "hash_full_prompts": "e60129bd2d82ffc6",
2685
- "hash_input_tokens": "440a4a6e8c648413",
2686
- "hash_cont_tokens": "d01eb63ed6c749e0"
2687
- },
2688
- "truncated": 0,
2689
- "non_truncated": 282,
2690
- "padded": 1120,
2691
- "non_padded": 8,
2692
- "effective_few_shots": 5.0,
2693
- "num_truncated_few_shots": 0
2694
- },
2695
- "lighteval|mmlu:professional_law|5": {
2696
- "hashes": {
2697
- "hash_examples": "a8fdc85c64f4b215",
2698
- "hash_full_prompts": "0dbb1d9b72dcea03",
2699
- "hash_input_tokens": "135d3c6befbf7b8e",
2700
- "hash_cont_tokens": "04ed4a3308eb17e7"
2701
- },
2702
- "truncated": 0,
2703
- "non_truncated": 1534,
2704
- "padded": 6136,
2705
- "non_padded": 0,
2706
- "effective_few_shots": 5.0,
2707
- "num_truncated_few_shots": 0
2708
- },
2709
- "lighteval|mmlu:professional_medicine|5": {
2710
- "hashes": {
2711
- "hash_examples": "c373a28a3050a73a",
2712
- "hash_full_prompts": "5e040f9ca68b089e",
2713
- "hash_input_tokens": "bb56142819292718",
2714
- "hash_cont_tokens": "7d29b56ef2b26e26"
2715
- },
2716
- "truncated": 0,
2717
- "non_truncated": 272,
2718
- "padded": 1088,
2719
- "non_padded": 0,
2720
- "effective_few_shots": 5.0,
2721
- "num_truncated_few_shots": 0
2722
- },
2723
- "lighteval|mmlu:professional_psychology|5": {
2724
- "hashes": {
2725
- "hash_examples": "bf5254fe818356af",
2726
- "hash_full_prompts": "b386ecda8b87150e",
2727
- "hash_input_tokens": "69c9d6c20011b3c8",
2728
- "hash_cont_tokens": "05a71f9a9871b8f8"
2729
- },
2730
- "truncated": 0,
2731
- "non_truncated": 612,
2732
- "padded": 2448,
2733
- "non_padded": 0,
2734
- "effective_few_shots": 5.0,
2735
- "num_truncated_few_shots": 0
2736
- },
2737
- "lighteval|mmlu:public_relations|5": {
2738
- "hashes": {
2739
- "hash_examples": "b66d52e28e7d14e0",
2740
- "hash_full_prompts": "fe43562263e25677",
2741
- "hash_input_tokens": "a7ad0cf2c3f2b991",
2742
- "hash_cont_tokens": "54b160d9a82d8c27"
2743
- },
2744
- "truncated": 0,
2745
- "non_truncated": 110,
2746
- "padded": 440,
2747
- "non_padded": 0,
2748
- "effective_few_shots": 5.0,
2749
- "num_truncated_few_shots": 0
2750
- },
2751
- "lighteval|mmlu:security_studies|5": {
2752
- "hashes": {
2753
- "hash_examples": "514c14feaf000ad9",
2754
- "hash_full_prompts": "27d4a2ac541ef4b9",
2755
- "hash_input_tokens": "6cae3c98c2406b72",
2756
- "hash_cont_tokens": "a243156a0dfe5f0f"
2757
- },
2758
- "truncated": 0,
2759
- "non_truncated": 245,
2760
- "padded": 980,
2761
- "non_padded": 0,
2762
- "effective_few_shots": 5.0,
2763
- "num_truncated_few_shots": 0
2764
- },
2765
- "lighteval|mmlu:sociology|5": {
2766
- "hashes": {
2767
- "hash_examples": "f6c9bc9d18c80870",
2768
- "hash_full_prompts": "c072ea7d1a1524f2",
2769
- "hash_input_tokens": "73f8d9b6efbc989c",
2770
- "hash_cont_tokens": "bc76cf135cbda520"
2771
- },
2772
- "truncated": 0,
2773
- "non_truncated": 201,
2774
- "padded": 804,
2775
- "non_padded": 0,
2776
- "effective_few_shots": 5.0,
2777
- "num_truncated_few_shots": 0
2778
- },
2779
- "lighteval|mmlu:us_foreign_policy|5": {
2780
- "hashes": {
2781
- "hash_examples": "ed7b78629db6678f",
2782
- "hash_full_prompts": "341a97ca3e4d699d",
2783
- "hash_input_tokens": "3d5111b05caedd5c",
2784
- "hash_cont_tokens": "ca6635f013682116"
2785
- },
2786
- "truncated": 0,
2787
- "non_truncated": 100,
2788
- "padded": 397,
2789
- "non_padded": 3,
2790
- "effective_few_shots": 5.0,
2791
- "num_truncated_few_shots": 0
2792
- },
2793
- "lighteval|mmlu:virology|5": {
2794
- "hashes": {
2795
- "hash_examples": "bc52ffdc3f9b994a",
2796
- "hash_full_prompts": "651d471e2eb8b5e9",
2797
- "hash_input_tokens": "e451260208a4d06a",
2798
- "hash_cont_tokens": "3d8151c061cd8307"
2799
- },
2800
- "truncated": 0,
2801
- "non_truncated": 166,
2802
- "padded": 664,
2803
- "non_padded": 0,
2804
- "effective_few_shots": 5.0,
2805
- "num_truncated_few_shots": 0
2806
- },
2807
- "lighteval|mmlu:world_religions|5": {
2808
- "hashes": {
2809
- "hash_examples": "ecdb4a4f94f62930",
2810
- "hash_full_prompts": "3773f03542ce44a3",
2811
- "hash_input_tokens": "3cc09567a0d77652",
2812
- "hash_cont_tokens": "bcdec9004b1a5e7d"
2813
- },
2814
- "truncated": 0,
2815
- "non_truncated": 171,
2816
- "padded": 684,
2817
- "non_padded": 0,
2818
- "effective_few_shots": 5.0,
2819
- "num_truncated_few_shots": 0
2820
- }
2821
- },
2822
- "summary_general": {
2823
- "hashes": {
2824
- "hash_examples": "341a076d0beb7048",
2825
- "hash_full_prompts": "a5c8f2b7ff4f5ae2",
2826
- "hash_input_tokens": "aa0c1e704c27cb14",
2827
- "hash_cont_tokens": "c7e380df958b1906"
2828
- },
2829
- "truncated": 0,
2830
- "non_truncated": 14042,
2831
- "padded": 56074,
2832
- "non_padded": 94,
2833
- "num_truncated_few_shots": 0
2834
- }
2835
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/lewtun/gemma-7b-sft-full-longest-1k/main/truthfulqa/results_2024-02-28T15-53-50.479330.json DELETED
@@ -1,85 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 361499.640182139,
9
- "end_time": 361611.41258232,
10
- "total_evaluation_time_secondes": "111.77240018098382",
11
- "model_name": "lewtun/gemma-7b-sft-full-longest-1k",
12
- "model_sha": "5b354486c4322c02994885f68f00d66a75275dd1",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "16.4 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|truthfulqa:mc|0": {
19
- "truthfulqa_mc1": 0.3561811505507956,
20
- "truthfulqa_mc1_stderr": 0.016763790728446335,
21
- "truthfulqa_mc2": 0.47992622898903625,
22
- "truthfulqa_mc2_stderr": 0.01507766133469192
23
- }
24
- },
25
- "versions": {
26
- "lighteval|truthfulqa:mc|0": 0
27
- },
28
- "config_tasks": {
29
- "lighteval|truthfulqa:mc": {
30
- "name": "truthfulqa:mc",
31
- "prompt_function": "truthful_qa_multiple_choice",
32
- "hf_repo": "truthful_qa",
33
- "hf_subset": "multiple_choice",
34
- "metric": [
35
- "truthfulqa_mc_metrics"
36
- ],
37
- "hf_avail_splits": [
38
- "validation"
39
- ],
40
- "evaluation_splits": [
41
- "validation"
42
- ],
43
- "few_shots_split": null,
44
- "few_shots_select": null,
45
- "generation_size": -1,
46
- "stop_sequence": [
47
- "\n"
48
- ],
49
- "output_regex": null,
50
- "frozen": false,
51
- "suite": [
52
- "lighteval"
53
- ]
54
- }
55
- },
56
- "summary_tasks": {
57
- "lighteval|truthfulqa:mc|0": {
58
- "hashes": {
59
- "hash_examples": "36a6d90e75d92d4a",
60
- "hash_full_prompts": "17e9d0dc9f923ba3",
61
- "hash_input_tokens": "d764301724557e7f",
62
- "hash_cont_tokens": "93161ab7beb699a8"
63
- },
64
- "truncated": 0,
65
- "non_truncated": 817,
66
- "padded": 9512,
67
- "non_padded": 484,
68
- "effective_few_shots": 0.0,
69
- "num_truncated_few_shots": 0
70
- }
71
- },
72
- "summary_general": {
73
- "hashes": {
74
- "hash_examples": "aed1dfc67e53d0f2",
75
- "hash_full_prompts": "81a2e5a97bc8b7e3",
76
- "hash_input_tokens": "5706646d3600ad17",
77
- "hash_cont_tokens": "ada3600dbba47dea"
78
- },
79
- "truncated": 0,
80
- "non_truncated": 817,
81
- "padded": 9512,
82
- "non_padded": 484,
83
- "num_truncated_few_shots": 0
84
- }
85
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/lewtun/gemma-7b-sft-full-longest-1k/main/winogrande/results_2024-02-28T15-55-28.054121.json DELETED
@@ -1,85 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 361640.025355664,
9
- "end_time": 361708.990188584,
10
- "total_evaluation_time_secondes": "68.96483291999903",
11
- "model_name": "lewtun/gemma-7b-sft-full-longest-1k",
12
- "model_sha": "5b354486c4322c02994885f68f00d66a75275dd1",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "16.4 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|winogrande|5": {
19
- "acc": 0.580110497237569,
20
- "acc_stderr": 0.01387094398631039
21
- }
22
- },
23
- "versions": {
24
- "lighteval|winogrande|5": 0
25
- },
26
- "config_tasks": {
27
- "lighteval|winogrande": {
28
- "name": "winogrande",
29
- "prompt_function": "winogrande",
30
- "hf_repo": "winogrande",
31
- "hf_subset": "winogrande_xl",
32
- "metric": [
33
- "loglikelihood_acc"
34
- ],
35
- "hf_avail_splits": [
36
- "train",
37
- "test",
38
- "validation"
39
- ],
40
- "evaluation_splits": [
41
- "validation"
42
- ],
43
- "few_shots_split": null,
44
- "few_shots_select": "random_sampling",
45
- "generation_size": -1,
46
- "stop_sequence": [
47
- "\n"
48
- ],
49
- "output_regex": null,
50
- "frozen": false,
51
- "suite": [
52
- "lighteval"
53
- ]
54
- }
55
- },
56
- "summary_tasks": {
57
- "lighteval|winogrande|5": {
58
- "hashes": {
59
- "hash_examples": "087d5d1a1afd4c7b",
60
- "hash_full_prompts": "29e044bcf40d6a6d",
61
- "hash_input_tokens": "93c7842f85bc4240",
62
- "hash_cont_tokens": "c4ed575f59ed10a2"
63
- },
64
- "truncated": 0,
65
- "non_truncated": 1267,
66
- "padded": 2381,
67
- "non_padded": 153,
68
- "effective_few_shots": 5.0,
69
- "num_truncated_few_shots": 0
70
- }
71
- },
72
- "summary_general": {
73
- "hashes": {
74
- "hash_examples": "b9a49975cc41fab7",
75
- "hash_full_prompts": "2f908b2b9b5ec583",
76
- "hash_input_tokens": "db8606261a0b73b5",
77
- "hash_cont_tokens": "f7a87e57c3284cc1"
78
- },
79
- "truncated": 0,
80
- "non_truncated": 1267,
81
- "padded": 2381,
82
- "non_padded": 153,
83
- "num_truncated_few_shots": 0
84
- }
85
- }