lewtun HF Staff commited on
Commit
0d9ec01
·
1 Parent(s): b48eda8
Files changed (17) hide show
  1. eval_results/Qwen/Qwen1.5-0.5B-Chat/main/arc/results_2024-02-28T07-35-58.803958.json +0 -88
  2. eval_results/Qwen/Qwen1.5-0.5B-Chat/main/gsm8k/results_2024-02-28T07-44-11.234625.json +0 -86
  3. eval_results/Qwen/Qwen1.5-0.5B-Chat/main/hellaswag/results_2024-02-28T07-40-46.854914.json +0 -88
  4. eval_results/Qwen/Qwen1.5-0.5B-Chat/main/mmlu/results_2024-02-28T07-45-42.141224.json +0 -2835
  5. eval_results/Qwen/Qwen1.5-0.5B-Chat/main/truthfulqa/results_2024-02-28T07-35-48.691367.json +0 -85
  6. eval_results/Qwen/Qwen1.5-0.5B-Chat/main/winogrande/results_2024-02-27T20-42-29.221265.json +0 -85
  7. eval_results/Qwen/Qwen1.5-0.5B-Chat/main/winogrande/results_2024-02-28T07-35-24.539002.json +0 -85
  8. eval_results/Qwen/Qwen1.5-0.5B/main/arc/results_2024-03-02T12-48-25.468716.json +0 -90
  9. eval_results/Qwen/Qwen1.5-0.5B/main/gsm8k/results_2024-03-02T12-51-54.476794.json +0 -3
  10. eval_results/Qwen/Qwen1.5-0.5B/main/hellaswag/results_2024-03-02T12-51-59.737337.json +0 -90
  11. eval_results/Qwen/Qwen1.5-0.5B/main/ifeval/results_2024-03-02T12-56-57.071885.json +0 -89
  12. eval_results/Qwen/Qwen1.5-0.5B/main/mmlu/results_2024-03-02T12-55-58.089063.json +0 -2949
  13. eval_results/Qwen/Qwen1.5-0.5B/main/truthfulqa/results_2024-03-02T12-48-12.539015.json +0 -87
  14. eval_results/Qwen/Qwen1.5-0.5B/main/winogrande/results_2024-03-02T12-47-59.918589.json +0 -87
  15. eval_results/abacaj/phi-2-super/main/ifeval/results_2024-03-02T12-34-38.484385.json +0 -89
  16. eval_results/lewtun/gemma-7b-dpo-full-mix1-beta-0.05-epoch-2/main/gsm8k/results_2024-03-01T11-15-08.605142.json +0 -88
  17. eval_results/lewtun/gemma-7b-dpo-full-mix1-beta-0.05-epoch-2/main/mmlu/results_2024-03-01T11-13-36.220599.json +0 -2949
eval_results/Qwen/Qwen1.5-0.5B-Chat/main/arc/results_2024-02-28T07-35-58.803958.json DELETED
@@ -1,88 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2577613.595521999,
9
- "end_time": 2577691.977384157,
10
- "total_evaluation_time_secondes": "78.38186215795577",
11
- "model_name": "Qwen/Qwen1.5-0.5B-Chat",
12
- "model_sha": "7f630fd18dccab574ab1b78411a8753f989a55ac",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|arc:challenge|25": {
19
- "acc": 0.28668941979522183,
20
- "acc_stderr": 0.013214986329274757,
21
- "acc_norm": 0.310580204778157,
22
- "acc_norm_stderr": 0.013522292098053045
23
- }
24
- },
25
- "versions": {
26
- "lighteval|arc:challenge|25": 0
27
- },
28
- "config_tasks": {
29
- "lighteval|arc:challenge": {
30
- "name": "arc:challenge",
31
- "prompt_function": "arc",
32
- "hf_repo": "ai2_arc",
33
- "hf_subset": "ARC-Challenge",
34
- "metric": [
35
- "loglikelihood_acc",
36
- "loglikelihood_acc_norm_nospace"
37
- ],
38
- "hf_avail_splits": [
39
- "train",
40
- "test"
41
- ],
42
- "evaluation_splits": [
43
- "test"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": "random_sampling_from_train",
47
- "generation_size": 1,
48
- "stop_sequence": [
49
- "\n"
50
- ],
51
- "output_regex": null,
52
- "frozen": false,
53
- "suite": [
54
- "lighteval",
55
- "arc"
56
- ]
57
- }
58
- },
59
- "summary_tasks": {
60
- "lighteval|arc:challenge|25": {
61
- "hashes": {
62
- "hash_examples": "17b0cae357c0259e",
63
- "hash_full_prompts": "4613138cb84a1c53",
64
- "hash_input_tokens": "85ecaa299a6a917a",
65
- "hash_cont_tokens": "da3689055cb5fa28"
66
- },
67
- "truncated": 0,
68
- "non_truncated": 1172,
69
- "padded": 4651,
70
- "non_padded": 36,
71
- "effective_few_shots": 25.0,
72
- "num_truncated_few_shots": 0
73
- }
74
- },
75
- "summary_general": {
76
- "hashes": {
77
- "hash_examples": "aaa6929c6d3771fb",
78
- "hash_full_prompts": "de0ea2e1cc95d72a",
79
- "hash_input_tokens": "549aeedee8982096",
80
- "hash_cont_tokens": "da91173bce47858f"
81
- },
82
- "truncated": 0,
83
- "non_truncated": 1172,
84
- "padded": 4651,
85
- "non_padded": 36,
86
- "num_truncated_few_shots": 0
87
- }
88
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B-Chat/main/gsm8k/results_2024-02-28T07-44-11.234625.json DELETED
@@ -1,86 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 763650.857058454,
9
- "end_time": 764224.230277987,
10
- "total_evaluation_time_secondes": "573.3732195330085",
11
- "model_name": "Qwen/Qwen1.5-0.5B-Chat",
12
- "model_sha": "7f630fd18dccab574ab1b78411a8753f989a55ac",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|gsm8k|5": {
19
- "qem": 0.03866565579984837,
20
- "qem_stderr": 0.005310583162098074
21
- }
22
- },
23
- "versions": {
24
- "lighteval|gsm8k|5": 0
25
- },
26
- "config_tasks": {
27
- "lighteval|gsm8k": {
28
- "name": "gsm8k",
29
- "prompt_function": "gsm8k",
30
- "hf_repo": "gsm8k",
31
- "hf_subset": "main",
32
- "metric": [
33
- "quasi_exact_match_gsm8k"
34
- ],
35
- "hf_avail_splits": [
36
- "train",
37
- "test"
38
- ],
39
- "evaluation_splits": [
40
- "test"
41
- ],
42
- "few_shots_split": null,
43
- "few_shots_select": "random_sampling_from_train",
44
- "generation_size": 256,
45
- "stop_sequence": [
46
- ":",
47
- "Question:",
48
- "Question"
49
- ],
50
- "output_regex": null,
51
- "frozen": false,
52
- "suite": [
53
- "lighteval"
54
- ]
55
- }
56
- },
57
- "summary_tasks": {
58
- "lighteval|gsm8k|5": {
59
- "hashes": {
60
- "hash_examples": "0ed016e24e7512fd",
61
- "hash_full_prompts": "66ff2108d151c6de",
62
- "hash_input_tokens": "aa57736f8c45c33c",
63
- "hash_cont_tokens": "4385070600734386"
64
- },
65
- "truncated": 0,
66
- "non_truncated": 1319,
67
- "padded": 0,
68
- "non_padded": 1319,
69
- "effective_few_shots": 5.0,
70
- "num_truncated_few_shots": 0
71
- }
72
- },
73
- "summary_general": {
74
- "hashes": {
75
- "hash_examples": "bc71463e88551d0e",
76
- "hash_full_prompts": "84c6a9cfa115dda5",
77
- "hash_input_tokens": "5aac43a2850ca464",
78
- "hash_cont_tokens": "7d429d7833abb236"
79
- },
80
- "truncated": 0,
81
- "non_truncated": 1319,
82
- "padded": 0,
83
- "non_padded": 1319,
84
- "num_truncated_few_shots": 0
85
- }
86
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B-Chat/main/hellaswag/results_2024-02-28T07-40-46.854914.json DELETED
@@ -1,88 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 3929412.765519201,
9
- "end_time": 3929781.520172699,
10
- "total_evaluation_time_secondes": "368.7546534980647",
11
- "model_name": "Qwen/Qwen1.5-0.5B-Chat",
12
- "model_sha": "7f630fd18dccab574ab1b78411a8753f989a55ac",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|hellaswag|10": {
19
- "acc": 0.3632742481577375,
20
- "acc_stderr": 0.00479959984039738,
21
- "acc_norm": 0.4380601473809998,
22
- "acc_norm_stderr": 0.00495134633816451
23
- }
24
- },
25
- "versions": {
26
- "lighteval|hellaswag|10": 0
27
- },
28
- "config_tasks": {
29
- "lighteval|hellaswag": {
30
- "name": "hellaswag",
31
- "prompt_function": "hellaswag_harness",
32
- "hf_repo": "hellaswag",
33
- "hf_subset": "default",
34
- "metric": [
35
- "loglikelihood_acc",
36
- "loglikelihood_acc_norm"
37
- ],
38
- "hf_avail_splits": [
39
- "train",
40
- "test",
41
- "validation"
42
- ],
43
- "evaluation_splits": [
44
- "validation"
45
- ],
46
- "few_shots_split": null,
47
- "few_shots_select": "random_sampling_from_train",
48
- "generation_size": -1,
49
- "stop_sequence": [
50
- "\n"
51
- ],
52
- "output_regex": null,
53
- "frozen": false,
54
- "suite": [
55
- "lighteval"
56
- ]
57
- }
58
- },
59
- "summary_tasks": {
60
- "lighteval|hellaswag|10": {
61
- "hashes": {
62
- "hash_examples": "31985c805c3a737e",
63
- "hash_full_prompts": "693f4478cad5e6ae",
64
- "hash_input_tokens": "74bfcf67a650cc23",
65
- "hash_cont_tokens": "3a21ed80a4a7585d"
66
- },
67
- "truncated": 0,
68
- "non_truncated": 10042,
69
- "padded": 39979,
70
- "non_padded": 189,
71
- "effective_few_shots": 10.0,
72
- "num_truncated_few_shots": 0
73
- }
74
- },
75
- "summary_general": {
76
- "hashes": {
77
- "hash_examples": "63bc2cf8bae03fbc",
78
- "hash_full_prompts": "3be43840a0fc57a3",
79
- "hash_input_tokens": "5ae15d6c23138d3e",
80
- "hash_cont_tokens": "b7bfbb64b00158d2"
81
- },
82
- "truncated": 0,
83
- "non_truncated": 10042,
84
- "padded": 39979,
85
- "non_padded": 189,
86
- "num_truncated_few_shots": 0
87
- }
88
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B-Chat/main/mmlu/results_2024-02-28T07-45-42.141224.json DELETED
@@ -1,2835 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2507345.062565505,
9
- "end_time": 2508008.844386519,
10
- "total_evaluation_time_secondes": "663.7818210138939",
11
- "model_name": "Qwen/Qwen1.5-0.5B-Chat",
12
- "model_sha": "7f630fd18dccab574ab1b78411a8753f989a55ac",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|mmlu:abstract_algebra|5": {
19
- "acc": 0.26,
20
- "acc_stderr": 0.0440844002276808
21
- },
22
- "lighteval|mmlu:anatomy|5": {
23
- "acc": 0.31851851851851853,
24
- "acc_stderr": 0.04024778401977111
25
- },
26
- "lighteval|mmlu:astronomy|5": {
27
- "acc": 0.34210526315789475,
28
- "acc_stderr": 0.038607315993160925
29
- },
30
- "lighteval|mmlu:business_ethics|5": {
31
- "acc": 0.5,
32
- "acc_stderr": 0.050251890762960605
33
- },
34
- "lighteval|mmlu:clinical_knowledge|5": {
35
- "acc": 0.3622641509433962,
36
- "acc_stderr": 0.0295822451283843
37
- },
38
- "lighteval|mmlu:college_biology|5": {
39
- "acc": 0.3194444444444444,
40
- "acc_stderr": 0.038990736873573344
41
- },
42
- "lighteval|mmlu:college_chemistry|5": {
43
- "acc": 0.36,
44
- "acc_stderr": 0.048241815132442176
45
- },
46
- "lighteval|mmlu:college_computer_science|5": {
47
- "acc": 0.37,
48
- "acc_stderr": 0.048523658709391
49
- },
50
- "lighteval|mmlu:college_mathematics|5": {
51
- "acc": 0.33,
52
- "acc_stderr": 0.047258156262526045
53
- },
54
- "lighteval|mmlu:college_medicine|5": {
55
- "acc": 0.3468208092485549,
56
- "acc_stderr": 0.036291466701596636
57
- },
58
- "lighteval|mmlu:college_physics|5": {
59
- "acc": 0.29411764705882354,
60
- "acc_stderr": 0.04533838195929775
61
- },
62
- "lighteval|mmlu:computer_security|5": {
63
- "acc": 0.41,
64
- "acc_stderr": 0.049431107042371025
65
- },
66
- "lighteval|mmlu:conceptual_physics|5": {
67
- "acc": 0.2680851063829787,
68
- "acc_stderr": 0.028957342788342347
69
- },
70
- "lighteval|mmlu:econometrics|5": {
71
- "acc": 0.2894736842105263,
72
- "acc_stderr": 0.04266339443159393
73
- },
74
- "lighteval|mmlu:electrical_engineering|5": {
75
- "acc": 0.4,
76
- "acc_stderr": 0.04082482904638628
77
- },
78
- "lighteval|mmlu:elementary_mathematics|5": {
79
- "acc": 0.2830687830687831,
80
- "acc_stderr": 0.023201392938194978
81
- },
82
- "lighteval|mmlu:formal_logic|5": {
83
- "acc": 0.25396825396825395,
84
- "acc_stderr": 0.03893259610604671
85
- },
86
- "lighteval|mmlu:global_facts|5": {
87
- "acc": 0.31,
88
- "acc_stderr": 0.04648231987117316
89
- },
90
- "lighteval|mmlu:high_school_biology|5": {
91
- "acc": 0.3870967741935484,
92
- "acc_stderr": 0.02770935967503249
93
- },
94
- "lighteval|mmlu:high_school_chemistry|5": {
95
- "acc": 0.3103448275862069,
96
- "acc_stderr": 0.03255086769970103
97
- },
98
- "lighteval|mmlu:high_school_computer_science|5": {
99
- "acc": 0.33,
100
- "acc_stderr": 0.04725815626252605
101
- },
102
- "lighteval|mmlu:high_school_european_history|5": {
103
- "acc": 0.48484848484848486,
104
- "acc_stderr": 0.03902551007374449
105
- },
106
- "lighteval|mmlu:high_school_geography|5": {
107
- "acc": 0.5050505050505051,
108
- "acc_stderr": 0.035621707606254015
109
- },
110
- "lighteval|mmlu:high_school_government_and_politics|5": {
111
- "acc": 0.41968911917098445,
112
- "acc_stderr": 0.035615873276858834
113
- },
114
- "lighteval|mmlu:high_school_macroeconomics|5": {
115
- "acc": 0.3717948717948718,
116
- "acc_stderr": 0.02450347255711094
117
- },
118
- "lighteval|mmlu:high_school_mathematics|5": {
119
- "acc": 0.25925925925925924,
120
- "acc_stderr": 0.026719240783712166
121
- },
122
- "lighteval|mmlu:high_school_microeconomics|5": {
123
- "acc": 0.3739495798319328,
124
- "acc_stderr": 0.031429466378837076
125
- },
126
- "lighteval|mmlu:high_school_physics|5": {
127
- "acc": 0.2847682119205298,
128
- "acc_stderr": 0.03684881521389023
129
- },
130
- "lighteval|mmlu:high_school_psychology|5": {
131
- "acc": 0.3889908256880734,
132
- "acc_stderr": 0.020902300887392866
133
- },
134
- "lighteval|mmlu:high_school_statistics|5": {
135
- "acc": 0.3194444444444444,
136
- "acc_stderr": 0.0317987634217685
137
- },
138
- "lighteval|mmlu:high_school_us_history|5": {
139
- "acc": 0.4411764705882353,
140
- "acc_stderr": 0.034849415144292316
141
- },
142
- "lighteval|mmlu:high_school_world_history|5": {
143
- "acc": 0.5021097046413502,
144
- "acc_stderr": 0.032546938018020076
145
- },
146
- "lighteval|mmlu:human_aging|5": {
147
- "acc": 0.3273542600896861,
148
- "acc_stderr": 0.03149384670994131
149
- },
150
- "lighteval|mmlu:human_sexuality|5": {
151
- "acc": 0.37404580152671757,
152
- "acc_stderr": 0.042438692422305246
153
- },
154
- "lighteval|mmlu:international_law|5": {
155
- "acc": 0.5289256198347108,
156
- "acc_stderr": 0.04556710331269498
157
- },
158
- "lighteval|mmlu:jurisprudence|5": {
159
- "acc": 0.4074074074074074,
160
- "acc_stderr": 0.04750077341199985
161
- },
162
- "lighteval|mmlu:logical_fallacies|5": {
163
- "acc": 0.34355828220858897,
164
- "acc_stderr": 0.03731133519673893
165
- },
166
- "lighteval|mmlu:machine_learning|5": {
167
- "acc": 0.2767857142857143,
168
- "acc_stderr": 0.042466243366976256
169
- },
170
- "lighteval|mmlu:management|5": {
171
- "acc": 0.4563106796116505,
172
- "acc_stderr": 0.04931801994220414
173
- },
174
- "lighteval|mmlu:marketing|5": {
175
- "acc": 0.5,
176
- "acc_stderr": 0.03275608910402091
177
- },
178
- "lighteval|mmlu:medical_genetics|5": {
179
- "acc": 0.37,
180
- "acc_stderr": 0.04852365870939098
181
- },
182
- "lighteval|mmlu:miscellaneous|5": {
183
- "acc": 0.38697318007662834,
184
- "acc_stderr": 0.017417138059440132
185
- },
186
- "lighteval|mmlu:moral_disputes|5": {
187
- "acc": 0.43641618497109824,
188
- "acc_stderr": 0.026700545424943687
189
- },
190
- "lighteval|mmlu:moral_scenarios|5": {
191
- "acc": 0.24134078212290502,
192
- "acc_stderr": 0.014310999547961441
193
- },
194
- "lighteval|mmlu:nutrition|5": {
195
- "acc": 0.42483660130718953,
196
- "acc_stderr": 0.028304576673141124
197
- },
198
- "lighteval|mmlu:philosophy|5": {
199
- "acc": 0.3987138263665595,
200
- "acc_stderr": 0.0278093225857745
201
- },
202
- "lighteval|mmlu:prehistory|5": {
203
- "acc": 0.3734567901234568,
204
- "acc_stderr": 0.026915003011380157
205
- },
206
- "lighteval|mmlu:professional_accounting|5": {
207
- "acc": 0.2907801418439716,
208
- "acc_stderr": 0.027090664368353178
209
- },
210
- "lighteval|mmlu:professional_law|5": {
211
- "acc": 0.33116036505867014,
212
- "acc_stderr": 0.01202012819598575
213
- },
214
- "lighteval|mmlu:professional_medicine|5": {
215
- "acc": 0.3860294117647059,
216
- "acc_stderr": 0.029573269134411124
217
- },
218
- "lighteval|mmlu:professional_psychology|5": {
219
- "acc": 0.3562091503267974,
220
- "acc_stderr": 0.019373332420724493
221
- },
222
- "lighteval|mmlu:public_relations|5": {
223
- "acc": 0.42727272727272725,
224
- "acc_stderr": 0.04738198703545483
225
- },
226
- "lighteval|mmlu:security_studies|5": {
227
- "acc": 0.2693877551020408,
228
- "acc_stderr": 0.02840125202902294
229
- },
230
- "lighteval|mmlu:sociology|5": {
231
- "acc": 0.5174129353233831,
232
- "acc_stderr": 0.03533389234739245
233
- },
234
- "lighteval|mmlu:us_foreign_policy|5": {
235
- "acc": 0.54,
236
- "acc_stderr": 0.05009082659620333
237
- },
238
- "lighteval|mmlu:virology|5": {
239
- "acc": 0.3433734939759036,
240
- "acc_stderr": 0.036965843170106004
241
- },
242
- "lighteval|mmlu:world_religions|5": {
243
- "acc": 0.2807017543859649,
244
- "acc_stderr": 0.034462962170884265
245
- },
246
- "lighteval|mmlu:_average|5": {
247
- "acc": 0.3681551334211768,
248
- "acc_stderr": 0.035698565367394484
249
- }
250
- },
251
- "versions": {
252
- "lighteval|mmlu:abstract_algebra|5": 0,
253
- "lighteval|mmlu:anatomy|5": 0,
254
- "lighteval|mmlu:astronomy|5": 0,
255
- "lighteval|mmlu:business_ethics|5": 0,
256
- "lighteval|mmlu:clinical_knowledge|5": 0,
257
- "lighteval|mmlu:college_biology|5": 0,
258
- "lighteval|mmlu:college_chemistry|5": 0,
259
- "lighteval|mmlu:college_computer_science|5": 0,
260
- "lighteval|mmlu:college_mathematics|5": 0,
261
- "lighteval|mmlu:college_medicine|5": 0,
262
- "lighteval|mmlu:college_physics|5": 0,
263
- "lighteval|mmlu:computer_security|5": 0,
264
- "lighteval|mmlu:conceptual_physics|5": 0,
265
- "lighteval|mmlu:econometrics|5": 0,
266
- "lighteval|mmlu:electrical_engineering|5": 0,
267
- "lighteval|mmlu:elementary_mathematics|5": 0,
268
- "lighteval|mmlu:formal_logic|5": 0,
269
- "lighteval|mmlu:global_facts|5": 0,
270
- "lighteval|mmlu:high_school_biology|5": 0,
271
- "lighteval|mmlu:high_school_chemistry|5": 0,
272
- "lighteval|mmlu:high_school_computer_science|5": 0,
273
- "lighteval|mmlu:high_school_european_history|5": 0,
274
- "lighteval|mmlu:high_school_geography|5": 0,
275
- "lighteval|mmlu:high_school_government_and_politics|5": 0,
276
- "lighteval|mmlu:high_school_macroeconomics|5": 0,
277
- "lighteval|mmlu:high_school_mathematics|5": 0,
278
- "lighteval|mmlu:high_school_microeconomics|5": 0,
279
- "lighteval|mmlu:high_school_physics|5": 0,
280
- "lighteval|mmlu:high_school_psychology|5": 0,
281
- "lighteval|mmlu:high_school_statistics|5": 0,
282
- "lighteval|mmlu:high_school_us_history|5": 0,
283
- "lighteval|mmlu:high_school_world_history|5": 0,
284
- "lighteval|mmlu:human_aging|5": 0,
285
- "lighteval|mmlu:human_sexuality|5": 0,
286
- "lighteval|mmlu:international_law|5": 0,
287
- "lighteval|mmlu:jurisprudence|5": 0,
288
- "lighteval|mmlu:logical_fallacies|5": 0,
289
- "lighteval|mmlu:machine_learning|5": 0,
290
- "lighteval|mmlu:management|5": 0,
291
- "lighteval|mmlu:marketing|5": 0,
292
- "lighteval|mmlu:medical_genetics|5": 0,
293
- "lighteval|mmlu:miscellaneous|5": 0,
294
- "lighteval|mmlu:moral_disputes|5": 0,
295
- "lighteval|mmlu:moral_scenarios|5": 0,
296
- "lighteval|mmlu:nutrition|5": 0,
297
- "lighteval|mmlu:philosophy|5": 0,
298
- "lighteval|mmlu:prehistory|5": 0,
299
- "lighteval|mmlu:professional_accounting|5": 0,
300
- "lighteval|mmlu:professional_law|5": 0,
301
- "lighteval|mmlu:professional_medicine|5": 0,
302
- "lighteval|mmlu:professional_psychology|5": 0,
303
- "lighteval|mmlu:public_relations|5": 0,
304
- "lighteval|mmlu:security_studies|5": 0,
305
- "lighteval|mmlu:sociology|5": 0,
306
- "lighteval|mmlu:us_foreign_policy|5": 0,
307
- "lighteval|mmlu:virology|5": 0,
308
- "lighteval|mmlu:world_religions|5": 0
309
- },
310
- "config_tasks": {
311
- "lighteval|mmlu:abstract_algebra": {
312
- "name": "mmlu:abstract_algebra",
313
- "prompt_function": "mmlu_harness",
314
- "hf_repo": "lighteval/mmlu",
315
- "hf_subset": "abstract_algebra",
316
- "metric": [
317
- "loglikelihood_acc"
318
- ],
319
- "hf_avail_splits": [
320
- "auxiliary_train",
321
- "test",
322
- "validation",
323
- "dev"
324
- ],
325
- "evaluation_splits": [
326
- "test"
327
- ],
328
- "few_shots_split": "dev",
329
- "few_shots_select": "sequential",
330
- "generation_size": 1,
331
- "stop_sequence": [
332
- "\n"
333
- ],
334
- "output_regex": null,
335
- "frozen": false,
336
- "suite": [
337
- "lighteval",
338
- "mmlu"
339
- ]
340
- },
341
- "lighteval|mmlu:anatomy": {
342
- "name": "mmlu:anatomy",
343
- "prompt_function": "mmlu_harness",
344
- "hf_repo": "lighteval/mmlu",
345
- "hf_subset": "anatomy",
346
- "metric": [
347
- "loglikelihood_acc"
348
- ],
349
- "hf_avail_splits": [
350
- "auxiliary_train",
351
- "test",
352
- "validation",
353
- "dev"
354
- ],
355
- "evaluation_splits": [
356
- "test"
357
- ],
358
- "few_shots_split": "dev",
359
- "few_shots_select": "sequential",
360
- "generation_size": 1,
361
- "stop_sequence": [
362
- "\n"
363
- ],
364
- "output_regex": null,
365
- "frozen": false,
366
- "suite": [
367
- "lighteval",
368
- "mmlu"
369
- ]
370
- },
371
- "lighteval|mmlu:astronomy": {
372
- "name": "mmlu:astronomy",
373
- "prompt_function": "mmlu_harness",
374
- "hf_repo": "lighteval/mmlu",
375
- "hf_subset": "astronomy",
376
- "metric": [
377
- "loglikelihood_acc"
378
- ],
379
- "hf_avail_splits": [
380
- "auxiliary_train",
381
- "test",
382
- "validation",
383
- "dev"
384
- ],
385
- "evaluation_splits": [
386
- "test"
387
- ],
388
- "few_shots_split": "dev",
389
- "few_shots_select": "sequential",
390
- "generation_size": 1,
391
- "stop_sequence": [
392
- "\n"
393
- ],
394
- "output_regex": null,
395
- "frozen": false,
396
- "suite": [
397
- "lighteval",
398
- "mmlu"
399
- ]
400
- },
401
- "lighteval|mmlu:business_ethics": {
402
- "name": "mmlu:business_ethics",
403
- "prompt_function": "mmlu_harness",
404
- "hf_repo": "lighteval/mmlu",
405
- "hf_subset": "business_ethics",
406
- "metric": [
407
- "loglikelihood_acc"
408
- ],
409
- "hf_avail_splits": [
410
- "auxiliary_train",
411
- "test",
412
- "validation",
413
- "dev"
414
- ],
415
- "evaluation_splits": [
416
- "test"
417
- ],
418
- "few_shots_split": "dev",
419
- "few_shots_select": "sequential",
420
- "generation_size": 1,
421
- "stop_sequence": [
422
- "\n"
423
- ],
424
- "output_regex": null,
425
- "frozen": false,
426
- "suite": [
427
- "lighteval",
428
- "mmlu"
429
- ]
430
- },
431
- "lighteval|mmlu:clinical_knowledge": {
432
- "name": "mmlu:clinical_knowledge",
433
- "prompt_function": "mmlu_harness",
434
- "hf_repo": "lighteval/mmlu",
435
- "hf_subset": "clinical_knowledge",
436
- "metric": [
437
- "loglikelihood_acc"
438
- ],
439
- "hf_avail_splits": [
440
- "auxiliary_train",
441
- "test",
442
- "validation",
443
- "dev"
444
- ],
445
- "evaluation_splits": [
446
- "test"
447
- ],
448
- "few_shots_split": "dev",
449
- "few_shots_select": "sequential",
450
- "generation_size": 1,
451
- "stop_sequence": [
452
- "\n"
453
- ],
454
- "output_regex": null,
455
- "frozen": false,
456
- "suite": [
457
- "lighteval",
458
- "mmlu"
459
- ]
460
- },
461
- "lighteval|mmlu:college_biology": {
462
- "name": "mmlu:college_biology",
463
- "prompt_function": "mmlu_harness",
464
- "hf_repo": "lighteval/mmlu",
465
- "hf_subset": "college_biology",
466
- "metric": [
467
- "loglikelihood_acc"
468
- ],
469
- "hf_avail_splits": [
470
- "auxiliary_train",
471
- "test",
472
- "validation",
473
- "dev"
474
- ],
475
- "evaluation_splits": [
476
- "test"
477
- ],
478
- "few_shots_split": "dev",
479
- "few_shots_select": "sequential",
480
- "generation_size": 1,
481
- "stop_sequence": [
482
- "\n"
483
- ],
484
- "output_regex": null,
485
- "frozen": false,
486
- "suite": [
487
- "lighteval",
488
- "mmlu"
489
- ]
490
- },
491
- "lighteval|mmlu:college_chemistry": {
492
- "name": "mmlu:college_chemistry",
493
- "prompt_function": "mmlu_harness",
494
- "hf_repo": "lighteval/mmlu",
495
- "hf_subset": "college_chemistry",
496
- "metric": [
497
- "loglikelihood_acc"
498
- ],
499
- "hf_avail_splits": [
500
- "auxiliary_train",
501
- "test",
502
- "validation",
503
- "dev"
504
- ],
505
- "evaluation_splits": [
506
- "test"
507
- ],
508
- "few_shots_split": "dev",
509
- "few_shots_select": "sequential",
510
- "generation_size": 1,
511
- "stop_sequence": [
512
- "\n"
513
- ],
514
- "output_regex": null,
515
- "frozen": false,
516
- "suite": [
517
- "lighteval",
518
- "mmlu"
519
- ]
520
- },
521
- "lighteval|mmlu:college_computer_science": {
522
- "name": "mmlu:college_computer_science",
523
- "prompt_function": "mmlu_harness",
524
- "hf_repo": "lighteval/mmlu",
525
- "hf_subset": "college_computer_science",
526
- "metric": [
527
- "loglikelihood_acc"
528
- ],
529
- "hf_avail_splits": [
530
- "auxiliary_train",
531
- "test",
532
- "validation",
533
- "dev"
534
- ],
535
- "evaluation_splits": [
536
- "test"
537
- ],
538
- "few_shots_split": "dev",
539
- "few_shots_select": "sequential",
540
- "generation_size": 1,
541
- "stop_sequence": [
542
- "\n"
543
- ],
544
- "output_regex": null,
545
- "frozen": false,
546
- "suite": [
547
- "lighteval",
548
- "mmlu"
549
- ]
550
- },
551
- "lighteval|mmlu:college_mathematics": {
552
- "name": "mmlu:college_mathematics",
553
- "prompt_function": "mmlu_harness",
554
- "hf_repo": "lighteval/mmlu",
555
- "hf_subset": "college_mathematics",
556
- "metric": [
557
- "loglikelihood_acc"
558
- ],
559
- "hf_avail_splits": [
560
- "auxiliary_train",
561
- "test",
562
- "validation",
563
- "dev"
564
- ],
565
- "evaluation_splits": [
566
- "test"
567
- ],
568
- "few_shots_split": "dev",
569
- "few_shots_select": "sequential",
570
- "generation_size": 1,
571
- "stop_sequence": [
572
- "\n"
573
- ],
574
- "output_regex": null,
575
- "frozen": false,
576
- "suite": [
577
- "lighteval",
578
- "mmlu"
579
- ]
580
- },
581
- "lighteval|mmlu:college_medicine": {
582
- "name": "mmlu:college_medicine",
583
- "prompt_function": "mmlu_harness",
584
- "hf_repo": "lighteval/mmlu",
585
- "hf_subset": "college_medicine",
586
- "metric": [
587
- "loglikelihood_acc"
588
- ],
589
- "hf_avail_splits": [
590
- "auxiliary_train",
591
- "test",
592
- "validation",
593
- "dev"
594
- ],
595
- "evaluation_splits": [
596
- "test"
597
- ],
598
- "few_shots_split": "dev",
599
- "few_shots_select": "sequential",
600
- "generation_size": 1,
601
- "stop_sequence": [
602
- "\n"
603
- ],
604
- "output_regex": null,
605
- "frozen": false,
606
- "suite": [
607
- "lighteval",
608
- "mmlu"
609
- ]
610
- },
611
- "lighteval|mmlu:college_physics": {
612
- "name": "mmlu:college_physics",
613
- "prompt_function": "mmlu_harness",
614
- "hf_repo": "lighteval/mmlu",
615
- "hf_subset": "college_physics",
616
- "metric": [
617
- "loglikelihood_acc"
618
- ],
619
- "hf_avail_splits": [
620
- "auxiliary_train",
621
- "test",
622
- "validation",
623
- "dev"
624
- ],
625
- "evaluation_splits": [
626
- "test"
627
- ],
628
- "few_shots_split": "dev",
629
- "few_shots_select": "sequential",
630
- "generation_size": 1,
631
- "stop_sequence": [
632
- "\n"
633
- ],
634
- "output_regex": null,
635
- "frozen": false,
636
- "suite": [
637
- "lighteval",
638
- "mmlu"
639
- ]
640
- },
641
- "lighteval|mmlu:computer_security": {
642
- "name": "mmlu:computer_security",
643
- "prompt_function": "mmlu_harness",
644
- "hf_repo": "lighteval/mmlu",
645
- "hf_subset": "computer_security",
646
- "metric": [
647
- "loglikelihood_acc"
648
- ],
649
- "hf_avail_splits": [
650
- "auxiliary_train",
651
- "test",
652
- "validation",
653
- "dev"
654
- ],
655
- "evaluation_splits": [
656
- "test"
657
- ],
658
- "few_shots_split": "dev",
659
- "few_shots_select": "sequential",
660
- "generation_size": 1,
661
- "stop_sequence": [
662
- "\n"
663
- ],
664
- "output_regex": null,
665
- "frozen": false,
666
- "suite": [
667
- "lighteval",
668
- "mmlu"
669
- ]
670
- },
671
- "lighteval|mmlu:conceptual_physics": {
672
- "name": "mmlu:conceptual_physics",
673
- "prompt_function": "mmlu_harness",
674
- "hf_repo": "lighteval/mmlu",
675
- "hf_subset": "conceptual_physics",
676
- "metric": [
677
- "loglikelihood_acc"
678
- ],
679
- "hf_avail_splits": [
680
- "auxiliary_train",
681
- "test",
682
- "validation",
683
- "dev"
684
- ],
685
- "evaluation_splits": [
686
- "test"
687
- ],
688
- "few_shots_split": "dev",
689
- "few_shots_select": "sequential",
690
- "generation_size": 1,
691
- "stop_sequence": [
692
- "\n"
693
- ],
694
- "output_regex": null,
695
- "frozen": false,
696
- "suite": [
697
- "lighteval",
698
- "mmlu"
699
- ]
700
- },
701
- "lighteval|mmlu:econometrics": {
702
- "name": "mmlu:econometrics",
703
- "prompt_function": "mmlu_harness",
704
- "hf_repo": "lighteval/mmlu",
705
- "hf_subset": "econometrics",
706
- "metric": [
707
- "loglikelihood_acc"
708
- ],
709
- "hf_avail_splits": [
710
- "auxiliary_train",
711
- "test",
712
- "validation",
713
- "dev"
714
- ],
715
- "evaluation_splits": [
716
- "test"
717
- ],
718
- "few_shots_split": "dev",
719
- "few_shots_select": "sequential",
720
- "generation_size": 1,
721
- "stop_sequence": [
722
- "\n"
723
- ],
724
- "output_regex": null,
725
- "frozen": false,
726
- "suite": [
727
- "lighteval",
728
- "mmlu"
729
- ]
730
- },
731
- "lighteval|mmlu:electrical_engineering": {
732
- "name": "mmlu:electrical_engineering",
733
- "prompt_function": "mmlu_harness",
734
- "hf_repo": "lighteval/mmlu",
735
- "hf_subset": "electrical_engineering",
736
- "metric": [
737
- "loglikelihood_acc"
738
- ],
739
- "hf_avail_splits": [
740
- "auxiliary_train",
741
- "test",
742
- "validation",
743
- "dev"
744
- ],
745
- "evaluation_splits": [
746
- "test"
747
- ],
748
- "few_shots_split": "dev",
749
- "few_shots_select": "sequential",
750
- "generation_size": 1,
751
- "stop_sequence": [
752
- "\n"
753
- ],
754
- "output_regex": null,
755
- "frozen": false,
756
- "suite": [
757
- "lighteval",
758
- "mmlu"
759
- ]
760
- },
761
- "lighteval|mmlu:elementary_mathematics": {
762
- "name": "mmlu:elementary_mathematics",
763
- "prompt_function": "mmlu_harness",
764
- "hf_repo": "lighteval/mmlu",
765
- "hf_subset": "elementary_mathematics",
766
- "metric": [
767
- "loglikelihood_acc"
768
- ],
769
- "hf_avail_splits": [
770
- "auxiliary_train",
771
- "test",
772
- "validation",
773
- "dev"
774
- ],
775
- "evaluation_splits": [
776
- "test"
777
- ],
778
- "few_shots_split": "dev",
779
- "few_shots_select": "sequential",
780
- "generation_size": 1,
781
- "stop_sequence": [
782
- "\n"
783
- ],
784
- "output_regex": null,
785
- "frozen": false,
786
- "suite": [
787
- "lighteval",
788
- "mmlu"
789
- ]
790
- },
791
- "lighteval|mmlu:formal_logic": {
792
- "name": "mmlu:formal_logic",
793
- "prompt_function": "mmlu_harness",
794
- "hf_repo": "lighteval/mmlu",
795
- "hf_subset": "formal_logic",
796
- "metric": [
797
- "loglikelihood_acc"
798
- ],
799
- "hf_avail_splits": [
800
- "auxiliary_train",
801
- "test",
802
- "validation",
803
- "dev"
804
- ],
805
- "evaluation_splits": [
806
- "test"
807
- ],
808
- "few_shots_split": "dev",
809
- "few_shots_select": "sequential",
810
- "generation_size": 1,
811
- "stop_sequence": [
812
- "\n"
813
- ],
814
- "output_regex": null,
815
- "frozen": false,
816
- "suite": [
817
- "lighteval",
818
- "mmlu"
819
- ]
820
- },
821
- "lighteval|mmlu:global_facts": {
822
- "name": "mmlu:global_facts",
823
- "prompt_function": "mmlu_harness",
824
- "hf_repo": "lighteval/mmlu",
825
- "hf_subset": "global_facts",
826
- "metric": [
827
- "loglikelihood_acc"
828
- ],
829
- "hf_avail_splits": [
830
- "auxiliary_train",
831
- "test",
832
- "validation",
833
- "dev"
834
- ],
835
- "evaluation_splits": [
836
- "test"
837
- ],
838
- "few_shots_split": "dev",
839
- "few_shots_select": "sequential",
840
- "generation_size": 1,
841
- "stop_sequence": [
842
- "\n"
843
- ],
844
- "output_regex": null,
845
- "frozen": false,
846
- "suite": [
847
- "lighteval",
848
- "mmlu"
849
- ]
850
- },
851
- "lighteval|mmlu:high_school_biology": {
852
- "name": "mmlu:high_school_biology",
853
- "prompt_function": "mmlu_harness",
854
- "hf_repo": "lighteval/mmlu",
855
- "hf_subset": "high_school_biology",
856
- "metric": [
857
- "loglikelihood_acc"
858
- ],
859
- "hf_avail_splits": [
860
- "auxiliary_train",
861
- "test",
862
- "validation",
863
- "dev"
864
- ],
865
- "evaluation_splits": [
866
- "test"
867
- ],
868
- "few_shots_split": "dev",
869
- "few_shots_select": "sequential",
870
- "generation_size": 1,
871
- "stop_sequence": [
872
- "\n"
873
- ],
874
- "output_regex": null,
875
- "frozen": false,
876
- "suite": [
877
- "lighteval",
878
- "mmlu"
879
- ]
880
- },
881
- "lighteval|mmlu:high_school_chemistry": {
882
- "name": "mmlu:high_school_chemistry",
883
- "prompt_function": "mmlu_harness",
884
- "hf_repo": "lighteval/mmlu",
885
- "hf_subset": "high_school_chemistry",
886
- "metric": [
887
- "loglikelihood_acc"
888
- ],
889
- "hf_avail_splits": [
890
- "auxiliary_train",
891
- "test",
892
- "validation",
893
- "dev"
894
- ],
895
- "evaluation_splits": [
896
- "test"
897
- ],
898
- "few_shots_split": "dev",
899
- "few_shots_select": "sequential",
900
- "generation_size": 1,
901
- "stop_sequence": [
902
- "\n"
903
- ],
904
- "output_regex": null,
905
- "frozen": false,
906
- "suite": [
907
- "lighteval",
908
- "mmlu"
909
- ]
910
- },
911
- "lighteval|mmlu:high_school_computer_science": {
912
- "name": "mmlu:high_school_computer_science",
913
- "prompt_function": "mmlu_harness",
914
- "hf_repo": "lighteval/mmlu",
915
- "hf_subset": "high_school_computer_science",
916
- "metric": [
917
- "loglikelihood_acc"
918
- ],
919
- "hf_avail_splits": [
920
- "auxiliary_train",
921
- "test",
922
- "validation",
923
- "dev"
924
- ],
925
- "evaluation_splits": [
926
- "test"
927
- ],
928
- "few_shots_split": "dev",
929
- "few_shots_select": "sequential",
930
- "generation_size": 1,
931
- "stop_sequence": [
932
- "\n"
933
- ],
934
- "output_regex": null,
935
- "frozen": false,
936
- "suite": [
937
- "lighteval",
938
- "mmlu"
939
- ]
940
- },
941
- "lighteval|mmlu:high_school_european_history": {
942
- "name": "mmlu:high_school_european_history",
943
- "prompt_function": "mmlu_harness",
944
- "hf_repo": "lighteval/mmlu",
945
- "hf_subset": "high_school_european_history",
946
- "metric": [
947
- "loglikelihood_acc"
948
- ],
949
- "hf_avail_splits": [
950
- "auxiliary_train",
951
- "test",
952
- "validation",
953
- "dev"
954
- ],
955
- "evaluation_splits": [
956
- "test"
957
- ],
958
- "few_shots_split": "dev",
959
- "few_shots_select": "sequential",
960
- "generation_size": 1,
961
- "stop_sequence": [
962
- "\n"
963
- ],
964
- "output_regex": null,
965
- "frozen": false,
966
- "suite": [
967
- "lighteval",
968
- "mmlu"
969
- ]
970
- },
971
- "lighteval|mmlu:high_school_geography": {
972
- "name": "mmlu:high_school_geography",
973
- "prompt_function": "mmlu_harness",
974
- "hf_repo": "lighteval/mmlu",
975
- "hf_subset": "high_school_geography",
976
- "metric": [
977
- "loglikelihood_acc"
978
- ],
979
- "hf_avail_splits": [
980
- "auxiliary_train",
981
- "test",
982
- "validation",
983
- "dev"
984
- ],
985
- "evaluation_splits": [
986
- "test"
987
- ],
988
- "few_shots_split": "dev",
989
- "few_shots_select": "sequential",
990
- "generation_size": 1,
991
- "stop_sequence": [
992
- "\n"
993
- ],
994
- "output_regex": null,
995
- "frozen": false,
996
- "suite": [
997
- "lighteval",
998
- "mmlu"
999
- ]
1000
- },
1001
- "lighteval|mmlu:high_school_government_and_politics": {
1002
- "name": "mmlu:high_school_government_and_politics",
1003
- "prompt_function": "mmlu_harness",
1004
- "hf_repo": "lighteval/mmlu",
1005
- "hf_subset": "high_school_government_and_politics",
1006
- "metric": [
1007
- "loglikelihood_acc"
1008
- ],
1009
- "hf_avail_splits": [
1010
- "auxiliary_train",
1011
- "test",
1012
- "validation",
1013
- "dev"
1014
- ],
1015
- "evaluation_splits": [
1016
- "test"
1017
- ],
1018
- "few_shots_split": "dev",
1019
- "few_shots_select": "sequential",
1020
- "generation_size": 1,
1021
- "stop_sequence": [
1022
- "\n"
1023
- ],
1024
- "output_regex": null,
1025
- "frozen": false,
1026
- "suite": [
1027
- "lighteval",
1028
- "mmlu"
1029
- ]
1030
- },
1031
- "lighteval|mmlu:high_school_macroeconomics": {
1032
- "name": "mmlu:high_school_macroeconomics",
1033
- "prompt_function": "mmlu_harness",
1034
- "hf_repo": "lighteval/mmlu",
1035
- "hf_subset": "high_school_macroeconomics",
1036
- "metric": [
1037
- "loglikelihood_acc"
1038
- ],
1039
- "hf_avail_splits": [
1040
- "auxiliary_train",
1041
- "test",
1042
- "validation",
1043
- "dev"
1044
- ],
1045
- "evaluation_splits": [
1046
- "test"
1047
- ],
1048
- "few_shots_split": "dev",
1049
- "few_shots_select": "sequential",
1050
- "generation_size": 1,
1051
- "stop_sequence": [
1052
- "\n"
1053
- ],
1054
- "output_regex": null,
1055
- "frozen": false,
1056
- "suite": [
1057
- "lighteval",
1058
- "mmlu"
1059
- ]
1060
- },
1061
- "lighteval|mmlu:high_school_mathematics": {
1062
- "name": "mmlu:high_school_mathematics",
1063
- "prompt_function": "mmlu_harness",
1064
- "hf_repo": "lighteval/mmlu",
1065
- "hf_subset": "high_school_mathematics",
1066
- "metric": [
1067
- "loglikelihood_acc"
1068
- ],
1069
- "hf_avail_splits": [
1070
- "auxiliary_train",
1071
- "test",
1072
- "validation",
1073
- "dev"
1074
- ],
1075
- "evaluation_splits": [
1076
- "test"
1077
- ],
1078
- "few_shots_split": "dev",
1079
- "few_shots_select": "sequential",
1080
- "generation_size": 1,
1081
- "stop_sequence": [
1082
- "\n"
1083
- ],
1084
- "output_regex": null,
1085
- "frozen": false,
1086
- "suite": [
1087
- "lighteval",
1088
- "mmlu"
1089
- ]
1090
- },
1091
- "lighteval|mmlu:high_school_microeconomics": {
1092
- "name": "mmlu:high_school_microeconomics",
1093
- "prompt_function": "mmlu_harness",
1094
- "hf_repo": "lighteval/mmlu",
1095
- "hf_subset": "high_school_microeconomics",
1096
- "metric": [
1097
- "loglikelihood_acc"
1098
- ],
1099
- "hf_avail_splits": [
1100
- "auxiliary_train",
1101
- "test",
1102
- "validation",
1103
- "dev"
1104
- ],
1105
- "evaluation_splits": [
1106
- "test"
1107
- ],
1108
- "few_shots_split": "dev",
1109
- "few_shots_select": "sequential",
1110
- "generation_size": 1,
1111
- "stop_sequence": [
1112
- "\n"
1113
- ],
1114
- "output_regex": null,
1115
- "frozen": false,
1116
- "suite": [
1117
- "lighteval",
1118
- "mmlu"
1119
- ]
1120
- },
1121
- "lighteval|mmlu:high_school_physics": {
1122
- "name": "mmlu:high_school_physics",
1123
- "prompt_function": "mmlu_harness",
1124
- "hf_repo": "lighteval/mmlu",
1125
- "hf_subset": "high_school_physics",
1126
- "metric": [
1127
- "loglikelihood_acc"
1128
- ],
1129
- "hf_avail_splits": [
1130
- "auxiliary_train",
1131
- "test",
1132
- "validation",
1133
- "dev"
1134
- ],
1135
- "evaluation_splits": [
1136
- "test"
1137
- ],
1138
- "few_shots_split": "dev",
1139
- "few_shots_select": "sequential",
1140
- "generation_size": 1,
1141
- "stop_sequence": [
1142
- "\n"
1143
- ],
1144
- "output_regex": null,
1145
- "frozen": false,
1146
- "suite": [
1147
- "lighteval",
1148
- "mmlu"
1149
- ]
1150
- },
1151
- "lighteval|mmlu:high_school_psychology": {
1152
- "name": "mmlu:high_school_psychology",
1153
- "prompt_function": "mmlu_harness",
1154
- "hf_repo": "lighteval/mmlu",
1155
- "hf_subset": "high_school_psychology",
1156
- "metric": [
1157
- "loglikelihood_acc"
1158
- ],
1159
- "hf_avail_splits": [
1160
- "auxiliary_train",
1161
- "test",
1162
- "validation",
1163
- "dev"
1164
- ],
1165
- "evaluation_splits": [
1166
- "test"
1167
- ],
1168
- "few_shots_split": "dev",
1169
- "few_shots_select": "sequential",
1170
- "generation_size": 1,
1171
- "stop_sequence": [
1172
- "\n"
1173
- ],
1174
- "output_regex": null,
1175
- "frozen": false,
1176
- "suite": [
1177
- "lighteval",
1178
- "mmlu"
1179
- ]
1180
- },
1181
- "lighteval|mmlu:high_school_statistics": {
1182
- "name": "mmlu:high_school_statistics",
1183
- "prompt_function": "mmlu_harness",
1184
- "hf_repo": "lighteval/mmlu",
1185
- "hf_subset": "high_school_statistics",
1186
- "metric": [
1187
- "loglikelihood_acc"
1188
- ],
1189
- "hf_avail_splits": [
1190
- "auxiliary_train",
1191
- "test",
1192
- "validation",
1193
- "dev"
1194
- ],
1195
- "evaluation_splits": [
1196
- "test"
1197
- ],
1198
- "few_shots_split": "dev",
1199
- "few_shots_select": "sequential",
1200
- "generation_size": 1,
1201
- "stop_sequence": [
1202
- "\n"
1203
- ],
1204
- "output_regex": null,
1205
- "frozen": false,
1206
- "suite": [
1207
- "lighteval",
1208
- "mmlu"
1209
- ]
1210
- },
1211
- "lighteval|mmlu:high_school_us_history": {
1212
- "name": "mmlu:high_school_us_history",
1213
- "prompt_function": "mmlu_harness",
1214
- "hf_repo": "lighteval/mmlu",
1215
- "hf_subset": "high_school_us_history",
1216
- "metric": [
1217
- "loglikelihood_acc"
1218
- ],
1219
- "hf_avail_splits": [
1220
- "auxiliary_train",
1221
- "test",
1222
- "validation",
1223
- "dev"
1224
- ],
1225
- "evaluation_splits": [
1226
- "test"
1227
- ],
1228
- "few_shots_split": "dev",
1229
- "few_shots_select": "sequential",
1230
- "generation_size": 1,
1231
- "stop_sequence": [
1232
- "\n"
1233
- ],
1234
- "output_regex": null,
1235
- "frozen": false,
1236
- "suite": [
1237
- "lighteval",
1238
- "mmlu"
1239
- ]
1240
- },
1241
- "lighteval|mmlu:high_school_world_history": {
1242
- "name": "mmlu:high_school_world_history",
1243
- "prompt_function": "mmlu_harness",
1244
- "hf_repo": "lighteval/mmlu",
1245
- "hf_subset": "high_school_world_history",
1246
- "metric": [
1247
- "loglikelihood_acc"
1248
- ],
1249
- "hf_avail_splits": [
1250
- "auxiliary_train",
1251
- "test",
1252
- "validation",
1253
- "dev"
1254
- ],
1255
- "evaluation_splits": [
1256
- "test"
1257
- ],
1258
- "few_shots_split": "dev",
1259
- "few_shots_select": "sequential",
1260
- "generation_size": 1,
1261
- "stop_sequence": [
1262
- "\n"
1263
- ],
1264
- "output_regex": null,
1265
- "frozen": false,
1266
- "suite": [
1267
- "lighteval",
1268
- "mmlu"
1269
- ]
1270
- },
1271
- "lighteval|mmlu:human_aging": {
1272
- "name": "mmlu:human_aging",
1273
- "prompt_function": "mmlu_harness",
1274
- "hf_repo": "lighteval/mmlu",
1275
- "hf_subset": "human_aging",
1276
- "metric": [
1277
- "loglikelihood_acc"
1278
- ],
1279
- "hf_avail_splits": [
1280
- "auxiliary_train",
1281
- "test",
1282
- "validation",
1283
- "dev"
1284
- ],
1285
- "evaluation_splits": [
1286
- "test"
1287
- ],
1288
- "few_shots_split": "dev",
1289
- "few_shots_select": "sequential",
1290
- "generation_size": 1,
1291
- "stop_sequence": [
1292
- "\n"
1293
- ],
1294
- "output_regex": null,
1295
- "frozen": false,
1296
- "suite": [
1297
- "lighteval",
1298
- "mmlu"
1299
- ]
1300
- },
1301
- "lighteval|mmlu:human_sexuality": {
1302
- "name": "mmlu:human_sexuality",
1303
- "prompt_function": "mmlu_harness",
1304
- "hf_repo": "lighteval/mmlu",
1305
- "hf_subset": "human_sexuality",
1306
- "metric": [
1307
- "loglikelihood_acc"
1308
- ],
1309
- "hf_avail_splits": [
1310
- "auxiliary_train",
1311
- "test",
1312
- "validation",
1313
- "dev"
1314
- ],
1315
- "evaluation_splits": [
1316
- "test"
1317
- ],
1318
- "few_shots_split": "dev",
1319
- "few_shots_select": "sequential",
1320
- "generation_size": 1,
1321
- "stop_sequence": [
1322
- "\n"
1323
- ],
1324
- "output_regex": null,
1325
- "frozen": false,
1326
- "suite": [
1327
- "lighteval",
1328
- "mmlu"
1329
- ]
1330
- },
1331
- "lighteval|mmlu:international_law": {
1332
- "name": "mmlu:international_law",
1333
- "prompt_function": "mmlu_harness",
1334
- "hf_repo": "lighteval/mmlu",
1335
- "hf_subset": "international_law",
1336
- "metric": [
1337
- "loglikelihood_acc"
1338
- ],
1339
- "hf_avail_splits": [
1340
- "auxiliary_train",
1341
- "test",
1342
- "validation",
1343
- "dev"
1344
- ],
1345
- "evaluation_splits": [
1346
- "test"
1347
- ],
1348
- "few_shots_split": "dev",
1349
- "few_shots_select": "sequential",
1350
- "generation_size": 1,
1351
- "stop_sequence": [
1352
- "\n"
1353
- ],
1354
- "output_regex": null,
1355
- "frozen": false,
1356
- "suite": [
1357
- "lighteval",
1358
- "mmlu"
1359
- ]
1360
- },
1361
- "lighteval|mmlu:jurisprudence": {
1362
- "name": "mmlu:jurisprudence",
1363
- "prompt_function": "mmlu_harness",
1364
- "hf_repo": "lighteval/mmlu",
1365
- "hf_subset": "jurisprudence",
1366
- "metric": [
1367
- "loglikelihood_acc"
1368
- ],
1369
- "hf_avail_splits": [
1370
- "auxiliary_train",
1371
- "test",
1372
- "validation",
1373
- "dev"
1374
- ],
1375
- "evaluation_splits": [
1376
- "test"
1377
- ],
1378
- "few_shots_split": "dev",
1379
- "few_shots_select": "sequential",
1380
- "generation_size": 1,
1381
- "stop_sequence": [
1382
- "\n"
1383
- ],
1384
- "output_regex": null,
1385
- "frozen": false,
1386
- "suite": [
1387
- "lighteval",
1388
- "mmlu"
1389
- ]
1390
- },
1391
- "lighteval|mmlu:logical_fallacies": {
1392
- "name": "mmlu:logical_fallacies",
1393
- "prompt_function": "mmlu_harness",
1394
- "hf_repo": "lighteval/mmlu",
1395
- "hf_subset": "logical_fallacies",
1396
- "metric": [
1397
- "loglikelihood_acc"
1398
- ],
1399
- "hf_avail_splits": [
1400
- "auxiliary_train",
1401
- "test",
1402
- "validation",
1403
- "dev"
1404
- ],
1405
- "evaluation_splits": [
1406
- "test"
1407
- ],
1408
- "few_shots_split": "dev",
1409
- "few_shots_select": "sequential",
1410
- "generation_size": 1,
1411
- "stop_sequence": [
1412
- "\n"
1413
- ],
1414
- "output_regex": null,
1415
- "frozen": false,
1416
- "suite": [
1417
- "lighteval",
1418
- "mmlu"
1419
- ]
1420
- },
1421
- "lighteval|mmlu:machine_learning": {
1422
- "name": "mmlu:machine_learning",
1423
- "prompt_function": "mmlu_harness",
1424
- "hf_repo": "lighteval/mmlu",
1425
- "hf_subset": "machine_learning",
1426
- "metric": [
1427
- "loglikelihood_acc"
1428
- ],
1429
- "hf_avail_splits": [
1430
- "auxiliary_train",
1431
- "test",
1432
- "validation",
1433
- "dev"
1434
- ],
1435
- "evaluation_splits": [
1436
- "test"
1437
- ],
1438
- "few_shots_split": "dev",
1439
- "few_shots_select": "sequential",
1440
- "generation_size": 1,
1441
- "stop_sequence": [
1442
- "\n"
1443
- ],
1444
- "output_regex": null,
1445
- "frozen": false,
1446
- "suite": [
1447
- "lighteval",
1448
- "mmlu"
1449
- ]
1450
- },
1451
- "lighteval|mmlu:management": {
1452
- "name": "mmlu:management",
1453
- "prompt_function": "mmlu_harness",
1454
- "hf_repo": "lighteval/mmlu",
1455
- "hf_subset": "management",
1456
- "metric": [
1457
- "loglikelihood_acc"
1458
- ],
1459
- "hf_avail_splits": [
1460
- "auxiliary_train",
1461
- "test",
1462
- "validation",
1463
- "dev"
1464
- ],
1465
- "evaluation_splits": [
1466
- "test"
1467
- ],
1468
- "few_shots_split": "dev",
1469
- "few_shots_select": "sequential",
1470
- "generation_size": 1,
1471
- "stop_sequence": [
1472
- "\n"
1473
- ],
1474
- "output_regex": null,
1475
- "frozen": false,
1476
- "suite": [
1477
- "lighteval",
1478
- "mmlu"
1479
- ]
1480
- },
1481
- "lighteval|mmlu:marketing": {
1482
- "name": "mmlu:marketing",
1483
- "prompt_function": "mmlu_harness",
1484
- "hf_repo": "lighteval/mmlu",
1485
- "hf_subset": "marketing",
1486
- "metric": [
1487
- "loglikelihood_acc"
1488
- ],
1489
- "hf_avail_splits": [
1490
- "auxiliary_train",
1491
- "test",
1492
- "validation",
1493
- "dev"
1494
- ],
1495
- "evaluation_splits": [
1496
- "test"
1497
- ],
1498
- "few_shots_split": "dev",
1499
- "few_shots_select": "sequential",
1500
- "generation_size": 1,
1501
- "stop_sequence": [
1502
- "\n"
1503
- ],
1504
- "output_regex": null,
1505
- "frozen": false,
1506
- "suite": [
1507
- "lighteval",
1508
- "mmlu"
1509
- ]
1510
- },
1511
- "lighteval|mmlu:medical_genetics": {
1512
- "name": "mmlu:medical_genetics",
1513
- "prompt_function": "mmlu_harness",
1514
- "hf_repo": "lighteval/mmlu",
1515
- "hf_subset": "medical_genetics",
1516
- "metric": [
1517
- "loglikelihood_acc"
1518
- ],
1519
- "hf_avail_splits": [
1520
- "auxiliary_train",
1521
- "test",
1522
- "validation",
1523
- "dev"
1524
- ],
1525
- "evaluation_splits": [
1526
- "test"
1527
- ],
1528
- "few_shots_split": "dev",
1529
- "few_shots_select": "sequential",
1530
- "generation_size": 1,
1531
- "stop_sequence": [
1532
- "\n"
1533
- ],
1534
- "output_regex": null,
1535
- "frozen": false,
1536
- "suite": [
1537
- "lighteval",
1538
- "mmlu"
1539
- ]
1540
- },
1541
- "lighteval|mmlu:miscellaneous": {
1542
- "name": "mmlu:miscellaneous",
1543
- "prompt_function": "mmlu_harness",
1544
- "hf_repo": "lighteval/mmlu",
1545
- "hf_subset": "miscellaneous",
1546
- "metric": [
1547
- "loglikelihood_acc"
1548
- ],
1549
- "hf_avail_splits": [
1550
- "auxiliary_train",
1551
- "test",
1552
- "validation",
1553
- "dev"
1554
- ],
1555
- "evaluation_splits": [
1556
- "test"
1557
- ],
1558
- "few_shots_split": "dev",
1559
- "few_shots_select": "sequential",
1560
- "generation_size": 1,
1561
- "stop_sequence": [
1562
- "\n"
1563
- ],
1564
- "output_regex": null,
1565
- "frozen": false,
1566
- "suite": [
1567
- "lighteval",
1568
- "mmlu"
1569
- ]
1570
- },
1571
- "lighteval|mmlu:moral_disputes": {
1572
- "name": "mmlu:moral_disputes",
1573
- "prompt_function": "mmlu_harness",
1574
- "hf_repo": "lighteval/mmlu",
1575
- "hf_subset": "moral_disputes",
1576
- "metric": [
1577
- "loglikelihood_acc"
1578
- ],
1579
- "hf_avail_splits": [
1580
- "auxiliary_train",
1581
- "test",
1582
- "validation",
1583
- "dev"
1584
- ],
1585
- "evaluation_splits": [
1586
- "test"
1587
- ],
1588
- "few_shots_split": "dev",
1589
- "few_shots_select": "sequential",
1590
- "generation_size": 1,
1591
- "stop_sequence": [
1592
- "\n"
1593
- ],
1594
- "output_regex": null,
1595
- "frozen": false,
1596
- "suite": [
1597
- "lighteval",
1598
- "mmlu"
1599
- ]
1600
- },
1601
- "lighteval|mmlu:moral_scenarios": {
1602
- "name": "mmlu:moral_scenarios",
1603
- "prompt_function": "mmlu_harness",
1604
- "hf_repo": "lighteval/mmlu",
1605
- "hf_subset": "moral_scenarios",
1606
- "metric": [
1607
- "loglikelihood_acc"
1608
- ],
1609
- "hf_avail_splits": [
1610
- "auxiliary_train",
1611
- "test",
1612
- "validation",
1613
- "dev"
1614
- ],
1615
- "evaluation_splits": [
1616
- "test"
1617
- ],
1618
- "few_shots_split": "dev",
1619
- "few_shots_select": "sequential",
1620
- "generation_size": 1,
1621
- "stop_sequence": [
1622
- "\n"
1623
- ],
1624
- "output_regex": null,
1625
- "frozen": false,
1626
- "suite": [
1627
- "lighteval",
1628
- "mmlu"
1629
- ]
1630
- },
1631
- "lighteval|mmlu:nutrition": {
1632
- "name": "mmlu:nutrition",
1633
- "prompt_function": "mmlu_harness",
1634
- "hf_repo": "lighteval/mmlu",
1635
- "hf_subset": "nutrition",
1636
- "metric": [
1637
- "loglikelihood_acc"
1638
- ],
1639
- "hf_avail_splits": [
1640
- "auxiliary_train",
1641
- "test",
1642
- "validation",
1643
- "dev"
1644
- ],
1645
- "evaluation_splits": [
1646
- "test"
1647
- ],
1648
- "few_shots_split": "dev",
1649
- "few_shots_select": "sequential",
1650
- "generation_size": 1,
1651
- "stop_sequence": [
1652
- "\n"
1653
- ],
1654
- "output_regex": null,
1655
- "frozen": false,
1656
- "suite": [
1657
- "lighteval",
1658
- "mmlu"
1659
- ]
1660
- },
1661
- "lighteval|mmlu:philosophy": {
1662
- "name": "mmlu:philosophy",
1663
- "prompt_function": "mmlu_harness",
1664
- "hf_repo": "lighteval/mmlu",
1665
- "hf_subset": "philosophy",
1666
- "metric": [
1667
- "loglikelihood_acc"
1668
- ],
1669
- "hf_avail_splits": [
1670
- "auxiliary_train",
1671
- "test",
1672
- "validation",
1673
- "dev"
1674
- ],
1675
- "evaluation_splits": [
1676
- "test"
1677
- ],
1678
- "few_shots_split": "dev",
1679
- "few_shots_select": "sequential",
1680
- "generation_size": 1,
1681
- "stop_sequence": [
1682
- "\n"
1683
- ],
1684
- "output_regex": null,
1685
- "frozen": false,
1686
- "suite": [
1687
- "lighteval",
1688
- "mmlu"
1689
- ]
1690
- },
1691
- "lighteval|mmlu:prehistory": {
1692
- "name": "mmlu:prehistory",
1693
- "prompt_function": "mmlu_harness",
1694
- "hf_repo": "lighteval/mmlu",
1695
- "hf_subset": "prehistory",
1696
- "metric": [
1697
- "loglikelihood_acc"
1698
- ],
1699
- "hf_avail_splits": [
1700
- "auxiliary_train",
1701
- "test",
1702
- "validation",
1703
- "dev"
1704
- ],
1705
- "evaluation_splits": [
1706
- "test"
1707
- ],
1708
- "few_shots_split": "dev",
1709
- "few_shots_select": "sequential",
1710
- "generation_size": 1,
1711
- "stop_sequence": [
1712
- "\n"
1713
- ],
1714
- "output_regex": null,
1715
- "frozen": false,
1716
- "suite": [
1717
- "lighteval",
1718
- "mmlu"
1719
- ]
1720
- },
1721
- "lighteval|mmlu:professional_accounting": {
1722
- "name": "mmlu:professional_accounting",
1723
- "prompt_function": "mmlu_harness",
1724
- "hf_repo": "lighteval/mmlu",
1725
- "hf_subset": "professional_accounting",
1726
- "metric": [
1727
- "loglikelihood_acc"
1728
- ],
1729
- "hf_avail_splits": [
1730
- "auxiliary_train",
1731
- "test",
1732
- "validation",
1733
- "dev"
1734
- ],
1735
- "evaluation_splits": [
1736
- "test"
1737
- ],
1738
- "few_shots_split": "dev",
1739
- "few_shots_select": "sequential",
1740
- "generation_size": 1,
1741
- "stop_sequence": [
1742
- "\n"
1743
- ],
1744
- "output_regex": null,
1745
- "frozen": false,
1746
- "suite": [
1747
- "lighteval",
1748
- "mmlu"
1749
- ]
1750
- },
1751
- "lighteval|mmlu:professional_law": {
1752
- "name": "mmlu:professional_law",
1753
- "prompt_function": "mmlu_harness",
1754
- "hf_repo": "lighteval/mmlu",
1755
- "hf_subset": "professional_law",
1756
- "metric": [
1757
- "loglikelihood_acc"
1758
- ],
1759
- "hf_avail_splits": [
1760
- "auxiliary_train",
1761
- "test",
1762
- "validation",
1763
- "dev"
1764
- ],
1765
- "evaluation_splits": [
1766
- "test"
1767
- ],
1768
- "few_shots_split": "dev",
1769
- "few_shots_select": "sequential",
1770
- "generation_size": 1,
1771
- "stop_sequence": [
1772
- "\n"
1773
- ],
1774
- "output_regex": null,
1775
- "frozen": false,
1776
- "suite": [
1777
- "lighteval",
1778
- "mmlu"
1779
- ]
1780
- },
1781
- "lighteval|mmlu:professional_medicine": {
1782
- "name": "mmlu:professional_medicine",
1783
- "prompt_function": "mmlu_harness",
1784
- "hf_repo": "lighteval/mmlu",
1785
- "hf_subset": "professional_medicine",
1786
- "metric": [
1787
- "loglikelihood_acc"
1788
- ],
1789
- "hf_avail_splits": [
1790
- "auxiliary_train",
1791
- "test",
1792
- "validation",
1793
- "dev"
1794
- ],
1795
- "evaluation_splits": [
1796
- "test"
1797
- ],
1798
- "few_shots_split": "dev",
1799
- "few_shots_select": "sequential",
1800
- "generation_size": 1,
1801
- "stop_sequence": [
1802
- "\n"
1803
- ],
1804
- "output_regex": null,
1805
- "frozen": false,
1806
- "suite": [
1807
- "lighteval",
1808
- "mmlu"
1809
- ]
1810
- },
1811
- "lighteval|mmlu:professional_psychology": {
1812
- "name": "mmlu:professional_psychology",
1813
- "prompt_function": "mmlu_harness",
1814
- "hf_repo": "lighteval/mmlu",
1815
- "hf_subset": "professional_psychology",
1816
- "metric": [
1817
- "loglikelihood_acc"
1818
- ],
1819
- "hf_avail_splits": [
1820
- "auxiliary_train",
1821
- "test",
1822
- "validation",
1823
- "dev"
1824
- ],
1825
- "evaluation_splits": [
1826
- "test"
1827
- ],
1828
- "few_shots_split": "dev",
1829
- "few_shots_select": "sequential",
1830
- "generation_size": 1,
1831
- "stop_sequence": [
1832
- "\n"
1833
- ],
1834
- "output_regex": null,
1835
- "frozen": false,
1836
- "suite": [
1837
- "lighteval",
1838
- "mmlu"
1839
- ]
1840
- },
1841
- "lighteval|mmlu:public_relations": {
1842
- "name": "mmlu:public_relations",
1843
- "prompt_function": "mmlu_harness",
1844
- "hf_repo": "lighteval/mmlu",
1845
- "hf_subset": "public_relations",
1846
- "metric": [
1847
- "loglikelihood_acc"
1848
- ],
1849
- "hf_avail_splits": [
1850
- "auxiliary_train",
1851
- "test",
1852
- "validation",
1853
- "dev"
1854
- ],
1855
- "evaluation_splits": [
1856
- "test"
1857
- ],
1858
- "few_shots_split": "dev",
1859
- "few_shots_select": "sequential",
1860
- "generation_size": 1,
1861
- "stop_sequence": [
1862
- "\n"
1863
- ],
1864
- "output_regex": null,
1865
- "frozen": false,
1866
- "suite": [
1867
- "lighteval",
1868
- "mmlu"
1869
- ]
1870
- },
1871
- "lighteval|mmlu:security_studies": {
1872
- "name": "mmlu:security_studies",
1873
- "prompt_function": "mmlu_harness",
1874
- "hf_repo": "lighteval/mmlu",
1875
- "hf_subset": "security_studies",
1876
- "metric": [
1877
- "loglikelihood_acc"
1878
- ],
1879
- "hf_avail_splits": [
1880
- "auxiliary_train",
1881
- "test",
1882
- "validation",
1883
- "dev"
1884
- ],
1885
- "evaluation_splits": [
1886
- "test"
1887
- ],
1888
- "few_shots_split": "dev",
1889
- "few_shots_select": "sequential",
1890
- "generation_size": 1,
1891
- "stop_sequence": [
1892
- "\n"
1893
- ],
1894
- "output_regex": null,
1895
- "frozen": false,
1896
- "suite": [
1897
- "lighteval",
1898
- "mmlu"
1899
- ]
1900
- },
1901
- "lighteval|mmlu:sociology": {
1902
- "name": "mmlu:sociology",
1903
- "prompt_function": "mmlu_harness",
1904
- "hf_repo": "lighteval/mmlu",
1905
- "hf_subset": "sociology",
1906
- "metric": [
1907
- "loglikelihood_acc"
1908
- ],
1909
- "hf_avail_splits": [
1910
- "auxiliary_train",
1911
- "test",
1912
- "validation",
1913
- "dev"
1914
- ],
1915
- "evaluation_splits": [
1916
- "test"
1917
- ],
1918
- "few_shots_split": "dev",
1919
- "few_shots_select": "sequential",
1920
- "generation_size": 1,
1921
- "stop_sequence": [
1922
- "\n"
1923
- ],
1924
- "output_regex": null,
1925
- "frozen": false,
1926
- "suite": [
1927
- "lighteval",
1928
- "mmlu"
1929
- ]
1930
- },
1931
- "lighteval|mmlu:us_foreign_policy": {
1932
- "name": "mmlu:us_foreign_policy",
1933
- "prompt_function": "mmlu_harness",
1934
- "hf_repo": "lighteval/mmlu",
1935
- "hf_subset": "us_foreign_policy",
1936
- "metric": [
1937
- "loglikelihood_acc"
1938
- ],
1939
- "hf_avail_splits": [
1940
- "auxiliary_train",
1941
- "test",
1942
- "validation",
1943
- "dev"
1944
- ],
1945
- "evaluation_splits": [
1946
- "test"
1947
- ],
1948
- "few_shots_split": "dev",
1949
- "few_shots_select": "sequential",
1950
- "generation_size": 1,
1951
- "stop_sequence": [
1952
- "\n"
1953
- ],
1954
- "output_regex": null,
1955
- "frozen": false,
1956
- "suite": [
1957
- "lighteval",
1958
- "mmlu"
1959
- ]
1960
- },
1961
- "lighteval|mmlu:virology": {
1962
- "name": "mmlu:virology",
1963
- "prompt_function": "mmlu_harness",
1964
- "hf_repo": "lighteval/mmlu",
1965
- "hf_subset": "virology",
1966
- "metric": [
1967
- "loglikelihood_acc"
1968
- ],
1969
- "hf_avail_splits": [
1970
- "auxiliary_train",
1971
- "test",
1972
- "validation",
1973
- "dev"
1974
- ],
1975
- "evaluation_splits": [
1976
- "test"
1977
- ],
1978
- "few_shots_split": "dev",
1979
- "few_shots_select": "sequential",
1980
- "generation_size": 1,
1981
- "stop_sequence": [
1982
- "\n"
1983
- ],
1984
- "output_regex": null,
1985
- "frozen": false,
1986
- "suite": [
1987
- "lighteval",
1988
- "mmlu"
1989
- ]
1990
- },
1991
- "lighteval|mmlu:world_religions": {
1992
- "name": "mmlu:world_religions",
1993
- "prompt_function": "mmlu_harness",
1994
- "hf_repo": "lighteval/mmlu",
1995
- "hf_subset": "world_religions",
1996
- "metric": [
1997
- "loglikelihood_acc"
1998
- ],
1999
- "hf_avail_splits": [
2000
- "auxiliary_train",
2001
- "test",
2002
- "validation",
2003
- "dev"
2004
- ],
2005
- "evaluation_splits": [
2006
- "test"
2007
- ],
2008
- "few_shots_split": "dev",
2009
- "few_shots_select": "sequential",
2010
- "generation_size": 1,
2011
- "stop_sequence": [
2012
- "\n"
2013
- ],
2014
- "output_regex": null,
2015
- "frozen": false,
2016
- "suite": [
2017
- "lighteval",
2018
- "mmlu"
2019
- ]
2020
- }
2021
- },
2022
- "summary_tasks": {
2023
- "lighteval|mmlu:abstract_algebra|5": {
2024
- "hashes": {
2025
- "hash_examples": "4c76229e00c9c0e9",
2026
- "hash_full_prompts": "273278cb9fb5ac01",
2027
- "hash_input_tokens": "caf9777ccf71eab5",
2028
- "hash_cont_tokens": "00520b0ec06da34f"
2029
- },
2030
- "truncated": 0,
2031
- "non_truncated": 100,
2032
- "padded": 400,
2033
- "non_padded": 0,
2034
- "effective_few_shots": 5.0,
2035
- "num_truncated_few_shots": 0
2036
- },
2037
- "lighteval|mmlu:anatomy|5": {
2038
- "hashes": {
2039
- "hash_examples": "6a1f8104dccbd33b",
2040
- "hash_full_prompts": "e77b5ebe030aabba",
2041
- "hash_input_tokens": "d192cd7584fda4dc",
2042
- "hash_cont_tokens": "263324e6ce7f9b36"
2043
- },
2044
- "truncated": 0,
2045
- "non_truncated": 135,
2046
- "padded": 540,
2047
- "non_padded": 0,
2048
- "effective_few_shots": 5.0,
2049
- "num_truncated_few_shots": 0
2050
- },
2051
- "lighteval|mmlu:astronomy|5": {
2052
- "hashes": {
2053
- "hash_examples": "1302effa3a76ce4c",
2054
- "hash_full_prompts": "0ff37ef4519e63f9",
2055
- "hash_input_tokens": "d241783f0bfdf860",
2056
- "hash_cont_tokens": "18ba399c6801138e"
2057
- },
2058
- "truncated": 0,
2059
- "non_truncated": 152,
2060
- "padded": 608,
2061
- "non_padded": 0,
2062
- "effective_few_shots": 5.0,
2063
- "num_truncated_few_shots": 0
2064
- },
2065
- "lighteval|mmlu:business_ethics|5": {
2066
- "hashes": {
2067
- "hash_examples": "03cb8bce5336419a",
2068
- "hash_full_prompts": "7c4d312a23bdd669",
2069
- "hash_input_tokens": "0aee5ed969278926",
2070
- "hash_cont_tokens": "00520b0ec06da34f"
2071
- },
2072
- "truncated": 0,
2073
- "non_truncated": 100,
2074
- "padded": 400,
2075
- "non_padded": 0,
2076
- "effective_few_shots": 5.0,
2077
- "num_truncated_few_shots": 0
2078
- },
2079
- "lighteval|mmlu:clinical_knowledge|5": {
2080
- "hashes": {
2081
- "hash_examples": "ffbb9c7b2be257f9",
2082
- "hash_full_prompts": "472d93369b1a8382",
2083
- "hash_input_tokens": "aa05960be77863d3",
2084
- "hash_cont_tokens": "9d7500060e0dd995"
2085
- },
2086
- "truncated": 0,
2087
- "non_truncated": 265,
2088
- "padded": 1060,
2089
- "non_padded": 0,
2090
- "effective_few_shots": 5.0,
2091
- "num_truncated_few_shots": 0
2092
- },
2093
- "lighteval|mmlu:college_biology|5": {
2094
- "hashes": {
2095
- "hash_examples": "3ee77f176f38eb8e",
2096
- "hash_full_prompts": "6853bf027b349083",
2097
- "hash_input_tokens": "3843b5375a04262c",
2098
- "hash_cont_tokens": "78a731af5d2f6472"
2099
- },
2100
- "truncated": 0,
2101
- "non_truncated": 144,
2102
- "padded": 576,
2103
- "non_padded": 0,
2104
- "effective_few_shots": 5.0,
2105
- "num_truncated_few_shots": 0
2106
- },
2107
- "lighteval|mmlu:college_chemistry|5": {
2108
- "hashes": {
2109
- "hash_examples": "ce61a69c46d47aeb",
2110
- "hash_full_prompts": "e0f8624971f7af71",
2111
- "hash_input_tokens": "2096d1652e232764",
2112
- "hash_cont_tokens": "00520b0ec06da34f"
2113
- },
2114
- "truncated": 0,
2115
- "non_truncated": 100,
2116
- "padded": 400,
2117
- "non_padded": 0,
2118
- "effective_few_shots": 5.0,
2119
- "num_truncated_few_shots": 0
2120
- },
2121
- "lighteval|mmlu:college_computer_science|5": {
2122
- "hashes": {
2123
- "hash_examples": "32805b52d7d5daab",
2124
- "hash_full_prompts": "841e9d2ecfbb104d",
2125
- "hash_input_tokens": "1e007ac047722e9b",
2126
- "hash_cont_tokens": "00520b0ec06da34f"
2127
- },
2128
- "truncated": 0,
2129
- "non_truncated": 100,
2130
- "padded": 400,
2131
- "non_padded": 0,
2132
- "effective_few_shots": 5.0,
2133
- "num_truncated_few_shots": 0
2134
- },
2135
- "lighteval|mmlu:college_mathematics|5": {
2136
- "hashes": {
2137
- "hash_examples": "55da1a0a0bd33722",
2138
- "hash_full_prompts": "696c5f73522b8706",
2139
- "hash_input_tokens": "c3061d57b5a4ad7e",
2140
- "hash_cont_tokens": "00520b0ec06da34f"
2141
- },
2142
- "truncated": 0,
2143
- "non_truncated": 100,
2144
- "padded": 400,
2145
- "non_padded": 0,
2146
- "effective_few_shots": 5.0,
2147
- "num_truncated_few_shots": 0
2148
- },
2149
- "lighteval|mmlu:college_medicine|5": {
2150
- "hashes": {
2151
- "hash_examples": "c33e143163049176",
2152
- "hash_full_prompts": "7d2530816f672426",
2153
- "hash_input_tokens": "4cddd091001776d7",
2154
- "hash_cont_tokens": "699c8eb24e3e446b"
2155
- },
2156
- "truncated": 0,
2157
- "non_truncated": 173,
2158
- "padded": 692,
2159
- "non_padded": 0,
2160
- "effective_few_shots": 5.0,
2161
- "num_truncated_few_shots": 0
2162
- },
2163
- "lighteval|mmlu:college_physics|5": {
2164
- "hashes": {
2165
- "hash_examples": "ebdab1cdb7e555df",
2166
- "hash_full_prompts": "66b3a61507c4c92b",
2167
- "hash_input_tokens": "821b169941167548",
2168
- "hash_cont_tokens": "075997110cbe055e"
2169
- },
2170
- "truncated": 0,
2171
- "non_truncated": 102,
2172
- "padded": 408,
2173
- "non_padded": 0,
2174
- "effective_few_shots": 5.0,
2175
- "num_truncated_few_shots": 0
2176
- },
2177
- "lighteval|mmlu:computer_security|5": {
2178
- "hashes": {
2179
- "hash_examples": "a24fd7d08a560921",
2180
- "hash_full_prompts": "f1143da88158bf03",
2181
- "hash_input_tokens": "02e64465d74344b4",
2182
- "hash_cont_tokens": "00520b0ec06da34f"
2183
- },
2184
- "truncated": 0,
2185
- "non_truncated": 100,
2186
- "padded": 400,
2187
- "non_padded": 0,
2188
- "effective_few_shots": 5.0,
2189
- "num_truncated_few_shots": 0
2190
- },
2191
- "lighteval|mmlu:conceptual_physics|5": {
2192
- "hashes": {
2193
- "hash_examples": "8300977a79386993",
2194
- "hash_full_prompts": "d2b4c706b65a71d9",
2195
- "hash_input_tokens": "5c7a2235529d2821",
2196
- "hash_cont_tokens": "f22daa6d4818086f"
2197
- },
2198
- "truncated": 0,
2199
- "non_truncated": 235,
2200
- "padded": 940,
2201
- "non_padded": 0,
2202
- "effective_few_shots": 5.0,
2203
- "num_truncated_few_shots": 0
2204
- },
2205
- "lighteval|mmlu:econometrics|5": {
2206
- "hashes": {
2207
- "hash_examples": "ddde36788a04a46f",
2208
- "hash_full_prompts": "aa5255d923b0e3a3",
2209
- "hash_input_tokens": "e0a79ea9e037599d",
2210
- "hash_cont_tokens": "26791a0b1941b4c4"
2211
- },
2212
- "truncated": 0,
2213
- "non_truncated": 114,
2214
- "padded": 456,
2215
- "non_padded": 0,
2216
- "effective_few_shots": 5.0,
2217
- "num_truncated_few_shots": 0
2218
- },
2219
- "lighteval|mmlu:electrical_engineering|5": {
2220
- "hashes": {
2221
- "hash_examples": "acbc5def98c19b3f",
2222
- "hash_full_prompts": "c1f9a9087987d1d7",
2223
- "hash_input_tokens": "e48ddb58b2efa8e3",
2224
- "hash_cont_tokens": "3e336577994f6c0d"
2225
- },
2226
- "truncated": 0,
2227
- "non_truncated": 145,
2228
- "padded": 580,
2229
- "non_padded": 0,
2230
- "effective_few_shots": 5.0,
2231
- "num_truncated_few_shots": 0
2232
- },
2233
- "lighteval|mmlu:elementary_mathematics|5": {
2234
- "hashes": {
2235
- "hash_examples": "146e61d07497a9bd",
2236
- "hash_full_prompts": "57fb9ddf2f814bb5",
2237
- "hash_input_tokens": "9e81373b5265da10",
2238
- "hash_cont_tokens": "1d6bbfa8a67327c8"
2239
- },
2240
- "truncated": 0,
2241
- "non_truncated": 378,
2242
- "padded": 1512,
2243
- "non_padded": 0,
2244
- "effective_few_shots": 5.0,
2245
- "num_truncated_few_shots": 0
2246
- },
2247
- "lighteval|mmlu:formal_logic|5": {
2248
- "hashes": {
2249
- "hash_examples": "8635216e1909a03f",
2250
- "hash_full_prompts": "dc7e34e04346adfd",
2251
- "hash_input_tokens": "0378ed1f1a9bb3f6",
2252
- "hash_cont_tokens": "60508d85eb7693a4"
2253
- },
2254
- "truncated": 0,
2255
- "non_truncated": 126,
2256
- "padded": 504,
2257
- "non_padded": 0,
2258
- "effective_few_shots": 5.0,
2259
- "num_truncated_few_shots": 0
2260
- },
2261
- "lighteval|mmlu:global_facts|5": {
2262
- "hashes": {
2263
- "hash_examples": "30b315aa6353ee47",
2264
- "hash_full_prompts": "7dedb5baa45f3a38",
2265
- "hash_input_tokens": "d20db9bd82fb76c1",
2266
- "hash_cont_tokens": "00520b0ec06da34f"
2267
- },
2268
- "truncated": 0,
2269
- "non_truncated": 100,
2270
- "padded": 400,
2271
- "non_padded": 0,
2272
- "effective_few_shots": 5.0,
2273
- "num_truncated_few_shots": 0
2274
- },
2275
- "lighteval|mmlu:high_school_biology|5": {
2276
- "hashes": {
2277
- "hash_examples": "c9136373af2180de",
2278
- "hash_full_prompts": "15157813fc668acf",
2279
- "hash_input_tokens": "c3c10eef8c477c93",
2280
- "hash_cont_tokens": "d236ce982144e65f"
2281
- },
2282
- "truncated": 0,
2283
- "non_truncated": 310,
2284
- "padded": 1240,
2285
- "non_padded": 0,
2286
- "effective_few_shots": 5.0,
2287
- "num_truncated_few_shots": 0
2288
- },
2289
- "lighteval|mmlu:high_school_chemistry|5": {
2290
- "hashes": {
2291
- "hash_examples": "b0661bfa1add6404",
2292
- "hash_full_prompts": "f51dfd92a2d6fdba",
2293
- "hash_input_tokens": "dc53c87961ef4ab5",
2294
- "hash_cont_tokens": "59f93238ec5aead6"
2295
- },
2296
- "truncated": 0,
2297
- "non_truncated": 203,
2298
- "padded": 812,
2299
- "non_padded": 0,
2300
- "effective_few_shots": 5.0,
2301
- "num_truncated_few_shots": 0
2302
- },
2303
- "lighteval|mmlu:high_school_computer_science|5": {
2304
- "hashes": {
2305
- "hash_examples": "80fc1d623a3d665f",
2306
- "hash_full_prompts": "fe432a03fe8cc766",
2307
- "hash_input_tokens": "61fa356c3ea98372",
2308
- "hash_cont_tokens": "00520b0ec06da34f"
2309
- },
2310
- "truncated": 0,
2311
- "non_truncated": 100,
2312
- "padded": 400,
2313
- "non_padded": 0,
2314
- "effective_few_shots": 5.0,
2315
- "num_truncated_few_shots": 0
2316
- },
2317
- "lighteval|mmlu:high_school_european_history|5": {
2318
- "hashes": {
2319
- "hash_examples": "854da6e5af0fe1a1",
2320
- "hash_full_prompts": "09a62e1560fb1171",
2321
- "hash_input_tokens": "272f8d31300ef0af",
2322
- "hash_cont_tokens": "7b7414d6a5da3d91"
2323
- },
2324
- "truncated": 0,
2325
- "non_truncated": 165,
2326
- "padded": 656,
2327
- "non_padded": 4,
2328
- "effective_few_shots": 5.0,
2329
- "num_truncated_few_shots": 0
2330
- },
2331
- "lighteval|mmlu:high_school_geography|5": {
2332
- "hashes": {
2333
- "hash_examples": "7dc963c7acd19ad8",
2334
- "hash_full_prompts": "8284151c76cee4d8",
2335
- "hash_input_tokens": "12624aed9bf6356b",
2336
- "hash_cont_tokens": "1b66289e10988f84"
2337
- },
2338
- "truncated": 0,
2339
- "non_truncated": 198,
2340
- "padded": 792,
2341
- "non_padded": 0,
2342
- "effective_few_shots": 5.0,
2343
- "num_truncated_few_shots": 0
2344
- },
2345
- "lighteval|mmlu:high_school_government_and_politics|5": {
2346
- "hashes": {
2347
- "hash_examples": "1f675dcdebc9758f",
2348
- "hash_full_prompts": "083339a69a8bfafa",
2349
- "hash_input_tokens": "32e30c43a4a5347e",
2350
- "hash_cont_tokens": "5ab3c3415b1d3a55"
2351
- },
2352
- "truncated": 0,
2353
- "non_truncated": 193,
2354
- "padded": 772,
2355
- "non_padded": 0,
2356
- "effective_few_shots": 5.0,
2357
- "num_truncated_few_shots": 0
2358
- },
2359
- "lighteval|mmlu:high_school_macroeconomics|5": {
2360
- "hashes": {
2361
- "hash_examples": "2fb32cf2d80f0b35",
2362
- "hash_full_prompts": "ececedb0c4a4ffcd",
2363
- "hash_input_tokens": "dc2cd6b398f5f86e",
2364
- "hash_cont_tokens": "2f5457058d187374"
2365
- },
2366
- "truncated": 0,
2367
- "non_truncated": 390,
2368
- "padded": 1557,
2369
- "non_padded": 3,
2370
- "effective_few_shots": 5.0,
2371
- "num_truncated_few_shots": 0
2372
- },
2373
- "lighteval|mmlu:high_school_mathematics|5": {
2374
- "hashes": {
2375
- "hash_examples": "fd6646fdb5d58a1f",
2376
- "hash_full_prompts": "d58a3ca5c8ed6780",
2377
- "hash_input_tokens": "6f9c5ce6428dd87d",
2378
- "hash_cont_tokens": "e35137cb972e1918"
2379
- },
2380
- "truncated": 0,
2381
- "non_truncated": 270,
2382
- "padded": 1080,
2383
- "non_padded": 0,
2384
- "effective_few_shots": 5.0,
2385
- "num_truncated_few_shots": 0
2386
- },
2387
- "lighteval|mmlu:high_school_microeconomics|5": {
2388
- "hashes": {
2389
- "hash_examples": "2118f21f71d87d84",
2390
- "hash_full_prompts": "bd49ce8a930e3e78",
2391
- "hash_input_tokens": "44722cbe1d85e636",
2392
- "hash_cont_tokens": "f756093278ebb83e"
2393
- },
2394
- "truncated": 0,
2395
- "non_truncated": 238,
2396
- "padded": 908,
2397
- "non_padded": 44,
2398
- "effective_few_shots": 5.0,
2399
- "num_truncated_few_shots": 0
2400
- },
2401
- "lighteval|mmlu:high_school_physics|5": {
2402
- "hashes": {
2403
- "hash_examples": "dc3ce06378548565",
2404
- "hash_full_prompts": "3904af994b32b959",
2405
- "hash_input_tokens": "2132f616c2587937",
2406
- "hash_cont_tokens": "9cf883ebf1c82176"
2407
- },
2408
- "truncated": 0,
2409
- "non_truncated": 151,
2410
- "padded": 604,
2411
- "non_padded": 0,
2412
- "effective_few_shots": 5.0,
2413
- "num_truncated_few_shots": 0
2414
- },
2415
- "lighteval|mmlu:high_school_psychology|5": {
2416
- "hashes": {
2417
- "hash_examples": "c8d1d98a40e11f2f",
2418
- "hash_full_prompts": "d3a4d5dd3f3513f8",
2419
- "hash_input_tokens": "6cc69cf1a89e4a88",
2420
- "hash_cont_tokens": "bda0f77331ebb21a"
2421
- },
2422
- "truncated": 0,
2423
- "non_truncated": 545,
2424
- "padded": 2178,
2425
- "non_padded": 2,
2426
- "effective_few_shots": 5.0,
2427
- "num_truncated_few_shots": 0
2428
- },
2429
- "lighteval|mmlu:high_school_statistics|5": {
2430
- "hashes": {
2431
- "hash_examples": "666c8759b98ee4ff",
2432
- "hash_full_prompts": "1b5599f9d4edc7de",
2433
- "hash_input_tokens": "60af7a873b579818",
2434
- "hash_cont_tokens": "4d04f014105a0bad"
2435
- },
2436
- "truncated": 0,
2437
- "non_truncated": 216,
2438
- "padded": 864,
2439
- "non_padded": 0,
2440
- "effective_few_shots": 5.0,
2441
- "num_truncated_few_shots": 0
2442
- },
2443
- "lighteval|mmlu:high_school_us_history|5": {
2444
- "hashes": {
2445
- "hash_examples": "95fef1c4b7d3f81e",
2446
- "hash_full_prompts": "001f7e7cc8185618",
2447
- "hash_input_tokens": "8c2d01a0f291db69",
2448
- "hash_cont_tokens": "f4590c58f12f2766"
2449
- },
2450
- "truncated": 0,
2451
- "non_truncated": 204,
2452
- "padded": 816,
2453
- "non_padded": 0,
2454
- "effective_few_shots": 5.0,
2455
- "num_truncated_few_shots": 0
2456
- },
2457
- "lighteval|mmlu:high_school_world_history|5": {
2458
- "hashes": {
2459
- "hash_examples": "7e5085b6184b0322",
2460
- "hash_full_prompts": "6a5c2a43cf7c6cb1",
2461
- "hash_input_tokens": "612ed95e43bc21b5",
2462
- "hash_cont_tokens": "db6bcddd891df5d9"
2463
- },
2464
- "truncated": 0,
2465
- "non_truncated": 237,
2466
- "padded": 948,
2467
- "non_padded": 0,
2468
- "effective_few_shots": 5.0,
2469
- "num_truncated_few_shots": 0
2470
- },
2471
- "lighteval|mmlu:human_aging|5": {
2472
- "hashes": {
2473
- "hash_examples": "c17333e7c7c10797",
2474
- "hash_full_prompts": "a3ad8e679fe07bef",
2475
- "hash_input_tokens": "4c948b081b40ba31",
2476
- "hash_cont_tokens": "25cec8d640319105"
2477
- },
2478
- "truncated": 0,
2479
- "non_truncated": 223,
2480
- "padded": 892,
2481
- "non_padded": 0,
2482
- "effective_few_shots": 5.0,
2483
- "num_truncated_few_shots": 0
2484
- },
2485
- "lighteval|mmlu:human_sexuality|5": {
2486
- "hashes": {
2487
- "hash_examples": "4edd1e9045df5e3d",
2488
- "hash_full_prompts": "3389ffb95929a661",
2489
- "hash_input_tokens": "9e649cc80ef9f2fe",
2490
- "hash_cont_tokens": "6778302b4a10b645"
2491
- },
2492
- "truncated": 0,
2493
- "non_truncated": 131,
2494
- "padded": 524,
2495
- "non_padded": 0,
2496
- "effective_few_shots": 5.0,
2497
- "num_truncated_few_shots": 0
2498
- },
2499
- "lighteval|mmlu:international_law|5": {
2500
- "hashes": {
2501
- "hash_examples": "db2fa00d771a062a",
2502
- "hash_full_prompts": "104f48c64f6f9622",
2503
- "hash_input_tokens": "c51db1d4a2a87eed",
2504
- "hash_cont_tokens": "9eb54e1a46032749"
2505
- },
2506
- "truncated": 0,
2507
- "non_truncated": 121,
2508
- "padded": 484,
2509
- "non_padded": 0,
2510
- "effective_few_shots": 5.0,
2511
- "num_truncated_few_shots": 0
2512
- },
2513
- "lighteval|mmlu:jurisprudence|5": {
2514
- "hashes": {
2515
- "hash_examples": "e956f86b124076fe",
2516
- "hash_full_prompts": "49295d36462ddc97",
2517
- "hash_input_tokens": "a779a1b30bc13f30",
2518
- "hash_cont_tokens": "f17d9a372cfd66b1"
2519
- },
2520
- "truncated": 0,
2521
- "non_truncated": 108,
2522
- "padded": 420,
2523
- "non_padded": 12,
2524
- "effective_few_shots": 5.0,
2525
- "num_truncated_few_shots": 0
2526
- },
2527
- "lighteval|mmlu:logical_fallacies|5": {
2528
- "hashes": {
2529
- "hash_examples": "956e0e6365ab79f1",
2530
- "hash_full_prompts": "b64f452752d5cd23",
2531
- "hash_input_tokens": "61d99e8d4d4d8652",
2532
- "hash_cont_tokens": "cf44a68f5bca9a96"
2533
- },
2534
- "truncated": 0,
2535
- "non_truncated": 163,
2536
- "padded": 648,
2537
- "non_padded": 4,
2538
- "effective_few_shots": 5.0,
2539
- "num_truncated_few_shots": 0
2540
- },
2541
- "lighteval|mmlu:machine_learning|5": {
2542
- "hashes": {
2543
- "hash_examples": "397997cc6f4d581e",
2544
- "hash_full_prompts": "54da136ebd708042",
2545
- "hash_input_tokens": "11e6731506fcf366",
2546
- "hash_cont_tokens": "eace00d420f4f32c"
2547
- },
2548
- "truncated": 0,
2549
- "non_truncated": 112,
2550
- "padded": 448,
2551
- "non_padded": 0,
2552
- "effective_few_shots": 5.0,
2553
- "num_truncated_few_shots": 0
2554
- },
2555
- "lighteval|mmlu:management|5": {
2556
- "hashes": {
2557
- "hash_examples": "2bcbe6f6ca63d740",
2558
- "hash_full_prompts": "a4b864ff27598ba3",
2559
- "hash_input_tokens": "caffa6e4e80cbd5e",
2560
- "hash_cont_tokens": "b7c51d0250c252d8"
2561
- },
2562
- "truncated": 0,
2563
- "non_truncated": 103,
2564
- "padded": 412,
2565
- "non_padded": 0,
2566
- "effective_few_shots": 5.0,
2567
- "num_truncated_few_shots": 0
2568
- },
2569
- "lighteval|mmlu:marketing|5": {
2570
- "hashes": {
2571
- "hash_examples": "8ddb20d964a1b065",
2572
- "hash_full_prompts": "c7183ac32f36104d",
2573
- "hash_input_tokens": "5cd238ac5e8f19f4",
2574
- "hash_cont_tokens": "086fb63f8b1d1339"
2575
- },
2576
- "truncated": 0,
2577
- "non_truncated": 234,
2578
- "padded": 924,
2579
- "non_padded": 12,
2580
- "effective_few_shots": 5.0,
2581
- "num_truncated_few_shots": 0
2582
- },
2583
- "lighteval|mmlu:medical_genetics|5": {
2584
- "hashes": {
2585
- "hash_examples": "182a71f4763d2cea",
2586
- "hash_full_prompts": "c17b0a66e3027303",
2587
- "hash_input_tokens": "46c0c8a573b43089",
2588
- "hash_cont_tokens": "00520b0ec06da34f"
2589
- },
2590
- "truncated": 0,
2591
- "non_truncated": 100,
2592
- "padded": 400,
2593
- "non_padded": 0,
2594
- "effective_few_shots": 5.0,
2595
- "num_truncated_few_shots": 0
2596
- },
2597
- "lighteval|mmlu:miscellaneous|5": {
2598
- "hashes": {
2599
- "hash_examples": "4c404fdbb4ca57fc",
2600
- "hash_full_prompts": "bc5fa37ce20a2503",
2601
- "hash_input_tokens": "5327cd4585062ac2",
2602
- "hash_cont_tokens": "1827274fa6537077"
2603
- },
2604
- "truncated": 0,
2605
- "non_truncated": 783,
2606
- "padded": 3132,
2607
- "non_padded": 0,
2608
- "effective_few_shots": 5.0,
2609
- "num_truncated_few_shots": 0
2610
- },
2611
- "lighteval|mmlu:moral_disputes|5": {
2612
- "hashes": {
2613
- "hash_examples": "60cbd2baa3fea5c9",
2614
- "hash_full_prompts": "075742051236078f",
2615
- "hash_input_tokens": "a2c9da202f686839",
2616
- "hash_cont_tokens": "472c223f6f28cfc7"
2617
- },
2618
- "truncated": 0,
2619
- "non_truncated": 346,
2620
- "padded": 1384,
2621
- "non_padded": 0,
2622
- "effective_few_shots": 5.0,
2623
- "num_truncated_few_shots": 0
2624
- },
2625
- "lighteval|mmlu:moral_scenarios|5": {
2626
- "hashes": {
2627
- "hash_examples": "fd8b0431fbdd75ef",
2628
- "hash_full_prompts": "533c4700637599a2",
2629
- "hash_input_tokens": "9a1a9f3900b372e6",
2630
- "hash_cont_tokens": "e90dade00a092f9e"
2631
- },
2632
- "truncated": 0,
2633
- "non_truncated": 895,
2634
- "padded": 3567,
2635
- "non_padded": 13,
2636
- "effective_few_shots": 5.0,
2637
- "num_truncated_few_shots": 0
2638
- },
2639
- "lighteval|mmlu:nutrition|5": {
2640
- "hashes": {
2641
- "hash_examples": "71e55e2b829b6528",
2642
- "hash_full_prompts": "02b6877dc5a603a6",
2643
- "hash_input_tokens": "dd91fec063272e23",
2644
- "hash_cont_tokens": "128e0ec97d96b165"
2645
- },
2646
- "truncated": 0,
2647
- "non_truncated": 306,
2648
- "padded": 1224,
2649
- "non_padded": 0,
2650
- "effective_few_shots": 5.0,
2651
- "num_truncated_few_shots": 0
2652
- },
2653
- "lighteval|mmlu:philosophy|5": {
2654
- "hashes": {
2655
- "hash_examples": "a6d489a8d208fa4b",
2656
- "hash_full_prompts": "0e65b5f40a9ceb20",
2657
- "hash_input_tokens": "2255e15265a7d96a",
2658
- "hash_cont_tokens": "cbfd7829a3e0f082"
2659
- },
2660
- "truncated": 0,
2661
- "non_truncated": 311,
2662
- "padded": 1244,
2663
- "non_padded": 0,
2664
- "effective_few_shots": 5.0,
2665
- "num_truncated_few_shots": 0
2666
- },
2667
- "lighteval|mmlu:prehistory|5": {
2668
- "hashes": {
2669
- "hash_examples": "6cc50f032a19acaa",
2670
- "hash_full_prompts": "e838e60749e4a598",
2671
- "hash_input_tokens": "1b9b906efbcc97fd",
2672
- "hash_cont_tokens": "9c0cf5a2f71afa7e"
2673
- },
2674
- "truncated": 0,
2675
- "non_truncated": 324,
2676
- "padded": 1284,
2677
- "non_padded": 12,
2678
- "effective_few_shots": 5.0,
2679
- "num_truncated_few_shots": 0
2680
- },
2681
- "lighteval|mmlu:professional_accounting|5": {
2682
- "hashes": {
2683
- "hash_examples": "50f57ab32f5f6cea",
2684
- "hash_full_prompts": "9abf7319f68b7ba8",
2685
- "hash_input_tokens": "d42c8275cd4e10e1",
2686
- "hash_cont_tokens": "50f011c2453517ee"
2687
- },
2688
- "truncated": 0,
2689
- "non_truncated": 282,
2690
- "padded": 1128,
2691
- "non_padded": 0,
2692
- "effective_few_shots": 5.0,
2693
- "num_truncated_few_shots": 0
2694
- },
2695
- "lighteval|mmlu:professional_law|5": {
2696
- "hashes": {
2697
- "hash_examples": "a8fdc85c64f4b215",
2698
- "hash_full_prompts": "4074faf1eaedcfda",
2699
- "hash_input_tokens": "215c854d27e741b8",
2700
- "hash_cont_tokens": "73527e852c24186c"
2701
- },
2702
- "truncated": 0,
2703
- "non_truncated": 1534,
2704
- "padded": 6136,
2705
- "non_padded": 0,
2706
- "effective_few_shots": 5.0,
2707
- "num_truncated_few_shots": 0
2708
- },
2709
- "lighteval|mmlu:professional_medicine|5": {
2710
- "hashes": {
2711
- "hash_examples": "c373a28a3050a73a",
2712
- "hash_full_prompts": "e72202fc20fcab70",
2713
- "hash_input_tokens": "5a6e9aaaaea83544",
2714
- "hash_cont_tokens": "ceb7af5e2e789abc"
2715
- },
2716
- "truncated": 0,
2717
- "non_truncated": 272,
2718
- "padded": 1088,
2719
- "non_padded": 0,
2720
- "effective_few_shots": 5.0,
2721
- "num_truncated_few_shots": 0
2722
- },
2723
- "lighteval|mmlu:professional_psychology|5": {
2724
- "hashes": {
2725
- "hash_examples": "bf5254fe818356af",
2726
- "hash_full_prompts": "4dcb71c9ef602791",
2727
- "hash_input_tokens": "316d0ba731b0de4f",
2728
- "hash_cont_tokens": "8cfdced8a9667380"
2729
- },
2730
- "truncated": 0,
2731
- "non_truncated": 612,
2732
- "padded": 2428,
2733
- "non_padded": 20,
2734
- "effective_few_shots": 5.0,
2735
- "num_truncated_few_shots": 0
2736
- },
2737
- "lighteval|mmlu:public_relations|5": {
2738
- "hashes": {
2739
- "hash_examples": "b66d52e28e7d14e0",
2740
- "hash_full_prompts": "c6050b1748185950",
2741
- "hash_input_tokens": "2ba1d90c95e19dce",
2742
- "hash_cont_tokens": "f8327461a9cc5123"
2743
- },
2744
- "truncated": 0,
2745
- "non_truncated": 110,
2746
- "padded": 436,
2747
- "non_padded": 4,
2748
- "effective_few_shots": 5.0,
2749
- "num_truncated_few_shots": 0
2750
- },
2751
- "lighteval|mmlu:security_studies|5": {
2752
- "hashes": {
2753
- "hash_examples": "514c14feaf000ad9",
2754
- "hash_full_prompts": "4c6786915b670d03",
2755
- "hash_input_tokens": "b92f71eccf4f89bf",
2756
- "hash_cont_tokens": "c30b0c4d52c2875d"
2757
- },
2758
- "truncated": 0,
2759
- "non_truncated": 245,
2760
- "padded": 980,
2761
- "non_padded": 0,
2762
- "effective_few_shots": 5.0,
2763
- "num_truncated_few_shots": 0
2764
- },
2765
- "lighteval|mmlu:sociology|5": {
2766
- "hashes": {
2767
- "hash_examples": "f6c9bc9d18c80870",
2768
- "hash_full_prompts": "a2e9a27e985a4e9b",
2769
- "hash_input_tokens": "e821334ab55c0d44",
2770
- "hash_cont_tokens": "eef4bd16d536fbd6"
2771
- },
2772
- "truncated": 0,
2773
- "non_truncated": 201,
2774
- "padded": 804,
2775
- "non_padded": 0,
2776
- "effective_few_shots": 5.0,
2777
- "num_truncated_few_shots": 0
2778
- },
2779
- "lighteval|mmlu:us_foreign_policy|5": {
2780
- "hashes": {
2781
- "hash_examples": "ed7b78629db6678f",
2782
- "hash_full_prompts": "46d0986398662d59",
2783
- "hash_input_tokens": "9f6b40a7b6b8a3b2",
2784
- "hash_cont_tokens": "00520b0ec06da34f"
2785
- },
2786
- "truncated": 0,
2787
- "non_truncated": 100,
2788
- "padded": 400,
2789
- "non_padded": 0,
2790
- "effective_few_shots": 5.0,
2791
- "num_truncated_few_shots": 0
2792
- },
2793
- "lighteval|mmlu:virology|5": {
2794
- "hashes": {
2795
- "hash_examples": "bc52ffdc3f9b994a",
2796
- "hash_full_prompts": "6b591e3983159283",
2797
- "hash_input_tokens": "d7c6d39e149defc9",
2798
- "hash_cont_tokens": "f5fc195e049353c0"
2799
- },
2800
- "truncated": 0,
2801
- "non_truncated": 166,
2802
- "padded": 664,
2803
- "non_padded": 0,
2804
- "effective_few_shots": 5.0,
2805
- "num_truncated_few_shots": 0
2806
- },
2807
- "lighteval|mmlu:world_religions|5": {
2808
- "hashes": {
2809
- "hash_examples": "ecdb4a4f94f62930",
2810
- "hash_full_prompts": "8c2e37a02519af15",
2811
- "hash_input_tokens": "80b87b6e634441d6",
2812
- "hash_cont_tokens": "ada548665e87b1e0"
2813
- },
2814
- "truncated": 0,
2815
- "non_truncated": 171,
2816
- "padded": 684,
2817
- "non_padded": 0,
2818
- "effective_few_shots": 5.0,
2819
- "num_truncated_few_shots": 0
2820
- }
2821
- },
2822
- "summary_general": {
2823
- "hashes": {
2824
- "hash_examples": "341a076d0beb7048",
2825
- "hash_full_prompts": "7c1eeddf962b8fc9",
2826
- "hash_input_tokens": "98bef9715b6ebf74",
2827
- "hash_cont_tokens": "3672212ca582e2d0"
2828
- },
2829
- "truncated": 0,
2830
- "non_truncated": 14042,
2831
- "padded": 56038,
2832
- "non_padded": 130,
2833
- "num_truncated_few_shots": 0
2834
- }
2835
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B-Chat/main/truthfulqa/results_2024-02-28T07-35-48.691367.json DELETED
@@ -1,85 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 6108135.051076809,
9
- "end_time": 6108205.834640568,
10
- "total_evaluation_time_secondes": "70.78356375917792",
11
- "model_name": "Qwen/Qwen1.5-0.5B-Chat",
12
- "model_sha": "7f630fd18dccab574ab1b78411a8753f989a55ac",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|truthfulqa:mc|0": {
19
- "truthfulqa_mc1": 0.27050183598531213,
20
- "truthfulqa_mc1_stderr": 0.015550778332842893,
21
- "truthfulqa_mc2": 0.44803609764115016,
22
- "truthfulqa_mc2_stderr": 0.015178000327384377
23
- }
24
- },
25
- "versions": {
26
- "lighteval|truthfulqa:mc|0": 0
27
- },
28
- "config_tasks": {
29
- "lighteval|truthfulqa:mc": {
30
- "name": "truthfulqa:mc",
31
- "prompt_function": "truthful_qa_multiple_choice",
32
- "hf_repo": "truthful_qa",
33
- "hf_subset": "multiple_choice",
34
- "metric": [
35
- "truthfulqa_mc_metrics"
36
- ],
37
- "hf_avail_splits": [
38
- "validation"
39
- ],
40
- "evaluation_splits": [
41
- "validation"
42
- ],
43
- "few_shots_split": null,
44
- "few_shots_select": null,
45
- "generation_size": -1,
46
- "stop_sequence": [
47
- "\n"
48
- ],
49
- "output_regex": null,
50
- "frozen": false,
51
- "suite": [
52
- "lighteval"
53
- ]
54
- }
55
- },
56
- "summary_tasks": {
57
- "lighteval|truthfulqa:mc|0": {
58
- "hashes": {
59
- "hash_examples": "36a6d90e75d92d4a",
60
- "hash_full_prompts": "0ee9c8dfb7ba5726",
61
- "hash_input_tokens": "e3613aaf606e098f",
62
- "hash_cont_tokens": "5077664e752b6f79"
63
- },
64
- "truncated": 0,
65
- "non_truncated": 817,
66
- "padded": 9192,
67
- "non_padded": 804,
68
- "effective_few_shots": 0.0,
69
- "num_truncated_few_shots": 0
70
- }
71
- },
72
- "summary_general": {
73
- "hashes": {
74
- "hash_examples": "aed1dfc67e53d0f2",
75
- "hash_full_prompts": "8e0ba83d3aa3f868",
76
- "hash_input_tokens": "449cc57eb13ad0cb",
77
- "hash_cont_tokens": "b7b1973c20b71975"
78
- },
79
- "truncated": 0,
80
- "non_truncated": 817,
81
- "padded": 9192,
82
- "non_padded": 804,
83
- "num_truncated_few_shots": 0
84
- }
85
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B-Chat/main/winogrande/results_2024-02-27T20-42-29.221265.json DELETED
@@ -1,85 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 347383.812042912,
9
- "end_time": 347427.252999333,
10
- "total_evaluation_time_secondes": "43.44095642102184",
11
- "model_name": "Qwen/Qwen1.5-0.5B-Chat",
12
- "model_sha": "7f630fd18dccab574ab1b78411a8753f989a55ac",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|winogrande|5": {
19
- "acc": 0.5193370165745856,
20
- "acc_stderr": 0.01404197273371297
21
- }
22
- },
23
- "versions": {
24
- "lighteval|winogrande|5": 0
25
- },
26
- "config_tasks": {
27
- "lighteval|winogrande": {
28
- "name": "winogrande",
29
- "prompt_function": "winogrande",
30
- "hf_repo": "winogrande",
31
- "hf_subset": "winogrande_xl",
32
- "metric": [
33
- "loglikelihood_acc"
34
- ],
35
- "hf_avail_splits": [
36
- "train",
37
- "test",
38
- "validation"
39
- ],
40
- "evaluation_splits": [
41
- "validation"
42
- ],
43
- "few_shots_split": null,
44
- "few_shots_select": "random_sampling",
45
- "generation_size": -1,
46
- "stop_sequence": [
47
- "\n"
48
- ],
49
- "output_regex": null,
50
- "frozen": false,
51
- "suite": [
52
- "lighteval"
53
- ]
54
- }
55
- },
56
- "summary_tasks": {
57
- "lighteval|winogrande|5": {
58
- "hashes": {
59
- "hash_examples": "087d5d1a1afd4c7b",
60
- "hash_full_prompts": "a62b027af32f1fe4",
61
- "hash_input_tokens": "2d5322eff4ad78c0",
62
- "hash_cont_tokens": "af83e7e371fcf4d4"
63
- },
64
- "truncated": 0,
65
- "non_truncated": 1267,
66
- "padded": 2356,
67
- "non_padded": 178,
68
- "effective_few_shots": 5.0,
69
- "num_truncated_few_shots": 0
70
- }
71
- },
72
- "summary_general": {
73
- "hashes": {
74
- "hash_examples": "b9a49975cc41fab7",
75
- "hash_full_prompts": "1d1bae16e80ffe07",
76
- "hash_input_tokens": "4228ab07fdbbb16c",
77
- "hash_cont_tokens": "cf29ee9ce86b26c4"
78
- },
79
- "truncated": 0,
80
- "non_truncated": 1267,
81
- "padded": 2356,
82
- "non_padded": 178,
83
- "num_truncated_few_shots": 0
84
- }
85
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B-Chat/main/winogrande/results_2024-02-28T07-35-24.539002.json DELETED
@@ -1,85 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 474770.575043302,
9
- "end_time": 474814.487678998,
10
- "total_evaluation_time_secondes": "43.912635696004145",
11
- "model_name": "Qwen/Qwen1.5-0.5B-Chat",
12
- "model_sha": "7f630fd18dccab574ab1b78411a8753f989a55ac",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|winogrande|5": {
19
- "acc": 0.5193370165745856,
20
- "acc_stderr": 0.01404197273371297
21
- }
22
- },
23
- "versions": {
24
- "lighteval|winogrande|5": 0
25
- },
26
- "config_tasks": {
27
- "lighteval|winogrande": {
28
- "name": "winogrande",
29
- "prompt_function": "winogrande",
30
- "hf_repo": "winogrande",
31
- "hf_subset": "winogrande_xl",
32
- "metric": [
33
- "loglikelihood_acc"
34
- ],
35
- "hf_avail_splits": [
36
- "train",
37
- "test",
38
- "validation"
39
- ],
40
- "evaluation_splits": [
41
- "validation"
42
- ],
43
- "few_shots_split": null,
44
- "few_shots_select": "random_sampling",
45
- "generation_size": -1,
46
- "stop_sequence": [
47
- "\n"
48
- ],
49
- "output_regex": null,
50
- "frozen": false,
51
- "suite": [
52
- "lighteval"
53
- ]
54
- }
55
- },
56
- "summary_tasks": {
57
- "lighteval|winogrande|5": {
58
- "hashes": {
59
- "hash_examples": "087d5d1a1afd4c7b",
60
- "hash_full_prompts": "a62b027af32f1fe4",
61
- "hash_input_tokens": "2d5322eff4ad78c0",
62
- "hash_cont_tokens": "af83e7e371fcf4d4"
63
- },
64
- "truncated": 0,
65
- "non_truncated": 1267,
66
- "padded": 2356,
67
- "non_padded": 178,
68
- "effective_few_shots": 5.0,
69
- "num_truncated_few_shots": 0
70
- }
71
- },
72
- "summary_general": {
73
- "hashes": {
74
- "hash_examples": "b9a49975cc41fab7",
75
- "hash_full_prompts": "1d1bae16e80ffe07",
76
- "hash_input_tokens": "4228ab07fdbbb16c",
77
- "hash_cont_tokens": "cf29ee9ce86b26c4"
78
- },
79
- "truncated": 0,
80
- "non_truncated": 1267,
81
- "padded": 2356,
82
- "non_padded": 178,
83
- "num_truncated_few_shots": 0
84
- }
85
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B/main/arc/results_2024-03-02T12-48-25.468716.json DELETED
@@ -1,90 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2336656.009882773,
9
- "end_time": 2336719.637681085,
10
- "total_evaluation_time_secondes": "63.62779831234366",
11
- "model_name": "Qwen/Qwen1.5-0.5B",
12
- "model_sha": "fedce23ef6393499effdf4958f9b3256f299cc7d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|arc:challenge|25": {
19
- "acc": 0.25,
20
- "acc_stderr": 0.012653835621466646,
21
- "acc_norm": 0.29266211604095566,
22
- "acc_norm_stderr": 0.013295916103619413
23
- }
24
- },
25
- "versions": {
26
- "lighteval|arc:challenge|25": 0
27
- },
28
- "config_tasks": {
29
- "lighteval|arc:challenge": {
30
- "name": "arc:challenge",
31
- "prompt_function": "arc",
32
- "hf_repo": "ai2_arc",
33
- "hf_subset": "ARC-Challenge",
34
- "metric": [
35
- "loglikelihood_acc",
36
- "loglikelihood_acc_norm_nospace"
37
- ],
38
- "hf_avail_splits": [
39
- "train",
40
- "test"
41
- ],
42
- "evaluation_splits": [
43
- "test"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": "random_sampling_from_train",
47
- "generation_size": 1,
48
- "stop_sequence": [
49
- "\n"
50
- ],
51
- "output_regex": null,
52
- "frozen": false,
53
- "suite": [
54
- "lighteval",
55
- "arc"
56
- ],
57
- "original_num_docs": 1172,
58
- "effective_num_docs": 1172
59
- }
60
- },
61
- "summary_tasks": {
62
- "lighteval|arc:challenge|25": {
63
- "hashes": {
64
- "hash_examples": "17b0cae357c0259e",
65
- "hash_full_prompts": "66498cebc864f4ca",
66
- "hash_input_tokens": "aefefa49d87dbb4a",
67
- "hash_cont_tokens": "01b219c135fa72d2"
68
- },
69
- "truncated": 0,
70
- "non_truncated": 1172,
71
- "padded": 4651,
72
- "non_padded": 36,
73
- "effective_few_shots": 25.0,
74
- "num_truncated_few_shots": 0
75
- }
76
- },
77
- "summary_general": {
78
- "hashes": {
79
- "hash_examples": "aaa6929c6d3771fb",
80
- "hash_full_prompts": "848db321e3b0d4cd",
81
- "hash_input_tokens": "c43cd3d9f13e8ce3",
82
- "hash_cont_tokens": "18fd947d8652c7c4"
83
- },
84
- "truncated": 0,
85
- "non_truncated": 1172,
86
- "padded": 4651,
87
- "non_padded": 36,
88
- "num_truncated_few_shots": 0
89
- }
90
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B/main/gsm8k/results_2024-03-02T12-51-54.476794.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5977b6f576f7bbe5273fb70fbe1db34d1e066dd10cb38e62b619a278e07d5b50
3
- size 2155
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B/main/hellaswag/results_2024-03-02T12-51-59.737337.json DELETED
@@ -1,90 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1787653.492247887,
9
- "end_time": 1787931.376327222,
10
- "total_evaluation_time_secondes": "277.8840793350246",
11
- "model_name": "Qwen/Qwen1.5-0.5B",
12
- "model_sha": "fedce23ef6393499effdf4958f9b3256f299cc7d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|hellaswag|10": {
19
- "acc": 0.37582154949213303,
20
- "acc_stderr": 0.004833444556338636,
21
- "acc_norm": 0.4742083250348536,
22
- "acc_norm_stderr": 0.00498313847960438
23
- }
24
- },
25
- "versions": {
26
- "lighteval|hellaswag|10": 0
27
- },
28
- "config_tasks": {
29
- "lighteval|hellaswag": {
30
- "name": "hellaswag",
31
- "prompt_function": "hellaswag_harness",
32
- "hf_repo": "hellaswag",
33
- "hf_subset": "default",
34
- "metric": [
35
- "loglikelihood_acc",
36
- "loglikelihood_acc_norm"
37
- ],
38
- "hf_avail_splits": [
39
- "train",
40
- "test",
41
- "validation"
42
- ],
43
- "evaluation_splits": [
44
- "validation"
45
- ],
46
- "few_shots_split": null,
47
- "few_shots_select": "random_sampling_from_train",
48
- "generation_size": -1,
49
- "stop_sequence": [
50
- "\n"
51
- ],
52
- "output_regex": null,
53
- "frozen": false,
54
- "suite": [
55
- "lighteval"
56
- ],
57
- "original_num_docs": 10042,
58
- "effective_num_docs": 10042
59
- }
60
- },
61
- "summary_tasks": {
62
- "lighteval|hellaswag|10": {
63
- "hashes": {
64
- "hash_examples": "31985c805c3a737e",
65
- "hash_full_prompts": "451e22e8c3aaeeb6",
66
- "hash_input_tokens": "75c8dc72e821c158",
67
- "hash_cont_tokens": "1a6c30e9c276de48"
68
- },
69
- "truncated": 0,
70
- "non_truncated": 10042,
71
- "padded": 39979,
72
- "non_padded": 189,
73
- "effective_few_shots": 10.0,
74
- "num_truncated_few_shots": 0
75
- }
76
- },
77
- "summary_general": {
78
- "hashes": {
79
- "hash_examples": "63bc2cf8bae03fbc",
80
- "hash_full_prompts": "b2452b7405d0326a",
81
- "hash_input_tokens": "94f9d746bc9de07d",
82
- "hash_cont_tokens": "0064768cfec6cc47"
83
- },
84
- "truncated": 0,
85
- "non_truncated": 10042,
86
- "padded": 39979,
87
- "non_padded": 189,
88
- "num_truncated_few_shots": 0
89
- }
90
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B/main/ifeval/results_2024-03-02T12-56-57.071885.json DELETED
@@ -1,89 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 525925.789399797,
9
- "end_time": 526490.873378994,
10
- "total_evaluation_time_secondes": "565.0839791969629",
11
- "model_name": "Qwen/Qwen1.5-0.5B",
12
- "model_sha": "fedce23ef6393499effdf4958f9b3256f299cc7d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|ifeval|0": {
19
- "prompt_level_strict_acc": 0.12384473197781885,
20
- "prompt_level_strict_acc_stderr": 0.014175305492766606,
21
- "inst_level_strict_acc": 0.21223021582733814,
22
- "inst_level_strict_acc_stderr": 0.00044404615428519543,
23
- "prompt_level_loose_acc": 0.14417744916820702,
24
- "prompt_level_loose_acc_stderr": 0.015116235588583228,
25
- "inst_level_loose_acc": 0.2529976019184652,
26
- "inst_level_loose_acc_stderr": 0.0004686377292394125
27
- }
28
- },
29
- "versions": {
30
- "custom|ifeval|0": 0
31
- },
32
- "config_tasks": {
33
- "custom|ifeval": {
34
- "name": "ifeval",
35
- "prompt_function": "ifeval_prompt",
36
- "hf_repo": "wis-k/instruction-following-eval",
37
- "hf_subset": "default",
38
- "metric": [
39
- "ifeval_metric"
40
- ],
41
- "hf_avail_splits": [
42
- "train"
43
- ],
44
- "evaluation_splits": [
45
- "train"
46
- ],
47
- "few_shots_split": "train",
48
- "few_shots_select": "random_sampling",
49
- "generation_size": 1280,
50
- "stop_sequence": [],
51
- "output_regex": null,
52
- "frozen": false,
53
- "suite": [
54
- "custom"
55
- ],
56
- "original_num_docs": 541,
57
- "effective_num_docs": 541
58
- }
59
- },
60
- "summary_tasks": {
61
- "custom|ifeval|0": {
62
- "hashes": {
63
- "hash_examples": "e99cbf567588d7c6",
64
- "hash_full_prompts": "7ea7bf2a8edba8f4",
65
- "hash_input_tokens": "4d9405a95eea122b",
66
- "hash_cont_tokens": "f1666153b739651e"
67
- },
68
- "truncated": 0,
69
- "non_truncated": 541,
70
- "padded": 0,
71
- "non_padded": 541,
72
- "effective_few_shots": 0.0,
73
- "num_truncated_few_shots": 0
74
- }
75
- },
76
- "summary_general": {
77
- "hashes": {
78
- "hash_examples": "ea046ab2c6fc5928",
79
- "hash_full_prompts": "45f8422f6ad2da79",
80
- "hash_input_tokens": "0f908ce1163008f8",
81
- "hash_cont_tokens": "9a24099350c4b383"
82
- },
83
- "truncated": 0,
84
- "non_truncated": 541,
85
- "padded": 0,
86
- "non_padded": 541,
87
- "num_truncated_few_shots": 0
88
- }
89
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B/main/mmlu/results_2024-03-02T12-55-58.089063.json DELETED
@@ -1,2949 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 6386071.192632286,
9
- "end_time": 6386606.388884317,
10
- "total_evaluation_time_secondes": "535.1962520312518",
11
- "model_name": "Qwen/Qwen1.5-0.5B",
12
- "model_sha": "fedce23ef6393499effdf4958f9b3256f299cc7d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|mmlu:abstract_algebra|5": {
19
- "acc": 0.36,
20
- "acc_stderr": 0.04824181513244218
21
- },
22
- "lighteval|mmlu:anatomy|5": {
23
- "acc": 0.32592592592592595,
24
- "acc_stderr": 0.040491220417025055
25
- },
26
- "lighteval|mmlu:astronomy|5": {
27
- "acc": 0.4144736842105263,
28
- "acc_stderr": 0.04008973785779205
29
- },
30
- "lighteval|mmlu:business_ethics|5": {
31
- "acc": 0.48,
32
- "acc_stderr": 0.050211673156867795
33
- },
34
- "lighteval|mmlu:clinical_knowledge|5": {
35
- "acc": 0.4377358490566038,
36
- "acc_stderr": 0.030533338430467512
37
- },
38
- "lighteval|mmlu:college_biology|5": {
39
- "acc": 0.375,
40
- "acc_stderr": 0.04048439222695598
41
- },
42
- "lighteval|mmlu:college_chemistry|5": {
43
- "acc": 0.28,
44
- "acc_stderr": 0.045126085985421276
45
- },
46
- "lighteval|mmlu:college_computer_science|5": {
47
- "acc": 0.36,
48
- "acc_stderr": 0.04824181513244218
49
- },
50
- "lighteval|mmlu:college_mathematics|5": {
51
- "acc": 0.36,
52
- "acc_stderr": 0.048241815132442176
53
- },
54
- "lighteval|mmlu:college_medicine|5": {
55
- "acc": 0.4046242774566474,
56
- "acc_stderr": 0.03742461193887248
57
- },
58
- "lighteval|mmlu:college_physics|5": {
59
- "acc": 0.27450980392156865,
60
- "acc_stderr": 0.04440521906179327
61
- },
62
- "lighteval|mmlu:computer_security|5": {
63
- "acc": 0.46,
64
- "acc_stderr": 0.05009082659620333
65
- },
66
- "lighteval|mmlu:conceptual_physics|5": {
67
- "acc": 0.3659574468085106,
68
- "acc_stderr": 0.031489558297455304
69
- },
70
- "lighteval|mmlu:econometrics|5": {
71
- "acc": 0.3157894736842105,
72
- "acc_stderr": 0.043727482902780064
73
- },
74
- "lighteval|mmlu:electrical_engineering|5": {
75
- "acc": 0.4482758620689655,
76
- "acc_stderr": 0.04144311810878152
77
- },
78
- "lighteval|mmlu:elementary_mathematics|5": {
79
- "acc": 0.2962962962962963,
80
- "acc_stderr": 0.023517294335963283
81
- },
82
- "lighteval|mmlu:formal_logic|5": {
83
- "acc": 0.29365079365079366,
84
- "acc_stderr": 0.04073524322147126
85
- },
86
- "lighteval|mmlu:global_facts|5": {
87
- "acc": 0.38,
88
- "acc_stderr": 0.04878317312145633
89
- },
90
- "lighteval|mmlu:high_school_biology|5": {
91
- "acc": 0.45806451612903226,
92
- "acc_stderr": 0.028343787250540625
93
- },
94
- "lighteval|mmlu:high_school_chemistry|5": {
95
- "acc": 0.33004926108374383,
96
- "acc_stderr": 0.033085304262282574
97
- },
98
- "lighteval|mmlu:high_school_computer_science|5": {
99
- "acc": 0.36,
100
- "acc_stderr": 0.04824181513244218
101
- },
102
- "lighteval|mmlu:high_school_european_history|5": {
103
- "acc": 0.4121212121212121,
104
- "acc_stderr": 0.03843566993588717
105
- },
106
- "lighteval|mmlu:high_school_geography|5": {
107
- "acc": 0.51010101010101,
108
- "acc_stderr": 0.035616254886737454
109
- },
110
- "lighteval|mmlu:high_school_government_and_politics|5": {
111
- "acc": 0.5233160621761658,
112
- "acc_stderr": 0.03604513672442202
113
- },
114
- "lighteval|mmlu:high_school_macroeconomics|5": {
115
- "acc": 0.3641025641025641,
116
- "acc_stderr": 0.024396672985094767
117
- },
118
- "lighteval|mmlu:high_school_mathematics|5": {
119
- "acc": 0.3074074074074074,
120
- "acc_stderr": 0.028133252578815632
121
- },
122
- "lighteval|mmlu:high_school_microeconomics|5": {
123
- "acc": 0.4327731092436975,
124
- "acc_stderr": 0.03218358107742613
125
- },
126
- "lighteval|mmlu:high_school_physics|5": {
127
- "acc": 0.24503311258278146,
128
- "acc_stderr": 0.03511807571804724
129
- },
130
- "lighteval|mmlu:high_school_psychology|5": {
131
- "acc": 0.47155963302752296,
132
- "acc_stderr": 0.02140261569734804
133
- },
134
- "lighteval|mmlu:high_school_statistics|5": {
135
- "acc": 0.2962962962962963,
136
- "acc_stderr": 0.031141447823536023
137
- },
138
- "lighteval|mmlu:high_school_us_history|5": {
139
- "acc": 0.4068627450980392,
140
- "acc_stderr": 0.03447891136353382
141
- },
142
- "lighteval|mmlu:high_school_world_history|5": {
143
- "acc": 0.48945147679324896,
144
- "acc_stderr": 0.032539983791662855
145
- },
146
- "lighteval|mmlu:human_aging|5": {
147
- "acc": 0.43946188340807174,
148
- "acc_stderr": 0.03331092511038179
149
- },
150
- "lighteval|mmlu:human_sexuality|5": {
151
- "acc": 0.4122137404580153,
152
- "acc_stderr": 0.04317171194870255
153
- },
154
- "lighteval|mmlu:international_law|5": {
155
- "acc": 0.5206611570247934,
156
- "acc_stderr": 0.04560456086387235
157
- },
158
- "lighteval|mmlu:jurisprudence|5": {
159
- "acc": 0.4444444444444444,
160
- "acc_stderr": 0.04803752235190193
161
- },
162
- "lighteval|mmlu:logical_fallacies|5": {
163
- "acc": 0.3128834355828221,
164
- "acc_stderr": 0.036429145782924055
165
- },
166
- "lighteval|mmlu:machine_learning|5": {
167
- "acc": 0.33035714285714285,
168
- "acc_stderr": 0.04464285714285714
169
- },
170
- "lighteval|mmlu:management|5": {
171
- "acc": 0.47572815533980584,
172
- "acc_stderr": 0.04944901092973781
173
- },
174
- "lighteval|mmlu:marketing|5": {
175
- "acc": 0.6410256410256411,
176
- "acc_stderr": 0.03142616993791923
177
- },
178
- "lighteval|mmlu:medical_genetics|5": {
179
- "acc": 0.44,
180
- "acc_stderr": 0.049888765156985884
181
- },
182
- "lighteval|mmlu:miscellaneous|5": {
183
- "acc": 0.48531289910600256,
184
- "acc_stderr": 0.017872248024429122
185
- },
186
- "lighteval|mmlu:moral_disputes|5": {
187
- "acc": 0.4421965317919075,
188
- "acc_stderr": 0.026738603643807396
189
- },
190
- "lighteval|mmlu:moral_scenarios|5": {
191
- "acc": 0.24134078212290502,
192
- "acc_stderr": 0.014310999547961445
193
- },
194
- "lighteval|mmlu:nutrition|5": {
195
- "acc": 0.4738562091503268,
196
- "acc_stderr": 0.028590752958852387
197
- },
198
- "lighteval|mmlu:philosophy|5": {
199
- "acc": 0.4180064308681672,
200
- "acc_stderr": 0.02801365189199507
201
- },
202
- "lighteval|mmlu:prehistory|5": {
203
- "acc": 0.4012345679012346,
204
- "acc_stderr": 0.027272582849839792
205
- },
206
- "lighteval|mmlu:professional_accounting|5": {
207
- "acc": 0.33687943262411346,
208
- "acc_stderr": 0.028195534873966727
209
- },
210
- "lighteval|mmlu:professional_law|5": {
211
- "acc": 0.32790091264667537,
212
- "acc_stderr": 0.011989936640666525
213
- },
214
- "lighteval|mmlu:professional_medicine|5": {
215
- "acc": 0.2977941176470588,
216
- "acc_stderr": 0.02777829870154544
217
- },
218
- "lighteval|mmlu:professional_psychology|5": {
219
- "acc": 0.36764705882352944,
220
- "acc_stderr": 0.019506291693954854
221
- },
222
- "lighteval|mmlu:public_relations|5": {
223
- "acc": 0.44545454545454544,
224
- "acc_stderr": 0.04760548821460325
225
- },
226
- "lighteval|mmlu:security_studies|5": {
227
- "acc": 0.46530612244897956,
228
- "acc_stderr": 0.03193207024425314
229
- },
230
- "lighteval|mmlu:sociology|5": {
231
- "acc": 0.5522388059701493,
232
- "acc_stderr": 0.03516184772952167
233
- },
234
- "lighteval|mmlu:us_foreign_policy|5": {
235
- "acc": 0.69,
236
- "acc_stderr": 0.04648231987117316
237
- },
238
- "lighteval|mmlu:virology|5": {
239
- "acc": 0.3795180722891566,
240
- "acc_stderr": 0.037777988227480165
241
- },
242
- "lighteval|mmlu:world_religions|5": {
243
- "acc": 0.34502923976608185,
244
- "acc_stderr": 0.036459813773888065
245
- },
246
- "lighteval|mmlu:_average|5": {
247
- "acc": 0.4022082306665673,
248
- "acc_stderr": 0.03614193032276545
249
- }
250
- },
251
- "versions": {
252
- "lighteval|mmlu:abstract_algebra|5": 0,
253
- "lighteval|mmlu:anatomy|5": 0,
254
- "lighteval|mmlu:astronomy|5": 0,
255
- "lighteval|mmlu:business_ethics|5": 0,
256
- "lighteval|mmlu:clinical_knowledge|5": 0,
257
- "lighteval|mmlu:college_biology|5": 0,
258
- "lighteval|mmlu:college_chemistry|5": 0,
259
- "lighteval|mmlu:college_computer_science|5": 0,
260
- "lighteval|mmlu:college_mathematics|5": 0,
261
- "lighteval|mmlu:college_medicine|5": 0,
262
- "lighteval|mmlu:college_physics|5": 0,
263
- "lighteval|mmlu:computer_security|5": 0,
264
- "lighteval|mmlu:conceptual_physics|5": 0,
265
- "lighteval|mmlu:econometrics|5": 0,
266
- "lighteval|mmlu:electrical_engineering|5": 0,
267
- "lighteval|mmlu:elementary_mathematics|5": 0,
268
- "lighteval|mmlu:formal_logic|5": 0,
269
- "lighteval|mmlu:global_facts|5": 0,
270
- "lighteval|mmlu:high_school_biology|5": 0,
271
- "lighteval|mmlu:high_school_chemistry|5": 0,
272
- "lighteval|mmlu:high_school_computer_science|5": 0,
273
- "lighteval|mmlu:high_school_european_history|5": 0,
274
- "lighteval|mmlu:high_school_geography|5": 0,
275
- "lighteval|mmlu:high_school_government_and_politics|5": 0,
276
- "lighteval|mmlu:high_school_macroeconomics|5": 0,
277
- "lighteval|mmlu:high_school_mathematics|5": 0,
278
- "lighteval|mmlu:high_school_microeconomics|5": 0,
279
- "lighteval|mmlu:high_school_physics|5": 0,
280
- "lighteval|mmlu:high_school_psychology|5": 0,
281
- "lighteval|mmlu:high_school_statistics|5": 0,
282
- "lighteval|mmlu:high_school_us_history|5": 0,
283
- "lighteval|mmlu:high_school_world_history|5": 0,
284
- "lighteval|mmlu:human_aging|5": 0,
285
- "lighteval|mmlu:human_sexuality|5": 0,
286
- "lighteval|mmlu:international_law|5": 0,
287
- "lighteval|mmlu:jurisprudence|5": 0,
288
- "lighteval|mmlu:logical_fallacies|5": 0,
289
- "lighteval|mmlu:machine_learning|5": 0,
290
- "lighteval|mmlu:management|5": 0,
291
- "lighteval|mmlu:marketing|5": 0,
292
- "lighteval|mmlu:medical_genetics|5": 0,
293
- "lighteval|mmlu:miscellaneous|5": 0,
294
- "lighteval|mmlu:moral_disputes|5": 0,
295
- "lighteval|mmlu:moral_scenarios|5": 0,
296
- "lighteval|mmlu:nutrition|5": 0,
297
- "lighteval|mmlu:philosophy|5": 0,
298
- "lighteval|mmlu:prehistory|5": 0,
299
- "lighteval|mmlu:professional_accounting|5": 0,
300
- "lighteval|mmlu:professional_law|5": 0,
301
- "lighteval|mmlu:professional_medicine|5": 0,
302
- "lighteval|mmlu:professional_psychology|5": 0,
303
- "lighteval|mmlu:public_relations|5": 0,
304
- "lighteval|mmlu:security_studies|5": 0,
305
- "lighteval|mmlu:sociology|5": 0,
306
- "lighteval|mmlu:us_foreign_policy|5": 0,
307
- "lighteval|mmlu:virology|5": 0,
308
- "lighteval|mmlu:world_religions|5": 0
309
- },
310
- "config_tasks": {
311
- "lighteval|mmlu:abstract_algebra": {
312
- "name": "mmlu:abstract_algebra",
313
- "prompt_function": "mmlu_harness",
314
- "hf_repo": "lighteval/mmlu",
315
- "hf_subset": "abstract_algebra",
316
- "metric": [
317
- "loglikelihood_acc"
318
- ],
319
- "hf_avail_splits": [
320
- "auxiliary_train",
321
- "test",
322
- "validation",
323
- "dev"
324
- ],
325
- "evaluation_splits": [
326
- "test"
327
- ],
328
- "few_shots_split": "dev",
329
- "few_shots_select": "sequential",
330
- "generation_size": 1,
331
- "stop_sequence": [
332
- "\n"
333
- ],
334
- "output_regex": null,
335
- "frozen": false,
336
- "suite": [
337
- "lighteval",
338
- "mmlu"
339
- ],
340
- "original_num_docs": 100,
341
- "effective_num_docs": 100
342
- },
343
- "lighteval|mmlu:anatomy": {
344
- "name": "mmlu:anatomy",
345
- "prompt_function": "mmlu_harness",
346
- "hf_repo": "lighteval/mmlu",
347
- "hf_subset": "anatomy",
348
- "metric": [
349
- "loglikelihood_acc"
350
- ],
351
- "hf_avail_splits": [
352
- "auxiliary_train",
353
- "test",
354
- "validation",
355
- "dev"
356
- ],
357
- "evaluation_splits": [
358
- "test"
359
- ],
360
- "few_shots_split": "dev",
361
- "few_shots_select": "sequential",
362
- "generation_size": 1,
363
- "stop_sequence": [
364
- "\n"
365
- ],
366
- "output_regex": null,
367
- "frozen": false,
368
- "suite": [
369
- "lighteval",
370
- "mmlu"
371
- ],
372
- "original_num_docs": 135,
373
- "effective_num_docs": 135
374
- },
375
- "lighteval|mmlu:astronomy": {
376
- "name": "mmlu:astronomy",
377
- "prompt_function": "mmlu_harness",
378
- "hf_repo": "lighteval/mmlu",
379
- "hf_subset": "astronomy",
380
- "metric": [
381
- "loglikelihood_acc"
382
- ],
383
- "hf_avail_splits": [
384
- "auxiliary_train",
385
- "test",
386
- "validation",
387
- "dev"
388
- ],
389
- "evaluation_splits": [
390
- "test"
391
- ],
392
- "few_shots_split": "dev",
393
- "few_shots_select": "sequential",
394
- "generation_size": 1,
395
- "stop_sequence": [
396
- "\n"
397
- ],
398
- "output_regex": null,
399
- "frozen": false,
400
- "suite": [
401
- "lighteval",
402
- "mmlu"
403
- ],
404
- "original_num_docs": 152,
405
- "effective_num_docs": 152
406
- },
407
- "lighteval|mmlu:business_ethics": {
408
- "name": "mmlu:business_ethics",
409
- "prompt_function": "mmlu_harness",
410
- "hf_repo": "lighteval/mmlu",
411
- "hf_subset": "business_ethics",
412
- "metric": [
413
- "loglikelihood_acc"
414
- ],
415
- "hf_avail_splits": [
416
- "auxiliary_train",
417
- "test",
418
- "validation",
419
- "dev"
420
- ],
421
- "evaluation_splits": [
422
- "test"
423
- ],
424
- "few_shots_split": "dev",
425
- "few_shots_select": "sequential",
426
- "generation_size": 1,
427
- "stop_sequence": [
428
- "\n"
429
- ],
430
- "output_regex": null,
431
- "frozen": false,
432
- "suite": [
433
- "lighteval",
434
- "mmlu"
435
- ],
436
- "original_num_docs": 100,
437
- "effective_num_docs": 100
438
- },
439
- "lighteval|mmlu:clinical_knowledge": {
440
- "name": "mmlu:clinical_knowledge",
441
- "prompt_function": "mmlu_harness",
442
- "hf_repo": "lighteval/mmlu",
443
- "hf_subset": "clinical_knowledge",
444
- "metric": [
445
- "loglikelihood_acc"
446
- ],
447
- "hf_avail_splits": [
448
- "auxiliary_train",
449
- "test",
450
- "validation",
451
- "dev"
452
- ],
453
- "evaluation_splits": [
454
- "test"
455
- ],
456
- "few_shots_split": "dev",
457
- "few_shots_select": "sequential",
458
- "generation_size": 1,
459
- "stop_sequence": [
460
- "\n"
461
- ],
462
- "output_regex": null,
463
- "frozen": false,
464
- "suite": [
465
- "lighteval",
466
- "mmlu"
467
- ],
468
- "original_num_docs": 265,
469
- "effective_num_docs": 265
470
- },
471
- "lighteval|mmlu:college_biology": {
472
- "name": "mmlu:college_biology",
473
- "prompt_function": "mmlu_harness",
474
- "hf_repo": "lighteval/mmlu",
475
- "hf_subset": "college_biology",
476
- "metric": [
477
- "loglikelihood_acc"
478
- ],
479
- "hf_avail_splits": [
480
- "auxiliary_train",
481
- "test",
482
- "validation",
483
- "dev"
484
- ],
485
- "evaluation_splits": [
486
- "test"
487
- ],
488
- "few_shots_split": "dev",
489
- "few_shots_select": "sequential",
490
- "generation_size": 1,
491
- "stop_sequence": [
492
- "\n"
493
- ],
494
- "output_regex": null,
495
- "frozen": false,
496
- "suite": [
497
- "lighteval",
498
- "mmlu"
499
- ],
500
- "original_num_docs": 144,
501
- "effective_num_docs": 144
502
- },
503
- "lighteval|mmlu:college_chemistry": {
504
- "name": "mmlu:college_chemistry",
505
- "prompt_function": "mmlu_harness",
506
- "hf_repo": "lighteval/mmlu",
507
- "hf_subset": "college_chemistry",
508
- "metric": [
509
- "loglikelihood_acc"
510
- ],
511
- "hf_avail_splits": [
512
- "auxiliary_train",
513
- "test",
514
- "validation",
515
- "dev"
516
- ],
517
- "evaluation_splits": [
518
- "test"
519
- ],
520
- "few_shots_split": "dev",
521
- "few_shots_select": "sequential",
522
- "generation_size": 1,
523
- "stop_sequence": [
524
- "\n"
525
- ],
526
- "output_regex": null,
527
- "frozen": false,
528
- "suite": [
529
- "lighteval",
530
- "mmlu"
531
- ],
532
- "original_num_docs": 100,
533
- "effective_num_docs": 100
534
- },
535
- "lighteval|mmlu:college_computer_science": {
536
- "name": "mmlu:college_computer_science",
537
- "prompt_function": "mmlu_harness",
538
- "hf_repo": "lighteval/mmlu",
539
- "hf_subset": "college_computer_science",
540
- "metric": [
541
- "loglikelihood_acc"
542
- ],
543
- "hf_avail_splits": [
544
- "auxiliary_train",
545
- "test",
546
- "validation",
547
- "dev"
548
- ],
549
- "evaluation_splits": [
550
- "test"
551
- ],
552
- "few_shots_split": "dev",
553
- "few_shots_select": "sequential",
554
- "generation_size": 1,
555
- "stop_sequence": [
556
- "\n"
557
- ],
558
- "output_regex": null,
559
- "frozen": false,
560
- "suite": [
561
- "lighteval",
562
- "mmlu"
563
- ],
564
- "original_num_docs": 100,
565
- "effective_num_docs": 100
566
- },
567
- "lighteval|mmlu:college_mathematics": {
568
- "name": "mmlu:college_mathematics",
569
- "prompt_function": "mmlu_harness",
570
- "hf_repo": "lighteval/mmlu",
571
- "hf_subset": "college_mathematics",
572
- "metric": [
573
- "loglikelihood_acc"
574
- ],
575
- "hf_avail_splits": [
576
- "auxiliary_train",
577
- "test",
578
- "validation",
579
- "dev"
580
- ],
581
- "evaluation_splits": [
582
- "test"
583
- ],
584
- "few_shots_split": "dev",
585
- "few_shots_select": "sequential",
586
- "generation_size": 1,
587
- "stop_sequence": [
588
- "\n"
589
- ],
590
- "output_regex": null,
591
- "frozen": false,
592
- "suite": [
593
- "lighteval",
594
- "mmlu"
595
- ],
596
- "original_num_docs": 100,
597
- "effective_num_docs": 100
598
- },
599
- "lighteval|mmlu:college_medicine": {
600
- "name": "mmlu:college_medicine",
601
- "prompt_function": "mmlu_harness",
602
- "hf_repo": "lighteval/mmlu",
603
- "hf_subset": "college_medicine",
604
- "metric": [
605
- "loglikelihood_acc"
606
- ],
607
- "hf_avail_splits": [
608
- "auxiliary_train",
609
- "test",
610
- "validation",
611
- "dev"
612
- ],
613
- "evaluation_splits": [
614
- "test"
615
- ],
616
- "few_shots_split": "dev",
617
- "few_shots_select": "sequential",
618
- "generation_size": 1,
619
- "stop_sequence": [
620
- "\n"
621
- ],
622
- "output_regex": null,
623
- "frozen": false,
624
- "suite": [
625
- "lighteval",
626
- "mmlu"
627
- ],
628
- "original_num_docs": 173,
629
- "effective_num_docs": 173
630
- },
631
- "lighteval|mmlu:college_physics": {
632
- "name": "mmlu:college_physics",
633
- "prompt_function": "mmlu_harness",
634
- "hf_repo": "lighteval/mmlu",
635
- "hf_subset": "college_physics",
636
- "metric": [
637
- "loglikelihood_acc"
638
- ],
639
- "hf_avail_splits": [
640
- "auxiliary_train",
641
- "test",
642
- "validation",
643
- "dev"
644
- ],
645
- "evaluation_splits": [
646
- "test"
647
- ],
648
- "few_shots_split": "dev",
649
- "few_shots_select": "sequential",
650
- "generation_size": 1,
651
- "stop_sequence": [
652
- "\n"
653
- ],
654
- "output_regex": null,
655
- "frozen": false,
656
- "suite": [
657
- "lighteval",
658
- "mmlu"
659
- ],
660
- "original_num_docs": 102,
661
- "effective_num_docs": 102
662
- },
663
- "lighteval|mmlu:computer_security": {
664
- "name": "mmlu:computer_security",
665
- "prompt_function": "mmlu_harness",
666
- "hf_repo": "lighteval/mmlu",
667
- "hf_subset": "computer_security",
668
- "metric": [
669
- "loglikelihood_acc"
670
- ],
671
- "hf_avail_splits": [
672
- "auxiliary_train",
673
- "test",
674
- "validation",
675
- "dev"
676
- ],
677
- "evaluation_splits": [
678
- "test"
679
- ],
680
- "few_shots_split": "dev",
681
- "few_shots_select": "sequential",
682
- "generation_size": 1,
683
- "stop_sequence": [
684
- "\n"
685
- ],
686
- "output_regex": null,
687
- "frozen": false,
688
- "suite": [
689
- "lighteval",
690
- "mmlu"
691
- ],
692
- "original_num_docs": 100,
693
- "effective_num_docs": 100
694
- },
695
- "lighteval|mmlu:conceptual_physics": {
696
- "name": "mmlu:conceptual_physics",
697
- "prompt_function": "mmlu_harness",
698
- "hf_repo": "lighteval/mmlu",
699
- "hf_subset": "conceptual_physics",
700
- "metric": [
701
- "loglikelihood_acc"
702
- ],
703
- "hf_avail_splits": [
704
- "auxiliary_train",
705
- "test",
706
- "validation",
707
- "dev"
708
- ],
709
- "evaluation_splits": [
710
- "test"
711
- ],
712
- "few_shots_split": "dev",
713
- "few_shots_select": "sequential",
714
- "generation_size": 1,
715
- "stop_sequence": [
716
- "\n"
717
- ],
718
- "output_regex": null,
719
- "frozen": false,
720
- "suite": [
721
- "lighteval",
722
- "mmlu"
723
- ],
724
- "original_num_docs": 235,
725
- "effective_num_docs": 235
726
- },
727
- "lighteval|mmlu:econometrics": {
728
- "name": "mmlu:econometrics",
729
- "prompt_function": "mmlu_harness",
730
- "hf_repo": "lighteval/mmlu",
731
- "hf_subset": "econometrics",
732
- "metric": [
733
- "loglikelihood_acc"
734
- ],
735
- "hf_avail_splits": [
736
- "auxiliary_train",
737
- "test",
738
- "validation",
739
- "dev"
740
- ],
741
- "evaluation_splits": [
742
- "test"
743
- ],
744
- "few_shots_split": "dev",
745
- "few_shots_select": "sequential",
746
- "generation_size": 1,
747
- "stop_sequence": [
748
- "\n"
749
- ],
750
- "output_regex": null,
751
- "frozen": false,
752
- "suite": [
753
- "lighteval",
754
- "mmlu"
755
- ],
756
- "original_num_docs": 114,
757
- "effective_num_docs": 114
758
- },
759
- "lighteval|mmlu:electrical_engineering": {
760
- "name": "mmlu:electrical_engineering",
761
- "prompt_function": "mmlu_harness",
762
- "hf_repo": "lighteval/mmlu",
763
- "hf_subset": "electrical_engineering",
764
- "metric": [
765
- "loglikelihood_acc"
766
- ],
767
- "hf_avail_splits": [
768
- "auxiliary_train",
769
- "test",
770
- "validation",
771
- "dev"
772
- ],
773
- "evaluation_splits": [
774
- "test"
775
- ],
776
- "few_shots_split": "dev",
777
- "few_shots_select": "sequential",
778
- "generation_size": 1,
779
- "stop_sequence": [
780
- "\n"
781
- ],
782
- "output_regex": null,
783
- "frozen": false,
784
- "suite": [
785
- "lighteval",
786
- "mmlu"
787
- ],
788
- "original_num_docs": 145,
789
- "effective_num_docs": 145
790
- },
791
- "lighteval|mmlu:elementary_mathematics": {
792
- "name": "mmlu:elementary_mathematics",
793
- "prompt_function": "mmlu_harness",
794
- "hf_repo": "lighteval/mmlu",
795
- "hf_subset": "elementary_mathematics",
796
- "metric": [
797
- "loglikelihood_acc"
798
- ],
799
- "hf_avail_splits": [
800
- "auxiliary_train",
801
- "test",
802
- "validation",
803
- "dev"
804
- ],
805
- "evaluation_splits": [
806
- "test"
807
- ],
808
- "few_shots_split": "dev",
809
- "few_shots_select": "sequential",
810
- "generation_size": 1,
811
- "stop_sequence": [
812
- "\n"
813
- ],
814
- "output_regex": null,
815
- "frozen": false,
816
- "suite": [
817
- "lighteval",
818
- "mmlu"
819
- ],
820
- "original_num_docs": 378,
821
- "effective_num_docs": 378
822
- },
823
- "lighteval|mmlu:formal_logic": {
824
- "name": "mmlu:formal_logic",
825
- "prompt_function": "mmlu_harness",
826
- "hf_repo": "lighteval/mmlu",
827
- "hf_subset": "formal_logic",
828
- "metric": [
829
- "loglikelihood_acc"
830
- ],
831
- "hf_avail_splits": [
832
- "auxiliary_train",
833
- "test",
834
- "validation",
835
- "dev"
836
- ],
837
- "evaluation_splits": [
838
- "test"
839
- ],
840
- "few_shots_split": "dev",
841
- "few_shots_select": "sequential",
842
- "generation_size": 1,
843
- "stop_sequence": [
844
- "\n"
845
- ],
846
- "output_regex": null,
847
- "frozen": false,
848
- "suite": [
849
- "lighteval",
850
- "mmlu"
851
- ],
852
- "original_num_docs": 126,
853
- "effective_num_docs": 126
854
- },
855
- "lighteval|mmlu:global_facts": {
856
- "name": "mmlu:global_facts",
857
- "prompt_function": "mmlu_harness",
858
- "hf_repo": "lighteval/mmlu",
859
- "hf_subset": "global_facts",
860
- "metric": [
861
- "loglikelihood_acc"
862
- ],
863
- "hf_avail_splits": [
864
- "auxiliary_train",
865
- "test",
866
- "validation",
867
- "dev"
868
- ],
869
- "evaluation_splits": [
870
- "test"
871
- ],
872
- "few_shots_split": "dev",
873
- "few_shots_select": "sequential",
874
- "generation_size": 1,
875
- "stop_sequence": [
876
- "\n"
877
- ],
878
- "output_regex": null,
879
- "frozen": false,
880
- "suite": [
881
- "lighteval",
882
- "mmlu"
883
- ],
884
- "original_num_docs": 100,
885
- "effective_num_docs": 100
886
- },
887
- "lighteval|mmlu:high_school_biology": {
888
- "name": "mmlu:high_school_biology",
889
- "prompt_function": "mmlu_harness",
890
- "hf_repo": "lighteval/mmlu",
891
- "hf_subset": "high_school_biology",
892
- "metric": [
893
- "loglikelihood_acc"
894
- ],
895
- "hf_avail_splits": [
896
- "auxiliary_train",
897
- "test",
898
- "validation",
899
- "dev"
900
- ],
901
- "evaluation_splits": [
902
- "test"
903
- ],
904
- "few_shots_split": "dev",
905
- "few_shots_select": "sequential",
906
- "generation_size": 1,
907
- "stop_sequence": [
908
- "\n"
909
- ],
910
- "output_regex": null,
911
- "frozen": false,
912
- "suite": [
913
- "lighteval",
914
- "mmlu"
915
- ],
916
- "original_num_docs": 310,
917
- "effective_num_docs": 310
918
- },
919
- "lighteval|mmlu:high_school_chemistry": {
920
- "name": "mmlu:high_school_chemistry",
921
- "prompt_function": "mmlu_harness",
922
- "hf_repo": "lighteval/mmlu",
923
- "hf_subset": "high_school_chemistry",
924
- "metric": [
925
- "loglikelihood_acc"
926
- ],
927
- "hf_avail_splits": [
928
- "auxiliary_train",
929
- "test",
930
- "validation",
931
- "dev"
932
- ],
933
- "evaluation_splits": [
934
- "test"
935
- ],
936
- "few_shots_split": "dev",
937
- "few_shots_select": "sequential",
938
- "generation_size": 1,
939
- "stop_sequence": [
940
- "\n"
941
- ],
942
- "output_regex": null,
943
- "frozen": false,
944
- "suite": [
945
- "lighteval",
946
- "mmlu"
947
- ],
948
- "original_num_docs": 203,
949
- "effective_num_docs": 203
950
- },
951
- "lighteval|mmlu:high_school_computer_science": {
952
- "name": "mmlu:high_school_computer_science",
953
- "prompt_function": "mmlu_harness",
954
- "hf_repo": "lighteval/mmlu",
955
- "hf_subset": "high_school_computer_science",
956
- "metric": [
957
- "loglikelihood_acc"
958
- ],
959
- "hf_avail_splits": [
960
- "auxiliary_train",
961
- "test",
962
- "validation",
963
- "dev"
964
- ],
965
- "evaluation_splits": [
966
- "test"
967
- ],
968
- "few_shots_split": "dev",
969
- "few_shots_select": "sequential",
970
- "generation_size": 1,
971
- "stop_sequence": [
972
- "\n"
973
- ],
974
- "output_regex": null,
975
- "frozen": false,
976
- "suite": [
977
- "lighteval",
978
- "mmlu"
979
- ],
980
- "original_num_docs": 100,
981
- "effective_num_docs": 100
982
- },
983
- "lighteval|mmlu:high_school_european_history": {
984
- "name": "mmlu:high_school_european_history",
985
- "prompt_function": "mmlu_harness",
986
- "hf_repo": "lighteval/mmlu",
987
- "hf_subset": "high_school_european_history",
988
- "metric": [
989
- "loglikelihood_acc"
990
- ],
991
- "hf_avail_splits": [
992
- "auxiliary_train",
993
- "test",
994
- "validation",
995
- "dev"
996
- ],
997
- "evaluation_splits": [
998
- "test"
999
- ],
1000
- "few_shots_split": "dev",
1001
- "few_shots_select": "sequential",
1002
- "generation_size": 1,
1003
- "stop_sequence": [
1004
- "\n"
1005
- ],
1006
- "output_regex": null,
1007
- "frozen": false,
1008
- "suite": [
1009
- "lighteval",
1010
- "mmlu"
1011
- ],
1012
- "original_num_docs": 165,
1013
- "effective_num_docs": 165
1014
- },
1015
- "lighteval|mmlu:high_school_geography": {
1016
- "name": "mmlu:high_school_geography",
1017
- "prompt_function": "mmlu_harness",
1018
- "hf_repo": "lighteval/mmlu",
1019
- "hf_subset": "high_school_geography",
1020
- "metric": [
1021
- "loglikelihood_acc"
1022
- ],
1023
- "hf_avail_splits": [
1024
- "auxiliary_train",
1025
- "test",
1026
- "validation",
1027
- "dev"
1028
- ],
1029
- "evaluation_splits": [
1030
- "test"
1031
- ],
1032
- "few_shots_split": "dev",
1033
- "few_shots_select": "sequential",
1034
- "generation_size": 1,
1035
- "stop_sequence": [
1036
- "\n"
1037
- ],
1038
- "output_regex": null,
1039
- "frozen": false,
1040
- "suite": [
1041
- "lighteval",
1042
- "mmlu"
1043
- ],
1044
- "original_num_docs": 198,
1045
- "effective_num_docs": 198
1046
- },
1047
- "lighteval|mmlu:high_school_government_and_politics": {
1048
- "name": "mmlu:high_school_government_and_politics",
1049
- "prompt_function": "mmlu_harness",
1050
- "hf_repo": "lighteval/mmlu",
1051
- "hf_subset": "high_school_government_and_politics",
1052
- "metric": [
1053
- "loglikelihood_acc"
1054
- ],
1055
- "hf_avail_splits": [
1056
- "auxiliary_train",
1057
- "test",
1058
- "validation",
1059
- "dev"
1060
- ],
1061
- "evaluation_splits": [
1062
- "test"
1063
- ],
1064
- "few_shots_split": "dev",
1065
- "few_shots_select": "sequential",
1066
- "generation_size": 1,
1067
- "stop_sequence": [
1068
- "\n"
1069
- ],
1070
- "output_regex": null,
1071
- "frozen": false,
1072
- "suite": [
1073
- "lighteval",
1074
- "mmlu"
1075
- ],
1076
- "original_num_docs": 193,
1077
- "effective_num_docs": 193
1078
- },
1079
- "lighteval|mmlu:high_school_macroeconomics": {
1080
- "name": "mmlu:high_school_macroeconomics",
1081
- "prompt_function": "mmlu_harness",
1082
- "hf_repo": "lighteval/mmlu",
1083
- "hf_subset": "high_school_macroeconomics",
1084
- "metric": [
1085
- "loglikelihood_acc"
1086
- ],
1087
- "hf_avail_splits": [
1088
- "auxiliary_train",
1089
- "test",
1090
- "validation",
1091
- "dev"
1092
- ],
1093
- "evaluation_splits": [
1094
- "test"
1095
- ],
1096
- "few_shots_split": "dev",
1097
- "few_shots_select": "sequential",
1098
- "generation_size": 1,
1099
- "stop_sequence": [
1100
- "\n"
1101
- ],
1102
- "output_regex": null,
1103
- "frozen": false,
1104
- "suite": [
1105
- "lighteval",
1106
- "mmlu"
1107
- ],
1108
- "original_num_docs": 390,
1109
- "effective_num_docs": 390
1110
- },
1111
- "lighteval|mmlu:high_school_mathematics": {
1112
- "name": "mmlu:high_school_mathematics",
1113
- "prompt_function": "mmlu_harness",
1114
- "hf_repo": "lighteval/mmlu",
1115
- "hf_subset": "high_school_mathematics",
1116
- "metric": [
1117
- "loglikelihood_acc"
1118
- ],
1119
- "hf_avail_splits": [
1120
- "auxiliary_train",
1121
- "test",
1122
- "validation",
1123
- "dev"
1124
- ],
1125
- "evaluation_splits": [
1126
- "test"
1127
- ],
1128
- "few_shots_split": "dev",
1129
- "few_shots_select": "sequential",
1130
- "generation_size": 1,
1131
- "stop_sequence": [
1132
- "\n"
1133
- ],
1134
- "output_regex": null,
1135
- "frozen": false,
1136
- "suite": [
1137
- "lighteval",
1138
- "mmlu"
1139
- ],
1140
- "original_num_docs": 270,
1141
- "effective_num_docs": 270
1142
- },
1143
- "lighteval|mmlu:high_school_microeconomics": {
1144
- "name": "mmlu:high_school_microeconomics",
1145
- "prompt_function": "mmlu_harness",
1146
- "hf_repo": "lighteval/mmlu",
1147
- "hf_subset": "high_school_microeconomics",
1148
- "metric": [
1149
- "loglikelihood_acc"
1150
- ],
1151
- "hf_avail_splits": [
1152
- "auxiliary_train",
1153
- "test",
1154
- "validation",
1155
- "dev"
1156
- ],
1157
- "evaluation_splits": [
1158
- "test"
1159
- ],
1160
- "few_shots_split": "dev",
1161
- "few_shots_select": "sequential",
1162
- "generation_size": 1,
1163
- "stop_sequence": [
1164
- "\n"
1165
- ],
1166
- "output_regex": null,
1167
- "frozen": false,
1168
- "suite": [
1169
- "lighteval",
1170
- "mmlu"
1171
- ],
1172
- "original_num_docs": 238,
1173
- "effective_num_docs": 238
1174
- },
1175
- "lighteval|mmlu:high_school_physics": {
1176
- "name": "mmlu:high_school_physics",
1177
- "prompt_function": "mmlu_harness",
1178
- "hf_repo": "lighteval/mmlu",
1179
- "hf_subset": "high_school_physics",
1180
- "metric": [
1181
- "loglikelihood_acc"
1182
- ],
1183
- "hf_avail_splits": [
1184
- "auxiliary_train",
1185
- "test",
1186
- "validation",
1187
- "dev"
1188
- ],
1189
- "evaluation_splits": [
1190
- "test"
1191
- ],
1192
- "few_shots_split": "dev",
1193
- "few_shots_select": "sequential",
1194
- "generation_size": 1,
1195
- "stop_sequence": [
1196
- "\n"
1197
- ],
1198
- "output_regex": null,
1199
- "frozen": false,
1200
- "suite": [
1201
- "lighteval",
1202
- "mmlu"
1203
- ],
1204
- "original_num_docs": 151,
1205
- "effective_num_docs": 151
1206
- },
1207
- "lighteval|mmlu:high_school_psychology": {
1208
- "name": "mmlu:high_school_psychology",
1209
- "prompt_function": "mmlu_harness",
1210
- "hf_repo": "lighteval/mmlu",
1211
- "hf_subset": "high_school_psychology",
1212
- "metric": [
1213
- "loglikelihood_acc"
1214
- ],
1215
- "hf_avail_splits": [
1216
- "auxiliary_train",
1217
- "test",
1218
- "validation",
1219
- "dev"
1220
- ],
1221
- "evaluation_splits": [
1222
- "test"
1223
- ],
1224
- "few_shots_split": "dev",
1225
- "few_shots_select": "sequential",
1226
- "generation_size": 1,
1227
- "stop_sequence": [
1228
- "\n"
1229
- ],
1230
- "output_regex": null,
1231
- "frozen": false,
1232
- "suite": [
1233
- "lighteval",
1234
- "mmlu"
1235
- ],
1236
- "original_num_docs": 545,
1237
- "effective_num_docs": 545
1238
- },
1239
- "lighteval|mmlu:high_school_statistics": {
1240
- "name": "mmlu:high_school_statistics",
1241
- "prompt_function": "mmlu_harness",
1242
- "hf_repo": "lighteval/mmlu",
1243
- "hf_subset": "high_school_statistics",
1244
- "metric": [
1245
- "loglikelihood_acc"
1246
- ],
1247
- "hf_avail_splits": [
1248
- "auxiliary_train",
1249
- "test",
1250
- "validation",
1251
- "dev"
1252
- ],
1253
- "evaluation_splits": [
1254
- "test"
1255
- ],
1256
- "few_shots_split": "dev",
1257
- "few_shots_select": "sequential",
1258
- "generation_size": 1,
1259
- "stop_sequence": [
1260
- "\n"
1261
- ],
1262
- "output_regex": null,
1263
- "frozen": false,
1264
- "suite": [
1265
- "lighteval",
1266
- "mmlu"
1267
- ],
1268
- "original_num_docs": 216,
1269
- "effective_num_docs": 216
1270
- },
1271
- "lighteval|mmlu:high_school_us_history": {
1272
- "name": "mmlu:high_school_us_history",
1273
- "prompt_function": "mmlu_harness",
1274
- "hf_repo": "lighteval/mmlu",
1275
- "hf_subset": "high_school_us_history",
1276
- "metric": [
1277
- "loglikelihood_acc"
1278
- ],
1279
- "hf_avail_splits": [
1280
- "auxiliary_train",
1281
- "test",
1282
- "validation",
1283
- "dev"
1284
- ],
1285
- "evaluation_splits": [
1286
- "test"
1287
- ],
1288
- "few_shots_split": "dev",
1289
- "few_shots_select": "sequential",
1290
- "generation_size": 1,
1291
- "stop_sequence": [
1292
- "\n"
1293
- ],
1294
- "output_regex": null,
1295
- "frozen": false,
1296
- "suite": [
1297
- "lighteval",
1298
- "mmlu"
1299
- ],
1300
- "original_num_docs": 204,
1301
- "effective_num_docs": 204
1302
- },
1303
- "lighteval|mmlu:high_school_world_history": {
1304
- "name": "mmlu:high_school_world_history",
1305
- "prompt_function": "mmlu_harness",
1306
- "hf_repo": "lighteval/mmlu",
1307
- "hf_subset": "high_school_world_history",
1308
- "metric": [
1309
- "loglikelihood_acc"
1310
- ],
1311
- "hf_avail_splits": [
1312
- "auxiliary_train",
1313
- "test",
1314
- "validation",
1315
- "dev"
1316
- ],
1317
- "evaluation_splits": [
1318
- "test"
1319
- ],
1320
- "few_shots_split": "dev",
1321
- "few_shots_select": "sequential",
1322
- "generation_size": 1,
1323
- "stop_sequence": [
1324
- "\n"
1325
- ],
1326
- "output_regex": null,
1327
- "frozen": false,
1328
- "suite": [
1329
- "lighteval",
1330
- "mmlu"
1331
- ],
1332
- "original_num_docs": 237,
1333
- "effective_num_docs": 237
1334
- },
1335
- "lighteval|mmlu:human_aging": {
1336
- "name": "mmlu:human_aging",
1337
- "prompt_function": "mmlu_harness",
1338
- "hf_repo": "lighteval/mmlu",
1339
- "hf_subset": "human_aging",
1340
- "metric": [
1341
- "loglikelihood_acc"
1342
- ],
1343
- "hf_avail_splits": [
1344
- "auxiliary_train",
1345
- "test",
1346
- "validation",
1347
- "dev"
1348
- ],
1349
- "evaluation_splits": [
1350
- "test"
1351
- ],
1352
- "few_shots_split": "dev",
1353
- "few_shots_select": "sequential",
1354
- "generation_size": 1,
1355
- "stop_sequence": [
1356
- "\n"
1357
- ],
1358
- "output_regex": null,
1359
- "frozen": false,
1360
- "suite": [
1361
- "lighteval",
1362
- "mmlu"
1363
- ],
1364
- "original_num_docs": 223,
1365
- "effective_num_docs": 223
1366
- },
1367
- "lighteval|mmlu:human_sexuality": {
1368
- "name": "mmlu:human_sexuality",
1369
- "prompt_function": "mmlu_harness",
1370
- "hf_repo": "lighteval/mmlu",
1371
- "hf_subset": "human_sexuality",
1372
- "metric": [
1373
- "loglikelihood_acc"
1374
- ],
1375
- "hf_avail_splits": [
1376
- "auxiliary_train",
1377
- "test",
1378
- "validation",
1379
- "dev"
1380
- ],
1381
- "evaluation_splits": [
1382
- "test"
1383
- ],
1384
- "few_shots_split": "dev",
1385
- "few_shots_select": "sequential",
1386
- "generation_size": 1,
1387
- "stop_sequence": [
1388
- "\n"
1389
- ],
1390
- "output_regex": null,
1391
- "frozen": false,
1392
- "suite": [
1393
- "lighteval",
1394
- "mmlu"
1395
- ],
1396
- "original_num_docs": 131,
1397
- "effective_num_docs": 131
1398
- },
1399
- "lighteval|mmlu:international_law": {
1400
- "name": "mmlu:international_law",
1401
- "prompt_function": "mmlu_harness",
1402
- "hf_repo": "lighteval/mmlu",
1403
- "hf_subset": "international_law",
1404
- "metric": [
1405
- "loglikelihood_acc"
1406
- ],
1407
- "hf_avail_splits": [
1408
- "auxiliary_train",
1409
- "test",
1410
- "validation",
1411
- "dev"
1412
- ],
1413
- "evaluation_splits": [
1414
- "test"
1415
- ],
1416
- "few_shots_split": "dev",
1417
- "few_shots_select": "sequential",
1418
- "generation_size": 1,
1419
- "stop_sequence": [
1420
- "\n"
1421
- ],
1422
- "output_regex": null,
1423
- "frozen": false,
1424
- "suite": [
1425
- "lighteval",
1426
- "mmlu"
1427
- ],
1428
- "original_num_docs": 121,
1429
- "effective_num_docs": 121
1430
- },
1431
- "lighteval|mmlu:jurisprudence": {
1432
- "name": "mmlu:jurisprudence",
1433
- "prompt_function": "mmlu_harness",
1434
- "hf_repo": "lighteval/mmlu",
1435
- "hf_subset": "jurisprudence",
1436
- "metric": [
1437
- "loglikelihood_acc"
1438
- ],
1439
- "hf_avail_splits": [
1440
- "auxiliary_train",
1441
- "test",
1442
- "validation",
1443
- "dev"
1444
- ],
1445
- "evaluation_splits": [
1446
- "test"
1447
- ],
1448
- "few_shots_split": "dev",
1449
- "few_shots_select": "sequential",
1450
- "generation_size": 1,
1451
- "stop_sequence": [
1452
- "\n"
1453
- ],
1454
- "output_regex": null,
1455
- "frozen": false,
1456
- "suite": [
1457
- "lighteval",
1458
- "mmlu"
1459
- ],
1460
- "original_num_docs": 108,
1461
- "effective_num_docs": 108
1462
- },
1463
- "lighteval|mmlu:logical_fallacies": {
1464
- "name": "mmlu:logical_fallacies",
1465
- "prompt_function": "mmlu_harness",
1466
- "hf_repo": "lighteval/mmlu",
1467
- "hf_subset": "logical_fallacies",
1468
- "metric": [
1469
- "loglikelihood_acc"
1470
- ],
1471
- "hf_avail_splits": [
1472
- "auxiliary_train",
1473
- "test",
1474
- "validation",
1475
- "dev"
1476
- ],
1477
- "evaluation_splits": [
1478
- "test"
1479
- ],
1480
- "few_shots_split": "dev",
1481
- "few_shots_select": "sequential",
1482
- "generation_size": 1,
1483
- "stop_sequence": [
1484
- "\n"
1485
- ],
1486
- "output_regex": null,
1487
- "frozen": false,
1488
- "suite": [
1489
- "lighteval",
1490
- "mmlu"
1491
- ],
1492
- "original_num_docs": 163,
1493
- "effective_num_docs": 163
1494
- },
1495
- "lighteval|mmlu:machine_learning": {
1496
- "name": "mmlu:machine_learning",
1497
- "prompt_function": "mmlu_harness",
1498
- "hf_repo": "lighteval/mmlu",
1499
- "hf_subset": "machine_learning",
1500
- "metric": [
1501
- "loglikelihood_acc"
1502
- ],
1503
- "hf_avail_splits": [
1504
- "auxiliary_train",
1505
- "test",
1506
- "validation",
1507
- "dev"
1508
- ],
1509
- "evaluation_splits": [
1510
- "test"
1511
- ],
1512
- "few_shots_split": "dev",
1513
- "few_shots_select": "sequential",
1514
- "generation_size": 1,
1515
- "stop_sequence": [
1516
- "\n"
1517
- ],
1518
- "output_regex": null,
1519
- "frozen": false,
1520
- "suite": [
1521
- "lighteval",
1522
- "mmlu"
1523
- ],
1524
- "original_num_docs": 112,
1525
- "effective_num_docs": 112
1526
- },
1527
- "lighteval|mmlu:management": {
1528
- "name": "mmlu:management",
1529
- "prompt_function": "mmlu_harness",
1530
- "hf_repo": "lighteval/mmlu",
1531
- "hf_subset": "management",
1532
- "metric": [
1533
- "loglikelihood_acc"
1534
- ],
1535
- "hf_avail_splits": [
1536
- "auxiliary_train",
1537
- "test",
1538
- "validation",
1539
- "dev"
1540
- ],
1541
- "evaluation_splits": [
1542
- "test"
1543
- ],
1544
- "few_shots_split": "dev",
1545
- "few_shots_select": "sequential",
1546
- "generation_size": 1,
1547
- "stop_sequence": [
1548
- "\n"
1549
- ],
1550
- "output_regex": null,
1551
- "frozen": false,
1552
- "suite": [
1553
- "lighteval",
1554
- "mmlu"
1555
- ],
1556
- "original_num_docs": 103,
1557
- "effective_num_docs": 103
1558
- },
1559
- "lighteval|mmlu:marketing": {
1560
- "name": "mmlu:marketing",
1561
- "prompt_function": "mmlu_harness",
1562
- "hf_repo": "lighteval/mmlu",
1563
- "hf_subset": "marketing",
1564
- "metric": [
1565
- "loglikelihood_acc"
1566
- ],
1567
- "hf_avail_splits": [
1568
- "auxiliary_train",
1569
- "test",
1570
- "validation",
1571
- "dev"
1572
- ],
1573
- "evaluation_splits": [
1574
- "test"
1575
- ],
1576
- "few_shots_split": "dev",
1577
- "few_shots_select": "sequential",
1578
- "generation_size": 1,
1579
- "stop_sequence": [
1580
- "\n"
1581
- ],
1582
- "output_regex": null,
1583
- "frozen": false,
1584
- "suite": [
1585
- "lighteval",
1586
- "mmlu"
1587
- ],
1588
- "original_num_docs": 234,
1589
- "effective_num_docs": 234
1590
- },
1591
- "lighteval|mmlu:medical_genetics": {
1592
- "name": "mmlu:medical_genetics",
1593
- "prompt_function": "mmlu_harness",
1594
- "hf_repo": "lighteval/mmlu",
1595
- "hf_subset": "medical_genetics",
1596
- "metric": [
1597
- "loglikelihood_acc"
1598
- ],
1599
- "hf_avail_splits": [
1600
- "auxiliary_train",
1601
- "test",
1602
- "validation",
1603
- "dev"
1604
- ],
1605
- "evaluation_splits": [
1606
- "test"
1607
- ],
1608
- "few_shots_split": "dev",
1609
- "few_shots_select": "sequential",
1610
- "generation_size": 1,
1611
- "stop_sequence": [
1612
- "\n"
1613
- ],
1614
- "output_regex": null,
1615
- "frozen": false,
1616
- "suite": [
1617
- "lighteval",
1618
- "mmlu"
1619
- ],
1620
- "original_num_docs": 100,
1621
- "effective_num_docs": 100
1622
- },
1623
- "lighteval|mmlu:miscellaneous": {
1624
- "name": "mmlu:miscellaneous",
1625
- "prompt_function": "mmlu_harness",
1626
- "hf_repo": "lighteval/mmlu",
1627
- "hf_subset": "miscellaneous",
1628
- "metric": [
1629
- "loglikelihood_acc"
1630
- ],
1631
- "hf_avail_splits": [
1632
- "auxiliary_train",
1633
- "test",
1634
- "validation",
1635
- "dev"
1636
- ],
1637
- "evaluation_splits": [
1638
- "test"
1639
- ],
1640
- "few_shots_split": "dev",
1641
- "few_shots_select": "sequential",
1642
- "generation_size": 1,
1643
- "stop_sequence": [
1644
- "\n"
1645
- ],
1646
- "output_regex": null,
1647
- "frozen": false,
1648
- "suite": [
1649
- "lighteval",
1650
- "mmlu"
1651
- ],
1652
- "original_num_docs": 783,
1653
- "effective_num_docs": 783
1654
- },
1655
- "lighteval|mmlu:moral_disputes": {
1656
- "name": "mmlu:moral_disputes",
1657
- "prompt_function": "mmlu_harness",
1658
- "hf_repo": "lighteval/mmlu",
1659
- "hf_subset": "moral_disputes",
1660
- "metric": [
1661
- "loglikelihood_acc"
1662
- ],
1663
- "hf_avail_splits": [
1664
- "auxiliary_train",
1665
- "test",
1666
- "validation",
1667
- "dev"
1668
- ],
1669
- "evaluation_splits": [
1670
- "test"
1671
- ],
1672
- "few_shots_split": "dev",
1673
- "few_shots_select": "sequential",
1674
- "generation_size": 1,
1675
- "stop_sequence": [
1676
- "\n"
1677
- ],
1678
- "output_regex": null,
1679
- "frozen": false,
1680
- "suite": [
1681
- "lighteval",
1682
- "mmlu"
1683
- ],
1684
- "original_num_docs": 346,
1685
- "effective_num_docs": 346
1686
- },
1687
- "lighteval|mmlu:moral_scenarios": {
1688
- "name": "mmlu:moral_scenarios",
1689
- "prompt_function": "mmlu_harness",
1690
- "hf_repo": "lighteval/mmlu",
1691
- "hf_subset": "moral_scenarios",
1692
- "metric": [
1693
- "loglikelihood_acc"
1694
- ],
1695
- "hf_avail_splits": [
1696
- "auxiliary_train",
1697
- "test",
1698
- "validation",
1699
- "dev"
1700
- ],
1701
- "evaluation_splits": [
1702
- "test"
1703
- ],
1704
- "few_shots_split": "dev",
1705
- "few_shots_select": "sequential",
1706
- "generation_size": 1,
1707
- "stop_sequence": [
1708
- "\n"
1709
- ],
1710
- "output_regex": null,
1711
- "frozen": false,
1712
- "suite": [
1713
- "lighteval",
1714
- "mmlu"
1715
- ],
1716
- "original_num_docs": 895,
1717
- "effective_num_docs": 895
1718
- },
1719
- "lighteval|mmlu:nutrition": {
1720
- "name": "mmlu:nutrition",
1721
- "prompt_function": "mmlu_harness",
1722
- "hf_repo": "lighteval/mmlu",
1723
- "hf_subset": "nutrition",
1724
- "metric": [
1725
- "loglikelihood_acc"
1726
- ],
1727
- "hf_avail_splits": [
1728
- "auxiliary_train",
1729
- "test",
1730
- "validation",
1731
- "dev"
1732
- ],
1733
- "evaluation_splits": [
1734
- "test"
1735
- ],
1736
- "few_shots_split": "dev",
1737
- "few_shots_select": "sequential",
1738
- "generation_size": 1,
1739
- "stop_sequence": [
1740
- "\n"
1741
- ],
1742
- "output_regex": null,
1743
- "frozen": false,
1744
- "suite": [
1745
- "lighteval",
1746
- "mmlu"
1747
- ],
1748
- "original_num_docs": 306,
1749
- "effective_num_docs": 306
1750
- },
1751
- "lighteval|mmlu:philosophy": {
1752
- "name": "mmlu:philosophy",
1753
- "prompt_function": "mmlu_harness",
1754
- "hf_repo": "lighteval/mmlu",
1755
- "hf_subset": "philosophy",
1756
- "metric": [
1757
- "loglikelihood_acc"
1758
- ],
1759
- "hf_avail_splits": [
1760
- "auxiliary_train",
1761
- "test",
1762
- "validation",
1763
- "dev"
1764
- ],
1765
- "evaluation_splits": [
1766
- "test"
1767
- ],
1768
- "few_shots_split": "dev",
1769
- "few_shots_select": "sequential",
1770
- "generation_size": 1,
1771
- "stop_sequence": [
1772
- "\n"
1773
- ],
1774
- "output_regex": null,
1775
- "frozen": false,
1776
- "suite": [
1777
- "lighteval",
1778
- "mmlu"
1779
- ],
1780
- "original_num_docs": 311,
1781
- "effective_num_docs": 311
1782
- },
1783
- "lighteval|mmlu:prehistory": {
1784
- "name": "mmlu:prehistory",
1785
- "prompt_function": "mmlu_harness",
1786
- "hf_repo": "lighteval/mmlu",
1787
- "hf_subset": "prehistory",
1788
- "metric": [
1789
- "loglikelihood_acc"
1790
- ],
1791
- "hf_avail_splits": [
1792
- "auxiliary_train",
1793
- "test",
1794
- "validation",
1795
- "dev"
1796
- ],
1797
- "evaluation_splits": [
1798
- "test"
1799
- ],
1800
- "few_shots_split": "dev",
1801
- "few_shots_select": "sequential",
1802
- "generation_size": 1,
1803
- "stop_sequence": [
1804
- "\n"
1805
- ],
1806
- "output_regex": null,
1807
- "frozen": false,
1808
- "suite": [
1809
- "lighteval",
1810
- "mmlu"
1811
- ],
1812
- "original_num_docs": 324,
1813
- "effective_num_docs": 324
1814
- },
1815
- "lighteval|mmlu:professional_accounting": {
1816
- "name": "mmlu:professional_accounting",
1817
- "prompt_function": "mmlu_harness",
1818
- "hf_repo": "lighteval/mmlu",
1819
- "hf_subset": "professional_accounting",
1820
- "metric": [
1821
- "loglikelihood_acc"
1822
- ],
1823
- "hf_avail_splits": [
1824
- "auxiliary_train",
1825
- "test",
1826
- "validation",
1827
- "dev"
1828
- ],
1829
- "evaluation_splits": [
1830
- "test"
1831
- ],
1832
- "few_shots_split": "dev",
1833
- "few_shots_select": "sequential",
1834
- "generation_size": 1,
1835
- "stop_sequence": [
1836
- "\n"
1837
- ],
1838
- "output_regex": null,
1839
- "frozen": false,
1840
- "suite": [
1841
- "lighteval",
1842
- "mmlu"
1843
- ],
1844
- "original_num_docs": 282,
1845
- "effective_num_docs": 282
1846
- },
1847
- "lighteval|mmlu:professional_law": {
1848
- "name": "mmlu:professional_law",
1849
- "prompt_function": "mmlu_harness",
1850
- "hf_repo": "lighteval/mmlu",
1851
- "hf_subset": "professional_law",
1852
- "metric": [
1853
- "loglikelihood_acc"
1854
- ],
1855
- "hf_avail_splits": [
1856
- "auxiliary_train",
1857
- "test",
1858
- "validation",
1859
- "dev"
1860
- ],
1861
- "evaluation_splits": [
1862
- "test"
1863
- ],
1864
- "few_shots_split": "dev",
1865
- "few_shots_select": "sequential",
1866
- "generation_size": 1,
1867
- "stop_sequence": [
1868
- "\n"
1869
- ],
1870
- "output_regex": null,
1871
- "frozen": false,
1872
- "suite": [
1873
- "lighteval",
1874
- "mmlu"
1875
- ],
1876
- "original_num_docs": 1534,
1877
- "effective_num_docs": 1534
1878
- },
1879
- "lighteval|mmlu:professional_medicine": {
1880
- "name": "mmlu:professional_medicine",
1881
- "prompt_function": "mmlu_harness",
1882
- "hf_repo": "lighteval/mmlu",
1883
- "hf_subset": "professional_medicine",
1884
- "metric": [
1885
- "loglikelihood_acc"
1886
- ],
1887
- "hf_avail_splits": [
1888
- "auxiliary_train",
1889
- "test",
1890
- "validation",
1891
- "dev"
1892
- ],
1893
- "evaluation_splits": [
1894
- "test"
1895
- ],
1896
- "few_shots_split": "dev",
1897
- "few_shots_select": "sequential",
1898
- "generation_size": 1,
1899
- "stop_sequence": [
1900
- "\n"
1901
- ],
1902
- "output_regex": null,
1903
- "frozen": false,
1904
- "suite": [
1905
- "lighteval",
1906
- "mmlu"
1907
- ],
1908
- "original_num_docs": 272,
1909
- "effective_num_docs": 272
1910
- },
1911
- "lighteval|mmlu:professional_psychology": {
1912
- "name": "mmlu:professional_psychology",
1913
- "prompt_function": "mmlu_harness",
1914
- "hf_repo": "lighteval/mmlu",
1915
- "hf_subset": "professional_psychology",
1916
- "metric": [
1917
- "loglikelihood_acc"
1918
- ],
1919
- "hf_avail_splits": [
1920
- "auxiliary_train",
1921
- "test",
1922
- "validation",
1923
- "dev"
1924
- ],
1925
- "evaluation_splits": [
1926
- "test"
1927
- ],
1928
- "few_shots_split": "dev",
1929
- "few_shots_select": "sequential",
1930
- "generation_size": 1,
1931
- "stop_sequence": [
1932
- "\n"
1933
- ],
1934
- "output_regex": null,
1935
- "frozen": false,
1936
- "suite": [
1937
- "lighteval",
1938
- "mmlu"
1939
- ],
1940
- "original_num_docs": 612,
1941
- "effective_num_docs": 612
1942
- },
1943
- "lighteval|mmlu:public_relations": {
1944
- "name": "mmlu:public_relations",
1945
- "prompt_function": "mmlu_harness",
1946
- "hf_repo": "lighteval/mmlu",
1947
- "hf_subset": "public_relations",
1948
- "metric": [
1949
- "loglikelihood_acc"
1950
- ],
1951
- "hf_avail_splits": [
1952
- "auxiliary_train",
1953
- "test",
1954
- "validation",
1955
- "dev"
1956
- ],
1957
- "evaluation_splits": [
1958
- "test"
1959
- ],
1960
- "few_shots_split": "dev",
1961
- "few_shots_select": "sequential",
1962
- "generation_size": 1,
1963
- "stop_sequence": [
1964
- "\n"
1965
- ],
1966
- "output_regex": null,
1967
- "frozen": false,
1968
- "suite": [
1969
- "lighteval",
1970
- "mmlu"
1971
- ],
1972
- "original_num_docs": 110,
1973
- "effective_num_docs": 110
1974
- },
1975
- "lighteval|mmlu:security_studies": {
1976
- "name": "mmlu:security_studies",
1977
- "prompt_function": "mmlu_harness",
1978
- "hf_repo": "lighteval/mmlu",
1979
- "hf_subset": "security_studies",
1980
- "metric": [
1981
- "loglikelihood_acc"
1982
- ],
1983
- "hf_avail_splits": [
1984
- "auxiliary_train",
1985
- "test",
1986
- "validation",
1987
- "dev"
1988
- ],
1989
- "evaluation_splits": [
1990
- "test"
1991
- ],
1992
- "few_shots_split": "dev",
1993
- "few_shots_select": "sequential",
1994
- "generation_size": 1,
1995
- "stop_sequence": [
1996
- "\n"
1997
- ],
1998
- "output_regex": null,
1999
- "frozen": false,
2000
- "suite": [
2001
- "lighteval",
2002
- "mmlu"
2003
- ],
2004
- "original_num_docs": 245,
2005
- "effective_num_docs": 245
2006
- },
2007
- "lighteval|mmlu:sociology": {
2008
- "name": "mmlu:sociology",
2009
- "prompt_function": "mmlu_harness",
2010
- "hf_repo": "lighteval/mmlu",
2011
- "hf_subset": "sociology",
2012
- "metric": [
2013
- "loglikelihood_acc"
2014
- ],
2015
- "hf_avail_splits": [
2016
- "auxiliary_train",
2017
- "test",
2018
- "validation",
2019
- "dev"
2020
- ],
2021
- "evaluation_splits": [
2022
- "test"
2023
- ],
2024
- "few_shots_split": "dev",
2025
- "few_shots_select": "sequential",
2026
- "generation_size": 1,
2027
- "stop_sequence": [
2028
- "\n"
2029
- ],
2030
- "output_regex": null,
2031
- "frozen": false,
2032
- "suite": [
2033
- "lighteval",
2034
- "mmlu"
2035
- ],
2036
- "original_num_docs": 201,
2037
- "effective_num_docs": 201
2038
- },
2039
- "lighteval|mmlu:us_foreign_policy": {
2040
- "name": "mmlu:us_foreign_policy",
2041
- "prompt_function": "mmlu_harness",
2042
- "hf_repo": "lighteval/mmlu",
2043
- "hf_subset": "us_foreign_policy",
2044
- "metric": [
2045
- "loglikelihood_acc"
2046
- ],
2047
- "hf_avail_splits": [
2048
- "auxiliary_train",
2049
- "test",
2050
- "validation",
2051
- "dev"
2052
- ],
2053
- "evaluation_splits": [
2054
- "test"
2055
- ],
2056
- "few_shots_split": "dev",
2057
- "few_shots_select": "sequential",
2058
- "generation_size": 1,
2059
- "stop_sequence": [
2060
- "\n"
2061
- ],
2062
- "output_regex": null,
2063
- "frozen": false,
2064
- "suite": [
2065
- "lighteval",
2066
- "mmlu"
2067
- ],
2068
- "original_num_docs": 100,
2069
- "effective_num_docs": 100
2070
- },
2071
- "lighteval|mmlu:virology": {
2072
- "name": "mmlu:virology",
2073
- "prompt_function": "mmlu_harness",
2074
- "hf_repo": "lighteval/mmlu",
2075
- "hf_subset": "virology",
2076
- "metric": [
2077
- "loglikelihood_acc"
2078
- ],
2079
- "hf_avail_splits": [
2080
- "auxiliary_train",
2081
- "test",
2082
- "validation",
2083
- "dev"
2084
- ],
2085
- "evaluation_splits": [
2086
- "test"
2087
- ],
2088
- "few_shots_split": "dev",
2089
- "few_shots_select": "sequential",
2090
- "generation_size": 1,
2091
- "stop_sequence": [
2092
- "\n"
2093
- ],
2094
- "output_regex": null,
2095
- "frozen": false,
2096
- "suite": [
2097
- "lighteval",
2098
- "mmlu"
2099
- ],
2100
- "original_num_docs": 166,
2101
- "effective_num_docs": 166
2102
- },
2103
- "lighteval|mmlu:world_religions": {
2104
- "name": "mmlu:world_religions",
2105
- "prompt_function": "mmlu_harness",
2106
- "hf_repo": "lighteval/mmlu",
2107
- "hf_subset": "world_religions",
2108
- "metric": [
2109
- "loglikelihood_acc"
2110
- ],
2111
- "hf_avail_splits": [
2112
- "auxiliary_train",
2113
- "test",
2114
- "validation",
2115
- "dev"
2116
- ],
2117
- "evaluation_splits": [
2118
- "test"
2119
- ],
2120
- "few_shots_split": "dev",
2121
- "few_shots_select": "sequential",
2122
- "generation_size": 1,
2123
- "stop_sequence": [
2124
- "\n"
2125
- ],
2126
- "output_regex": null,
2127
- "frozen": false,
2128
- "suite": [
2129
- "lighteval",
2130
- "mmlu"
2131
- ],
2132
- "original_num_docs": 171,
2133
- "effective_num_docs": 171
2134
- }
2135
- },
2136
- "summary_tasks": {
2137
- "lighteval|mmlu:abstract_algebra|5": {
2138
- "hashes": {
2139
- "hash_examples": "4c76229e00c9c0e9",
2140
- "hash_full_prompts": "a45d01c3409c889c",
2141
- "hash_input_tokens": "5a65ee02e93a639b",
2142
- "hash_cont_tokens": "00520b0ec06da34f"
2143
- },
2144
- "truncated": 0,
2145
- "non_truncated": 100,
2146
- "padded": 400,
2147
- "non_padded": 0,
2148
- "effective_few_shots": 5.0,
2149
- "num_truncated_few_shots": 0
2150
- },
2151
- "lighteval|mmlu:anatomy|5": {
2152
- "hashes": {
2153
- "hash_examples": "6a1f8104dccbd33b",
2154
- "hash_full_prompts": "e245c6600e03cc32",
2155
- "hash_input_tokens": "6f703823c87cd6b5",
2156
- "hash_cont_tokens": "263324e6ce7f9b36"
2157
- },
2158
- "truncated": 0,
2159
- "non_truncated": 135,
2160
- "padded": 540,
2161
- "non_padded": 0,
2162
- "effective_few_shots": 5.0,
2163
- "num_truncated_few_shots": 0
2164
- },
2165
- "lighteval|mmlu:astronomy|5": {
2166
- "hashes": {
2167
- "hash_examples": "1302effa3a76ce4c",
2168
- "hash_full_prompts": "390f9bddf857ad04",
2169
- "hash_input_tokens": "f8696ce4ddd8778e",
2170
- "hash_cont_tokens": "18ba399c6801138e"
2171
- },
2172
- "truncated": 0,
2173
- "non_truncated": 152,
2174
- "padded": 608,
2175
- "non_padded": 0,
2176
- "effective_few_shots": 5.0,
2177
- "num_truncated_few_shots": 0
2178
- },
2179
- "lighteval|mmlu:business_ethics|5": {
2180
- "hashes": {
2181
- "hash_examples": "03cb8bce5336419a",
2182
- "hash_full_prompts": "5504f893bc4f2fa1",
2183
- "hash_input_tokens": "1d6700b9ef7f5021",
2184
- "hash_cont_tokens": "00520b0ec06da34f"
2185
- },
2186
- "truncated": 0,
2187
- "non_truncated": 100,
2188
- "padded": 400,
2189
- "non_padded": 0,
2190
- "effective_few_shots": 5.0,
2191
- "num_truncated_few_shots": 0
2192
- },
2193
- "lighteval|mmlu:clinical_knowledge|5": {
2194
- "hashes": {
2195
- "hash_examples": "ffbb9c7b2be257f9",
2196
- "hash_full_prompts": "106ad0bab4b90b78",
2197
- "hash_input_tokens": "0b5a3aeba550d545",
2198
- "hash_cont_tokens": "9d7500060e0dd995"
2199
- },
2200
- "truncated": 0,
2201
- "non_truncated": 265,
2202
- "padded": 1060,
2203
- "non_padded": 0,
2204
- "effective_few_shots": 5.0,
2205
- "num_truncated_few_shots": 0
2206
- },
2207
- "lighteval|mmlu:college_biology|5": {
2208
- "hashes": {
2209
- "hash_examples": "3ee77f176f38eb8e",
2210
- "hash_full_prompts": "59f9bdf2695cb226",
2211
- "hash_input_tokens": "3133231d55c88036",
2212
- "hash_cont_tokens": "78a731af5d2f6472"
2213
- },
2214
- "truncated": 0,
2215
- "non_truncated": 144,
2216
- "padded": 576,
2217
- "non_padded": 0,
2218
- "effective_few_shots": 5.0,
2219
- "num_truncated_few_shots": 0
2220
- },
2221
- "lighteval|mmlu:college_chemistry|5": {
2222
- "hashes": {
2223
- "hash_examples": "ce61a69c46d47aeb",
2224
- "hash_full_prompts": "3cac9b759fcff7a0",
2225
- "hash_input_tokens": "a064aef3fffaefdf",
2226
- "hash_cont_tokens": "00520b0ec06da34f"
2227
- },
2228
- "truncated": 0,
2229
- "non_truncated": 100,
2230
- "padded": 400,
2231
- "non_padded": 0,
2232
- "effective_few_shots": 5.0,
2233
- "num_truncated_few_shots": 0
2234
- },
2235
- "lighteval|mmlu:college_computer_science|5": {
2236
- "hashes": {
2237
- "hash_examples": "32805b52d7d5daab",
2238
- "hash_full_prompts": "010b0cca35070130",
2239
- "hash_input_tokens": "1acf2a466f90bb5c",
2240
- "hash_cont_tokens": "00520b0ec06da34f"
2241
- },
2242
- "truncated": 0,
2243
- "non_truncated": 100,
2244
- "padded": 400,
2245
- "non_padded": 0,
2246
- "effective_few_shots": 5.0,
2247
- "num_truncated_few_shots": 0
2248
- },
2249
- "lighteval|mmlu:college_mathematics|5": {
2250
- "hashes": {
2251
- "hash_examples": "55da1a0a0bd33722",
2252
- "hash_full_prompts": "511422eb9eefc773",
2253
- "hash_input_tokens": "6aa16ca086552f7f",
2254
- "hash_cont_tokens": "00520b0ec06da34f"
2255
- },
2256
- "truncated": 0,
2257
- "non_truncated": 100,
2258
- "padded": 400,
2259
- "non_padded": 0,
2260
- "effective_few_shots": 5.0,
2261
- "num_truncated_few_shots": 0
2262
- },
2263
- "lighteval|mmlu:college_medicine|5": {
2264
- "hashes": {
2265
- "hash_examples": "c33e143163049176",
2266
- "hash_full_prompts": "c8cc1a82a51a046e",
2267
- "hash_input_tokens": "c559ee6c07866a53",
2268
- "hash_cont_tokens": "699c8eb24e3e446b"
2269
- },
2270
- "truncated": 0,
2271
- "non_truncated": 173,
2272
- "padded": 692,
2273
- "non_padded": 0,
2274
- "effective_few_shots": 5.0,
2275
- "num_truncated_few_shots": 0
2276
- },
2277
- "lighteval|mmlu:college_physics|5": {
2278
- "hashes": {
2279
- "hash_examples": "ebdab1cdb7e555df",
2280
- "hash_full_prompts": "e40721b5059c5818",
2281
- "hash_input_tokens": "cf56d9f071de12d7",
2282
- "hash_cont_tokens": "075997110cbe055e"
2283
- },
2284
- "truncated": 0,
2285
- "non_truncated": 102,
2286
- "padded": 408,
2287
- "non_padded": 0,
2288
- "effective_few_shots": 5.0,
2289
- "num_truncated_few_shots": 0
2290
- },
2291
- "lighteval|mmlu:computer_security|5": {
2292
- "hashes": {
2293
- "hash_examples": "a24fd7d08a560921",
2294
- "hash_full_prompts": "946c9be5964ac44a",
2295
- "hash_input_tokens": "3198f551bf7172d3",
2296
- "hash_cont_tokens": "00520b0ec06da34f"
2297
- },
2298
- "truncated": 0,
2299
- "non_truncated": 100,
2300
- "padded": 400,
2301
- "non_padded": 0,
2302
- "effective_few_shots": 5.0,
2303
- "num_truncated_few_shots": 0
2304
- },
2305
- "lighteval|mmlu:conceptual_physics|5": {
2306
- "hashes": {
2307
- "hash_examples": "8300977a79386993",
2308
- "hash_full_prompts": "506a4f6094cc40c9",
2309
- "hash_input_tokens": "70ef2b6c0df3990f",
2310
- "hash_cont_tokens": "f22daa6d4818086f"
2311
- },
2312
- "truncated": 0,
2313
- "non_truncated": 235,
2314
- "padded": 940,
2315
- "non_padded": 0,
2316
- "effective_few_shots": 5.0,
2317
- "num_truncated_few_shots": 0
2318
- },
2319
- "lighteval|mmlu:econometrics|5": {
2320
- "hashes": {
2321
- "hash_examples": "ddde36788a04a46f",
2322
- "hash_full_prompts": "4ed2703f27f1ed05",
2323
- "hash_input_tokens": "b772b0dd4c35ad9d",
2324
- "hash_cont_tokens": "26791a0b1941b4c4"
2325
- },
2326
- "truncated": 0,
2327
- "non_truncated": 114,
2328
- "padded": 456,
2329
- "non_padded": 0,
2330
- "effective_few_shots": 5.0,
2331
- "num_truncated_few_shots": 0
2332
- },
2333
- "lighteval|mmlu:electrical_engineering|5": {
2334
- "hashes": {
2335
- "hash_examples": "acbc5def98c19b3f",
2336
- "hash_full_prompts": "d8f4b3e11c23653c",
2337
- "hash_input_tokens": "9121bfa6320afa76",
2338
- "hash_cont_tokens": "3e336577994f6c0d"
2339
- },
2340
- "truncated": 0,
2341
- "non_truncated": 145,
2342
- "padded": 580,
2343
- "non_padded": 0,
2344
- "effective_few_shots": 5.0,
2345
- "num_truncated_few_shots": 0
2346
- },
2347
- "lighteval|mmlu:elementary_mathematics|5": {
2348
- "hashes": {
2349
- "hash_examples": "146e61d07497a9bd",
2350
- "hash_full_prompts": "256d111bd15647ff",
2351
- "hash_input_tokens": "5fc0510ae9e792ca",
2352
- "hash_cont_tokens": "1d6bbfa8a67327c8"
2353
- },
2354
- "truncated": 0,
2355
- "non_truncated": 378,
2356
- "padded": 1512,
2357
- "non_padded": 0,
2358
- "effective_few_shots": 5.0,
2359
- "num_truncated_few_shots": 0
2360
- },
2361
- "lighteval|mmlu:formal_logic|5": {
2362
- "hashes": {
2363
- "hash_examples": "8635216e1909a03f",
2364
- "hash_full_prompts": "1171d04f3b1a11f5",
2365
- "hash_input_tokens": "c4f7dba2a82dfec9",
2366
- "hash_cont_tokens": "60508d85eb7693a4"
2367
- },
2368
- "truncated": 0,
2369
- "non_truncated": 126,
2370
- "padded": 504,
2371
- "non_padded": 0,
2372
- "effective_few_shots": 5.0,
2373
- "num_truncated_few_shots": 0
2374
- },
2375
- "lighteval|mmlu:global_facts|5": {
2376
- "hashes": {
2377
- "hash_examples": "30b315aa6353ee47",
2378
- "hash_full_prompts": "a7e56dbc074c7529",
2379
- "hash_input_tokens": "7f39794a4f94537a",
2380
- "hash_cont_tokens": "00520b0ec06da34f"
2381
- },
2382
- "truncated": 0,
2383
- "non_truncated": 100,
2384
- "padded": 400,
2385
- "non_padded": 0,
2386
- "effective_few_shots": 5.0,
2387
- "num_truncated_few_shots": 0
2388
- },
2389
- "lighteval|mmlu:high_school_biology|5": {
2390
- "hashes": {
2391
- "hash_examples": "c9136373af2180de",
2392
- "hash_full_prompts": "ad6e859ed978e04a",
2393
- "hash_input_tokens": "1fc52b2bad543adc",
2394
- "hash_cont_tokens": "d236ce982144e65f"
2395
- },
2396
- "truncated": 0,
2397
- "non_truncated": 310,
2398
- "padded": 1240,
2399
- "non_padded": 0,
2400
- "effective_few_shots": 5.0,
2401
- "num_truncated_few_shots": 0
2402
- },
2403
- "lighteval|mmlu:high_school_chemistry|5": {
2404
- "hashes": {
2405
- "hash_examples": "b0661bfa1add6404",
2406
- "hash_full_prompts": "6eb9c04bcc8a8f2a",
2407
- "hash_input_tokens": "293887f8ff8f3395",
2408
- "hash_cont_tokens": "59f93238ec5aead6"
2409
- },
2410
- "truncated": 0,
2411
- "non_truncated": 203,
2412
- "padded": 812,
2413
- "non_padded": 0,
2414
- "effective_few_shots": 5.0,
2415
- "num_truncated_few_shots": 0
2416
- },
2417
- "lighteval|mmlu:high_school_computer_science|5": {
2418
- "hashes": {
2419
- "hash_examples": "80fc1d623a3d665f",
2420
- "hash_full_prompts": "8e51bc91c81cf8dd",
2421
- "hash_input_tokens": "b127387bf467bd7f",
2422
- "hash_cont_tokens": "00520b0ec06da34f"
2423
- },
2424
- "truncated": 0,
2425
- "non_truncated": 100,
2426
- "padded": 400,
2427
- "non_padded": 0,
2428
- "effective_few_shots": 5.0,
2429
- "num_truncated_few_shots": 0
2430
- },
2431
- "lighteval|mmlu:high_school_european_history|5": {
2432
- "hashes": {
2433
- "hash_examples": "854da6e5af0fe1a1",
2434
- "hash_full_prompts": "664a1f16c9f3195c",
2435
- "hash_input_tokens": "adf6ba3e9bfbabda",
2436
- "hash_cont_tokens": "7b7414d6a5da3d91"
2437
- },
2438
- "truncated": 0,
2439
- "non_truncated": 165,
2440
- "padded": 656,
2441
- "non_padded": 4,
2442
- "effective_few_shots": 5.0,
2443
- "num_truncated_few_shots": 0
2444
- },
2445
- "lighteval|mmlu:high_school_geography|5": {
2446
- "hashes": {
2447
- "hash_examples": "7dc963c7acd19ad8",
2448
- "hash_full_prompts": "f3acf911f4023c8a",
2449
- "hash_input_tokens": "48821dc945bf8561",
2450
- "hash_cont_tokens": "1b66289e10988f84"
2451
- },
2452
- "truncated": 0,
2453
- "non_truncated": 198,
2454
- "padded": 792,
2455
- "non_padded": 0,
2456
- "effective_few_shots": 5.0,
2457
- "num_truncated_few_shots": 0
2458
- },
2459
- "lighteval|mmlu:high_school_government_and_politics|5": {
2460
- "hashes": {
2461
- "hash_examples": "1f675dcdebc9758f",
2462
- "hash_full_prompts": "066254feaa3158ae",
2463
- "hash_input_tokens": "3338bcae07775883",
2464
- "hash_cont_tokens": "5ab3c3415b1d3a55"
2465
- },
2466
- "truncated": 0,
2467
- "non_truncated": 193,
2468
- "padded": 772,
2469
- "non_padded": 0,
2470
- "effective_few_shots": 5.0,
2471
- "num_truncated_few_shots": 0
2472
- },
2473
- "lighteval|mmlu:high_school_macroeconomics|5": {
2474
- "hashes": {
2475
- "hash_examples": "2fb32cf2d80f0b35",
2476
- "hash_full_prompts": "19a7fa502aa85c95",
2477
- "hash_input_tokens": "05c26035b5b0b73f",
2478
- "hash_cont_tokens": "2f5457058d187374"
2479
- },
2480
- "truncated": 0,
2481
- "non_truncated": 390,
2482
- "padded": 1557,
2483
- "non_padded": 3,
2484
- "effective_few_shots": 5.0,
2485
- "num_truncated_few_shots": 0
2486
- },
2487
- "lighteval|mmlu:high_school_mathematics|5": {
2488
- "hashes": {
2489
- "hash_examples": "fd6646fdb5d58a1f",
2490
- "hash_full_prompts": "4f704e369778b5b0",
2491
- "hash_input_tokens": "d24843138e783d81",
2492
- "hash_cont_tokens": "e35137cb972e1918"
2493
- },
2494
- "truncated": 0,
2495
- "non_truncated": 270,
2496
- "padded": 1080,
2497
- "non_padded": 0,
2498
- "effective_few_shots": 5.0,
2499
- "num_truncated_few_shots": 0
2500
- },
2501
- "lighteval|mmlu:high_school_microeconomics|5": {
2502
- "hashes": {
2503
- "hash_examples": "2118f21f71d87d84",
2504
- "hash_full_prompts": "4350f9e2240f8010",
2505
- "hash_input_tokens": "69f6fe03e2dbe780",
2506
- "hash_cont_tokens": "f756093278ebb83e"
2507
- },
2508
- "truncated": 0,
2509
- "non_truncated": 238,
2510
- "padded": 908,
2511
- "non_padded": 44,
2512
- "effective_few_shots": 5.0,
2513
- "num_truncated_few_shots": 0
2514
- },
2515
- "lighteval|mmlu:high_school_physics|5": {
2516
- "hashes": {
2517
- "hash_examples": "dc3ce06378548565",
2518
- "hash_full_prompts": "5dc0d6831b66188f",
2519
- "hash_input_tokens": "9591d584d7e155fb",
2520
- "hash_cont_tokens": "9cf883ebf1c82176"
2521
- },
2522
- "truncated": 0,
2523
- "non_truncated": 151,
2524
- "padded": 604,
2525
- "non_padded": 0,
2526
- "effective_few_shots": 5.0,
2527
- "num_truncated_few_shots": 0
2528
- },
2529
- "lighteval|mmlu:high_school_psychology|5": {
2530
- "hashes": {
2531
- "hash_examples": "c8d1d98a40e11f2f",
2532
- "hash_full_prompts": "af2b097da6d50365",
2533
- "hash_input_tokens": "609750965597f03b",
2534
- "hash_cont_tokens": "bda0f77331ebb21a"
2535
- },
2536
- "truncated": 0,
2537
- "non_truncated": 545,
2538
- "padded": 2178,
2539
- "non_padded": 2,
2540
- "effective_few_shots": 5.0,
2541
- "num_truncated_few_shots": 0
2542
- },
2543
- "lighteval|mmlu:high_school_statistics|5": {
2544
- "hashes": {
2545
- "hash_examples": "666c8759b98ee4ff",
2546
- "hash_full_prompts": "c757694421d6d68d",
2547
- "hash_input_tokens": "5012f0774f799fa9",
2548
- "hash_cont_tokens": "4d04f014105a0bad"
2549
- },
2550
- "truncated": 0,
2551
- "non_truncated": 216,
2552
- "padded": 864,
2553
- "non_padded": 0,
2554
- "effective_few_shots": 5.0,
2555
- "num_truncated_few_shots": 0
2556
- },
2557
- "lighteval|mmlu:high_school_us_history|5": {
2558
- "hashes": {
2559
- "hash_examples": "95fef1c4b7d3f81e",
2560
- "hash_full_prompts": "e34a028d0ddeec5e",
2561
- "hash_input_tokens": "80099337c9403ac2",
2562
- "hash_cont_tokens": "f4590c58f12f2766"
2563
- },
2564
- "truncated": 0,
2565
- "non_truncated": 204,
2566
- "padded": 816,
2567
- "non_padded": 0,
2568
- "effective_few_shots": 5.0,
2569
- "num_truncated_few_shots": 0
2570
- },
2571
- "lighteval|mmlu:high_school_world_history|5": {
2572
- "hashes": {
2573
- "hash_examples": "7e5085b6184b0322",
2574
- "hash_full_prompts": "1fa3d51392765601",
2575
- "hash_input_tokens": "97a77dd8ac18b816",
2576
- "hash_cont_tokens": "db6bcddd891df5d9"
2577
- },
2578
- "truncated": 0,
2579
- "non_truncated": 237,
2580
- "padded": 948,
2581
- "non_padded": 0,
2582
- "effective_few_shots": 5.0,
2583
- "num_truncated_few_shots": 0
2584
- },
2585
- "lighteval|mmlu:human_aging|5": {
2586
- "hashes": {
2587
- "hash_examples": "c17333e7c7c10797",
2588
- "hash_full_prompts": "cac900721f9a1a94",
2589
- "hash_input_tokens": "94b290fc3f569341",
2590
- "hash_cont_tokens": "25cec8d640319105"
2591
- },
2592
- "truncated": 0,
2593
- "non_truncated": 223,
2594
- "padded": 892,
2595
- "non_padded": 0,
2596
- "effective_few_shots": 5.0,
2597
- "num_truncated_few_shots": 0
2598
- },
2599
- "lighteval|mmlu:human_sexuality|5": {
2600
- "hashes": {
2601
- "hash_examples": "4edd1e9045df5e3d",
2602
- "hash_full_prompts": "0d6567bafee0a13c",
2603
- "hash_input_tokens": "0bf7ca2c394fab9a",
2604
- "hash_cont_tokens": "6778302b4a10b645"
2605
- },
2606
- "truncated": 0,
2607
- "non_truncated": 131,
2608
- "padded": 524,
2609
- "non_padded": 0,
2610
- "effective_few_shots": 5.0,
2611
- "num_truncated_few_shots": 0
2612
- },
2613
- "lighteval|mmlu:international_law|5": {
2614
- "hashes": {
2615
- "hash_examples": "db2fa00d771a062a",
2616
- "hash_full_prompts": "d018f9116479795e",
2617
- "hash_input_tokens": "dd0706557f7b7328",
2618
- "hash_cont_tokens": "9eb54e1a46032749"
2619
- },
2620
- "truncated": 0,
2621
- "non_truncated": 121,
2622
- "padded": 484,
2623
- "non_padded": 0,
2624
- "effective_few_shots": 5.0,
2625
- "num_truncated_few_shots": 0
2626
- },
2627
- "lighteval|mmlu:jurisprudence|5": {
2628
- "hashes": {
2629
- "hash_examples": "e956f86b124076fe",
2630
- "hash_full_prompts": "1487e89a10ec58b7",
2631
- "hash_input_tokens": "9a26ea2bf963054b",
2632
- "hash_cont_tokens": "f17d9a372cfd66b1"
2633
- },
2634
- "truncated": 0,
2635
- "non_truncated": 108,
2636
- "padded": 420,
2637
- "non_padded": 12,
2638
- "effective_few_shots": 5.0,
2639
- "num_truncated_few_shots": 0
2640
- },
2641
- "lighteval|mmlu:logical_fallacies|5": {
2642
- "hashes": {
2643
- "hash_examples": "956e0e6365ab79f1",
2644
- "hash_full_prompts": "677785b2181f9243",
2645
- "hash_input_tokens": "2a03c39dbb148eb3",
2646
- "hash_cont_tokens": "cf44a68f5bca9a96"
2647
- },
2648
- "truncated": 0,
2649
- "non_truncated": 163,
2650
- "padded": 648,
2651
- "non_padded": 4,
2652
- "effective_few_shots": 5.0,
2653
- "num_truncated_few_shots": 0
2654
- },
2655
- "lighteval|mmlu:machine_learning|5": {
2656
- "hashes": {
2657
- "hash_examples": "397997cc6f4d581e",
2658
- "hash_full_prompts": "769ee14a2aea49bb",
2659
- "hash_input_tokens": "8dba95ed099c0e82",
2660
- "hash_cont_tokens": "eace00d420f4f32c"
2661
- },
2662
- "truncated": 0,
2663
- "non_truncated": 112,
2664
- "padded": 448,
2665
- "non_padded": 0,
2666
- "effective_few_shots": 5.0,
2667
- "num_truncated_few_shots": 0
2668
- },
2669
- "lighteval|mmlu:management|5": {
2670
- "hashes": {
2671
- "hash_examples": "2bcbe6f6ca63d740",
2672
- "hash_full_prompts": "cb1ff9dac9582144",
2673
- "hash_input_tokens": "1b66e8a5d4aac82a",
2674
- "hash_cont_tokens": "b7c51d0250c252d8"
2675
- },
2676
- "truncated": 0,
2677
- "non_truncated": 103,
2678
- "padded": 412,
2679
- "non_padded": 0,
2680
- "effective_few_shots": 5.0,
2681
- "num_truncated_few_shots": 0
2682
- },
2683
- "lighteval|mmlu:marketing|5": {
2684
- "hashes": {
2685
- "hash_examples": "8ddb20d964a1b065",
2686
- "hash_full_prompts": "9fc2114a187ad9a2",
2687
- "hash_input_tokens": "77f9e775718e2163",
2688
- "hash_cont_tokens": "086fb63f8b1d1339"
2689
- },
2690
- "truncated": 0,
2691
- "non_truncated": 234,
2692
- "padded": 924,
2693
- "non_padded": 12,
2694
- "effective_few_shots": 5.0,
2695
- "num_truncated_few_shots": 0
2696
- },
2697
- "lighteval|mmlu:medical_genetics|5": {
2698
- "hashes": {
2699
- "hash_examples": "182a71f4763d2cea",
2700
- "hash_full_prompts": "46a616fa51878959",
2701
- "hash_input_tokens": "b7653a2b4683a431",
2702
- "hash_cont_tokens": "00520b0ec06da34f"
2703
- },
2704
- "truncated": 0,
2705
- "non_truncated": 100,
2706
- "padded": 400,
2707
- "non_padded": 0,
2708
- "effective_few_shots": 5.0,
2709
- "num_truncated_few_shots": 0
2710
- },
2711
- "lighteval|mmlu:miscellaneous|5": {
2712
- "hashes": {
2713
- "hash_examples": "4c404fdbb4ca57fc",
2714
- "hash_full_prompts": "0813e1be36dbaae1",
2715
- "hash_input_tokens": "5dfe66fc53b9c1c1",
2716
- "hash_cont_tokens": "1827274fa6537077"
2717
- },
2718
- "truncated": 0,
2719
- "non_truncated": 783,
2720
- "padded": 3132,
2721
- "non_padded": 0,
2722
- "effective_few_shots": 5.0,
2723
- "num_truncated_few_shots": 0
2724
- },
2725
- "lighteval|mmlu:moral_disputes|5": {
2726
- "hashes": {
2727
- "hash_examples": "60cbd2baa3fea5c9",
2728
- "hash_full_prompts": "1d14adebb9b62519",
2729
- "hash_input_tokens": "ff47b8256ae0b365",
2730
- "hash_cont_tokens": "472c223f6f28cfc7"
2731
- },
2732
- "truncated": 0,
2733
- "non_truncated": 346,
2734
- "padded": 1384,
2735
- "non_padded": 0,
2736
- "effective_few_shots": 5.0,
2737
- "num_truncated_few_shots": 0
2738
- },
2739
- "lighteval|mmlu:moral_scenarios|5": {
2740
- "hashes": {
2741
- "hash_examples": "fd8b0431fbdd75ef",
2742
- "hash_full_prompts": "b80d3d236165e3de",
2743
- "hash_input_tokens": "3ed3c3a063f48cbc",
2744
- "hash_cont_tokens": "e90dade00a092f9e"
2745
- },
2746
- "truncated": 0,
2747
- "non_truncated": 895,
2748
- "padded": 3567,
2749
- "non_padded": 13,
2750
- "effective_few_shots": 5.0,
2751
- "num_truncated_few_shots": 0
2752
- },
2753
- "lighteval|mmlu:nutrition|5": {
2754
- "hashes": {
2755
- "hash_examples": "71e55e2b829b6528",
2756
- "hash_full_prompts": "2bfb18e5fab8dea7",
2757
- "hash_input_tokens": "1bd9ed64af00091e",
2758
- "hash_cont_tokens": "128e0ec97d96b165"
2759
- },
2760
- "truncated": 0,
2761
- "non_truncated": 306,
2762
- "padded": 1224,
2763
- "non_padded": 0,
2764
- "effective_few_shots": 5.0,
2765
- "num_truncated_few_shots": 0
2766
- },
2767
- "lighteval|mmlu:philosophy|5": {
2768
- "hashes": {
2769
- "hash_examples": "a6d489a8d208fa4b",
2770
- "hash_full_prompts": "e8c0d5b6dae3ccc8",
2771
- "hash_input_tokens": "f85266336968069e",
2772
- "hash_cont_tokens": "cbfd7829a3e0f082"
2773
- },
2774
- "truncated": 0,
2775
- "non_truncated": 311,
2776
- "padded": 1244,
2777
- "non_padded": 0,
2778
- "effective_few_shots": 5.0,
2779
- "num_truncated_few_shots": 0
2780
- },
2781
- "lighteval|mmlu:prehistory|5": {
2782
- "hashes": {
2783
- "hash_examples": "6cc50f032a19acaa",
2784
- "hash_full_prompts": "4a6a1d3ab1bf28e4",
2785
- "hash_input_tokens": "476c6e813a7ea9a5",
2786
- "hash_cont_tokens": "9c0cf5a2f71afa7e"
2787
- },
2788
- "truncated": 0,
2789
- "non_truncated": 324,
2790
- "padded": 1284,
2791
- "non_padded": 12,
2792
- "effective_few_shots": 5.0,
2793
- "num_truncated_few_shots": 0
2794
- },
2795
- "lighteval|mmlu:professional_accounting|5": {
2796
- "hashes": {
2797
- "hash_examples": "50f57ab32f5f6cea",
2798
- "hash_full_prompts": "e60129bd2d82ffc6",
2799
- "hash_input_tokens": "0cfd558014469fd7",
2800
- "hash_cont_tokens": "50f011c2453517ee"
2801
- },
2802
- "truncated": 0,
2803
- "non_truncated": 282,
2804
- "padded": 1128,
2805
- "non_padded": 0,
2806
- "effective_few_shots": 5.0,
2807
- "num_truncated_few_shots": 0
2808
- },
2809
- "lighteval|mmlu:professional_law|5": {
2810
- "hashes": {
2811
- "hash_examples": "a8fdc85c64f4b215",
2812
- "hash_full_prompts": "0dbb1d9b72dcea03",
2813
- "hash_input_tokens": "5efbd68b327c2f18",
2814
- "hash_cont_tokens": "73527e852c24186c"
2815
- },
2816
- "truncated": 0,
2817
- "non_truncated": 1534,
2818
- "padded": 6136,
2819
- "non_padded": 0,
2820
- "effective_few_shots": 5.0,
2821
- "num_truncated_few_shots": 0
2822
- },
2823
- "lighteval|mmlu:professional_medicine|5": {
2824
- "hashes": {
2825
- "hash_examples": "c373a28a3050a73a",
2826
- "hash_full_prompts": "5e040f9ca68b089e",
2827
- "hash_input_tokens": "c65aae468847d35e",
2828
- "hash_cont_tokens": "ceb7af5e2e789abc"
2829
- },
2830
- "truncated": 0,
2831
- "non_truncated": 272,
2832
- "padded": 1088,
2833
- "non_padded": 0,
2834
- "effective_few_shots": 5.0,
2835
- "num_truncated_few_shots": 0
2836
- },
2837
- "lighteval|mmlu:professional_psychology|5": {
2838
- "hashes": {
2839
- "hash_examples": "bf5254fe818356af",
2840
- "hash_full_prompts": "b386ecda8b87150e",
2841
- "hash_input_tokens": "fcb757765d136891",
2842
- "hash_cont_tokens": "8cfdced8a9667380"
2843
- },
2844
- "truncated": 0,
2845
- "non_truncated": 612,
2846
- "padded": 2428,
2847
- "non_padded": 20,
2848
- "effective_few_shots": 5.0,
2849
- "num_truncated_few_shots": 0
2850
- },
2851
- "lighteval|mmlu:public_relations|5": {
2852
- "hashes": {
2853
- "hash_examples": "b66d52e28e7d14e0",
2854
- "hash_full_prompts": "fe43562263e25677",
2855
- "hash_input_tokens": "a2fc135f0e660902",
2856
- "hash_cont_tokens": "f8327461a9cc5123"
2857
- },
2858
- "truncated": 0,
2859
- "non_truncated": 110,
2860
- "padded": 436,
2861
- "non_padded": 4,
2862
- "effective_few_shots": 5.0,
2863
- "num_truncated_few_shots": 0
2864
- },
2865
- "lighteval|mmlu:security_studies|5": {
2866
- "hashes": {
2867
- "hash_examples": "514c14feaf000ad9",
2868
- "hash_full_prompts": "27d4a2ac541ef4b9",
2869
- "hash_input_tokens": "ee8dff5638898dc7",
2870
- "hash_cont_tokens": "c30b0c4d52c2875d"
2871
- },
2872
- "truncated": 0,
2873
- "non_truncated": 245,
2874
- "padded": 980,
2875
- "non_padded": 0,
2876
- "effective_few_shots": 5.0,
2877
- "num_truncated_few_shots": 0
2878
- },
2879
- "lighteval|mmlu:sociology|5": {
2880
- "hashes": {
2881
- "hash_examples": "f6c9bc9d18c80870",
2882
- "hash_full_prompts": "c072ea7d1a1524f2",
2883
- "hash_input_tokens": "d888e1a7c1faab73",
2884
- "hash_cont_tokens": "eef4bd16d536fbd6"
2885
- },
2886
- "truncated": 0,
2887
- "non_truncated": 201,
2888
- "padded": 804,
2889
- "non_padded": 0,
2890
- "effective_few_shots": 5.0,
2891
- "num_truncated_few_shots": 0
2892
- },
2893
- "lighteval|mmlu:us_foreign_policy|5": {
2894
- "hashes": {
2895
- "hash_examples": "ed7b78629db6678f",
2896
- "hash_full_prompts": "341a97ca3e4d699d",
2897
- "hash_input_tokens": "cc3aa60a1406b430",
2898
- "hash_cont_tokens": "00520b0ec06da34f"
2899
- },
2900
- "truncated": 0,
2901
- "non_truncated": 100,
2902
- "padded": 400,
2903
- "non_padded": 0,
2904
- "effective_few_shots": 5.0,
2905
- "num_truncated_few_shots": 0
2906
- },
2907
- "lighteval|mmlu:virology|5": {
2908
- "hashes": {
2909
- "hash_examples": "bc52ffdc3f9b994a",
2910
- "hash_full_prompts": "651d471e2eb8b5e9",
2911
- "hash_input_tokens": "1764542c625db567",
2912
- "hash_cont_tokens": "f5fc195e049353c0"
2913
- },
2914
- "truncated": 0,
2915
- "non_truncated": 166,
2916
- "padded": 664,
2917
- "non_padded": 0,
2918
- "effective_few_shots": 5.0,
2919
- "num_truncated_few_shots": 0
2920
- },
2921
- "lighteval|mmlu:world_religions|5": {
2922
- "hashes": {
2923
- "hash_examples": "ecdb4a4f94f62930",
2924
- "hash_full_prompts": "3773f03542ce44a3",
2925
- "hash_input_tokens": "07206016118d726f",
2926
- "hash_cont_tokens": "ada548665e87b1e0"
2927
- },
2928
- "truncated": 0,
2929
- "non_truncated": 171,
2930
- "padded": 684,
2931
- "non_padded": 0,
2932
- "effective_few_shots": 5.0,
2933
- "num_truncated_few_shots": 0
2934
- }
2935
- },
2936
- "summary_general": {
2937
- "hashes": {
2938
- "hash_examples": "341a076d0beb7048",
2939
- "hash_full_prompts": "a5c8f2b7ff4f5ae2",
2940
- "hash_input_tokens": "2262a23f0cae768b",
2941
- "hash_cont_tokens": "3672212ca582e2d0"
2942
- },
2943
- "truncated": 0,
2944
- "non_truncated": 14042,
2945
- "padded": 56038,
2946
- "non_padded": 130,
2947
- "num_truncated_few_shots": 0
2948
- }
2949
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B/main/truthfulqa/results_2024-03-02T12-48-12.539015.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 5799474.343098943,
9
- "end_time": 5799525.476038375,
10
- "total_evaluation_time_secondes": "51.13293943181634",
11
- "model_name": "Qwen/Qwen1.5-0.5B",
12
- "model_sha": "fedce23ef6393499effdf4958f9b3256f299cc7d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|truthfulqa:mc|0": {
19
- "truthfulqa_mc1": 0.2558139534883721,
20
- "truthfulqa_mc1_stderr": 0.015274176219283371,
21
- "truthfulqa_mc2": 0.40446998559516045,
22
- "truthfulqa_mc2_stderr": 0.014421653357320591
23
- }
24
- },
25
- "versions": {
26
- "lighteval|truthfulqa:mc|0": 0
27
- },
28
- "config_tasks": {
29
- "lighteval|truthfulqa:mc": {
30
- "name": "truthfulqa:mc",
31
- "prompt_function": "truthful_qa_multiple_choice",
32
- "hf_repo": "truthful_qa",
33
- "hf_subset": "multiple_choice",
34
- "metric": [
35
- "truthfulqa_mc_metrics"
36
- ],
37
- "hf_avail_splits": [
38
- "validation"
39
- ],
40
- "evaluation_splits": [
41
- "validation"
42
- ],
43
- "few_shots_split": null,
44
- "few_shots_select": null,
45
- "generation_size": -1,
46
- "stop_sequence": [
47
- "\n"
48
- ],
49
- "output_regex": null,
50
- "frozen": false,
51
- "suite": [
52
- "lighteval"
53
- ],
54
- "original_num_docs": 817,
55
- "effective_num_docs": 817
56
- }
57
- },
58
- "summary_tasks": {
59
- "lighteval|truthfulqa:mc|0": {
60
- "hashes": {
61
- "hash_examples": "36a6d90e75d92d4a",
62
- "hash_full_prompts": "17e9d0dc9f923ba3",
63
- "hash_input_tokens": "6809afed8534e190",
64
- "hash_cont_tokens": "45d9451af4efe9b4"
65
- },
66
- "truncated": 0,
67
- "non_truncated": 817,
68
- "padded": 9192,
69
- "non_padded": 804,
70
- "effective_few_shots": 0.0,
71
- "num_truncated_few_shots": 0
72
- }
73
- },
74
- "summary_general": {
75
- "hashes": {
76
- "hash_examples": "aed1dfc67e53d0f2",
77
- "hash_full_prompts": "81a2e5a97bc8b7e3",
78
- "hash_input_tokens": "4f019dca4beb2f88",
79
- "hash_cont_tokens": "91ac8b83ca359e94"
80
- },
81
- "truncated": 0,
82
- "non_truncated": 817,
83
- "padded": 9192,
84
- "non_padded": 804,
85
- "num_truncated_few_shots": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/Qwen/Qwen1.5-0.5B/main/winogrande/results_2024-03-02T12-47-59.918589.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1329621.091158945,
9
- "end_time": 1329659.759752786,
10
- "total_evaluation_time_secondes": "38.6685938409064",
11
- "model_name": "Qwen/Qwen1.5-0.5B",
12
- "model_sha": "fedce23ef6393499effdf4958f9b3256f299cc7d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "1.05 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|winogrande|5": {
19
- "acc": 0.5153906866614049,
20
- "acc_stderr": 0.014045826789783666
21
- }
22
- },
23
- "versions": {
24
- "lighteval|winogrande|5": 0
25
- },
26
- "config_tasks": {
27
- "lighteval|winogrande": {
28
- "name": "winogrande",
29
- "prompt_function": "winogrande",
30
- "hf_repo": "winogrande",
31
- "hf_subset": "winogrande_xl",
32
- "metric": [
33
- "loglikelihood_acc"
34
- ],
35
- "hf_avail_splits": [
36
- "train",
37
- "test",
38
- "validation"
39
- ],
40
- "evaluation_splits": [
41
- "validation"
42
- ],
43
- "few_shots_split": null,
44
- "few_shots_select": "random_sampling",
45
- "generation_size": -1,
46
- "stop_sequence": [
47
- "\n"
48
- ],
49
- "output_regex": null,
50
- "frozen": false,
51
- "suite": [
52
- "lighteval"
53
- ],
54
- "original_num_docs": 1267,
55
- "effective_num_docs": 1267
56
- }
57
- },
58
- "summary_tasks": {
59
- "lighteval|winogrande|5": {
60
- "hashes": {
61
- "hash_examples": "087d5d1a1afd4c7b",
62
- "hash_full_prompts": "29e044bcf40d6a6d",
63
- "hash_input_tokens": "aa37fb95ba6e30c5",
64
- "hash_cont_tokens": "b0338f16e5945c7b"
65
- },
66
- "truncated": 0,
67
- "non_truncated": 1267,
68
- "padded": 2356,
69
- "non_padded": 178,
70
- "effective_few_shots": 5.0,
71
- "num_truncated_few_shots": 0
72
- }
73
- },
74
- "summary_general": {
75
- "hashes": {
76
- "hash_examples": "b9a49975cc41fab7",
77
- "hash_full_prompts": "2f908b2b9b5ec583",
78
- "hash_input_tokens": "8c145130af5ef1dc",
79
- "hash_cont_tokens": "fe04436d98a11172"
80
- },
81
- "truncated": 0,
82
- "non_truncated": 1267,
83
- "padded": 2356,
84
- "non_padded": 178,
85
- "num_truncated_few_shots": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/abacaj/phi-2-super/main/ifeval/results_2024-03-02T12-34-38.484385.json DELETED
@@ -1,89 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 385966.951846016,
9
- "end_time": 386615.376815783,
10
- "total_evaluation_time_secondes": "648.4249697669875",
11
- "model_name": "abacaj/phi-2-super",
12
- "model_sha": "f1e578c868e6cc20fb1ea8eeee427ddf6e0e2ee4",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "5.19 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|ifeval|0": {
19
- "prompt_level_strict_acc": 0.24584103512014788,
20
- "prompt_level_strict_acc_stderr": 0.01852941708079559,
21
- "inst_level_strict_acc": 0.3645083932853717,
22
- "inst_level_strict_acc_stderr": 0.0005225768432277203,
23
- "prompt_level_loose_acc": 0.2809611829944547,
24
- "prompt_level_loose_acc_stderr": 0.019342047683777005,
25
- "inst_level_loose_acc": 0.40047961630695444,
26
- "inst_level_loose_acc_stderr": 0.0005229809220065132
27
- }
28
- },
29
- "versions": {
30
- "custom|ifeval|0": 0
31
- },
32
- "config_tasks": {
33
- "custom|ifeval": {
34
- "name": "ifeval",
35
- "prompt_function": "ifeval_prompt",
36
- "hf_repo": "wis-k/instruction-following-eval",
37
- "hf_subset": "default",
38
- "metric": [
39
- "ifeval_metric"
40
- ],
41
- "hf_avail_splits": [
42
- "train"
43
- ],
44
- "evaluation_splits": [
45
- "train"
46
- ],
47
- "few_shots_split": "train",
48
- "few_shots_select": "random_sampling",
49
- "generation_size": 1280,
50
- "stop_sequence": [],
51
- "output_regex": null,
52
- "frozen": false,
53
- "suite": [
54
- "custom"
55
- ],
56
- "original_num_docs": 541,
57
- "effective_num_docs": 541
58
- }
59
- },
60
- "summary_tasks": {
61
- "custom|ifeval|0": {
62
- "hashes": {
63
- "hash_examples": "e99cbf567588d7c6",
64
- "hash_full_prompts": "b3cf2afae2cdd517",
65
- "hash_input_tokens": "dd39068592068ba5",
66
- "hash_cont_tokens": "9b65fb0ae540b597"
67
- },
68
- "truncated": 0,
69
- "non_truncated": 541,
70
- "padded": 0,
71
- "non_padded": 541,
72
- "effective_few_shots": 0.0,
73
- "num_truncated_few_shots": 0
74
- }
75
- },
76
- "summary_general": {
77
- "hashes": {
78
- "hash_examples": "ea046ab2c6fc5928",
79
- "hash_full_prompts": "ba67cfefd62e0550",
80
- "hash_input_tokens": "e0a5833eb7ed0885",
81
- "hash_cont_tokens": "0789d5afa3915c88"
82
- },
83
- "truncated": 0,
84
- "non_truncated": 541,
85
- "padded": 0,
86
- "non_padded": 541,
87
- "num_truncated_few_shots": 0
88
- }
89
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/lewtun/gemma-7b-dpo-full-mix1-beta-0.05-epoch-2/main/gsm8k/results_2024-03-01T11-15-08.605142.json DELETED
@@ -1,88 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 253849.558654959,
9
- "end_time": 254891.462718938,
10
- "total_evaluation_time_secondes": "1041.9040639790182",
11
- "model_name": "lewtun/gemma-7b-dpo-full-mix1-beta-0.05-epoch-2",
12
- "model_sha": "ebb030524ac79040fea69e8f6e2d53935efa8d7f",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "15.91 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|gsm8k|5": {
19
- "qem": 0.5481425322213799,
20
- "qem_stderr": 0.013708494995677646
21
- }
22
- },
23
- "versions": {
24
- "lighteval|gsm8k|5": 0
25
- },
26
- "config_tasks": {
27
- "lighteval|gsm8k": {
28
- "name": "gsm8k",
29
- "prompt_function": "gsm8k",
30
- "hf_repo": "gsm8k",
31
- "hf_subset": "main",
32
- "metric": [
33
- "quasi_exact_match_gsm8k"
34
- ],
35
- "hf_avail_splits": [
36
- "train",
37
- "test"
38
- ],
39
- "evaluation_splits": [
40
- "test"
41
- ],
42
- "few_shots_split": null,
43
- "few_shots_select": "random_sampling_from_train",
44
- "generation_size": 256,
45
- "stop_sequence": [
46
- ":",
47
- "Question:",
48
- "Question"
49
- ],
50
- "output_regex": null,
51
- "frozen": false,
52
- "suite": [
53
- "lighteval"
54
- ],
55
- "original_num_docs": 1319,
56
- "effective_num_docs": 1319
57
- }
58
- },
59
- "summary_tasks": {
60
- "lighteval|gsm8k|5": {
61
- "hashes": {
62
- "hash_examples": "0ed016e24e7512fd",
63
- "hash_full_prompts": "e9779f568bffa939",
64
- "hash_input_tokens": "cc35fc65cd0fd4b2",
65
- "hash_cont_tokens": "e4b0aac3431eede5"
66
- },
67
- "truncated": 0,
68
- "non_truncated": 1319,
69
- "padded": 0,
70
- "non_padded": 1319,
71
- "effective_few_shots": 5.0,
72
- "num_truncated_few_shots": 0
73
- }
74
- },
75
- "summary_general": {
76
- "hashes": {
77
- "hash_examples": "bc71463e88551d0e",
78
- "hash_full_prompts": "857dceaac0f1e2ec",
79
- "hash_input_tokens": "36c583e41332e10e",
80
- "hash_cont_tokens": "2b7ed29b1944f812"
81
- },
82
- "truncated": 0,
83
- "non_truncated": 1319,
84
- "padded": 0,
85
- "non_padded": 1319,
86
- "num_truncated_few_shots": 0
87
- }
88
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/lewtun/gemma-7b-dpo-full-mix1-beta-0.05-epoch-2/main/mmlu/results_2024-03-01T11-13-36.220599.json DELETED
@@ -1,2949 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 1,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1700987.468912052,
9
- "end_time": 1701933.868986028,
10
- "total_evaluation_time_secondes": "946.4000739760231",
11
- "model_name": "lewtun/gemma-7b-dpo-full-mix1-beta-0.05-epoch-2",
12
- "model_sha": "ebb030524ac79040fea69e8f6e2d53935efa8d7f",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "15.91 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "lighteval|mmlu:abstract_algebra|5": {
19
- "acc": 0.28,
20
- "acc_stderr": 0.04512608598542128
21
- },
22
- "lighteval|mmlu:anatomy|5": {
23
- "acc": 0.6,
24
- "acc_stderr": 0.04232073695151589
25
- },
26
- "lighteval|mmlu:astronomy|5": {
27
- "acc": 0.6973684210526315,
28
- "acc_stderr": 0.03738520676119669
29
- },
30
- "lighteval|mmlu:business_ethics|5": {
31
- "acc": 0.58,
32
- "acc_stderr": 0.049604496374885836
33
- },
34
- "lighteval|mmlu:clinical_knowledge|5": {
35
- "acc": 0.6566037735849056,
36
- "acc_stderr": 0.02922452646912479
37
- },
38
- "lighteval|mmlu:college_biology|5": {
39
- "acc": 0.7708333333333334,
40
- "acc_stderr": 0.035146974678623884
41
- },
42
- "lighteval|mmlu:college_chemistry|5": {
43
- "acc": 0.44,
44
- "acc_stderr": 0.049888765156985884
45
- },
46
- "lighteval|mmlu:college_computer_science|5": {
47
- "acc": 0.45,
48
- "acc_stderr": 0.05
49
- },
50
- "lighteval|mmlu:college_mathematics|5": {
51
- "acc": 0.29,
52
- "acc_stderr": 0.04560480215720683
53
- },
54
- "lighteval|mmlu:college_medicine|5": {
55
- "acc": 0.6069364161849711,
56
- "acc_stderr": 0.037242495958177295
57
- },
58
- "lighteval|mmlu:college_physics|5": {
59
- "acc": 0.4215686274509804,
60
- "acc_stderr": 0.04913595201274498
61
- },
62
- "lighteval|mmlu:computer_security|5": {
63
- "acc": 0.74,
64
- "acc_stderr": 0.04408440022768079
65
- },
66
- "lighteval|mmlu:conceptual_physics|5": {
67
- "acc": 0.6468085106382979,
68
- "acc_stderr": 0.031245325202761926
69
- },
70
- "lighteval|mmlu:econometrics|5": {
71
- "acc": 0.45614035087719296,
72
- "acc_stderr": 0.046854730419077895
73
- },
74
- "lighteval|mmlu:electrical_engineering|5": {
75
- "acc": 0.6344827586206897,
76
- "acc_stderr": 0.040131241954243856
77
- },
78
- "lighteval|mmlu:elementary_mathematics|5": {
79
- "acc": 0.4126984126984127,
80
- "acc_stderr": 0.025355741263055277
81
- },
82
- "lighteval|mmlu:formal_logic|5": {
83
- "acc": 0.42857142857142855,
84
- "acc_stderr": 0.04426266681379909
85
- },
86
- "lighteval|mmlu:global_facts|5": {
87
- "acc": 0.43,
88
- "acc_stderr": 0.049756985195624284
89
- },
90
- "lighteval|mmlu:high_school_biology|5": {
91
- "acc": 0.7709677419354839,
92
- "acc_stderr": 0.02390491431178265
93
- },
94
- "lighteval|mmlu:high_school_chemistry|5": {
95
- "acc": 0.5024630541871922,
96
- "acc_stderr": 0.03517945038691063
97
- },
98
- "lighteval|mmlu:high_school_computer_science|5": {
99
- "acc": 0.63,
100
- "acc_stderr": 0.04852365870939099
101
- },
102
- "lighteval|mmlu:high_school_european_history|5": {
103
- "acc": 0.7575757575757576,
104
- "acc_stderr": 0.03346409881055953
105
- },
106
- "lighteval|mmlu:high_school_geography|5": {
107
- "acc": 0.7929292929292929,
108
- "acc_stderr": 0.02886977846026704
109
- },
110
- "lighteval|mmlu:high_school_government_and_politics|5": {
111
- "acc": 0.8652849740932642,
112
- "acc_stderr": 0.024639789097709443
113
- },
114
- "lighteval|mmlu:high_school_macroeconomics|5": {
115
- "acc": 0.617948717948718,
116
- "acc_stderr": 0.024635549163908234
117
- },
118
- "lighteval|mmlu:high_school_mathematics|5": {
119
- "acc": 0.3888888888888889,
120
- "acc_stderr": 0.029723278961476664
121
- },
122
- "lighteval|mmlu:high_school_microeconomics|5": {
123
- "acc": 0.6428571428571429,
124
- "acc_stderr": 0.031124619309328177
125
- },
126
- "lighteval|mmlu:high_school_physics|5": {
127
- "acc": 0.37748344370860926,
128
- "acc_stderr": 0.0395802723112157
129
- },
130
- "lighteval|mmlu:high_school_psychology|5": {
131
- "acc": 0.8256880733944955,
132
- "acc_stderr": 0.016265675632010333
133
- },
134
- "lighteval|mmlu:high_school_statistics|5": {
135
- "acc": 0.5,
136
- "acc_stderr": 0.034099716973523674
137
- },
138
- "lighteval|mmlu:high_school_us_history|5": {
139
- "acc": 0.8235294117647058,
140
- "acc_stderr": 0.026756401538078962
141
- },
142
- "lighteval|mmlu:high_school_world_history|5": {
143
- "acc": 0.810126582278481,
144
- "acc_stderr": 0.025530100460233483
145
- },
146
- "lighteval|mmlu:human_aging|5": {
147
- "acc": 0.7130044843049327,
148
- "acc_stderr": 0.03036037971029195
149
- },
150
- "lighteval|mmlu:human_sexuality|5": {
151
- "acc": 0.7099236641221374,
152
- "acc_stderr": 0.03980066246467765
153
- },
154
- "lighteval|mmlu:international_law|5": {
155
- "acc": 0.768595041322314,
156
- "acc_stderr": 0.038498560987940904
157
- },
158
- "lighteval|mmlu:jurisprudence|5": {
159
- "acc": 0.7407407407407407,
160
- "acc_stderr": 0.042365112580946336
161
- },
162
- "lighteval|mmlu:logical_fallacies|5": {
163
- "acc": 0.7668711656441718,
164
- "acc_stderr": 0.0332201579577674
165
- },
166
- "lighteval|mmlu:machine_learning|5": {
167
- "acc": 0.38392857142857145,
168
- "acc_stderr": 0.04616143075028547
169
- },
170
- "lighteval|mmlu:management|5": {
171
- "acc": 0.7864077669902912,
172
- "acc_stderr": 0.04058042015646034
173
- },
174
- "lighteval|mmlu:marketing|5": {
175
- "acc": 0.8846153846153846,
176
- "acc_stderr": 0.020930193185179333
177
- },
178
- "lighteval|mmlu:medical_genetics|5": {
179
- "acc": 0.71,
180
- "acc_stderr": 0.045604802157206845
181
- },
182
- "lighteval|mmlu:miscellaneous|5": {
183
- "acc": 0.8326947637292464,
184
- "acc_stderr": 0.013347327202920332
185
- },
186
- "lighteval|mmlu:moral_disputes|5": {
187
- "acc": 0.653179190751445,
188
- "acc_stderr": 0.025624723994030454
189
- },
190
- "lighteval|mmlu:moral_scenarios|5": {
191
- "acc": 0.36201117318435755,
192
- "acc_stderr": 0.016073067350153084
193
- },
194
- "lighteval|mmlu:nutrition|5": {
195
- "acc": 0.6928104575163399,
196
- "acc_stderr": 0.026415601914388992
197
- },
198
- "lighteval|mmlu:philosophy|5": {
199
- "acc": 0.6752411575562701,
200
- "acc_stderr": 0.026596782287697043
201
- },
202
- "lighteval|mmlu:prehistory|5": {
203
- "acc": 0.7376543209876543,
204
- "acc_stderr": 0.02447722285613513
205
- },
206
- "lighteval|mmlu:professional_accounting|5": {
207
- "acc": 0.5035460992907801,
208
- "acc_stderr": 0.02982674915328092
209
- },
210
- "lighteval|mmlu:professional_law|5": {
211
- "acc": 0.47196870925684486,
212
- "acc_stderr": 0.012750151802922442
213
- },
214
- "lighteval|mmlu:professional_medicine|5": {
215
- "acc": 0.6029411764705882,
216
- "acc_stderr": 0.029722152099280058
217
- },
218
- "lighteval|mmlu:professional_psychology|5": {
219
- "acc": 0.6666666666666666,
220
- "acc_stderr": 0.019070985589687495
221
- },
222
- "lighteval|mmlu:public_relations|5": {
223
- "acc": 0.6636363636363637,
224
- "acc_stderr": 0.04525393596302505
225
- },
226
- "lighteval|mmlu:security_studies|5": {
227
- "acc": 0.710204081632653,
228
- "acc_stderr": 0.029043088683304324
229
- },
230
- "lighteval|mmlu:sociology|5": {
231
- "acc": 0.8159203980099502,
232
- "acc_stderr": 0.027403859410786845
233
- },
234
- "lighteval|mmlu:us_foreign_policy|5": {
235
- "acc": 0.85,
236
- "acc_stderr": 0.03588702812826371
237
- },
238
- "lighteval|mmlu:virology|5": {
239
- "acc": 0.5481927710843374,
240
- "acc_stderr": 0.03874371556587953
241
- },
242
- "lighteval|mmlu:world_religions|5": {
243
- "acc": 0.8245614035087719,
244
- "acc_stderr": 0.02917088550072767
245
- },
246
- "lighteval|mmlu:_average|5": {
247
- "acc": 0.6302293099478192,
248
- "acc_stderr": 0.03406311289757478
249
- }
250
- },
251
- "versions": {
252
- "lighteval|mmlu:abstract_algebra|5": 0,
253
- "lighteval|mmlu:anatomy|5": 0,
254
- "lighteval|mmlu:astronomy|5": 0,
255
- "lighteval|mmlu:business_ethics|5": 0,
256
- "lighteval|mmlu:clinical_knowledge|5": 0,
257
- "lighteval|mmlu:college_biology|5": 0,
258
- "lighteval|mmlu:college_chemistry|5": 0,
259
- "lighteval|mmlu:college_computer_science|5": 0,
260
- "lighteval|mmlu:college_mathematics|5": 0,
261
- "lighteval|mmlu:college_medicine|5": 0,
262
- "lighteval|mmlu:college_physics|5": 0,
263
- "lighteval|mmlu:computer_security|5": 0,
264
- "lighteval|mmlu:conceptual_physics|5": 0,
265
- "lighteval|mmlu:econometrics|5": 0,
266
- "lighteval|mmlu:electrical_engineering|5": 0,
267
- "lighteval|mmlu:elementary_mathematics|5": 0,
268
- "lighteval|mmlu:formal_logic|5": 0,
269
- "lighteval|mmlu:global_facts|5": 0,
270
- "lighteval|mmlu:high_school_biology|5": 0,
271
- "lighteval|mmlu:high_school_chemistry|5": 0,
272
- "lighteval|mmlu:high_school_computer_science|5": 0,
273
- "lighteval|mmlu:high_school_european_history|5": 0,
274
- "lighteval|mmlu:high_school_geography|5": 0,
275
- "lighteval|mmlu:high_school_government_and_politics|5": 0,
276
- "lighteval|mmlu:high_school_macroeconomics|5": 0,
277
- "lighteval|mmlu:high_school_mathematics|5": 0,
278
- "lighteval|mmlu:high_school_microeconomics|5": 0,
279
- "lighteval|mmlu:high_school_physics|5": 0,
280
- "lighteval|mmlu:high_school_psychology|5": 0,
281
- "lighteval|mmlu:high_school_statistics|5": 0,
282
- "lighteval|mmlu:high_school_us_history|5": 0,
283
- "lighteval|mmlu:high_school_world_history|5": 0,
284
- "lighteval|mmlu:human_aging|5": 0,
285
- "lighteval|mmlu:human_sexuality|5": 0,
286
- "lighteval|mmlu:international_law|5": 0,
287
- "lighteval|mmlu:jurisprudence|5": 0,
288
- "lighteval|mmlu:logical_fallacies|5": 0,
289
- "lighteval|mmlu:machine_learning|5": 0,
290
- "lighteval|mmlu:management|5": 0,
291
- "lighteval|mmlu:marketing|5": 0,
292
- "lighteval|mmlu:medical_genetics|5": 0,
293
- "lighteval|mmlu:miscellaneous|5": 0,
294
- "lighteval|mmlu:moral_disputes|5": 0,
295
- "lighteval|mmlu:moral_scenarios|5": 0,
296
- "lighteval|mmlu:nutrition|5": 0,
297
- "lighteval|mmlu:philosophy|5": 0,
298
- "lighteval|mmlu:prehistory|5": 0,
299
- "lighteval|mmlu:professional_accounting|5": 0,
300
- "lighteval|mmlu:professional_law|5": 0,
301
- "lighteval|mmlu:professional_medicine|5": 0,
302
- "lighteval|mmlu:professional_psychology|5": 0,
303
- "lighteval|mmlu:public_relations|5": 0,
304
- "lighteval|mmlu:security_studies|5": 0,
305
- "lighteval|mmlu:sociology|5": 0,
306
- "lighteval|mmlu:us_foreign_policy|5": 0,
307
- "lighteval|mmlu:virology|5": 0,
308
- "lighteval|mmlu:world_religions|5": 0
309
- },
310
- "config_tasks": {
311
- "lighteval|mmlu:abstract_algebra": {
312
- "name": "mmlu:abstract_algebra",
313
- "prompt_function": "mmlu_harness",
314
- "hf_repo": "lighteval/mmlu",
315
- "hf_subset": "abstract_algebra",
316
- "metric": [
317
- "loglikelihood_acc"
318
- ],
319
- "hf_avail_splits": [
320
- "auxiliary_train",
321
- "test",
322
- "validation",
323
- "dev"
324
- ],
325
- "evaluation_splits": [
326
- "test"
327
- ],
328
- "few_shots_split": "dev",
329
- "few_shots_select": "sequential",
330
- "generation_size": 1,
331
- "stop_sequence": [
332
- "\n"
333
- ],
334
- "output_regex": null,
335
- "frozen": false,
336
- "suite": [
337
- "lighteval",
338
- "mmlu"
339
- ],
340
- "original_num_docs": 100,
341
- "effective_num_docs": 100
342
- },
343
- "lighteval|mmlu:anatomy": {
344
- "name": "mmlu:anatomy",
345
- "prompt_function": "mmlu_harness",
346
- "hf_repo": "lighteval/mmlu",
347
- "hf_subset": "anatomy",
348
- "metric": [
349
- "loglikelihood_acc"
350
- ],
351
- "hf_avail_splits": [
352
- "auxiliary_train",
353
- "test",
354
- "validation",
355
- "dev"
356
- ],
357
- "evaluation_splits": [
358
- "test"
359
- ],
360
- "few_shots_split": "dev",
361
- "few_shots_select": "sequential",
362
- "generation_size": 1,
363
- "stop_sequence": [
364
- "\n"
365
- ],
366
- "output_regex": null,
367
- "frozen": false,
368
- "suite": [
369
- "lighteval",
370
- "mmlu"
371
- ],
372
- "original_num_docs": 135,
373
- "effective_num_docs": 135
374
- },
375
- "lighteval|mmlu:astronomy": {
376
- "name": "mmlu:astronomy",
377
- "prompt_function": "mmlu_harness",
378
- "hf_repo": "lighteval/mmlu",
379
- "hf_subset": "astronomy",
380
- "metric": [
381
- "loglikelihood_acc"
382
- ],
383
- "hf_avail_splits": [
384
- "auxiliary_train",
385
- "test",
386
- "validation",
387
- "dev"
388
- ],
389
- "evaluation_splits": [
390
- "test"
391
- ],
392
- "few_shots_split": "dev",
393
- "few_shots_select": "sequential",
394
- "generation_size": 1,
395
- "stop_sequence": [
396
- "\n"
397
- ],
398
- "output_regex": null,
399
- "frozen": false,
400
- "suite": [
401
- "lighteval",
402
- "mmlu"
403
- ],
404
- "original_num_docs": 152,
405
- "effective_num_docs": 152
406
- },
407
- "lighteval|mmlu:business_ethics": {
408
- "name": "mmlu:business_ethics",
409
- "prompt_function": "mmlu_harness",
410
- "hf_repo": "lighteval/mmlu",
411
- "hf_subset": "business_ethics",
412
- "metric": [
413
- "loglikelihood_acc"
414
- ],
415
- "hf_avail_splits": [
416
- "auxiliary_train",
417
- "test",
418
- "validation",
419
- "dev"
420
- ],
421
- "evaluation_splits": [
422
- "test"
423
- ],
424
- "few_shots_split": "dev",
425
- "few_shots_select": "sequential",
426
- "generation_size": 1,
427
- "stop_sequence": [
428
- "\n"
429
- ],
430
- "output_regex": null,
431
- "frozen": false,
432
- "suite": [
433
- "lighteval",
434
- "mmlu"
435
- ],
436
- "original_num_docs": 100,
437
- "effective_num_docs": 100
438
- },
439
- "lighteval|mmlu:clinical_knowledge": {
440
- "name": "mmlu:clinical_knowledge",
441
- "prompt_function": "mmlu_harness",
442
- "hf_repo": "lighteval/mmlu",
443
- "hf_subset": "clinical_knowledge",
444
- "metric": [
445
- "loglikelihood_acc"
446
- ],
447
- "hf_avail_splits": [
448
- "auxiliary_train",
449
- "test",
450
- "validation",
451
- "dev"
452
- ],
453
- "evaluation_splits": [
454
- "test"
455
- ],
456
- "few_shots_split": "dev",
457
- "few_shots_select": "sequential",
458
- "generation_size": 1,
459
- "stop_sequence": [
460
- "\n"
461
- ],
462
- "output_regex": null,
463
- "frozen": false,
464
- "suite": [
465
- "lighteval",
466
- "mmlu"
467
- ],
468
- "original_num_docs": 265,
469
- "effective_num_docs": 265
470
- },
471
- "lighteval|mmlu:college_biology": {
472
- "name": "mmlu:college_biology",
473
- "prompt_function": "mmlu_harness",
474
- "hf_repo": "lighteval/mmlu",
475
- "hf_subset": "college_biology",
476
- "metric": [
477
- "loglikelihood_acc"
478
- ],
479
- "hf_avail_splits": [
480
- "auxiliary_train",
481
- "test",
482
- "validation",
483
- "dev"
484
- ],
485
- "evaluation_splits": [
486
- "test"
487
- ],
488
- "few_shots_split": "dev",
489
- "few_shots_select": "sequential",
490
- "generation_size": 1,
491
- "stop_sequence": [
492
- "\n"
493
- ],
494
- "output_regex": null,
495
- "frozen": false,
496
- "suite": [
497
- "lighteval",
498
- "mmlu"
499
- ],
500
- "original_num_docs": 144,
501
- "effective_num_docs": 144
502
- },
503
- "lighteval|mmlu:college_chemistry": {
504
- "name": "mmlu:college_chemistry",
505
- "prompt_function": "mmlu_harness",
506
- "hf_repo": "lighteval/mmlu",
507
- "hf_subset": "college_chemistry",
508
- "metric": [
509
- "loglikelihood_acc"
510
- ],
511
- "hf_avail_splits": [
512
- "auxiliary_train",
513
- "test",
514
- "validation",
515
- "dev"
516
- ],
517
- "evaluation_splits": [
518
- "test"
519
- ],
520
- "few_shots_split": "dev",
521
- "few_shots_select": "sequential",
522
- "generation_size": 1,
523
- "stop_sequence": [
524
- "\n"
525
- ],
526
- "output_regex": null,
527
- "frozen": false,
528
- "suite": [
529
- "lighteval",
530
- "mmlu"
531
- ],
532
- "original_num_docs": 100,
533
- "effective_num_docs": 100
534
- },
535
- "lighteval|mmlu:college_computer_science": {
536
- "name": "mmlu:college_computer_science",
537
- "prompt_function": "mmlu_harness",
538
- "hf_repo": "lighteval/mmlu",
539
- "hf_subset": "college_computer_science",
540
- "metric": [
541
- "loglikelihood_acc"
542
- ],
543
- "hf_avail_splits": [
544
- "auxiliary_train",
545
- "test",
546
- "validation",
547
- "dev"
548
- ],
549
- "evaluation_splits": [
550
- "test"
551
- ],
552
- "few_shots_split": "dev",
553
- "few_shots_select": "sequential",
554
- "generation_size": 1,
555
- "stop_sequence": [
556
- "\n"
557
- ],
558
- "output_regex": null,
559
- "frozen": false,
560
- "suite": [
561
- "lighteval",
562
- "mmlu"
563
- ],
564
- "original_num_docs": 100,
565
- "effective_num_docs": 100
566
- },
567
- "lighteval|mmlu:college_mathematics": {
568
- "name": "mmlu:college_mathematics",
569
- "prompt_function": "mmlu_harness",
570
- "hf_repo": "lighteval/mmlu",
571
- "hf_subset": "college_mathematics",
572
- "metric": [
573
- "loglikelihood_acc"
574
- ],
575
- "hf_avail_splits": [
576
- "auxiliary_train",
577
- "test",
578
- "validation",
579
- "dev"
580
- ],
581
- "evaluation_splits": [
582
- "test"
583
- ],
584
- "few_shots_split": "dev",
585
- "few_shots_select": "sequential",
586
- "generation_size": 1,
587
- "stop_sequence": [
588
- "\n"
589
- ],
590
- "output_regex": null,
591
- "frozen": false,
592
- "suite": [
593
- "lighteval",
594
- "mmlu"
595
- ],
596
- "original_num_docs": 100,
597
- "effective_num_docs": 100
598
- },
599
- "lighteval|mmlu:college_medicine": {
600
- "name": "mmlu:college_medicine",
601
- "prompt_function": "mmlu_harness",
602
- "hf_repo": "lighteval/mmlu",
603
- "hf_subset": "college_medicine",
604
- "metric": [
605
- "loglikelihood_acc"
606
- ],
607
- "hf_avail_splits": [
608
- "auxiliary_train",
609
- "test",
610
- "validation",
611
- "dev"
612
- ],
613
- "evaluation_splits": [
614
- "test"
615
- ],
616
- "few_shots_split": "dev",
617
- "few_shots_select": "sequential",
618
- "generation_size": 1,
619
- "stop_sequence": [
620
- "\n"
621
- ],
622
- "output_regex": null,
623
- "frozen": false,
624
- "suite": [
625
- "lighteval",
626
- "mmlu"
627
- ],
628
- "original_num_docs": 173,
629
- "effective_num_docs": 173
630
- },
631
- "lighteval|mmlu:college_physics": {
632
- "name": "mmlu:college_physics",
633
- "prompt_function": "mmlu_harness",
634
- "hf_repo": "lighteval/mmlu",
635
- "hf_subset": "college_physics",
636
- "metric": [
637
- "loglikelihood_acc"
638
- ],
639
- "hf_avail_splits": [
640
- "auxiliary_train",
641
- "test",
642
- "validation",
643
- "dev"
644
- ],
645
- "evaluation_splits": [
646
- "test"
647
- ],
648
- "few_shots_split": "dev",
649
- "few_shots_select": "sequential",
650
- "generation_size": 1,
651
- "stop_sequence": [
652
- "\n"
653
- ],
654
- "output_regex": null,
655
- "frozen": false,
656
- "suite": [
657
- "lighteval",
658
- "mmlu"
659
- ],
660
- "original_num_docs": 102,
661
- "effective_num_docs": 102
662
- },
663
- "lighteval|mmlu:computer_security": {
664
- "name": "mmlu:computer_security",
665
- "prompt_function": "mmlu_harness",
666
- "hf_repo": "lighteval/mmlu",
667
- "hf_subset": "computer_security",
668
- "metric": [
669
- "loglikelihood_acc"
670
- ],
671
- "hf_avail_splits": [
672
- "auxiliary_train",
673
- "test",
674
- "validation",
675
- "dev"
676
- ],
677
- "evaluation_splits": [
678
- "test"
679
- ],
680
- "few_shots_split": "dev",
681
- "few_shots_select": "sequential",
682
- "generation_size": 1,
683
- "stop_sequence": [
684
- "\n"
685
- ],
686
- "output_regex": null,
687
- "frozen": false,
688
- "suite": [
689
- "lighteval",
690
- "mmlu"
691
- ],
692
- "original_num_docs": 100,
693
- "effective_num_docs": 100
694
- },
695
- "lighteval|mmlu:conceptual_physics": {
696
- "name": "mmlu:conceptual_physics",
697
- "prompt_function": "mmlu_harness",
698
- "hf_repo": "lighteval/mmlu",
699
- "hf_subset": "conceptual_physics",
700
- "metric": [
701
- "loglikelihood_acc"
702
- ],
703
- "hf_avail_splits": [
704
- "auxiliary_train",
705
- "test",
706
- "validation",
707
- "dev"
708
- ],
709
- "evaluation_splits": [
710
- "test"
711
- ],
712
- "few_shots_split": "dev",
713
- "few_shots_select": "sequential",
714
- "generation_size": 1,
715
- "stop_sequence": [
716
- "\n"
717
- ],
718
- "output_regex": null,
719
- "frozen": false,
720
- "suite": [
721
- "lighteval",
722
- "mmlu"
723
- ],
724
- "original_num_docs": 235,
725
- "effective_num_docs": 235
726
- },
727
- "lighteval|mmlu:econometrics": {
728
- "name": "mmlu:econometrics",
729
- "prompt_function": "mmlu_harness",
730
- "hf_repo": "lighteval/mmlu",
731
- "hf_subset": "econometrics",
732
- "metric": [
733
- "loglikelihood_acc"
734
- ],
735
- "hf_avail_splits": [
736
- "auxiliary_train",
737
- "test",
738
- "validation",
739
- "dev"
740
- ],
741
- "evaluation_splits": [
742
- "test"
743
- ],
744
- "few_shots_split": "dev",
745
- "few_shots_select": "sequential",
746
- "generation_size": 1,
747
- "stop_sequence": [
748
- "\n"
749
- ],
750
- "output_regex": null,
751
- "frozen": false,
752
- "suite": [
753
- "lighteval",
754
- "mmlu"
755
- ],
756
- "original_num_docs": 114,
757
- "effective_num_docs": 114
758
- },
759
- "lighteval|mmlu:electrical_engineering": {
760
- "name": "mmlu:electrical_engineering",
761
- "prompt_function": "mmlu_harness",
762
- "hf_repo": "lighteval/mmlu",
763
- "hf_subset": "electrical_engineering",
764
- "metric": [
765
- "loglikelihood_acc"
766
- ],
767
- "hf_avail_splits": [
768
- "auxiliary_train",
769
- "test",
770
- "validation",
771
- "dev"
772
- ],
773
- "evaluation_splits": [
774
- "test"
775
- ],
776
- "few_shots_split": "dev",
777
- "few_shots_select": "sequential",
778
- "generation_size": 1,
779
- "stop_sequence": [
780
- "\n"
781
- ],
782
- "output_regex": null,
783
- "frozen": false,
784
- "suite": [
785
- "lighteval",
786
- "mmlu"
787
- ],
788
- "original_num_docs": 145,
789
- "effective_num_docs": 145
790
- },
791
- "lighteval|mmlu:elementary_mathematics": {
792
- "name": "mmlu:elementary_mathematics",
793
- "prompt_function": "mmlu_harness",
794
- "hf_repo": "lighteval/mmlu",
795
- "hf_subset": "elementary_mathematics",
796
- "metric": [
797
- "loglikelihood_acc"
798
- ],
799
- "hf_avail_splits": [
800
- "auxiliary_train",
801
- "test",
802
- "validation",
803
- "dev"
804
- ],
805
- "evaluation_splits": [
806
- "test"
807
- ],
808
- "few_shots_split": "dev",
809
- "few_shots_select": "sequential",
810
- "generation_size": 1,
811
- "stop_sequence": [
812
- "\n"
813
- ],
814
- "output_regex": null,
815
- "frozen": false,
816
- "suite": [
817
- "lighteval",
818
- "mmlu"
819
- ],
820
- "original_num_docs": 378,
821
- "effective_num_docs": 378
822
- },
823
- "lighteval|mmlu:formal_logic": {
824
- "name": "mmlu:formal_logic",
825
- "prompt_function": "mmlu_harness",
826
- "hf_repo": "lighteval/mmlu",
827
- "hf_subset": "formal_logic",
828
- "metric": [
829
- "loglikelihood_acc"
830
- ],
831
- "hf_avail_splits": [
832
- "auxiliary_train",
833
- "test",
834
- "validation",
835
- "dev"
836
- ],
837
- "evaluation_splits": [
838
- "test"
839
- ],
840
- "few_shots_split": "dev",
841
- "few_shots_select": "sequential",
842
- "generation_size": 1,
843
- "stop_sequence": [
844
- "\n"
845
- ],
846
- "output_regex": null,
847
- "frozen": false,
848
- "suite": [
849
- "lighteval",
850
- "mmlu"
851
- ],
852
- "original_num_docs": 126,
853
- "effective_num_docs": 126
854
- },
855
- "lighteval|mmlu:global_facts": {
856
- "name": "mmlu:global_facts",
857
- "prompt_function": "mmlu_harness",
858
- "hf_repo": "lighteval/mmlu",
859
- "hf_subset": "global_facts",
860
- "metric": [
861
- "loglikelihood_acc"
862
- ],
863
- "hf_avail_splits": [
864
- "auxiliary_train",
865
- "test",
866
- "validation",
867
- "dev"
868
- ],
869
- "evaluation_splits": [
870
- "test"
871
- ],
872
- "few_shots_split": "dev",
873
- "few_shots_select": "sequential",
874
- "generation_size": 1,
875
- "stop_sequence": [
876
- "\n"
877
- ],
878
- "output_regex": null,
879
- "frozen": false,
880
- "suite": [
881
- "lighteval",
882
- "mmlu"
883
- ],
884
- "original_num_docs": 100,
885
- "effective_num_docs": 100
886
- },
887
- "lighteval|mmlu:high_school_biology": {
888
- "name": "mmlu:high_school_biology",
889
- "prompt_function": "mmlu_harness",
890
- "hf_repo": "lighteval/mmlu",
891
- "hf_subset": "high_school_biology",
892
- "metric": [
893
- "loglikelihood_acc"
894
- ],
895
- "hf_avail_splits": [
896
- "auxiliary_train",
897
- "test",
898
- "validation",
899
- "dev"
900
- ],
901
- "evaluation_splits": [
902
- "test"
903
- ],
904
- "few_shots_split": "dev",
905
- "few_shots_select": "sequential",
906
- "generation_size": 1,
907
- "stop_sequence": [
908
- "\n"
909
- ],
910
- "output_regex": null,
911
- "frozen": false,
912
- "suite": [
913
- "lighteval",
914
- "mmlu"
915
- ],
916
- "original_num_docs": 310,
917
- "effective_num_docs": 310
918
- },
919
- "lighteval|mmlu:high_school_chemistry": {
920
- "name": "mmlu:high_school_chemistry",
921
- "prompt_function": "mmlu_harness",
922
- "hf_repo": "lighteval/mmlu",
923
- "hf_subset": "high_school_chemistry",
924
- "metric": [
925
- "loglikelihood_acc"
926
- ],
927
- "hf_avail_splits": [
928
- "auxiliary_train",
929
- "test",
930
- "validation",
931
- "dev"
932
- ],
933
- "evaluation_splits": [
934
- "test"
935
- ],
936
- "few_shots_split": "dev",
937
- "few_shots_select": "sequential",
938
- "generation_size": 1,
939
- "stop_sequence": [
940
- "\n"
941
- ],
942
- "output_regex": null,
943
- "frozen": false,
944
- "suite": [
945
- "lighteval",
946
- "mmlu"
947
- ],
948
- "original_num_docs": 203,
949
- "effective_num_docs": 203
950
- },
951
- "lighteval|mmlu:high_school_computer_science": {
952
- "name": "mmlu:high_school_computer_science",
953
- "prompt_function": "mmlu_harness",
954
- "hf_repo": "lighteval/mmlu",
955
- "hf_subset": "high_school_computer_science",
956
- "metric": [
957
- "loglikelihood_acc"
958
- ],
959
- "hf_avail_splits": [
960
- "auxiliary_train",
961
- "test",
962
- "validation",
963
- "dev"
964
- ],
965
- "evaluation_splits": [
966
- "test"
967
- ],
968
- "few_shots_split": "dev",
969
- "few_shots_select": "sequential",
970
- "generation_size": 1,
971
- "stop_sequence": [
972
- "\n"
973
- ],
974
- "output_regex": null,
975
- "frozen": false,
976
- "suite": [
977
- "lighteval",
978
- "mmlu"
979
- ],
980
- "original_num_docs": 100,
981
- "effective_num_docs": 100
982
- },
983
- "lighteval|mmlu:high_school_european_history": {
984
- "name": "mmlu:high_school_european_history",
985
- "prompt_function": "mmlu_harness",
986
- "hf_repo": "lighteval/mmlu",
987
- "hf_subset": "high_school_european_history",
988
- "metric": [
989
- "loglikelihood_acc"
990
- ],
991
- "hf_avail_splits": [
992
- "auxiliary_train",
993
- "test",
994
- "validation",
995
- "dev"
996
- ],
997
- "evaluation_splits": [
998
- "test"
999
- ],
1000
- "few_shots_split": "dev",
1001
- "few_shots_select": "sequential",
1002
- "generation_size": 1,
1003
- "stop_sequence": [
1004
- "\n"
1005
- ],
1006
- "output_regex": null,
1007
- "frozen": false,
1008
- "suite": [
1009
- "lighteval",
1010
- "mmlu"
1011
- ],
1012
- "original_num_docs": 165,
1013
- "effective_num_docs": 165
1014
- },
1015
- "lighteval|mmlu:high_school_geography": {
1016
- "name": "mmlu:high_school_geography",
1017
- "prompt_function": "mmlu_harness",
1018
- "hf_repo": "lighteval/mmlu",
1019
- "hf_subset": "high_school_geography",
1020
- "metric": [
1021
- "loglikelihood_acc"
1022
- ],
1023
- "hf_avail_splits": [
1024
- "auxiliary_train",
1025
- "test",
1026
- "validation",
1027
- "dev"
1028
- ],
1029
- "evaluation_splits": [
1030
- "test"
1031
- ],
1032
- "few_shots_split": "dev",
1033
- "few_shots_select": "sequential",
1034
- "generation_size": 1,
1035
- "stop_sequence": [
1036
- "\n"
1037
- ],
1038
- "output_regex": null,
1039
- "frozen": false,
1040
- "suite": [
1041
- "lighteval",
1042
- "mmlu"
1043
- ],
1044
- "original_num_docs": 198,
1045
- "effective_num_docs": 198
1046
- },
1047
- "lighteval|mmlu:high_school_government_and_politics": {
1048
- "name": "mmlu:high_school_government_and_politics",
1049
- "prompt_function": "mmlu_harness",
1050
- "hf_repo": "lighteval/mmlu",
1051
- "hf_subset": "high_school_government_and_politics",
1052
- "metric": [
1053
- "loglikelihood_acc"
1054
- ],
1055
- "hf_avail_splits": [
1056
- "auxiliary_train",
1057
- "test",
1058
- "validation",
1059
- "dev"
1060
- ],
1061
- "evaluation_splits": [
1062
- "test"
1063
- ],
1064
- "few_shots_split": "dev",
1065
- "few_shots_select": "sequential",
1066
- "generation_size": 1,
1067
- "stop_sequence": [
1068
- "\n"
1069
- ],
1070
- "output_regex": null,
1071
- "frozen": false,
1072
- "suite": [
1073
- "lighteval",
1074
- "mmlu"
1075
- ],
1076
- "original_num_docs": 193,
1077
- "effective_num_docs": 193
1078
- },
1079
- "lighteval|mmlu:high_school_macroeconomics": {
1080
- "name": "mmlu:high_school_macroeconomics",
1081
- "prompt_function": "mmlu_harness",
1082
- "hf_repo": "lighteval/mmlu",
1083
- "hf_subset": "high_school_macroeconomics",
1084
- "metric": [
1085
- "loglikelihood_acc"
1086
- ],
1087
- "hf_avail_splits": [
1088
- "auxiliary_train",
1089
- "test",
1090
- "validation",
1091
- "dev"
1092
- ],
1093
- "evaluation_splits": [
1094
- "test"
1095
- ],
1096
- "few_shots_split": "dev",
1097
- "few_shots_select": "sequential",
1098
- "generation_size": 1,
1099
- "stop_sequence": [
1100
- "\n"
1101
- ],
1102
- "output_regex": null,
1103
- "frozen": false,
1104
- "suite": [
1105
- "lighteval",
1106
- "mmlu"
1107
- ],
1108
- "original_num_docs": 390,
1109
- "effective_num_docs": 390
1110
- },
1111
- "lighteval|mmlu:high_school_mathematics": {
1112
- "name": "mmlu:high_school_mathematics",
1113
- "prompt_function": "mmlu_harness",
1114
- "hf_repo": "lighteval/mmlu",
1115
- "hf_subset": "high_school_mathematics",
1116
- "metric": [
1117
- "loglikelihood_acc"
1118
- ],
1119
- "hf_avail_splits": [
1120
- "auxiliary_train",
1121
- "test",
1122
- "validation",
1123
- "dev"
1124
- ],
1125
- "evaluation_splits": [
1126
- "test"
1127
- ],
1128
- "few_shots_split": "dev",
1129
- "few_shots_select": "sequential",
1130
- "generation_size": 1,
1131
- "stop_sequence": [
1132
- "\n"
1133
- ],
1134
- "output_regex": null,
1135
- "frozen": false,
1136
- "suite": [
1137
- "lighteval",
1138
- "mmlu"
1139
- ],
1140
- "original_num_docs": 270,
1141
- "effective_num_docs": 270
1142
- },
1143
- "lighteval|mmlu:high_school_microeconomics": {
1144
- "name": "mmlu:high_school_microeconomics",
1145
- "prompt_function": "mmlu_harness",
1146
- "hf_repo": "lighteval/mmlu",
1147
- "hf_subset": "high_school_microeconomics",
1148
- "metric": [
1149
- "loglikelihood_acc"
1150
- ],
1151
- "hf_avail_splits": [
1152
- "auxiliary_train",
1153
- "test",
1154
- "validation",
1155
- "dev"
1156
- ],
1157
- "evaluation_splits": [
1158
- "test"
1159
- ],
1160
- "few_shots_split": "dev",
1161
- "few_shots_select": "sequential",
1162
- "generation_size": 1,
1163
- "stop_sequence": [
1164
- "\n"
1165
- ],
1166
- "output_regex": null,
1167
- "frozen": false,
1168
- "suite": [
1169
- "lighteval",
1170
- "mmlu"
1171
- ],
1172
- "original_num_docs": 238,
1173
- "effective_num_docs": 238
1174
- },
1175
- "lighteval|mmlu:high_school_physics": {
1176
- "name": "mmlu:high_school_physics",
1177
- "prompt_function": "mmlu_harness",
1178
- "hf_repo": "lighteval/mmlu",
1179
- "hf_subset": "high_school_physics",
1180
- "metric": [
1181
- "loglikelihood_acc"
1182
- ],
1183
- "hf_avail_splits": [
1184
- "auxiliary_train",
1185
- "test",
1186
- "validation",
1187
- "dev"
1188
- ],
1189
- "evaluation_splits": [
1190
- "test"
1191
- ],
1192
- "few_shots_split": "dev",
1193
- "few_shots_select": "sequential",
1194
- "generation_size": 1,
1195
- "stop_sequence": [
1196
- "\n"
1197
- ],
1198
- "output_regex": null,
1199
- "frozen": false,
1200
- "suite": [
1201
- "lighteval",
1202
- "mmlu"
1203
- ],
1204
- "original_num_docs": 151,
1205
- "effective_num_docs": 151
1206
- },
1207
- "lighteval|mmlu:high_school_psychology": {
1208
- "name": "mmlu:high_school_psychology",
1209
- "prompt_function": "mmlu_harness",
1210
- "hf_repo": "lighteval/mmlu",
1211
- "hf_subset": "high_school_psychology",
1212
- "metric": [
1213
- "loglikelihood_acc"
1214
- ],
1215
- "hf_avail_splits": [
1216
- "auxiliary_train",
1217
- "test",
1218
- "validation",
1219
- "dev"
1220
- ],
1221
- "evaluation_splits": [
1222
- "test"
1223
- ],
1224
- "few_shots_split": "dev",
1225
- "few_shots_select": "sequential",
1226
- "generation_size": 1,
1227
- "stop_sequence": [
1228
- "\n"
1229
- ],
1230
- "output_regex": null,
1231
- "frozen": false,
1232
- "suite": [
1233
- "lighteval",
1234
- "mmlu"
1235
- ],
1236
- "original_num_docs": 545,
1237
- "effective_num_docs": 545
1238
- },
1239
- "lighteval|mmlu:high_school_statistics": {
1240
- "name": "mmlu:high_school_statistics",
1241
- "prompt_function": "mmlu_harness",
1242
- "hf_repo": "lighteval/mmlu",
1243
- "hf_subset": "high_school_statistics",
1244
- "metric": [
1245
- "loglikelihood_acc"
1246
- ],
1247
- "hf_avail_splits": [
1248
- "auxiliary_train",
1249
- "test",
1250
- "validation",
1251
- "dev"
1252
- ],
1253
- "evaluation_splits": [
1254
- "test"
1255
- ],
1256
- "few_shots_split": "dev",
1257
- "few_shots_select": "sequential",
1258
- "generation_size": 1,
1259
- "stop_sequence": [
1260
- "\n"
1261
- ],
1262
- "output_regex": null,
1263
- "frozen": false,
1264
- "suite": [
1265
- "lighteval",
1266
- "mmlu"
1267
- ],
1268
- "original_num_docs": 216,
1269
- "effective_num_docs": 216
1270
- },
1271
- "lighteval|mmlu:high_school_us_history": {
1272
- "name": "mmlu:high_school_us_history",
1273
- "prompt_function": "mmlu_harness",
1274
- "hf_repo": "lighteval/mmlu",
1275
- "hf_subset": "high_school_us_history",
1276
- "metric": [
1277
- "loglikelihood_acc"
1278
- ],
1279
- "hf_avail_splits": [
1280
- "auxiliary_train",
1281
- "test",
1282
- "validation",
1283
- "dev"
1284
- ],
1285
- "evaluation_splits": [
1286
- "test"
1287
- ],
1288
- "few_shots_split": "dev",
1289
- "few_shots_select": "sequential",
1290
- "generation_size": 1,
1291
- "stop_sequence": [
1292
- "\n"
1293
- ],
1294
- "output_regex": null,
1295
- "frozen": false,
1296
- "suite": [
1297
- "lighteval",
1298
- "mmlu"
1299
- ],
1300
- "original_num_docs": 204,
1301
- "effective_num_docs": 204
1302
- },
1303
- "lighteval|mmlu:high_school_world_history": {
1304
- "name": "mmlu:high_school_world_history",
1305
- "prompt_function": "mmlu_harness",
1306
- "hf_repo": "lighteval/mmlu",
1307
- "hf_subset": "high_school_world_history",
1308
- "metric": [
1309
- "loglikelihood_acc"
1310
- ],
1311
- "hf_avail_splits": [
1312
- "auxiliary_train",
1313
- "test",
1314
- "validation",
1315
- "dev"
1316
- ],
1317
- "evaluation_splits": [
1318
- "test"
1319
- ],
1320
- "few_shots_split": "dev",
1321
- "few_shots_select": "sequential",
1322
- "generation_size": 1,
1323
- "stop_sequence": [
1324
- "\n"
1325
- ],
1326
- "output_regex": null,
1327
- "frozen": false,
1328
- "suite": [
1329
- "lighteval",
1330
- "mmlu"
1331
- ],
1332
- "original_num_docs": 237,
1333
- "effective_num_docs": 237
1334
- },
1335
- "lighteval|mmlu:human_aging": {
1336
- "name": "mmlu:human_aging",
1337
- "prompt_function": "mmlu_harness",
1338
- "hf_repo": "lighteval/mmlu",
1339
- "hf_subset": "human_aging",
1340
- "metric": [
1341
- "loglikelihood_acc"
1342
- ],
1343
- "hf_avail_splits": [
1344
- "auxiliary_train",
1345
- "test",
1346
- "validation",
1347
- "dev"
1348
- ],
1349
- "evaluation_splits": [
1350
- "test"
1351
- ],
1352
- "few_shots_split": "dev",
1353
- "few_shots_select": "sequential",
1354
- "generation_size": 1,
1355
- "stop_sequence": [
1356
- "\n"
1357
- ],
1358
- "output_regex": null,
1359
- "frozen": false,
1360
- "suite": [
1361
- "lighteval",
1362
- "mmlu"
1363
- ],
1364
- "original_num_docs": 223,
1365
- "effective_num_docs": 223
1366
- },
1367
- "lighteval|mmlu:human_sexuality": {
1368
- "name": "mmlu:human_sexuality",
1369
- "prompt_function": "mmlu_harness",
1370
- "hf_repo": "lighteval/mmlu",
1371
- "hf_subset": "human_sexuality",
1372
- "metric": [
1373
- "loglikelihood_acc"
1374
- ],
1375
- "hf_avail_splits": [
1376
- "auxiliary_train",
1377
- "test",
1378
- "validation",
1379
- "dev"
1380
- ],
1381
- "evaluation_splits": [
1382
- "test"
1383
- ],
1384
- "few_shots_split": "dev",
1385
- "few_shots_select": "sequential",
1386
- "generation_size": 1,
1387
- "stop_sequence": [
1388
- "\n"
1389
- ],
1390
- "output_regex": null,
1391
- "frozen": false,
1392
- "suite": [
1393
- "lighteval",
1394
- "mmlu"
1395
- ],
1396
- "original_num_docs": 131,
1397
- "effective_num_docs": 131
1398
- },
1399
- "lighteval|mmlu:international_law": {
1400
- "name": "mmlu:international_law",
1401
- "prompt_function": "mmlu_harness",
1402
- "hf_repo": "lighteval/mmlu",
1403
- "hf_subset": "international_law",
1404
- "metric": [
1405
- "loglikelihood_acc"
1406
- ],
1407
- "hf_avail_splits": [
1408
- "auxiliary_train",
1409
- "test",
1410
- "validation",
1411
- "dev"
1412
- ],
1413
- "evaluation_splits": [
1414
- "test"
1415
- ],
1416
- "few_shots_split": "dev",
1417
- "few_shots_select": "sequential",
1418
- "generation_size": 1,
1419
- "stop_sequence": [
1420
- "\n"
1421
- ],
1422
- "output_regex": null,
1423
- "frozen": false,
1424
- "suite": [
1425
- "lighteval",
1426
- "mmlu"
1427
- ],
1428
- "original_num_docs": 121,
1429
- "effective_num_docs": 121
1430
- },
1431
- "lighteval|mmlu:jurisprudence": {
1432
- "name": "mmlu:jurisprudence",
1433
- "prompt_function": "mmlu_harness",
1434
- "hf_repo": "lighteval/mmlu",
1435
- "hf_subset": "jurisprudence",
1436
- "metric": [
1437
- "loglikelihood_acc"
1438
- ],
1439
- "hf_avail_splits": [
1440
- "auxiliary_train",
1441
- "test",
1442
- "validation",
1443
- "dev"
1444
- ],
1445
- "evaluation_splits": [
1446
- "test"
1447
- ],
1448
- "few_shots_split": "dev",
1449
- "few_shots_select": "sequential",
1450
- "generation_size": 1,
1451
- "stop_sequence": [
1452
- "\n"
1453
- ],
1454
- "output_regex": null,
1455
- "frozen": false,
1456
- "suite": [
1457
- "lighteval",
1458
- "mmlu"
1459
- ],
1460
- "original_num_docs": 108,
1461
- "effective_num_docs": 108
1462
- },
1463
- "lighteval|mmlu:logical_fallacies": {
1464
- "name": "mmlu:logical_fallacies",
1465
- "prompt_function": "mmlu_harness",
1466
- "hf_repo": "lighteval/mmlu",
1467
- "hf_subset": "logical_fallacies",
1468
- "metric": [
1469
- "loglikelihood_acc"
1470
- ],
1471
- "hf_avail_splits": [
1472
- "auxiliary_train",
1473
- "test",
1474
- "validation",
1475
- "dev"
1476
- ],
1477
- "evaluation_splits": [
1478
- "test"
1479
- ],
1480
- "few_shots_split": "dev",
1481
- "few_shots_select": "sequential",
1482
- "generation_size": 1,
1483
- "stop_sequence": [
1484
- "\n"
1485
- ],
1486
- "output_regex": null,
1487
- "frozen": false,
1488
- "suite": [
1489
- "lighteval",
1490
- "mmlu"
1491
- ],
1492
- "original_num_docs": 163,
1493
- "effective_num_docs": 163
1494
- },
1495
- "lighteval|mmlu:machine_learning": {
1496
- "name": "mmlu:machine_learning",
1497
- "prompt_function": "mmlu_harness",
1498
- "hf_repo": "lighteval/mmlu",
1499
- "hf_subset": "machine_learning",
1500
- "metric": [
1501
- "loglikelihood_acc"
1502
- ],
1503
- "hf_avail_splits": [
1504
- "auxiliary_train",
1505
- "test",
1506
- "validation",
1507
- "dev"
1508
- ],
1509
- "evaluation_splits": [
1510
- "test"
1511
- ],
1512
- "few_shots_split": "dev",
1513
- "few_shots_select": "sequential",
1514
- "generation_size": 1,
1515
- "stop_sequence": [
1516
- "\n"
1517
- ],
1518
- "output_regex": null,
1519
- "frozen": false,
1520
- "suite": [
1521
- "lighteval",
1522
- "mmlu"
1523
- ],
1524
- "original_num_docs": 112,
1525
- "effective_num_docs": 112
1526
- },
1527
- "lighteval|mmlu:management": {
1528
- "name": "mmlu:management",
1529
- "prompt_function": "mmlu_harness",
1530
- "hf_repo": "lighteval/mmlu",
1531
- "hf_subset": "management",
1532
- "metric": [
1533
- "loglikelihood_acc"
1534
- ],
1535
- "hf_avail_splits": [
1536
- "auxiliary_train",
1537
- "test",
1538
- "validation",
1539
- "dev"
1540
- ],
1541
- "evaluation_splits": [
1542
- "test"
1543
- ],
1544
- "few_shots_split": "dev",
1545
- "few_shots_select": "sequential",
1546
- "generation_size": 1,
1547
- "stop_sequence": [
1548
- "\n"
1549
- ],
1550
- "output_regex": null,
1551
- "frozen": false,
1552
- "suite": [
1553
- "lighteval",
1554
- "mmlu"
1555
- ],
1556
- "original_num_docs": 103,
1557
- "effective_num_docs": 103
1558
- },
1559
- "lighteval|mmlu:marketing": {
1560
- "name": "mmlu:marketing",
1561
- "prompt_function": "mmlu_harness",
1562
- "hf_repo": "lighteval/mmlu",
1563
- "hf_subset": "marketing",
1564
- "metric": [
1565
- "loglikelihood_acc"
1566
- ],
1567
- "hf_avail_splits": [
1568
- "auxiliary_train",
1569
- "test",
1570
- "validation",
1571
- "dev"
1572
- ],
1573
- "evaluation_splits": [
1574
- "test"
1575
- ],
1576
- "few_shots_split": "dev",
1577
- "few_shots_select": "sequential",
1578
- "generation_size": 1,
1579
- "stop_sequence": [
1580
- "\n"
1581
- ],
1582
- "output_regex": null,
1583
- "frozen": false,
1584
- "suite": [
1585
- "lighteval",
1586
- "mmlu"
1587
- ],
1588
- "original_num_docs": 234,
1589
- "effective_num_docs": 234
1590
- },
1591
- "lighteval|mmlu:medical_genetics": {
1592
- "name": "mmlu:medical_genetics",
1593
- "prompt_function": "mmlu_harness",
1594
- "hf_repo": "lighteval/mmlu",
1595
- "hf_subset": "medical_genetics",
1596
- "metric": [
1597
- "loglikelihood_acc"
1598
- ],
1599
- "hf_avail_splits": [
1600
- "auxiliary_train",
1601
- "test",
1602
- "validation",
1603
- "dev"
1604
- ],
1605
- "evaluation_splits": [
1606
- "test"
1607
- ],
1608
- "few_shots_split": "dev",
1609
- "few_shots_select": "sequential",
1610
- "generation_size": 1,
1611
- "stop_sequence": [
1612
- "\n"
1613
- ],
1614
- "output_regex": null,
1615
- "frozen": false,
1616
- "suite": [
1617
- "lighteval",
1618
- "mmlu"
1619
- ],
1620
- "original_num_docs": 100,
1621
- "effective_num_docs": 100
1622
- },
1623
- "lighteval|mmlu:miscellaneous": {
1624
- "name": "mmlu:miscellaneous",
1625
- "prompt_function": "mmlu_harness",
1626
- "hf_repo": "lighteval/mmlu",
1627
- "hf_subset": "miscellaneous",
1628
- "metric": [
1629
- "loglikelihood_acc"
1630
- ],
1631
- "hf_avail_splits": [
1632
- "auxiliary_train",
1633
- "test",
1634
- "validation",
1635
- "dev"
1636
- ],
1637
- "evaluation_splits": [
1638
- "test"
1639
- ],
1640
- "few_shots_split": "dev",
1641
- "few_shots_select": "sequential",
1642
- "generation_size": 1,
1643
- "stop_sequence": [
1644
- "\n"
1645
- ],
1646
- "output_regex": null,
1647
- "frozen": false,
1648
- "suite": [
1649
- "lighteval",
1650
- "mmlu"
1651
- ],
1652
- "original_num_docs": 783,
1653
- "effective_num_docs": 783
1654
- },
1655
- "lighteval|mmlu:moral_disputes": {
1656
- "name": "mmlu:moral_disputes",
1657
- "prompt_function": "mmlu_harness",
1658
- "hf_repo": "lighteval/mmlu",
1659
- "hf_subset": "moral_disputes",
1660
- "metric": [
1661
- "loglikelihood_acc"
1662
- ],
1663
- "hf_avail_splits": [
1664
- "auxiliary_train",
1665
- "test",
1666
- "validation",
1667
- "dev"
1668
- ],
1669
- "evaluation_splits": [
1670
- "test"
1671
- ],
1672
- "few_shots_split": "dev",
1673
- "few_shots_select": "sequential",
1674
- "generation_size": 1,
1675
- "stop_sequence": [
1676
- "\n"
1677
- ],
1678
- "output_regex": null,
1679
- "frozen": false,
1680
- "suite": [
1681
- "lighteval",
1682
- "mmlu"
1683
- ],
1684
- "original_num_docs": 346,
1685
- "effective_num_docs": 346
1686
- },
1687
- "lighteval|mmlu:moral_scenarios": {
1688
- "name": "mmlu:moral_scenarios",
1689
- "prompt_function": "mmlu_harness",
1690
- "hf_repo": "lighteval/mmlu",
1691
- "hf_subset": "moral_scenarios",
1692
- "metric": [
1693
- "loglikelihood_acc"
1694
- ],
1695
- "hf_avail_splits": [
1696
- "auxiliary_train",
1697
- "test",
1698
- "validation",
1699
- "dev"
1700
- ],
1701
- "evaluation_splits": [
1702
- "test"
1703
- ],
1704
- "few_shots_split": "dev",
1705
- "few_shots_select": "sequential",
1706
- "generation_size": 1,
1707
- "stop_sequence": [
1708
- "\n"
1709
- ],
1710
- "output_regex": null,
1711
- "frozen": false,
1712
- "suite": [
1713
- "lighteval",
1714
- "mmlu"
1715
- ],
1716
- "original_num_docs": 895,
1717
- "effective_num_docs": 895
1718
- },
1719
- "lighteval|mmlu:nutrition": {
1720
- "name": "mmlu:nutrition",
1721
- "prompt_function": "mmlu_harness",
1722
- "hf_repo": "lighteval/mmlu",
1723
- "hf_subset": "nutrition",
1724
- "metric": [
1725
- "loglikelihood_acc"
1726
- ],
1727
- "hf_avail_splits": [
1728
- "auxiliary_train",
1729
- "test",
1730
- "validation",
1731
- "dev"
1732
- ],
1733
- "evaluation_splits": [
1734
- "test"
1735
- ],
1736
- "few_shots_split": "dev",
1737
- "few_shots_select": "sequential",
1738
- "generation_size": 1,
1739
- "stop_sequence": [
1740
- "\n"
1741
- ],
1742
- "output_regex": null,
1743
- "frozen": false,
1744
- "suite": [
1745
- "lighteval",
1746
- "mmlu"
1747
- ],
1748
- "original_num_docs": 306,
1749
- "effective_num_docs": 306
1750
- },
1751
- "lighteval|mmlu:philosophy": {
1752
- "name": "mmlu:philosophy",
1753
- "prompt_function": "mmlu_harness",
1754
- "hf_repo": "lighteval/mmlu",
1755
- "hf_subset": "philosophy",
1756
- "metric": [
1757
- "loglikelihood_acc"
1758
- ],
1759
- "hf_avail_splits": [
1760
- "auxiliary_train",
1761
- "test",
1762
- "validation",
1763
- "dev"
1764
- ],
1765
- "evaluation_splits": [
1766
- "test"
1767
- ],
1768
- "few_shots_split": "dev",
1769
- "few_shots_select": "sequential",
1770
- "generation_size": 1,
1771
- "stop_sequence": [
1772
- "\n"
1773
- ],
1774
- "output_regex": null,
1775
- "frozen": false,
1776
- "suite": [
1777
- "lighteval",
1778
- "mmlu"
1779
- ],
1780
- "original_num_docs": 311,
1781
- "effective_num_docs": 311
1782
- },
1783
- "lighteval|mmlu:prehistory": {
1784
- "name": "mmlu:prehistory",
1785
- "prompt_function": "mmlu_harness",
1786
- "hf_repo": "lighteval/mmlu",
1787
- "hf_subset": "prehistory",
1788
- "metric": [
1789
- "loglikelihood_acc"
1790
- ],
1791
- "hf_avail_splits": [
1792
- "auxiliary_train",
1793
- "test",
1794
- "validation",
1795
- "dev"
1796
- ],
1797
- "evaluation_splits": [
1798
- "test"
1799
- ],
1800
- "few_shots_split": "dev",
1801
- "few_shots_select": "sequential",
1802
- "generation_size": 1,
1803
- "stop_sequence": [
1804
- "\n"
1805
- ],
1806
- "output_regex": null,
1807
- "frozen": false,
1808
- "suite": [
1809
- "lighteval",
1810
- "mmlu"
1811
- ],
1812
- "original_num_docs": 324,
1813
- "effective_num_docs": 324
1814
- },
1815
- "lighteval|mmlu:professional_accounting": {
1816
- "name": "mmlu:professional_accounting",
1817
- "prompt_function": "mmlu_harness",
1818
- "hf_repo": "lighteval/mmlu",
1819
- "hf_subset": "professional_accounting",
1820
- "metric": [
1821
- "loglikelihood_acc"
1822
- ],
1823
- "hf_avail_splits": [
1824
- "auxiliary_train",
1825
- "test",
1826
- "validation",
1827
- "dev"
1828
- ],
1829
- "evaluation_splits": [
1830
- "test"
1831
- ],
1832
- "few_shots_split": "dev",
1833
- "few_shots_select": "sequential",
1834
- "generation_size": 1,
1835
- "stop_sequence": [
1836
- "\n"
1837
- ],
1838
- "output_regex": null,
1839
- "frozen": false,
1840
- "suite": [
1841
- "lighteval",
1842
- "mmlu"
1843
- ],
1844
- "original_num_docs": 282,
1845
- "effective_num_docs": 282
1846
- },
1847
- "lighteval|mmlu:professional_law": {
1848
- "name": "mmlu:professional_law",
1849
- "prompt_function": "mmlu_harness",
1850
- "hf_repo": "lighteval/mmlu",
1851
- "hf_subset": "professional_law",
1852
- "metric": [
1853
- "loglikelihood_acc"
1854
- ],
1855
- "hf_avail_splits": [
1856
- "auxiliary_train",
1857
- "test",
1858
- "validation",
1859
- "dev"
1860
- ],
1861
- "evaluation_splits": [
1862
- "test"
1863
- ],
1864
- "few_shots_split": "dev",
1865
- "few_shots_select": "sequential",
1866
- "generation_size": 1,
1867
- "stop_sequence": [
1868
- "\n"
1869
- ],
1870
- "output_regex": null,
1871
- "frozen": false,
1872
- "suite": [
1873
- "lighteval",
1874
- "mmlu"
1875
- ],
1876
- "original_num_docs": 1534,
1877
- "effective_num_docs": 1534
1878
- },
1879
- "lighteval|mmlu:professional_medicine": {
1880
- "name": "mmlu:professional_medicine",
1881
- "prompt_function": "mmlu_harness",
1882
- "hf_repo": "lighteval/mmlu",
1883
- "hf_subset": "professional_medicine",
1884
- "metric": [
1885
- "loglikelihood_acc"
1886
- ],
1887
- "hf_avail_splits": [
1888
- "auxiliary_train",
1889
- "test",
1890
- "validation",
1891
- "dev"
1892
- ],
1893
- "evaluation_splits": [
1894
- "test"
1895
- ],
1896
- "few_shots_split": "dev",
1897
- "few_shots_select": "sequential",
1898
- "generation_size": 1,
1899
- "stop_sequence": [
1900
- "\n"
1901
- ],
1902
- "output_regex": null,
1903
- "frozen": false,
1904
- "suite": [
1905
- "lighteval",
1906
- "mmlu"
1907
- ],
1908
- "original_num_docs": 272,
1909
- "effective_num_docs": 272
1910
- },
1911
- "lighteval|mmlu:professional_psychology": {
1912
- "name": "mmlu:professional_psychology",
1913
- "prompt_function": "mmlu_harness",
1914
- "hf_repo": "lighteval/mmlu",
1915
- "hf_subset": "professional_psychology",
1916
- "metric": [
1917
- "loglikelihood_acc"
1918
- ],
1919
- "hf_avail_splits": [
1920
- "auxiliary_train",
1921
- "test",
1922
- "validation",
1923
- "dev"
1924
- ],
1925
- "evaluation_splits": [
1926
- "test"
1927
- ],
1928
- "few_shots_split": "dev",
1929
- "few_shots_select": "sequential",
1930
- "generation_size": 1,
1931
- "stop_sequence": [
1932
- "\n"
1933
- ],
1934
- "output_regex": null,
1935
- "frozen": false,
1936
- "suite": [
1937
- "lighteval",
1938
- "mmlu"
1939
- ],
1940
- "original_num_docs": 612,
1941
- "effective_num_docs": 612
1942
- },
1943
- "lighteval|mmlu:public_relations": {
1944
- "name": "mmlu:public_relations",
1945
- "prompt_function": "mmlu_harness",
1946
- "hf_repo": "lighteval/mmlu",
1947
- "hf_subset": "public_relations",
1948
- "metric": [
1949
- "loglikelihood_acc"
1950
- ],
1951
- "hf_avail_splits": [
1952
- "auxiliary_train",
1953
- "test",
1954
- "validation",
1955
- "dev"
1956
- ],
1957
- "evaluation_splits": [
1958
- "test"
1959
- ],
1960
- "few_shots_split": "dev",
1961
- "few_shots_select": "sequential",
1962
- "generation_size": 1,
1963
- "stop_sequence": [
1964
- "\n"
1965
- ],
1966
- "output_regex": null,
1967
- "frozen": false,
1968
- "suite": [
1969
- "lighteval",
1970
- "mmlu"
1971
- ],
1972
- "original_num_docs": 110,
1973
- "effective_num_docs": 110
1974
- },
1975
- "lighteval|mmlu:security_studies": {
1976
- "name": "mmlu:security_studies",
1977
- "prompt_function": "mmlu_harness",
1978
- "hf_repo": "lighteval/mmlu",
1979
- "hf_subset": "security_studies",
1980
- "metric": [
1981
- "loglikelihood_acc"
1982
- ],
1983
- "hf_avail_splits": [
1984
- "auxiliary_train",
1985
- "test",
1986
- "validation",
1987
- "dev"
1988
- ],
1989
- "evaluation_splits": [
1990
- "test"
1991
- ],
1992
- "few_shots_split": "dev",
1993
- "few_shots_select": "sequential",
1994
- "generation_size": 1,
1995
- "stop_sequence": [
1996
- "\n"
1997
- ],
1998
- "output_regex": null,
1999
- "frozen": false,
2000
- "suite": [
2001
- "lighteval",
2002
- "mmlu"
2003
- ],
2004
- "original_num_docs": 245,
2005
- "effective_num_docs": 245
2006
- },
2007
- "lighteval|mmlu:sociology": {
2008
- "name": "mmlu:sociology",
2009
- "prompt_function": "mmlu_harness",
2010
- "hf_repo": "lighteval/mmlu",
2011
- "hf_subset": "sociology",
2012
- "metric": [
2013
- "loglikelihood_acc"
2014
- ],
2015
- "hf_avail_splits": [
2016
- "auxiliary_train",
2017
- "test",
2018
- "validation",
2019
- "dev"
2020
- ],
2021
- "evaluation_splits": [
2022
- "test"
2023
- ],
2024
- "few_shots_split": "dev",
2025
- "few_shots_select": "sequential",
2026
- "generation_size": 1,
2027
- "stop_sequence": [
2028
- "\n"
2029
- ],
2030
- "output_regex": null,
2031
- "frozen": false,
2032
- "suite": [
2033
- "lighteval",
2034
- "mmlu"
2035
- ],
2036
- "original_num_docs": 201,
2037
- "effective_num_docs": 201
2038
- },
2039
- "lighteval|mmlu:us_foreign_policy": {
2040
- "name": "mmlu:us_foreign_policy",
2041
- "prompt_function": "mmlu_harness",
2042
- "hf_repo": "lighteval/mmlu",
2043
- "hf_subset": "us_foreign_policy",
2044
- "metric": [
2045
- "loglikelihood_acc"
2046
- ],
2047
- "hf_avail_splits": [
2048
- "auxiliary_train",
2049
- "test",
2050
- "validation",
2051
- "dev"
2052
- ],
2053
- "evaluation_splits": [
2054
- "test"
2055
- ],
2056
- "few_shots_split": "dev",
2057
- "few_shots_select": "sequential",
2058
- "generation_size": 1,
2059
- "stop_sequence": [
2060
- "\n"
2061
- ],
2062
- "output_regex": null,
2063
- "frozen": false,
2064
- "suite": [
2065
- "lighteval",
2066
- "mmlu"
2067
- ],
2068
- "original_num_docs": 100,
2069
- "effective_num_docs": 100
2070
- },
2071
- "lighteval|mmlu:virology": {
2072
- "name": "mmlu:virology",
2073
- "prompt_function": "mmlu_harness",
2074
- "hf_repo": "lighteval/mmlu",
2075
- "hf_subset": "virology",
2076
- "metric": [
2077
- "loglikelihood_acc"
2078
- ],
2079
- "hf_avail_splits": [
2080
- "auxiliary_train",
2081
- "test",
2082
- "validation",
2083
- "dev"
2084
- ],
2085
- "evaluation_splits": [
2086
- "test"
2087
- ],
2088
- "few_shots_split": "dev",
2089
- "few_shots_select": "sequential",
2090
- "generation_size": 1,
2091
- "stop_sequence": [
2092
- "\n"
2093
- ],
2094
- "output_regex": null,
2095
- "frozen": false,
2096
- "suite": [
2097
- "lighteval",
2098
- "mmlu"
2099
- ],
2100
- "original_num_docs": 166,
2101
- "effective_num_docs": 166
2102
- },
2103
- "lighteval|mmlu:world_religions": {
2104
- "name": "mmlu:world_religions",
2105
- "prompt_function": "mmlu_harness",
2106
- "hf_repo": "lighteval/mmlu",
2107
- "hf_subset": "world_religions",
2108
- "metric": [
2109
- "loglikelihood_acc"
2110
- ],
2111
- "hf_avail_splits": [
2112
- "auxiliary_train",
2113
- "test",
2114
- "validation",
2115
- "dev"
2116
- ],
2117
- "evaluation_splits": [
2118
- "test"
2119
- ],
2120
- "few_shots_split": "dev",
2121
- "few_shots_select": "sequential",
2122
- "generation_size": 1,
2123
- "stop_sequence": [
2124
- "\n"
2125
- ],
2126
- "output_regex": null,
2127
- "frozen": false,
2128
- "suite": [
2129
- "lighteval",
2130
- "mmlu"
2131
- ],
2132
- "original_num_docs": 171,
2133
- "effective_num_docs": 171
2134
- }
2135
- },
2136
- "summary_tasks": {
2137
- "lighteval|mmlu:abstract_algebra|5": {
2138
- "hashes": {
2139
- "hash_examples": "4c76229e00c9c0e9",
2140
- "hash_full_prompts": "a316d5f10f1c4fc3",
2141
- "hash_input_tokens": "b87e1cd51e4cdb89",
2142
- "hash_cont_tokens": "dadea1de19dee95c"
2143
- },
2144
- "truncated": 0,
2145
- "non_truncated": 100,
2146
- "padded": 400,
2147
- "non_padded": 0,
2148
- "effective_few_shots": 5.0,
2149
- "num_truncated_few_shots": 0
2150
- },
2151
- "lighteval|mmlu:anatomy|5": {
2152
- "hashes": {
2153
- "hash_examples": "6a1f8104dccbd33b",
2154
- "hash_full_prompts": "fa80e4331377b478",
2155
- "hash_input_tokens": "9393a535f481cfe5",
2156
- "hash_cont_tokens": "96c2bab19c75f48d"
2157
- },
2158
- "truncated": 0,
2159
- "non_truncated": 135,
2160
- "padded": 540,
2161
- "non_padded": 0,
2162
- "effective_few_shots": 5.0,
2163
- "num_truncated_few_shots": 0
2164
- },
2165
- "lighteval|mmlu:astronomy|5": {
2166
- "hashes": {
2167
- "hash_examples": "1302effa3a76ce4c",
2168
- "hash_full_prompts": "824f7fba40f07d6a",
2169
- "hash_input_tokens": "d23f1f749725c6ec",
2170
- "hash_cont_tokens": "6cc2d6fb43989c46"
2171
- },
2172
- "truncated": 0,
2173
- "non_truncated": 152,
2174
- "padded": 608,
2175
- "non_padded": 0,
2176
- "effective_few_shots": 5.0,
2177
- "num_truncated_few_shots": 0
2178
- },
2179
- "lighteval|mmlu:business_ethics|5": {
2180
- "hashes": {
2181
- "hash_examples": "03cb8bce5336419a",
2182
- "hash_full_prompts": "09edd202cc596692",
2183
- "hash_input_tokens": "71a09c30a05a6e7d",
2184
- "hash_cont_tokens": "dadea1de19dee95c"
2185
- },
2186
- "truncated": 0,
2187
- "non_truncated": 100,
2188
- "padded": 400,
2189
- "non_padded": 0,
2190
- "effective_few_shots": 5.0,
2191
- "num_truncated_few_shots": 0
2192
- },
2193
- "lighteval|mmlu:clinical_knowledge|5": {
2194
- "hashes": {
2195
- "hash_examples": "ffbb9c7b2be257f9",
2196
- "hash_full_prompts": "70cc39d220c7b400",
2197
- "hash_input_tokens": "8af194657c9b6943",
2198
- "hash_cont_tokens": "4566966a1e601b6c"
2199
- },
2200
- "truncated": 0,
2201
- "non_truncated": 265,
2202
- "padded": 1060,
2203
- "non_padded": 0,
2204
- "effective_few_shots": 5.0,
2205
- "num_truncated_few_shots": 0
2206
- },
2207
- "lighteval|mmlu:college_biology|5": {
2208
- "hashes": {
2209
- "hash_examples": "3ee77f176f38eb8e",
2210
- "hash_full_prompts": "a6d4737b00af78b6",
2211
- "hash_input_tokens": "73384e1f26f9af72",
2212
- "hash_cont_tokens": "4ea00cd7b2f74799"
2213
- },
2214
- "truncated": 0,
2215
- "non_truncated": 144,
2216
- "padded": 576,
2217
- "non_padded": 0,
2218
- "effective_few_shots": 5.0,
2219
- "num_truncated_few_shots": 0
2220
- },
2221
- "lighteval|mmlu:college_chemistry|5": {
2222
- "hashes": {
2223
- "hash_examples": "ce61a69c46d47aeb",
2224
- "hash_full_prompts": "ff6fb75d880e777a",
2225
- "hash_input_tokens": "438a46bf77df6cb9",
2226
- "hash_cont_tokens": "dadea1de19dee95c"
2227
- },
2228
- "truncated": 0,
2229
- "non_truncated": 100,
2230
- "padded": 400,
2231
- "non_padded": 0,
2232
- "effective_few_shots": 5.0,
2233
- "num_truncated_few_shots": 0
2234
- },
2235
- "lighteval|mmlu:college_computer_science|5": {
2236
- "hashes": {
2237
- "hash_examples": "32805b52d7d5daab",
2238
- "hash_full_prompts": "1a2b1bfdbbfc168c",
2239
- "hash_input_tokens": "6b479720c4df6881",
2240
- "hash_cont_tokens": "dadea1de19dee95c"
2241
- },
2242
- "truncated": 0,
2243
- "non_truncated": 100,
2244
- "padded": 400,
2245
- "non_padded": 0,
2246
- "effective_few_shots": 5.0,
2247
- "num_truncated_few_shots": 0
2248
- },
2249
- "lighteval|mmlu:college_mathematics|5": {
2250
- "hashes": {
2251
- "hash_examples": "55da1a0a0bd33722",
2252
- "hash_full_prompts": "6b940166b67e6a0c",
2253
- "hash_input_tokens": "061bf0b1df48d4ec",
2254
- "hash_cont_tokens": "dadea1de19dee95c"
2255
- },
2256
- "truncated": 0,
2257
- "non_truncated": 100,
2258
- "padded": 400,
2259
- "non_padded": 0,
2260
- "effective_few_shots": 5.0,
2261
- "num_truncated_few_shots": 0
2262
- },
2263
- "lighteval|mmlu:college_medicine|5": {
2264
- "hashes": {
2265
- "hash_examples": "c33e143163049176",
2266
- "hash_full_prompts": "950fedf0d751265d",
2267
- "hash_input_tokens": "bc4473ba239e2883",
2268
- "hash_cont_tokens": "aed3e7fd8adea27e"
2269
- },
2270
- "truncated": 0,
2271
- "non_truncated": 173,
2272
- "padded": 692,
2273
- "non_padded": 0,
2274
- "effective_few_shots": 5.0,
2275
- "num_truncated_few_shots": 0
2276
- },
2277
- "lighteval|mmlu:college_physics|5": {
2278
- "hashes": {
2279
- "hash_examples": "ebdab1cdb7e555df",
2280
- "hash_full_prompts": "2144c10eab705657",
2281
- "hash_input_tokens": "41d82f8a31d66df4",
2282
- "hash_cont_tokens": "1ca37bb9b8be1c5d"
2283
- },
2284
- "truncated": 0,
2285
- "non_truncated": 102,
2286
- "padded": 408,
2287
- "non_padded": 0,
2288
- "effective_few_shots": 5.0,
2289
- "num_truncated_few_shots": 0
2290
- },
2291
- "lighteval|mmlu:computer_security|5": {
2292
- "hashes": {
2293
- "hash_examples": "a24fd7d08a560921",
2294
- "hash_full_prompts": "f9164444c27c2eb3",
2295
- "hash_input_tokens": "c1fd0d621231fd85",
2296
- "hash_cont_tokens": "dadea1de19dee95c"
2297
- },
2298
- "truncated": 0,
2299
- "non_truncated": 100,
2300
- "padded": 400,
2301
- "non_padded": 0,
2302
- "effective_few_shots": 5.0,
2303
- "num_truncated_few_shots": 0
2304
- },
2305
- "lighteval|mmlu:conceptual_physics|5": {
2306
- "hashes": {
2307
- "hash_examples": "8300977a79386993",
2308
- "hash_full_prompts": "a76f787b57e7a885",
2309
- "hash_input_tokens": "af47247587837007",
2310
- "hash_cont_tokens": "26db9e6e7dfdac00"
2311
- },
2312
- "truncated": 0,
2313
- "non_truncated": 235,
2314
- "padded": 940,
2315
- "non_padded": 0,
2316
- "effective_few_shots": 5.0,
2317
- "num_truncated_few_shots": 0
2318
- },
2319
- "lighteval|mmlu:econometrics|5": {
2320
- "hashes": {
2321
- "hash_examples": "ddde36788a04a46f",
2322
- "hash_full_prompts": "9c574243eaf1af25",
2323
- "hash_input_tokens": "028aefb54bcd394e",
2324
- "hash_cont_tokens": "2ef49b394cfb87e1"
2325
- },
2326
- "truncated": 0,
2327
- "non_truncated": 114,
2328
- "padded": 456,
2329
- "non_padded": 0,
2330
- "effective_few_shots": 5.0,
2331
- "num_truncated_few_shots": 0
2332
- },
2333
- "lighteval|mmlu:electrical_engineering|5": {
2334
- "hashes": {
2335
- "hash_examples": "acbc5def98c19b3f",
2336
- "hash_full_prompts": "27b104b8ac4dd53e",
2337
- "hash_input_tokens": "51fad1e92571e71f",
2338
- "hash_cont_tokens": "adb5a1c5d57fbb41"
2339
- },
2340
- "truncated": 0,
2341
- "non_truncated": 145,
2342
- "padded": 580,
2343
- "non_padded": 0,
2344
- "effective_few_shots": 5.0,
2345
- "num_truncated_few_shots": 0
2346
- },
2347
- "lighteval|mmlu:elementary_mathematics|5": {
2348
- "hashes": {
2349
- "hash_examples": "146e61d07497a9bd",
2350
- "hash_full_prompts": "61fe73f29609efa0",
2351
- "hash_input_tokens": "c3a3e5ab857bb805",
2352
- "hash_cont_tokens": "d0782f141bcc895b"
2353
- },
2354
- "truncated": 0,
2355
- "non_truncated": 378,
2356
- "padded": 1512,
2357
- "non_padded": 0,
2358
- "effective_few_shots": 5.0,
2359
- "num_truncated_few_shots": 0
2360
- },
2361
- "lighteval|mmlu:formal_logic|5": {
2362
- "hashes": {
2363
- "hash_examples": "8635216e1909a03f",
2364
- "hash_full_prompts": "eed1d295e6976a2b",
2365
- "hash_input_tokens": "10125c8c57d40c06",
2366
- "hash_cont_tokens": "315a91fa1f805c93"
2367
- },
2368
- "truncated": 0,
2369
- "non_truncated": 126,
2370
- "padded": 504,
2371
- "non_padded": 0,
2372
- "effective_few_shots": 5.0,
2373
- "num_truncated_few_shots": 0
2374
- },
2375
- "lighteval|mmlu:global_facts|5": {
2376
- "hashes": {
2377
- "hash_examples": "30b315aa6353ee47",
2378
- "hash_full_prompts": "aa3b8cd1b4caef67",
2379
- "hash_input_tokens": "3745c719168ab057",
2380
- "hash_cont_tokens": "dadea1de19dee95c"
2381
- },
2382
- "truncated": 0,
2383
- "non_truncated": 100,
2384
- "padded": 400,
2385
- "non_padded": 0,
2386
- "effective_few_shots": 5.0,
2387
- "num_truncated_few_shots": 0
2388
- },
2389
- "lighteval|mmlu:high_school_biology|5": {
2390
- "hashes": {
2391
- "hash_examples": "c9136373af2180de",
2392
- "hash_full_prompts": "563919f4a7e8cfa0",
2393
- "hash_input_tokens": "ef148a7fbb855b4a",
2394
- "hash_cont_tokens": "715bc46d18155135"
2395
- },
2396
- "truncated": 0,
2397
- "non_truncated": 310,
2398
- "padded": 1240,
2399
- "non_padded": 0,
2400
- "effective_few_shots": 5.0,
2401
- "num_truncated_few_shots": 0
2402
- },
2403
- "lighteval|mmlu:high_school_chemistry|5": {
2404
- "hashes": {
2405
- "hash_examples": "b0661bfa1add6404",
2406
- "hash_full_prompts": "673af6ff7f175e54",
2407
- "hash_input_tokens": "ad5dda4a94b94236",
2408
- "hash_cont_tokens": "3d12f9b93cc609a2"
2409
- },
2410
- "truncated": 0,
2411
- "non_truncated": 203,
2412
- "padded": 812,
2413
- "non_padded": 0,
2414
- "effective_few_shots": 5.0,
2415
- "num_truncated_few_shots": 0
2416
- },
2417
- "lighteval|mmlu:high_school_computer_science|5": {
2418
- "hashes": {
2419
- "hash_examples": "80fc1d623a3d665f",
2420
- "hash_full_prompts": "54edf289edf6a21b",
2421
- "hash_input_tokens": "b4905bb04989c325",
2422
- "hash_cont_tokens": "dadea1de19dee95c"
2423
- },
2424
- "truncated": 0,
2425
- "non_truncated": 100,
2426
- "padded": 400,
2427
- "non_padded": 0,
2428
- "effective_few_shots": 5.0,
2429
- "num_truncated_few_shots": 0
2430
- },
2431
- "lighteval|mmlu:high_school_european_history|5": {
2432
- "hashes": {
2433
- "hash_examples": "854da6e5af0fe1a1",
2434
- "hash_full_prompts": "fe3a16c6c460b023",
2435
- "hash_input_tokens": "32d7171e04b8a11f",
2436
- "hash_cont_tokens": "6d9c47e593859ccd"
2437
- },
2438
- "truncated": 0,
2439
- "non_truncated": 165,
2440
- "padded": 656,
2441
- "non_padded": 4,
2442
- "effective_few_shots": 5.0,
2443
- "num_truncated_few_shots": 0
2444
- },
2445
- "lighteval|mmlu:high_school_geography|5": {
2446
- "hashes": {
2447
- "hash_examples": "7dc963c7acd19ad8",
2448
- "hash_full_prompts": "ab9a10a8f824e912",
2449
- "hash_input_tokens": "02ffa411d3e65393",
2450
- "hash_cont_tokens": "84097c7fa87dfe61"
2451
- },
2452
- "truncated": 0,
2453
- "non_truncated": 198,
2454
- "padded": 792,
2455
- "non_padded": 0,
2456
- "effective_few_shots": 5.0,
2457
- "num_truncated_few_shots": 0
2458
- },
2459
- "lighteval|mmlu:high_school_government_and_politics|5": {
2460
- "hashes": {
2461
- "hash_examples": "1f675dcdebc9758f",
2462
- "hash_full_prompts": "5edbb212b89b37c2",
2463
- "hash_input_tokens": "7cbf9f330f9848cd",
2464
- "hash_cont_tokens": "86d43dfe026b5e6e"
2465
- },
2466
- "truncated": 0,
2467
- "non_truncated": 193,
2468
- "padded": 772,
2469
- "non_padded": 0,
2470
- "effective_few_shots": 5.0,
2471
- "num_truncated_few_shots": 0
2472
- },
2473
- "lighteval|mmlu:high_school_macroeconomics|5": {
2474
- "hashes": {
2475
- "hash_examples": "2fb32cf2d80f0b35",
2476
- "hash_full_prompts": "a03f1c58bff9be7c",
2477
- "hash_input_tokens": "c49c112a9d5fe955",
2478
- "hash_cont_tokens": "99f5469b1de9a21b"
2479
- },
2480
- "truncated": 0,
2481
- "non_truncated": 390,
2482
- "padded": 1560,
2483
- "non_padded": 0,
2484
- "effective_few_shots": 5.0,
2485
- "num_truncated_few_shots": 0
2486
- },
2487
- "lighteval|mmlu:high_school_mathematics|5": {
2488
- "hashes": {
2489
- "hash_examples": "fd6646fdb5d58a1f",
2490
- "hash_full_prompts": "804e7468979f87b0",
2491
- "hash_input_tokens": "ba0dfc1e3421d9cd",
2492
- "hash_cont_tokens": "e215c84aa19ccb33"
2493
- },
2494
- "truncated": 0,
2495
- "non_truncated": 270,
2496
- "padded": 1078,
2497
- "non_padded": 2,
2498
- "effective_few_shots": 5.0,
2499
- "num_truncated_few_shots": 0
2500
- },
2501
- "lighteval|mmlu:high_school_microeconomics|5": {
2502
- "hashes": {
2503
- "hash_examples": "2118f21f71d87d84",
2504
- "hash_full_prompts": "6aad089b085f6261",
2505
- "hash_input_tokens": "ae3d419d3b200341",
2506
- "hash_cont_tokens": "dc8017437d84c710"
2507
- },
2508
- "truncated": 0,
2509
- "non_truncated": 238,
2510
- "padded": 952,
2511
- "non_padded": 0,
2512
- "effective_few_shots": 5.0,
2513
- "num_truncated_few_shots": 0
2514
- },
2515
- "lighteval|mmlu:high_school_physics|5": {
2516
- "hashes": {
2517
- "hash_examples": "dc3ce06378548565",
2518
- "hash_full_prompts": "748f2785c56490fe",
2519
- "hash_input_tokens": "168c4746c2c782f3",
2520
- "hash_cont_tokens": "b8152fcdcf86c673"
2521
- },
2522
- "truncated": 0,
2523
- "non_truncated": 151,
2524
- "padded": 596,
2525
- "non_padded": 8,
2526
- "effective_few_shots": 5.0,
2527
- "num_truncated_few_shots": 0
2528
- },
2529
- "lighteval|mmlu:high_school_psychology|5": {
2530
- "hashes": {
2531
- "hash_examples": "c8d1d98a40e11f2f",
2532
- "hash_full_prompts": "91c2c4e9f4ef640d",
2533
- "hash_input_tokens": "ad59ecce726b81f7",
2534
- "hash_cont_tokens": "ac45cbb9009f81d9"
2535
- },
2536
- "truncated": 0,
2537
- "non_truncated": 545,
2538
- "padded": 2168,
2539
- "non_padded": 12,
2540
- "effective_few_shots": 5.0,
2541
- "num_truncated_few_shots": 0
2542
- },
2543
- "lighteval|mmlu:high_school_statistics|5": {
2544
- "hashes": {
2545
- "hash_examples": "666c8759b98ee4ff",
2546
- "hash_full_prompts": "35db4871e53974e3",
2547
- "hash_input_tokens": "4a95f30069946bb1",
2548
- "hash_cont_tokens": "9c9b68ee68272b16"
2549
- },
2550
- "truncated": 0,
2551
- "non_truncated": 216,
2552
- "padded": 864,
2553
- "non_padded": 0,
2554
- "effective_few_shots": 5.0,
2555
- "num_truncated_few_shots": 0
2556
- },
2557
- "lighteval|mmlu:high_school_us_history|5": {
2558
- "hashes": {
2559
- "hash_examples": "95fef1c4b7d3f81e",
2560
- "hash_full_prompts": "64a4220710854b0c",
2561
- "hash_input_tokens": "9cbadc7afd5b364b",
2562
- "hash_cont_tokens": "cec285b624c15c10"
2563
- },
2564
- "truncated": 0,
2565
- "non_truncated": 204,
2566
- "padded": 816,
2567
- "non_padded": 0,
2568
- "effective_few_shots": 5.0,
2569
- "num_truncated_few_shots": 0
2570
- },
2571
- "lighteval|mmlu:high_school_world_history|5": {
2572
- "hashes": {
2573
- "hash_examples": "7e5085b6184b0322",
2574
- "hash_full_prompts": "04061c742ebe8768",
2575
- "hash_input_tokens": "cf5740bee56319ca",
2576
- "hash_cont_tokens": "2c02128f8f2f7539"
2577
- },
2578
- "truncated": 0,
2579
- "non_truncated": 237,
2580
- "padded": 948,
2581
- "non_padded": 0,
2582
- "effective_few_shots": 5.0,
2583
- "num_truncated_few_shots": 0
2584
- },
2585
- "lighteval|mmlu:human_aging|5": {
2586
- "hashes": {
2587
- "hash_examples": "c17333e7c7c10797",
2588
- "hash_full_prompts": "82f49e7280e80cf6",
2589
- "hash_input_tokens": "354c4fa2fc35d79f",
2590
- "hash_cont_tokens": "faa94c4ec8e7be4e"
2591
- },
2592
- "truncated": 0,
2593
- "non_truncated": 223,
2594
- "padded": 892,
2595
- "non_padded": 0,
2596
- "effective_few_shots": 5.0,
2597
- "num_truncated_few_shots": 0
2598
- },
2599
- "lighteval|mmlu:human_sexuality|5": {
2600
- "hashes": {
2601
- "hash_examples": "4edd1e9045df5e3d",
2602
- "hash_full_prompts": "a661f4d429c05587",
2603
- "hash_input_tokens": "c03b92d669e52759",
2604
- "hash_cont_tokens": "d642d34719fa5ff6"
2605
- },
2606
- "truncated": 0,
2607
- "non_truncated": 131,
2608
- "padded": 524,
2609
- "non_padded": 0,
2610
- "effective_few_shots": 5.0,
2611
- "num_truncated_few_shots": 0
2612
- },
2613
- "lighteval|mmlu:international_law|5": {
2614
- "hashes": {
2615
- "hash_examples": "db2fa00d771a062a",
2616
- "hash_full_prompts": "b903b8a0ef33aaa0",
2617
- "hash_input_tokens": "2dbceb85bb9962fc",
2618
- "hash_cont_tokens": "f0d54717d3cdc783"
2619
- },
2620
- "truncated": 0,
2621
- "non_truncated": 121,
2622
- "padded": 484,
2623
- "non_padded": 0,
2624
- "effective_few_shots": 5.0,
2625
- "num_truncated_few_shots": 0
2626
- },
2627
- "lighteval|mmlu:jurisprudence|5": {
2628
- "hashes": {
2629
- "hash_examples": "e956f86b124076fe",
2630
- "hash_full_prompts": "09566c878dd2bee2",
2631
- "hash_input_tokens": "0cdec18f35799629",
2632
- "hash_cont_tokens": "d766ae8c3d361559"
2633
- },
2634
- "truncated": 0,
2635
- "non_truncated": 108,
2636
- "padded": 432,
2637
- "non_padded": 0,
2638
- "effective_few_shots": 5.0,
2639
- "num_truncated_few_shots": 0
2640
- },
2641
- "lighteval|mmlu:logical_fallacies|5": {
2642
- "hashes": {
2643
- "hash_examples": "956e0e6365ab79f1",
2644
- "hash_full_prompts": "23d259ac774b5ed5",
2645
- "hash_input_tokens": "f5c2a1ebc3001890",
2646
- "hash_cont_tokens": "0fcca855210b4243"
2647
- },
2648
- "truncated": 0,
2649
- "non_truncated": 163,
2650
- "padded": 652,
2651
- "non_padded": 0,
2652
- "effective_few_shots": 5.0,
2653
- "num_truncated_few_shots": 0
2654
- },
2655
- "lighteval|mmlu:machine_learning|5": {
2656
- "hashes": {
2657
- "hash_examples": "397997cc6f4d581e",
2658
- "hash_full_prompts": "f2fba3fcd07e8270",
2659
- "hash_input_tokens": "b6ee91938ab015df",
2660
- "hash_cont_tokens": "8b369a2ff9235b9d"
2661
- },
2662
- "truncated": 0,
2663
- "non_truncated": 112,
2664
- "padded": 448,
2665
- "non_padded": 0,
2666
- "effective_few_shots": 5.0,
2667
- "num_truncated_few_shots": 0
2668
- },
2669
- "lighteval|mmlu:management|5": {
2670
- "hashes": {
2671
- "hash_examples": "2bcbe6f6ca63d740",
2672
- "hash_full_prompts": "d3e694f59e5e9c14",
2673
- "hash_input_tokens": "a0316eb0174082ac",
2674
- "hash_cont_tokens": "c77ad5f59321afa5"
2675
- },
2676
- "truncated": 0,
2677
- "non_truncated": 103,
2678
- "padded": 412,
2679
- "non_padded": 0,
2680
- "effective_few_shots": 5.0,
2681
- "num_truncated_few_shots": 0
2682
- },
2683
- "lighteval|mmlu:marketing|5": {
2684
- "hashes": {
2685
- "hash_examples": "8ddb20d964a1b065",
2686
- "hash_full_prompts": "859304e17136ce75",
2687
- "hash_input_tokens": "2d347ff563a291d4",
2688
- "hash_cont_tokens": "c94db408fe712d9b"
2689
- },
2690
- "truncated": 0,
2691
- "non_truncated": 234,
2692
- "padded": 936,
2693
- "non_padded": 0,
2694
- "effective_few_shots": 5.0,
2695
- "num_truncated_few_shots": 0
2696
- },
2697
- "lighteval|mmlu:medical_genetics|5": {
2698
- "hashes": {
2699
- "hash_examples": "182a71f4763d2cea",
2700
- "hash_full_prompts": "42e1ef61698d0cd8",
2701
- "hash_input_tokens": "54e9d1ebe9299f17",
2702
- "hash_cont_tokens": "dadea1de19dee95c"
2703
- },
2704
- "truncated": 0,
2705
- "non_truncated": 100,
2706
- "padded": 400,
2707
- "non_padded": 0,
2708
- "effective_few_shots": 5.0,
2709
- "num_truncated_few_shots": 0
2710
- },
2711
- "lighteval|mmlu:miscellaneous|5": {
2712
- "hashes": {
2713
- "hash_examples": "4c404fdbb4ca57fc",
2714
- "hash_full_prompts": "4b836cb32f1d9f1c",
2715
- "hash_input_tokens": "bfd569a9633251d6",
2716
- "hash_cont_tokens": "60215a6f77eaf4d9"
2717
- },
2718
- "truncated": 0,
2719
- "non_truncated": 783,
2720
- "padded": 3132,
2721
- "non_padded": 0,
2722
- "effective_few_shots": 5.0,
2723
- "num_truncated_few_shots": 0
2724
- },
2725
- "lighteval|mmlu:moral_disputes|5": {
2726
- "hashes": {
2727
- "hash_examples": "60cbd2baa3fea5c9",
2728
- "hash_full_prompts": "285523191aa3d754",
2729
- "hash_input_tokens": "e945109ff5b7773f",
2730
- "hash_cont_tokens": "3ca55f92255c9f21"
2731
- },
2732
- "truncated": 0,
2733
- "non_truncated": 346,
2734
- "padded": 1384,
2735
- "non_padded": 0,
2736
- "effective_few_shots": 5.0,
2737
- "num_truncated_few_shots": 0
2738
- },
2739
- "lighteval|mmlu:moral_scenarios|5": {
2740
- "hashes": {
2741
- "hash_examples": "fd8b0431fbdd75ef",
2742
- "hash_full_prompts": "cc2dc703a4ea5626",
2743
- "hash_input_tokens": "7540830d244858b3",
2744
- "hash_cont_tokens": "a82e76a0738dc6ac"
2745
- },
2746
- "truncated": 0,
2747
- "non_truncated": 895,
2748
- "padded": 3551,
2749
- "non_padded": 29,
2750
- "effective_few_shots": 5.0,
2751
- "num_truncated_few_shots": 0
2752
- },
2753
- "lighteval|mmlu:nutrition|5": {
2754
- "hashes": {
2755
- "hash_examples": "71e55e2b829b6528",
2756
- "hash_full_prompts": "95da0ecaaca81f0c",
2757
- "hash_input_tokens": "ad2d3f90bac5daaf",
2758
- "hash_cont_tokens": "b683842a2cf7cdd6"
2759
- },
2760
- "truncated": 0,
2761
- "non_truncated": 306,
2762
- "padded": 1224,
2763
- "non_padded": 0,
2764
- "effective_few_shots": 5.0,
2765
- "num_truncated_few_shots": 0
2766
- },
2767
- "lighteval|mmlu:philosophy|5": {
2768
- "hashes": {
2769
- "hash_examples": "a6d489a8d208fa4b",
2770
- "hash_full_prompts": "a12c86a1cc15caf1",
2771
- "hash_input_tokens": "21629734db2baf80",
2772
- "hash_cont_tokens": "a545f25ae279a135"
2773
- },
2774
- "truncated": 0,
2775
- "non_truncated": 311,
2776
- "padded": 1244,
2777
- "non_padded": 0,
2778
- "effective_few_shots": 5.0,
2779
- "num_truncated_few_shots": 0
2780
- },
2781
- "lighteval|mmlu:prehistory|5": {
2782
- "hashes": {
2783
- "hash_examples": "6cc50f032a19acaa",
2784
- "hash_full_prompts": "60a9f42322f7d076",
2785
- "hash_input_tokens": "fec26d07ab114747",
2786
- "hash_cont_tokens": "5a5ebca069b16663"
2787
- },
2788
- "truncated": 0,
2789
- "non_truncated": 324,
2790
- "padded": 1268,
2791
- "non_padded": 28,
2792
- "effective_few_shots": 5.0,
2793
- "num_truncated_few_shots": 0
2794
- },
2795
- "lighteval|mmlu:professional_accounting|5": {
2796
- "hashes": {
2797
- "hash_examples": "50f57ab32f5f6cea",
2798
- "hash_full_prompts": "c0ebe46710635608",
2799
- "hash_input_tokens": "443dfeb8cface0e0",
2800
- "hash_cont_tokens": "e45018e60164d208"
2801
- },
2802
- "truncated": 0,
2803
- "non_truncated": 282,
2804
- "padded": 1120,
2805
- "non_padded": 8,
2806
- "effective_few_shots": 5.0,
2807
- "num_truncated_few_shots": 0
2808
- },
2809
- "lighteval|mmlu:professional_law|5": {
2810
- "hashes": {
2811
- "hash_examples": "a8fdc85c64f4b215",
2812
- "hash_full_prompts": "bf4e78194cbc908e",
2813
- "hash_input_tokens": "1cd3a9a61d6e593f",
2814
- "hash_cont_tokens": "b11002d08c03f837"
2815
- },
2816
- "truncated": 0,
2817
- "non_truncated": 1534,
2818
- "padded": 6136,
2819
- "non_padded": 0,
2820
- "effective_few_shots": 5.0,
2821
- "num_truncated_few_shots": 0
2822
- },
2823
- "lighteval|mmlu:professional_medicine|5": {
2824
- "hashes": {
2825
- "hash_examples": "c373a28a3050a73a",
2826
- "hash_full_prompts": "dcbd2e68cdbadbb2",
2827
- "hash_input_tokens": "269f00f50b882a91",
2828
- "hash_cont_tokens": "11ce4c2ab1132810"
2829
- },
2830
- "truncated": 0,
2831
- "non_truncated": 272,
2832
- "padded": 1088,
2833
- "non_padded": 0,
2834
- "effective_few_shots": 5.0,
2835
- "num_truncated_few_shots": 0
2836
- },
2837
- "lighteval|mmlu:professional_psychology|5": {
2838
- "hashes": {
2839
- "hash_examples": "bf5254fe818356af",
2840
- "hash_full_prompts": "6171d464a5d04506",
2841
- "hash_input_tokens": "4f2be9a15b195243",
2842
- "hash_cont_tokens": "3835bfc898aacaa0"
2843
- },
2844
- "truncated": 0,
2845
- "non_truncated": 612,
2846
- "padded": 2448,
2847
- "non_padded": 0,
2848
- "effective_few_shots": 5.0,
2849
- "num_truncated_few_shots": 0
2850
- },
2851
- "lighteval|mmlu:public_relations|5": {
2852
- "hashes": {
2853
- "hash_examples": "b66d52e28e7d14e0",
2854
- "hash_full_prompts": "4ead0df88af1730e",
2855
- "hash_input_tokens": "5470d37cfcaebef7",
2856
- "hash_cont_tokens": "1692112db1aec618"
2857
- },
2858
- "truncated": 0,
2859
- "non_truncated": 110,
2860
- "padded": 440,
2861
- "non_padded": 0,
2862
- "effective_few_shots": 5.0,
2863
- "num_truncated_few_shots": 0
2864
- },
2865
- "lighteval|mmlu:security_studies|5": {
2866
- "hashes": {
2867
- "hash_examples": "514c14feaf000ad9",
2868
- "hash_full_prompts": "00768a1d238a1756",
2869
- "hash_input_tokens": "3b06989a2c329947",
2870
- "hash_cont_tokens": "9801a1ce7f762a8b"
2871
- },
2872
- "truncated": 0,
2873
- "non_truncated": 245,
2874
- "padded": 980,
2875
- "non_padded": 0,
2876
- "effective_few_shots": 5.0,
2877
- "num_truncated_few_shots": 0
2878
- },
2879
- "lighteval|mmlu:sociology|5": {
2880
- "hashes": {
2881
- "hash_examples": "f6c9bc9d18c80870",
2882
- "hash_full_prompts": "4aa27cf15b0bad9d",
2883
- "hash_input_tokens": "a95cbf3f4777e430",
2884
- "hash_cont_tokens": "277e7d5b38c0960d"
2885
- },
2886
- "truncated": 0,
2887
- "non_truncated": 201,
2888
- "padded": 804,
2889
- "non_padded": 0,
2890
- "effective_few_shots": 5.0,
2891
- "num_truncated_few_shots": 0
2892
- },
2893
- "lighteval|mmlu:us_foreign_policy|5": {
2894
- "hashes": {
2895
- "hash_examples": "ed7b78629db6678f",
2896
- "hash_full_prompts": "6ea0aa7c4f19aaea",
2897
- "hash_input_tokens": "3527ab8d27acf9ba",
2898
- "hash_cont_tokens": "dadea1de19dee95c"
2899
- },
2900
- "truncated": 0,
2901
- "non_truncated": 100,
2902
- "padded": 397,
2903
- "non_padded": 3,
2904
- "effective_few_shots": 5.0,
2905
- "num_truncated_few_shots": 0
2906
- },
2907
- "lighteval|mmlu:virology|5": {
2908
- "hashes": {
2909
- "hash_examples": "bc52ffdc3f9b994a",
2910
- "hash_full_prompts": "4049f1088a6433bb",
2911
- "hash_input_tokens": "c5c9e9261b38b647",
2912
- "hash_cont_tokens": "a4a0852e6fb42244"
2913
- },
2914
- "truncated": 0,
2915
- "non_truncated": 166,
2916
- "padded": 664,
2917
- "non_padded": 0,
2918
- "effective_few_shots": 5.0,
2919
- "num_truncated_few_shots": 0
2920
- },
2921
- "lighteval|mmlu:world_religions|5": {
2922
- "hashes": {
2923
- "hash_examples": "ecdb4a4f94f62930",
2924
- "hash_full_prompts": "569221fa18187415",
2925
- "hash_input_tokens": "5b6e672d8863808f",
2926
- "hash_cont_tokens": "c96f2973fdf12010"
2927
- },
2928
- "truncated": 0,
2929
- "non_truncated": 171,
2930
- "padded": 684,
2931
- "non_padded": 0,
2932
- "effective_few_shots": 5.0,
2933
- "num_truncated_few_shots": 0
2934
- }
2935
- },
2936
- "summary_general": {
2937
- "hashes": {
2938
- "hash_examples": "341a076d0beb7048",
2939
- "hash_full_prompts": "bd301d9e40b69cb5",
2940
- "hash_input_tokens": "06789d11909d9084",
2941
- "hash_cont_tokens": "28aa09e44eee2d3e"
2942
- },
2943
- "truncated": 0,
2944
- "non_truncated": 14042,
2945
- "padded": 56074,
2946
- "non_padded": 94,
2947
- "num_truncated_few_shots": 0
2948
- }
2949
- }