edbeeching HF staff commited on
Commit
49a831c
·
verified ·
1 Parent(s): 5d5addb

Upload eval_results/AI-MO/internlm-math-20b-sft/aimo_v02.39/mini_math_v2_pot/results_2024-05-06T17-23-46.593315.json with huggingface_hub

Browse files
eval_results/AI-MO/internlm-math-20b-sft/aimo_v02.39/mini_math_v2_pot/results_2024-05-06T17-23-46.593315.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 7978108.348671261,
9
+ "end_time": 7979008.853603525,
10
+ "total_evaluation_time_secondes": "900.5049322638661",
11
+ "model_name": "AI-MO/internlm-math-20b-sft",
12
+ "model_sha": "7bd9c1a41300bf05e86da3d973e1c284266c8239",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "37.18 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|mini_math_v2_pot:level_1|0": {
19
+ "qem": 0.5142857142857142,
20
+ "qem_stderr": 0.0857142857142857
21
+ },
22
+ "custom|mini_math_v2_pot:level_2|0": {
23
+ "qem": 0.3472222222222222,
24
+ "qem_stderr": 0.05650114676852965
25
+ },
26
+ "custom|mini_math_v2_pot:level_3|0": {
27
+ "qem": 0.3,
28
+ "qem_stderr": 0.04857520521621861
29
+ },
30
+ "custom|mini_math_v2_pot:level_4|0": {
31
+ "qem": 0.1958762886597938,
32
+ "qem_stderr": 0.04050575681830336
33
+ },
34
+ "custom|mini_math_v2_pot:level_5|0": {
35
+ "qem": 0.09433962264150944,
36
+ "qem_stderr": 0.028525620604469944
37
+ },
38
+ "custom|mini_math_v2_pot:_average|0": {
39
+ "qem": 0.2903447695618479,
40
+ "qem_stderr": 0.05196440302436145
41
+ },
42
+ "all": {
43
+ "qem": 0.2903447695618479,
44
+ "qem_stderr": 0.05196440302436145
45
+ }
46
+ },
47
+ "versions": {
48
+ "custom|mini_math_v2_pot:level_1|0": 0,
49
+ "custom|mini_math_v2_pot:level_2|0": 0,
50
+ "custom|mini_math_v2_pot:level_3|0": 0,
51
+ "custom|mini_math_v2_pot:level_4|0": 0,
52
+ "custom|mini_math_v2_pot:level_5|0": 0
53
+ },
54
+ "config_tasks": {
55
+ "custom|mini_math_v2_pot:level_1": {
56
+ "name": "mini_math_v2_pot:level_1",
57
+ "prompt_function": "minimath_pot_prompt_fn",
58
+ "hf_repo": "AI-MO/lighteval-mini-math",
59
+ "hf_subset": "Level 1",
60
+ "metric": [
61
+ "quasi_exact_match_code_and_math"
62
+ ],
63
+ "hf_avail_splits": [
64
+ "train",
65
+ "test"
66
+ ],
67
+ "evaluation_splits": [
68
+ "test"
69
+ ],
70
+ "few_shots_split": null,
71
+ "few_shots_select": null,
72
+ "generation_size": 2048,
73
+ "stop_sequence": null,
74
+ "output_regex": null,
75
+ "frozen": false,
76
+ "suite": [
77
+ "custom"
78
+ ],
79
+ "original_num_docs": 35,
80
+ "effective_num_docs": 35,
81
+ "trust_dataset": null,
82
+ "must_remove_duplicate_docs": null
83
+ },
84
+ "custom|mini_math_v2_pot:level_2": {
85
+ "name": "mini_math_v2_pot:level_2",
86
+ "prompt_function": "minimath_pot_prompt_fn",
87
+ "hf_repo": "AI-MO/lighteval-mini-math",
88
+ "hf_subset": "Level 2",
89
+ "metric": [
90
+ "quasi_exact_match_code_and_math"
91
+ ],
92
+ "hf_avail_splits": [
93
+ "train",
94
+ "test"
95
+ ],
96
+ "evaluation_splits": [
97
+ "test"
98
+ ],
99
+ "few_shots_split": null,
100
+ "few_shots_select": null,
101
+ "generation_size": 2048,
102
+ "stop_sequence": null,
103
+ "output_regex": null,
104
+ "frozen": false,
105
+ "suite": [
106
+ "custom"
107
+ ],
108
+ "original_num_docs": 72,
109
+ "effective_num_docs": 72,
110
+ "trust_dataset": null,
111
+ "must_remove_duplicate_docs": null
112
+ },
113
+ "custom|mini_math_v2_pot:level_3": {
114
+ "name": "mini_math_v2_pot:level_3",
115
+ "prompt_function": "minimath_pot_prompt_fn",
116
+ "hf_repo": "AI-MO/lighteval-mini-math",
117
+ "hf_subset": "Level 3",
118
+ "metric": [
119
+ "quasi_exact_match_code_and_math"
120
+ ],
121
+ "hf_avail_splits": [
122
+ "train",
123
+ "test"
124
+ ],
125
+ "evaluation_splits": [
126
+ "test"
127
+ ],
128
+ "few_shots_split": null,
129
+ "few_shots_select": null,
130
+ "generation_size": 2048,
131
+ "stop_sequence": null,
132
+ "output_regex": null,
133
+ "frozen": false,
134
+ "suite": [
135
+ "custom"
136
+ ],
137
+ "original_num_docs": 90,
138
+ "effective_num_docs": 90,
139
+ "trust_dataset": null,
140
+ "must_remove_duplicate_docs": null
141
+ },
142
+ "custom|mini_math_v2_pot:level_4": {
143
+ "name": "mini_math_v2_pot:level_4",
144
+ "prompt_function": "minimath_pot_prompt_fn",
145
+ "hf_repo": "AI-MO/lighteval-mini-math",
146
+ "hf_subset": "Level 4",
147
+ "metric": [
148
+ "quasi_exact_match_code_and_math"
149
+ ],
150
+ "hf_avail_splits": [
151
+ "train",
152
+ "test"
153
+ ],
154
+ "evaluation_splits": [
155
+ "test"
156
+ ],
157
+ "few_shots_split": null,
158
+ "few_shots_select": null,
159
+ "generation_size": 2048,
160
+ "stop_sequence": null,
161
+ "output_regex": null,
162
+ "frozen": false,
163
+ "suite": [
164
+ "custom"
165
+ ],
166
+ "original_num_docs": 97,
167
+ "effective_num_docs": 97,
168
+ "trust_dataset": null,
169
+ "must_remove_duplicate_docs": null
170
+ },
171
+ "custom|mini_math_v2_pot:level_5": {
172
+ "name": "mini_math_v2_pot:level_5",
173
+ "prompt_function": "minimath_pot_prompt_fn",
174
+ "hf_repo": "AI-MO/lighteval-mini-math",
175
+ "hf_subset": "Level 5",
176
+ "metric": [
177
+ "quasi_exact_match_code_and_math"
178
+ ],
179
+ "hf_avail_splits": [
180
+ "train",
181
+ "test"
182
+ ],
183
+ "evaluation_splits": [
184
+ "test"
185
+ ],
186
+ "few_shots_split": null,
187
+ "few_shots_select": null,
188
+ "generation_size": 2048,
189
+ "stop_sequence": null,
190
+ "output_regex": null,
191
+ "frozen": false,
192
+ "suite": [
193
+ "custom"
194
+ ],
195
+ "original_num_docs": 106,
196
+ "effective_num_docs": 106,
197
+ "trust_dataset": null,
198
+ "must_remove_duplicate_docs": null
199
+ }
200
+ },
201
+ "summary_tasks": {
202
+ "custom|mini_math_v2_pot:level_1|0": {
203
+ "hashes": {
204
+ "hash_examples": "df659a925b34c135",
205
+ "hash_full_prompts": "bf728d8b4480d249",
206
+ "hash_input_tokens": "7f9ac889d353e50a",
207
+ "hash_cont_tokens": "8470166ee180ccbe"
208
+ },
209
+ "truncated": 35,
210
+ "non_truncated": 0,
211
+ "padded": 28,
212
+ "non_padded": 7,
213
+ "effective_few_shots": 0.0,
214
+ "num_truncated_few_shots": 0
215
+ },
216
+ "custom|mini_math_v2_pot:level_2|0": {
217
+ "hashes": {
218
+ "hash_examples": "53b89aa4c81ad21c",
219
+ "hash_full_prompts": "00b450c53c8d8581",
220
+ "hash_input_tokens": "968f8225a6c5a6a0",
221
+ "hash_cont_tokens": "1c9c8db48d941a26"
222
+ },
223
+ "truncated": 72,
224
+ "non_truncated": 0,
225
+ "padded": 36,
226
+ "non_padded": 36,
227
+ "effective_few_shots": 0.0,
228
+ "num_truncated_few_shots": 0
229
+ },
230
+ "custom|mini_math_v2_pot:level_3|0": {
231
+ "hashes": {
232
+ "hash_examples": "558578854e3a423f",
233
+ "hash_full_prompts": "012e0d47278d486d",
234
+ "hash_input_tokens": "ea2c985e6d98f465",
235
+ "hash_cont_tokens": "d0aca898f7ace3be"
236
+ },
237
+ "truncated": 89,
238
+ "non_truncated": 1,
239
+ "padded": 35,
240
+ "non_padded": 55,
241
+ "effective_few_shots": 0.0,
242
+ "num_truncated_few_shots": 0
243
+ },
244
+ "custom|mini_math_v2_pot:level_4|0": {
245
+ "hashes": {
246
+ "hash_examples": "3741efb7f449aa92",
247
+ "hash_full_prompts": "ad4c7fab21185380",
248
+ "hash_input_tokens": "62a85983bc1f08ab",
249
+ "hash_cont_tokens": "71378d3f07da4eb2"
250
+ },
251
+ "truncated": 97,
252
+ "non_truncated": 0,
253
+ "padded": 34,
254
+ "non_padded": 63,
255
+ "effective_few_shots": 0.0,
256
+ "num_truncated_few_shots": 0
257
+ },
258
+ "custom|mini_math_v2_pot:level_5|0": {
259
+ "hashes": {
260
+ "hash_examples": "2985fcbca171329c",
261
+ "hash_full_prompts": "070175091d3906e6",
262
+ "hash_input_tokens": "654365a84a90ffd7",
263
+ "hash_cont_tokens": "fd1c541e1d756c08"
264
+ },
265
+ "truncated": 106,
266
+ "non_truncated": 0,
267
+ "padded": 27,
268
+ "non_padded": 79,
269
+ "effective_few_shots": 0.0,
270
+ "num_truncated_few_shots": 0
271
+ }
272
+ },
273
+ "summary_general": {
274
+ "hashes": {
275
+ "hash_examples": "47d56e44d5845903",
276
+ "hash_full_prompts": "e48e380bf6ac2627",
277
+ "hash_input_tokens": "448f43c43a461998",
278
+ "hash_cont_tokens": "4ab72d2067ee7d05"
279
+ },
280
+ "truncated": 399,
281
+ "non_truncated": 1,
282
+ "padded": 160,
283
+ "non_padded": 240,
284
+ "num_truncated_few_shots": 0
285
+ }
286
+ }