edbeeching HF staff commited on
Commit
1b6923b
·
verified ·
1 Parent(s): 1b4494d

Upload eval_results/AI-MO/deepseek-coder-33b-sft/aimo_v01.29/mini_math_v2_pot/results_2024-05-06T21-10-50.967482.json with huggingface_hub

Browse files
eval_results/AI-MO/deepseek-coder-33b-sft/aimo_v01.29/mini_math_v2_pot/results_2024-05-06T21-10-50.967482.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 321920.777063273,
9
+ "end_time": 324833.573920226,
10
+ "total_evaluation_time_secondes": "2912.7968569530058",
11
+ "model_name": "AI-MO/deepseek-coder-33b-sft",
12
+ "model_sha": "56d1a6a6f713abf774d31e11ea51d99c4fdff163",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "62.59 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|mini_math_v2_pot:level_1|0": {
19
+ "qem": 0.02857142857142857,
20
+ "qem_stderr": 0.028571428571428574
21
+ },
22
+ "custom|mini_math_v2_pot:level_2|0": {
23
+ "qem": 0.013888888888888888,
24
+ "qem_stderr": 0.013888888888888907
25
+ },
26
+ "custom|mini_math_v2_pot:level_3|0": {
27
+ "qem": 0.011111111111111112,
28
+ "qem_stderr": 0.011111111111111125
29
+ },
30
+ "custom|mini_math_v2_pot:level_4|0": {
31
+ "qem": 0.0,
32
+ "qem_stderr": 0.0
33
+ },
34
+ "custom|mini_math_v2_pot:level_5|0": {
35
+ "qem": 0.0,
36
+ "qem_stderr": 0.0
37
+ },
38
+ "custom|mini_math_v2_pot:_average|0": {
39
+ "qem": 0.010714285714285714,
40
+ "qem_stderr": 0.010714285714285721
41
+ },
42
+ "all": {
43
+ "qem": 0.010714285714285714,
44
+ "qem_stderr": 0.010714285714285721
45
+ }
46
+ },
47
+ "versions": {
48
+ "custom|mini_math_v2_pot:level_1|0": 0,
49
+ "custom|mini_math_v2_pot:level_2|0": 0,
50
+ "custom|mini_math_v2_pot:level_3|0": 0,
51
+ "custom|mini_math_v2_pot:level_4|0": 0,
52
+ "custom|mini_math_v2_pot:level_5|0": 0
53
+ },
54
+ "config_tasks": {
55
+ "custom|mini_math_v2_pot:level_1": {
56
+ "name": "mini_math_v2_pot:level_1",
57
+ "prompt_function": "minimath_pot_prompt_fn",
58
+ "hf_repo": "AI-MO/lighteval-mini-math",
59
+ "hf_subset": "Level 1",
60
+ "metric": [
61
+ "quasi_exact_match_code_and_math"
62
+ ],
63
+ "hf_avail_splits": [
64
+ "train",
65
+ "test"
66
+ ],
67
+ "evaluation_splits": [
68
+ "test"
69
+ ],
70
+ "few_shots_split": null,
71
+ "few_shots_select": null,
72
+ "generation_size": 2048,
73
+ "stop_sequence": null,
74
+ "output_regex": null,
75
+ "frozen": false,
76
+ "suite": [
77
+ "custom"
78
+ ],
79
+ "original_num_docs": 35,
80
+ "effective_num_docs": 35,
81
+ "trust_dataset": null,
82
+ "must_remove_duplicate_docs": null
83
+ },
84
+ "custom|mini_math_v2_pot:level_2": {
85
+ "name": "mini_math_v2_pot:level_2",
86
+ "prompt_function": "minimath_pot_prompt_fn",
87
+ "hf_repo": "AI-MO/lighteval-mini-math",
88
+ "hf_subset": "Level 2",
89
+ "metric": [
90
+ "quasi_exact_match_code_and_math"
91
+ ],
92
+ "hf_avail_splits": [
93
+ "train",
94
+ "test"
95
+ ],
96
+ "evaluation_splits": [
97
+ "test"
98
+ ],
99
+ "few_shots_split": null,
100
+ "few_shots_select": null,
101
+ "generation_size": 2048,
102
+ "stop_sequence": null,
103
+ "output_regex": null,
104
+ "frozen": false,
105
+ "suite": [
106
+ "custom"
107
+ ],
108
+ "original_num_docs": 72,
109
+ "effective_num_docs": 72,
110
+ "trust_dataset": null,
111
+ "must_remove_duplicate_docs": null
112
+ },
113
+ "custom|mini_math_v2_pot:level_3": {
114
+ "name": "mini_math_v2_pot:level_3",
115
+ "prompt_function": "minimath_pot_prompt_fn",
116
+ "hf_repo": "AI-MO/lighteval-mini-math",
117
+ "hf_subset": "Level 3",
118
+ "metric": [
119
+ "quasi_exact_match_code_and_math"
120
+ ],
121
+ "hf_avail_splits": [
122
+ "train",
123
+ "test"
124
+ ],
125
+ "evaluation_splits": [
126
+ "test"
127
+ ],
128
+ "few_shots_split": null,
129
+ "few_shots_select": null,
130
+ "generation_size": 2048,
131
+ "stop_sequence": null,
132
+ "output_regex": null,
133
+ "frozen": false,
134
+ "suite": [
135
+ "custom"
136
+ ],
137
+ "original_num_docs": 90,
138
+ "effective_num_docs": 90,
139
+ "trust_dataset": null,
140
+ "must_remove_duplicate_docs": null
141
+ },
142
+ "custom|mini_math_v2_pot:level_4": {
143
+ "name": "mini_math_v2_pot:level_4",
144
+ "prompt_function": "minimath_pot_prompt_fn",
145
+ "hf_repo": "AI-MO/lighteval-mini-math",
146
+ "hf_subset": "Level 4",
147
+ "metric": [
148
+ "quasi_exact_match_code_and_math"
149
+ ],
150
+ "hf_avail_splits": [
151
+ "train",
152
+ "test"
153
+ ],
154
+ "evaluation_splits": [
155
+ "test"
156
+ ],
157
+ "few_shots_split": null,
158
+ "few_shots_select": null,
159
+ "generation_size": 2048,
160
+ "stop_sequence": null,
161
+ "output_regex": null,
162
+ "frozen": false,
163
+ "suite": [
164
+ "custom"
165
+ ],
166
+ "original_num_docs": 97,
167
+ "effective_num_docs": 97,
168
+ "trust_dataset": null,
169
+ "must_remove_duplicate_docs": null
170
+ },
171
+ "custom|mini_math_v2_pot:level_5": {
172
+ "name": "mini_math_v2_pot:level_5",
173
+ "prompt_function": "minimath_pot_prompt_fn",
174
+ "hf_repo": "AI-MO/lighteval-mini-math",
175
+ "hf_subset": "Level 5",
176
+ "metric": [
177
+ "quasi_exact_match_code_and_math"
178
+ ],
179
+ "hf_avail_splits": [
180
+ "train",
181
+ "test"
182
+ ],
183
+ "evaluation_splits": [
184
+ "test"
185
+ ],
186
+ "few_shots_split": null,
187
+ "few_shots_select": null,
188
+ "generation_size": 2048,
189
+ "stop_sequence": null,
190
+ "output_regex": null,
191
+ "frozen": false,
192
+ "suite": [
193
+ "custom"
194
+ ],
195
+ "original_num_docs": 106,
196
+ "effective_num_docs": 106,
197
+ "trust_dataset": null,
198
+ "must_remove_duplicate_docs": null
199
+ }
200
+ },
201
+ "summary_tasks": {
202
+ "custom|mini_math_v2_pot:level_1|0": {
203
+ "hashes": {
204
+ "hash_examples": "df659a925b34c135",
205
+ "hash_full_prompts": "bf728d8b4480d249",
206
+ "hash_input_tokens": "8b7e4ed0136525ee",
207
+ "hash_cont_tokens": "f6b1f602295bf586"
208
+ },
209
+ "truncated": 35,
210
+ "non_truncated": 0,
211
+ "padded": 19,
212
+ "non_padded": 16,
213
+ "effective_few_shots": 0.0,
214
+ "num_truncated_few_shots": 0
215
+ },
216
+ "custom|mini_math_v2_pot:level_2|0": {
217
+ "hashes": {
218
+ "hash_examples": "53b89aa4c81ad21c",
219
+ "hash_full_prompts": "00b450c53c8d8581",
220
+ "hash_input_tokens": "25e69df4388a303a",
221
+ "hash_cont_tokens": "073997446cd40a25"
222
+ },
223
+ "truncated": 72,
224
+ "non_truncated": 0,
225
+ "padded": 28,
226
+ "non_padded": 44,
227
+ "effective_few_shots": 0.0,
228
+ "num_truncated_few_shots": 0
229
+ },
230
+ "custom|mini_math_v2_pot:level_3|0": {
231
+ "hashes": {
232
+ "hash_examples": "558578854e3a423f",
233
+ "hash_full_prompts": "012e0d47278d486d",
234
+ "hash_input_tokens": "e1d4614faaca5ee5",
235
+ "hash_cont_tokens": "4e1c64307f42e9e7"
236
+ },
237
+ "truncated": 90,
238
+ "non_truncated": 0,
239
+ "padded": 34,
240
+ "non_padded": 56,
241
+ "effective_few_shots": 0.0,
242
+ "num_truncated_few_shots": 0
243
+ },
244
+ "custom|mini_math_v2_pot:level_4|0": {
245
+ "hashes": {
246
+ "hash_examples": "3741efb7f449aa92",
247
+ "hash_full_prompts": "ad4c7fab21185380",
248
+ "hash_input_tokens": "86417d6b4b460291",
249
+ "hash_cont_tokens": "d7e8e706cfbc6aaf"
250
+ },
251
+ "truncated": 96,
252
+ "non_truncated": 1,
253
+ "padded": 34,
254
+ "non_padded": 63,
255
+ "effective_few_shots": 0.0,
256
+ "num_truncated_few_shots": 0
257
+ },
258
+ "custom|mini_math_v2_pot:level_5|0": {
259
+ "hashes": {
260
+ "hash_examples": "2985fcbca171329c",
261
+ "hash_full_prompts": "070175091d3906e6",
262
+ "hash_input_tokens": "e350959ef9607074",
263
+ "hash_cont_tokens": "874c1f1f1bdf2db0"
264
+ },
265
+ "truncated": 106,
266
+ "non_truncated": 0,
267
+ "padded": 39,
268
+ "non_padded": 67,
269
+ "effective_few_shots": 0.0,
270
+ "num_truncated_few_shots": 0
271
+ }
272
+ },
273
+ "summary_general": {
274
+ "hashes": {
275
+ "hash_examples": "47d56e44d5845903",
276
+ "hash_full_prompts": "e48e380bf6ac2627",
277
+ "hash_input_tokens": "0652d41dd808bc91",
278
+ "hash_cont_tokens": "69d07104392d6c43"
279
+ },
280
+ "truncated": 399,
281
+ "non_truncated": 1,
282
+ "padded": 154,
283
+ "non_padded": 246,
284
+ "num_truncated_few_shots": 0
285
+ }
286
+ }