vwxyzjn commited on
Commit
f464876
·
verified ·
1 Parent(s): 4c68057

Upload eval_results/AI-MO/deepseek-math-7b-kto/aimo_v24.34.0/mini_math_v2/results_2024-06-03T04-10-21.980640.json with huggingface_hub

Browse files
eval_results/AI-MO/deepseek-math-7b-kto/aimo_v24.34.0/mini_math_v2/results_2024-06-03T04-10-21.980640.json ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 569566.940117332,
9
+ "end_time": 570505.727296684,
10
+ "total_evaluation_time_secondes": "938.7871793520171",
11
+ "model_name": "AI-MO/deepseek-math-7b-kto",
12
+ "model_sha": "476bc0992a1e70ee0faed1e29ad5885e0a49e85d",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "12.93 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|mini_math_v2:level_1|0": {
19
+ "qem": 0.45714285714285713,
20
+ "qem_stderr": 0.08543371446816024
21
+ },
22
+ "custom|mini_math_v2:level_2|0": {
23
+ "qem": 0.5277777777777778,
24
+ "qem_stderr": 0.05924743948371487
25
+ },
26
+ "custom|mini_math_v2:level_3|0": {
27
+ "qem": 0.5333333333333333,
28
+ "qem_stderr": 0.05288198530254015
29
+ },
30
+ "custom|mini_math_v2:level_4|0": {
31
+ "qem": 0.29896907216494845,
32
+ "qem_stderr": 0.046724655123323536
33
+ },
34
+ "custom|mini_math_v2:level_5|0": {
35
+ "qem": 0.12264150943396226,
36
+ "qem_stderr": 0.03201198727823859
37
+ },
38
+ "custom|mini_math_v2:_average|0": {
39
+ "qem": 0.3879729099705758,
40
+ "qem_stderr": 0.05525995633119547
41
+ },
42
+ "all": {
43
+ "qem": 0.3879729099705758,
44
+ "qem_stderr": 0.05525995633119547
45
+ }
46
+ },
47
+ "versions": {
48
+ "custom|mini_math_v2:level_1|0": 0,
49
+ "custom|mini_math_v2:level_2|0": 0,
50
+ "custom|mini_math_v2:level_3|0": 0,
51
+ "custom|mini_math_v2:level_4|0": 0,
52
+ "custom|mini_math_v2:level_5|0": 0
53
+ },
54
+ "config_tasks": {
55
+ "custom|mini_math_v2:level_1": {
56
+ "name": "mini_math_v2:level_1",
57
+ "prompt_function": "minimath_prompt_fn",
58
+ "hf_repo": "AI-MO/lighteval-mini-math",
59
+ "hf_subset": "Level 1",
60
+ "metric": [
61
+ "quasi_exact_match_math"
62
+ ],
63
+ "hf_avail_splits": [
64
+ "train",
65
+ "test"
66
+ ],
67
+ "evaluation_splits": [
68
+ "test"
69
+ ],
70
+ "few_shots_split": null,
71
+ "few_shots_select": null,
72
+ "generation_size": 2048,
73
+ "stop_sequence": null,
74
+ "output_regex": null,
75
+ "num_samples": null,
76
+ "frozen": false,
77
+ "suite": [
78
+ "custom"
79
+ ],
80
+ "original_num_docs": 35,
81
+ "effective_num_docs": 35,
82
+ "trust_dataset": null,
83
+ "must_remove_duplicate_docs": null,
84
+ "version": 0
85
+ },
86
+ "custom|mini_math_v2:level_2": {
87
+ "name": "mini_math_v2:level_2",
88
+ "prompt_function": "minimath_prompt_fn",
89
+ "hf_repo": "AI-MO/lighteval-mini-math",
90
+ "hf_subset": "Level 2",
91
+ "metric": [
92
+ "quasi_exact_match_math"
93
+ ],
94
+ "hf_avail_splits": [
95
+ "train",
96
+ "test"
97
+ ],
98
+ "evaluation_splits": [
99
+ "test"
100
+ ],
101
+ "few_shots_split": null,
102
+ "few_shots_select": null,
103
+ "generation_size": 2048,
104
+ "stop_sequence": null,
105
+ "output_regex": null,
106
+ "num_samples": null,
107
+ "frozen": false,
108
+ "suite": [
109
+ "custom"
110
+ ],
111
+ "original_num_docs": 72,
112
+ "effective_num_docs": 72,
113
+ "trust_dataset": null,
114
+ "must_remove_duplicate_docs": null,
115
+ "version": 0
116
+ },
117
+ "custom|mini_math_v2:level_3": {
118
+ "name": "mini_math_v2:level_3",
119
+ "prompt_function": "minimath_prompt_fn",
120
+ "hf_repo": "AI-MO/lighteval-mini-math",
121
+ "hf_subset": "Level 3",
122
+ "metric": [
123
+ "quasi_exact_match_math"
124
+ ],
125
+ "hf_avail_splits": [
126
+ "train",
127
+ "test"
128
+ ],
129
+ "evaluation_splits": [
130
+ "test"
131
+ ],
132
+ "few_shots_split": null,
133
+ "few_shots_select": null,
134
+ "generation_size": 2048,
135
+ "stop_sequence": null,
136
+ "output_regex": null,
137
+ "num_samples": null,
138
+ "frozen": false,
139
+ "suite": [
140
+ "custom"
141
+ ],
142
+ "original_num_docs": 90,
143
+ "effective_num_docs": 90,
144
+ "trust_dataset": null,
145
+ "must_remove_duplicate_docs": null,
146
+ "version": 0
147
+ },
148
+ "custom|mini_math_v2:level_4": {
149
+ "name": "mini_math_v2:level_4",
150
+ "prompt_function": "minimath_prompt_fn",
151
+ "hf_repo": "AI-MO/lighteval-mini-math",
152
+ "hf_subset": "Level 4",
153
+ "metric": [
154
+ "quasi_exact_match_math"
155
+ ],
156
+ "hf_avail_splits": [
157
+ "train",
158
+ "test"
159
+ ],
160
+ "evaluation_splits": [
161
+ "test"
162
+ ],
163
+ "few_shots_split": null,
164
+ "few_shots_select": null,
165
+ "generation_size": 2048,
166
+ "stop_sequence": null,
167
+ "output_regex": null,
168
+ "num_samples": null,
169
+ "frozen": false,
170
+ "suite": [
171
+ "custom"
172
+ ],
173
+ "original_num_docs": 97,
174
+ "effective_num_docs": 97,
175
+ "trust_dataset": null,
176
+ "must_remove_duplicate_docs": null,
177
+ "version": 0
178
+ },
179
+ "custom|mini_math_v2:level_5": {
180
+ "name": "mini_math_v2:level_5",
181
+ "prompt_function": "minimath_prompt_fn",
182
+ "hf_repo": "AI-MO/lighteval-mini-math",
183
+ "hf_subset": "Level 5",
184
+ "metric": [
185
+ "quasi_exact_match_math"
186
+ ],
187
+ "hf_avail_splits": [
188
+ "train",
189
+ "test"
190
+ ],
191
+ "evaluation_splits": [
192
+ "test"
193
+ ],
194
+ "few_shots_split": null,
195
+ "few_shots_select": null,
196
+ "generation_size": 2048,
197
+ "stop_sequence": null,
198
+ "output_regex": null,
199
+ "num_samples": null,
200
+ "frozen": false,
201
+ "suite": [
202
+ "custom"
203
+ ],
204
+ "original_num_docs": 106,
205
+ "effective_num_docs": 106,
206
+ "trust_dataset": null,
207
+ "must_remove_duplicate_docs": null,
208
+ "version": 0
209
+ }
210
+ },
211
+ "summary_tasks": {
212
+ "custom|mini_math_v2:level_1|0": {
213
+ "hashes": {
214
+ "hash_examples": "ad7e0d89fb7b0664",
215
+ "hash_full_prompts": "8317d6f66ac80683",
216
+ "hash_input_tokens": "ada1a3a4fe7121bb",
217
+ "hash_cont_tokens": "3dedb4124f9710f6"
218
+ },
219
+ "truncated": 35,
220
+ "non_truncated": 0,
221
+ "padded": 24,
222
+ "non_padded": 11,
223
+ "effective_few_shots": 0.0,
224
+ "num_truncated_few_shots": 0
225
+ },
226
+ "custom|mini_math_v2:level_2|0": {
227
+ "hashes": {
228
+ "hash_examples": "493b7e3130e3a50d",
229
+ "hash_full_prompts": "3d6ff6c491597924",
230
+ "hash_input_tokens": "f4355f12814f6de1",
231
+ "hash_cont_tokens": "7e927752e9b5f25d"
232
+ },
233
+ "truncated": 72,
234
+ "non_truncated": 0,
235
+ "padded": 37,
236
+ "non_padded": 35,
237
+ "effective_few_shots": 0.0,
238
+ "num_truncated_few_shots": 0
239
+ },
240
+ "custom|mini_math_v2:level_3|0": {
241
+ "hashes": {
242
+ "hash_examples": "11edfc7fc00756f4",
243
+ "hash_full_prompts": "21e5d973c3ee6b21",
244
+ "hash_input_tokens": "635ee9d0883e544e",
245
+ "hash_cont_tokens": "068f437b105efbcd"
246
+ },
247
+ "truncated": 90,
248
+ "non_truncated": 0,
249
+ "padded": 37,
250
+ "non_padded": 53,
251
+ "effective_few_shots": 0.0,
252
+ "num_truncated_few_shots": 0
253
+ },
254
+ "custom|mini_math_v2:level_4|0": {
255
+ "hashes": {
256
+ "hash_examples": "a901e1669616ae50",
257
+ "hash_full_prompts": "83a6822c2b45515c",
258
+ "hash_input_tokens": "649020009f32d11c",
259
+ "hash_cont_tokens": "bf9bdfbe7cea6932"
260
+ },
261
+ "truncated": 96,
262
+ "non_truncated": 1,
263
+ "padded": 34,
264
+ "non_padded": 63,
265
+ "effective_few_shots": 0.0,
266
+ "num_truncated_few_shots": 0
267
+ },
268
+ "custom|mini_math_v2:level_5|0": {
269
+ "hashes": {
270
+ "hash_examples": "1d024f2e2410736e",
271
+ "hash_full_prompts": "9054791972fe1d46",
272
+ "hash_input_tokens": "af6c17bc482b74c2",
273
+ "hash_cont_tokens": "dc9d4a985f06c71f"
274
+ },
275
+ "truncated": 105,
276
+ "non_truncated": 1,
277
+ "padded": 27,
278
+ "non_padded": 79,
279
+ "effective_few_shots": 0.0,
280
+ "num_truncated_few_shots": 0
281
+ }
282
+ },
283
+ "summary_general": {
284
+ "hashes": {
285
+ "hash_examples": "06e45ee0bae45a44",
286
+ "hash_full_prompts": "8d590d8ff727493b",
287
+ "hash_input_tokens": "8b72351bf9498684",
288
+ "hash_cont_tokens": "7afbc3b318c9ddc2"
289
+ },
290
+ "truncated": 398,
291
+ "non_truncated": 2,
292
+ "padded": 159,
293
+ "non_padded": 241,
294
+ "num_truncated_few_shots": 0
295
+ }
296
+ }