lewtun HF Staff commited on
Commit
4193994
·
verified ·
1 Parent(s): e33185a

Upload eval_results/deepseek-ai/deepseek-math-7b-rl/main/math_deepseek_rl_cot/results_2024-07-01T14-36-07.008953.json with huggingface_hub

Browse files
eval_results/deepseek-ai/deepseek-math-7b-rl/main/math_deepseek_rl_cot/results_2024-07-01T14-36-07.008953.json ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 257347.55263128,
9
+ "end_time": 281700.066696367,
10
+ "total_evaluation_time_secondes": "24352.514065086987",
11
+ "model_name": "deepseek-ai/deepseek-math-7b-rl",
12
+ "model_sha": "f3cd419a172ff5a100e9a563ebfe7900d78e9740",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "12.87 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|math_deepseek_rl_cot:algebra|0": {
19
+ "qem": 0.6301600673967986,
20
+ "qem_stderr": 0.01401812717243806
21
+ },
22
+ "custom|math_deepseek_rl_cot:counting_and_probability|0": {
23
+ "qem": 0.34177215189873417,
24
+ "qem_stderr": 0.021808504852491584
25
+ },
26
+ "custom|math_deepseek_rl_cot:geometry|0": {
27
+ "qem": 0.3569937369519833,
28
+ "qem_stderr": 0.021914115773729835
29
+ },
30
+ "custom|math_deepseek_rl_cot:intermediate_algebra|0": {
31
+ "qem": 0.19601328903654486,
32
+ "qem_stderr": 0.013217944513396376
33
+ },
34
+ "custom|math_deepseek_rl_cot:number_theory|0": {
35
+ "qem": 0.31296296296296294,
36
+ "qem_stderr": 0.019972947695805383
37
+ },
38
+ "custom|math_deepseek_rl_cot:prealgebra|0": {
39
+ "qem": 0.6383467278989667,
40
+ "qem_stderr": 0.01628976770999433
41
+ },
42
+ "custom|math_deepseek_rl_cot:precalculus|0": {
43
+ "qem": 0.22527472527472528,
44
+ "qem_stderr": 0.017895005885612287
45
+ },
46
+ "custom|math_deepseek_rl_cot:_average|0": {
47
+ "qem": 0.3859319516315309,
48
+ "qem_stderr": 0.01787377337192398
49
+ },
50
+ "all": {
51
+ "qem": 0.3859319516315309,
52
+ "qem_stderr": 0.01787377337192398
53
+ }
54
+ },
55
+ "versions": {
56
+ "custom|math_deepseek_rl_cot:algebra|0": 0,
57
+ "custom|math_deepseek_rl_cot:counting_and_probability|0": 0,
58
+ "custom|math_deepseek_rl_cot:geometry|0": 0,
59
+ "custom|math_deepseek_rl_cot:intermediate_algebra|0": 0,
60
+ "custom|math_deepseek_rl_cot:number_theory|0": 0,
61
+ "custom|math_deepseek_rl_cot:prealgebra|0": 0,
62
+ "custom|math_deepseek_rl_cot:precalculus|0": 0
63
+ },
64
+ "config_tasks": {
65
+ "custom|math_deepseek_rl_cot:algebra": {
66
+ "name": "math_deepseek_rl_cot:algebra",
67
+ "prompt_function": "math_deepseek_prompt_fn",
68
+ "hf_repo": "lighteval/MATH",
69
+ "hf_subset": "algebra",
70
+ "metric": [
71
+ "quasi_exact_match_math"
72
+ ],
73
+ "hf_avail_splits": [
74
+ "train",
75
+ "test",
76
+ "validation"
77
+ ],
78
+ "evaluation_splits": [
79
+ "test"
80
+ ],
81
+ "few_shots_split": null,
82
+ "few_shots_select": null,
83
+ "generation_size": 2048,
84
+ "stop_sequence": null,
85
+ "output_regex": null,
86
+ "num_samples": null,
87
+ "frozen": false,
88
+ "suite": [
89
+ "custom"
90
+ ],
91
+ "original_num_docs": 1187,
92
+ "effective_num_docs": 1187,
93
+ "trust_dataset": null,
94
+ "must_remove_duplicate_docs": null,
95
+ "version": 0
96
+ },
97
+ "custom|math_deepseek_rl_cot:counting_and_probability": {
98
+ "name": "math_deepseek_rl_cot:counting_and_probability",
99
+ "prompt_function": "math_deepseek_prompt_fn",
100
+ "hf_repo": "lighteval/MATH",
101
+ "hf_subset": "counting_and_probability",
102
+ "metric": [
103
+ "quasi_exact_match_math"
104
+ ],
105
+ "hf_avail_splits": [
106
+ "train",
107
+ "test",
108
+ "validation"
109
+ ],
110
+ "evaluation_splits": [
111
+ "test"
112
+ ],
113
+ "few_shots_split": null,
114
+ "few_shots_select": null,
115
+ "generation_size": 2048,
116
+ "stop_sequence": null,
117
+ "output_regex": null,
118
+ "num_samples": null,
119
+ "frozen": false,
120
+ "suite": [
121
+ "custom"
122
+ ],
123
+ "original_num_docs": 474,
124
+ "effective_num_docs": 474,
125
+ "trust_dataset": null,
126
+ "must_remove_duplicate_docs": null,
127
+ "version": 0
128
+ },
129
+ "custom|math_deepseek_rl_cot:geometry": {
130
+ "name": "math_deepseek_rl_cot:geometry",
131
+ "prompt_function": "math_deepseek_prompt_fn",
132
+ "hf_repo": "lighteval/MATH",
133
+ "hf_subset": "geometry",
134
+ "metric": [
135
+ "quasi_exact_match_math"
136
+ ],
137
+ "hf_avail_splits": [
138
+ "train",
139
+ "test",
140
+ "validation"
141
+ ],
142
+ "evaluation_splits": [
143
+ "test"
144
+ ],
145
+ "few_shots_split": null,
146
+ "few_shots_select": null,
147
+ "generation_size": 2048,
148
+ "stop_sequence": null,
149
+ "output_regex": null,
150
+ "num_samples": null,
151
+ "frozen": false,
152
+ "suite": [
153
+ "custom"
154
+ ],
155
+ "original_num_docs": 479,
156
+ "effective_num_docs": 479,
157
+ "trust_dataset": null,
158
+ "must_remove_duplicate_docs": null,
159
+ "version": 0
160
+ },
161
+ "custom|math_deepseek_rl_cot:intermediate_algebra": {
162
+ "name": "math_deepseek_rl_cot:intermediate_algebra",
163
+ "prompt_function": "math_deepseek_prompt_fn",
164
+ "hf_repo": "lighteval/MATH",
165
+ "hf_subset": "intermediate_algebra",
166
+ "metric": [
167
+ "quasi_exact_match_math"
168
+ ],
169
+ "hf_avail_splits": [
170
+ "train",
171
+ "test",
172
+ "validation"
173
+ ],
174
+ "evaluation_splits": [
175
+ "test"
176
+ ],
177
+ "few_shots_split": null,
178
+ "few_shots_select": null,
179
+ "generation_size": 2048,
180
+ "stop_sequence": null,
181
+ "output_regex": null,
182
+ "num_samples": null,
183
+ "frozen": false,
184
+ "suite": [
185
+ "custom"
186
+ ],
187
+ "original_num_docs": 903,
188
+ "effective_num_docs": 903,
189
+ "trust_dataset": null,
190
+ "must_remove_duplicate_docs": null,
191
+ "version": 0
192
+ },
193
+ "custom|math_deepseek_rl_cot:number_theory": {
194
+ "name": "math_deepseek_rl_cot:number_theory",
195
+ "prompt_function": "math_deepseek_prompt_fn",
196
+ "hf_repo": "lighteval/MATH",
197
+ "hf_subset": "number_theory",
198
+ "metric": [
199
+ "quasi_exact_match_math"
200
+ ],
201
+ "hf_avail_splits": [
202
+ "train",
203
+ "test",
204
+ "validation"
205
+ ],
206
+ "evaluation_splits": [
207
+ "test"
208
+ ],
209
+ "few_shots_split": null,
210
+ "few_shots_select": null,
211
+ "generation_size": 2048,
212
+ "stop_sequence": null,
213
+ "output_regex": null,
214
+ "num_samples": null,
215
+ "frozen": false,
216
+ "suite": [
217
+ "custom"
218
+ ],
219
+ "original_num_docs": 540,
220
+ "effective_num_docs": 540,
221
+ "trust_dataset": null,
222
+ "must_remove_duplicate_docs": null,
223
+ "version": 0
224
+ },
225
+ "custom|math_deepseek_rl_cot:prealgebra": {
226
+ "name": "math_deepseek_rl_cot:prealgebra",
227
+ "prompt_function": "math_deepseek_prompt_fn",
228
+ "hf_repo": "lighteval/MATH",
229
+ "hf_subset": "prealgebra",
230
+ "metric": [
231
+ "quasi_exact_match_math"
232
+ ],
233
+ "hf_avail_splits": [
234
+ "train",
235
+ "test",
236
+ "validation"
237
+ ],
238
+ "evaluation_splits": [
239
+ "test"
240
+ ],
241
+ "few_shots_split": null,
242
+ "few_shots_select": null,
243
+ "generation_size": 2048,
244
+ "stop_sequence": null,
245
+ "output_regex": null,
246
+ "num_samples": null,
247
+ "frozen": false,
248
+ "suite": [
249
+ "custom"
250
+ ],
251
+ "original_num_docs": 871,
252
+ "effective_num_docs": 871,
253
+ "trust_dataset": null,
254
+ "must_remove_duplicate_docs": null,
255
+ "version": 0
256
+ },
257
+ "custom|math_deepseek_rl_cot:precalculus": {
258
+ "name": "math_deepseek_rl_cot:precalculus",
259
+ "prompt_function": "math_deepseek_prompt_fn",
260
+ "hf_repo": "lighteval/MATH",
261
+ "hf_subset": "precalculus",
262
+ "metric": [
263
+ "quasi_exact_match_math"
264
+ ],
265
+ "hf_avail_splits": [
266
+ "train",
267
+ "test",
268
+ "validation"
269
+ ],
270
+ "evaluation_splits": [
271
+ "test"
272
+ ],
273
+ "few_shots_split": null,
274
+ "few_shots_select": null,
275
+ "generation_size": 2048,
276
+ "stop_sequence": null,
277
+ "output_regex": null,
278
+ "num_samples": null,
279
+ "frozen": false,
280
+ "suite": [
281
+ "custom"
282
+ ],
283
+ "original_num_docs": 546,
284
+ "effective_num_docs": 546,
285
+ "trust_dataset": null,
286
+ "must_remove_duplicate_docs": null,
287
+ "version": 0
288
+ }
289
+ },
290
+ "summary_tasks": {
291
+ "custom|math_deepseek_rl_cot:algebra|0": {
292
+ "hashes": {
293
+ "hash_examples": "dcb01820b9d86e0d",
294
+ "hash_full_prompts": "7f1260779d95bf58",
295
+ "hash_input_tokens": "57158eeae9b6bf38",
296
+ "hash_cont_tokens": "09cfcbc3b65ecf48"
297
+ },
298
+ "truncated": 1187,
299
+ "non_truncated": 0,
300
+ "padded": 153,
301
+ "non_padded": 1034,
302
+ "effective_few_shots": 0.0,
303
+ "num_truncated_few_shots": 0
304
+ },
305
+ "custom|math_deepseek_rl_cot:counting_and_probability|0": {
306
+ "hashes": {
307
+ "hash_examples": "d72845695bf62192",
308
+ "hash_full_prompts": "1f67dad9a500dbb8",
309
+ "hash_input_tokens": "588febc753bee08c",
310
+ "hash_cont_tokens": "895a162596fa2068"
311
+ },
312
+ "truncated": 474,
313
+ "non_truncated": 0,
314
+ "padded": 51,
315
+ "non_padded": 423,
316
+ "effective_few_shots": 0.0,
317
+ "num_truncated_few_shots": 0
318
+ },
319
+ "custom|math_deepseek_rl_cot:geometry|0": {
320
+ "hashes": {
321
+ "hash_examples": "ec3e32d61c9774ce",
322
+ "hash_full_prompts": "00c7e19804742ce4",
323
+ "hash_input_tokens": "588d58229de15812",
324
+ "hash_cont_tokens": "be559a6a20337605"
325
+ },
326
+ "truncated": 479,
327
+ "non_truncated": 0,
328
+ "padded": 114,
329
+ "non_padded": 365,
330
+ "effective_few_shots": 0.0,
331
+ "num_truncated_few_shots": 0
332
+ },
333
+ "custom|math_deepseek_rl_cot:intermediate_algebra|0": {
334
+ "hashes": {
335
+ "hash_examples": "e22dffce2050ffaa",
336
+ "hash_full_prompts": "459d909fa8d3429a",
337
+ "hash_input_tokens": "3a1a54fee72e0a10",
338
+ "hash_cont_tokens": "701415421948964c"
339
+ },
340
+ "truncated": 903,
341
+ "non_truncated": 0,
342
+ "padded": 43,
343
+ "non_padded": 860,
344
+ "effective_few_shots": 0.0,
345
+ "num_truncated_few_shots": 0
346
+ },
347
+ "custom|math_deepseek_rl_cot:number_theory|0": {
348
+ "hashes": {
349
+ "hash_examples": "c87bdc11a396a831",
350
+ "hash_full_prompts": "6ca30019751b160b",
351
+ "hash_input_tokens": "3b361cd87ffc912c",
352
+ "hash_cont_tokens": "4e0c8752abc4928f"
353
+ },
354
+ "truncated": 537,
355
+ "non_truncated": 3,
356
+ "padded": 6,
357
+ "non_padded": 534,
358
+ "effective_few_shots": 0.0,
359
+ "num_truncated_few_shots": 0
360
+ },
361
+ "custom|math_deepseek_rl_cot:prealgebra|0": {
362
+ "hashes": {
363
+ "hash_examples": "28267717005d8d7d",
364
+ "hash_full_prompts": "f5100ec1648003c2",
365
+ "hash_input_tokens": "0a18706449a101d1",
366
+ "hash_cont_tokens": "33c6035b479b6819"
367
+ },
368
+ "truncated": 871,
369
+ "non_truncated": 0,
370
+ "padded": 53,
371
+ "non_padded": 818,
372
+ "effective_few_shots": 0.0,
373
+ "num_truncated_few_shots": 0
374
+ },
375
+ "custom|math_deepseek_rl_cot:precalculus|0": {
376
+ "hashes": {
377
+ "hash_examples": "40e85e93f5bf21d7",
378
+ "hash_full_prompts": "7c21366b20b4b9ea",
379
+ "hash_input_tokens": "c6f547e0dfd9f529",
380
+ "hash_cont_tokens": "44b1230b7034bf1b"
381
+ },
382
+ "truncated": 546,
383
+ "non_truncated": 0,
384
+ "padded": 21,
385
+ "non_padded": 525,
386
+ "effective_few_shots": 0.0,
387
+ "num_truncated_few_shots": 0
388
+ }
389
+ },
390
+ "summary_general": {
391
+ "hashes": {
392
+ "hash_examples": "825b967c0800ccea",
393
+ "hash_full_prompts": "af07312a5dbdfcb9",
394
+ "hash_input_tokens": "152bc219b338fc22",
395
+ "hash_cont_tokens": "8ce59928066af846"
396
+ },
397
+ "truncated": 4997,
398
+ "non_truncated": 3,
399
+ "padded": 441,
400
+ "non_padded": 4559,
401
+ "num_truncated_few_shots": 0
402
+ }
403
+ }