lewtun HF Staff commited on
Commit
e33185a
·
verified ·
1 Parent(s): 4bb3987

Upload eval_results/deepseek-ai/deepseek-math-7b-rl/main/math_deepseek_cot/results_2024-07-01T02-38-10.632353.json with huggingface_hub

Browse files
eval_results/deepseek-ai/deepseek-math-7b-rl/main/math_deepseek_cot/results_2024-07-01T02-38-10.632353.json ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 253721.302661655,
9
+ "end_time": 278458.960923328,
10
+ "total_evaluation_time_secondes": "24737.658261673",
11
+ "model_name": "deepseek-ai/deepseek-math-7b-rl",
12
+ "model_sha": "f3cd419a172ff5a100e9a563ebfe7900d78e9740",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "12.87 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|math_deepseek_cot:algebra|0": {
19
+ "qem": 0.03201347935973041,
20
+ "qem_stderr": 0.00511162221827606
21
+ },
22
+ "custom|math_deepseek_cot:counting_and_probability|0": {
23
+ "qem": 0.004219409282700422,
24
+ "qem_stderr": 0.002980417365102051
25
+ },
26
+ "custom|math_deepseek_cot:geometry|0": {
27
+ "qem": 0.010438413361169102,
28
+ "qem_stderr": 0.004648627117184639
29
+ },
30
+ "custom|math_deepseek_cot:intermediate_algebra|0": {
31
+ "qem": 0.013289036544850499,
32
+ "qem_stderr": 0.003812751108019936
33
+ },
34
+ "custom|math_deepseek_cot:number_theory|0": {
35
+ "qem": 0.0,
36
+ "qem_stderr": 0.0
37
+ },
38
+ "custom|math_deepseek_cot:prealgebra|0": {
39
+ "qem": 0.04477611940298507,
40
+ "qem_stderr": 0.007011584710623336
41
+ },
42
+ "custom|math_deepseek_cot:precalculus|0": {
43
+ "qem": 0.020146520146520148,
44
+ "qem_stderr": 0.006018417889653956
45
+ },
46
+ "custom|math_deepseek_cot:_average|0": {
47
+ "qem": 0.017840425442565092,
48
+ "qem_stderr": 0.0042262029155514256
49
+ },
50
+ "all": {
51
+ "qem": 0.017840425442565092,
52
+ "qem_stderr": 0.0042262029155514256
53
+ }
54
+ },
55
+ "versions": {
56
+ "custom|math_deepseek_cot:algebra|0": 0,
57
+ "custom|math_deepseek_cot:counting_and_probability|0": 0,
58
+ "custom|math_deepseek_cot:geometry|0": 0,
59
+ "custom|math_deepseek_cot:intermediate_algebra|0": 0,
60
+ "custom|math_deepseek_cot:number_theory|0": 0,
61
+ "custom|math_deepseek_cot:prealgebra|0": 0,
62
+ "custom|math_deepseek_cot:precalculus|0": 0
63
+ },
64
+ "config_tasks": {
65
+ "custom|math_deepseek_cot:algebra": {
66
+ "name": "math_deepseek_cot:algebra",
67
+ "prompt_function": "math_deepseek_prompt_fn",
68
+ "hf_repo": "lighteval/MATH",
69
+ "hf_subset": "algebra",
70
+ "metric": [
71
+ "quasi_exact_match_math"
72
+ ],
73
+ "hf_avail_splits": [
74
+ "train",
75
+ "test",
76
+ "validation"
77
+ ],
78
+ "evaluation_splits": [
79
+ "test"
80
+ ],
81
+ "few_shots_split": null,
82
+ "few_shots_select": null,
83
+ "generation_size": 2048,
84
+ "stop_sequence": null,
85
+ "output_regex": null,
86
+ "num_samples": null,
87
+ "frozen": false,
88
+ "suite": [
89
+ "custom"
90
+ ],
91
+ "original_num_docs": 1187,
92
+ "effective_num_docs": 1187,
93
+ "trust_dataset": null,
94
+ "must_remove_duplicate_docs": null,
95
+ "version": 0
96
+ },
97
+ "custom|math_deepseek_cot:counting_and_probability": {
98
+ "name": "math_deepseek_cot:counting_and_probability",
99
+ "prompt_function": "math_deepseek_prompt_fn",
100
+ "hf_repo": "lighteval/MATH",
101
+ "hf_subset": "counting_and_probability",
102
+ "metric": [
103
+ "quasi_exact_match_math"
104
+ ],
105
+ "hf_avail_splits": [
106
+ "train",
107
+ "test",
108
+ "validation"
109
+ ],
110
+ "evaluation_splits": [
111
+ "test"
112
+ ],
113
+ "few_shots_split": null,
114
+ "few_shots_select": null,
115
+ "generation_size": 2048,
116
+ "stop_sequence": null,
117
+ "output_regex": null,
118
+ "num_samples": null,
119
+ "frozen": false,
120
+ "suite": [
121
+ "custom"
122
+ ],
123
+ "original_num_docs": 474,
124
+ "effective_num_docs": 474,
125
+ "trust_dataset": null,
126
+ "must_remove_duplicate_docs": null,
127
+ "version": 0
128
+ },
129
+ "custom|math_deepseek_cot:geometry": {
130
+ "name": "math_deepseek_cot:geometry",
131
+ "prompt_function": "math_deepseek_prompt_fn",
132
+ "hf_repo": "lighteval/MATH",
133
+ "hf_subset": "geometry",
134
+ "metric": [
135
+ "quasi_exact_match_math"
136
+ ],
137
+ "hf_avail_splits": [
138
+ "train",
139
+ "test",
140
+ "validation"
141
+ ],
142
+ "evaluation_splits": [
143
+ "test"
144
+ ],
145
+ "few_shots_split": null,
146
+ "few_shots_select": null,
147
+ "generation_size": 2048,
148
+ "stop_sequence": null,
149
+ "output_regex": null,
150
+ "num_samples": null,
151
+ "frozen": false,
152
+ "suite": [
153
+ "custom"
154
+ ],
155
+ "original_num_docs": 479,
156
+ "effective_num_docs": 479,
157
+ "trust_dataset": null,
158
+ "must_remove_duplicate_docs": null,
159
+ "version": 0
160
+ },
161
+ "custom|math_deepseek_cot:intermediate_algebra": {
162
+ "name": "math_deepseek_cot:intermediate_algebra",
163
+ "prompt_function": "math_deepseek_prompt_fn",
164
+ "hf_repo": "lighteval/MATH",
165
+ "hf_subset": "intermediate_algebra",
166
+ "metric": [
167
+ "quasi_exact_match_math"
168
+ ],
169
+ "hf_avail_splits": [
170
+ "train",
171
+ "test",
172
+ "validation"
173
+ ],
174
+ "evaluation_splits": [
175
+ "test"
176
+ ],
177
+ "few_shots_split": null,
178
+ "few_shots_select": null,
179
+ "generation_size": 2048,
180
+ "stop_sequence": null,
181
+ "output_regex": null,
182
+ "num_samples": null,
183
+ "frozen": false,
184
+ "suite": [
185
+ "custom"
186
+ ],
187
+ "original_num_docs": 903,
188
+ "effective_num_docs": 903,
189
+ "trust_dataset": null,
190
+ "must_remove_duplicate_docs": null,
191
+ "version": 0
192
+ },
193
+ "custom|math_deepseek_cot:number_theory": {
194
+ "name": "math_deepseek_cot:number_theory",
195
+ "prompt_function": "math_deepseek_prompt_fn",
196
+ "hf_repo": "lighteval/MATH",
197
+ "hf_subset": "number_theory",
198
+ "metric": [
199
+ "quasi_exact_match_math"
200
+ ],
201
+ "hf_avail_splits": [
202
+ "train",
203
+ "test",
204
+ "validation"
205
+ ],
206
+ "evaluation_splits": [
207
+ "test"
208
+ ],
209
+ "few_shots_split": null,
210
+ "few_shots_select": null,
211
+ "generation_size": 2048,
212
+ "stop_sequence": null,
213
+ "output_regex": null,
214
+ "num_samples": null,
215
+ "frozen": false,
216
+ "suite": [
217
+ "custom"
218
+ ],
219
+ "original_num_docs": 540,
220
+ "effective_num_docs": 540,
221
+ "trust_dataset": null,
222
+ "must_remove_duplicate_docs": null,
223
+ "version": 0
224
+ },
225
+ "custom|math_deepseek_cot:prealgebra": {
226
+ "name": "math_deepseek_cot:prealgebra",
227
+ "prompt_function": "math_deepseek_prompt_fn",
228
+ "hf_repo": "lighteval/MATH",
229
+ "hf_subset": "prealgebra",
230
+ "metric": [
231
+ "quasi_exact_match_math"
232
+ ],
233
+ "hf_avail_splits": [
234
+ "train",
235
+ "test",
236
+ "validation"
237
+ ],
238
+ "evaluation_splits": [
239
+ "test"
240
+ ],
241
+ "few_shots_split": null,
242
+ "few_shots_select": null,
243
+ "generation_size": 2048,
244
+ "stop_sequence": null,
245
+ "output_regex": null,
246
+ "num_samples": null,
247
+ "frozen": false,
248
+ "suite": [
249
+ "custom"
250
+ ],
251
+ "original_num_docs": 871,
252
+ "effective_num_docs": 871,
253
+ "trust_dataset": null,
254
+ "must_remove_duplicate_docs": null,
255
+ "version": 0
256
+ },
257
+ "custom|math_deepseek_cot:precalculus": {
258
+ "name": "math_deepseek_cot:precalculus",
259
+ "prompt_function": "math_deepseek_prompt_fn",
260
+ "hf_repo": "lighteval/MATH",
261
+ "hf_subset": "precalculus",
262
+ "metric": [
263
+ "quasi_exact_match_math"
264
+ ],
265
+ "hf_avail_splits": [
266
+ "train",
267
+ "test",
268
+ "validation"
269
+ ],
270
+ "evaluation_splits": [
271
+ "test"
272
+ ],
273
+ "few_shots_split": null,
274
+ "few_shots_select": null,
275
+ "generation_size": 2048,
276
+ "stop_sequence": null,
277
+ "output_regex": null,
278
+ "num_samples": null,
279
+ "frozen": false,
280
+ "suite": [
281
+ "custom"
282
+ ],
283
+ "original_num_docs": 546,
284
+ "effective_num_docs": 546,
285
+ "trust_dataset": null,
286
+ "must_remove_duplicate_docs": null,
287
+ "version": 0
288
+ }
289
+ },
290
+ "summary_tasks": {
291
+ "custom|math_deepseek_cot:algebra|0": {
292
+ "hashes": {
293
+ "hash_examples": "dd0c20b1f0cb4647",
294
+ "hash_full_prompts": "9dc2c8ae59782b3b",
295
+ "hash_input_tokens": "6c82f6e0f700d0dd",
296
+ "hash_cont_tokens": "8f7129ae00c1e7f4"
297
+ },
298
+ "truncated": 1187,
299
+ "non_truncated": 0,
300
+ "padded": 153,
301
+ "non_padded": 1034,
302
+ "effective_few_shots": 0.0,
303
+ "num_truncated_few_shots": 0
304
+ },
305
+ "custom|math_deepseek_cot:counting_and_probability|0": {
306
+ "hashes": {
307
+ "hash_examples": "155d950d14c44af4",
308
+ "hash_full_prompts": "0229e46c276fb642",
309
+ "hash_input_tokens": "8d6a6e01cba0c2df",
310
+ "hash_cont_tokens": "9e9011bd181065a6"
311
+ },
312
+ "truncated": 474,
313
+ "non_truncated": 0,
314
+ "padded": 51,
315
+ "non_padded": 423,
316
+ "effective_few_shots": 0.0,
317
+ "num_truncated_few_shots": 0
318
+ },
319
+ "custom|math_deepseek_cot:geometry|0": {
320
+ "hashes": {
321
+ "hash_examples": "d0903037eaaf129f",
322
+ "hash_full_prompts": "7ec2382c7aa27657",
323
+ "hash_input_tokens": "6be40ba81a92ef74",
324
+ "hash_cont_tokens": "557b92d91c772021"
325
+ },
326
+ "truncated": 479,
327
+ "non_truncated": 0,
328
+ "padded": 114,
329
+ "non_padded": 365,
330
+ "effective_few_shots": 0.0,
331
+ "num_truncated_few_shots": 0
332
+ },
333
+ "custom|math_deepseek_cot:intermediate_algebra|0": {
334
+ "hashes": {
335
+ "hash_examples": "11e643d9ae66ed0e",
336
+ "hash_full_prompts": "35b30af62390119f",
337
+ "hash_input_tokens": "bc5244afa75043dc",
338
+ "hash_cont_tokens": "f95c183c254a69cb"
339
+ },
340
+ "truncated": 903,
341
+ "non_truncated": 0,
342
+ "padded": 43,
343
+ "non_padded": 860,
344
+ "effective_few_shots": 0.0,
345
+ "num_truncated_few_shots": 0
346
+ },
347
+ "custom|math_deepseek_cot:number_theory|0": {
348
+ "hashes": {
349
+ "hash_examples": "d5cb46b7f8cc037c",
350
+ "hash_full_prompts": "d35d76ee7b76069c",
351
+ "hash_input_tokens": "2898f6f86f93fd51",
352
+ "hash_cont_tokens": "391f851a0533e193"
353
+ },
354
+ "truncated": 537,
355
+ "non_truncated": 3,
356
+ "padded": 6,
357
+ "non_padded": 534,
358
+ "effective_few_shots": 0.0,
359
+ "num_truncated_few_shots": 0
360
+ },
361
+ "custom|math_deepseek_cot:prealgebra|0": {
362
+ "hashes": {
363
+ "hash_examples": "927b75048eaa22ce",
364
+ "hash_full_prompts": "b44563e8a610a5e0",
365
+ "hash_input_tokens": "d61848981f04e253",
366
+ "hash_cont_tokens": "22cca59bb15fdf81"
367
+ },
368
+ "truncated": 871,
369
+ "non_truncated": 0,
370
+ "padded": 53,
371
+ "non_padded": 818,
372
+ "effective_few_shots": 0.0,
373
+ "num_truncated_few_shots": 0
374
+ },
375
+ "custom|math_deepseek_cot:precalculus|0": {
376
+ "hashes": {
377
+ "hash_examples": "a2ad19d3f21c8c63",
378
+ "hash_full_prompts": "16ec0e775cc4e656",
379
+ "hash_input_tokens": "6545a1bd9ce71e45",
380
+ "hash_cont_tokens": "ee65a5af8a844194"
381
+ },
382
+ "truncated": 546,
383
+ "non_truncated": 0,
384
+ "padded": 21,
385
+ "non_padded": 525,
386
+ "effective_few_shots": 0.0,
387
+ "num_truncated_few_shots": 0
388
+ }
389
+ },
390
+ "summary_general": {
391
+ "hashes": {
392
+ "hash_examples": "0aa3308517bd659c",
393
+ "hash_full_prompts": "cb8f5223e4e35bcc",
394
+ "hash_input_tokens": "24e1699323938802",
395
+ "hash_cont_tokens": "e71cd19e1d2d55b1"
396
+ },
397
+ "truncated": 4997,
398
+ "non_truncated": 3,
399
+ "padded": 441,
400
+ "non_padded": 4559,
401
+ "num_truncated_few_shots": 0
402
+ }
403
+ }