edbeeching HF staff commited on
Commit
991b78a
·
verified ·
1 Parent(s): d492d13

Upload eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v00.33/mini_math_v2_mmos/results_2024-05-17T08-21-48.012606.json with huggingface_hub

Browse files
eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v00.33/mini_math_v2_mmos/results_2024-05-17T08-21-48.012606.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 51129.923433572,
9
+ "end_time": 51761.743144073,
10
+ "total_evaluation_time_secondes": "631.8197105009967",
11
+ "model_name": "AI-MO/deepseek-math-7b-rl-sft",
12
+ "model_sha": "45e02217043850698c5138c17fe4ee7b0bebb723",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "12.93 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|mini_math_v2_mmos:level_1|0": {
19
+ "qem": 0.6571428571428571,
20
+ "qem_stderr": 0.08140424227436861
21
+ },
22
+ "custom|mini_math_v2_mmos:level_2|0": {
23
+ "qem": 0.4166666666666667,
24
+ "qem_stderr": 0.05850912479161746
25
+ },
26
+ "custom|mini_math_v2_mmos:level_3|0": {
27
+ "qem": 0.4666666666666667,
28
+ "qem_stderr": 0.05288198530254015
29
+ },
30
+ "custom|mini_math_v2_mmos:level_4|0": {
31
+ "qem": 0.26804123711340205,
32
+ "qem_stderr": 0.045207292498016735
33
+ },
34
+ "custom|mini_math_v2_mmos:level_5|0": {
35
+ "qem": 0.1320754716981132,
36
+ "qem_stderr": 0.03304132210456279
37
+ },
38
+ "custom|mini_math_v2_mmos:_average|0": {
39
+ "qem": 0.3881185798575411,
40
+ "qem_stderr": 0.05420879339422115
41
+ },
42
+ "all": {
43
+ "qem": 0.3881185798575411,
44
+ "qem_stderr": 0.05420879339422115
45
+ }
46
+ },
47
+ "versions": {
48
+ "custom|mini_math_v2_mmos:level_1|0": 0,
49
+ "custom|mini_math_v2_mmos:level_2|0": 0,
50
+ "custom|mini_math_v2_mmos:level_3|0": 0,
51
+ "custom|mini_math_v2_mmos:level_4|0": 0,
52
+ "custom|mini_math_v2_mmos:level_5|0": 0
53
+ },
54
+ "config_tasks": {
55
+ "custom|mini_math_v2_mmos:level_1": {
56
+ "name": "mini_math_v2_mmos:level_1",
57
+ "prompt_function": "minimath_mmos_prompt_fn",
58
+ "hf_repo": "AI-MO/lighteval-mini-math",
59
+ "hf_subset": "Level 1",
60
+ "metric": [
61
+ "quasi_exact_match_code_and_math"
62
+ ],
63
+ "hf_avail_splits": [
64
+ "train",
65
+ "test"
66
+ ],
67
+ "evaluation_splits": [
68
+ "test"
69
+ ],
70
+ "few_shots_split": null,
71
+ "few_shots_select": null,
72
+ "generation_size": 2048,
73
+ "stop_sequence": null,
74
+ "output_regex": null,
75
+ "frozen": false,
76
+ "suite": [
77
+ "custom"
78
+ ],
79
+ "original_num_docs": 35,
80
+ "effective_num_docs": 35,
81
+ "trust_dataset": null,
82
+ "must_remove_duplicate_docs": null
83
+ },
84
+ "custom|mini_math_v2_mmos:level_2": {
85
+ "name": "mini_math_v2_mmos:level_2",
86
+ "prompt_function": "minimath_mmos_prompt_fn",
87
+ "hf_repo": "AI-MO/lighteval-mini-math",
88
+ "hf_subset": "Level 2",
89
+ "metric": [
90
+ "quasi_exact_match_code_and_math"
91
+ ],
92
+ "hf_avail_splits": [
93
+ "train",
94
+ "test"
95
+ ],
96
+ "evaluation_splits": [
97
+ "test"
98
+ ],
99
+ "few_shots_split": null,
100
+ "few_shots_select": null,
101
+ "generation_size": 2048,
102
+ "stop_sequence": null,
103
+ "output_regex": null,
104
+ "frozen": false,
105
+ "suite": [
106
+ "custom"
107
+ ],
108
+ "original_num_docs": 72,
109
+ "effective_num_docs": 72,
110
+ "trust_dataset": null,
111
+ "must_remove_duplicate_docs": null
112
+ },
113
+ "custom|mini_math_v2_mmos:level_3": {
114
+ "name": "mini_math_v2_mmos:level_3",
115
+ "prompt_function": "minimath_mmos_prompt_fn",
116
+ "hf_repo": "AI-MO/lighteval-mini-math",
117
+ "hf_subset": "Level 3",
118
+ "metric": [
119
+ "quasi_exact_match_code_and_math"
120
+ ],
121
+ "hf_avail_splits": [
122
+ "train",
123
+ "test"
124
+ ],
125
+ "evaluation_splits": [
126
+ "test"
127
+ ],
128
+ "few_shots_split": null,
129
+ "few_shots_select": null,
130
+ "generation_size": 2048,
131
+ "stop_sequence": null,
132
+ "output_regex": null,
133
+ "frozen": false,
134
+ "suite": [
135
+ "custom"
136
+ ],
137
+ "original_num_docs": 90,
138
+ "effective_num_docs": 90,
139
+ "trust_dataset": null,
140
+ "must_remove_duplicate_docs": null
141
+ },
142
+ "custom|mini_math_v2_mmos:level_4": {
143
+ "name": "mini_math_v2_mmos:level_4",
144
+ "prompt_function": "minimath_mmos_prompt_fn",
145
+ "hf_repo": "AI-MO/lighteval-mini-math",
146
+ "hf_subset": "Level 4",
147
+ "metric": [
148
+ "quasi_exact_match_code_and_math"
149
+ ],
150
+ "hf_avail_splits": [
151
+ "train",
152
+ "test"
153
+ ],
154
+ "evaluation_splits": [
155
+ "test"
156
+ ],
157
+ "few_shots_split": null,
158
+ "few_shots_select": null,
159
+ "generation_size": 2048,
160
+ "stop_sequence": null,
161
+ "output_regex": null,
162
+ "frozen": false,
163
+ "suite": [
164
+ "custom"
165
+ ],
166
+ "original_num_docs": 97,
167
+ "effective_num_docs": 97,
168
+ "trust_dataset": null,
169
+ "must_remove_duplicate_docs": null
170
+ },
171
+ "custom|mini_math_v2_mmos:level_5": {
172
+ "name": "mini_math_v2_mmos:level_5",
173
+ "prompt_function": "minimath_mmos_prompt_fn",
174
+ "hf_repo": "AI-MO/lighteval-mini-math",
175
+ "hf_subset": "Level 5",
176
+ "metric": [
177
+ "quasi_exact_match_code_and_math"
178
+ ],
179
+ "hf_avail_splits": [
180
+ "train",
181
+ "test"
182
+ ],
183
+ "evaluation_splits": [
184
+ "test"
185
+ ],
186
+ "few_shots_split": null,
187
+ "few_shots_select": null,
188
+ "generation_size": 2048,
189
+ "stop_sequence": null,
190
+ "output_regex": null,
191
+ "frozen": false,
192
+ "suite": [
193
+ "custom"
194
+ ],
195
+ "original_num_docs": 106,
196
+ "effective_num_docs": 106,
197
+ "trust_dataset": null,
198
+ "must_remove_duplicate_docs": null
199
+ }
200
+ },
201
+ "summary_tasks": {
202
+ "custom|mini_math_v2_mmos:level_1|0": {
203
+ "hashes": {
204
+ "hash_examples": "ad7e0d89fb7b0664",
205
+ "hash_full_prompts": "09389803d825fdf3",
206
+ "hash_input_tokens": "417bc9ee4fcab798",
207
+ "hash_cont_tokens": "924f11e36d2d52e6"
208
+ },
209
+ "truncated": 35,
210
+ "non_truncated": 0,
211
+ "padded": 24,
212
+ "non_padded": 11,
213
+ "effective_few_shots": 0.0,
214
+ "num_truncated_few_shots": 0
215
+ },
216
+ "custom|mini_math_v2_mmos:level_2|0": {
217
+ "hashes": {
218
+ "hash_examples": "493b7e3130e3a50d",
219
+ "hash_full_prompts": "f5f9ef766a19cf57",
220
+ "hash_input_tokens": "3fafd8c7323f3bb4",
221
+ "hash_cont_tokens": "dc917ee160b06099"
222
+ },
223
+ "truncated": 72,
224
+ "non_truncated": 0,
225
+ "padded": 37,
226
+ "non_padded": 35,
227
+ "effective_few_shots": 0.0,
228
+ "num_truncated_few_shots": 0
229
+ },
230
+ "custom|mini_math_v2_mmos:level_3|0": {
231
+ "hashes": {
232
+ "hash_examples": "11edfc7fc00756f4",
233
+ "hash_full_prompts": "716fbe1386202336",
234
+ "hash_input_tokens": "f5c3171f680a2e9a",
235
+ "hash_cont_tokens": "d950f271a9dac29e"
236
+ },
237
+ "truncated": 90,
238
+ "non_truncated": 0,
239
+ "padded": 37,
240
+ "non_padded": 53,
241
+ "effective_few_shots": 0.0,
242
+ "num_truncated_few_shots": 0
243
+ },
244
+ "custom|mini_math_v2_mmos:level_4|0": {
245
+ "hashes": {
246
+ "hash_examples": "a901e1669616ae50",
247
+ "hash_full_prompts": "32cb1558a580e38f",
248
+ "hash_input_tokens": "6d5f32ab97f76f8b",
249
+ "hash_cont_tokens": "9fb8c44e8dde9220"
250
+ },
251
+ "truncated": 96,
252
+ "non_truncated": 1,
253
+ "padded": 34,
254
+ "non_padded": 63,
255
+ "effective_few_shots": 0.0,
256
+ "num_truncated_few_shots": 0
257
+ },
258
+ "custom|mini_math_v2_mmos:level_5|0": {
259
+ "hashes": {
260
+ "hash_examples": "1d024f2e2410736e",
261
+ "hash_full_prompts": "4b1345f7301730ff",
262
+ "hash_input_tokens": "58a467e662a3f415",
263
+ "hash_cont_tokens": "442ec86fe8e15137"
264
+ },
265
+ "truncated": 105,
266
+ "non_truncated": 1,
267
+ "padded": 27,
268
+ "non_padded": 79,
269
+ "effective_few_shots": 0.0,
270
+ "num_truncated_few_shots": 0
271
+ }
272
+ },
273
+ "summary_general": {
274
+ "hashes": {
275
+ "hash_examples": "06e45ee0bae45a44",
276
+ "hash_full_prompts": "7c1d25a76def20ed",
277
+ "hash_input_tokens": "7e79c0c0764c2c82",
278
+ "hash_cont_tokens": "7df756419881b020"
279
+ },
280
+ "truncated": 398,
281
+ "non_truncated": 2,
282
+ "padded": 159,
283
+ "non_padded": 241,
284
+ "num_truncated_few_shots": 0
285
+ }
286
+ }