kashif HF staff committed on
Commit
8836bbc
·
verified ·
1 Parent(s): caae418

Upload eval_results/AI-MO/Eurus-7b-sft/aimo_v01.00/mini_math_v2_cot/results_2024-04-29T13-29-15.050137.json with huggingface_hub

Browse files
eval_results/AI-MO/Eurus-7b-sft/aimo_v01.00/mini_math_v2_cot/results_2024-04-29T13-29-15.050137.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 391563.467611792,
9
+ "end_time": 392527.973125356,
10
+ "total_evaluation_time_secondes": "964.5055135640432",
11
+ "model_name": "AI-MO/Eurus-7b-sft",
12
+ "model_sha": "c85aa9511e4c7a9f5f061d5d8e2016536482f006",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|mini_math_v2_cot:level_1|0": {
19
+ "qem": 0.34285714285714286,
20
+ "qem_stderr": 0.08140424227436863
21
+ },
22
+ "custom|mini_math_v2_cot:level_2|0": {
23
+ "qem": 0.3194444444444444,
24
+ "qem_stderr": 0.05533504751887217
25
+ },
26
+ "custom|mini_math_v2_cot:level_3|0": {
27
+ "qem": 0.16666666666666666,
28
+ "qem_stderr": 0.03950378859499813
29
+ },
30
+ "custom|mini_math_v2_cot:level_4|0": {
31
+ "qem": 0.10309278350515463,
32
+ "qem_stderr": 0.031035037574206968
33
+ },
34
+ "custom|mini_math_v2_cot:level_5|0": {
35
+ "qem": 0.03773584905660377,
36
+ "qem_stderr": 0.01859642944545588
37
+ },
38
+ "custom|mini_math_v2_cot:_average|0": {
39
+ "qem": 0.19395937730600246,
40
+ "qem_stderr": 0.04517490908158036
41
+ },
42
+ "all": {
43
+ "qem": 0.19395937730600246,
44
+ "qem_stderr": 0.04517490908158036
45
+ }
46
+ },
47
+ "versions": {
48
+ "custom|mini_math_v2_cot:level_1|0": 0,
49
+ "custom|mini_math_v2_cot:level_2|0": 0,
50
+ "custom|mini_math_v2_cot:level_3|0": 0,
51
+ "custom|mini_math_v2_cot:level_4|0": 0,
52
+ "custom|mini_math_v2_cot:level_5|0": 0
53
+ },
54
+ "config_tasks": {
55
+ "custom|mini_math_v2_cot:level_1": {
56
+ "name": "mini_math_v2_cot:level_1",
57
+ "prompt_function": "minimath_prompt_fn",
58
+ "hf_repo": "AI-MO/lighteval-mini-math",
59
+ "hf_subset": "Level 1",
60
+ "metric": [
61
+ "quasi_exact_match_math"
62
+ ],
63
+ "hf_avail_splits": [
64
+ "train",
65
+ "test"
66
+ ],
67
+ "evaluation_splits": [
68
+ "test"
69
+ ],
70
+ "few_shots_split": null,
71
+ "few_shots_select": null,
72
+ "generation_size": 2048,
73
+ "stop_sequence": null,
74
+ "output_regex": null,
75
+ "frozen": false,
76
+ "suite": [
77
+ "custom"
78
+ ],
79
+ "original_num_docs": 35,
80
+ "effective_num_docs": 35,
81
+ "trust_dataset": null,
82
+ "must_remove_duplicate_docs": null
83
+ },
84
+ "custom|mini_math_v2_cot:level_2": {
85
+ "name": "mini_math_v2_cot:level_2",
86
+ "prompt_function": "minimath_prompt_fn",
87
+ "hf_repo": "AI-MO/lighteval-mini-math",
88
+ "hf_subset": "Level 2",
89
+ "metric": [
90
+ "quasi_exact_match_math"
91
+ ],
92
+ "hf_avail_splits": [
93
+ "train",
94
+ "test"
95
+ ],
96
+ "evaluation_splits": [
97
+ "test"
98
+ ],
99
+ "few_shots_split": null,
100
+ "few_shots_select": null,
101
+ "generation_size": 2048,
102
+ "stop_sequence": null,
103
+ "output_regex": null,
104
+ "frozen": false,
105
+ "suite": [
106
+ "custom"
107
+ ],
108
+ "original_num_docs": 72,
109
+ "effective_num_docs": 72,
110
+ "trust_dataset": null,
111
+ "must_remove_duplicate_docs": null
112
+ },
113
+ "custom|mini_math_v2_cot:level_3": {
114
+ "name": "mini_math_v2_cot:level_3",
115
+ "prompt_function": "minimath_prompt_fn",
116
+ "hf_repo": "AI-MO/lighteval-mini-math",
117
+ "hf_subset": "Level 3",
118
+ "metric": [
119
+ "quasi_exact_match_math"
120
+ ],
121
+ "hf_avail_splits": [
122
+ "train",
123
+ "test"
124
+ ],
125
+ "evaluation_splits": [
126
+ "test"
127
+ ],
128
+ "few_shots_split": null,
129
+ "few_shots_select": null,
130
+ "generation_size": 2048,
131
+ "stop_sequence": null,
132
+ "output_regex": null,
133
+ "frozen": false,
134
+ "suite": [
135
+ "custom"
136
+ ],
137
+ "original_num_docs": 90,
138
+ "effective_num_docs": 90,
139
+ "trust_dataset": null,
140
+ "must_remove_duplicate_docs": null
141
+ },
142
+ "custom|mini_math_v2_cot:level_4": {
143
+ "name": "mini_math_v2_cot:level_4",
144
+ "prompt_function": "minimath_prompt_fn",
145
+ "hf_repo": "AI-MO/lighteval-mini-math",
146
+ "hf_subset": "Level 4",
147
+ "metric": [
148
+ "quasi_exact_match_math"
149
+ ],
150
+ "hf_avail_splits": [
151
+ "train",
152
+ "test"
153
+ ],
154
+ "evaluation_splits": [
155
+ "test"
156
+ ],
157
+ "few_shots_split": null,
158
+ "few_shots_select": null,
159
+ "generation_size": 2048,
160
+ "stop_sequence": null,
161
+ "output_regex": null,
162
+ "frozen": false,
163
+ "suite": [
164
+ "custom"
165
+ ],
166
+ "original_num_docs": 97,
167
+ "effective_num_docs": 97,
168
+ "trust_dataset": null,
169
+ "must_remove_duplicate_docs": null
170
+ },
171
+ "custom|mini_math_v2_cot:level_5": {
172
+ "name": "mini_math_v2_cot:level_5",
173
+ "prompt_function": "minimath_prompt_fn",
174
+ "hf_repo": "AI-MO/lighteval-mini-math",
175
+ "hf_subset": "Level 5",
176
+ "metric": [
177
+ "quasi_exact_match_math"
178
+ ],
179
+ "hf_avail_splits": [
180
+ "train",
181
+ "test"
182
+ ],
183
+ "evaluation_splits": [
184
+ "test"
185
+ ],
186
+ "few_shots_split": null,
187
+ "few_shots_select": null,
188
+ "generation_size": 2048,
189
+ "stop_sequence": null,
190
+ "output_regex": null,
191
+ "frozen": false,
192
+ "suite": [
193
+ "custom"
194
+ ],
195
+ "original_num_docs": 106,
196
+ "effective_num_docs": 106,
197
+ "trust_dataset": null,
198
+ "must_remove_duplicate_docs": null
199
+ }
200
+ },
201
+ "summary_tasks": {
202
+ "custom|mini_math_v2_cot:level_1|0": {
203
+ "hashes": {
204
+ "hash_examples": "6e46c83c3c04fd6a",
205
+ "hash_full_prompts": "f82f7f6570e0d1d7",
206
+ "hash_input_tokens": "c337b401eb02a335",
207
+ "hash_cont_tokens": "7b1537dff805f854"
208
+ },
209
+ "truncated": 35,
210
+ "non_truncated": 0,
211
+ "padded": 23,
212
+ "non_padded": 12,
213
+ "effective_few_shots": 0.0,
214
+ "num_truncated_few_shots": 0
215
+ },
216
+ "custom|mini_math_v2_cot:level_2|0": {
217
+ "hashes": {
218
+ "hash_examples": "889713509c9bbbc1",
219
+ "hash_full_prompts": "70d857ef2d839ba0",
220
+ "hash_input_tokens": "7a576f7464b4081a",
221
+ "hash_cont_tokens": "3d333a27a2a927c9"
222
+ },
223
+ "truncated": 71,
224
+ "non_truncated": 1,
225
+ "padded": 38,
226
+ "non_padded": 34,
227
+ "effective_few_shots": 0.0,
228
+ "num_truncated_few_shots": 0
229
+ },
230
+ "custom|mini_math_v2_cot:level_3|0": {
231
+ "hashes": {
232
+ "hash_examples": "325e5cc88f99c065",
233
+ "hash_full_prompts": "ac84f6c9c8b958ae",
234
+ "hash_input_tokens": "68ee20cb97b0e202",
235
+ "hash_cont_tokens": "8be45f212900c86c"
236
+ },
237
+ "truncated": 90,
238
+ "non_truncated": 0,
239
+ "padded": 33,
240
+ "non_padded": 57,
241
+ "effective_few_shots": 0.0,
242
+ "num_truncated_few_shots": 0
243
+ },
244
+ "custom|mini_math_v2_cot:level_4|0": {
245
+ "hashes": {
246
+ "hash_examples": "797712bab625f507",
247
+ "hash_full_prompts": "b76096d7d92e2296",
248
+ "hash_input_tokens": "7de858f872668166",
249
+ "hash_cont_tokens": "3fe29b7d95997781"
250
+ },
251
+ "truncated": 97,
252
+ "non_truncated": 0,
253
+ "padded": 38,
254
+ "non_padded": 59,
255
+ "effective_few_shots": 0.0,
256
+ "num_truncated_few_shots": 0
257
+ },
258
+ "custom|mini_math_v2_cot:level_5|0": {
259
+ "hashes": {
260
+ "hash_examples": "d263622047b94222",
261
+ "hash_full_prompts": "b38ec0550a8d3581",
262
+ "hash_input_tokens": "4eea4ab5411e66e7",
263
+ "hash_cont_tokens": "2cb3146d0080f788"
264
+ },
265
+ "truncated": 106,
266
+ "non_truncated": 0,
267
+ "padded": 32,
268
+ "non_padded": 74,
269
+ "effective_few_shots": 0.0,
270
+ "num_truncated_few_shots": 0
271
+ }
272
+ },
273
+ "summary_general": {
274
+ "hashes": {
275
+ "hash_examples": "3cf20f18ddb05dd8",
276
+ "hash_full_prompts": "b74e3580ef05e271",
277
+ "hash_input_tokens": "3f1e669b8185c08b",
278
+ "hash_cont_tokens": "ab1a8b9154fbc47c"
279
+ },
280
+ "truncated": 399,
281
+ "non_truncated": 1,
282
+ "padded": 164,
283
+ "non_padded": 236,
284
+ "num_truncated_few_shots": 0
285
+ }
286
+ }