edbeeching HF staff commited on
Commit
89cdb9f
·
verified ·
1 Parent(s): 610eab9

Upload eval_results/AI-MO/mistral-7b-sft/aimo_v08.00/mini_math_v2_mmos/results_2024-05-17T06-11-53.628058.json with huggingface_hub

Browse files
eval_results/AI-MO/mistral-7b-sft/aimo_v08.00/mini_math_v2_mmos/results_2024-05-17T06-11-53.628058.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 211334.077495201,
9
+ "end_time": 212332.683644103,
10
+ "total_evaluation_time_secondes": "998.6061489020067",
11
+ "model_name": "AI-MO/mistral-7b-sft",
12
+ "model_sha": "e16b9db898ea77c88a6cdb2278949786b4c3defb",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|mini_math_v2_mmos:level_1|0": {
19
+ "qem": 0.6857142857142857,
20
+ "qem_stderr": 0.07961491954505552
21
+ },
22
+ "custom|mini_math_v2_mmos:level_2|0": {
23
+ "qem": 0.3472222222222222,
24
+ "qem_stderr": 0.05650114676852965
25
+ },
26
+ "custom|mini_math_v2_mmos:level_3|0": {
27
+ "qem": 0.43333333333333335,
28
+ "qem_stderr": 0.052526671187288064
29
+ },
30
+ "custom|mini_math_v2_mmos:level_4|0": {
31
+ "qem": 0.24742268041237114,
32
+ "qem_stderr": 0.044041256419162546
33
+ },
34
+ "custom|mini_math_v2_mmos:level_5|0": {
35
+ "qem": 0.14150943396226415,
36
+ "qem_stderr": 0.03401463467418858
37
+ },
38
+ "custom|mini_math_v2_mmos:_average|0": {
39
+ "qem": 0.3710403911288953,
40
+ "qem_stderr": 0.05333972571884486
41
+ },
42
+ "all": {
43
+ "qem": 0.3710403911288953,
44
+ "qem_stderr": 0.05333972571884486
45
+ }
46
+ },
47
+ "versions": {
48
+ "custom|mini_math_v2_mmos:level_1|0": 0,
49
+ "custom|mini_math_v2_mmos:level_2|0": 0,
50
+ "custom|mini_math_v2_mmos:level_3|0": 0,
51
+ "custom|mini_math_v2_mmos:level_4|0": 0,
52
+ "custom|mini_math_v2_mmos:level_5|0": 0
53
+ },
54
+ "config_tasks": {
55
+ "custom|mini_math_v2_mmos:level_1": {
56
+ "name": "mini_math_v2_mmos:level_1",
57
+ "prompt_function": "minimath_mmos_prompt_fn",
58
+ "hf_repo": "AI-MO/lighteval-mini-math",
59
+ "hf_subset": "Level 1",
60
+ "metric": [
61
+ "quasi_exact_match_code_and_math"
62
+ ],
63
+ "hf_avail_splits": [
64
+ "train",
65
+ "test"
66
+ ],
67
+ "evaluation_splits": [
68
+ "test"
69
+ ],
70
+ "few_shots_split": null,
71
+ "few_shots_select": null,
72
+ "generation_size": 2048,
73
+ "stop_sequence": null,
74
+ "output_regex": null,
75
+ "frozen": false,
76
+ "suite": [
77
+ "custom"
78
+ ],
79
+ "original_num_docs": 35,
80
+ "effective_num_docs": 35,
81
+ "trust_dataset": null,
82
+ "must_remove_duplicate_docs": null
83
+ },
84
+ "custom|mini_math_v2_mmos:level_2": {
85
+ "name": "mini_math_v2_mmos:level_2",
86
+ "prompt_function": "minimath_mmos_prompt_fn",
87
+ "hf_repo": "AI-MO/lighteval-mini-math",
88
+ "hf_subset": "Level 2",
89
+ "metric": [
90
+ "quasi_exact_match_code_and_math"
91
+ ],
92
+ "hf_avail_splits": [
93
+ "train",
94
+ "test"
95
+ ],
96
+ "evaluation_splits": [
97
+ "test"
98
+ ],
99
+ "few_shots_split": null,
100
+ "few_shots_select": null,
101
+ "generation_size": 2048,
102
+ "stop_sequence": null,
103
+ "output_regex": null,
104
+ "frozen": false,
105
+ "suite": [
106
+ "custom"
107
+ ],
108
+ "original_num_docs": 72,
109
+ "effective_num_docs": 72,
110
+ "trust_dataset": null,
111
+ "must_remove_duplicate_docs": null
112
+ },
113
+ "custom|mini_math_v2_mmos:level_3": {
114
+ "name": "mini_math_v2_mmos:level_3",
115
+ "prompt_function": "minimath_mmos_prompt_fn",
116
+ "hf_repo": "AI-MO/lighteval-mini-math",
117
+ "hf_subset": "Level 3",
118
+ "metric": [
119
+ "quasi_exact_match_code_and_math"
120
+ ],
121
+ "hf_avail_splits": [
122
+ "train",
123
+ "test"
124
+ ],
125
+ "evaluation_splits": [
126
+ "test"
127
+ ],
128
+ "few_shots_split": null,
129
+ "few_shots_select": null,
130
+ "generation_size": 2048,
131
+ "stop_sequence": null,
132
+ "output_regex": null,
133
+ "frozen": false,
134
+ "suite": [
135
+ "custom"
136
+ ],
137
+ "original_num_docs": 90,
138
+ "effective_num_docs": 90,
139
+ "trust_dataset": null,
140
+ "must_remove_duplicate_docs": null
141
+ },
142
+ "custom|mini_math_v2_mmos:level_4": {
143
+ "name": "mini_math_v2_mmos:level_4",
144
+ "prompt_function": "minimath_mmos_prompt_fn",
145
+ "hf_repo": "AI-MO/lighteval-mini-math",
146
+ "hf_subset": "Level 4",
147
+ "metric": [
148
+ "quasi_exact_match_code_and_math"
149
+ ],
150
+ "hf_avail_splits": [
151
+ "train",
152
+ "test"
153
+ ],
154
+ "evaluation_splits": [
155
+ "test"
156
+ ],
157
+ "few_shots_split": null,
158
+ "few_shots_select": null,
159
+ "generation_size": 2048,
160
+ "stop_sequence": null,
161
+ "output_regex": null,
162
+ "frozen": false,
163
+ "suite": [
164
+ "custom"
165
+ ],
166
+ "original_num_docs": 97,
167
+ "effective_num_docs": 97,
168
+ "trust_dataset": null,
169
+ "must_remove_duplicate_docs": null
170
+ },
171
+ "custom|mini_math_v2_mmos:level_5": {
172
+ "name": "mini_math_v2_mmos:level_5",
173
+ "prompt_function": "minimath_mmos_prompt_fn",
174
+ "hf_repo": "AI-MO/lighteval-mini-math",
175
+ "hf_subset": "Level 5",
176
+ "metric": [
177
+ "quasi_exact_match_code_and_math"
178
+ ],
179
+ "hf_avail_splits": [
180
+ "train",
181
+ "test"
182
+ ],
183
+ "evaluation_splits": [
184
+ "test"
185
+ ],
186
+ "few_shots_split": null,
187
+ "few_shots_select": null,
188
+ "generation_size": 2048,
189
+ "stop_sequence": null,
190
+ "output_regex": null,
191
+ "frozen": false,
192
+ "suite": [
193
+ "custom"
194
+ ],
195
+ "original_num_docs": 106,
196
+ "effective_num_docs": 106,
197
+ "trust_dataset": null,
198
+ "must_remove_duplicate_docs": null
199
+ }
200
+ },
201
+ "summary_tasks": {
202
+ "custom|mini_math_v2_mmos:level_1|0": {
203
+ "hashes": {
204
+ "hash_examples": "ad7e0d89fb7b0664",
205
+ "hash_full_prompts": "8317d6f66ac80683",
206
+ "hash_input_tokens": "3a0704b99081deee",
207
+ "hash_cont_tokens": "c016b5c805cf3e1e"
208
+ },
209
+ "truncated": 35,
210
+ "non_truncated": 0,
211
+ "padded": 23,
212
+ "non_padded": 12,
213
+ "effective_few_shots": 0.0,
214
+ "num_truncated_few_shots": 0
215
+ },
216
+ "custom|mini_math_v2_mmos:level_2|0": {
217
+ "hashes": {
218
+ "hash_examples": "493b7e3130e3a50d",
219
+ "hash_full_prompts": "3d6ff6c491597924",
220
+ "hash_input_tokens": "9a5750d79caed641",
221
+ "hash_cont_tokens": "eca2243e572e3630"
222
+ },
223
+ "truncated": 71,
224
+ "non_truncated": 1,
225
+ "padded": 38,
226
+ "non_padded": 34,
227
+ "effective_few_shots": 0.0,
228
+ "num_truncated_few_shots": 0
229
+ },
230
+ "custom|mini_math_v2_mmos:level_3|0": {
231
+ "hashes": {
232
+ "hash_examples": "11edfc7fc00756f4",
233
+ "hash_full_prompts": "21e5d973c3ee6b21",
234
+ "hash_input_tokens": "96f4e2aeba444c39",
235
+ "hash_cont_tokens": "31ec7a5e1cfe7f78"
236
+ },
237
+ "truncated": 90,
238
+ "non_truncated": 0,
239
+ "padded": 33,
240
+ "non_padded": 57,
241
+ "effective_few_shots": 0.0,
242
+ "num_truncated_few_shots": 0
243
+ },
244
+ "custom|mini_math_v2_mmos:level_4|0": {
245
+ "hashes": {
246
+ "hash_examples": "a901e1669616ae50",
247
+ "hash_full_prompts": "83a6822c2b45515c",
248
+ "hash_input_tokens": "004886079238776b",
249
+ "hash_cont_tokens": "781047a2ef29f578"
250
+ },
251
+ "truncated": 97,
252
+ "non_truncated": 0,
253
+ "padded": 38,
254
+ "non_padded": 59,
255
+ "effective_few_shots": 0.0,
256
+ "num_truncated_few_shots": 0
257
+ },
258
+ "custom|mini_math_v2_mmos:level_5|0": {
259
+ "hashes": {
260
+ "hash_examples": "1d024f2e2410736e",
261
+ "hash_full_prompts": "9054791972fe1d46",
262
+ "hash_input_tokens": "5610b48b0ca7c713",
263
+ "hash_cont_tokens": "20c5b1e4200ff174"
264
+ },
265
+ "truncated": 105,
266
+ "non_truncated": 1,
267
+ "padded": 32,
268
+ "non_padded": 74,
269
+ "effective_few_shots": 0.0,
270
+ "num_truncated_few_shots": 0
271
+ }
272
+ },
273
+ "summary_general": {
274
+ "hashes": {
275
+ "hash_examples": "06e45ee0bae45a44",
276
+ "hash_full_prompts": "8d590d8ff727493b",
277
+ "hash_input_tokens": "a1a8458c66b2080d",
278
+ "hash_cont_tokens": "7d23abf04f9015a5"
279
+ },
280
+ "truncated": 398,
281
+ "non_truncated": 2,
282
+ "padded": 164,
283
+ "non_padded": 236,
284
+ "num_truncated_few_shots": 0
285
+ }
286
+ }