edbeeching HF Staff committed on
Commit
ea3a1b8
·
verified ·
1 Parent(s): e3ba967

Upload eval_results/Qwen/Qwen2.5-Math-7B-Instruct/main/mini_math/results_2025-01-23T15-02-00.811603.json with huggingface_hub

Browse files
eval_results/Qwen/Qwen2.5-Math-7B-Instruct/main/mini_math/results_2025-01-23T15-02-00.811603.json ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 389746.081182226,
9
+ "end_time": 389875.173432725,
10
+ "total_evaluation_time_secondes": "129.09225049900124",
11
+ "model_name": "Qwen/Qwen2.5-Math-7B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|mini_math:level_1|0": {
18
+ "extractive_match": 0.8571428571428571,
19
+ "extractive_match_stderr": 0.06001200360120039
20
+ },
21
+ "custom|mini_math:level_2|0": {
22
+ "extractive_match": 0.6527777777777778,
23
+ "extractive_match_stderr": 0.05650114676852965
24
+ },
25
+ "custom|mini_math:level_3|0": {
26
+ "extractive_match": 0.6888888888888889,
27
+ "extractive_match_stderr": 0.049072405533864084
28
+ },
29
+ "custom|mini_math:level_4|0": {
30
+ "extractive_match": 0.5463917525773195,
31
+ "extractive_match_stderr": 0.05081090406399051
32
+ },
33
+ "custom|mini_math:level_5|0": {
34
+ "extractive_match": 0.330188679245283,
35
+ "extractive_match_stderr": 0.045894715469579954
36
+ },
37
+ "custom|mini_math:_average|0": {
38
+ "extractive_match": 0.6150779911264251,
39
+ "extractive_match_stderr": 0.052458235087432914
40
+ },
41
+ "all": {
42
+ "extractive_match": 0.6150779911264251,
43
+ "extractive_match_stderr": 0.052458235087432914
44
+ }
45
+ },
46
+ "versions": {
47
+ "custom|mini_math:level_1|0": 0,
48
+ "custom|mini_math:level_2|0": 0,
49
+ "custom|mini_math:level_3|0": 0,
50
+ "custom|mini_math:level_4|0": 0,
51
+ "custom|mini_math:level_5|0": 0
52
+ },
53
+ "config_tasks": {
54
+ "custom|mini_math:level_1": {
55
+ "name": "mini_math:level_1",
56
+ "prompt_function": "math_prompt_fn",
57
+ "hf_repo": "AI-MO/lighteval-mini-math",
58
+ "hf_subset": "Level 1",
59
+ "metric": [
60
+ {
61
+ "metric_name": "extractive_match",
62
+ "higher_is_better": true,
63
+ "category": "3",
64
+ "use_case": "1",
65
+ "sample_level_fn": "sample_level_fn",
66
+ "corpus_level_fn": "mean"
67
+ }
68
+ ],
69
+ "hf_revision": null,
70
+ "hf_filter": null,
71
+ "hf_avail_splits": [
72
+ "train",
73
+ "test"
74
+ ],
75
+ "trust_dataset": false,
76
+ "evaluation_splits": [
77
+ "test"
78
+ ],
79
+ "few_shots_split": null,
80
+ "few_shots_select": null,
81
+ "generation_size": 2048,
82
+ "generation_grammar": null,
83
+ "stop_sequence": [],
84
+ "num_samples": null,
85
+ "suite": [
86
+ "custom"
87
+ ],
88
+ "original_num_docs": 35,
89
+ "effective_num_docs": 35,
90
+ "must_remove_duplicate_docs": false,
91
+ "version": 0
92
+ },
93
+ "custom|mini_math:level_2": {
94
+ "name": "mini_math:level_2",
95
+ "prompt_function": "math_prompt_fn",
96
+ "hf_repo": "AI-MO/lighteval-mini-math",
97
+ "hf_subset": "Level 2",
98
+ "metric": [
99
+ {
100
+ "metric_name": "extractive_match",
101
+ "higher_is_better": true,
102
+ "category": "3",
103
+ "use_case": "1",
104
+ "sample_level_fn": "sample_level_fn",
105
+ "corpus_level_fn": "mean"
106
+ }
107
+ ],
108
+ "hf_revision": null,
109
+ "hf_filter": null,
110
+ "hf_avail_splits": [
111
+ "train",
112
+ "test"
113
+ ],
114
+ "trust_dataset": false,
115
+ "evaluation_splits": [
116
+ "test"
117
+ ],
118
+ "few_shots_split": null,
119
+ "few_shots_select": null,
120
+ "generation_size": 2048,
121
+ "generation_grammar": null,
122
+ "stop_sequence": [],
123
+ "num_samples": null,
124
+ "suite": [
125
+ "custom"
126
+ ],
127
+ "original_num_docs": 72,
128
+ "effective_num_docs": 72,
129
+ "must_remove_duplicate_docs": false,
130
+ "version": 0
131
+ },
132
+ "custom|mini_math:level_3": {
133
+ "name": "mini_math:level_3",
134
+ "prompt_function": "math_prompt_fn",
135
+ "hf_repo": "AI-MO/lighteval-mini-math",
136
+ "hf_subset": "Level 3",
137
+ "metric": [
138
+ {
139
+ "metric_name": "extractive_match",
140
+ "higher_is_better": true,
141
+ "category": "3",
142
+ "use_case": "1",
143
+ "sample_level_fn": "sample_level_fn",
144
+ "corpus_level_fn": "mean"
145
+ }
146
+ ],
147
+ "hf_revision": null,
148
+ "hf_filter": null,
149
+ "hf_avail_splits": [
150
+ "train",
151
+ "test"
152
+ ],
153
+ "trust_dataset": false,
154
+ "evaluation_splits": [
155
+ "test"
156
+ ],
157
+ "few_shots_split": null,
158
+ "few_shots_select": null,
159
+ "generation_size": 2048,
160
+ "generation_grammar": null,
161
+ "stop_sequence": [],
162
+ "num_samples": null,
163
+ "suite": [
164
+ "custom"
165
+ ],
166
+ "original_num_docs": 90,
167
+ "effective_num_docs": 90,
168
+ "must_remove_duplicate_docs": false,
169
+ "version": 0
170
+ },
171
+ "custom|mini_math:level_4": {
172
+ "name": "mini_math:level_4",
173
+ "prompt_function": "math_prompt_fn",
174
+ "hf_repo": "AI-MO/lighteval-mini-math",
175
+ "hf_subset": "Level 4",
176
+ "metric": [
177
+ {
178
+ "metric_name": "extractive_match",
179
+ "higher_is_better": true,
180
+ "category": "3",
181
+ "use_case": "1",
182
+ "sample_level_fn": "sample_level_fn",
183
+ "corpus_level_fn": "mean"
184
+ }
185
+ ],
186
+ "hf_revision": null,
187
+ "hf_filter": null,
188
+ "hf_avail_splits": [
189
+ "train",
190
+ "test"
191
+ ],
192
+ "trust_dataset": false,
193
+ "evaluation_splits": [
194
+ "test"
195
+ ],
196
+ "few_shots_split": null,
197
+ "few_shots_select": null,
198
+ "generation_size": 2048,
199
+ "generation_grammar": null,
200
+ "stop_sequence": [],
201
+ "num_samples": null,
202
+ "suite": [
203
+ "custom"
204
+ ],
205
+ "original_num_docs": 97,
206
+ "effective_num_docs": 97,
207
+ "must_remove_duplicate_docs": false,
208
+ "version": 0
209
+ },
210
+ "custom|mini_math:level_5": {
211
+ "name": "mini_math:level_5",
212
+ "prompt_function": "math_prompt_fn",
213
+ "hf_repo": "AI-MO/lighteval-mini-math",
214
+ "hf_subset": "Level 5",
215
+ "metric": [
216
+ {
217
+ "metric_name": "extractive_match",
218
+ "higher_is_better": true,
219
+ "category": "3",
220
+ "use_case": "1",
221
+ "sample_level_fn": "sample_level_fn",
222
+ "corpus_level_fn": "mean"
223
+ }
224
+ ],
225
+ "hf_revision": null,
226
+ "hf_filter": null,
227
+ "hf_avail_splits": [
228
+ "train",
229
+ "test"
230
+ ],
231
+ "trust_dataset": false,
232
+ "evaluation_splits": [
233
+ "test"
234
+ ],
235
+ "few_shots_split": null,
236
+ "few_shots_select": null,
237
+ "generation_size": 2048,
238
+ "generation_grammar": null,
239
+ "stop_sequence": [],
240
+ "num_samples": null,
241
+ "suite": [
242
+ "custom"
243
+ ],
244
+ "original_num_docs": 106,
245
+ "effective_num_docs": 106,
246
+ "must_remove_duplicate_docs": false,
247
+ "version": 0
248
+ }
249
+ },
250
+ "summary_tasks": {
251
+ "custom|mini_math:level_1|0": {
252
+ "hashes": {
253
+ "hash_examples": "ad7e0d89fb7b0664",
254
+ "hash_full_prompts": "b80536c069eb4029",
255
+ "hash_input_tokens": "b7427696025a0367",
256
+ "hash_cont_tokens": "c299cf9e4b97d0c0"
257
+ },
258
+ "truncated": 0,
259
+ "non_truncated": 35,
260
+ "padded": 0,
261
+ "non_padded": 35,
262
+ "effective_few_shots": 0.0,
263
+ "num_truncated_few_shots": 0
264
+ },
265
+ "custom|mini_math:level_2|0": {
266
+ "hashes": {
267
+ "hash_examples": "493b7e3130e3a50d",
268
+ "hash_full_prompts": "46446bf68057d29e",
269
+ "hash_input_tokens": "0e7973761858265b",
270
+ "hash_cont_tokens": "aa155389ff5e9e3e"
271
+ },
272
+ "truncated": 0,
273
+ "non_truncated": 72,
274
+ "padded": 0,
275
+ "non_padded": 72,
276
+ "effective_few_shots": 0.0,
277
+ "num_truncated_few_shots": 0
278
+ },
279
+ "custom|mini_math:level_3|0": {
280
+ "hashes": {
281
+ "hash_examples": "11edfc7fc00756f4",
282
+ "hash_full_prompts": "dfffd64b4ba2e549",
283
+ "hash_input_tokens": "0f6dd65ec1cae39f",
284
+ "hash_cont_tokens": "60b282155e01fead"
285
+ },
286
+ "truncated": 0,
287
+ "non_truncated": 90,
288
+ "padded": 0,
289
+ "non_padded": 90,
290
+ "effective_few_shots": 0.0,
291
+ "num_truncated_few_shots": 0
292
+ },
293
+ "custom|mini_math:level_4|0": {
294
+ "hashes": {
295
+ "hash_examples": "a901e1669616ae50",
296
+ "hash_full_prompts": "2e1f3a5e7d370bb0",
297
+ "hash_input_tokens": "f3b8792222ecd28e",
298
+ "hash_cont_tokens": "8121eb5755ea0d39"
299
+ },
300
+ "truncated": 0,
301
+ "non_truncated": 97,
302
+ "padded": 0,
303
+ "non_padded": 97,
304
+ "effective_few_shots": 0.0,
305
+ "num_truncated_few_shots": 0
306
+ },
307
+ "custom|mini_math:level_5|0": {
308
+ "hashes": {
309
+ "hash_examples": "1d024f2e2410736e",
310
+ "hash_full_prompts": "386a3d414eaee221",
311
+ "hash_input_tokens": "09a513b7947e58d6",
312
+ "hash_cont_tokens": "0543220ba1936de3"
313
+ },
314
+ "truncated": 0,
315
+ "non_truncated": 106,
316
+ "padded": 0,
317
+ "non_padded": 106,
318
+ "effective_few_shots": 0.0,
319
+ "num_truncated_few_shots": 0
320
+ }
321
+ },
322
+ "summary_general": {
323
+ "hashes": {
324
+ "hash_examples": "06e45ee0bae45a44",
325
+ "hash_full_prompts": "f12515b05c0a349f",
326
+ "hash_input_tokens": "9b84bf8c84139e4f",
327
+ "hash_cont_tokens": "2aa56fa802e33fa4"
328
+ },
329
+ "truncated": 0,
330
+ "non_truncated": 400,
331
+ "padded": 0,
332
+ "non_padded": 400,
333
+ "num_truncated_few_shots": 0
334
+ }
335
+ }