edbeeching HF Staff commited on
Commit
dbcafd3
·
verified ·
1 Parent(s): 19016f5

Delete eval_results/deepseek-ai

Browse files
Files changed (31) hide show
  1. eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/aime24/results_2025-02-06T17-20-54.254090.json +0 -98
  2. eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/gpqa/results_2025-02-06T17-22-38.528696.json +0 -98
  3. eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/math_500/results_2025-01-29T16-38-54.088382.json +0 -98
  4. eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/math_500/results_2025-02-06T17-28-17.933149.json +0 -98
  5. eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/aime24/results_2025-02-06T17-01-03.311411.json +0 -98
  6. eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/gpqa/results_2025-02-06T17-00-37.294536.json +0 -98
  7. eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/math_500/results_2025-01-29T16-19-05.697532.json +0 -98
  8. eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/math_500/results_2025-02-06T17-02-13.445609.json +0 -98
  9. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/aime24/results_2025-02-06T15-18-58.986325.json +0 -98
  10. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/aime24/results_2025-02-06T16-27-35.319682.json +0 -98
  11. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-33-18.290562.json +0 -98
  12. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-42-59.056415.json +0 -98
  13. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-52-00.573631.json +0 -98
  14. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T16-16-01.453401.json +0 -98
  15. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-05T08-47-02.738326.json +0 -98
  16. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T09-04-53.650542.json +0 -98
  17. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T09-19-17.273929.json +0 -98
  18. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T15-16-44.132377.json +0 -98
  19. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T15-35-19.804114.json +0 -98
  20. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/aime24/results_2025-02-06T16-51-54.015026.json +0 -98
  21. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/gpqa/results_2025-02-06T16-54-34.705796.json +0 -98
  22. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/math_500/results_2025-01-29T16-21-19.161811.json +0 -98
  23. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/math_500/results_2025-02-06T16-56-34.467531.json +0 -98
  24. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/aime24/results_2025-02-06T17-12-46.800739.json +0 -98
  25. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/gpqa/results_2025-02-06T17-41-45.634038.json +0 -98
  26. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/math_500/results_2025-01-29T16-35-05.004956.json +0 -98
  27. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/math_500/results_2025-02-06T17-44-13.823355.json +0 -98
  28. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/aime24/results_2025-02-06T16-04-06.233392.json +0 -98
  29. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/gpqa/results_2025-02-06T16-44-25.806464.json +0 -98
  30. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/math_500/results_2025-01-29T16-17-35.586793.json +0 -98
  31. eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/math_500/results_2025-02-06T16-16-56.008098.json +0 -98
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/aime24/results_2025-02-06T17-20-54.254090.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 154157.020215178,
9
- "end_time": 155704.725477899,
10
- "total_evaluation_time_secondes": "1547.7052627209923",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime24|0": {
18
- "extractive_match": 0.6666666666666666,
19
- "extractive_match_stderr": 0.08753762190648169
20
- },
21
- "all": {
22
- "extractive_match": 0.6666666666666666,
23
- "extractive_match_stderr": 0.08753762190648169
24
- }
25
- },
26
- "versions": {
27
- "custom|aime24|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime24": {
31
- "name": "aime24",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "HuggingFaceH4/aime_2024",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 30,
64
- "effective_num_docs": 30,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime24|0": {
71
- "hashes": {
72
- "hash_examples": "18ca0099f8d8f826",
73
- "hash_full_prompts": "d34905fb622c50aa",
74
- "hash_input_tokens": "7e717febea55e885",
75
- "hash_cont_tokens": "c126e156aa1075ea"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 30,
79
- "padded": 0,
80
- "non_padded": 30,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "c4769936f28d3d77",
88
- "hash_full_prompts": "da635cdfbf36e078",
89
- "hash_input_tokens": "b8b436300cb70c68",
90
- "hash_cont_tokens": "7434fad4a1282c88"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 30,
94
- "padded": 0,
95
- "non_padded": 30,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/gpqa/results_2025-02-06T17-22-38.528696.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 106057.431232029,
9
- "end_time": 107560.738359603,
10
- "total_evaluation_time_secondes": "1503.307127573993",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|gpqa:diamond|0": {
18
- "extractive_match": 0.6212121212121212,
19
- "extractive_match_stderr": 0.03456088731993747
20
- },
21
- "all": {
22
- "extractive_match": 0.6212121212121212,
23
- "extractive_match_stderr": 0.03456088731993747
24
- }
25
- },
26
- "versions": {
27
- "custom|gpqa:diamond|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|gpqa:diamond": {
31
- "name": "gpqa:diamond",
32
- "prompt_function": "gpqa_prompt_fn",
33
- "hf_repo": "Idavidrein/gpqa",
34
- "hf_subset": "gpqa_diamond",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": true,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 198,
64
- "effective_num_docs": 198,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|gpqa:diamond|0": {
71
- "hashes": {
72
- "hash_examples": "50ecb6f5d091bd95",
73
- "hash_full_prompts": "4d6bc2c8e64a03b8",
74
- "hash_input_tokens": "480ff14cf78ff54a",
75
- "hash_cont_tokens": "610f4c06f7b1213b"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 198,
79
- "padded": 0,
80
- "non_padded": 198,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "a9318dbdd867770b",
88
- "hash_full_prompts": "d8f2b1ad973f6d42",
89
- "hash_input_tokens": "c46bd83c1fb24788",
90
- "hash_cont_tokens": "cccc39eed903d13a"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 198,
94
- "padded": 0,
95
- "non_padded": 198,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/math_500/results_2025-01-29T16-38-54.088382.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 440109.713616279,
9
- "end_time": 441777.149476983,
10
- "total_evaluation_time_secondes": "1667.4358607039903",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.908,
19
- "extractive_match_stderr": 0.012938578501027575
20
- },
21
- "all": {
22
- "extractive_match": 0.908,
23
- "extractive_match_stderr": 0.012938578501027575
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "664892a030c023a0",
74
- "hash_input_tokens": "f9582e585a627833",
75
- "hash_cont_tokens": "dafab44ee1c37be0"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "aafa6c8f5b7270a6",
89
- "hash_input_tokens": "4fae6476bfea7e35",
90
- "hash_cont_tokens": "d28b0d99ac9375a4"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/math_500/results_2025-02-06T17-28-17.933149.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 166330.804576162,
9
- "end_time": 168189.007509852,
10
- "total_evaluation_time_secondes": "1858.202933690016",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.94,
19
- "extractive_match_stderr": 0.010631371130019282
20
- },
21
- "all": {
22
- "extractive_match": 0.94,
23
- "extractive_match_stderr": 0.010631371130019282
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "a6dd0b2c8017a31e",
75
- "hash_cont_tokens": "092de069fd11183c"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "11a6a8141b926588",
90
- "hash_cont_tokens": "f30470cc4782fe00"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/aime24/results_2025-02-06T17-01-03.311411.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 155444.418658458,
9
- "end_time": 155894.410067112,
10
- "total_evaluation_time_secondes": "449.99140865402296",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime24|0": {
18
- "extractive_match": 0.3333333333333333,
19
- "extractive_match_stderr": 0.08753762190648168
20
- },
21
- "all": {
22
- "extractive_match": 0.3333333333333333,
23
- "extractive_match_stderr": 0.08753762190648168
24
- }
25
- },
26
- "versions": {
27
- "custom|aime24|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime24": {
31
- "name": "aime24",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "HuggingFaceH4/aime_2024",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 30,
64
- "effective_num_docs": 30,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime24|0": {
71
- "hashes": {
72
- "hash_examples": "18ca0099f8d8f826",
73
- "hash_full_prompts": "d34905fb622c50aa",
74
- "hash_input_tokens": "7e717febea55e885",
75
- "hash_cont_tokens": "41eb7da6051abc52"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 30,
79
- "padded": 0,
80
- "non_padded": 30,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "c4769936f28d3d77",
88
- "hash_full_prompts": "da635cdfbf36e078",
89
- "hash_input_tokens": "b8b436300cb70c68",
90
- "hash_cont_tokens": "861658322c546034"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 30,
94
- "padded": 0,
95
- "non_padded": 30,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/gpqa/results_2025-02-06T17-00-37.294536.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 158199.305823159,
9
- "end_time": 158588.293025704,
10
- "total_evaluation_time_secondes": "388.98720254501677",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|gpqa:diamond|0": {
18
- "extractive_match": 0.5,
19
- "extractive_match_stderr": 0.035623524993954825
20
- },
21
- "all": {
22
- "extractive_match": 0.5,
23
- "extractive_match_stderr": 0.035623524993954825
24
- }
25
- },
26
- "versions": {
27
- "custom|gpqa:diamond|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|gpqa:diamond": {
31
- "name": "gpqa:diamond",
32
- "prompt_function": "gpqa_prompt_fn",
33
- "hf_repo": "Idavidrein/gpqa",
34
- "hf_subset": "gpqa_diamond",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": true,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 198,
64
- "effective_num_docs": 198,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|gpqa:diamond|0": {
71
- "hashes": {
72
- "hash_examples": "9f6f23223e6fa498",
73
- "hash_full_prompts": "d5b99fbdea4fb7bc",
74
- "hash_input_tokens": "82232a555cec2ca0",
75
- "hash_cont_tokens": "2f82c3fadbcee31e"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 198,
79
- "padded": 0,
80
- "non_padded": 198,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "1f5eb58b451df729",
88
- "hash_full_prompts": "d1bb01e81a8c1dea",
89
- "hash_input_tokens": "5e4de8f905acdfcd",
90
- "hash_cont_tokens": "7dc8f50295485a73"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 198,
94
- "padded": 0,
95
- "non_padded": 198,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/math_500/results_2025-01-29T16-19-05.697532.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 542365.918012772,
9
- "end_time": 542868.227046596,
10
- "total_evaluation_time_secondes": "502.3090338240145",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.788,
19
- "extractive_match_stderr": 0.01829703700401389
20
- },
21
- "all": {
22
- "extractive_match": 0.788,
23
- "extractive_match_stderr": 0.01829703700401389
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "664892a030c023a0",
74
- "hash_input_tokens": "f9582e585a627833",
75
- "hash_cont_tokens": "552f19a0af51a46c"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "aafa6c8f5b7270a6",
89
- "hash_input_tokens": "4fae6476bfea7e35",
90
- "hash_cont_tokens": "ac49751184643ef5"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/math_500/results_2025-02-06T17-02-13.445609.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 1218070.082840348,
9
- "end_time": 1218564.13089538,
10
- "total_evaluation_time_secondes": "494.04805503203534",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.854,
19
- "extractive_match_stderr": 0.01580720517583485
20
- },
21
- "all": {
22
- "extractive_match": 0.854,
23
- "extractive_match_stderr": 0.01580720517583485
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "a6dd0b2c8017a31e",
75
- "hash_cont_tokens": "73a683e89547befe"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "11a6a8141b926588",
90
- "hash_cont_tokens": "87ef792246644d83"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/aime24/results_2025-02-06T15-18-58.986325.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 83434.21699032,
9
- "end_time": 83991.449675551,
10
- "total_evaluation_time_secondes": "557.232685231007",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime24|0": {
18
- "extractive_match": 0.3,
19
- "extractive_match_stderr": 0.0850962943396763
20
- },
21
- "all": {
22
- "extractive_match": 0.3,
23
- "extractive_match_stderr": 0.0850962943396763
24
- }
25
- },
26
- "versions": {
27
- "custom|aime24|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime24": {
31
- "name": "aime24",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "HuggingFaceH4/aime_2024",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 30,
64
- "effective_num_docs": 30,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime24|0": {
71
- "hashes": {
72
- "hash_examples": "18ca0099f8d8f826",
73
- "hash_full_prompts": "d34905fb622c50aa",
74
- "hash_input_tokens": "6d1b89ed573bfa89",
75
- "hash_cont_tokens": "892270bbdf1ba4ca"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 30,
79
- "padded": 0,
80
- "non_padded": 30,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "c4769936f28d3d77",
88
- "hash_full_prompts": "da635cdfbf36e078",
89
- "hash_input_tokens": "a41b3c52a63d1650",
90
- "hash_cont_tokens": "566402dfda2de898"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 30,
94
- "padded": 0,
95
- "non_padded": 30,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/aime24/results_2025-02-06T16-27-35.319682.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 87759.005419085,
9
- "end_time": 88107.786310245,
10
- "total_evaluation_time_secondes": "348.78089116000046",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime24|0": {
18
- "extractive_match": 0.3,
19
- "extractive_match_stderr": 0.0850962943396763
20
- },
21
- "all": {
22
- "extractive_match": 0.3,
23
- "extractive_match_stderr": 0.0850962943396763
24
- }
25
- },
26
- "versions": {
27
- "custom|aime24|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime24": {
31
- "name": "aime24",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "HuggingFaceH4/aime_2024",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 30,
64
- "effective_num_docs": 30,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime24|0": {
71
- "hashes": {
72
- "hash_examples": "18ca0099f8d8f826",
73
- "hash_full_prompts": "d34905fb622c50aa",
74
- "hash_input_tokens": "6d1b89ed573bfa89",
75
- "hash_cont_tokens": "892270bbdf1ba4ca"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 30,
79
- "padded": 0,
80
- "non_padded": 30,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "c4769936f28d3d77",
88
- "hash_full_prompts": "da635cdfbf36e078",
89
- "hash_input_tokens": "a41b3c52a63d1650",
90
- "hash_cont_tokens": "566402dfda2de898"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 30,
94
- "padded": 0,
95
- "non_padded": 30,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-33-18.290562.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 93420.632074006,
9
- "end_time": 93855.525530633,
10
- "total_evaluation_time_secondes": "434.89345662698906",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.824,
19
- "extractive_match_stderr": 0.017047852020622277
20
- },
21
- "all": {
22
- "extractive_match": 0.824,
23
- "extractive_match_stderr": 0.017047852020622277
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "7f42cd9c5af6adb3"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "32a94e5b93071c8a"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-42-59.056415.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 94015.204090474,
9
- "end_time": 94436.296266973,
10
- "total_evaluation_time_secondes": "421.0921764990053",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.824,
19
- "extractive_match_stderr": 0.017047852020622277
20
- },
21
- "all": {
22
- "extractive_match": 0.824,
23
- "extractive_match_stderr": 0.017047852020622277
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "7f42cd9c5af6adb3"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "32a94e5b93071c8a"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-52-00.573631.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 94556.149152107,
9
- "end_time": 94977.81332013,
10
- "total_evaluation_time_secondes": "421.664168022995",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.824,
19
- "extractive_match_stderr": 0.017047852020622277
20
- },
21
- "all": {
22
- "extractive_match": 0.824,
23
- "extractive_match_stderr": 0.017047852020622277
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "7f42cd9c5af6adb3"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "32a94e5b93071c8a"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T16-16-01.453401.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 589153.215620642,
9
- "end_time": 589556.287669233,
10
- "total_evaluation_time_secondes": "403.0720485911006",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.79,
19
- "extractive_match_stderr": 0.01823362086530592
20
- },
21
- "all": {
22
- "extractive_match": 0.79,
23
- "extractive_match_stderr": 0.01823362086530592
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "664892a030c023a0",
74
- "hash_input_tokens": "fa8894639fd8d026",
75
- "hash_cont_tokens": "d89c381da3b42bbe"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "aafa6c8f5b7270a6",
89
- "hash_input_tokens": "ef34990598320c6d",
90
- "hash_cont_tokens": "dcbdc909abc24613"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-05T08-47-02.738326.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 35770.98460946,
9
- "end_time": 36190.406735991,
10
- "total_evaluation_time_secondes": "419.42212653099705",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.824,
19
- "extractive_match_stderr": 0.017047852020622277
20
- },
21
- "all": {
22
- "extractive_match": 0.824,
23
- "extractive_match_stderr": 0.017047852020622277
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "7f42cd9c5af6adb3"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "32a94e5b93071c8a"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T09-04-53.650542.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 125973.485551623,
9
- "end_time": 126412.032679635,
10
- "total_evaluation_time_secondes": "438.5471280119964",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.754,
19
- "extractive_match_stderr": 0.019279819056352555
20
- },
21
- "all": {
22
- "extractive_match": 0.754,
23
- "extractive_match_stderr": 0.019279819056352555
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "7f42cd9c5af6adb3"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "32a94e5b93071c8a"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T09-19-17.273929.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 126833.938882632,
9
- "end_time": 127275.659424463,
10
- "total_evaluation_time_secondes": "441.72054183098953",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.75,
19
- "extractive_match_stderr": 0.019384310743640384
20
- },
21
- "all": {
22
- "extractive_match": 0.75,
23
- "extractive_match_stderr": 0.019384310743640384
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "7f42cd9c5af6adb3"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "32a94e5b93071c8a"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T15-16-44.132377.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 149630.524893698,
9
- "end_time": 149980.890803378,
10
- "total_evaluation_time_secondes": "350.36590967999655",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.818,
19
- "extractive_match_stderr": 0.017272773297730446
20
- },
21
- "all": {
22
- "extractive_match": 0.818,
23
- "extractive_match_stderr": 0.017272773297730446
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "1574449fe1e92cc1"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "ae13515204ae68f8"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T15-35-19.804114.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 154079.212744644,
9
- "end_time": 154436.986131958,
10
- "total_evaluation_time_secondes": "357.77338731399504",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.818,
19
- "extractive_match_stderr": 0.017272773297730446
20
- },
21
- "all": {
22
- "extractive_match": 0.818,
23
- "extractive_match_stderr": 0.017272773297730446
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "1574449fe1e92cc1"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "ae13515204ae68f8"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/aime24/results_2025-02-06T16-51-54.015026.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 278520.656131555,
9
- "end_time": 279339.265554163,
10
- "total_evaluation_time_secondes": "818.6094226080459",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime24|0": {
18
- "extractive_match": 0.5666666666666667,
19
- "extractive_match_stderr": 0.0920186554465537
20
- },
21
- "all": {
22
- "extractive_match": 0.5666666666666667,
23
- "extractive_match_stderr": 0.0920186554465537
24
- }
25
- },
26
- "versions": {
27
- "custom|aime24|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime24": {
31
- "name": "aime24",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "HuggingFaceH4/aime_2024",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 30,
64
- "effective_num_docs": 30,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime24|0": {
71
- "hashes": {
72
- "hash_examples": "18ca0099f8d8f826",
73
- "hash_full_prompts": "d34905fb622c50aa",
74
- "hash_input_tokens": "6d1b89ed573bfa89",
75
- "hash_cont_tokens": "f599f918b6aad43a"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 30,
79
- "padded": 0,
80
- "non_padded": 30,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "c4769936f28d3d77",
88
- "hash_full_prompts": "da635cdfbf36e078",
89
- "hash_input_tokens": "a41b3c52a63d1650",
90
- "hash_cont_tokens": "d9df7a1759a2bafb"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 30,
94
- "padded": 0,
95
- "non_padded": 30,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/gpqa/results_2025-02-06T16-54-34.705796.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 153607.482863218,
9
- "end_time": 154125.175074251,
10
- "total_evaluation_time_secondes": "517.6922110329906",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|gpqa:diamond|0": {
18
- "extractive_match": 0.5808080808080808,
19
- "extractive_match_stderr": 0.035155207286704175
20
- },
21
- "all": {
22
- "extractive_match": 0.5808080808080808,
23
- "extractive_match_stderr": 0.035155207286704175
24
- }
25
- },
26
- "versions": {
27
- "custom|gpqa:diamond|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|gpqa:diamond": {
31
- "name": "gpqa:diamond",
32
- "prompt_function": "gpqa_prompt_fn",
33
- "hf_repo": "Idavidrein/gpqa",
34
- "hf_subset": "gpqa_diamond",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": true,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 198,
64
- "effective_num_docs": 198,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|gpqa:diamond|0": {
71
- "hashes": {
72
- "hash_examples": "f368a0154dc4c902",
73
- "hash_full_prompts": "9327ccdd77ef50bf",
74
- "hash_input_tokens": "9cda8e7ee83e820f",
75
- "hash_cont_tokens": "8b4c8fd5af6bd759"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 198,
79
- "padded": 0,
80
- "non_padded": 198,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "8fd6d3e7ffb1ce33",
88
- "hash_full_prompts": "d9165bcf5a7b5ccc",
89
- "hash_input_tokens": "f4863da0ce0df94f",
90
- "hash_cont_tokens": "df3c6744a5e75b31"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 198,
94
- "padded": 0,
95
- "non_padded": 198,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/math_500/results_2025-01-29T16-21-19.161811.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 439917.061219354,
9
- "end_time": 440589.579187485,
10
- "total_evaluation_time_secondes": "672.5179681309965",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.9,
19
- "extractive_match_stderr": 0.013429844431075358
20
- },
21
- "all": {
22
- "extractive_match": 0.9,
23
- "extractive_match_stderr": 0.013429844431075358
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "664892a030c023a0",
74
- "hash_input_tokens": "fa8894639fd8d026",
75
- "hash_cont_tokens": "d11a879759a1fc75"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "aafa6c8f5b7270a6",
89
- "hash_input_tokens": "ef34990598320c6d",
90
- "hash_cont_tokens": "8fbc5ab4b89d7194"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/math_500/results_2025-02-06T16-56-34.467531.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 165587.423013672,
9
- "end_time": 166285.558426051,
10
- "total_evaluation_time_secondes": "698.1354123789934",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.954,
19
- "extractive_match_stderr": 0.009377840251121327
20
- },
21
- "all": {
22
- "extractive_match": 0.954,
23
- "extractive_match_stderr": 0.009377840251121327
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "43ae83b3ac2bd54a"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "9f227f154fe291b3"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/aime24/results_2025-02-06T17-12-46.800739.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 279377.267561971,
9
- "end_time": 280592.055722844,
10
- "total_evaluation_time_secondes": "1214.788160872995",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime24|0": {
18
- "extractive_match": 0.5666666666666667,
19
- "extractive_match_stderr": 0.0920186554465537
20
- },
21
- "all": {
22
- "extractive_match": 0.5666666666666667,
23
- "extractive_match_stderr": 0.0920186554465537
24
- }
25
- },
26
- "versions": {
27
- "custom|aime24|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime24": {
31
- "name": "aime24",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "HuggingFaceH4/aime_2024",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 30,
64
- "effective_num_docs": 30,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime24|0": {
71
- "hashes": {
72
- "hash_examples": "18ca0099f8d8f826",
73
- "hash_full_prompts": "d34905fb622c50aa",
74
- "hash_input_tokens": "6d1b89ed573bfa89",
75
- "hash_cont_tokens": "f6f3f20780f098e5"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 30,
79
- "padded": 0,
80
- "non_padded": 30,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "c4769936f28d3d77",
88
- "hash_full_prompts": "da635cdfbf36e078",
89
- "hash_input_tokens": "a41b3c52a63d1650",
90
- "hash_cont_tokens": "f64066263741b27b"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 30,
94
- "padded": 0,
95
- "non_padded": 30,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/gpqa/results_2025-02-06T17-41-45.634038.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 1219766.333155578,
9
- "end_time": 1220936.310749698,
10
- "total_evaluation_time_secondes": "1169.9775941199623",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|gpqa:diamond|0": {
18
- "extractive_match": 0.6313131313131313,
19
- "extractive_match_stderr": 0.03437305501980619
20
- },
21
- "all": {
22
- "extractive_match": 0.6313131313131313,
23
- "extractive_match_stderr": 0.03437305501980619
24
- }
25
- },
26
- "versions": {
27
- "custom|gpqa:diamond|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|gpqa:diamond": {
31
- "name": "gpqa:diamond",
32
- "prompt_function": "gpqa_prompt_fn",
33
- "hf_repo": "Idavidrein/gpqa",
34
- "hf_subset": "gpqa_diamond",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": true,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 198,
64
- "effective_num_docs": 198,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|gpqa:diamond|0": {
71
- "hashes": {
72
- "hash_examples": "50ecb6f5d091bd95",
73
- "hash_full_prompts": "4d6bc2c8e64a03b8",
74
- "hash_input_tokens": "7f4457760a5d7f38",
75
- "hash_cont_tokens": "bb662f415cf87cf1"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 198,
79
- "padded": 0,
80
- "non_padded": 198,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "a9318dbdd867770b",
88
- "hash_full_prompts": "d8f2b1ad973f6d42",
89
- "hash_input_tokens": "1edd6765f01885b1",
90
- "hash_cont_tokens": "675f31a7f0f4d133"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 198,
94
- "padded": 0,
95
- "non_padded": 198,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/math_500/results_2025-01-29T16-35-05.004956.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 1210708.081163628,
9
- "end_time": 1212186.095578638,
10
- "total_evaluation_time_secondes": "1478.0144150098786",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.91,
19
- "extractive_match_stderr": 0.012811255071733802
20
- },
21
- "all": {
22
- "extractive_match": 0.91,
23
- "extractive_match_stderr": 0.012811255071733802
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "664892a030c023a0",
74
- "hash_input_tokens": "fa8894639fd8d026",
75
- "hash_cont_tokens": "da05b695d3c9cac1"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "aafa6c8f5b7270a6",
89
- "hash_input_tokens": "ef34990598320c6d",
90
- "hash_cont_tokens": "c745900d0266ea36"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/math_500/results_2025-02-06T17-44-13.823355.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 281198.233560346,
9
- "end_time": 282479.072700203,
10
- "total_evaluation_time_secondes": "1280.8391398569802",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.956,
19
- "extractive_match_stderr": 0.00918131761711647
20
- },
21
- "all": {
22
- "extractive_match": 0.956,
23
- "extractive_match_stderr": 0.00918131761711647
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "550b3c9eec8abcb8"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "d1bb3eb720ee911c"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/aime24/results_2025-02-06T16-04-06.233392.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 86256.833747225,
9
- "end_time": 86698.700447643,
10
- "total_evaluation_time_secondes": "441.86670041800244",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime24|0": {
18
- "extractive_match": 0.43333333333333335,
19
- "extractive_match_stderr": 0.0920186554465537
20
- },
21
- "all": {
22
- "extractive_match": 0.43333333333333335,
23
- "extractive_match_stderr": 0.0920186554465537
24
- }
25
- },
26
- "versions": {
27
- "custom|aime24|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime24": {
31
- "name": "aime24",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "HuggingFaceH4/aime_2024",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 30,
64
- "effective_num_docs": 30,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime24|0": {
71
- "hashes": {
72
- "hash_examples": "18ca0099f8d8f826",
73
- "hash_full_prompts": "d34905fb622c50aa",
74
- "hash_input_tokens": "6d1b89ed573bfa89",
75
- "hash_cont_tokens": "6587c677409d3d9f"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 30,
79
- "padded": 0,
80
- "non_padded": 30,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "c4769936f28d3d77",
88
- "hash_full_prompts": "da635cdfbf36e078",
89
- "hash_input_tokens": "a41b3c52a63d1650",
90
- "hash_cont_tokens": "c741de609419edb2"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 30,
94
- "padded": 0,
95
- "non_padded": 30,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/gpqa/results_2025-02-06T16-44-25.806464.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 88599.975613896,
9
- "end_time": 89118.272384086,
10
- "total_evaluation_time_secondes": "518.2967701900052",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|gpqa:diamond|0": {
18
- "extractive_match": 0.5151515151515151,
19
- "extractive_match_stderr": 0.035607165165310595
20
- },
21
- "all": {
22
- "extractive_match": 0.5151515151515151,
23
- "extractive_match_stderr": 0.035607165165310595
24
- }
25
- },
26
- "versions": {
27
- "custom|gpqa:diamond|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|gpqa:diamond": {
31
- "name": "gpqa:diamond",
32
- "prompt_function": "gpqa_prompt_fn",
33
- "hf_repo": "Idavidrein/gpqa",
34
- "hf_subset": "gpqa_diamond",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": true,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 198,
64
- "effective_num_docs": 198,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|gpqa:diamond|0": {
71
- "hashes": {
72
- "hash_examples": "af663a4591d96f5d",
73
- "hash_full_prompts": "b0fa5864c08e0781",
74
- "hash_input_tokens": "276e7f8541d9d416",
75
- "hash_cont_tokens": "1da7cc15c0ea6367"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 198,
79
- "padded": 0,
80
- "non_padded": 198,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "5b1b43c43d7fe08d",
88
- "hash_full_prompts": "30924f5d1c8c1b0b",
89
- "hash_input_tokens": "1f51bdd0b08dad14",
90
- "hash_cont_tokens": "bd9da4fcb416e9c4"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 198,
94
- "padded": 0,
95
- "non_padded": 198,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/math_500/results_2025-01-29T16-17-35.586793.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 589160.248337717,
9
- "end_time": 589650.422460219,
10
- "total_evaluation_time_secondes": "490.1741225019796",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.896,
19
- "extractive_match_stderr": 0.013665338743182685
20
- },
21
- "all": {
22
- "extractive_match": 0.896,
23
- "extractive_match_stderr": 0.013665338743182685
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "664892a030c023a0",
74
- "hash_input_tokens": "fa8894639fd8d026",
75
- "hash_cont_tokens": "521319f820154128"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "aafa6c8f5b7270a6",
89
- "hash_input_tokens": "ef34990598320c6d",
90
- "hash_cont_tokens": "1dfa5a3bfd356b15"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/math_500/results_2025-02-06T16-16-56.008098.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 86736.527102462,
9
- "end_time": 87468.473149333,
10
- "total_evaluation_time_secondes": "731.9460468710022",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|math_500|0": {
18
- "extractive_match": 0.916,
19
- "extractive_match_stderr": 0.012417584015643694
20
- },
21
- "all": {
22
- "extractive_match": 0.916,
23
- "extractive_match_stderr": 0.012417584015643694
24
- }
25
- },
26
- "versions": {
27
- "custom|math_500|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|math_500": {
31
- "name": "math_500",
32
- "prompt_function": "prompt_fn",
33
- "hf_repo": "HuggingFaceH4/MATH-500",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "test"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "test"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 500,
64
- "effective_num_docs": 500,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|math_500|0": {
71
- "hashes": {
72
- "hash_examples": "eac05bd67b8179c3",
73
- "hash_full_prompts": "9043592f69431f18",
74
- "hash_input_tokens": "c5aa3a61e16cb62b",
75
- "hash_cont_tokens": "52394364e3300d65"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 500,
79
- "padded": 0,
80
- "non_padded": 500,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "d6b3f24200421bb2",
88
- "hash_full_prompts": "1a4bc197befd9b91",
89
- "hash_input_tokens": "b703f1639cd56c2a",
90
- "hash_cont_tokens": "7801dd51980363b8"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 500,
94
- "padded": 0,
95
- "non_padded": 500,
96
- "num_truncated_few_shots": 0
97
- }
98
- }