Delete eval_results/deepseek-ai
Browse files- eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/aime24/results_2025-02-06T17-20-54.254090.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/gpqa/results_2025-02-06T17-22-38.528696.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/math_500/results_2025-01-29T16-38-54.088382.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/math_500/results_2025-02-06T17-28-17.933149.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/aime24/results_2025-02-06T17-01-03.311411.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/gpqa/results_2025-02-06T17-00-37.294536.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/math_500/results_2025-01-29T16-19-05.697532.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/math_500/results_2025-02-06T17-02-13.445609.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/aime24/results_2025-02-06T15-18-58.986325.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/aime24/results_2025-02-06T16-27-35.319682.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-33-18.290562.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-42-59.056415.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-52-00.573631.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T16-16-01.453401.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-05T08-47-02.738326.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T09-04-53.650542.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T09-19-17.273929.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T15-16-44.132377.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T15-35-19.804114.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/aime24/results_2025-02-06T16-51-54.015026.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/gpqa/results_2025-02-06T16-54-34.705796.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/math_500/results_2025-01-29T16-21-19.161811.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/math_500/results_2025-02-06T16-56-34.467531.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/aime24/results_2025-02-06T17-12-46.800739.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/gpqa/results_2025-02-06T17-41-45.634038.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/math_500/results_2025-01-29T16-35-05.004956.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/math_500/results_2025-02-06T17-44-13.823355.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/aime24/results_2025-02-06T16-04-06.233392.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/gpqa/results_2025-02-06T16-44-25.806464.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/math_500/results_2025-01-29T16-17-35.586793.json +0 -98
- eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/math_500/results_2025-02-06T16-16-56.008098.json +0 -98
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/aime24/results_2025-02-06T17-20-54.254090.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 154157.020215178,
|
9 |
-
"end_time": 155704.725477899,
|
10 |
-
"total_evaluation_time_secondes": "1547.7052627209923",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|aime24|0": {
|
18 |
-
"extractive_match": 0.6666666666666666,
|
19 |
-
"extractive_match_stderr": 0.08753762190648169
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.6666666666666666,
|
23 |
-
"extractive_match_stderr": 0.08753762190648169
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|aime24|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|aime24": {
|
31 |
-
"name": "aime24",
|
32 |
-
"prompt_function": "aime_prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 30,
|
64 |
-
"effective_num_docs": 30,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|aime24|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "18ca0099f8d8f826",
|
73 |
-
"hash_full_prompts": "d34905fb622c50aa",
|
74 |
-
"hash_input_tokens": "7e717febea55e885",
|
75 |
-
"hash_cont_tokens": "c126e156aa1075ea"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 30,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 30,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "c4769936f28d3d77",
|
88 |
-
"hash_full_prompts": "da635cdfbf36e078",
|
89 |
-
"hash_input_tokens": "b8b436300cb70c68",
|
90 |
-
"hash_cont_tokens": "7434fad4a1282c88"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 30,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 30,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/gpqa/results_2025-02-06T17-22-38.528696.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 106057.431232029,
|
9 |
-
"end_time": 107560.738359603,
|
10 |
-
"total_evaluation_time_secondes": "1503.307127573993",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|gpqa:diamond|0": {
|
18 |
-
"extractive_match": 0.6212121212121212,
|
19 |
-
"extractive_match_stderr": 0.03456088731993747
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.6212121212121212,
|
23 |
-
"extractive_match_stderr": 0.03456088731993747
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|gpqa:diamond|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|gpqa:diamond": {
|
31 |
-
"name": "gpqa:diamond",
|
32 |
-
"prompt_function": "gpqa_prompt_fn",
|
33 |
-
"hf_repo": "Idavidrein/gpqa",
|
34 |
-
"hf_subset": "gpqa_diamond",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": true,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 198,
|
64 |
-
"effective_num_docs": 198,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|gpqa:diamond|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "50ecb6f5d091bd95",
|
73 |
-
"hash_full_prompts": "4d6bc2c8e64a03b8",
|
74 |
-
"hash_input_tokens": "480ff14cf78ff54a",
|
75 |
-
"hash_cont_tokens": "610f4c06f7b1213b"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 198,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 198,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "a9318dbdd867770b",
|
88 |
-
"hash_full_prompts": "d8f2b1ad973f6d42",
|
89 |
-
"hash_input_tokens": "c46bd83c1fb24788",
|
90 |
-
"hash_cont_tokens": "cccc39eed903d13a"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 198,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 198,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/math_500/results_2025-01-29T16-38-54.088382.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 440109.713616279,
|
9 |
-
"end_time": 441777.149476983,
|
10 |
-
"total_evaluation_time_secondes": "1667.4358607039903",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.908,
|
19 |
-
"extractive_match_stderr": 0.012938578501027575
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.908,
|
23 |
-
"extractive_match_stderr": 0.012938578501027575
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "664892a030c023a0",
|
74 |
-
"hash_input_tokens": "f9582e585a627833",
|
75 |
-
"hash_cont_tokens": "dafab44ee1c37be0"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "aafa6c8f5b7270a6",
|
89 |
-
"hash_input_tokens": "4fae6476bfea7e35",
|
90 |
-
"hash_cont_tokens": "d28b0d99ac9375a4"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/math_500/results_2025-02-06T17-28-17.933149.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 166330.804576162,
|
9 |
-
"end_time": 168189.007509852,
|
10 |
-
"total_evaluation_time_secondes": "1858.202933690016",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.94,
|
19 |
-
"extractive_match_stderr": 0.010631371130019282
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.94,
|
23 |
-
"extractive_match_stderr": 0.010631371130019282
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "a6dd0b2c8017a31e",
|
75 |
-
"hash_cont_tokens": "092de069fd11183c"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "11a6a8141b926588",
|
90 |
-
"hash_cont_tokens": "f30470cc4782fe00"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/aime24/results_2025-02-06T17-01-03.311411.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 155444.418658458,
|
9 |
-
"end_time": 155894.410067112,
|
10 |
-
"total_evaluation_time_secondes": "449.99140865402296",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|aime24|0": {
|
18 |
-
"extractive_match": 0.3333333333333333,
|
19 |
-
"extractive_match_stderr": 0.08753762190648168
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.3333333333333333,
|
23 |
-
"extractive_match_stderr": 0.08753762190648168
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|aime24|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|aime24": {
|
31 |
-
"name": "aime24",
|
32 |
-
"prompt_function": "aime_prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 30,
|
64 |
-
"effective_num_docs": 30,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|aime24|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "18ca0099f8d8f826",
|
73 |
-
"hash_full_prompts": "d34905fb622c50aa",
|
74 |
-
"hash_input_tokens": "7e717febea55e885",
|
75 |
-
"hash_cont_tokens": "41eb7da6051abc52"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 30,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 30,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "c4769936f28d3d77",
|
88 |
-
"hash_full_prompts": "da635cdfbf36e078",
|
89 |
-
"hash_input_tokens": "b8b436300cb70c68",
|
90 |
-
"hash_cont_tokens": "861658322c546034"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 30,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 30,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/gpqa/results_2025-02-06T17-00-37.294536.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 158199.305823159,
|
9 |
-
"end_time": 158588.293025704,
|
10 |
-
"total_evaluation_time_secondes": "388.98720254501677",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|gpqa:diamond|0": {
|
18 |
-
"extractive_match": 0.5,
|
19 |
-
"extractive_match_stderr": 0.035623524993954825
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.5,
|
23 |
-
"extractive_match_stderr": 0.035623524993954825
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|gpqa:diamond|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|gpqa:diamond": {
|
31 |
-
"name": "gpqa:diamond",
|
32 |
-
"prompt_function": "gpqa_prompt_fn",
|
33 |
-
"hf_repo": "Idavidrein/gpqa",
|
34 |
-
"hf_subset": "gpqa_diamond",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": true,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 198,
|
64 |
-
"effective_num_docs": 198,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|gpqa:diamond|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "9f6f23223e6fa498",
|
73 |
-
"hash_full_prompts": "d5b99fbdea4fb7bc",
|
74 |
-
"hash_input_tokens": "82232a555cec2ca0",
|
75 |
-
"hash_cont_tokens": "2f82c3fadbcee31e"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 198,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 198,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "1f5eb58b451df729",
|
88 |
-
"hash_full_prompts": "d1bb01e81a8c1dea",
|
89 |
-
"hash_input_tokens": "5e4de8f905acdfcd",
|
90 |
-
"hash_cont_tokens": "7dc8f50295485a73"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 198,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 198,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/math_500/results_2025-01-29T16-19-05.697532.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 542365.918012772,
|
9 |
-
"end_time": 542868.227046596,
|
10 |
-
"total_evaluation_time_secondes": "502.3090338240145",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.788,
|
19 |
-
"extractive_match_stderr": 0.01829703700401389
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.788,
|
23 |
-
"extractive_match_stderr": 0.01829703700401389
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "664892a030c023a0",
|
74 |
-
"hash_input_tokens": "f9582e585a627833",
|
75 |
-
"hash_cont_tokens": "552f19a0af51a46c"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "aafa6c8f5b7270a6",
|
89 |
-
"hash_input_tokens": "4fae6476bfea7e35",
|
90 |
-
"hash_cont_tokens": "ac49751184643ef5"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/math_500/results_2025-02-06T17-02-13.445609.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 1218070.082840348,
|
9 |
-
"end_time": 1218564.13089538,
|
10 |
-
"total_evaluation_time_secondes": "494.04805503203534",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.854,
|
19 |
-
"extractive_match_stderr": 0.01580720517583485
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.854,
|
23 |
-
"extractive_match_stderr": 0.01580720517583485
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "a6dd0b2c8017a31e",
|
75 |
-
"hash_cont_tokens": "73a683e89547befe"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "11a6a8141b926588",
|
90 |
-
"hash_cont_tokens": "87ef792246644d83"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/aime24/results_2025-02-06T15-18-58.986325.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 83434.21699032,
|
9 |
-
"end_time": 83991.449675551,
|
10 |
-
"total_evaluation_time_secondes": "557.232685231007",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|aime24|0": {
|
18 |
-
"extractive_match": 0.3,
|
19 |
-
"extractive_match_stderr": 0.0850962943396763
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.3,
|
23 |
-
"extractive_match_stderr": 0.0850962943396763
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|aime24|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|aime24": {
|
31 |
-
"name": "aime24",
|
32 |
-
"prompt_function": "aime_prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 30,
|
64 |
-
"effective_num_docs": 30,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|aime24|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "18ca0099f8d8f826",
|
73 |
-
"hash_full_prompts": "d34905fb622c50aa",
|
74 |
-
"hash_input_tokens": "6d1b89ed573bfa89",
|
75 |
-
"hash_cont_tokens": "892270bbdf1ba4ca"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 30,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 30,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "c4769936f28d3d77",
|
88 |
-
"hash_full_prompts": "da635cdfbf36e078",
|
89 |
-
"hash_input_tokens": "a41b3c52a63d1650",
|
90 |
-
"hash_cont_tokens": "566402dfda2de898"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 30,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 30,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/aime24/results_2025-02-06T16-27-35.319682.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 87759.005419085,
|
9 |
-
"end_time": 88107.786310245,
|
10 |
-
"total_evaluation_time_secondes": "348.78089116000046",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|aime24|0": {
|
18 |
-
"extractive_match": 0.3,
|
19 |
-
"extractive_match_stderr": 0.0850962943396763
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.3,
|
23 |
-
"extractive_match_stderr": 0.0850962943396763
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|aime24|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|aime24": {
|
31 |
-
"name": "aime24",
|
32 |
-
"prompt_function": "aime_prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 30,
|
64 |
-
"effective_num_docs": 30,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|aime24|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "18ca0099f8d8f826",
|
73 |
-
"hash_full_prompts": "d34905fb622c50aa",
|
74 |
-
"hash_input_tokens": "6d1b89ed573bfa89",
|
75 |
-
"hash_cont_tokens": "892270bbdf1ba4ca"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 30,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 30,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "c4769936f28d3d77",
|
88 |
-
"hash_full_prompts": "da635cdfbf36e078",
|
89 |
-
"hash_input_tokens": "a41b3c52a63d1650",
|
90 |
-
"hash_cont_tokens": "566402dfda2de898"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 30,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 30,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-33-18.290562.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 93420.632074006,
|
9 |
-
"end_time": 93855.525530633,
|
10 |
-
"total_evaluation_time_secondes": "434.89345662698906",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.824,
|
19 |
-
"extractive_match_stderr": 0.017047852020622277
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.824,
|
23 |
-
"extractive_match_stderr": 0.017047852020622277
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "7f42cd9c5af6adb3"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "32a94e5b93071c8a"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-42-59.056415.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 94015.204090474,
|
9 |
-
"end_time": 94436.296266973,
|
10 |
-
"total_evaluation_time_secondes": "421.0921764990053",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.824,
|
19 |
-
"extractive_match_stderr": 0.017047852020622277
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.824,
|
23 |
-
"extractive_match_stderr": 0.017047852020622277
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "7f42cd9c5af6adb3"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "32a94e5b93071c8a"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T15-52-00.573631.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 94556.149152107,
|
9 |
-
"end_time": 94977.81332013,
|
10 |
-
"total_evaluation_time_secondes": "421.664168022995",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.824,
|
19 |
-
"extractive_match_stderr": 0.017047852020622277
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.824,
|
23 |
-
"extractive_match_stderr": 0.017047852020622277
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "7f42cd9c5af6adb3"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "32a94e5b93071c8a"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-01-29T16-16-01.453401.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 589153.215620642,
|
9 |
-
"end_time": 589556.287669233,
|
10 |
-
"total_evaluation_time_secondes": "403.0720485911006",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.79,
|
19 |
-
"extractive_match_stderr": 0.01823362086530592
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.79,
|
23 |
-
"extractive_match_stderr": 0.01823362086530592
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "664892a030c023a0",
|
74 |
-
"hash_input_tokens": "fa8894639fd8d026",
|
75 |
-
"hash_cont_tokens": "d89c381da3b42bbe"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "aafa6c8f5b7270a6",
|
89 |
-
"hash_input_tokens": "ef34990598320c6d",
|
90 |
-
"hash_cont_tokens": "dcbdc909abc24613"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-05T08-47-02.738326.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 35770.98460946,
|
9 |
-
"end_time": 36190.406735991,
|
10 |
-
"total_evaluation_time_secondes": "419.42212653099705",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.824,
|
19 |
-
"extractive_match_stderr": 0.017047852020622277
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.824,
|
23 |
-
"extractive_match_stderr": 0.017047852020622277
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "7f42cd9c5af6adb3"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "32a94e5b93071c8a"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T09-04-53.650542.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 125973.485551623,
|
9 |
-
"end_time": 126412.032679635,
|
10 |
-
"total_evaluation_time_secondes": "438.5471280119964",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.754,
|
19 |
-
"extractive_match_stderr": 0.019279819056352555
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.754,
|
23 |
-
"extractive_match_stderr": 0.019279819056352555
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "7f42cd9c5af6adb3"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "32a94e5b93071c8a"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T09-19-17.273929.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 126833.938882632,
|
9 |
-
"end_time": 127275.659424463,
|
10 |
-
"total_evaluation_time_secondes": "441.72054183098953",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.75,
|
19 |
-
"extractive_match_stderr": 0.019384310743640384
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.75,
|
23 |
-
"extractive_match_stderr": 0.019384310743640384
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "7f42cd9c5af6adb3"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "32a94e5b93071c8a"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T15-16-44.132377.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 149630.524893698,
|
9 |
-
"end_time": 149980.890803378,
|
10 |
-
"total_evaluation_time_secondes": "350.36590967999655",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.818,
|
19 |
-
"extractive_match_stderr": 0.017272773297730446
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.818,
|
23 |
-
"extractive_match_stderr": 0.017272773297730446
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "1574449fe1e92cc1"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "ae13515204ae68f8"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/math_500/results_2025-02-06T15-35-19.804114.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 154079.212744644,
|
9 |
-
"end_time": 154436.986131958,
|
10 |
-
"total_evaluation_time_secondes": "357.77338731399504",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.818,
|
19 |
-
"extractive_match_stderr": 0.017272773297730446
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.818,
|
23 |
-
"extractive_match_stderr": 0.017272773297730446
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "1574449fe1e92cc1"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "ae13515204ae68f8"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/aime24/results_2025-02-06T16-51-54.015026.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 278520.656131555,
|
9 |
-
"end_time": 279339.265554163,
|
10 |
-
"total_evaluation_time_secondes": "818.6094226080459",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|aime24|0": {
|
18 |
-
"extractive_match": 0.5666666666666667,
|
19 |
-
"extractive_match_stderr": 0.0920186554465537
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.5666666666666667,
|
23 |
-
"extractive_match_stderr": 0.0920186554465537
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|aime24|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|aime24": {
|
31 |
-
"name": "aime24",
|
32 |
-
"prompt_function": "aime_prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 30,
|
64 |
-
"effective_num_docs": 30,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|aime24|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "18ca0099f8d8f826",
|
73 |
-
"hash_full_prompts": "d34905fb622c50aa",
|
74 |
-
"hash_input_tokens": "6d1b89ed573bfa89",
|
75 |
-
"hash_cont_tokens": "f599f918b6aad43a"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 30,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 30,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "c4769936f28d3d77",
|
88 |
-
"hash_full_prompts": "da635cdfbf36e078",
|
89 |
-
"hash_input_tokens": "a41b3c52a63d1650",
|
90 |
-
"hash_cont_tokens": "d9df7a1759a2bafb"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 30,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 30,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/gpqa/results_2025-02-06T16-54-34.705796.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 153607.482863218,
|
9 |
-
"end_time": 154125.175074251,
|
10 |
-
"total_evaluation_time_secondes": "517.6922110329906",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|gpqa:diamond|0": {
|
18 |
-
"extractive_match": 0.5808080808080808,
|
19 |
-
"extractive_match_stderr": 0.035155207286704175
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.5808080808080808,
|
23 |
-
"extractive_match_stderr": 0.035155207286704175
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|gpqa:diamond|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|gpqa:diamond": {
|
31 |
-
"name": "gpqa:diamond",
|
32 |
-
"prompt_function": "gpqa_prompt_fn",
|
33 |
-
"hf_repo": "Idavidrein/gpqa",
|
34 |
-
"hf_subset": "gpqa_diamond",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": true,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 198,
|
64 |
-
"effective_num_docs": 198,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|gpqa:diamond|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "f368a0154dc4c902",
|
73 |
-
"hash_full_prompts": "9327ccdd77ef50bf",
|
74 |
-
"hash_input_tokens": "9cda8e7ee83e820f",
|
75 |
-
"hash_cont_tokens": "8b4c8fd5af6bd759"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 198,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 198,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "8fd6d3e7ffb1ce33",
|
88 |
-
"hash_full_prompts": "d9165bcf5a7b5ccc",
|
89 |
-
"hash_input_tokens": "f4863da0ce0df94f",
|
90 |
-
"hash_cont_tokens": "df3c6744a5e75b31"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 198,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 198,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/math_500/results_2025-01-29T16-21-19.161811.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 439917.061219354,
|
9 |
-
"end_time": 440589.579187485,
|
10 |
-
"total_evaluation_time_secondes": "672.5179681309965",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.9,
|
19 |
-
"extractive_match_stderr": 0.013429844431075358
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.9,
|
23 |
-
"extractive_match_stderr": 0.013429844431075358
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "664892a030c023a0",
|
74 |
-
"hash_input_tokens": "fa8894639fd8d026",
|
75 |
-
"hash_cont_tokens": "d11a879759a1fc75"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "aafa6c8f5b7270a6",
|
89 |
-
"hash_input_tokens": "ef34990598320c6d",
|
90 |
-
"hash_cont_tokens": "8fbc5ab4b89d7194"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/main/math_500/results_2025-02-06T16-56-34.467531.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 165587.423013672,
|
9 |
-
"end_time": 166285.558426051,
|
10 |
-
"total_evaluation_time_secondes": "698.1354123789934",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.954,
|
19 |
-
"extractive_match_stderr": 0.009377840251121327
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.954,
|
23 |
-
"extractive_match_stderr": 0.009377840251121327
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "43ae83b3ac2bd54a"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "9f227f154fe291b3"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/aime24/results_2025-02-06T17-12-46.800739.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 279377.267561971,
|
9 |
-
"end_time": 280592.055722844,
|
10 |
-
"total_evaluation_time_secondes": "1214.788160872995",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|aime24|0": {
|
18 |
-
"extractive_match": 0.5666666666666667,
|
19 |
-
"extractive_match_stderr": 0.0920186554465537
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.5666666666666667,
|
23 |
-
"extractive_match_stderr": 0.0920186554465537
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|aime24|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|aime24": {
|
31 |
-
"name": "aime24",
|
32 |
-
"prompt_function": "aime_prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 30,
|
64 |
-
"effective_num_docs": 30,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|aime24|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "18ca0099f8d8f826",
|
73 |
-
"hash_full_prompts": "d34905fb622c50aa",
|
74 |
-
"hash_input_tokens": "6d1b89ed573bfa89",
|
75 |
-
"hash_cont_tokens": "f6f3f20780f098e5"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 30,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 30,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "c4769936f28d3d77",
|
88 |
-
"hash_full_prompts": "da635cdfbf36e078",
|
89 |
-
"hash_input_tokens": "a41b3c52a63d1650",
|
90 |
-
"hash_cont_tokens": "f64066263741b27b"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 30,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 30,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/gpqa/results_2025-02-06T17-41-45.634038.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 1219766.333155578,
|
9 |
-
"end_time": 1220936.310749698,
|
10 |
-
"total_evaluation_time_secondes": "1169.9775941199623",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|gpqa:diamond|0": {
|
18 |
-
"extractive_match": 0.6313131313131313,
|
19 |
-
"extractive_match_stderr": 0.03437305501980619
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.6313131313131313,
|
23 |
-
"extractive_match_stderr": 0.03437305501980619
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|gpqa:diamond|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|gpqa:diamond": {
|
31 |
-
"name": "gpqa:diamond",
|
32 |
-
"prompt_function": "gpqa_prompt_fn",
|
33 |
-
"hf_repo": "Idavidrein/gpqa",
|
34 |
-
"hf_subset": "gpqa_diamond",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": true,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 198,
|
64 |
-
"effective_num_docs": 198,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|gpqa:diamond|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "50ecb6f5d091bd95",
|
73 |
-
"hash_full_prompts": "4d6bc2c8e64a03b8",
|
74 |
-
"hash_input_tokens": "7f4457760a5d7f38",
|
75 |
-
"hash_cont_tokens": "bb662f415cf87cf1"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 198,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 198,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "a9318dbdd867770b",
|
88 |
-
"hash_full_prompts": "d8f2b1ad973f6d42",
|
89 |
-
"hash_input_tokens": "1edd6765f01885b1",
|
90 |
-
"hash_cont_tokens": "675f31a7f0f4d133"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 198,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 198,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/math_500/results_2025-01-29T16-35-05.004956.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 1210708.081163628,
|
9 |
-
"end_time": 1212186.095578638,
|
10 |
-
"total_evaluation_time_secondes": "1478.0144150098786",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.91,
|
19 |
-
"extractive_match_stderr": 0.012811255071733802
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.91,
|
23 |
-
"extractive_match_stderr": 0.012811255071733802
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "664892a030c023a0",
|
74 |
-
"hash_input_tokens": "fa8894639fd8d026",
|
75 |
-
"hash_cont_tokens": "da05b695d3c9cac1"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "aafa6c8f5b7270a6",
|
89 |
-
"hash_input_tokens": "ef34990598320c6d",
|
90 |
-
"hash_cont_tokens": "c745900d0266ea36"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/math_500/results_2025-02-06T17-44-13.823355.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 281198.233560346,
|
9 |
-
"end_time": 282479.072700203,
|
10 |
-
"total_evaluation_time_secondes": "1280.8391398569802",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.956,
|
19 |
-
"extractive_match_stderr": 0.00918131761711647
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.956,
|
23 |
-
"extractive_match_stderr": 0.00918131761711647
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "550b3c9eec8abcb8"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "d1bb3eb720ee911c"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/aime24/results_2025-02-06T16-04-06.233392.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 86256.833747225,
|
9 |
-
"end_time": 86698.700447643,
|
10 |
-
"total_evaluation_time_secondes": "441.86670041800244",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|aime24|0": {
|
18 |
-
"extractive_match": 0.43333333333333335,
|
19 |
-
"extractive_match_stderr": 0.0920186554465537
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.43333333333333335,
|
23 |
-
"extractive_match_stderr": 0.0920186554465537
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|aime24|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|aime24": {
|
31 |
-
"name": "aime24",
|
32 |
-
"prompt_function": "aime_prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/aime_2024",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 30,
|
64 |
-
"effective_num_docs": 30,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|aime24|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "18ca0099f8d8f826",
|
73 |
-
"hash_full_prompts": "d34905fb622c50aa",
|
74 |
-
"hash_input_tokens": "6d1b89ed573bfa89",
|
75 |
-
"hash_cont_tokens": "6587c677409d3d9f"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 30,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 30,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "c4769936f28d3d77",
|
88 |
-
"hash_full_prompts": "da635cdfbf36e078",
|
89 |
-
"hash_input_tokens": "a41b3c52a63d1650",
|
90 |
-
"hash_cont_tokens": "c741de609419edb2"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 30,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 30,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/gpqa/results_2025-02-06T16-44-25.806464.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 88599.975613896,
|
9 |
-
"end_time": 89118.272384086,
|
10 |
-
"total_evaluation_time_secondes": "518.2967701900052",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|gpqa:diamond|0": {
|
18 |
-
"extractive_match": 0.5151515151515151,
|
19 |
-
"extractive_match_stderr": 0.035607165165310595
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.5151515151515151,
|
23 |
-
"extractive_match_stderr": 0.035607165165310595
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|gpqa:diamond|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|gpqa:diamond": {
|
31 |
-
"name": "gpqa:diamond",
|
32 |
-
"prompt_function": "gpqa_prompt_fn",
|
33 |
-
"hf_repo": "Idavidrein/gpqa",
|
34 |
-
"hf_subset": "gpqa_diamond",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"train"
|
49 |
-
],
|
50 |
-
"trust_dataset": true,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"train"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 198,
|
64 |
-
"effective_num_docs": 198,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|gpqa:diamond|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "af663a4591d96f5d",
|
73 |
-
"hash_full_prompts": "b0fa5864c08e0781",
|
74 |
-
"hash_input_tokens": "276e7f8541d9d416",
|
75 |
-
"hash_cont_tokens": "1da7cc15c0ea6367"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 198,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 198,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "5b1b43c43d7fe08d",
|
88 |
-
"hash_full_prompts": "30924f5d1c8c1b0b",
|
89 |
-
"hash_input_tokens": "1f51bdd0b08dad14",
|
90 |
-
"hash_cont_tokens": "bd9da4fcb416e9c4"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 198,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 198,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/math_500/results_2025-01-29T16-17-35.586793.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 589160.248337717,
|
9 |
-
"end_time": 589650.422460219,
|
10 |
-
"total_evaluation_time_secondes": "490.1741225019796",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.896,
|
19 |
-
"extractive_match_stderr": 0.013665338743182685
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.896,
|
23 |
-
"extractive_match_stderr": 0.013665338743182685
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "664892a030c023a0",
|
74 |
-
"hash_input_tokens": "fa8894639fd8d026",
|
75 |
-
"hash_cont_tokens": "521319f820154128"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "aafa6c8f5b7270a6",
|
89 |
-
"hash_input_tokens": "ef34990598320c6d",
|
90 |
-
"hash_cont_tokens": "1dfa5a3bfd356b15"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/math_500/results_2025-02-06T16-16-56.008098.json
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": -1,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": 0,
|
8 |
-
"start_time": 86736.527102462,
|
9 |
-
"end_time": 87468.473149333,
|
10 |
-
"total_evaluation_time_secondes": "731.9460468710022",
|
11 |
-
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
12 |
-
"model_sha": "",
|
13 |
-
"model_dtype": null,
|
14 |
-
"model_size": null
|
15 |
-
},
|
16 |
-
"results": {
|
17 |
-
"custom|math_500|0": {
|
18 |
-
"extractive_match": 0.916,
|
19 |
-
"extractive_match_stderr": 0.012417584015643694
|
20 |
-
},
|
21 |
-
"all": {
|
22 |
-
"extractive_match": 0.916,
|
23 |
-
"extractive_match_stderr": 0.012417584015643694
|
24 |
-
}
|
25 |
-
},
|
26 |
-
"versions": {
|
27 |
-
"custom|math_500|0": 1
|
28 |
-
},
|
29 |
-
"config_tasks": {
|
30 |
-
"custom|math_500": {
|
31 |
-
"name": "math_500",
|
32 |
-
"prompt_function": "prompt_fn",
|
33 |
-
"hf_repo": "HuggingFaceH4/MATH-500",
|
34 |
-
"hf_subset": "default",
|
35 |
-
"metric": [
|
36 |
-
{
|
37 |
-
"metric_name": "extractive_match",
|
38 |
-
"higher_is_better": true,
|
39 |
-
"category": "3",
|
40 |
-
"use_case": "1",
|
41 |
-
"sample_level_fn": "sample_level_fn",
|
42 |
-
"corpus_level_fn": "mean"
|
43 |
-
}
|
44 |
-
],
|
45 |
-
"hf_revision": null,
|
46 |
-
"hf_filter": null,
|
47 |
-
"hf_avail_splits": [
|
48 |
-
"test"
|
49 |
-
],
|
50 |
-
"trust_dataset": false,
|
51 |
-
"evaluation_splits": [
|
52 |
-
"test"
|
53 |
-
],
|
54 |
-
"few_shots_split": null,
|
55 |
-
"few_shots_select": null,
|
56 |
-
"generation_size": 32768,
|
57 |
-
"generation_grammar": null,
|
58 |
-
"stop_sequence": [],
|
59 |
-
"num_samples": null,
|
60 |
-
"suite": [
|
61 |
-
"custom"
|
62 |
-
],
|
63 |
-
"original_num_docs": 500,
|
64 |
-
"effective_num_docs": 500,
|
65 |
-
"must_remove_duplicate_docs": false,
|
66 |
-
"version": 1
|
67 |
-
}
|
68 |
-
},
|
69 |
-
"summary_tasks": {
|
70 |
-
"custom|math_500|0": {
|
71 |
-
"hashes": {
|
72 |
-
"hash_examples": "eac05bd67b8179c3",
|
73 |
-
"hash_full_prompts": "9043592f69431f18",
|
74 |
-
"hash_input_tokens": "c5aa3a61e16cb62b",
|
75 |
-
"hash_cont_tokens": "52394364e3300d65"
|
76 |
-
},
|
77 |
-
"truncated": 0,
|
78 |
-
"non_truncated": 500,
|
79 |
-
"padded": 0,
|
80 |
-
"non_padded": 500,
|
81 |
-
"effective_few_shots": 0.0,
|
82 |
-
"num_truncated_few_shots": 0
|
83 |
-
}
|
84 |
-
},
|
85 |
-
"summary_general": {
|
86 |
-
"hashes": {
|
87 |
-
"hash_examples": "d6b3f24200421bb2",
|
88 |
-
"hash_full_prompts": "1a4bc197befd9b91",
|
89 |
-
"hash_input_tokens": "b703f1639cd56c2a",
|
90 |
-
"hash_cont_tokens": "7801dd51980363b8"
|
91 |
-
},
|
92 |
-
"truncated": 0,
|
93 |
-
"non_truncated": 500,
|
94 |
-
"padded": 0,
|
95 |
-
"non_padded": 500,
|
96 |
-
"num_truncated_few_shots": 0
|
97 |
-
}
|
98 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|