tfrere commited on
Commit
83d60af
·
1 Parent(s): debda0e

update eveluationTask to remove local storage

Browse files
Files changed (29) hide show
  1. backend/data/lighteval_results/lighteval_results.json +0 -30
  2. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json +0 -121
  3. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json +0 -121
  4. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json +0 -121
  5. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json +0 -121
  6. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json +0 -121
  7. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json +0 -121
  8. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json +0 -121
  9. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json +0 -121
  10. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json +0 -121
  11. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json +0 -121
  12. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json +0 -121
  13. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json +0 -121
  14. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-59-46.530720.json +0 -121
  15. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json +0 -121
  16. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json +0 -121
  17. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json +0 -121
  18. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json +0 -121
  19. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json +0 -121
  20. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json +0 -121
  21. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json +0 -121
  22. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json +0 -121
  23. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json +0 -121
  24. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json +0 -121
  25. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json +0 -121
  26. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json +0 -121
  27. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json +0 -121
  28. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-28-57.341922.json +0 -121
  29. backend/tasks/evaluationTask.py +46 -18
backend/data/lighteval_results/lighteval_results.json DELETED
@@ -1,30 +0,0 @@
1
- [
2
- {
3
- "model": "Qwen/QwQ-32B",
4
- "provider": "sambanova",
5
- "accuracy": 1.0,
6
- "execution_time": 21.59078598022461,
7
- "status": "success"
8
- },
9
- {
10
- "model": "Qwen/Qwen2.5-72B-Instruct",
11
- "provider": "sambanova",
12
- "accuracy": 1.0,
13
- "execution_time": 14.694424152374268,
14
- "status": "success"
15
- },
16
- {
17
- "model": "deepseek-ai/DeepSeek-V3-0324",
18
- "provider": "novita",
19
- "accuracy": 1.0,
20
- "execution_time": 24.018408060073853,
21
- "status": "success"
22
- },
23
- {
24
- "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
25
- "provider": "sambanova",
26
- "accuracy": 1.0,
27
- "execution_time": 16.271580934524536,
28
- "status": "success"
29
- }
30
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 186274.866411583,
9
- "end_time": 186322.987643416,
10
- "total_evaluation_time_secondes": "48.12123183300719",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 186407.701185,
9
- "end_time": 186447.883386625,
10
- "total_evaluation_time_secondes": "40.18220162499347",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 186521.763833833,
9
- "end_time": 186557.476439666,
10
- "total_evaluation_time_secondes": "35.71260583298863",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 186704.883209333,
9
- "end_time": 186743.215716791,
10
- "total_evaluation_time_secondes": "38.332507457991596",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 187518.49620975,
9
- "end_time": 187553.120908083,
10
- "total_evaluation_time_secondes": "34.62469833297655",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 187690.771319041,
9
- "end_time": 187724.908132583,
10
- "total_evaluation_time_secondes": "34.136813541990705",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 187785.492066916,
9
- "end_time": 187824.287589375,
10
- "total_evaluation_time_secondes": "38.79552245899686",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 188300.087538958,
9
- "end_time": 188337.230208583,
10
- "total_evaluation_time_secondes": "37.142669624998234",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "7e34d82512ce6dfc",
96
- "hash_full_prompts": "af7c42c6f40964e1",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "7cdb142c3142312a",
111
- "hash_full_prompts": "a2e47b0b68e57792",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 188452.784089458,
9
- "end_time": 188490.538178958,
10
- "total_evaluation_time_secondes": "37.75408949999837",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 15,
7
- "job_id": 0,
8
- "start_time": 188674.734532375,
9
- "end_time": 188715.337919458,
10
- "total_evaluation_time_secondes": "40.60338708298514",
11
- "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 15,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "35f5eef8199d4521",
96
- "hash_full_prompts": "5590bc220414fefb",
97
- "hash_input_tokens": "58ec870775e406f3",
98
- "hash_cont_tokens": "58ec870775e406f3"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 15,
102
- "padded": 0,
103
- "non_padded": 15,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "bc7dfdffc5e53476",
111
- "hash_full_prompts": "712fd00df902d786",
112
- "hash_input_tokens": "544d800a25dfd777",
113
- "hash_cont_tokens": "544d800a25dfd777"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 15,
117
- "padded": 0,
118
- "non_padded": 15,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 15,
7
- "job_id": 0,
8
- "start_time": 188674.734510208,
9
- "end_time": 188690.205653,
10
- "total_evaluation_time_secondes": "15.471142791997408",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 15,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "35f5eef8199d4521",
96
- "hash_full_prompts": "5590bc220414fefb",
97
- "hash_input_tokens": "58ec870775e406f3",
98
- "hash_cont_tokens": "58ec870775e406f3"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 15,
102
- "padded": 0,
103
- "non_padded": 15,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "bc7dfdffc5e53476",
111
- "hash_full_prompts": "712fd00df902d786",
112
- "hash_input_tokens": "544d800a25dfd777",
113
- "hash_cont_tokens": "544d800a25dfd777"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 15,
117
- "padded": 0,
118
- "non_padded": 15,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 15,
7
- "job_id": 0,
8
- "start_time": 190861.972782125,
9
- "end_time": 190876.962226916,
10
- "total_evaluation_time_secondes": "14.989444790990092",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
51
- "hf_subset": "multi_hop_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 34,
87
- "effective_num_docs": 15,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "97803694d4430d2d",
96
- "hash_full_prompts": "3125bcda69618d2b",
97
- "hash_input_tokens": "58ec870775e406f3",
98
- "hash_cont_tokens": "58ec870775e406f3"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 15,
102
- "padded": 0,
103
- "non_padded": 15,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "13a4051f728a0e87",
111
- "hash_full_prompts": "e18b288370ab6ae2",
112
- "hash_input_tokens": "544d800a25dfd777",
113
- "hash_cont_tokens": "544d800a25dfd777"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 15,
117
- "padded": 0,
118
- "non_padded": 15,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-59-46.530720.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 30,
7
- "job_id": 0,
8
- "start_time": 193754.29830825,
9
- "end_time": 193775.660671041,
10
- "total_evaluation_time_secondes": "21.362362790998304",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_d0766aeb-d261-4f0f-870c-537432fd8584",
51
- "hf_subset": "multi_hop_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 34,
87
- "effective_num_docs": 30,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "8deb6ee598efe642",
96
- "hash_full_prompts": "ee276216c7fba0dc",
97
- "hash_input_tokens": "79ab129e9a18c6d6",
98
- "hash_cont_tokens": "79ab129e9a18c6d6"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 30,
102
- "padded": 0,
103
- "non_padded": 30,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "134194cd9d247350",
111
- "hash_full_prompts": "59b03121730720e8",
112
- "hash_input_tokens": "05a66e44e190c178",
113
- "hash_cont_tokens": "05a66e44e190c178"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 30,
117
- "padded": 0,
118
- "non_padded": 30,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 186274.866369916,
9
- "end_time": 186294.792813083,
10
- "total_evaluation_time_secondes": "19.926443167001707",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 186407.701222875,
9
- "end_time": 186427.871588083,
10
- "total_evaluation_time_secondes": "20.170365208003204",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 186521.763754958,
9
- "end_time": 186545.585271583,
10
- "total_evaluation_time_secondes": "23.821516625001095",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 186704.882684291,
9
- "end_time": 186723.820615833,
10
- "total_evaluation_time_secondes": "18.937931542022852",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 187518.496174916,
9
- "end_time": 187538.752125166,
10
- "total_evaluation_time_secondes": "20.255950249993475",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 187690.771119125,
9
- "end_time": 187715.172306583,
10
- "total_evaluation_time_secondes": "24.40118745798827",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 187785.49207775,
9
- "end_time": 187806.982701541,
10
- "total_evaluation_time_secondes": "21.4906237910036",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 188300.087685291,
9
- "end_time": 188324.829042291,
10
- "total_evaluation_time_secondes": "24.7413570000208",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "7e34d82512ce6dfc",
96
- "hash_full_prompts": "af7c42c6f40964e1",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "7cdb142c3142312a",
111
- "hash_full_prompts": "a2e47b0b68e57792",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 5,
7
- "job_id": 0,
8
- "start_time": 188452.784059833,
9
- "end_time": 188474.450274291,
10
- "total_evaluation_time_secondes": "21.666214458004106",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 5,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "abaa6ef1f9715482",
96
- "hash_full_prompts": "0b5eb6607b419659",
97
- "hash_input_tokens": "bf9d9e969418cff7",
98
- "hash_cont_tokens": "bf9d9e969418cff7"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 5,
102
- "padded": 0,
103
- "non_padded": 5,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b1bf475c2319e3b2",
111
- "hash_full_prompts": "d860f90cd7291b63",
112
- "hash_input_tokens": "5882dac673b9f859",
113
- "hash_cont_tokens": "5882dac673b9f859"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 5,
117
- "padded": 0,
118
- "non_padded": 5,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 15,
7
- "job_id": 0,
8
- "start_time": 188674.734458958,
9
- "end_time": 188711.276019958,
10
- "total_evaluation_time_secondes": "36.54156099999091",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
- "hf_subset": "single_shot_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 15,
87
- "effective_num_docs": 15,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "35f5eef8199d4521",
96
- "hash_full_prompts": "5590bc220414fefb",
97
- "hash_input_tokens": "58ec870775e406f3",
98
- "hash_cont_tokens": "58ec870775e406f3"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 15,
102
- "padded": 0,
103
- "non_padded": 15,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "bc7dfdffc5e53476",
111
- "hash_full_prompts": "712fd00df902d786",
112
- "hash_input_tokens": "544d800a25dfd777",
113
- "hash_cont_tokens": "544d800a25dfd777"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 15,
117
- "padded": 0,
118
- "non_padded": 15,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 15,
7
- "job_id": 0,
8
- "start_time": 190861.972804458,
9
- "end_time": 190894.739973125,
10
- "total_evaluation_time_secondes": "32.7671686669928",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
51
- "hf_subset": "multi_hop_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 34,
87
- "effective_num_docs": 15,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "97803694d4430d2d",
96
- "hash_full_prompts": "3125bcda69618d2b",
97
- "hash_input_tokens": "58ec870775e406f3",
98
- "hash_cont_tokens": "58ec870775e406f3"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 15,
102
- "padded": 0,
103
- "non_padded": 15,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "13a4051f728a0e87",
111
- "hash_full_prompts": "e18b288370ab6ae2",
112
- "hash_input_tokens": "544d800a25dfd777",
113
- "hash_cont_tokens": "544d800a25dfd777"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 15,
117
- "padded": 0,
118
- "non_padded": 15,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 30,
7
- "job_id": 0,
8
- "start_time": 190994.241279791,
9
- "end_time": 191043.871577458,
10
- "total_evaluation_time_secondes": "49.63029766699765",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
51
- "hf_subset": "multi_hop_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 34,
87
- "effective_num_docs": 30,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "1b5afc5f13827f79",
96
- "hash_full_prompts": "cd8c39c007643835",
97
- "hash_input_tokens": "79ab129e9a18c6d6",
98
- "hash_cont_tokens": "79ab129e9a18c6d6"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 30,
102
- "padded": 0,
103
- "non_padded": 30,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b18e19e266a5bc51",
111
- "hash_full_prompts": "1eaa15cbc4a17d04",
112
- "hash_input_tokens": "05a66e44e190c178",
113
- "hash_cont_tokens": "05a66e44e190c178"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 30,
117
- "padded": 0,
118
- "non_padded": 30,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 30,
7
- "job_id": 0,
8
- "start_time": 191195.945968041,
9
- "end_time": 191244.057571,
10
- "total_evaluation_time_secondes": "48.111602959019365",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
51
- "hf_subset": "multi_hop_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 34,
87
- "effective_num_docs": 30,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "1b5afc5f13827f79",
96
- "hash_full_prompts": "cd8c39c007643835",
97
- "hash_input_tokens": "79ab129e9a18c6d6",
98
- "hash_cont_tokens": "79ab129e9a18c6d6"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 30,
102
- "padded": 0,
103
- "non_padded": 30,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b18e19e266a5bc51",
111
- "hash_full_prompts": "1eaa15cbc4a17d04",
112
- "hash_input_tokens": "05a66e44e190c178",
113
- "hash_cont_tokens": "05a66e44e190c178"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 30,
117
- "padded": 0,
118
- "non_padded": 30,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-28-57.341922.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": null,
6
- "max_samples": 30,
7
- "job_id": 0,
8
- "start_time": 191865.098197958,
9
- "end_time": 191926.425937958,
10
- "total_evaluation_time_secondes": "61.32774000000791",
11
- "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": "",
15
- "generation_parameters": {
16
- "early_stopping": null,
17
- "repetition_penalty": null,
18
- "frequency_penalty": null,
19
- "length_penalty": null,
20
- "presence_penalty": null,
21
- "max_new_tokens": null,
22
- "min_new_tokens": null,
23
- "seed": null,
24
- "stop_tokens": null,
25
- "temperature": null,
26
- "top_k": null,
27
- "min_p": null,
28
- "top_p": null,
29
- "truncate_prompt": null,
30
- "response_format": null
31
- }
32
- },
33
- "results": {
34
- "custom|yourbench|0": {
35
- "accuracy": 1.0,
36
- "accuracy_stderr": 0.0
37
- },
38
- "all": {
39
- "accuracy": 1.0,
40
- "accuracy_stderr": 0.0
41
- }
42
- },
43
- "versions": {
44
- "custom|yourbench|0": 0
45
- },
46
- "config_tasks": {
47
- "custom|yourbench": {
48
- "name": "yourbench",
49
- "prompt_function": "yourbench_prompt",
50
- "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
51
- "hf_subset": "multi_hop_questions",
52
- "metric": [
53
- {
54
- "metric_name": [
55
- "accuracy"
56
- ],
57
- "higher_is_better": {
58
- "accuracy": true
59
- },
60
- "category": "7",
61
- "use_case": "1",
62
- "sample_level_fn": "compute",
63
- "corpus_level_fn": {
64
- "accuracy": "mean"
65
- }
66
- }
67
- ],
68
- "hf_revision": null,
69
- "hf_filter": null,
70
- "hf_avail_splits": [
71
- "train"
72
- ],
73
- "trust_dataset": true,
74
- "evaluation_splits": [
75
- "train"
76
- ],
77
- "few_shots_split": null,
78
- "few_shots_select": null,
79
- "generation_size": 8192,
80
- "generation_grammar": null,
81
- "stop_sequence": [],
82
- "num_samples": null,
83
- "suite": [
84
- "custom"
85
- ],
86
- "original_num_docs": 34,
87
- "effective_num_docs": 30,
88
- "must_remove_duplicate_docs": false,
89
- "version": 0
90
- }
91
- },
92
- "summary_tasks": {
93
- "custom|yourbench|0": {
94
- "hashes": {
95
- "hash_examples": "1b5afc5f13827f79",
96
- "hash_full_prompts": "cd8c39c007643835",
97
- "hash_input_tokens": "79ab129e9a18c6d6",
98
- "hash_cont_tokens": "79ab129e9a18c6d6"
99
- },
100
- "truncated": 0,
101
- "non_truncated": 30,
102
- "padded": 0,
103
- "non_padded": 30,
104
- "effective_few_shots": 0.0,
105
- "num_truncated_few_shots": 0
106
- }
107
- },
108
- "summary_general": {
109
- "hashes": {
110
- "hash_examples": "b18e19e266a5bc51",
111
- "hash_full_prompts": "1eaa15cbc4a17d04",
112
- "hash_input_tokens": "05a66e44e190c178",
113
- "hash_cont_tokens": "05a66e44e190c178"
114
- },
115
- "truncated": 0,
116
- "non_truncated": 30,
117
- "padded": 0,
118
- "non_padded": 30,
119
- "num_truncated_few_shots": 0
120
- }
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/tasks/evaluationTask.py CHANGED
@@ -36,21 +36,17 @@ class EvaluationTask:
36
 
37
  def _save_results_to_hub(self) -> None:
38
  """
39
- Save evaluation results to the dataset on the Hub
40
  """
41
  try:
42
- # Create results directory if it doesn't exist
43
- results_dir = Path("data/lighteval_results")
44
- results_dir.mkdir(parents=True, exist_ok=True)
45
-
46
- # Save results to JSON file
47
- results_file = results_dir / "lighteval_results.json"
48
- with open(results_file, "w") as f:
49
- json.dump(self.results, f, indent=2)
50
 
51
  # Push to Hub
52
  self.hf_api.upload_file(
53
- path_or_fileobj=str(results_file),
54
  path_in_repo="lighteval_results.json",
55
  repo_id=self.dataset_name,
56
  repo_type="dataset",
@@ -58,6 +54,9 @@ class EvaluationTask:
58
  )
59
 
60
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Results saved to Hub at {self.dataset_name}/lighteval_results.json")
 
 
 
61
  except Exception as e:
62
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to save results to Hub: {str(e)}")
63
 
@@ -78,6 +77,9 @@ yourbench = create_yourbench_task("{dataset_name}", "multi_hop_questions")
78
  TASKS_TABLE = [yourbench]
79
  """)
80
 
 
 
 
81
  # LightEval command
82
  cmd_args = [
83
  "lighteval",
@@ -88,7 +90,7 @@ TASKS_TABLE = [yourbench]
88
  "--custom-tasks",
89
  temp_file_path,
90
  "--max-samples", "30",
91
- "--output-dir", "data/lighteval_results",
92
  "--no-push-to-hub"
93
  ]
94
 
@@ -106,6 +108,12 @@ TASKS_TABLE = [yourbench]
106
  except asyncio.TimeoutError:
107
  process.kill()
108
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
 
 
 
 
 
 
109
  return {
110
  "model": model_name,
111
  "provider": provider,
@@ -115,6 +123,12 @@ TASKS_TABLE = [yourbench]
115
  }
116
  except Exception as e:
117
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Error running evaluation for {model_name}: {str(e)}")
 
 
 
 
 
 
118
  return {
119
  "model": model_name,
120
  "provider": provider,
@@ -127,19 +141,16 @@ TASKS_TABLE = [yourbench]
127
  execution_time = time.time() - start_time
128
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")
129
 
130
- # Clean up
131
- os.unlink(temp_file_path)
132
-
133
  try:
134
  # Get results from the output file
135
- results_dir = Path("data/lighteval_results/results") / model_name.replace("/", "/")
136
  results_file = next(results_dir.glob("results_*.json"))
137
 
138
  with open(results_file) as f:
139
  results = json.load(f)
140
  accuracy = results["results"]["all"]["accuracy"]
141
 
142
- return {
143
  "model": model_name,
144
  "provider": provider,
145
  "accuracy": accuracy,
@@ -148,13 +159,20 @@ TASKS_TABLE = [yourbench]
148
  }
149
  except Exception as e:
150
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
151
- return {
152
  "model": model_name,
153
  "provider": provider,
154
  "accuracy": 0.0,
155
  "execution_time": execution_time,
156
  "status": "parse_error"
157
  }
 
 
 
 
 
 
 
158
 
159
  async def run(self) -> None:
160
  """
@@ -191,7 +209,17 @@ TASKS_TABLE = [yourbench]
191
  total_time = time.time() - script_start_time
192
  print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")
193
 
194
- # Save results to Hub
 
 
 
 
 
 
 
 
 
 
195
  self._save_results_to_hub()
196
 
197
  # Mark the task as completed
 
36
 
37
  def _save_results_to_hub(self) -> None:
38
  """
39
+ Save evaluation results directly to the dataset on the Hub without persisting locally
40
  """
41
  try:
42
+ # Créer un fichier temporaire pour les résultats
43
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
44
+ json.dump(self.results, temp_file, indent=2)
45
+ temp_file_path = temp_file.name
 
 
 
 
46
 
47
  # Push to Hub
48
  self.hf_api.upload_file(
49
+ path_or_fileobj=temp_file_path,
50
  path_in_repo="lighteval_results.json",
51
  repo_id=self.dataset_name,
52
  repo_type="dataset",
 
54
  )
55
 
56
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Results saved to Hub at {self.dataset_name}/lighteval_results.json")
57
+
58
+ # Supprimer le fichier temporaire
59
+ os.unlink(temp_file_path)
60
  except Exception as e:
61
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to save results to Hub: {str(e)}")
62
 
 
77
  TASKS_TABLE = [yourbench]
78
  """)
79
 
80
+ # Create temporary output directory
81
+ temp_output_dir = tempfile.mkdtemp(prefix="lighteval_")
82
+
83
  # LightEval command
84
  cmd_args = [
85
  "lighteval",
 
90
  "--custom-tasks",
91
  temp_file_path,
92
  "--max-samples", "30",
93
+ "--output-dir", temp_output_dir,
94
  "--no-push-to-hub"
95
  ]
96
 
 
108
  except asyncio.TimeoutError:
109
  process.kill()
110
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
111
+
112
+ # Clean up temporary files and directories
113
+ os.unlink(temp_file_path)
114
+ import shutil
115
+ shutil.rmtree(temp_output_dir, ignore_errors=True)
116
+
117
  return {
118
  "model": model_name,
119
  "provider": provider,
 
123
  }
124
  except Exception as e:
125
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Error running evaluation for {model_name}: {str(e)}")
126
+
127
+ # Clean up temporary files and directories
128
+ os.unlink(temp_file_path)
129
+ import shutil
130
+ shutil.rmtree(temp_output_dir, ignore_errors=True)
131
+
132
  return {
133
  "model": model_name,
134
  "provider": provider,
 
141
  execution_time = time.time() - start_time
142
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")
143
 
 
 
 
144
  try:
145
  # Get results from the output file
146
+ results_dir = Path(temp_output_dir) / "results" / model_name.replace("/", "/")
147
  results_file = next(results_dir.glob("results_*.json"))
148
 
149
  with open(results_file) as f:
150
  results = json.load(f)
151
  accuracy = results["results"]["all"]["accuracy"]
152
 
153
+ result_data = {
154
  "model": model_name,
155
  "provider": provider,
156
  "accuracy": accuracy,
 
159
  }
160
  except Exception as e:
161
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
162
+ result_data = {
163
  "model": model_name,
164
  "provider": provider,
165
  "accuracy": 0.0,
166
  "execution_time": execution_time,
167
  "status": "parse_error"
168
  }
169
+
170
+ # Clean up temporary files and directories
171
+ os.unlink(temp_file_path)
172
+ import shutil
173
+ shutil.rmtree(temp_output_dir, ignore_errors=True)
174
+
175
+ return result_data
176
 
177
  async def run(self) -> None:
178
  """
 
209
  total_time = time.time() - script_start_time
210
  print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")
211
 
212
+ # Cleanup intermediate results if they exist
213
+ if os.path.exists("data/lighteval_results"):
214
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Cleaning up intermediate results")
215
+ try:
216
+ # Recursively delete intermediate results
217
+ import shutil
218
+ shutil.rmtree("data/lighteval_results", ignore_errors=True)
219
+ except Exception as e:
220
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Warning: Failed to clean up intermediate results: {str(e)}")
221
+
222
+ # Save final results to Hub (only once)
223
  self._save_results_to_hub()
224
 
225
  # Mark the task as completed