tfrere commited on
Commit
2a8ebbd
·
1 Parent(s): ebdfd67

update on tasks

Browse files
Files changed (43) hide show
  1. backend/data/lighteval_results/lighteval_results.json +30 -0
  2. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json +121 -0
  3. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json +121 -0
  4. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json +121 -0
  5. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json +121 -0
  6. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json +121 -0
  7. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json +121 -0
  8. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json +121 -0
  9. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json +121 -0
  10. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json +121 -0
  11. backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json +121 -0
  12. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json +121 -0
  13. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json +121 -0
  14. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json +121 -0
  15. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json +121 -0
  16. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json +121 -0
  17. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json +121 -0
  18. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json +121 -0
  19. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json +121 -0
  20. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json +121 -0
  21. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json +121 -0
  22. backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json +121 -0
  23. backend/lighteval_task/__init__.py +3 -0
  24. backend/{tasks/yourbench_lighteval_task.py → lighteval_task/lighteval_task.py} +36 -10
  25. backend/pyproject.toml +6 -0
  26. backend/routes/evaluation.py +42 -31
  27. backend/tasks/createBench.py +1 -83
  28. backend/tasks/createBenchConfigFile.py +4 -4
  29. backend/tasks/evaluationTask.py +144 -405
  30. backend/tasks/get_model_providers.py +29 -0
  31. backend/test_import.py +5 -0
  32. backend/yourbench_simple_demo.egg-info/PKG-INFO +18 -0
  33. backend/yourbench_simple_demo.egg-info/SOURCES.txt +17 -0
  34. backend/yourbench_simple_demo.egg-info/dependency_links.txt +1 -0
  35. backend/yourbench_simple_demo.egg-info/requires.txt +13 -0
  36. backend/yourbench_simple_demo.egg-info/top_level.txt +1 -0
  37. frontend/src/components/BenchmarkDisplay.jsx +24 -21
  38. frontend/src/components/BenchmarkEvaluation.jsx +42 -211
  39. frontend/src/components/BenchmarkGenerator.jsx +2 -2
  40. frontend/src/components/EvaluationDisplay.jsx +50 -46
  41. frontend/src/components/ExternalLinks.jsx +33 -2
  42. frontend/src/config/theme.js +1 -1
  43. test_import.py +5 -0
backend/data/lighteval_results/lighteval_results.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
4
+ "provider": "sambanova",
5
+ "accuracy": 1.0,
6
+ "execution_time": 18.800472021102905,
7
+ "status": "success"
8
+ },
9
+ {
10
+ "model": "deepseek-ai/DeepSeek-V3-0324",
11
+ "provider": "novita",
12
+ "accuracy": 1.0,
13
+ "execution_time": 34.95434904098511,
14
+ "status": "success"
15
+ },
16
+ {
17
+ "model": "Qwen/Qwen2.5-72B-Instruct",
18
+ "provider": "sambanova",
19
+ "accuracy": 0.0,
20
+ "execution_time": 60.0,
21
+ "status": "timeout"
22
+ },
23
+ {
24
+ "model": "Qwen/QwQ-32B",
25
+ "provider": "sambanova",
26
+ "accuracy": 0.0,
27
+ "execution_time": 60.0,
28
+ "status": "timeout"
29
+ }
30
+ ]
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 186274.866411583,
9
+ "end_time": 186322.987643416,
10
+ "total_evaluation_time_secondes": "48.12123183300719",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 186407.701185,
9
+ "end_time": 186447.883386625,
10
+ "total_evaluation_time_secondes": "40.18220162499347",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 186521.763833833,
9
+ "end_time": 186557.476439666,
10
+ "total_evaluation_time_secondes": "35.71260583298863",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 186704.883209333,
9
+ "end_time": 186743.215716791,
10
+ "total_evaluation_time_secondes": "38.332507457991596",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 187518.49620975,
9
+ "end_time": 187553.120908083,
10
+ "total_evaluation_time_secondes": "34.62469833297655",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 187690.771319041,
9
+ "end_time": 187724.908132583,
10
+ "total_evaluation_time_secondes": "34.136813541990705",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 187785.492066916,
9
+ "end_time": 187824.287589375,
10
+ "total_evaluation_time_secondes": "38.79552245899686",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 188300.087538958,
9
+ "end_time": 188337.230208583,
10
+ "total_evaluation_time_secondes": "37.142669624998234",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "7e34d82512ce6dfc",
96
+ "hash_full_prompts": "af7c42c6f40964e1",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "7cdb142c3142312a",
111
+ "hash_full_prompts": "a2e47b0b68e57792",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 188452.784089458,
9
+ "end_time": 188490.538178958,
10
+ "total_evaluation_time_secondes": "37.75408949999837",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 15,
7
+ "job_id": 0,
8
+ "start_time": 188674.734532375,
9
+ "end_time": 188715.337919458,
10
+ "total_evaluation_time_secondes": "40.60338708298514",
11
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 15,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "35f5eef8199d4521",
96
+ "hash_full_prompts": "5590bc220414fefb",
97
+ "hash_input_tokens": "58ec870775e406f3",
98
+ "hash_cont_tokens": "58ec870775e406f3"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 15,
102
+ "padded": 0,
103
+ "non_padded": 15,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "bc7dfdffc5e53476",
111
+ "hash_full_prompts": "712fd00df902d786",
112
+ "hash_input_tokens": "544d800a25dfd777",
113
+ "hash_cont_tokens": "544d800a25dfd777"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 15,
117
+ "padded": 0,
118
+ "non_padded": 15,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 15,
7
+ "job_id": 0,
8
+ "start_time": 188674.734510208,
9
+ "end_time": 188690.205653,
10
+ "total_evaluation_time_secondes": "15.471142791997408",
11
+ "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 15,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "35f5eef8199d4521",
96
+ "hash_full_prompts": "5590bc220414fefb",
97
+ "hash_input_tokens": "58ec870775e406f3",
98
+ "hash_cont_tokens": "58ec870775e406f3"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 15,
102
+ "padded": 0,
103
+ "non_padded": 15,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "bc7dfdffc5e53476",
111
+ "hash_full_prompts": "712fd00df902d786",
112
+ "hash_input_tokens": "544d800a25dfd777",
113
+ "hash_cont_tokens": "544d800a25dfd777"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 15,
117
+ "padded": 0,
118
+ "non_padded": 15,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 186274.866369916,
9
+ "end_time": 186294.792813083,
10
+ "total_evaluation_time_secondes": "19.926443167001707",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 186407.701222875,
9
+ "end_time": 186427.871588083,
10
+ "total_evaluation_time_secondes": "20.170365208003204",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 186521.763754958,
9
+ "end_time": 186545.585271583,
10
+ "total_evaluation_time_secondes": "23.821516625001095",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 186704.882684291,
9
+ "end_time": 186723.820615833,
10
+ "total_evaluation_time_secondes": "18.937931542022852",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 187518.496174916,
9
+ "end_time": 187538.752125166,
10
+ "total_evaluation_time_secondes": "20.255950249993475",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 187690.771119125,
9
+ "end_time": 187715.172306583,
10
+ "total_evaluation_time_secondes": "24.40118745798827",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 187785.49207775,
9
+ "end_time": 187806.982701541,
10
+ "total_evaluation_time_secondes": "21.4906237910036",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 188300.087685291,
9
+ "end_time": 188324.829042291,
10
+ "total_evaluation_time_secondes": "24.7413570000208",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "7e34d82512ce6dfc",
96
+ "hash_full_prompts": "af7c42c6f40964e1",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "7cdb142c3142312a",
111
+ "hash_full_prompts": "a2e47b0b68e57792",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 5,
7
+ "job_id": 0,
8
+ "start_time": 188452.784059833,
9
+ "end_time": 188474.450274291,
10
+ "total_evaluation_time_secondes": "21.666214458004106",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 5,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "abaa6ef1f9715482",
96
+ "hash_full_prompts": "0b5eb6607b419659",
97
+ "hash_input_tokens": "bf9d9e969418cff7",
98
+ "hash_cont_tokens": "bf9d9e969418cff7"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 5,
102
+ "padded": 0,
103
+ "non_padded": 5,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b1bf475c2319e3b2",
111
+ "hash_full_prompts": "d860f90cd7291b63",
112
+ "hash_input_tokens": "5882dac673b9f859",
113
+ "hash_cont_tokens": "5882dac673b9f859"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 5,
117
+ "padded": 0,
118
+ "non_padded": 5,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 15,
7
+ "job_id": 0,
8
+ "start_time": 188674.734458958,
9
+ "end_time": 188711.276019958,
10
+ "total_evaluation_time_secondes": "36.54156099999091",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
51
+ "hf_subset": "single_shot_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 15,
87
+ "effective_num_docs": 15,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "35f5eef8199d4521",
96
+ "hash_full_prompts": "5590bc220414fefb",
97
+ "hash_input_tokens": "58ec870775e406f3",
98
+ "hash_cont_tokens": "58ec870775e406f3"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 15,
102
+ "padded": 0,
103
+ "non_padded": 15,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "bc7dfdffc5e53476",
111
+ "hash_full_prompts": "712fd00df902d786",
112
+ "hash_input_tokens": "544d800a25dfd777",
113
+ "hash_cont_tokens": "544d800a25dfd777"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 15,
117
+ "padded": 0,
118
+ "non_padded": 15,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/lighteval_task/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .lighteval_task import create_yourbench_task
2
+
3
+ __all__ = ["create_yourbench_task"]
backend/{tasks/yourbench_lighteval_task.py → lighteval_task/lighteval_task.py} RENAMED
@@ -136,10 +136,26 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
136
 
137
 
138
  def process_judge_response_yourbench(response):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  # extract the final answer using regex from the response xml
140
  try:
141
  # Essayer d'abord le format XML
142
- match = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL)
143
  if match:
144
  answer_text = match.group(1).strip()
145
  # Convertir différents formats possibles en 0 ou 1
@@ -155,14 +171,16 @@ def process_judge_response_yourbench(response):
155
  pass
156
 
157
  # Rechercher des mots-clés dans la réponse
158
- if re.search(r"\b(correct|vrai|true|yes)\b", response, re.IGNORECASE):
159
  return 1
160
- if re.search(r"\b(incorrect|faux|false|no)\b", response, re.IGNORECASE):
161
  return 0
162
 
163
- logger.warning(f"Réponse du juge non reconnue, retournant 0 par défaut: {response[:100]}...")
164
  except Exception as e:
165
  logger.error(f"Error processing judge response: {e}")
 
 
166
  return 0
167
 
168
 
@@ -185,10 +203,18 @@ class JudgeLLMYourBench(JudgeLLM):
185
  chunks = [formatted_doc.specific["chunks"][0] for formatted_doc in formatted_docs]
186
  documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
187
 
 
 
 
 
 
188
  score, _, _ = self.judge.evaluate_answer_batch(
189
  questions, predictions, options, golds, chunks=chunks, documents=documents
190
  )
191
 
 
 
 
192
  metrics = []
193
  for i in range(len(sample_ids)):
194
  metrics.append(
@@ -214,17 +240,17 @@ def yourbench_prompt(line, task_name: str = ""):
214
  return Doc(
215
  task_name=task_name,
216
  query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"]),
217
- choices=[line["ground_truth_answer"]],
218
  gold_index=0,
219
  specific={
220
- "question_category": line["question_category"],
221
- "kind": line["kind"],
222
  "estimated_difficulty": line["estimated_difficulty"],
223
  "document_id": line["document_id"],
224
- "question_generating_model": line["question_generating_model"],
225
- "chunks": line["chunks"],
226
  "question": line["question"],
227
- "document": line["document"],
228
  },
229
  )
230
 
 
136
 
137
 
138
  def process_judge_response_yourbench(response):
139
+ # Si la réponse est un dictionnaire, extraire le contenu
140
+ if isinstance(response, dict):
141
+ if "content" in response:
142
+ response = response["content"]
143
+ elif "text" in response:
144
+ response = response["text"]
145
+ elif "response" in response:
146
+ response = response["response"]
147
+ else:
148
+ # Si on ne trouve pas de champ texte, on prend la première valeur
149
+ response = str(list(response.values())[0])
150
+
151
+ # Si la réponse est une liste, prendre le premier élément
152
+ if isinstance(response, list):
153
+ response = response[0]
154
+
155
  # extract the final answer using regex from the response xml
156
  try:
157
  # Essayer d'abord le format XML
158
+ match = re.search(r"<final_answer>(.*?)</final_answer>", str(response), re.DOTALL)
159
  if match:
160
  answer_text = match.group(1).strip()
161
  # Convertir différents formats possibles en 0 ou 1
 
171
  pass
172
 
173
  # Rechercher des mots-clés dans la réponse
174
+ if re.search(r"\b(correct|vrai|true|yes)\b", str(response), re.IGNORECASE):
175
  return 1
176
+ if re.search(r"\b(incorrect|faux|false|no)\b", str(response), re.IGNORECASE):
177
  return 0
178
 
179
+ logger.warning(f"Réponse du juge non reconnue, retournant 0 par défaut: {str(response)[:100]}...")
180
  except Exception as e:
181
  logger.error(f"Error processing judge response: {e}")
182
+ logger.error(f"Response type: {type(response)}")
183
+ logger.error(f"Response content: {response}")
184
  return 0
185
 
186
 
 
203
  chunks = [formatted_doc.specific["chunks"][0] for formatted_doc in formatted_docs]
204
  documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
205
 
206
+ # Ajout de logs pour déboguer
207
+ logger.info(f"Questions: {questions}")
208
+ logger.info(f"Predictions: {predictions}")
209
+ logger.info(f"Golds: {golds}")
210
+
211
  score, _, _ = self.judge.evaluate_answer_batch(
212
  questions, predictions, options, golds, chunks=chunks, documents=documents
213
  )
214
 
215
+ # Ajout de logs pour déboguer
216
+ logger.info(f"Scores: {score}")
217
+
218
  metrics = []
219
  for i in range(len(sample_ids)):
220
  metrics.append(
 
240
  return Doc(
241
  task_name=task_name,
242
  query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"]),
243
+ choices=[line["self_answer"]],
244
  gold_index=0,
245
  specific={
246
+ "question_category": line["self_assessed_question_type"],
247
+ "kind": "qa",
248
  "estimated_difficulty": line["estimated_difficulty"],
249
  "document_id": line["document_id"],
250
+ "question_generating_model": line["generating_model"],
251
+ "chunks": line["citations"],
252
  "question": line["question"],
253
+ "document": line["raw_response"],
254
  },
255
  )
256
 
backend/pyproject.toml CHANGED
@@ -20,6 +20,9 @@ dependencies = [
20
  "lighteval[math]>=0.8.0",
21
  "huggingface-hub>=0.22.0",
22
  "python-multipart>=0.0.5",
 
 
 
23
  ]
24
 
25
  [build-system]
@@ -46,3 +49,6 @@ quote-style = "double"
46
  indent-style = "space"
47
  skip-magic-trailing-comma = false
48
  line-ending = "auto"
 
 
 
 
20
  "lighteval[math]>=0.8.0",
21
  "huggingface-hub>=0.22.0",
22
  "python-multipart>=0.0.5",
23
+ "fastapi>=0.110.0",
24
+ "uvicorn>=0.29.0",
25
+ "pydantic>=2.6.0",
26
  ]
27
 
28
  [build-system]
 
49
  indent-style = "space"
50
  skip-magic-trailing-comma = false
51
  line-ending = "auto"
52
+
53
+ [tool.setuptools]
54
+ packages = ["lighteval_task"]
backend/routes/evaluation.py CHANGED
@@ -2,6 +2,9 @@ from fastapi import APIRouter, HTTPException
2
  from typing import Dict, Any
3
  import os
4
  from tasks.evaluationTask import EvaluationTask
 
 
 
5
 
6
  router = APIRouter(tags=["evaluation"])
7
 
@@ -41,7 +44,7 @@ async def evaluate_benchmark(data: Dict[str, Any]):
41
 
42
  try:
43
  # Nom du dataset basé sur l'ID de session
44
- dataset_name = f"yourbench_{session_id}"
45
 
46
  # Créer et démarrer une nouvelle tâche d'évaluation
47
  evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
@@ -105,44 +108,52 @@ async def get_evaluation_results(session_id: str):
105
  Returns:
106
  Dictionary with evaluation results
107
  """
108
- # First, check if the task is in memory
109
- if session_id in active_evaluation_tasks:
110
- evaluation_task = active_evaluation_tasks[session_id]
 
111
 
112
- if not evaluation_task.is_task_completed():
113
- return {
114
- "success": False,
115
- "message": "Evaluation is still in progress"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  }
117
-
118
- if hasattr(evaluation_task, 'results') and evaluation_task.results:
119
  return {
120
  "success": True,
121
- "results": evaluation_task.results
122
  }
123
-
124
- # If we get here, either the task is not in memory or it doesn't have results
125
- # Try to load results from file
126
- try:
127
- # Construct the path to the results file
128
- results_path = f"uploaded_files/{session_id}/lighteval_results/models_comparison.json"
129
-
130
- # Check if the file exists
131
- if not os.path.exists(results_path):
132
  return {
133
  "success": False,
134
- "message": "No evaluation results found for this session"
135
  }
136
-
137
- # Read the file
138
- import json
139
- with open(results_path, 'r') as f:
140
- results = json.load(f)
141
-
142
- return {
143
- "success": True,
144
- "results": results
145
- }
146
  except Exception as e:
147
  return {
148
  "success": False,
 
2
  from typing import Dict, Any
3
  import os
4
  from tasks.evaluationTask import EvaluationTask
5
+ from huggingface_hub import hf_hub_download
6
+ import json
7
+ from datetime import datetime
8
 
9
  router = APIRouter(tags=["evaluation"])
10
 
 
44
 
45
  try:
46
  # Nom du dataset basé sur l'ID de session
47
+ dataset_name = f"yourbench/yourbench_{session_id}"
48
 
49
  # Créer et démarrer une nouvelle tâche d'évaluation
50
  evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
 
108
  Returns:
109
  Dictionary with evaluation results
110
  """
111
+ try:
112
+ # Get organization from environment
113
+ organization = os.getenv("HF_ORGANIZATION", "yourbench")
114
+ dataset_name = f"{organization}/yourbench_{session_id}"
115
 
116
+ # Try to load results from the Hub
117
+ try:
118
+ results_file = hf_hub_download(
119
+ repo_id=dataset_name,
120
+ repo_type="dataset",
121
+ filename="lighteval_results.json"
122
+ )
123
+
124
+ with open(results_file) as f:
125
+ results = json.load(f)
126
+
127
+ # Format results to match the expected format
128
+ formatted_results = {
129
+ "metadata": {
130
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
131
+ "total_models_tested": len(results),
132
+ "successful_tests": len([r for r in results if r["status"] == "success"])
133
+ },
134
+ "models_comparison": [
135
+ {
136
+ "model_name": result["model"],
137
+ "provider": result["provider"],
138
+ "success": result["status"] == "success",
139
+ "accuracy": result["accuracy"],
140
+ "evaluation_time": result["execution_time"],
141
+ "error": result["status"] if result["status"] != "success" else None
142
+ }
143
+ for result in results
144
+ ]
145
  }
146
+
 
147
  return {
148
  "success": True,
149
+ "results": formatted_results
150
  }
151
+ except Exception as e:
 
 
 
 
 
 
 
 
152
  return {
153
  "success": False,
154
+ "message": f"Failed to load results from Hub: {str(e)}"
155
  }
156
+
 
 
 
 
 
 
 
 
 
157
  except Exception as e:
158
  return {
159
  "success": False,
backend/tasks/createBench.py CHANGED
@@ -234,86 +234,4 @@ class CreateBenchTask:
234
  except Exception as e:
235
  self._add_log(f"[ERROR] Error starting ingestion process: {str(e)}")
236
  self.is_completed = True
237
-
238
- def _simulate_ingestion_process(self) -> None:
239
- """
240
- Simulate the ingestion process for testing/development
241
- This will be removed in production
242
- """
243
- # This method is just to simulate logs during development
244
- # It will be removed in production
245
-
246
- threading.Thread(target=self._simulate_logs).start()
247
-
248
- def _simulate_logs(self) -> None:
249
- """
250
- Simulate logs for testing/development
251
- This will be used when yourbench isn't installed or in development mode
252
- """
253
- # Log simulation (used when yourbench is not available)
254
- self._add_log("[INFO] Simulation mode enabled (yourbench is not actually running)")
255
-
256
- # Get filenames from source directory
257
- source_files = []
258
- try:
259
- with open(self.config_path, 'r') as f:
260
- config_yaml = yaml.safe_load(f)
261
-
262
- source_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("source_documents_dir", "")
263
- if source_dir and os.path.exists(source_dir):
264
- source_files = [f for f in os.listdir(source_dir)
265
- if os.path.isfile(os.path.join(source_dir, f))]
266
- except Exception:
267
- source_files = ["document.pdf", "document.txt"] # Fallback
268
-
269
- # Create output directory if it doesn't exist
270
- output_dir = ""
271
- try:
272
- output_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("output_dir", "")
273
- if output_dir:
274
- os.makedirs(output_dir, exist_ok=True)
275
- except Exception:
276
- pass
277
-
278
- # Simulate file processing
279
- time.sleep(1)
280
- self._add_log("[INFO] Initializing document ingestion")
281
- time.sleep(1.5)
282
- self._add_log("[INFO] Loading configuration parameters")
283
- time.sleep(1)
284
- self._add_log("[INFO] Verifying source files")
285
-
286
- # Process each file
287
- for file in source_files:
288
- time.sleep(1.5)
289
- self._add_log(f"[INFO] Processing file: {file}")
290
- time.sleep(2)
291
- self._add_log(f"[INFO] Extracting content from {file}")
292
- time.sleep(1.5)
293
- self._add_log(f"[INFO] Converting to markdown: {file}")
294
-
295
- # Create a simulated markdown file if an output directory is defined
296
- if output_dir:
297
- base_name = os.path.splitext(file)[0]
298
- output_file = os.path.join(output_dir, f"{base_name}.md")
299
- try:
300
- with open(output_file, 'w') as f:
301
- f.write(f"# {base_name}\n\n")
302
- f.write("This is a markdown document automatically generated by the simulation.\n\n")
303
- f.write("## Section 1\n\n")
304
- f.write("Content of section 1...\n\n")
305
- f.write("## Section 2\n\n")
306
- f.write("Content of section 2...\n\n")
307
- self._add_log(f"[INFO] Markdown file created: {output_file}")
308
- except Exception as e:
309
- self._add_log(f"[ERROR] Error creating markdown file: {str(e)}")
310
-
311
- time.sleep(2)
312
- self._add_log("[INFO] Finalizing processing")
313
- time.sleep(1)
314
- self._add_log("[SUCCESS] Stage completed: ingestion")
315
- time.sleep(0.5)
316
- self._add_log("[SUCCESS] Ingestion completed successfully")
317
-
318
- # Mark task as completed
319
- self.is_completed = True
 
234
  except Exception as e:
235
  self._add_log(f"[ERROR] Error starting ingestion process: {str(e)}")
236
  self.is_completed = True
237
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/tasks/createBenchConfigFile.py CHANGED
@@ -145,15 +145,15 @@ class CreateBenchConfigTask:
145
  "tau_threshold": 0.8,
146
  "h_min": 2,
147
  "h_max": 5,
148
- "num_multihops_factor": 2,
149
  },
150
  },
151
  "single_shot_question_generation": {
152
- "run": True,
153
  "additional_instructions": "Generate questions to test a curious adult",
154
  "chunk_sampling": {
155
  "mode": "count",
156
- "value": 5,
157
  "random_seed": 123,
158
  },
159
  },
@@ -167,7 +167,7 @@ class CreateBenchConfigTask:
167
  },
168
  },
169
  "lighteval": {
170
- "run": True,
171
  },
172
  },
173
  }
 
145
  "tau_threshold": 0.8,
146
  "h_min": 2,
147
  "h_max": 5,
148
+ "num_multihops_factor": 1,
149
  },
150
  },
151
  "single_shot_question_generation": {
152
+ "run": False,
153
  "additional_instructions": "Generate questions to test a curious adult",
154
  "chunk_sampling": {
155
  "mode": "count",
156
+ "value": 10,
157
  "random_seed": 123,
158
  },
159
  },
 
167
  },
168
  },
169
  "lighteval": {
170
+ "run": False,
171
  },
172
  },
173
  }
backend/tasks/evaluationTask.py CHANGED
@@ -1,25 +1,22 @@
1
  """
2
- Task to evaluate models on a YourbBench dataset using LightEval
3
  """
4
  import os
5
- import sys
6
- import json
7
  import time
 
8
  import tempfile
9
- import asyncio
10
- import threading
11
  from pathlib import Path
12
- from typing import Optional, List, Dict, Any, Tuple
13
-
14
- from loguru import logger
15
- from huggingface_hub import HfApi, CommitOperationAdd
16
-
17
- from tasks.yourbench_lighteval_task import create_yourbench_task
18
-
19
 
20
  class EvaluationTask:
21
  """
22
- Task to evaluate models using LightEval on a YourbBench dataset
23
  """
24
 
25
  def __init__(self, session_uid: str, dataset_name: str):
@@ -32,440 +29,182 @@ class EvaluationTask:
32
  """
33
  self.session_uid = session_uid
34
  self.dataset_name = dataset_name
35
- self.logs: List[str] = []
36
  self.is_completed = False
37
- self.organization = os.getenv("HF_ORGANIZATION", "yourbench")
38
- self.results: Dict[str, Any] = {}
39
- self.output_dir = f"uploaded_files/{session_uid}/lighteval_results"
40
-
41
- # Models to evaluate - can be modified to allow customization
42
- self.models = [
43
- ("Qwen/Qwen2.5-72B-Instruct", "novita"),
44
- ("Qwen/QwQ-32B", "novita"),
45
- ]
46
-
47
- self._add_log("[INFO] Initializing evaluation task")
48
- self._add_log(f"[INFO] Dataset to evaluate: {self.organization}/{dataset_name}")
49
- self._add_log(f"[INFO] Output directory: {self.output_dir}")
50
-
51
- def _add_log(self, message: str) -> None:
52
- """
53
- Add a log message to the logs list
54
-
55
- Args:
56
- message: Log message to add
57
- """
58
- if message not in self.logs: # Avoid duplicates
59
- self.logs.append(message)
60
- # Force copy of the list to avoid reference problems
61
- self.logs = self.logs.copy()
62
- # Record in system logs
63
- logger.info(f"[{self.session_uid}] {message}")
64
-
65
- def get_logs(self) -> List[str]:
66
- """
67
- Get all logs for this task
68
-
69
- Returns:
70
- List of log messages
71
- """
72
- return self.logs.copy() # Retourner une copie pour éviter les problèmes de référence
73
-
74
- def is_task_completed(self) -> bool:
75
- """
76
- Check if the task is completed
77
-
78
- Returns:
79
- True if completed, False otherwise
80
  """
81
- return self.is_completed
82
-
83
- async def _evaluate_model(self, model_info: Tuple[str, str]) -> Dict[str, Any]:
84
  """
85
- Evaluate a specific model
86
-
87
- Args:
88
- model_info: Tuple of (model_name, provider)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- Returns:
91
- Dictionary with evaluation results
92
- """
93
- model_name, provider = model_info
94
- self._add_log(f"[INFO] Starting evaluation for {model_name} with {provider}")
95
-
96
- # Create output directory
97
- os.makedirs(self.output_dir, exist_ok=True)
98
-
99
- # Define full dataset path
100
- dataset_path = f"{self.organization}/{self.dataset_name}"
101
 
102
- # Create temporary file
103
  temp_file_path = tempfile.mktemp(suffix=".py")
104
- self._add_log(f"[INFO] Creating temporary file for {model_name}: {temp_file_path}")
105
-
106
  with open(temp_file_path, 'w') as temp_file:
107
  temp_file.write(f"""
108
- import os
109
- import sys
110
- sys.path.append("{os.getcwd()}")
111
-
112
- from tasks.yourbench_lighteval_task import create_yourbench_task
113
 
114
  # Create yourbench task
115
- yourbench = create_yourbench_task("{dataset_path}", "lighteval")
116
 
117
  # Define TASKS_TABLE needed by lighteval
118
  TASKS_TABLE = [yourbench]
119
  """)
120
-
121
- # Build lighteval command args
122
  cmd_args = [
123
  "lighteval",
124
- "endpoint",
125
  "inference-providers",
126
  f"model={model_name},provider={provider}",
127
  "custom|yourbench|0|0",
128
  "--custom-tasks",
129
  temp_file_path,
130
- "--max-samples", "5",
131
- "--output-dir", self.output_dir,
132
- "--save-details",
133
  "--no-push-to-hub"
134
  ]
135
-
136
- self._add_log(f"[INFO] Running command for {model_name}: {' '.join(cmd_args)}")
137
-
138
- results = {
139
- "model_name": model_name,
140
- "provider": provider,
141
- "success": False,
142
- "error": None,
143
- "results": None,
144
- "return_code": None
145
- }
146
-
147
  try:
148
- # Prepare environment with needed tokens
149
- env = os.environ.copy()
150
- hf_token = os.getenv("HF_TOKEN")
151
- if hf_token:
152
- env["HF_TOKEN"] = hf_token
153
- env["HUGGING_FACE_HUB_TOKEN"] = hf_token
154
- env["HF_ORGANIZATION"] = self.organization
155
-
156
- # Run the process asynchronously
157
- process = await asyncio.create_subprocess_exec(
158
- *cmd_args,
159
- stdout=asyncio.subprocess.PIPE,
160
- stderr=asyncio.subprocess.PIPE,
161
- env=env
162
- )
163
-
164
- # Wait for the process to complete
165
- stdout, stderr = await process.communicate()
166
-
167
- # Store return code
168
- exit_code = process.returncode
169
- results["return_code"] = exit_code
170
-
171
- # Log output
172
- if stdout:
173
- stdout_lines = stdout.decode().strip().split('\n')
174
- for line in stdout_lines[:5]: # Log only first 5 lines
175
- self._add_log(f"[INFO] {model_name} - {line}")
176
-
177
- # Log errors if any
178
- if stderr and exit_code != 0:
179
- stderr_lines = stderr.decode().strip().split('\n')
180
- for line in stderr_lines[:5]: # Log only first 5 lines
181
- self._add_log(f"[ERROR] {model_name} - {line}")
182
-
183
- # Find any JSON result files - LightEval organizes by model name in different ways
184
- result_files = []
185
- results_dir = Path(self.output_dir) / "results"
186
- if results_dir.exists():
187
- # Parcourir récursivement tous les répertoires pour trouver des fichiers JSON
188
- for json_file in results_dir.glob("**/*.json"):
189
- # Check if the filename or path contains parts of the model name
190
- model_parts = [
191
- model_name, # Full name
192
- model_name.replace('/', '_'), # Name with / replaced by _
193
- model_name.split('/')[-1] # Just the model name without the organization
194
- ]
195
-
196
- if any(part in str(json_file) for part in model_parts):
197
- result_files.append(json_file)
198
-
199
- # Traiter les fichiers de résultats trouvés
200
- if result_files:
201
- # Prendre le fichier le plus récent
202
- result_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
203
- latest_result = result_files[0]
204
- self._add_log(f"[INFO] {model_name} - Found result file: {latest_result}")
205
-
206
- try:
207
- with open(latest_result, 'r') as f:
208
- test_results = json.load(f)
209
-
210
- # Vérifier si les résultats contiennent les informations essentielles
211
- if (test_results and
212
- isinstance(test_results, dict) and
213
- "results" in test_results and
214
- "all" in test_results["results"]):
215
-
216
- # Enregistrer les résultats
217
- results["results"] = test_results
218
- results["success"] = True
219
-
220
- # Afficher la précision
221
- accuracy = test_results["results"]["all"]["accuracy"]
222
- accuracy_stderr = test_results["results"]["all"]["accuracy_stderr"]
223
- self._add_log(f"[SUCCESS] {model_name} - Accuracy: {accuracy:.4f} ± {accuracy_stderr:.4f}")
224
- else:
225
- results["error"] = "Incomplete or unexpected result format"
226
- self._add_log(f"[WARNING] {model_name} - Unexpected result format")
227
-
228
- except (json.JSONDecodeError, KeyError) as e:
229
- results["error"] = f"Error reading results: {str(e)}"
230
- self._add_log(f"[ERROR] {model_name} - {results['error']}")
231
 
232
- # Si aucun résultat trouvé
233
- if not results["success"]:
234
- if exit_code == 0:
235
- results["error"] = "Execution completed without error but no results found"
236
- self._add_log(f"[WARNING] {model_name} - {results['error']}")
237
- else:
238
- results["error"] = f"Execution error (code: {exit_code})"
239
- self._add_log(f"[ERROR] {model_name} - {results['error']}")
240
-
 
 
241
  except Exception as e:
242
- results["error"] = f"Exception: {str(e)}"
243
- self._add_log(f"[ERROR] Exception during evaluation of {model_name}: {str(e)}")
244
- finally:
245
- # Delete temporary file
246
- try:
247
- os.unlink(temp_file_path)
248
- except:
249
- pass
250
-
251
- return results
252
-
253
- async def _run_evaluations(self) -> List[Dict[str, Any]]:
254
  """
255
- Run evaluations for all models
256
 
257
  Returns:
258
- List of evaluation results
259
  """
260
- self._add_log(f"[INFO] Starting evaluations for {len(self.models)} models")
 
261
 
262
- # Create tasks for each model
263
- tasks = [self._evaluate_model(model) for model in self.models]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
- # Run all tasks concurrently and gather results
266
- model_results = await asyncio.gather(*tasks, return_exceptions=True)
267
 
268
- # Process results
269
- results = []
270
- for i, result in enumerate(model_results):
271
- if isinstance(result, Exception):
272
- # Handle exception
273
- model_name, provider = self.models[i]
274
- self._add_log(f"[ERROR] Evaluation failed for {model_name}: {str(result)}")
275
- results.append({
276
- "model_name": model_name,
277
- "provider": provider,
278
- "success": False,
279
- "error": str(result),
280
- "results": None,
281
- "return_code": None
282
- })
283
- else:
284
- # Valid result
285
- results.append(result)
286
 
287
- return results
288
-
289
- def _format_comparison_results(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
290
  """
291
- Format results for easy comparison between models
292
 
293
- Args:
294
- results: List of evaluation results
295
-
296
  Returns:
297
- Dictionary with formatted comparison results
298
  """
299
- comparison = {
300
- "metadata": {
301
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
302
- "dataset": f"{self.organization}/{self.dataset_name}",
303
- "total_models_tested": len(results),
304
- "successful_tests": len([r for r in results if r["success"]])
305
- },
306
- "models_comparison": []
307
- }
308
-
309
- # Liste des modèles réussis et des modèles échoués
310
- successful_models = [r for r in results if r["success"]]
311
- failed_models = [r for r in results if not r["success"]]
312
-
313
- # Trier les modèles réussis par précision (du plus précis au moins précis)
314
- if successful_models:
315
- sorted_successful = sorted(
316
- successful_models,
317
- key=lambda x: x["results"]["results"]["all"]["accuracy"],
318
- reverse=True # Du plus grand au plus petit
319
- )
320
- else:
321
- sorted_successful = []
322
-
323
- # Trier les modèles échoués par nom
324
- sorted_failed = sorted(failed_models, key=lambda x: x["model_name"])
325
-
326
- # Concaténer: d'abord les réussites, puis les échecs
327
- sorted_results = sorted_successful + sorted_failed
328
-
329
- # Créer l'entrée pour chaque modèle
330
- for result in sorted_results:
331
- model_result = {
332
- "model_name": result["model_name"],
333
- "provider": result["provider"],
334
- "success": result["success"]
335
- }
336
-
337
- if result["success"]:
338
- # Ajouter les métriques de précision et temps d'exécution
339
- model_result.update({
340
- "accuracy": result["results"]["results"]["all"]["accuracy"],
341
- "accuracy_stderr": result["results"]["results"]["all"]["accuracy_stderr"],
342
- "evaluation_time": float(result["results"]["config_general"]["total_evaluation_time_secondes"])
343
- })
344
- else:
345
- # Ajouter l'erreur
346
- model_result["error"] = result.get("error", "Unknown reason")
347
-
348
- comparison["models_comparison"].append(model_result)
349
-
350
- return comparison
351
-
352
- async def _upload_results_to_dataset(self, comparison_results: Dict[str, Any]) -> bool:
353
  """
354
- Upload evaluation results to the HuggingFace dataset
355
 
356
- Args:
357
- comparison_results: The formatted comparison results
358
-
359
  Returns:
360
- bool: True if upload succeeded, False otherwise
361
- """
362
- try:
363
- # Create a timestamp for the results file
364
- timestamp = time.strftime("%Y%m%d_%H%M%S")
365
- result_filename = f"lighteval_results.json"
366
-
367
- # Create temporary file for upload
368
- temp_file_path = tempfile.mktemp(suffix=".json")
369
- with open(temp_file_path, 'w') as f:
370
- json.dump(comparison_results, f, indent=2)
371
-
372
- # Initialize HF API
373
- hf_token = os.getenv("HF_TOKEN")
374
- if not hf_token:
375
- self._add_log("[ERROR] HF_TOKEN not found, cannot upload results to dataset")
376
- return False
377
-
378
- api = HfApi(token=hf_token)
379
- dataset_id = f"{self.organization}/{self.dataset_name}"
380
-
381
- # Prepare the file operation
382
- operation = CommitOperationAdd(
383
- path_in_repo=f"lighteval_results/{result_filename}",
384
- path_or_fileobj=temp_file_path
385
- )
386
-
387
- # Upload the file
388
- self._add_log(f"[INFO] Uploading results to dataset {dataset_id}")
389
- api.create_commit(
390
- repo_id=dataset_id,
391
- repo_type="dataset",
392
- operations=[operation],
393
- commit_message=f"Add evaluation results from {timestamp}"
394
- )
395
-
396
- # Cleanup temporary file
397
- os.unlink(temp_file_path)
398
-
399
- self._add_log(f"[SUCCESS] Results uploaded to dataset {dataset_id} at lighteval_results/{result_filename}")
400
- return True
401
-
402
- except Exception as e:
403
- self._add_log(f"[ERROR] Failed to upload results to dataset: {str(e)}")
404
- return False
405
-
406
- async def _process_evaluation_results(self, results: List[Dict[str, Any]]) -> None:
407
- """
408
- Process evaluation results, create summaries and save files
409
-
410
- Args:
411
- results: List of evaluation results
412
- """
413
- if results:
414
- try:
415
- # Save detailed results
416
- detailed_output_file = f"{self.output_dir}/detailed_results.json"
417
- os.makedirs(os.path.dirname(detailed_output_file), exist_ok=True)
418
- with open(detailed_output_file, 'w') as f:
419
- json.dump(results, f, indent=2)
420
- self._add_log(f"[INFO] Detailed results saved in {detailed_output_file}")
421
-
422
- # Generate and save comparison results
423
- comparison = self._format_comparison_results(results)
424
- comparison_file = f"{self.output_dir}/models_comparison.json"
425
- with open(comparison_file, 'w') as f:
426
- json.dump(comparison, f, indent=2)
427
- self._add_log(f"[INFO] Models comparison saved in {comparison_file}")
428
-
429
- # Upload results to the dataset
430
- await self._upload_results_to_dataset(comparison)
431
-
432
- # Store results for later access
433
- self.results = comparison
434
- self._add_log("[SUCCESS] Evaluation completed")
435
- except Exception as e:
436
- self._add_log(f"[ERROR] Error during evaluation execution: {str(e)}")
437
- finally:
438
- self.is_completed = True
439
-
440
- def _async_run(self) -> None:
441
- """
442
- Run the evaluation asynchronously
443
  """
444
- async def run_async():
445
- try:
446
- # Run evaluations
447
- results = await self._run_evaluations()
448
-
449
- # Process evaluation results
450
- await self._process_evaluation_results(results)
451
- except Exception as e:
452
- self._add_log(f"[ERROR] Error during evaluation execution: {str(e)}")
453
- finally:
454
- self.is_completed = True
455
-
456
- # Create and run the asyncio event loop
457
- loop = asyncio.new_event_loop()
458
- asyncio.set_event_loop(loop)
459
- loop.run_until_complete(run_async())
460
- loop.close()
461
-
462
  def run(self) -> None:
463
  """
464
- Run the evaluation task in a separate thread
465
  """
466
- self._add_log("[INFO] Starting evaluation")
467
-
468
- # Run in a separate thread to not block the main thread
469
- thread = threading.Thread(target=self._async_run)
470
- thread.daemon = True
471
- thread.start()
 
1
  """
2
+ Task to run evaluation using lighteval
3
  """
4
  import os
 
 
5
  import time
6
+ import subprocess
7
  import tempfile
 
 
8
  from pathlib import Path
9
+ import concurrent.futures
10
+ from dotenv import load_dotenv
11
+ from datetime import datetime
12
+ import json
13
+ from typing import List, Dict
14
+ from tasks.get_model_providers import get_model_providers
15
+ from huggingface_hub import HfApi
16
 
17
  class EvaluationTask:
18
  """
19
+ Task to run evaluation using lighteval
20
  """
21
 
22
  def __init__(self, session_uid: str, dataset_name: str):
 
29
  """
30
  self.session_uid = session_uid
31
  self.dataset_name = dataset_name
 
32
  self.is_completed = False
33
+ self.results = []
34
+ self.hf_api = HfApi()
35
+
36
+ def _save_results_to_hub(self) -> None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  """
38
+ Save evaluation results to the dataset on the Hub
 
 
39
  """
40
+ try:
41
+ # Create results directory if it doesn't exist
42
+ results_dir = Path("data/lighteval_results")
43
+ results_dir.mkdir(parents=True, exist_ok=True)
44
+
45
+ # Save results to JSON file
46
+ results_file = results_dir / "lighteval_results.json"
47
+ with open(results_file, "w") as f:
48
+ json.dump(self.results, f, indent=2)
49
+
50
+ # Push to Hub
51
+ self.hf_api.upload_file(
52
+ path_or_fileobj=str(results_file),
53
+ path_in_repo="lighteval_results.json",
54
+ repo_id=self.dataset_name,
55
+ repo_type="dataset",
56
+ commit_message="Add lighteval evaluation results"
57
+ )
58
 
59
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Results saved to Hub at {self.dataset_name}/lighteval_results.json")
60
+ except Exception as e:
61
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to save results to Hub: {str(e)}")
62
+
63
+ def _run_lighteval(self, model_name: str, provider: str, dataset_name: str) -> dict:
64
+ start_time = time.time()
65
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation with {provider} provider for {model_name}")
 
 
 
 
66
 
67
+ # Create temporary task file
68
  temp_file_path = tempfile.mktemp(suffix=".py")
 
 
69
  with open(temp_file_path, 'w') as temp_file:
70
  temp_file.write(f"""
71
+ from lighteval_task.lighteval_task import create_yourbench_task
 
 
 
 
72
 
73
  # Create yourbench task
74
+ yourbench = create_yourbench_task("{dataset_name}", "single_shot_questions")
75
 
76
  # Define TASKS_TABLE needed by lighteval
77
  TASKS_TABLE = [yourbench]
78
  """)
79
+
80
+ # LightEval command
81
  cmd_args = [
82
  "lighteval",
83
+ "endpoint",
84
  "inference-providers",
85
  f"model={model_name},provider={provider}",
86
  "custom|yourbench|0|0",
87
  "--custom-tasks",
88
  temp_file_path,
89
+ "--max-samples", "15",
90
+ "--output-dir", "data/lighteval_results",
91
+ # "--save-details",
92
  "--no-push-to-hub"
93
  ]
94
+
 
 
 
 
 
 
 
 
 
 
 
95
  try:
96
+ # Run the command with environment variables and timeout of 60 seconds
97
+ subprocess.run(cmd_args, env=os.environ, timeout=60)
98
+ except subprocess.TimeoutExpired:
99
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
100
+ return {
101
+ "model": model_name,
102
+ "provider": provider,
103
+ "accuracy": 0.0,
104
+ "execution_time": 60.0,
105
+ "status": "timeout"
106
+ }
107
+
108
+ # Calculate execution time
109
+ execution_time = time.time() - start_time
110
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")
111
+
112
+ # Clean up
113
+ os.unlink(temp_file_path)
114
+
115
+ try:
116
+ # Get results from the output file
117
+ results_dir = Path("data/lighteval_results/results") / model_name.replace("/", "/")
118
+ results_file = next(results_dir.glob("results_*.json"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ with open(results_file) as f:
121
+ results = json.load(f)
122
+ accuracy = results["results"]["all"]["accuracy"]
123
+
124
+ return {
125
+ "model": model_name,
126
+ "provider": provider,
127
+ "accuracy": accuracy,
128
+ "execution_time": execution_time,
129
+ "status": "success"
130
+ }
131
  except Exception as e:
132
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
133
+ return {
134
+ "model": model_name,
135
+ "provider": provider,
136
+ "accuracy": 0.0,
137
+ "execution_time": execution_time,
138
+ "status": "parse_error"
139
+ }
140
+
141
def run_parallel(self) -> List[Dict]:
    """
    Evaluate a fixed set of models in parallel and publish the results.

    Providers are looked up with ``get_model_providers``; each model that has at
    least one provider is evaluated with its first provider by
    ``self._run_lighteval`` in a ``ProcessPoolExecutor``. Results are collected
    in completion order, stored in ``self.results``, and pushed to the Hub with
    ``self._save_results_to_hub``.

    A worker that raises does not abort the other evaluations: it is recorded
    as an entry with ``status == "error"`` and ``accuracy == 0.0``.
    ``self.is_completed`` is set to ``True`` however this method exits, so
    anything polling that flag is not left waiting after a failure.

    Returns:
        List of result dicts, one per evaluated model.
    """
    # Start global timer
    script_start_time = time.time()

    def _stamp() -> str:
        return datetime.now().strftime('%H:%M:%S')

    try:
        # Load environment variables
        load_dotenv()

        # Models to evaluate
        models = [
            "Qwen/QwQ-32B",
            "Qwen/Qwen2.5-72B-Instruct",
            "deepseek-ai/DeepSeek-V3-0324",
            "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
        ]

        # Get providers for each model
        model_providers = get_model_providers(models)

        print(f"[{_stamp()}] Starting parallel evaluations")

        collected = []
        with concurrent.futures.ProcessPoolExecutor() as executor:
            # Map each future back to its model so a failure can be attributed
            pending = {}
            for model_name, providers in model_providers:
                if not providers:
                    # Only run if providers are available
                    print(f"[{_stamp()}] Skipping {model_name}: no supported inference provider available")
                    continue
                provider = providers[0]
                future = executor.submit(self._run_lighteval, model_name, provider, self.dataset_name)
                pending[future] = (model_name, provider)

            for future in concurrent.futures.as_completed(pending):
                model_name, provider = pending[future]
                try:
                    collected.append(future.result())
                except Exception as e:
                    # Keep the other models' results instead of aborting the batch
                    print(f"[{_stamp()}] Evaluation failed for {model_name}: {str(e)}")
                    collected.append({
                        "model": model_name,
                        "provider": provider,
                        "accuracy": 0.0,
                        "execution_time": time.time() - script_start_time,
                        "status": "error"
                    })
        self.results = collected

        # Calculate total script execution time
        total_time = time.time() - script_start_time
        print(f"[{_stamp()}] All evaluations completed in {total_time:.2f}s")

        # Save results to Hub
        self._save_results_to_hub()

        return self.results
    finally:
        # Mark the task as completed, also when something above raised
        self.is_completed = True
187
+
188
def get_logs(self) -> List[str]:
    """
    Return the log lines of this task.

    Logs are no longer tracked, so the result is always a new empty list.

    Returns:
        Empty list of logs
    """
    no_logs: List[str] = []
    return no_logs
196
+
197
def is_task_completed(self) -> bool:
    """
    Report whether the task has finished.

    Returns:
        The current value of ``self.is_completed``: True if completed,
        False otherwise
    """
    completed = self.is_completed
    return completed
205
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
def run(self) -> None:
    """
    Entry point of the task: delegates to ``run_parallel``.

    The list returned by ``run_parallel`` is discarded here.
    """
    self.run_parallel()
    return None
 
 
 
 
 
backend/tasks/get_model_providers.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import model_info
2
# Inference providers the evaluation is allowed to use by default.
PREFERRED_PROVIDERS = ["sambanova", "novita"]


def filter_providers(providers, preferred=None):
    """
    Keep only the providers that appear in an allow-list.

    Args:
        providers: Iterable of provider names (for example the keys of a
            model's inference-provider mapping).
        preferred: Optional container of accepted provider names. Defaults to
            the module-level ``PREFERRED_PROVIDERS``.

    Returns:
        List of the accepted names, in the order they appear in ``providers``.
    """
    allowed = PREFERRED_PROVIDERS if preferred is None else preferred
    return [provider for provider in providers if provider in allowed]
6
+
7
def get_model_providers(models):
    """
    Look up the usable inference providers for each model.

    For every model name, ``model_info`` is queried with
    ``expand="inferenceProviderMapping"`` and the provider names found are
    narrowed with ``filter_providers``.

    Args:
        models: Iterable of Hub model ids.

    Returns:
        List of ``(model_name, providers)`` tuples in input order. ``providers``
        is an empty list when the model exposes no provider mapping or when the
        lookup fails; a failed lookup is reported on stdout instead of raised.
    """
    results = []

    for model_name in models:
        try:
            info = model_info(model_name, expand="inferenceProviderMapping")
            # The attribute can be absent or None; both mean "no providers"
            mapping = getattr(info, "inference_provider_mapping", None) or {}
            if hasattr(mapping, "keys"):
                provider_names = mapping.keys()
            else:
                # NOTE(review): some huggingface_hub releases appear to return a
                # list of entries carrying a `.provider` attribute - confirm.
                provider_names = [getattr(entry, "provider", entry) for entry in mapping]
            providers = filter_providers(provider_names)
        except Exception as e:
            # Best effort: one failing lookup must not stop the others
            print(f"Could not fetch inference providers for {model_name}: {e}")
            providers = []
        results.append((model_name, providers))

    return results
19
+
20
if __name__ == "__main__":
    # Manual check: print the filtered provider list of a few sample models
    print(get_model_providers([
        "Qwen/Qwen2.5-72B-Instruct",
        "meta-llama/Llama-3.3-70B-Instruct",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
        "Qwen/QwQ-32B",
        "mistralai/Mistral-Small-24B-Instruct-2501",
    ]))
backend/test_import.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Smoke check: report whether the lighteval_task package can be imported.
try:
    import lighteval_task
except ImportError as err:
    print(f"Erreur: {err}")
else:
    print("lighteval_task importé avec succès!")
backend/yourbench_simple_demo.egg-info/PKG-INFO ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: yourbench-simple-demo
3
+ Version: 0.1.0
4
+ Author-email: Sumuk Shashidhar <[email protected]>, Alina Lozovskaia <[email protected]>, Clémentine Fourrier <[email protected]>, Nathan Habib <[email protected]>
5
+ Requires-Python: <3.13,>=3.12
6
+ Requires-Dist: yourbench@ git+https://github.com/huggingface/yourbench.git@main
7
+ Requires-Dist: asyncio>=3.4.3
8
+ Requires-Dist: datasets>=3.3.0
9
+ Requires-Dist: loguru>=0.7.3
10
+ Requires-Dist: python-dotenv>=1.0.1
11
+ Requires-Dist: tqdm>=4.67.1
12
+ Requires-Dist: ruff>=0.11.2
13
+ Requires-Dist: lighteval[math]>=0.8.0
14
+ Requires-Dist: huggingface-hub>=0.22.0
15
+ Requires-Dist: python-multipart>=0.0.5
16
+ Requires-Dist: fastapi>=0.110.0
17
+ Requires-Dist: uvicorn>=0.29.0
18
+ Requires-Dist: pydantic>=2.6.0
backend/yourbench_simple_demo.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ pyproject.toml
3
+ lighteval_task/__init__.py
4
+ lighteval_task/lighteval_task.py
5
+ tests/test_evaluation.py
6
+ tests/test_hf_upload.py
7
+ tests/test_inference.py
8
+ tests/test_lighteval.py
9
+ tests/test_openai.py
10
+ tests/test_parallel_lighteval.py
11
+ tests/test_provider_parallel_support.py
12
+ tests/test_yourbench_results.py
13
+ yourbench_simple_demo.egg-info/PKG-INFO
14
+ yourbench_simple_demo.egg-info/SOURCES.txt
15
+ yourbench_simple_demo.egg-info/dependency_links.txt
16
+ yourbench_simple_demo.egg-info/requires.txt
17
+ yourbench_simple_demo.egg-info/top_level.txt
backend/yourbench_simple_demo.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
backend/yourbench_simple_demo.egg-info/requires.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ yourbench@ git+https://github.com/huggingface/yourbench.git@main
2
+ asyncio>=3.4.3
3
+ datasets>=3.3.0
4
+ loguru>=0.7.3
5
+ python-dotenv>=1.0.1
6
+ tqdm>=4.67.1
7
+ ruff>=0.11.2
8
+ lighteval[math]>=0.8.0
9
+ huggingface-hub>=0.22.0
10
+ python-multipart>=0.0.5
11
+ fastapi>=0.110.0
12
+ uvicorn>=0.29.0
13
+ pydantic>=2.6.0
backend/yourbench_simple_demo.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ lighteval_task
frontend/src/components/BenchmarkDisplay.jsx CHANGED
@@ -99,19 +99,34 @@ const BenchmarkDisplay = ({
99
  <Typography variant="h6">Benchmark Created Successfully</Typography>
100
  </Box>
101
 
102
- <Tooltip title="Download the complete benchmark">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  <Button
104
- variant="outlined"
105
  color="primary"
106
- endIcon={
107
- isDownloading ? <CircularProgress size={16} /> : <DownloadIcon />
108
- }
109
- onClick={handleDownloadClick}
110
- disabled={isDownloading || !sessionId}
111
  >
112
- {isDownloading ? "Downloading..." : "Download Benchmark"}
113
  </Button>
114
- </Tooltip>
115
  </Box>
116
 
117
  <Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
@@ -154,18 +169,6 @@ const BenchmarkDisplay = ({
154
  </Card>
155
  ))}
156
  </Box>
157
-
158
- <Box sx={{ display: "flex", justifyContent: "center", mt: 4 }}>
159
- <Button
160
- variant="contained"
161
- color="primary"
162
- size="large"
163
- startIcon={<AssessmentIcon />}
164
- onClick={handleEvaluationClick}
165
- >
166
- Start Evaluation
167
- </Button>
168
- </Box>
169
  </>
170
  );
171
  };
 
99
  <Typography variant="h6">Benchmark Created Successfully</Typography>
100
  </Box>
101
 
102
+ <Box sx={{ display: "flex", gap: 2 }}>
103
+ <Tooltip title="Download the complete benchmark">
104
+ <Button
105
+ variant="outlined"
106
+ color="primary"
107
+ endIcon={
108
+ isDownloading ? (
109
+ <CircularProgress size={16} />
110
+ ) : (
111
+ <DownloadIcon />
112
+ )
113
+ }
114
+ onClick={handleDownloadClick}
115
+ disabled={isDownloading || !sessionId}
116
+ >
117
+ {isDownloading ? "Downloading..." : "Download Benchmark"}
118
+ </Button>
119
+ </Tooltip>
120
+
121
  <Button
122
+ variant="contained"
123
  color="primary"
124
+ startIcon={<AssessmentIcon />}
125
+ onClick={handleEvaluationClick}
 
 
 
126
  >
127
+ Start Evaluation
128
  </Button>
129
+ </Box>
130
  </Box>
131
 
132
  <Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
 
169
  </Card>
170
  ))}
171
  </Box>
 
 
 
 
 
 
 
 
 
 
 
 
172
  </>
173
  );
174
  };
frontend/src/components/BenchmarkEvaluation.jsx CHANGED
@@ -1,55 +1,53 @@
1
  import React, { useState, useEffect, useRef } from "react";
2
- import {
3
- Box,
4
- Typography,
5
- CircularProgress,
6
- Alert,
7
- Paper,
8
- Divider,
9
- Button,
10
- } from "@mui/material";
11
- import AccessTimeIcon from "@mui/icons-material/AccessTime";
12
- import LogDisplay from "./LogDisplay";
13
  import { useNavigate } from "react-router-dom";
14
 
15
- // Evaluation steps
16
- const EVALUATION_STEPS = [
17
- "preparation",
18
- "model_evaluation",
19
- "results_compilation",
 
20
  ];
21
 
22
- // Friendly step names for display
23
- const STEP_LABELS = {
24
- preparation: "Preparation",
25
- model_evaluation: "Model Evaluation",
26
- results_compilation: "Results Compilation",
27
- };
28
-
29
- /**
30
- * Component to handle benchmark evaluation and display logs
31
- *
32
- * @param {Object} props - Component props
33
- * @param {string} props.sessionId - Session ID of the benchmark to evaluate
34
- * @param {Function} props.onComplete - Function to call when evaluation is complete
35
- * @returns {JSX.Element} Benchmark evaluation component
36
- */
37
  const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
38
- const [evaluating, setEvaluating] = useState(false);
39
  const [evaluationComplete, setEvaluationComplete] = useState(false);
40
- const [evaluationLogs, setEvaluationLogs] = useState([]);
41
  const [error, setError] = useState(null);
42
- const [currentPhase, setCurrentPhase] = useState("initializing");
43
- const [completedSteps, setCompletedSteps] = useState([]);
44
- const [activeStep, setActiveStep] = useState(0);
45
  const [elapsedTime, setElapsedTime] = useState(0);
 
46
 
47
- const pollingIntervalRef = useRef(null);
48
  const timerIntervalRef = useRef(null);
49
  const startTimeRef = useRef(null);
 
 
50
 
51
  const navigate = useNavigate();
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  // Start evaluation when component mounts
54
  useEffect(() => {
55
  // Set start time
@@ -76,80 +74,6 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
76
  };
77
  }, []);
78
 
79
- // Determine current phase and completed steps from logs
80
- useEffect(() => {
81
- if (evaluationLogs.length === 0) return;
82
-
83
- // Check all logs for completed steps
84
- const newCompletedSteps = [...completedSteps];
85
- let newActiveStep = activeStep;
86
-
87
- evaluationLogs.forEach((log) => {
88
- // Detect completed steps (format: [SUCCESS] Stage completed: step_name)
89
- const match = log.match(/\[SUCCESS\] Stage completed: (\w+)/);
90
- if (match && match[1]) {
91
- const completedStep = match[1].trim();
92
- if (
93
- EVALUATION_STEPS.includes(completedStep) &&
94
- !newCompletedSteps.includes(completedStep)
95
- ) {
96
- newCompletedSteps.push(completedStep);
97
- // Set active step to index of next step
98
- const stepIndex = EVALUATION_STEPS.indexOf(completedStep);
99
- if (stepIndex >= 0 && stepIndex + 1 > newActiveStep) {
100
- newActiveStep = stepIndex + 1;
101
- if (newActiveStep >= EVALUATION_STEPS.length) {
102
- newActiveStep = EVALUATION_STEPS.length;
103
- }
104
- }
105
- }
106
- }
107
- });
108
-
109
- // Update state if there are new completed steps
110
- if (newCompletedSteps.length > completedSteps.length) {
111
- setCompletedSteps(newCompletedSteps);
112
- setActiveStep(newActiveStep);
113
- }
114
-
115
- // Check recent logs to determine current phase
116
- const recentLogs = evaluationLogs.slice(-10);
117
-
118
- // Detect completion conditions
119
- const isComplete =
120
- recentLogs.some((log) =>
121
- log.includes("[SUCCESS] Evaluation completed")
122
- ) ||
123
- completedSteps.includes("results_compilation") ||
124
- newCompletedSteps.includes("results_compilation");
125
-
126
- if (isComplete) {
127
- setCurrentPhase("complete");
128
- setEvaluationComplete(true);
129
- // Stop polling when evaluation is complete
130
- if (pollingIntervalRef.current) {
131
- clearInterval(pollingIntervalRef.current);
132
- }
133
- if (timerIntervalRef.current) {
134
- clearInterval(timerIntervalRef.current);
135
- }
136
- // Notify parent component that evaluation is complete
137
- if (onComplete) {
138
- onComplete({
139
- success: true,
140
- sessionId,
141
- logs: evaluationLogs,
142
- });
143
- }
144
- } else if (recentLogs.some((log) => log.includes("Comparing models"))) {
145
- setCurrentPhase("compiling_results");
146
- } else if (recentLogs.some((log) => log.includes("Starting evaluations"))) {
147
- setCurrentPhase("evaluating");
148
- } else if (recentLogs.some((log) => log.includes("Initialization"))) {
149
- setCurrentPhase("preparing");
150
- }
151
- }, [evaluationLogs, completedSteps, activeStep, sessionId, onComplete]);
152
-
153
  // Format elapsed time as HH:MM:SS
154
  const formatElapsedTime = () => {
155
  const hours = Math.floor(elapsedTime / 3600);
@@ -170,13 +94,6 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
170
  return;
171
  }
172
 
173
- setEvaluating(true);
174
- setEvaluationLogs([]);
175
- setError(null);
176
- setCurrentPhase("initializing");
177
- setCompletedSteps([]);
178
- setActiveStep(0);
179
-
180
  try {
181
  // Call API to start evaluation
182
  const response = await fetch("http://localhost:3001/evaluate-benchmark", {
@@ -192,34 +109,15 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
192
  const result = await response.json();
193
 
194
  if (response.ok) {
195
- setEvaluationLogs(result.logs || []);
196
-
197
- // Set up polling to retrieve more logs
198
  pollingIntervalRef.current = setInterval(async () => {
199
- // Check if we're already done
200
- if (evaluationComplete) {
201
- clearInterval(pollingIntervalRef.current);
202
- return;
203
- }
204
-
205
  try {
206
- // Call API to get latest logs
207
  const logsResponse = await fetch(
208
  `http://localhost:3001/evaluation-logs/${sessionId}`
209
  );
210
 
211
  if (logsResponse.ok) {
212
  const logsResult = await logsResponse.json();
213
-
214
- // Update logs if there are new ones
215
- if (
216
- logsResult.logs &&
217
- logsResult.logs.length > evaluationLogs.length
218
- ) {
219
- setEvaluationLogs(logsResult.logs);
220
- }
221
-
222
- // Check if evaluation is complete
223
  if (logsResult.is_completed) {
224
  setEvaluationComplete(true);
225
  clearInterval(pollingIntervalRef.current);
@@ -227,71 +125,17 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
227
  }
228
  } catch (error) {
229
  console.log("Error polling logs:", error);
230
- // Don't stop polling on network errors
231
  }
232
- }, 2000); // Poll every 2 seconds
233
  } else {
234
- // Handle error
235
- setEvaluationLogs([`Error: ${result.error || "Unknown error"}`]);
236
  setError(result.error || "Benchmark evaluation failed");
237
  }
238
  } catch (error) {
239
  console.error("Error starting evaluation:", error);
240
- setEvaluationLogs([`Error: ${error.message || "Unknown error"}`]);
241
  setError("Error connecting to server");
242
- } finally {
243
- setEvaluating(false);
244
- }
245
- };
246
-
247
- // Get title based on current phase
248
- const getPhaseTitle = () => {
249
- switch (currentPhase) {
250
- case "initializing":
251
- return "Preparing evaluation...";
252
- case "preparing":
253
- return "Preparing models...";
254
- case "evaluating":
255
- return "Evaluating models...";
256
- case "compiling_results":
257
- return "Compiling results...";
258
- case "complete":
259
- return "Evaluation completed successfully!";
260
- default:
261
- return "Processing...";
262
  }
263
  };
264
 
265
- // Get current step info for display
266
- const getCurrentStepInfo = () => {
267
- const totalSteps = EVALUATION_STEPS.length;
268
- const currentStepIndex = activeStep;
269
-
270
- // If no active step yet
271
- if (currentStepIndex === 0 && completedSteps.length === 0) {
272
- return `Starting... (0%)`;
273
- }
274
-
275
- // If all steps completed
276
- if (currentStepIndex >= totalSteps) {
277
- return `Completed (100%)`;
278
- }
279
-
280
- // Calculate percentage
281
- const percentage = Math.round((currentStepIndex / totalSteps) * 100);
282
-
283
- // Get current step name
284
- const currentStepName =
285
- STEP_LABELS[EVALUATION_STEPS[currentStepIndex]] || "Processing";
286
-
287
- return `${currentStepName} (${percentage}%)`;
288
- };
289
-
290
- // Function to navigate to results page
291
- const viewResults = () => {
292
- navigate(`/evaluation-display?session=${sessionId}`);
293
- };
294
-
295
  return (
296
  <Paper
297
  elevation={3}
@@ -313,29 +157,19 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
313
  ) : (
314
  <>
315
  {evaluationComplete ? (
316
- <>
317
- <Alert severity="success" sx={{ width: "100%", mb: 3 }}>
318
- Evaluation completed successfully!
319
- </Alert>
320
- <Button
321
- variant="contained"
322
- color="primary"
323
- onClick={viewResults}
324
- sx={{ mb: 3 }}
325
- >
326
- View Results Leaderboard
327
- </Button>
328
- </>
329
  ) : (
330
  <>
331
  <CircularProgress size={60} sx={{ mb: 2 }} />
332
  <Typography variant="h6" component="div" gutterBottom>
333
- {getPhaseTitle()}
334
  </Typography>
335
 
336
  {/* Step progress indicator */}
337
  <Typography variant="body1" color="text.secondary">
338
- {getCurrentStepInfo()}
339
  </Typography>
340
 
341
  {/* Timer display */}
@@ -354,9 +188,6 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
354
  )}
355
  </>
356
  )}
357
-
358
- {/* Use the LogDisplay component for logs */}
359
- <LogDisplay logs={evaluationLogs} height={150} />
360
  </Paper>
361
  );
362
  };
 
1
  import React, { useState, useEffect, useRef } from "react";
2
+ import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
 
 
 
 
 
 
 
 
 
 
3
  import { useNavigate } from "react-router-dom";
4
 
5
/**
 * Status messages shown one after another while the benchmark evaluation runs.
 *
 * Each entry pairs the text to display (`message`) with the percentage that is
 * rendered in parentheses next to it (`progress`). The entries carry no timing
 * information; how often the displayed entry changes is decided by whoever
 * indexes into this list.
 *
 * The list and its entries are frozen: this is shared module-level data that
 * the visible code only reads, so accidental mutation should fail loudly.
 */
const STARTING_MESSAGES = Object.freeze(
  [
    { message: "Initializing evaluation environment...", progress: 22 },
    { message: "Starting evaluation process...", progress: 54 },
    { message: "Evaluating models...", progress: 71 },
    { message: "Storing evaluation results...", progress: 100 },
  ].map((entry) => Object.freeze(entry))
);
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
 
14
  const [evaluationComplete, setEvaluationComplete] = useState(false);
 
15
  const [error, setError] = useState(null);
 
 
 
16
  const [elapsedTime, setElapsedTime] = useState(0);
17
+ const [startingMessageIndex, setStartingMessageIndex] = useState(0);
18
 
 
19
  const timerIntervalRef = useRef(null);
20
  const startTimeRef = useRef(null);
21
+ const startingMessageIntervalRef = useRef(null);
22
+ const pollingIntervalRef = useRef(null);
23
 
24
  const navigate = useNavigate();
25
 
26
+ // Add effect to handle automatic redirection when evaluation is complete
27
+ useEffect(() => {
28
+ if (evaluationComplete) {
29
+ navigate(`/evaluation-display?session=${sessionId}`);
30
+ }
31
+ }, [evaluationComplete, sessionId, navigate]);
32
+
33
+ // Add effect to handle starting messages
34
+ useEffect(() => {
35
+ startingMessageIntervalRef.current = setInterval(() => {
36
+ setStartingMessageIndex((prev) => {
37
+ if (prev < STARTING_MESSAGES.length - 1) {
38
+ return prev + 1;
39
+ }
40
+ return prev;
41
+ });
42
+ }, 20000); // Change message every 20 seconds
43
+
44
+ return () => {
45
+ if (startingMessageIntervalRef.current) {
46
+ clearInterval(startingMessageIntervalRef.current);
47
+ }
48
+ };
49
+ }, []);
50
+
51
  // Start evaluation when component mounts
52
  useEffect(() => {
53
  // Set start time
 
74
  };
75
  }, []);
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  // Format elapsed time as HH:MM:SS
78
  const formatElapsedTime = () => {
79
  const hours = Math.floor(elapsedTime / 3600);
 
94
  return;
95
  }
96
 
 
 
 
 
 
 
 
97
  try {
98
  // Call API to start evaluation
99
  const response = await fetch("http://localhost:3001/evaluate-benchmark", {
 
109
  const result = await response.json();
110
 
111
  if (response.ok) {
112
+ // Set up polling to check completion
 
 
113
  pollingIntervalRef.current = setInterval(async () => {
 
 
 
 
 
 
114
  try {
 
115
  const logsResponse = await fetch(
116
  `http://localhost:3001/evaluation-logs/${sessionId}`
117
  );
118
 
119
  if (logsResponse.ok) {
120
  const logsResult = await logsResponse.json();
 
 
 
 
 
 
 
 
 
 
121
  if (logsResult.is_completed) {
122
  setEvaluationComplete(true);
123
  clearInterval(pollingIntervalRef.current);
 
125
  }
126
  } catch (error) {
127
  console.log("Error polling logs:", error);
 
128
  }
129
+ }, 2000);
130
  } else {
 
 
131
  setError(result.error || "Benchmark evaluation failed");
132
  }
133
  } catch (error) {
134
  console.error("Error starting evaluation:", error);
 
135
  setError("Error connecting to server");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  }
137
  };
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  return (
140
  <Paper
141
  elevation={3}
 
157
  ) : (
158
  <>
159
  {evaluationComplete ? (
160
+ <Alert severity="success" sx={{ width: "100%", mb: 3 }}>
161
+ Evaluation completed successfully!
162
+ </Alert>
 
 
 
 
 
 
 
 
 
 
163
  ) : (
164
  <>
165
  <CircularProgress size={60} sx={{ mb: 2 }} />
166
  <Typography variant="h6" component="div" gutterBottom>
167
+ Benchmark evaluation...
168
  </Typography>
169
 
170
  {/* Step progress indicator */}
171
  <Typography variant="body1" color="text.secondary">
172
+ {`${STARTING_MESSAGES[startingMessageIndex].message} (${STARTING_MESSAGES[startingMessageIndex].progress}%)`}
173
  </Typography>
174
 
175
  {/* Timer display */}
 
188
  )}
189
  </>
190
  )}
 
 
 
191
  </Paper>
192
  );
193
  };
frontend/src/components/BenchmarkGenerator.jsx CHANGED
@@ -288,7 +288,7 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
288
  case "initializing":
289
  return "Benchmark generation...";
290
  case "configuring":
291
- return "Generating configuration file...";
292
  case "benchmarking":
293
  return "Creating benchmark...";
294
  case "complete":
@@ -390,7 +390,7 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
390
  )}
391
 
392
  {/* Use the LogDisplay component */}
393
- <LogDisplay logs={generationLogs} height={150} />
394
  </Paper>
395
  );
396
  };
 
288
  case "initializing":
289
  return "Benchmark generation...";
290
  case "configuring":
291
+ return "Creating benchmark...";
292
  case "benchmarking":
293
  return "Creating benchmark...";
294
  case "complete":
 
390
  )}
391
 
392
  {/* Use the LogDisplay component */}
393
+ {/* <LogDisplay logs={generationLogs} height={150} /> */}
394
  </Paper>
395
  );
396
  };
frontend/src/components/EvaluationDisplay.jsx CHANGED
@@ -10,7 +10,7 @@ import {
10
  TableHead,
11
  TableRow,
12
  Alert,
13
- LinearProgress,
14
  Card,
15
  CardContent,
16
  Link,
@@ -70,11 +70,20 @@ const EvaluationDisplay = ({ sessionId }) => {
70
 
71
  if (loading) {
72
  return (
73
- <Box sx={{ width: "100%", mt: 4, mb: 4 }}>
 
 
 
 
 
 
 
 
 
74
  <Typography variant="h5" gutterBottom>
75
  Loading Evaluation Results...
76
  </Typography>
77
- <LinearProgress />
78
  </Box>
79
  );
80
  }
@@ -127,50 +136,45 @@ const EvaluationDisplay = ({ sessionId }) => {
127
  </TableRow>
128
  </TableHead>
129
  <TableBody>
130
- {results.models_comparison.map((model, index) => (
131
- <TableRow
132
- key={`${model.model_name}-${model.provider}`}
133
- sx={{
134
- "&:last-child td, &:last-child th": { border: 0 },
135
- backgroundColor: model.success
136
- ? "inherit"
137
- : "rgba(0, 0, 0, 0.04)",
138
- }}
139
- >
140
- <TableCell>{index + 1}</TableCell>
141
- <TableCell component="th" scope="row">
142
- <Link
143
- href={`https://huggingface.co/${model.model_name}`}
144
- target="_blank"
145
- rel="noopener noreferrer"
146
- sx={{
147
- textDecoration: "none",
148
- "&:hover": {
149
- textDecoration: "underline",
150
- },
151
- display: "flex",
152
- alignItems: "center",
153
- }}
154
- >
155
- {model.model_name}
156
- <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
157
- </Link>
158
- </TableCell>
159
- <TableCell align="center">
160
- {model.success ? formatAccuracy(model.accuracy) : "-"}
161
- </TableCell>
162
- <TableCell align="center">
163
- {model.success ? formatTime(model.evaluation_time) : "-"}
164
- </TableCell>
165
- <TableCell align="center">
166
- {model.success ? (
167
  <span style={{ color: "green" }}>✓ Success</span>
168
- ) : (
169
- <span style={{ color: "red" }}>✗ Failed</span>
170
- )}
171
- </TableCell>
172
- </TableRow>
173
- ))}
174
  </TableBody>
175
  </Table>
176
  </TableContainer>
 
10
  TableHead,
11
  TableRow,
12
  Alert,
13
+ CircularProgress,
14
  Card,
15
  CardContent,
16
  Link,
 
70
 
71
  if (loading) {
72
  return (
73
+ <Box
74
+ sx={{
75
+ width: "100%",
76
+ mt: 4,
77
+ mb: 4,
78
+ display: "flex",
79
+ flexDirection: "column",
80
+ alignItems: "center",
81
+ }}
82
+ >
83
  <Typography variant="h5" gutterBottom>
84
  Loading Evaluation Results...
85
  </Typography>
86
+ <CircularProgress />
87
  </Box>
88
  );
89
  }
 
136
  </TableRow>
137
  </TableHead>
138
  <TableBody>
139
+ {results.models_comparison
140
+ .filter((model) => model.success)
141
+ .map((model, index) => (
142
+ <TableRow
143
+ key={`${model.model_name}-${model.provider}`}
144
+ sx={{
145
+ "&:last-child td, &:last-child th": { border: 0 },
146
+ }}
147
+ >
148
+ <TableCell>{index + 1}</TableCell>
149
+ <TableCell component="th" scope="row">
150
+ <Link
151
+ href={`https://huggingface.co/${model.model_name}`}
152
+ target="_blank"
153
+ rel="noopener noreferrer"
154
+ sx={{
155
+ textDecoration: "none",
156
+ "&:hover": {
157
+ textDecoration: "underline",
158
+ },
159
+ display: "flex",
160
+ alignItems: "center",
161
+ }}
162
+ >
163
+ {model.model_name}
164
+ <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
165
+ </Link>
166
+ </TableCell>
167
+ <TableCell align="center">
168
+ {formatAccuracy(model.accuracy)}
169
+ </TableCell>
170
+ <TableCell align="center">
171
+ {formatTime(model.evaluation_time)}
172
+ </TableCell>
173
+ <TableCell align="center">
 
 
174
  <span style={{ color: "green" }}>✓ Success</span>
175
+ </TableCell>
176
+ </TableRow>
177
+ ))}
 
 
 
178
  </TableBody>
179
  </Table>
180
  </TableContainer>
frontend/src/components/ExternalLinks.jsx CHANGED
@@ -1,16 +1,31 @@
1
  import React from "react";
2
- import { Box, Typography } from "@mui/material";
3
  import OpenInNewIcon from "@mui/icons-material/OpenInNew";
 
4
 
5
  const ExternalLinks = () => {
 
 
 
 
 
 
 
 
 
 
 
 
6
  return (
7
  <Box
8
  sx={{
9
  position: "fixed",
10
  top: 24,
 
11
  right: 24,
 
12
  display: "flex",
13
- gap: 2,
14
  alignItems: "center",
15
  zIndex: 1000,
16
  }}
@@ -57,6 +72,22 @@ const ExternalLinks = () => {
57
  <OpenInNewIcon sx={{ fontSize: "0.75rem", ml: 0.5, opacity: 0.6 }} />
58
  </a>
59
  </Typography>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  </Box>
61
  );
62
  };
 
1
  import React from "react";
2
+ import { Box, Typography, IconButton, Tooltip } from "@mui/material";
3
  import OpenInNewIcon from "@mui/icons-material/OpenInNew";
4
+ import ShareIcon from "@mui/icons-material/Share";
5
 
6
  const ExternalLinks = () => {
7
/**
 * Share the current page.
 *
 * Uses the Web Share API when the browser provides it. Where it is missing,
 * falls back to copying the page URL to the clipboard so the button still does
 * something useful. Never throws: failures are logged and swallowed because
 * this runs as a click handler.
 *
 * @returns {Promise<void>} Resolves once the share attempt has finished.
 */
const handleShare = async () => {
  const shareData = {
    title: "YourBench Demo",
    text: "Check out this benchmark evaluation on YourBench!",
    url: window.location.href,
  };

  try {
    if (typeof navigator.share === "function") {
      await navigator.share(shareData);
      return;
    }

    // No Web Share API in this browser: copy the link instead.
    if (
      navigator.clipboard &&
      typeof navigator.clipboard.writeText === "function"
    ) {
      await navigator.clipboard.writeText(shareData.url);
      return;
    }

    console.log(
      "Error sharing:",
      new Error("Neither the Web Share API nor the clipboard is available")
    );
  } catch (err) {
    // The user closing the share sheet rejects with AbortError; that is a
    // normal outcome, not a failure worth reporting.
    if (err && err.name === "AbortError") {
      return;
    }
    console.log("Error sharing:", err);
  }
};
18
+
19
  return (
20
  <Box
21
  sx={{
22
  position: "fixed",
23
  top: 24,
24
+ left: 24,
25
  right: 24,
26
+ margin: "auto",
27
  display: "flex",
28
+ justifyContent: "space-between",
29
  alignItems: "center",
30
  zIndex: 1000,
31
  }}
 
72
  <OpenInNewIcon sx={{ fontSize: "0.75rem", ml: 0.5, opacity: 0.6 }} />
73
  </a>
74
  </Typography>
75
+ <Tooltip title="Share">
76
+ <IconButton
77
+ onClick={handleShare}
78
+ size="small"
79
+ sx={{
80
+ ml: 1,
81
+ color: "inherit",
82
+ opacity: 0.7,
83
+ "&:hover": {
84
+ opacity: 1,
85
+ },
86
+ }}
87
+ >
88
+ <ShareIcon fontSize="small" />
89
+ </IconButton>
90
+ </Tooltip>
91
  </Box>
92
  );
93
  };
frontend/src/config/theme.js CHANGED
@@ -375,7 +375,7 @@ const getDesignTokens = (mode) => ({
375
  values: {
376
  xs: 0,
377
  sm: 600,
378
- md: 900,
379
  lg: 1240,
380
  xl: 1536,
381
  },
 
375
  values: {
376
  xs: 0,
377
  sm: 600,
378
+ md: 1100,
379
  lg: 1240,
380
  xl: 1536,
381
  },
test_import.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Smoke check: report whether the lighteval_task module can be imported.
try:
    import lighteval_task
except ImportError as e:
    # Import failed: print the reason instead of raising.
    print(f"Erreur: {e}")
else:
    print("lighteval_task importé avec succès!")