Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
update on tasks
Browse files- backend/data/lighteval_results/lighteval_results.json +30 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json +121 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json +121 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json +121 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json +121 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json +121 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json +121 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json +121 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json +121 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json +121 -0
- backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json +121 -0
- backend/lighteval_task/__init__.py +3 -0
- backend/{tasks/yourbench_lighteval_task.py → lighteval_task/lighteval_task.py} +36 -10
- backend/pyproject.toml +6 -0
- backend/routes/evaluation.py +42 -31
- backend/tasks/createBench.py +1 -83
- backend/tasks/createBenchConfigFile.py +4 -4
- backend/tasks/evaluationTask.py +144 -405
- backend/tasks/get_model_providers.py +29 -0
- backend/test_import.py +5 -0
- backend/yourbench_simple_demo.egg-info/PKG-INFO +18 -0
- backend/yourbench_simple_demo.egg-info/SOURCES.txt +17 -0
- backend/yourbench_simple_demo.egg-info/dependency_links.txt +1 -0
- backend/yourbench_simple_demo.egg-info/requires.txt +13 -0
- backend/yourbench_simple_demo.egg-info/top_level.txt +1 -0
- frontend/src/components/BenchmarkDisplay.jsx +24 -21
- frontend/src/components/BenchmarkEvaluation.jsx +42 -211
- frontend/src/components/BenchmarkGenerator.jsx +2 -2
- frontend/src/components/EvaluationDisplay.jsx +50 -46
- frontend/src/components/ExternalLinks.jsx +33 -2
- frontend/src/config/theme.js +1 -1
- test_import.py +5 -0
backend/data/lighteval_results/lighteval_results.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
4 |
+
"provider": "sambanova",
|
5 |
+
"accuracy": 1.0,
|
6 |
+
"execution_time": 18.800472021102905,
|
7 |
+
"status": "success"
|
8 |
+
},
|
9 |
+
{
|
10 |
+
"model": "deepseek-ai/DeepSeek-V3-0324",
|
11 |
+
"provider": "novita",
|
12 |
+
"accuracy": 1.0,
|
13 |
+
"execution_time": 34.95434904098511,
|
14 |
+
"status": "success"
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"model": "Qwen/Qwen2.5-72B-Instruct",
|
18 |
+
"provider": "sambanova",
|
19 |
+
"accuracy": 0.0,
|
20 |
+
"execution_time": 60.0,
|
21 |
+
"status": "timeout"
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"model": "Qwen/QwQ-32B",
|
25 |
+
"provider": "sambanova",
|
26 |
+
"accuracy": 0.0,
|
27 |
+
"execution_time": 60.0,
|
28 |
+
"status": "timeout"
|
29 |
+
}
|
30 |
+
]
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 186274.866411583,
|
9 |
+
"end_time": 186322.987643416,
|
10 |
+
"total_evaluation_time_secondes": "48.12123183300719",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 186407.701185,
|
9 |
+
"end_time": 186447.883386625,
|
10 |
+
"total_evaluation_time_secondes": "40.18220162499347",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 186521.763833833,
|
9 |
+
"end_time": 186557.476439666,
|
10 |
+
"total_evaluation_time_secondes": "35.71260583298863",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 186704.883209333,
|
9 |
+
"end_time": 186743.215716791,
|
10 |
+
"total_evaluation_time_secondes": "38.332507457991596",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 187518.49620975,
|
9 |
+
"end_time": 187553.120908083,
|
10 |
+
"total_evaluation_time_secondes": "34.62469833297655",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 187690.771319041,
|
9 |
+
"end_time": 187724.908132583,
|
10 |
+
"total_evaluation_time_secondes": "34.136813541990705",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 187785.492066916,
|
9 |
+
"end_time": 187824.287589375,
|
10 |
+
"total_evaluation_time_secondes": "38.79552245899686",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 188300.087538958,
|
9 |
+
"end_time": 188337.230208583,
|
10 |
+
"total_evaluation_time_secondes": "37.142669624998234",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "7e34d82512ce6dfc",
|
96 |
+
"hash_full_prompts": "af7c42c6f40964e1",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "7cdb142c3142312a",
|
111 |
+
"hash_full_prompts": "a2e47b0b68e57792",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 188452.784089458,
|
9 |
+
"end_time": 188490.538178958,
|
10 |
+
"total_evaluation_time_secondes": "37.75408949999837",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 15,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 188674.734532375,
|
9 |
+
"end_time": 188715.337919458,
|
10 |
+
"total_evaluation_time_secondes": "40.60338708298514",
|
11 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 15,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "35f5eef8199d4521",
|
96 |
+
"hash_full_prompts": "5590bc220414fefb",
|
97 |
+
"hash_input_tokens": "58ec870775e406f3",
|
98 |
+
"hash_cont_tokens": "58ec870775e406f3"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 15,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 15,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "bc7dfdffc5e53476",
|
111 |
+
"hash_full_prompts": "712fd00df902d786",
|
112 |
+
"hash_input_tokens": "544d800a25dfd777",
|
113 |
+
"hash_cont_tokens": "544d800a25dfd777"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 15,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 15,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 15,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 188674.734510208,
|
9 |
+
"end_time": 188690.205653,
|
10 |
+
"total_evaluation_time_secondes": "15.471142791997408",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 15,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "35f5eef8199d4521",
|
96 |
+
"hash_full_prompts": "5590bc220414fefb",
|
97 |
+
"hash_input_tokens": "58ec870775e406f3",
|
98 |
+
"hash_cont_tokens": "58ec870775e406f3"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 15,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 15,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "bc7dfdffc5e53476",
|
111 |
+
"hash_full_prompts": "712fd00df902d786",
|
112 |
+
"hash_input_tokens": "544d800a25dfd777",
|
113 |
+
"hash_cont_tokens": "544d800a25dfd777"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 15,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 15,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 186274.866369916,
|
9 |
+
"end_time": 186294.792813083,
|
10 |
+
"total_evaluation_time_secondes": "19.926443167001707",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 186407.701222875,
|
9 |
+
"end_time": 186427.871588083,
|
10 |
+
"total_evaluation_time_secondes": "20.170365208003204",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 186521.763754958,
|
9 |
+
"end_time": 186545.585271583,
|
10 |
+
"total_evaluation_time_secondes": "23.821516625001095",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 186704.882684291,
|
9 |
+
"end_time": 186723.820615833,
|
10 |
+
"total_evaluation_time_secondes": "18.937931542022852",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 187518.496174916,
|
9 |
+
"end_time": 187538.752125166,
|
10 |
+
"total_evaluation_time_secondes": "20.255950249993475",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 187690.771119125,
|
9 |
+
"end_time": 187715.172306583,
|
10 |
+
"total_evaluation_time_secondes": "24.40118745798827",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 187785.49207775,
|
9 |
+
"end_time": 187806.982701541,
|
10 |
+
"total_evaluation_time_secondes": "21.4906237910036",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 188300.087685291,
|
9 |
+
"end_time": 188324.829042291,
|
10 |
+
"total_evaluation_time_secondes": "24.7413570000208",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "7e34d82512ce6dfc",
|
96 |
+
"hash_full_prompts": "af7c42c6f40964e1",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "7cdb142c3142312a",
|
111 |
+
"hash_full_prompts": "a2e47b0b68e57792",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 5,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 188452.784059833,
|
9 |
+
"end_time": 188474.450274291,
|
10 |
+
"total_evaluation_time_secondes": "21.666214458004106",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 5,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "abaa6ef1f9715482",
|
96 |
+
"hash_full_prompts": "0b5eb6607b419659",
|
97 |
+
"hash_input_tokens": "bf9d9e969418cff7",
|
98 |
+
"hash_cont_tokens": "bf9d9e969418cff7"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 5,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 5,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b1bf475c2319e3b2",
|
111 |
+
"hash_full_prompts": "d860f90cd7291b63",
|
112 |
+
"hash_input_tokens": "5882dac673b9f859",
|
113 |
+
"hash_cont_tokens": "5882dac673b9f859"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 5,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 5,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 15,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 188674.734458958,
|
9 |
+
"end_time": 188711.276019958,
|
10 |
+
"total_evaluation_time_secondes": "36.54156099999091",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
|
51 |
+
"hf_subset": "single_shot_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 15,
|
87 |
+
"effective_num_docs": 15,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "35f5eef8199d4521",
|
96 |
+
"hash_full_prompts": "5590bc220414fefb",
|
97 |
+
"hash_input_tokens": "58ec870775e406f3",
|
98 |
+
"hash_cont_tokens": "58ec870775e406f3"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 15,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 15,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "bc7dfdffc5e53476",
|
111 |
+
"hash_full_prompts": "712fd00df902d786",
|
112 |
+
"hash_input_tokens": "544d800a25dfd777",
|
113 |
+
"hash_cont_tokens": "544d800a25dfd777"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 15,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 15,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/lighteval_task/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .lighteval_task import create_yourbench_task
|
2 |
+
|
3 |
+
__all__ = ["create_yourbench_task"]
|
backend/{tasks/yourbench_lighteval_task.py → lighteval_task/lighteval_task.py}
RENAMED
@@ -136,10 +136,26 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
|
|
136 |
|
137 |
|
138 |
def process_judge_response_yourbench(response):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
# extract the final answer using regex from the response xml
|
140 |
try:
|
141 |
# Essayer d'abord le format XML
|
142 |
-
match = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL)
|
143 |
if match:
|
144 |
answer_text = match.group(1).strip()
|
145 |
# Convertir différents formats possibles en 0 ou 1
|
@@ -155,14 +171,16 @@ def process_judge_response_yourbench(response):
|
|
155 |
pass
|
156 |
|
157 |
# Rechercher des mots-clés dans la réponse
|
158 |
-
if re.search(r"\b(correct|vrai|true|yes)\b", response, re.IGNORECASE):
|
159 |
return 1
|
160 |
-
if re.search(r"\b(incorrect|faux|false|no)\b", response, re.IGNORECASE):
|
161 |
return 0
|
162 |
|
163 |
-
logger.warning(f"Réponse du juge non reconnue, retournant 0 par défaut: {response[:100]}...")
|
164 |
except Exception as e:
|
165 |
logger.error(f"Error processing judge response: {e}")
|
|
|
|
|
166 |
return 0
|
167 |
|
168 |
|
@@ -185,10 +203,18 @@ class JudgeLLMYourBench(JudgeLLM):
|
|
185 |
chunks = [formatted_doc.specific["chunks"][0] for formatted_doc in formatted_docs]
|
186 |
documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
|
187 |
|
|
|
|
|
|
|
|
|
|
|
188 |
score, _, _ = self.judge.evaluate_answer_batch(
|
189 |
questions, predictions, options, golds, chunks=chunks, documents=documents
|
190 |
)
|
191 |
|
|
|
|
|
|
|
192 |
metrics = []
|
193 |
for i in range(len(sample_ids)):
|
194 |
metrics.append(
|
@@ -214,17 +240,17 @@ def yourbench_prompt(line, task_name: str = ""):
|
|
214 |
return Doc(
|
215 |
task_name=task_name,
|
216 |
query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"]),
|
217 |
-
choices=[line["
|
218 |
gold_index=0,
|
219 |
specific={
|
220 |
-
"question_category": line["
|
221 |
-
"kind":
|
222 |
"estimated_difficulty": line["estimated_difficulty"],
|
223 |
"document_id": line["document_id"],
|
224 |
-
"question_generating_model": line["
|
225 |
-
"chunks": line["
|
226 |
"question": line["question"],
|
227 |
-
"document": line["
|
228 |
},
|
229 |
)
|
230 |
|
|
|
136 |
|
137 |
|
138 |
def process_judge_response_yourbench(response):
|
139 |
+
# Si la réponse est un dictionnaire, extraire le contenu
|
140 |
+
if isinstance(response, dict):
|
141 |
+
if "content" in response:
|
142 |
+
response = response["content"]
|
143 |
+
elif "text" in response:
|
144 |
+
response = response["text"]
|
145 |
+
elif "response" in response:
|
146 |
+
response = response["response"]
|
147 |
+
else:
|
148 |
+
# Si on ne trouve pas de champ texte, on prend la première valeur
|
149 |
+
response = str(list(response.values())[0])
|
150 |
+
|
151 |
+
# Si la réponse est une liste, prendre le premier élément
|
152 |
+
if isinstance(response, list):
|
153 |
+
response = response[0]
|
154 |
+
|
155 |
# extract the final answer using regex from the response xml
|
156 |
try:
|
157 |
# Essayer d'abord le format XML
|
158 |
+
match = re.search(r"<final_answer>(.*?)</final_answer>", str(response), re.DOTALL)
|
159 |
if match:
|
160 |
answer_text = match.group(1).strip()
|
161 |
# Convertir différents formats possibles en 0 ou 1
|
|
|
171 |
pass
|
172 |
|
173 |
# Rechercher des mots-clés dans la réponse
|
174 |
+
if re.search(r"\b(correct|vrai|true|yes)\b", str(response), re.IGNORECASE):
|
175 |
return 1
|
176 |
+
if re.search(r"\b(incorrect|faux|false|no)\b", str(response), re.IGNORECASE):
|
177 |
return 0
|
178 |
|
179 |
+
logger.warning(f"Réponse du juge non reconnue, retournant 0 par défaut: {str(response)[:100]}...")
|
180 |
except Exception as e:
|
181 |
logger.error(f"Error processing judge response: {e}")
|
182 |
+
logger.error(f"Response type: {type(response)}")
|
183 |
+
logger.error(f"Response content: {response}")
|
184 |
return 0
|
185 |
|
186 |
|
|
|
203 |
chunks = [formatted_doc.specific["chunks"][0] for formatted_doc in formatted_docs]
|
204 |
documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
|
205 |
|
206 |
+
# Ajout de logs pour déboguer
|
207 |
+
logger.info(f"Questions: {questions}")
|
208 |
+
logger.info(f"Predictions: {predictions}")
|
209 |
+
logger.info(f"Golds: {golds}")
|
210 |
+
|
211 |
score, _, _ = self.judge.evaluate_answer_batch(
|
212 |
questions, predictions, options, golds, chunks=chunks, documents=documents
|
213 |
)
|
214 |
|
215 |
+
# Ajout de logs pour déboguer
|
216 |
+
logger.info(f"Scores: {score}")
|
217 |
+
|
218 |
metrics = []
|
219 |
for i in range(len(sample_ids)):
|
220 |
metrics.append(
|
|
|
240 |
return Doc(
|
241 |
task_name=task_name,
|
242 |
query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"]),
|
243 |
+
choices=[line["self_answer"]],
|
244 |
gold_index=0,
|
245 |
specific={
|
246 |
+
"question_category": line["self_assessed_question_type"],
|
247 |
+
"kind": "qa",
|
248 |
"estimated_difficulty": line["estimated_difficulty"],
|
249 |
"document_id": line["document_id"],
|
250 |
+
"question_generating_model": line["generating_model"],
|
251 |
+
"chunks": line["citations"],
|
252 |
"question": line["question"],
|
253 |
+
"document": line["raw_response"],
|
254 |
},
|
255 |
)
|
256 |
|
backend/pyproject.toml
CHANGED
@@ -20,6 +20,9 @@ dependencies = [
|
|
20 |
"lighteval[math]>=0.8.0",
|
21 |
"huggingface-hub>=0.22.0",
|
22 |
"python-multipart>=0.0.5",
|
|
|
|
|
|
|
23 |
]
|
24 |
|
25 |
[build-system]
|
@@ -46,3 +49,6 @@ quote-style = "double"
|
|
46 |
indent-style = "space"
|
47 |
skip-magic-trailing-comma = false
|
48 |
line-ending = "auto"
|
|
|
|
|
|
|
|
20 |
"lighteval[math]>=0.8.0",
|
21 |
"huggingface-hub>=0.22.0",
|
22 |
"python-multipart>=0.0.5",
|
23 |
+
"fastapi>=0.110.0",
|
24 |
+
"uvicorn>=0.29.0",
|
25 |
+
"pydantic>=2.6.0",
|
26 |
]
|
27 |
|
28 |
[build-system]
|
|
|
49 |
indent-style = "space"
|
50 |
skip-magic-trailing-comma = false
|
51 |
line-ending = "auto"
|
52 |
+
|
53 |
+
[tool.setuptools]
|
54 |
+
packages = ["lighteval_task"]
|
backend/routes/evaluation.py
CHANGED
@@ -2,6 +2,9 @@ from fastapi import APIRouter, HTTPException
|
|
2 |
from typing import Dict, Any
|
3 |
import os
|
4 |
from tasks.evaluationTask import EvaluationTask
|
|
|
|
|
|
|
5 |
|
6 |
router = APIRouter(tags=["evaluation"])
|
7 |
|
@@ -41,7 +44,7 @@ async def evaluate_benchmark(data: Dict[str, Any]):
|
|
41 |
|
42 |
try:
|
43 |
# Nom du dataset basé sur l'ID de session
|
44 |
-
dataset_name = f"yourbench_{session_id}"
|
45 |
|
46 |
# Créer et démarrer une nouvelle tâche d'évaluation
|
47 |
evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
|
@@ -105,44 +108,52 @@ async def get_evaluation_results(session_id: str):
|
|
105 |
Returns:
|
106 |
Dictionary with evaluation results
|
107 |
"""
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
}
|
117 |
-
|
118 |
-
if hasattr(evaluation_task, 'results') and evaluation_task.results:
|
119 |
return {
|
120 |
"success": True,
|
121 |
-
"results":
|
122 |
}
|
123 |
-
|
124 |
-
# If we get here, either the task is not in memory or it doesn't have results
|
125 |
-
# Try to load results from file
|
126 |
-
try:
|
127 |
-
# Construct the path to the results file
|
128 |
-
results_path = f"uploaded_files/{session_id}/lighteval_results/models_comparison.json"
|
129 |
-
|
130 |
-
# Check if the file exists
|
131 |
-
if not os.path.exists(results_path):
|
132 |
return {
|
133 |
"success": False,
|
134 |
-
"message": "
|
135 |
}
|
136 |
-
|
137 |
-
# Read the file
|
138 |
-
import json
|
139 |
-
with open(results_path, 'r') as f:
|
140 |
-
results = json.load(f)
|
141 |
-
|
142 |
-
return {
|
143 |
-
"success": True,
|
144 |
-
"results": results
|
145 |
-
}
|
146 |
except Exception as e:
|
147 |
return {
|
148 |
"success": False,
|
|
|
2 |
from typing import Dict, Any
|
3 |
import os
|
4 |
from tasks.evaluationTask import EvaluationTask
|
5 |
+
from huggingface_hub import hf_hub_download
|
6 |
+
import json
|
7 |
+
from datetime import datetime
|
8 |
|
9 |
router = APIRouter(tags=["evaluation"])
|
10 |
|
|
|
44 |
|
45 |
try:
|
46 |
# Nom du dataset basé sur l'ID de session
|
47 |
+
dataset_name = f"yourbench/yourbench_{session_id}"
|
48 |
|
49 |
# Créer et démarrer une nouvelle tâche d'évaluation
|
50 |
evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
|
|
|
108 |
Returns:
|
109 |
Dictionary with evaluation results
|
110 |
"""
|
111 |
+
try:
|
112 |
+
# Get organization from environment
|
113 |
+
organization = os.getenv("HF_ORGANIZATION", "yourbench")
|
114 |
+
dataset_name = f"{organization}/yourbench_{session_id}"
|
115 |
|
116 |
+
# Try to load results from the Hub
|
117 |
+
try:
|
118 |
+
results_file = hf_hub_download(
|
119 |
+
repo_id=dataset_name,
|
120 |
+
repo_type="dataset",
|
121 |
+
filename="lighteval_results.json"
|
122 |
+
)
|
123 |
+
|
124 |
+
with open(results_file) as f:
|
125 |
+
results = json.load(f)
|
126 |
+
|
127 |
+
# Format results to match the expected format
|
128 |
+
formatted_results = {
|
129 |
+
"metadata": {
|
130 |
+
"timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
131 |
+
"total_models_tested": len(results),
|
132 |
+
"successful_tests": len([r for r in results if r["status"] == "success"])
|
133 |
+
},
|
134 |
+
"models_comparison": [
|
135 |
+
{
|
136 |
+
"model_name": result["model"],
|
137 |
+
"provider": result["provider"],
|
138 |
+
"success": result["status"] == "success",
|
139 |
+
"accuracy": result["accuracy"],
|
140 |
+
"evaluation_time": result["execution_time"],
|
141 |
+
"error": result["status"] if result["status"] != "success" else None
|
142 |
+
}
|
143 |
+
for result in results
|
144 |
+
]
|
145 |
}
|
146 |
+
|
|
|
147 |
return {
|
148 |
"success": True,
|
149 |
+
"results": formatted_results
|
150 |
}
|
151 |
+
except Exception as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
return {
|
153 |
"success": False,
|
154 |
+
"message": f"Failed to load results from Hub: {str(e)}"
|
155 |
}
|
156 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
except Exception as e:
|
158 |
return {
|
159 |
"success": False,
|
backend/tasks/createBench.py
CHANGED
@@ -234,86 +234,4 @@ class CreateBenchTask:
|
|
234 |
except Exception as e:
|
235 |
self._add_log(f"[ERROR] Error starting ingestion process: {str(e)}")
|
236 |
self.is_completed = True
|
237 |
-
|
238 |
-
def _simulate_ingestion_process(self) -> None:
|
239 |
-
"""
|
240 |
-
Simulate the ingestion process for testing/development
|
241 |
-
This will be removed in production
|
242 |
-
"""
|
243 |
-
# This method is just to simulate logs during development
|
244 |
-
# It will be removed in production
|
245 |
-
|
246 |
-
threading.Thread(target=self._simulate_logs).start()
|
247 |
-
|
248 |
-
def _simulate_logs(self) -> None:
|
249 |
-
"""
|
250 |
-
Simulate logs for testing/development
|
251 |
-
This will be used when yourbench isn't installed or in development mode
|
252 |
-
"""
|
253 |
-
# Log simulation (used when yourbench is not available)
|
254 |
-
self._add_log("[INFO] Simulation mode enabled (yourbench is not actually running)")
|
255 |
-
|
256 |
-
# Get filenames from source directory
|
257 |
-
source_files = []
|
258 |
-
try:
|
259 |
-
with open(self.config_path, 'r') as f:
|
260 |
-
config_yaml = yaml.safe_load(f)
|
261 |
-
|
262 |
-
source_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("source_documents_dir", "")
|
263 |
-
if source_dir and os.path.exists(source_dir):
|
264 |
-
source_files = [f for f in os.listdir(source_dir)
|
265 |
-
if os.path.isfile(os.path.join(source_dir, f))]
|
266 |
-
except Exception:
|
267 |
-
source_files = ["document.pdf", "document.txt"] # Fallback
|
268 |
-
|
269 |
-
# Create output directory if it doesn't exist
|
270 |
-
output_dir = ""
|
271 |
-
try:
|
272 |
-
output_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("output_dir", "")
|
273 |
-
if output_dir:
|
274 |
-
os.makedirs(output_dir, exist_ok=True)
|
275 |
-
except Exception:
|
276 |
-
pass
|
277 |
-
|
278 |
-
# Simulate file processing
|
279 |
-
time.sleep(1)
|
280 |
-
self._add_log("[INFO] Initializing document ingestion")
|
281 |
-
time.sleep(1.5)
|
282 |
-
self._add_log("[INFO] Loading configuration parameters")
|
283 |
-
time.sleep(1)
|
284 |
-
self._add_log("[INFO] Verifying source files")
|
285 |
-
|
286 |
-
# Process each file
|
287 |
-
for file in source_files:
|
288 |
-
time.sleep(1.5)
|
289 |
-
self._add_log(f"[INFO] Processing file: {file}")
|
290 |
-
time.sleep(2)
|
291 |
-
self._add_log(f"[INFO] Extracting content from {file}")
|
292 |
-
time.sleep(1.5)
|
293 |
-
self._add_log(f"[INFO] Converting to markdown: {file}")
|
294 |
-
|
295 |
-
# Create a simulated markdown file if an output directory is defined
|
296 |
-
if output_dir:
|
297 |
-
base_name = os.path.splitext(file)[0]
|
298 |
-
output_file = os.path.join(output_dir, f"{base_name}.md")
|
299 |
-
try:
|
300 |
-
with open(output_file, 'w') as f:
|
301 |
-
f.write(f"# {base_name}\n\n")
|
302 |
-
f.write("This is a markdown document automatically generated by the simulation.\n\n")
|
303 |
-
f.write("## Section 1\n\n")
|
304 |
-
f.write("Content of section 1...\n\n")
|
305 |
-
f.write("## Section 2\n\n")
|
306 |
-
f.write("Content of section 2...\n\n")
|
307 |
-
self._add_log(f"[INFO] Markdown file created: {output_file}")
|
308 |
-
except Exception as e:
|
309 |
-
self._add_log(f"[ERROR] Error creating markdown file: {str(e)}")
|
310 |
-
|
311 |
-
time.sleep(2)
|
312 |
-
self._add_log("[INFO] Finalizing processing")
|
313 |
-
time.sleep(1)
|
314 |
-
self._add_log("[SUCCESS] Stage completed: ingestion")
|
315 |
-
time.sleep(0.5)
|
316 |
-
self._add_log("[SUCCESS] Ingestion completed successfully")
|
317 |
-
|
318 |
-
# Mark task as completed
|
319 |
-
self.is_completed = True
|
|
|
234 |
except Exception as e:
|
235 |
self._add_log(f"[ERROR] Error starting ingestion process: {str(e)}")
|
236 |
self.is_completed = True
|
237 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/tasks/createBenchConfigFile.py
CHANGED
@@ -145,15 +145,15 @@ class CreateBenchConfigTask:
|
|
145 |
"tau_threshold": 0.8,
|
146 |
"h_min": 2,
|
147 |
"h_max": 5,
|
148 |
-
"num_multihops_factor":
|
149 |
},
|
150 |
},
|
151 |
"single_shot_question_generation": {
|
152 |
-
"run":
|
153 |
"additional_instructions": "Generate questions to test a curious adult",
|
154 |
"chunk_sampling": {
|
155 |
"mode": "count",
|
156 |
-
"value":
|
157 |
"random_seed": 123,
|
158 |
},
|
159 |
},
|
@@ -167,7 +167,7 @@ class CreateBenchConfigTask:
|
|
167 |
},
|
168 |
},
|
169 |
"lighteval": {
|
170 |
-
"run":
|
171 |
},
|
172 |
},
|
173 |
}
|
|
|
145 |
"tau_threshold": 0.8,
|
146 |
"h_min": 2,
|
147 |
"h_max": 5,
|
148 |
+
"num_multihops_factor": 1,
|
149 |
},
|
150 |
},
|
151 |
"single_shot_question_generation": {
|
152 |
+
"run": False,
|
153 |
"additional_instructions": "Generate questions to test a curious adult",
|
154 |
"chunk_sampling": {
|
155 |
"mode": "count",
|
156 |
+
"value": 10,
|
157 |
"random_seed": 123,
|
158 |
},
|
159 |
},
|
|
|
167 |
},
|
168 |
},
|
169 |
"lighteval": {
|
170 |
+
"run": False,
|
171 |
},
|
172 |
},
|
173 |
}
|
backend/tasks/evaluationTask.py
CHANGED
@@ -1,25 +1,22 @@
|
|
1 |
"""
|
2 |
-
Task to
|
3 |
"""
|
4 |
import os
|
5 |
-
import sys
|
6 |
-
import json
|
7 |
import time
|
|
|
8 |
import tempfile
|
9 |
-
import asyncio
|
10 |
-
import threading
|
11 |
from pathlib import Path
|
12 |
-
|
13 |
-
|
14 |
-
from
|
15 |
-
|
16 |
-
|
17 |
-
from tasks.
|
18 |
-
|
19 |
|
20 |
class EvaluationTask:
|
21 |
"""
|
22 |
-
Task to
|
23 |
"""
|
24 |
|
25 |
def __init__(self, session_uid: str, dataset_name: str):
|
@@ -32,440 +29,182 @@ class EvaluationTask:
|
|
32 |
"""
|
33 |
self.session_uid = session_uid
|
34 |
self.dataset_name = dataset_name
|
35 |
-
self.logs: List[str] = []
|
36 |
self.is_completed = False
|
37 |
-
self.
|
38 |
-
self.
|
39 |
-
|
40 |
-
|
41 |
-
# Models to evaluate - can be modified to allow customization
|
42 |
-
self.models = [
|
43 |
-
("Qwen/Qwen2.5-72B-Instruct", "novita"),
|
44 |
-
("Qwen/QwQ-32B", "novita"),
|
45 |
-
]
|
46 |
-
|
47 |
-
self._add_log("[INFO] Initializing evaluation task")
|
48 |
-
self._add_log(f"[INFO] Dataset to evaluate: {self.organization}/{dataset_name}")
|
49 |
-
self._add_log(f"[INFO] Output directory: {self.output_dir}")
|
50 |
-
|
51 |
-
def _add_log(self, message: str) -> None:
|
52 |
-
"""
|
53 |
-
Add a log message to the logs list
|
54 |
-
|
55 |
-
Args:
|
56 |
-
message: Log message to add
|
57 |
-
"""
|
58 |
-
if message not in self.logs: # Avoid duplicates
|
59 |
-
self.logs.append(message)
|
60 |
-
# Force copy of the list to avoid reference problems
|
61 |
-
self.logs = self.logs.copy()
|
62 |
-
# Record in system logs
|
63 |
-
logger.info(f"[{self.session_uid}] {message}")
|
64 |
-
|
65 |
-
def get_logs(self) -> List[str]:
|
66 |
-
"""
|
67 |
-
Get all logs for this task
|
68 |
-
|
69 |
-
Returns:
|
70 |
-
List of log messages
|
71 |
-
"""
|
72 |
-
return self.logs.copy() # Retourner une copie pour éviter les problèmes de référence
|
73 |
-
|
74 |
-
def is_task_completed(self) -> bool:
|
75 |
-
"""
|
76 |
-
Check if the task is completed
|
77 |
-
|
78 |
-
Returns:
|
79 |
-
True if completed, False otherwise
|
80 |
"""
|
81 |
-
|
82 |
-
|
83 |
-
async def _evaluate_model(self, model_info: Tuple[str, str]) -> Dict[str, Any]:
|
84 |
"""
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
os.makedirs(self.output_dir, exist_ok=True)
|
98 |
-
|
99 |
-
# Define full dataset path
|
100 |
-
dataset_path = f"{self.organization}/{self.dataset_name}"
|
101 |
|
102 |
-
# Create temporary file
|
103 |
temp_file_path = tempfile.mktemp(suffix=".py")
|
104 |
-
self._add_log(f"[INFO] Creating temporary file for {model_name}: {temp_file_path}")
|
105 |
-
|
106 |
with open(temp_file_path, 'w') as temp_file:
|
107 |
temp_file.write(f"""
|
108 |
-
import
|
109 |
-
import sys
|
110 |
-
sys.path.append("{os.getcwd()}")
|
111 |
-
|
112 |
-
from tasks.yourbench_lighteval_task import create_yourbench_task
|
113 |
|
114 |
# Create yourbench task
|
115 |
-
yourbench = create_yourbench_task("{
|
116 |
|
117 |
# Define TASKS_TABLE needed by lighteval
|
118 |
TASKS_TABLE = [yourbench]
|
119 |
""")
|
120 |
-
|
121 |
-
#
|
122 |
cmd_args = [
|
123 |
"lighteval",
|
124 |
-
"endpoint",
|
125 |
"inference-providers",
|
126 |
f"model={model_name},provider={provider}",
|
127 |
"custom|yourbench|0|0",
|
128 |
"--custom-tasks",
|
129 |
temp_file_path,
|
130 |
-
"--max-samples", "
|
131 |
-
"--output-dir",
|
132 |
-
"--save-details",
|
133 |
"--no-push-to-hub"
|
134 |
]
|
135 |
-
|
136 |
-
self._add_log(f"[INFO] Running command for {model_name}: {' '.join(cmd_args)}")
|
137 |
-
|
138 |
-
results = {
|
139 |
-
"model_name": model_name,
|
140 |
-
"provider": provider,
|
141 |
-
"success": False,
|
142 |
-
"error": None,
|
143 |
-
"results": None,
|
144 |
-
"return_code": None
|
145 |
-
}
|
146 |
-
|
147 |
try:
|
148 |
-
#
|
149 |
-
env
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
results
|
170 |
-
|
171 |
-
# Log output
|
172 |
-
if stdout:
|
173 |
-
stdout_lines = stdout.decode().strip().split('\n')
|
174 |
-
for line in stdout_lines[:5]: # Log only first 5 lines
|
175 |
-
self._add_log(f"[INFO] {model_name} - {line}")
|
176 |
-
|
177 |
-
# Log errors if any
|
178 |
-
if stderr and exit_code != 0:
|
179 |
-
stderr_lines = stderr.decode().strip().split('\n')
|
180 |
-
for line in stderr_lines[:5]: # Log only first 5 lines
|
181 |
-
self._add_log(f"[ERROR] {model_name} - {line}")
|
182 |
-
|
183 |
-
# Find any JSON result files - LightEval organizes by model name in different ways
|
184 |
-
result_files = []
|
185 |
-
results_dir = Path(self.output_dir) / "results"
|
186 |
-
if results_dir.exists():
|
187 |
-
# Parcourir récursivement tous les répertoires pour trouver des fichiers JSON
|
188 |
-
for json_file in results_dir.glob("**/*.json"):
|
189 |
-
# Check if the filename or path contains parts of the model name
|
190 |
-
model_parts = [
|
191 |
-
model_name, # Full name
|
192 |
-
model_name.replace('/', '_'), # Name with / replaced by _
|
193 |
-
model_name.split('/')[-1] # Just the model name without the organization
|
194 |
-
]
|
195 |
-
|
196 |
-
if any(part in str(json_file) for part in model_parts):
|
197 |
-
result_files.append(json_file)
|
198 |
-
|
199 |
-
# Traiter les fichiers de résultats trouvés
|
200 |
-
if result_files:
|
201 |
-
# Prendre le fichier le plus récent
|
202 |
-
result_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
|
203 |
-
latest_result = result_files[0]
|
204 |
-
self._add_log(f"[INFO] {model_name} - Found result file: {latest_result}")
|
205 |
-
|
206 |
-
try:
|
207 |
-
with open(latest_result, 'r') as f:
|
208 |
-
test_results = json.load(f)
|
209 |
-
|
210 |
-
# Vérifier si les résultats contiennent les informations essentielles
|
211 |
-
if (test_results and
|
212 |
-
isinstance(test_results, dict) and
|
213 |
-
"results" in test_results and
|
214 |
-
"all" in test_results["results"]):
|
215 |
-
|
216 |
-
# Enregistrer les résultats
|
217 |
-
results["results"] = test_results
|
218 |
-
results["success"] = True
|
219 |
-
|
220 |
-
# Afficher la précision
|
221 |
-
accuracy = test_results["results"]["all"]["accuracy"]
|
222 |
-
accuracy_stderr = test_results["results"]["all"]["accuracy_stderr"]
|
223 |
-
self._add_log(f"[SUCCESS] {model_name} - Accuracy: {accuracy:.4f} ± {accuracy_stderr:.4f}")
|
224 |
-
else:
|
225 |
-
results["error"] = "Incomplete or unexpected result format"
|
226 |
-
self._add_log(f"[WARNING] {model_name} - Unexpected result format")
|
227 |
-
|
228 |
-
except (json.JSONDecodeError, KeyError) as e:
|
229 |
-
results["error"] = f"Error reading results: {str(e)}"
|
230 |
-
self._add_log(f"[ERROR] {model_name} - {results['error']}")
|
231 |
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
|
|
|
|
241 |
except Exception as e:
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
async def _run_evaluations(self) -> List[Dict[str, Any]]:
|
254 |
"""
|
255 |
-
Run
|
256 |
|
257 |
Returns:
|
258 |
-
List of
|
259 |
"""
|
260 |
-
|
|
|
261 |
|
262 |
-
#
|
263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
|
265 |
-
#
|
266 |
-
|
267 |
|
268 |
-
#
|
269 |
-
|
270 |
-
for i, result in enumerate(model_results):
|
271 |
-
if isinstance(result, Exception):
|
272 |
-
# Handle exception
|
273 |
-
model_name, provider = self.models[i]
|
274 |
-
self._add_log(f"[ERROR] Evaluation failed for {model_name}: {str(result)}")
|
275 |
-
results.append({
|
276 |
-
"model_name": model_name,
|
277 |
-
"provider": provider,
|
278 |
-
"success": False,
|
279 |
-
"error": str(result),
|
280 |
-
"results": None,
|
281 |
-
"return_code": None
|
282 |
-
})
|
283 |
-
else:
|
284 |
-
# Valid result
|
285 |
-
results.append(result)
|
286 |
|
287 |
-
return results
|
288 |
-
|
289 |
-
def
|
290 |
"""
|
291 |
-
|
292 |
|
293 |
-
Args:
|
294 |
-
results: List of evaluation results
|
295 |
-
|
296 |
Returns:
|
297 |
-
|
298 |
"""
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
"dataset": f"{self.organization}/{self.dataset_name}",
|
303 |
-
"total_models_tested": len(results),
|
304 |
-
"successful_tests": len([r for r in results if r["success"]])
|
305 |
-
},
|
306 |
-
"models_comparison": []
|
307 |
-
}
|
308 |
-
|
309 |
-
# Liste des modèles réussis et des modèles échoués
|
310 |
-
successful_models = [r for r in results if r["success"]]
|
311 |
-
failed_models = [r for r in results if not r["success"]]
|
312 |
-
|
313 |
-
# Trier les modèles réussis par précision (du plus précis au moins précis)
|
314 |
-
if successful_models:
|
315 |
-
sorted_successful = sorted(
|
316 |
-
successful_models,
|
317 |
-
key=lambda x: x["results"]["results"]["all"]["accuracy"],
|
318 |
-
reverse=True # Du plus grand au plus petit
|
319 |
-
)
|
320 |
-
else:
|
321 |
-
sorted_successful = []
|
322 |
-
|
323 |
-
# Trier les modèles échoués par nom
|
324 |
-
sorted_failed = sorted(failed_models, key=lambda x: x["model_name"])
|
325 |
-
|
326 |
-
# Concaténer: d'abord les réussites, puis les échecs
|
327 |
-
sorted_results = sorted_successful + sorted_failed
|
328 |
-
|
329 |
-
# Créer l'entrée pour chaque modèle
|
330 |
-
for result in sorted_results:
|
331 |
-
model_result = {
|
332 |
-
"model_name": result["model_name"],
|
333 |
-
"provider": result["provider"],
|
334 |
-
"success": result["success"]
|
335 |
-
}
|
336 |
-
|
337 |
-
if result["success"]:
|
338 |
-
# Ajouter les métriques de précision et temps d'exécution
|
339 |
-
model_result.update({
|
340 |
-
"accuracy": result["results"]["results"]["all"]["accuracy"],
|
341 |
-
"accuracy_stderr": result["results"]["results"]["all"]["accuracy_stderr"],
|
342 |
-
"evaluation_time": float(result["results"]["config_general"]["total_evaluation_time_secondes"])
|
343 |
-
})
|
344 |
-
else:
|
345 |
-
# Ajouter l'erreur
|
346 |
-
model_result["error"] = result.get("error", "Unknown reason")
|
347 |
-
|
348 |
-
comparison["models_comparison"].append(model_result)
|
349 |
-
|
350 |
-
return comparison
|
351 |
-
|
352 |
-
async def _upload_results_to_dataset(self, comparison_results: Dict[str, Any]) -> bool:
|
353 |
"""
|
354 |
-
|
355 |
|
356 |
-
Args:
|
357 |
-
comparison_results: The formatted comparison results
|
358 |
-
|
359 |
Returns:
|
360 |
-
|
361 |
-
"""
|
362 |
-
try:
|
363 |
-
# Create a timestamp for the results file
|
364 |
-
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
365 |
-
result_filename = f"lighteval_results.json"
|
366 |
-
|
367 |
-
# Create temporary file for upload
|
368 |
-
temp_file_path = tempfile.mktemp(suffix=".json")
|
369 |
-
with open(temp_file_path, 'w') as f:
|
370 |
-
json.dump(comparison_results, f, indent=2)
|
371 |
-
|
372 |
-
# Initialize HF API
|
373 |
-
hf_token = os.getenv("HF_TOKEN")
|
374 |
-
if not hf_token:
|
375 |
-
self._add_log("[ERROR] HF_TOKEN not found, cannot upload results to dataset")
|
376 |
-
return False
|
377 |
-
|
378 |
-
api = HfApi(token=hf_token)
|
379 |
-
dataset_id = f"{self.organization}/{self.dataset_name}"
|
380 |
-
|
381 |
-
# Prepare the file operation
|
382 |
-
operation = CommitOperationAdd(
|
383 |
-
path_in_repo=f"lighteval_results/{result_filename}",
|
384 |
-
path_or_fileobj=temp_file_path
|
385 |
-
)
|
386 |
-
|
387 |
-
# Upload the file
|
388 |
-
self._add_log(f"[INFO] Uploading results to dataset {dataset_id}")
|
389 |
-
api.create_commit(
|
390 |
-
repo_id=dataset_id,
|
391 |
-
repo_type="dataset",
|
392 |
-
operations=[operation],
|
393 |
-
commit_message=f"Add evaluation results from {timestamp}"
|
394 |
-
)
|
395 |
-
|
396 |
-
# Cleanup temporary file
|
397 |
-
os.unlink(temp_file_path)
|
398 |
-
|
399 |
-
self._add_log(f"[SUCCESS] Results uploaded to dataset {dataset_id} at lighteval_results/{result_filename}")
|
400 |
-
return True
|
401 |
-
|
402 |
-
except Exception as e:
|
403 |
-
self._add_log(f"[ERROR] Failed to upload results to dataset: {str(e)}")
|
404 |
-
return False
|
405 |
-
|
406 |
-
async def _process_evaluation_results(self, results: List[Dict[str, Any]]) -> None:
|
407 |
-
"""
|
408 |
-
Process evaluation results, create summaries and save files
|
409 |
-
|
410 |
-
Args:
|
411 |
-
results: List of evaluation results
|
412 |
-
"""
|
413 |
-
if results:
|
414 |
-
try:
|
415 |
-
# Save detailed results
|
416 |
-
detailed_output_file = f"{self.output_dir}/detailed_results.json"
|
417 |
-
os.makedirs(os.path.dirname(detailed_output_file), exist_ok=True)
|
418 |
-
with open(detailed_output_file, 'w') as f:
|
419 |
-
json.dump(results, f, indent=2)
|
420 |
-
self._add_log(f"[INFO] Detailed results saved in {detailed_output_file}")
|
421 |
-
|
422 |
-
# Generate and save comparison results
|
423 |
-
comparison = self._format_comparison_results(results)
|
424 |
-
comparison_file = f"{self.output_dir}/models_comparison.json"
|
425 |
-
with open(comparison_file, 'w') as f:
|
426 |
-
json.dump(comparison, f, indent=2)
|
427 |
-
self._add_log(f"[INFO] Models comparison saved in {comparison_file}")
|
428 |
-
|
429 |
-
# Upload results to the dataset
|
430 |
-
await self._upload_results_to_dataset(comparison)
|
431 |
-
|
432 |
-
# Store results for later access
|
433 |
-
self.results = comparison
|
434 |
-
self._add_log("[SUCCESS] Evaluation completed")
|
435 |
-
except Exception as e:
|
436 |
-
self._add_log(f"[ERROR] Error during evaluation execution: {str(e)}")
|
437 |
-
finally:
|
438 |
-
self.is_completed = True
|
439 |
-
|
440 |
-
def _async_run(self) -> None:
|
441 |
-
"""
|
442 |
-
Run the evaluation asynchronously
|
443 |
"""
|
444 |
-
|
445 |
-
|
446 |
-
# Run evaluations
|
447 |
-
results = await self._run_evaluations()
|
448 |
-
|
449 |
-
# Process evaluation results
|
450 |
-
await self._process_evaluation_results(results)
|
451 |
-
except Exception as e:
|
452 |
-
self._add_log(f"[ERROR] Error during evaluation execution: {str(e)}")
|
453 |
-
finally:
|
454 |
-
self.is_completed = True
|
455 |
-
|
456 |
-
# Create and run the asyncio event loop
|
457 |
-
loop = asyncio.new_event_loop()
|
458 |
-
asyncio.set_event_loop(loop)
|
459 |
-
loop.run_until_complete(run_async())
|
460 |
-
loop.close()
|
461 |
-
|
462 |
def run(self) -> None:
|
463 |
"""
|
464 |
-
Run the evaluation task
|
465 |
"""
|
466 |
-
self.
|
467 |
-
|
468 |
-
# Run in a separate thread to not block the main thread
|
469 |
-
thread = threading.Thread(target=self._async_run)
|
470 |
-
thread.daemon = True
|
471 |
-
thread.start()
|
|
|
1 |
"""
|
2 |
+
Task to run evaluation using lighteval
|
3 |
"""
|
4 |
import os
|
|
|
|
|
5 |
import time
|
6 |
+
import subprocess
|
7 |
import tempfile
|
|
|
|
|
8 |
from pathlib import Path
|
9 |
+
import concurrent.futures
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
from datetime import datetime
|
12 |
+
import json
|
13 |
+
from typing import List, Dict
|
14 |
+
from tasks.get_model_providers import get_model_providers
|
15 |
+
from huggingface_hub import HfApi
|
16 |
|
17 |
class EvaluationTask:
|
18 |
"""
|
19 |
+
Task to run evaluation using lighteval
|
20 |
"""
|
21 |
|
22 |
def __init__(self, session_uid: str, dataset_name: str):
|
|
|
29 |
"""
|
30 |
self.session_uid = session_uid
|
31 |
self.dataset_name = dataset_name
|
|
|
32 |
self.is_completed = False
|
33 |
+
self.results = []
|
34 |
+
self.hf_api = HfApi()
|
35 |
+
|
36 |
+
def _save_results_to_hub(self) -> None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
"""
|
38 |
+
Save evaluation results to the dataset on the Hub
|
|
|
|
|
39 |
"""
|
40 |
+
try:
|
41 |
+
# Create results directory if it doesn't exist
|
42 |
+
results_dir = Path("data/lighteval_results")
|
43 |
+
results_dir.mkdir(parents=True, exist_ok=True)
|
44 |
+
|
45 |
+
# Save results to JSON file
|
46 |
+
results_file = results_dir / "lighteval_results.json"
|
47 |
+
with open(results_file, "w") as f:
|
48 |
+
json.dump(self.results, f, indent=2)
|
49 |
+
|
50 |
+
# Push to Hub
|
51 |
+
self.hf_api.upload_file(
|
52 |
+
path_or_fileobj=str(results_file),
|
53 |
+
path_in_repo="lighteval_results.json",
|
54 |
+
repo_id=self.dataset_name,
|
55 |
+
repo_type="dataset",
|
56 |
+
commit_message="Add lighteval evaluation results"
|
57 |
+
)
|
58 |
|
59 |
+
print(f"[{datetime.now().strftime('%H:%M:%S')}] Results saved to Hub at {self.dataset_name}/lighteval_results.json")
|
60 |
+
except Exception as e:
|
61 |
+
print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to save results to Hub: {str(e)}")
|
62 |
+
|
63 |
+
def _run_lighteval(self, model_name: str, provider: str, dataset_name: str) -> dict:
|
64 |
+
start_time = time.time()
|
65 |
+
print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation with {provider} provider for {model_name}")
|
|
|
|
|
|
|
|
|
66 |
|
67 |
+
# Create temporary task file
|
68 |
temp_file_path = tempfile.mktemp(suffix=".py")
|
|
|
|
|
69 |
with open(temp_file_path, 'w') as temp_file:
|
70 |
temp_file.write(f"""
|
71 |
+
from lighteval_task.lighteval_task import create_yourbench_task
|
|
|
|
|
|
|
|
|
72 |
|
73 |
# Create yourbench task
|
74 |
+
yourbench = create_yourbench_task("{dataset_name}", "single_shot_questions")
|
75 |
|
76 |
# Define TASKS_TABLE needed by lighteval
|
77 |
TASKS_TABLE = [yourbench]
|
78 |
""")
|
79 |
+
|
80 |
+
# LightEval command
|
81 |
cmd_args = [
|
82 |
"lighteval",
|
83 |
+
"endpoint",
|
84 |
"inference-providers",
|
85 |
f"model={model_name},provider={provider}",
|
86 |
"custom|yourbench|0|0",
|
87 |
"--custom-tasks",
|
88 |
temp_file_path,
|
89 |
+
"--max-samples", "15",
|
90 |
+
"--output-dir", "data/lighteval_results",
|
91 |
+
# "--save-details",
|
92 |
"--no-push-to-hub"
|
93 |
]
|
94 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
try:
|
96 |
+
# Run the command with environment variables and timeout of 60 seconds
|
97 |
+
subprocess.run(cmd_args, env=os.environ, timeout=60)
|
98 |
+
except subprocess.TimeoutExpired:
|
99 |
+
print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
|
100 |
+
return {
|
101 |
+
"model": model_name,
|
102 |
+
"provider": provider,
|
103 |
+
"accuracy": 0.0,
|
104 |
+
"execution_time": 60.0,
|
105 |
+
"status": "timeout"
|
106 |
+
}
|
107 |
+
|
108 |
+
# Calculate execution time
|
109 |
+
execution_time = time.time() - start_time
|
110 |
+
print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")
|
111 |
+
|
112 |
+
# Clean up
|
113 |
+
os.unlink(temp_file_path)
|
114 |
+
|
115 |
+
try:
|
116 |
+
# Get results from the output file
|
117 |
+
results_dir = Path("data/lighteval_results/results") / model_name.replace("/", "/")
|
118 |
+
results_file = next(results_dir.glob("results_*.json"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
+
with open(results_file) as f:
|
121 |
+
results = json.load(f)
|
122 |
+
accuracy = results["results"]["all"]["accuracy"]
|
123 |
+
|
124 |
+
return {
|
125 |
+
"model": model_name,
|
126 |
+
"provider": provider,
|
127 |
+
"accuracy": accuracy,
|
128 |
+
"execution_time": execution_time,
|
129 |
+
"status": "success"
|
130 |
+
}
|
131 |
except Exception as e:
|
132 |
+
print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
|
133 |
+
return {
|
134 |
+
"model": model_name,
|
135 |
+
"provider": provider,
|
136 |
+
"accuracy": 0.0,
|
137 |
+
"execution_time": execution_time,
|
138 |
+
"status": "parse_error"
|
139 |
+
}
|
140 |
+
|
141 |
+
def run_parallel(self) -> List[Dict]:
|
|
|
|
|
142 |
"""
|
143 |
+
Run the evaluation task with multiple models in parallel using ProcessPoolExecutor
|
144 |
|
145 |
Returns:
|
146 |
+
List of results for each model
|
147 |
"""
|
148 |
+
# Start global timer
|
149 |
+
script_start_time = time.time()
|
150 |
|
151 |
+
# Load environment variables
|
152 |
+
load_dotenv()
|
153 |
+
|
154 |
+
# Models to evaluate
|
155 |
+
models = [
|
156 |
+
"Qwen/QwQ-32B",
|
157 |
+
"Qwen/Qwen2.5-72B-Instruct",
|
158 |
+
"deepseek-ai/DeepSeek-V3-0324",
|
159 |
+
"deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
160 |
+
]
|
161 |
+
|
162 |
+
# Get providers for each model
|
163 |
+
model_providers = get_model_providers(models)
|
164 |
+
|
165 |
+
print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")
|
166 |
+
|
167 |
+
# Run evaluations in parallel using ProcessPoolExecutor
|
168 |
+
with concurrent.futures.ProcessPoolExecutor() as executor:
|
169 |
+
futures = [
|
170 |
+
executor.submit(self._run_lighteval, model_name, providers[0], self.dataset_name)
|
171 |
+
for model_name, providers in model_providers
|
172 |
+
if providers # Only run if providers are available
|
173 |
+
]
|
174 |
+
self.results = [future.result() for future in concurrent.futures.as_completed(futures)]
|
175 |
+
|
176 |
+
# Calculate total script execution time
|
177 |
+
total_time = time.time() - script_start_time
|
178 |
+
print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")
|
179 |
|
180 |
+
# Save results to Hub
|
181 |
+
self._save_results_to_hub()
|
182 |
|
183 |
+
# Mark the task as completed
|
184 |
+
self.is_completed = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
|
186 |
+
return self.results
|
187 |
+
|
188 |
+
def get_logs(self) -> List[str]:
|
189 |
"""
|
190 |
+
Get logs for this task (empty list since we don't track logs anymore)
|
191 |
|
|
|
|
|
|
|
192 |
Returns:
|
193 |
+
Empty list of logs
|
194 |
"""
|
195 |
+
return []
|
196 |
+
|
197 |
+
def is_task_completed(self) -> bool:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
"""
|
199 |
+
Check if the task is completed
|
200 |
|
|
|
|
|
|
|
201 |
Returns:
|
202 |
+
True if completed, False otherwise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
"""
|
204 |
+
return self.is_completed
|
205 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
def run(self) -> None:
|
207 |
"""
|
208 |
+
Run the evaluation task (wrapper around run_parallel)
|
209 |
"""
|
210 |
+
self.run_parallel()
|
|
|
|
|
|
|
|
|
|
backend/tasks/get_model_providers.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from huggingface_hub import model_info
|
2 |
+
PREFERRED_PROVIDERS = ["sambanova", "novita"]
|
3 |
+
|
4 |
+
def filter_providers(providers):
|
5 |
+
return [provider for provider in providers if provider in PREFERRED_PROVIDERS]
|
6 |
+
|
7 |
+
def get_model_providers(models):
|
8 |
+
results = []
|
9 |
+
|
10 |
+
for model_name in models:
|
11 |
+
try:
|
12 |
+
info = model_info(model_name, expand="inferenceProviderMapping")
|
13 |
+
providers = filter_providers(info.inference_provider_mapping.keys()) if hasattr(info, "inference_provider_mapping") else []
|
14 |
+
results.append((model_name, providers))
|
15 |
+
except Exception as e:
|
16 |
+
results.append((model_name, []))
|
17 |
+
|
18 |
+
return results
|
19 |
+
|
20 |
+
if __name__ == "__main__":
|
21 |
+
example_models = [
|
22 |
+
"Qwen/Qwen2.5-72B-Instruct",
|
23 |
+
"meta-llama/Llama-3.3-70B-Instruct",
|
24 |
+
"deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
25 |
+
"Qwen/QwQ-32B",
|
26 |
+
"mistralai/Mistral-Small-24B-Instruct-2501"
|
27 |
+
]
|
28 |
+
results = get_model_providers(example_models)
|
29 |
+
print(results)
|
backend/test_import.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Smoke test: verify that the lighteval_task package is importable from here.
try:
    import lighteval_task  # noqa: F401 - imported only to prove availability
except ImportError as e:
    print(f"Erreur: {e}")
else:
    print("lighteval_task importé avec succès!")
|
backend/yourbench_simple_demo.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.4
|
2 |
+
Name: yourbench-simple-demo
|
3 |
+
Version: 0.1.0
|
4 |
+
Author-email: Sumuk Shashidhar <[email protected]>, Alina Lozovskaia <[email protected]>, Clémentine Fourrier <[email protected]>, Nathan Habib <[email protected]>
|
5 |
+
Requires-Python: <3.13,>=3.12
|
6 |
+
Requires-Dist: yourbench@ git+https://github.com/huggingface/yourbench.git@main
|
7 |
+
Requires-Dist: asyncio>=3.4.3
|
8 |
+
Requires-Dist: datasets>=3.3.0
|
9 |
+
Requires-Dist: loguru>=0.7.3
|
10 |
+
Requires-Dist: python-dotenv>=1.0.1
|
11 |
+
Requires-Dist: tqdm>=4.67.1
|
12 |
+
Requires-Dist: ruff>=0.11.2
|
13 |
+
Requires-Dist: lighteval[math]>=0.8.0
|
14 |
+
Requires-Dist: huggingface-hub>=0.22.0
|
15 |
+
Requires-Dist: python-multipart>=0.0.5
|
16 |
+
Requires-Dist: fastapi>=0.110.0
|
17 |
+
Requires-Dist: uvicorn>=0.29.0
|
18 |
+
Requires-Dist: pydantic>=2.6.0
|
backend/yourbench_simple_demo.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
README.md
|
2 |
+
pyproject.toml
|
3 |
+
lighteval_task/__init__.py
|
4 |
+
lighteval_task/lighteval_task.py
|
5 |
+
tests/test_evaluation.py
|
6 |
+
tests/test_hf_upload.py
|
7 |
+
tests/test_inference.py
|
8 |
+
tests/test_lighteval.py
|
9 |
+
tests/test_openai.py
|
10 |
+
tests/test_parallel_lighteval.py
|
11 |
+
tests/test_provider_parallel_support.py
|
12 |
+
tests/test_yourbench_results.py
|
13 |
+
yourbench_simple_demo.egg-info/PKG-INFO
|
14 |
+
yourbench_simple_demo.egg-info/SOURCES.txt
|
15 |
+
yourbench_simple_demo.egg-info/dependency_links.txt
|
16 |
+
yourbench_simple_demo.egg-info/requires.txt
|
17 |
+
yourbench_simple_demo.egg-info/top_level.txt
|
backend/yourbench_simple_demo.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
backend/yourbench_simple_demo.egg-info/requires.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
yourbench@ git+https://github.com/huggingface/yourbench.git@main
|
2 |
+
asyncio>=3.4.3
|
3 |
+
datasets>=3.3.0
|
4 |
+
loguru>=0.7.3
|
5 |
+
python-dotenv>=1.0.1
|
6 |
+
tqdm>=4.67.1
|
7 |
+
ruff>=0.11.2
|
8 |
+
lighteval[math]>=0.8.0
|
9 |
+
huggingface-hub>=0.22.0
|
10 |
+
python-multipart>=0.0.5
|
11 |
+
fastapi>=0.110.0
|
12 |
+
uvicorn>=0.29.0
|
13 |
+
pydantic>=2.6.0
|
backend/yourbench_simple_demo.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
lighteval_task
|
frontend/src/components/BenchmarkDisplay.jsx
CHANGED
@@ -99,19 +99,34 @@ const BenchmarkDisplay = ({
|
|
99 |
<Typography variant="h6">Benchmark Created Successfully</Typography>
|
100 |
</Box>
|
101 |
|
102 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
<Button
|
104 |
-
variant="
|
105 |
color="primary"
|
106 |
-
|
107 |
-
|
108 |
-
}
|
109 |
-
onClick={handleDownloadClick}
|
110 |
-
disabled={isDownloading || !sessionId}
|
111 |
>
|
112 |
-
|
113 |
</Button>
|
114 |
-
</
|
115 |
</Box>
|
116 |
|
117 |
<Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
|
@@ -154,18 +169,6 @@ const BenchmarkDisplay = ({
|
|
154 |
</Card>
|
155 |
))}
|
156 |
</Box>
|
157 |
-
|
158 |
-
<Box sx={{ display: "flex", justifyContent: "center", mt: 4 }}>
|
159 |
-
<Button
|
160 |
-
variant="contained"
|
161 |
-
color="primary"
|
162 |
-
size="large"
|
163 |
-
startIcon={<AssessmentIcon />}
|
164 |
-
onClick={handleEvaluationClick}
|
165 |
-
>
|
166 |
-
Start Evaluation
|
167 |
-
</Button>
|
168 |
-
</Box>
|
169 |
</>
|
170 |
);
|
171 |
};
|
|
|
99 |
<Typography variant="h6">Benchmark Created Successfully</Typography>
|
100 |
</Box>
|
101 |
|
102 |
+
<Box sx={{ display: "flex", gap: 2 }}>
|
103 |
+
<Tooltip title="Download the complete benchmark">
|
104 |
+
<Button
|
105 |
+
variant="outlined"
|
106 |
+
color="primary"
|
107 |
+
endIcon={
|
108 |
+
isDownloading ? (
|
109 |
+
<CircularProgress size={16} />
|
110 |
+
) : (
|
111 |
+
<DownloadIcon />
|
112 |
+
)
|
113 |
+
}
|
114 |
+
onClick={handleDownloadClick}
|
115 |
+
disabled={isDownloading || !sessionId}
|
116 |
+
>
|
117 |
+
{isDownloading ? "Downloading..." : "Download Benchmark"}
|
118 |
+
</Button>
|
119 |
+
</Tooltip>
|
120 |
+
|
121 |
<Button
|
122 |
+
variant="contained"
|
123 |
color="primary"
|
124 |
+
startIcon={<AssessmentIcon />}
|
125 |
+
onClick={handleEvaluationClick}
|
|
|
|
|
|
|
126 |
>
|
127 |
+
Start Evaluation
|
128 |
</Button>
|
129 |
+
</Box>
|
130 |
</Box>
|
131 |
|
132 |
<Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
|
|
|
169 |
</Card>
|
170 |
))}
|
171 |
</Box>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
</>
|
173 |
);
|
174 |
};
|
frontend/src/components/BenchmarkEvaluation.jsx
CHANGED
@@ -1,55 +1,53 @@
|
|
1 |
import React, { useState, useEffect, useRef } from "react";
|
2 |
-
import {
|
3 |
-
Box,
|
4 |
-
Typography,
|
5 |
-
CircularProgress,
|
6 |
-
Alert,
|
7 |
-
Paper,
|
8 |
-
Divider,
|
9 |
-
Button,
|
10 |
-
} from "@mui/material";
|
11 |
-
import AccessTimeIcon from "@mui/icons-material/AccessTime";
|
12 |
-
import LogDisplay from "./LogDisplay";
|
13 |
import { useNavigate } from "react-router-dom";
|
14 |
|
15 |
-
//
|
16 |
-
const
|
17 |
-
"
|
18 |
-
"
|
19 |
-
"
|
|
|
20 |
];
|
21 |
|
22 |
-
// Friendly step names for display
|
23 |
-
const STEP_LABELS = {
|
24 |
-
preparation: "Preparation",
|
25 |
-
model_evaluation: "Model Evaluation",
|
26 |
-
results_compilation: "Results Compilation",
|
27 |
-
};
|
28 |
-
|
29 |
-
/**
|
30 |
-
* Component to handle benchmark evaluation and display logs
|
31 |
-
*
|
32 |
-
* @param {Object} props - Component props
|
33 |
-
* @param {string} props.sessionId - Session ID of the benchmark to evaluate
|
34 |
-
* @param {Function} props.onComplete - Function to call when evaluation is complete
|
35 |
-
* @returns {JSX.Element} Benchmark evaluation component
|
36 |
-
*/
|
37 |
const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
|
38 |
-
const [evaluating, setEvaluating] = useState(false);
|
39 |
const [evaluationComplete, setEvaluationComplete] = useState(false);
|
40 |
-
const [evaluationLogs, setEvaluationLogs] = useState([]);
|
41 |
const [error, setError] = useState(null);
|
42 |
-
const [currentPhase, setCurrentPhase] = useState("initializing");
|
43 |
-
const [completedSteps, setCompletedSteps] = useState([]);
|
44 |
-
const [activeStep, setActiveStep] = useState(0);
|
45 |
const [elapsedTime, setElapsedTime] = useState(0);
|
|
|
46 |
|
47 |
-
const pollingIntervalRef = useRef(null);
|
48 |
const timerIntervalRef = useRef(null);
|
49 |
const startTimeRef = useRef(null);
|
|
|
|
|
50 |
|
51 |
const navigate = useNavigate();
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
// Start evaluation when component mounts
|
54 |
useEffect(() => {
|
55 |
// Set start time
|
@@ -76,80 +74,6 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
|
|
76 |
};
|
77 |
}, []);
|
78 |
|
79 |
-
// Determine current phase and completed steps from logs
|
80 |
-
useEffect(() => {
|
81 |
-
if (evaluationLogs.length === 0) return;
|
82 |
-
|
83 |
-
// Check all logs for completed steps
|
84 |
-
const newCompletedSteps = [...completedSteps];
|
85 |
-
let newActiveStep = activeStep;
|
86 |
-
|
87 |
-
evaluationLogs.forEach((log) => {
|
88 |
-
// Detect completed steps (format: [SUCCESS] Stage completed: step_name)
|
89 |
-
const match = log.match(/\[SUCCESS\] Stage completed: (\w+)/);
|
90 |
-
if (match && match[1]) {
|
91 |
-
const completedStep = match[1].trim();
|
92 |
-
if (
|
93 |
-
EVALUATION_STEPS.includes(completedStep) &&
|
94 |
-
!newCompletedSteps.includes(completedStep)
|
95 |
-
) {
|
96 |
-
newCompletedSteps.push(completedStep);
|
97 |
-
// Set active step to index of next step
|
98 |
-
const stepIndex = EVALUATION_STEPS.indexOf(completedStep);
|
99 |
-
if (stepIndex >= 0 && stepIndex + 1 > newActiveStep) {
|
100 |
-
newActiveStep = stepIndex + 1;
|
101 |
-
if (newActiveStep >= EVALUATION_STEPS.length) {
|
102 |
-
newActiveStep = EVALUATION_STEPS.length;
|
103 |
-
}
|
104 |
-
}
|
105 |
-
}
|
106 |
-
}
|
107 |
-
});
|
108 |
-
|
109 |
-
// Update state if there are new completed steps
|
110 |
-
if (newCompletedSteps.length > completedSteps.length) {
|
111 |
-
setCompletedSteps(newCompletedSteps);
|
112 |
-
setActiveStep(newActiveStep);
|
113 |
-
}
|
114 |
-
|
115 |
-
// Check recent logs to determine current phase
|
116 |
-
const recentLogs = evaluationLogs.slice(-10);
|
117 |
-
|
118 |
-
// Detect completion conditions
|
119 |
-
const isComplete =
|
120 |
-
recentLogs.some((log) =>
|
121 |
-
log.includes("[SUCCESS] Evaluation completed")
|
122 |
-
) ||
|
123 |
-
completedSteps.includes("results_compilation") ||
|
124 |
-
newCompletedSteps.includes("results_compilation");
|
125 |
-
|
126 |
-
if (isComplete) {
|
127 |
-
setCurrentPhase("complete");
|
128 |
-
setEvaluationComplete(true);
|
129 |
-
// Stop polling when evaluation is complete
|
130 |
-
if (pollingIntervalRef.current) {
|
131 |
-
clearInterval(pollingIntervalRef.current);
|
132 |
-
}
|
133 |
-
if (timerIntervalRef.current) {
|
134 |
-
clearInterval(timerIntervalRef.current);
|
135 |
-
}
|
136 |
-
// Notify parent component that evaluation is complete
|
137 |
-
if (onComplete) {
|
138 |
-
onComplete({
|
139 |
-
success: true,
|
140 |
-
sessionId,
|
141 |
-
logs: evaluationLogs,
|
142 |
-
});
|
143 |
-
}
|
144 |
-
} else if (recentLogs.some((log) => log.includes("Comparing models"))) {
|
145 |
-
setCurrentPhase("compiling_results");
|
146 |
-
} else if (recentLogs.some((log) => log.includes("Starting evaluations"))) {
|
147 |
-
setCurrentPhase("evaluating");
|
148 |
-
} else if (recentLogs.some((log) => log.includes("Initialization"))) {
|
149 |
-
setCurrentPhase("preparing");
|
150 |
-
}
|
151 |
-
}, [evaluationLogs, completedSteps, activeStep, sessionId, onComplete]);
|
152 |
-
|
153 |
// Format elapsed time as HH:MM:SS
|
154 |
const formatElapsedTime = () => {
|
155 |
const hours = Math.floor(elapsedTime / 3600);
|
@@ -170,13 +94,6 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
|
|
170 |
return;
|
171 |
}
|
172 |
|
173 |
-
setEvaluating(true);
|
174 |
-
setEvaluationLogs([]);
|
175 |
-
setError(null);
|
176 |
-
setCurrentPhase("initializing");
|
177 |
-
setCompletedSteps([]);
|
178 |
-
setActiveStep(0);
|
179 |
-
|
180 |
try {
|
181 |
// Call API to start evaluation
|
182 |
const response = await fetch("http://localhost:3001/evaluate-benchmark", {
|
@@ -192,34 +109,15 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
|
|
192 |
const result = await response.json();
|
193 |
|
194 |
if (response.ok) {
|
195 |
-
|
196 |
-
|
197 |
-
// Set up polling to retrieve more logs
|
198 |
pollingIntervalRef.current = setInterval(async () => {
|
199 |
-
// Check if we're already done
|
200 |
-
if (evaluationComplete) {
|
201 |
-
clearInterval(pollingIntervalRef.current);
|
202 |
-
return;
|
203 |
-
}
|
204 |
-
|
205 |
try {
|
206 |
-
// Call API to get latest logs
|
207 |
const logsResponse = await fetch(
|
208 |
`http://localhost:3001/evaluation-logs/${sessionId}`
|
209 |
);
|
210 |
|
211 |
if (logsResponse.ok) {
|
212 |
const logsResult = await logsResponse.json();
|
213 |
-
|
214 |
-
// Update logs if there are new ones
|
215 |
-
if (
|
216 |
-
logsResult.logs &&
|
217 |
-
logsResult.logs.length > evaluationLogs.length
|
218 |
-
) {
|
219 |
-
setEvaluationLogs(logsResult.logs);
|
220 |
-
}
|
221 |
-
|
222 |
-
// Check if evaluation is complete
|
223 |
if (logsResult.is_completed) {
|
224 |
setEvaluationComplete(true);
|
225 |
clearInterval(pollingIntervalRef.current);
|
@@ -227,71 +125,17 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
|
|
227 |
}
|
228 |
} catch (error) {
|
229 |
console.log("Error polling logs:", error);
|
230 |
-
// Don't stop polling on network errors
|
231 |
}
|
232 |
-
}, 2000);
|
233 |
} else {
|
234 |
-
// Handle error
|
235 |
-
setEvaluationLogs([`Error: ${result.error || "Unknown error"}`]);
|
236 |
setError(result.error || "Benchmark evaluation failed");
|
237 |
}
|
238 |
} catch (error) {
|
239 |
console.error("Error starting evaluation:", error);
|
240 |
-
setEvaluationLogs([`Error: ${error.message || "Unknown error"}`]);
|
241 |
setError("Error connecting to server");
|
242 |
-
} finally {
|
243 |
-
setEvaluating(false);
|
244 |
-
}
|
245 |
-
};
|
246 |
-
|
247 |
-
// Get title based on current phase
|
248 |
-
const getPhaseTitle = () => {
|
249 |
-
switch (currentPhase) {
|
250 |
-
case "initializing":
|
251 |
-
return "Preparing evaluation...";
|
252 |
-
case "preparing":
|
253 |
-
return "Preparing models...";
|
254 |
-
case "evaluating":
|
255 |
-
return "Evaluating models...";
|
256 |
-
case "compiling_results":
|
257 |
-
return "Compiling results...";
|
258 |
-
case "complete":
|
259 |
-
return "Evaluation completed successfully!";
|
260 |
-
default:
|
261 |
-
return "Processing...";
|
262 |
}
|
263 |
};
|
264 |
|
265 |
-
// Get current step info for display
|
266 |
-
const getCurrentStepInfo = () => {
|
267 |
-
const totalSteps = EVALUATION_STEPS.length;
|
268 |
-
const currentStepIndex = activeStep;
|
269 |
-
|
270 |
-
// If no active step yet
|
271 |
-
if (currentStepIndex === 0 && completedSteps.length === 0) {
|
272 |
-
return `Starting... (0%)`;
|
273 |
-
}
|
274 |
-
|
275 |
-
// If all steps completed
|
276 |
-
if (currentStepIndex >= totalSteps) {
|
277 |
-
return `Completed (100%)`;
|
278 |
-
}
|
279 |
-
|
280 |
-
// Calculate percentage
|
281 |
-
const percentage = Math.round((currentStepIndex / totalSteps) * 100);
|
282 |
-
|
283 |
-
// Get current step name
|
284 |
-
const currentStepName =
|
285 |
-
STEP_LABELS[EVALUATION_STEPS[currentStepIndex]] || "Processing";
|
286 |
-
|
287 |
-
return `${currentStepName} (${percentage}%)`;
|
288 |
-
};
|
289 |
-
|
290 |
-
// Function to navigate to results page
|
291 |
-
const viewResults = () => {
|
292 |
-
navigate(`/evaluation-display?session=${sessionId}`);
|
293 |
-
};
|
294 |
-
|
295 |
return (
|
296 |
<Paper
|
297 |
elevation={3}
|
@@ -313,29 +157,19 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
|
|
313 |
) : (
|
314 |
<>
|
315 |
{evaluationComplete ? (
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
</Alert>
|
320 |
-
<Button
|
321 |
-
variant="contained"
|
322 |
-
color="primary"
|
323 |
-
onClick={viewResults}
|
324 |
-
sx={{ mb: 3 }}
|
325 |
-
>
|
326 |
-
View Results Leaderboard
|
327 |
-
</Button>
|
328 |
-
</>
|
329 |
) : (
|
330 |
<>
|
331 |
<CircularProgress size={60} sx={{ mb: 2 }} />
|
332 |
<Typography variant="h6" component="div" gutterBottom>
|
333 |
-
|
334 |
</Typography>
|
335 |
|
336 |
{/* Step progress indicator */}
|
337 |
<Typography variant="body1" color="text.secondary">
|
338 |
-
{
|
339 |
</Typography>
|
340 |
|
341 |
{/* Timer display */}
|
@@ -354,9 +188,6 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
|
|
354 |
)}
|
355 |
</>
|
356 |
)}
|
357 |
-
|
358 |
-
{/* Use the LogDisplay component for logs */}
|
359 |
-
<LogDisplay logs={evaluationLogs} height={150} />
|
360 |
</Paper>
|
361 |
);
|
362 |
};
|
|
|
1 |
import React, { useState, useEffect, useRef } from "react";
|
2 |
+
import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import { useNavigate } from "react-router-dom";
|
4 |
|
5 |
+
// Starting messages with their timing
|
6 |
+
const STARTING_MESSAGES = [
|
7 |
+
{ message: "Initializing evaluation environment...", progress: 22 },
|
8 |
+
{ message: "Starting evaluation process...", progress: 54 },
|
9 |
+
{ message: "Evaluating models...", progress: 71 },
|
10 |
+
{ message: "Storing evaluation results...", progress: 100 },
|
11 |
];
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
|
|
|
14 |
const [evaluationComplete, setEvaluationComplete] = useState(false);
|
|
|
15 |
const [error, setError] = useState(null);
|
|
|
|
|
|
|
16 |
const [elapsedTime, setElapsedTime] = useState(0);
|
17 |
+
const [startingMessageIndex, setStartingMessageIndex] = useState(0);
|
18 |
|
|
|
19 |
const timerIntervalRef = useRef(null);
|
20 |
const startTimeRef = useRef(null);
|
21 |
+
const startingMessageIntervalRef = useRef(null);
|
22 |
+
const pollingIntervalRef = useRef(null);
|
23 |
|
24 |
const navigate = useNavigate();
|
25 |
|
26 |
+
// Add effect to handle automatic redirection when evaluation is complete
|
27 |
+
useEffect(() => {
|
28 |
+
if (evaluationComplete) {
|
29 |
+
navigate(`/evaluation-display?session=${sessionId}`);
|
30 |
+
}
|
31 |
+
}, [evaluationComplete, sessionId, navigate]);
|
32 |
+
|
33 |
+
// Add effect to handle starting messages
|
34 |
+
useEffect(() => {
|
35 |
+
startingMessageIntervalRef.current = setInterval(() => {
|
36 |
+
setStartingMessageIndex((prev) => {
|
37 |
+
if (prev < STARTING_MESSAGES.length - 1) {
|
38 |
+
return prev + 1;
|
39 |
+
}
|
40 |
+
return prev;
|
41 |
+
});
|
42 |
+
}, 20000); // Change message every 20 seconds
|
43 |
+
|
44 |
+
return () => {
|
45 |
+
if (startingMessageIntervalRef.current) {
|
46 |
+
clearInterval(startingMessageIntervalRef.current);
|
47 |
+
}
|
48 |
+
};
|
49 |
+
}, []);
|
50 |
+
|
51 |
// Start evaluation when component mounts
|
52 |
useEffect(() => {
|
53 |
// Set start time
|
|
|
74 |
};
|
75 |
}, []);
|
76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
// Format elapsed time as HH:MM:SS
|
78 |
const formatElapsedTime = () => {
|
79 |
const hours = Math.floor(elapsedTime / 3600);
|
|
|
94 |
return;
|
95 |
}
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
try {
|
98 |
// Call API to start evaluation
|
99 |
const response = await fetch("http://localhost:3001/evaluate-benchmark", {
|
|
|
109 |
const result = await response.json();
|
110 |
|
111 |
if (response.ok) {
|
112 |
+
// Set up polling to check completion
|
|
|
|
|
113 |
pollingIntervalRef.current = setInterval(async () => {
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
try {
|
|
|
115 |
const logsResponse = await fetch(
|
116 |
`http://localhost:3001/evaluation-logs/${sessionId}`
|
117 |
);
|
118 |
|
119 |
if (logsResponse.ok) {
|
120 |
const logsResult = await logsResponse.json();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
if (logsResult.is_completed) {
|
122 |
setEvaluationComplete(true);
|
123 |
clearInterval(pollingIntervalRef.current);
|
|
|
125 |
}
|
126 |
} catch (error) {
|
127 |
console.log("Error polling logs:", error);
|
|
|
128 |
}
|
129 |
+
}, 2000);
|
130 |
} else {
|
|
|
|
|
131 |
setError(result.error || "Benchmark evaluation failed");
|
132 |
}
|
133 |
} catch (error) {
|
134 |
console.error("Error starting evaluation:", error);
|
|
|
135 |
setError("Error connecting to server");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
}
|
137 |
};
|
138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
return (
|
140 |
<Paper
|
141 |
elevation={3}
|
|
|
157 |
) : (
|
158 |
<>
|
159 |
{evaluationComplete ? (
|
160 |
+
<Alert severity="success" sx={{ width: "100%", mb: 3 }}>
|
161 |
+
Evaluation completed successfully!
|
162 |
+
</Alert>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
) : (
|
164 |
<>
|
165 |
<CircularProgress size={60} sx={{ mb: 2 }} />
|
166 |
<Typography variant="h6" component="div" gutterBottom>
|
167 |
+
Benchmark evaluation...
|
168 |
</Typography>
|
169 |
|
170 |
{/* Step progress indicator */}
|
171 |
<Typography variant="body1" color="text.secondary">
|
172 |
+
{`${STARTING_MESSAGES[startingMessageIndex].message} (${STARTING_MESSAGES[startingMessageIndex].progress}%)`}
|
173 |
</Typography>
|
174 |
|
175 |
{/* Timer display */}
|
|
|
188 |
)}
|
189 |
</>
|
190 |
)}
|
|
|
|
|
|
|
191 |
</Paper>
|
192 |
);
|
193 |
};
|
frontend/src/components/BenchmarkGenerator.jsx
CHANGED
@@ -288,7 +288,7 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
|
|
288 |
case "initializing":
|
289 |
return "Benchmark generation...";
|
290 |
case "configuring":
|
291 |
-
return "
|
292 |
case "benchmarking":
|
293 |
return "Creating benchmark...";
|
294 |
case "complete":
|
@@ -390,7 +390,7 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
|
|
390 |
)}
|
391 |
|
392 |
{/* Use the LogDisplay component */}
|
393 |
-
<LogDisplay logs={generationLogs} height={150} />
|
394 |
</Paper>
|
395 |
);
|
396 |
};
|
|
|
288 |
case "initializing":
|
289 |
return "Benchmark generation...";
|
290 |
case "configuring":
|
291 |
+
return "Creating benchmark...";
|
292 |
case "benchmarking":
|
293 |
return "Creating benchmark...";
|
294 |
case "complete":
|
|
|
390 |
)}
|
391 |
|
392 |
{/* Use the LogDisplay component */}
|
393 |
+
{/* <LogDisplay logs={generationLogs} height={150} /> */}
|
394 |
</Paper>
|
395 |
);
|
396 |
};
|
frontend/src/components/EvaluationDisplay.jsx
CHANGED
@@ -10,7 +10,7 @@ import {
|
|
10 |
TableHead,
|
11 |
TableRow,
|
12 |
Alert,
|
13 |
-
|
14 |
Card,
|
15 |
CardContent,
|
16 |
Link,
|
@@ -70,11 +70,20 @@ const EvaluationDisplay = ({ sessionId }) => {
|
|
70 |
|
71 |
if (loading) {
|
72 |
return (
|
73 |
-
<Box
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
<Typography variant="h5" gutterBottom>
|
75 |
Loading Evaluation Results...
|
76 |
</Typography>
|
77 |
-
<
|
78 |
</Box>
|
79 |
);
|
80 |
}
|
@@ -127,50 +136,45 @@ const EvaluationDisplay = ({ sessionId }) => {
|
|
127 |
</TableRow>
|
128 |
</TableHead>
|
129 |
<TableBody>
|
130 |
-
{results.models_comparison
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
</
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
<TableCell align="center">
|
166 |
-
{model.success ? (
|
167 |
<span style={{ color: "green" }}>✓ Success</span>
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
</TableCell>
|
172 |
-
</TableRow>
|
173 |
-
))}
|
174 |
</TableBody>
|
175 |
</Table>
|
176 |
</TableContainer>
|
|
|
10 |
TableHead,
|
11 |
TableRow,
|
12 |
Alert,
|
13 |
+
CircularProgress,
|
14 |
Card,
|
15 |
CardContent,
|
16 |
Link,
|
|
|
70 |
|
71 |
if (loading) {
|
72 |
return (
|
73 |
+
<Box
|
74 |
+
sx={{
|
75 |
+
width: "100%",
|
76 |
+
mt: 4,
|
77 |
+
mb: 4,
|
78 |
+
display: "flex",
|
79 |
+
flexDirection: "column",
|
80 |
+
alignItems: "center",
|
81 |
+
}}
|
82 |
+
>
|
83 |
<Typography variant="h5" gutterBottom>
|
84 |
Loading Evaluation Results...
|
85 |
</Typography>
|
86 |
+
<CircularProgress />
|
87 |
</Box>
|
88 |
);
|
89 |
}
|
|
|
136 |
</TableRow>
|
137 |
</TableHead>
|
138 |
<TableBody>
|
139 |
+
{results.models_comparison
|
140 |
+
.filter((model) => model.success)
|
141 |
+
.map((model, index) => (
|
142 |
+
<TableRow
|
143 |
+
key={`${model.model_name}-${model.provider}`}
|
144 |
+
sx={{
|
145 |
+
"&:last-child td, &:last-child th": { border: 0 },
|
146 |
+
}}
|
147 |
+
>
|
148 |
+
<TableCell>{index + 1}</TableCell>
|
149 |
+
<TableCell component="th" scope="row">
|
150 |
+
<Link
|
151 |
+
href={`https://huggingface.co/${model.model_name}`}
|
152 |
+
target="_blank"
|
153 |
+
rel="noopener noreferrer"
|
154 |
+
sx={{
|
155 |
+
textDecoration: "none",
|
156 |
+
"&:hover": {
|
157 |
+
textDecoration: "underline",
|
158 |
+
},
|
159 |
+
display: "flex",
|
160 |
+
alignItems: "center",
|
161 |
+
}}
|
162 |
+
>
|
163 |
+
{model.model_name}
|
164 |
+
<OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
|
165 |
+
</Link>
|
166 |
+
</TableCell>
|
167 |
+
<TableCell align="center">
|
168 |
+
{formatAccuracy(model.accuracy)}
|
169 |
+
</TableCell>
|
170 |
+
<TableCell align="center">
|
171 |
+
{formatTime(model.evaluation_time)}
|
172 |
+
</TableCell>
|
173 |
+
<TableCell align="center">
|
|
|
|
|
174 |
<span style={{ color: "green" }}>✓ Success</span>
|
175 |
+
</TableCell>
|
176 |
+
</TableRow>
|
177 |
+
))}
|
|
|
|
|
|
|
178 |
</TableBody>
|
179 |
</Table>
|
180 |
</TableContainer>
|
frontend/src/components/ExternalLinks.jsx
CHANGED
@@ -1,16 +1,31 @@
|
|
1 |
import React from "react";
|
2 |
-
import { Box, Typography } from "@mui/material";
|
3 |
import OpenInNewIcon from "@mui/icons-material/OpenInNew";
|
|
|
4 |
|
5 |
const ExternalLinks = () => {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
return (
|
7 |
<Box
|
8 |
sx={{
|
9 |
position: "fixed",
|
10 |
top: 24,
|
|
|
11 |
right: 24,
|
|
|
12 |
display: "flex",
|
13 |
-
|
14 |
alignItems: "center",
|
15 |
zIndex: 1000,
|
16 |
}}
|
@@ -57,6 +72,22 @@ const ExternalLinks = () => {
|
|
57 |
<OpenInNewIcon sx={{ fontSize: "0.75rem", ml: 0.5, opacity: 0.6 }} />
|
58 |
</a>
|
59 |
</Typography>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
</Box>
|
61 |
);
|
62 |
};
|
|
|
1 |
import React from "react";
|
2 |
+
import { Box, Typography, IconButton, Tooltip } from "@mui/material";
|
3 |
import OpenInNewIcon from "@mui/icons-material/OpenInNew";
|
4 |
+
import ShareIcon from "@mui/icons-material/Share";
|
5 |
|
6 |
const ExternalLinks = () => {
|
7 |
+
const handleShare = async () => {
|
8 |
+
try {
|
9 |
+
await navigator.share({
|
10 |
+
title: "YourBench Demo",
|
11 |
+
text: "Check out this benchmark evaluation on YourBench!",
|
12 |
+
url: window.location.href,
|
13 |
+
});
|
14 |
+
} catch (err) {
|
15 |
+
console.log("Error sharing:", err);
|
16 |
+
}
|
17 |
+
};
|
18 |
+
|
19 |
return (
|
20 |
<Box
|
21 |
sx={{
|
22 |
position: "fixed",
|
23 |
top: 24,
|
24 |
+
left: 24,
|
25 |
right: 24,
|
26 |
+
margin: "auto",
|
27 |
display: "flex",
|
28 |
+
justifyContent: "space-between",
|
29 |
alignItems: "center",
|
30 |
zIndex: 1000,
|
31 |
}}
|
|
|
72 |
<OpenInNewIcon sx={{ fontSize: "0.75rem", ml: 0.5, opacity: 0.6 }} />
|
73 |
</a>
|
74 |
</Typography>
|
75 |
+
<Tooltip title="Share">
|
76 |
+
<IconButton
|
77 |
+
onClick={handleShare}
|
78 |
+
size="small"
|
79 |
+
sx={{
|
80 |
+
ml: 1,
|
81 |
+
color: "inherit",
|
82 |
+
opacity: 0.7,
|
83 |
+
"&:hover": {
|
84 |
+
opacity: 1,
|
85 |
+
},
|
86 |
+
}}
|
87 |
+
>
|
88 |
+
<ShareIcon fontSize="small" />
|
89 |
+
</IconButton>
|
90 |
+
</Tooltip>
|
91 |
</Box>
|
92 |
);
|
93 |
};
|
frontend/src/config/theme.js
CHANGED
@@ -375,7 +375,7 @@ const getDesignTokens = (mode) => ({
|
|
375 |
values: {
|
376 |
xs: 0,
|
377 |
sm: 600,
|
378 |
-
md:
|
379 |
lg: 1240,
|
380 |
xl: 1536,
|
381 |
},
|
|
|
375 |
values: {
|
376 |
xs: 0,
|
377 |
sm: 600,
|
378 |
+
md: 1100,
|
379 |
lg: 1240,
|
380 |
xl: 1536,
|
381 |
},
|
test_import.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Smoke test: verify that the lighteval_task package is importable from here.
try:
    import lighteval_task  # noqa: F401 - imported only to prove availability
except ImportError as e:
    print(f"Erreur: {e}")
else:
    print("lighteval_task importé avec succès!")
|