Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
update dockerfile
Browse files
- Dockerfile +2 -5
- backend/data/lighteval_results/lighteval_results.json +10 -10
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json +121 -0
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json +121 -0
- backend/tasks/evaluationTask.py +2 -2
- backend/tasks/get_model_providers.py +1 -1
Dockerfile
CHANGED
@@ -32,17 +32,14 @@ RUN useradd -m -u 1000 user
|
|
32 |
RUN mkdir -p /app/.cache && \
|
33 |
chown -R user:user /app
|
34 |
|
35 |
-
# Copy backend
|
36 |
-
COPY backend/
|
37 |
|
38 |
# Install all dependencies explicitly
|
39 |
RUN pip install fastapi uvicorn
|
40 |
# Install project dependencies
|
41 |
RUN uv pip install -e . --system
|
42 |
|
43 |
-
# Copy backend code
|
44 |
-
COPY backend/ .
|
45 |
-
|
46 |
# Copy frontend server and build
|
47 |
COPY --from=frontend-build /app/build ./frontend/build
|
48 |
COPY --from=frontend-build /app/package*.json ./frontend/
|
|
|
32 |
RUN mkdir -p /app/.cache && \
|
33 |
chown -R user:user /app
|
34 |
|
35 |
+
# Copy all backend code first
|
36 |
+
COPY backend/ .
|
37 |
|
38 |
# Install all dependencies explicitly
|
39 |
RUN pip install fastapi uvicorn
|
40 |
# Install project dependencies
|
41 |
RUN uv pip install -e . --system
|
42 |
|
|
|
|
|
|
|
43 |
# Copy frontend server and build
|
44 |
COPY --from=frontend-build /app/build ./frontend/build
|
45 |
COPY --from=frontend-build /app/package*.json ./frontend/
|
backend/data/lighteval_results/lighteval_results.json
CHANGED
@@ -1,27 +1,27 @@
|
|
1 |
[
|
2 |
-
{
|
3 |
-
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
4 |
-
"provider": "sambanova",
|
5 |
-
"accuracy": 1.0,
|
6 |
-
"execution_time": 18.800472021102905,
|
7 |
-
"status": "success"
|
8 |
-
},
|
9 |
{
|
10 |
"model": "deepseek-ai/DeepSeek-V3-0324",
|
11 |
"provider": "novita",
|
12 |
"accuracy": 1.0,
|
13 |
-
"execution_time":
|
14 |
"status": "success"
|
15 |
},
|
16 |
{
|
17 |
-
"model": "Qwen/
|
18 |
"provider": "sambanova",
|
19 |
"accuracy": 0.0,
|
20 |
"execution_time": 60.0,
|
21 |
"status": "timeout"
|
22 |
},
|
23 |
{
|
24 |
-
"model": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
"provider": "sambanova",
|
26 |
"accuracy": 0.0,
|
27 |
"execution_time": 60.0,
|
|
|
1 |
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
{
|
3 |
"model": "deepseek-ai/DeepSeek-V3-0324",
|
4 |
"provider": "novita",
|
5 |
"accuracy": 1.0,
|
6 |
+
"execution_time": 54.32098197937012,
|
7 |
"status": "success"
|
8 |
},
|
9 |
{
|
10 |
+
"model": "Qwen/QwQ-32B",
|
11 |
"provider": "sambanova",
|
12 |
"accuracy": 0.0,
|
13 |
"execution_time": 60.0,
|
14 |
"status": "timeout"
|
15 |
},
|
16 |
{
|
17 |
+
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
18 |
+
"provider": "sambanova",
|
19 |
+
"accuracy": 0.0,
|
20 |
+
"execution_time": 60.0,
|
21 |
+
"status": "timeout"
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"model": "Qwen/Qwen2.5-72B-Instruct",
|
25 |
"provider": "sambanova",
|
26 |
"accuracy": 0.0,
|
27 |
"execution_time": 60.0,
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 15,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 190861.972782125,
|
9 |
+
"end_time": 190876.962226916,
|
10 |
+
"total_evaluation_time_secondes": "14.989444790990092",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
|
51 |
+
"hf_subset": "multi_hop_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 34,
|
87 |
+
"effective_num_docs": 15,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "97803694d4430d2d",
|
96 |
+
"hash_full_prompts": "3125bcda69618d2b",
|
97 |
+
"hash_input_tokens": "58ec870775e406f3",
|
98 |
+
"hash_cont_tokens": "58ec870775e406f3"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 15,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 15,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "13a4051f728a0e87",
|
111 |
+
"hash_full_prompts": "e18b288370ab6ae2",
|
112 |
+
"hash_input_tokens": "544d800a25dfd777",
|
113 |
+
"hash_cont_tokens": "544d800a25dfd777"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 15,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 15,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 15,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 190861.972804458,
|
9 |
+
"end_time": 190894.739973125,
|
10 |
+
"total_evaluation_time_secondes": "32.7671686669928",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
|
51 |
+
"hf_subset": "multi_hop_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 34,
|
87 |
+
"effective_num_docs": 15,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "97803694d4430d2d",
|
96 |
+
"hash_full_prompts": "3125bcda69618d2b",
|
97 |
+
"hash_input_tokens": "58ec870775e406f3",
|
98 |
+
"hash_cont_tokens": "58ec870775e406f3"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 15,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 15,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "13a4051f728a0e87",
|
111 |
+
"hash_full_prompts": "e18b288370ab6ae2",
|
112 |
+
"hash_input_tokens": "544d800a25dfd777",
|
113 |
+
"hash_cont_tokens": "544d800a25dfd777"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 15,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 15,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 30,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 190994.241279791,
|
9 |
+
"end_time": 191043.871577458,
|
10 |
+
"total_evaluation_time_secondes": "49.63029766699765",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
|
51 |
+
"hf_subset": "multi_hop_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 34,
|
87 |
+
"effective_num_docs": 30,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "1b5afc5f13827f79",
|
96 |
+
"hash_full_prompts": "cd8c39c007643835",
|
97 |
+
"hash_input_tokens": "79ab129e9a18c6d6",
|
98 |
+
"hash_cont_tokens": "79ab129e9a18c6d6"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 30,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 30,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b18e19e266a5bc51",
|
111 |
+
"hash_full_prompts": "1eaa15cbc4a17d04",
|
112 |
+
"hash_input_tokens": "05a66e44e190c178",
|
113 |
+
"hash_cont_tokens": "05a66e44e190c178"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 30,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 30,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"config_general": {
|
3 |
+
"lighteval_sha": "?",
|
4 |
+
"num_fewshot_seeds": 1,
|
5 |
+
"override_batch_size": null,
|
6 |
+
"max_samples": 30,
|
7 |
+
"job_id": 0,
|
8 |
+
"start_time": 191195.945968041,
|
9 |
+
"end_time": 191244.057571,
|
10 |
+
"total_evaluation_time_secondes": "48.111602959019365",
|
11 |
+
"model_name": "deepseek-ai/DeepSeek-V3-0324",
|
12 |
+
"model_sha": "",
|
13 |
+
"model_dtype": null,
|
14 |
+
"model_size": "",
|
15 |
+
"generation_parameters": {
|
16 |
+
"early_stopping": null,
|
17 |
+
"repetition_penalty": null,
|
18 |
+
"frequency_penalty": null,
|
19 |
+
"length_penalty": null,
|
20 |
+
"presence_penalty": null,
|
21 |
+
"max_new_tokens": null,
|
22 |
+
"min_new_tokens": null,
|
23 |
+
"seed": null,
|
24 |
+
"stop_tokens": null,
|
25 |
+
"temperature": null,
|
26 |
+
"top_k": null,
|
27 |
+
"min_p": null,
|
28 |
+
"top_p": null,
|
29 |
+
"truncate_prompt": null,
|
30 |
+
"response_format": null
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"results": {
|
34 |
+
"custom|yourbench|0": {
|
35 |
+
"accuracy": 1.0,
|
36 |
+
"accuracy_stderr": 0.0
|
37 |
+
},
|
38 |
+
"all": {
|
39 |
+
"accuracy": 1.0,
|
40 |
+
"accuracy_stderr": 0.0
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"versions": {
|
44 |
+
"custom|yourbench|0": 0
|
45 |
+
},
|
46 |
+
"config_tasks": {
|
47 |
+
"custom|yourbench": {
|
48 |
+
"name": "yourbench",
|
49 |
+
"prompt_function": "yourbench_prompt",
|
50 |
+
"hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
|
51 |
+
"hf_subset": "multi_hop_questions",
|
52 |
+
"metric": [
|
53 |
+
{
|
54 |
+
"metric_name": [
|
55 |
+
"accuracy"
|
56 |
+
],
|
57 |
+
"higher_is_better": {
|
58 |
+
"accuracy": true
|
59 |
+
},
|
60 |
+
"category": "7",
|
61 |
+
"use_case": "1",
|
62 |
+
"sample_level_fn": "compute",
|
63 |
+
"corpus_level_fn": {
|
64 |
+
"accuracy": "mean"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"hf_revision": null,
|
69 |
+
"hf_filter": null,
|
70 |
+
"hf_avail_splits": [
|
71 |
+
"train"
|
72 |
+
],
|
73 |
+
"trust_dataset": true,
|
74 |
+
"evaluation_splits": [
|
75 |
+
"train"
|
76 |
+
],
|
77 |
+
"few_shots_split": null,
|
78 |
+
"few_shots_select": null,
|
79 |
+
"generation_size": 8192,
|
80 |
+
"generation_grammar": null,
|
81 |
+
"stop_sequence": [],
|
82 |
+
"num_samples": null,
|
83 |
+
"suite": [
|
84 |
+
"custom"
|
85 |
+
],
|
86 |
+
"original_num_docs": 34,
|
87 |
+
"effective_num_docs": 30,
|
88 |
+
"must_remove_duplicate_docs": false,
|
89 |
+
"version": 0
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"summary_tasks": {
|
93 |
+
"custom|yourbench|0": {
|
94 |
+
"hashes": {
|
95 |
+
"hash_examples": "1b5afc5f13827f79",
|
96 |
+
"hash_full_prompts": "cd8c39c007643835",
|
97 |
+
"hash_input_tokens": "79ab129e9a18c6d6",
|
98 |
+
"hash_cont_tokens": "79ab129e9a18c6d6"
|
99 |
+
},
|
100 |
+
"truncated": 0,
|
101 |
+
"non_truncated": 30,
|
102 |
+
"padded": 0,
|
103 |
+
"non_padded": 30,
|
104 |
+
"effective_few_shots": 0.0,
|
105 |
+
"num_truncated_few_shots": 0
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"summary_general": {
|
109 |
+
"hashes": {
|
110 |
+
"hash_examples": "b18e19e266a5bc51",
|
111 |
+
"hash_full_prompts": "1eaa15cbc4a17d04",
|
112 |
+
"hash_input_tokens": "05a66e44e190c178",
|
113 |
+
"hash_cont_tokens": "05a66e44e190c178"
|
114 |
+
},
|
115 |
+
"truncated": 0,
|
116 |
+
"non_truncated": 30,
|
117 |
+
"padded": 0,
|
118 |
+
"non_padded": 30,
|
119 |
+
"num_truncated_few_shots": 0
|
120 |
+
}
|
121 |
+
}
|
backend/tasks/evaluationTask.py
CHANGED
@@ -71,7 +71,7 @@ class EvaluationTask:
|
|
71 |
from lighteval_task.lighteval_task import create_yourbench_task
|
72 |
|
73 |
# Create yourbench task
|
74 |
-
yourbench = create_yourbench_task("{dataset_name}", "
|
75 |
|
76 |
# Define TASKS_TABLE needed by lighteval
|
77 |
TASKS_TABLE = [yourbench]
|
@@ -86,7 +86,7 @@ TASKS_TABLE = [yourbench]
|
|
86 |
"custom|yourbench|0|0",
|
87 |
"--custom-tasks",
|
88 |
temp_file_path,
|
89 |
-
"--max-samples", "
|
90 |
"--output-dir", "data/lighteval_results",
|
91 |
# "--save-details",
|
92 |
"--no-push-to-hub"
|
|
|
71 |
from lighteval_task.lighteval_task import create_yourbench_task
|
72 |
|
73 |
# Create yourbench task
|
74 |
+
yourbench = create_yourbench_task("{dataset_name}", "multi_hop_questions")
|
75 |
|
76 |
# Define TASKS_TABLE needed by lighteval
|
77 |
TASKS_TABLE = [yourbench]
|
|
|
86 |
"custom|yourbench|0|0",
|
87 |
"--custom-tasks",
|
88 |
temp_file_path,
|
89 |
+
"--max-samples", "30",
|
90 |
"--output-dir", "data/lighteval_results",
|
91 |
# "--save-details",
|
92 |
"--no-push-to-hub"
|
backend/tasks/get_model_providers.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from huggingface_hub import model_info
|
2 |
-
PREFERRED_PROVIDERS = ["
|
3 |
|
4 |
def filter_providers(providers):
|
5 |
return [provider for provider in providers if provider in PREFERRED_PROVIDERS]
|
|
|
1 |
from huggingface_hub import model_info
|
2 |
+
PREFERRED_PROVIDERS = ["novita","sambanova"]
|
3 |
|
4 |
def filter_providers(providers):
|
5 |
return [provider for provider in providers if provider in PREFERRED_PROVIDERS]
|