tfrere commited on
Commit
f8ec36f
·
1 Parent(s): 2a8ebbd

update dockerfile

Browse files
Dockerfile CHANGED
@@ -32,17 +32,14 @@ RUN useradd -m -u 1000 user
32
  RUN mkdir -p /app/.cache && \
33
  chown -R user:user /app
34
 
35
- # Copy backend requirements
36
- COPY backend/pyproject.toml ./
37
 
38
  # Install all dependencies explicitly
39
  RUN pip install fastapi uvicorn
40
  # Install project dependencies
41
  RUN uv pip install -e . --system
42
 
43
- # Copy backend code
44
- COPY backend/ .
45
-
46
  # Copy frontend server and build
47
  COPY --from=frontend-build /app/build ./frontend/build
48
  COPY --from=frontend-build /app/package*.json ./frontend/
 
32
  RUN mkdir -p /app/.cache && \
33
  chown -R user:user /app
34
 
35
+ # Copy all backend code first
36
+ COPY backend/ .
37
 
38
  # Install all dependencies explicitly
39
  RUN pip install fastapi uvicorn
40
  # Install project dependencies
41
  RUN uv pip install -e . --system
42
 
 
 
 
43
  # Copy frontend server and build
44
  COPY --from=frontend-build /app/build ./frontend/build
45
  COPY --from=frontend-build /app/package*.json ./frontend/
backend/data/lighteval_results/lighteval_results.json CHANGED
@@ -1,27 +1,27 @@
1
  [
2
- {
3
- "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
4
- "provider": "sambanova",
5
- "accuracy": 1.0,
6
- "execution_time": 18.800472021102905,
7
- "status": "success"
8
- },
9
  {
10
  "model": "deepseek-ai/DeepSeek-V3-0324",
11
  "provider": "novita",
12
  "accuracy": 1.0,
13
- "execution_time": 34.95434904098511,
14
  "status": "success"
15
  },
16
  {
17
- "model": "Qwen/Qwen2.5-72B-Instruct",
18
  "provider": "sambanova",
19
  "accuracy": 0.0,
20
  "execution_time": 60.0,
21
  "status": "timeout"
22
  },
23
  {
24
- "model": "Qwen/QwQ-32B",
 
 
 
 
 
 
 
25
  "provider": "sambanova",
26
  "accuracy": 0.0,
27
  "execution_time": 60.0,
 
1
  [
 
 
 
 
 
 
 
2
  {
3
  "model": "deepseek-ai/DeepSeek-V3-0324",
4
  "provider": "novita",
5
  "accuracy": 1.0,
6
+ "execution_time": 54.32098197937012,
7
  "status": "success"
8
  },
9
  {
10
+ "model": "Qwen/QwQ-32B",
11
  "provider": "sambanova",
12
  "accuracy": 0.0,
13
  "execution_time": 60.0,
14
  "status": "timeout"
15
  },
16
  {
17
+ "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
18
+ "provider": "sambanova",
19
+ "accuracy": 0.0,
20
+ "execution_time": 60.0,
21
+ "status": "timeout"
22
+ },
23
+ {
24
+ "model": "Qwen/Qwen2.5-72B-Instruct",
25
  "provider": "sambanova",
26
  "accuracy": 0.0,
27
  "execution_time": 60.0,
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 15,
7
+ "job_id": 0,
8
+ "start_time": 190861.972782125,
9
+ "end_time": 190876.962226916,
10
+ "total_evaluation_time_secondes": "14.989444790990092",
11
+ "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
51
+ "hf_subset": "multi_hop_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 34,
87
+ "effective_num_docs": 15,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "97803694d4430d2d",
96
+ "hash_full_prompts": "3125bcda69618d2b",
97
+ "hash_input_tokens": "58ec870775e406f3",
98
+ "hash_cont_tokens": "58ec870775e406f3"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 15,
102
+ "padded": 0,
103
+ "non_padded": 15,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "13a4051f728a0e87",
111
+ "hash_full_prompts": "e18b288370ab6ae2",
112
+ "hash_input_tokens": "544d800a25dfd777",
113
+ "hash_cont_tokens": "544d800a25dfd777"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 15,
117
+ "padded": 0,
118
+ "non_padded": 15,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 15,
7
+ "job_id": 0,
8
+ "start_time": 190861.972804458,
9
+ "end_time": 190894.739973125,
10
+ "total_evaluation_time_secondes": "32.7671686669928",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
51
+ "hf_subset": "multi_hop_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 34,
87
+ "effective_num_docs": 15,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "97803694d4430d2d",
96
+ "hash_full_prompts": "3125bcda69618d2b",
97
+ "hash_input_tokens": "58ec870775e406f3",
98
+ "hash_cont_tokens": "58ec870775e406f3"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 15,
102
+ "padded": 0,
103
+ "non_padded": 15,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "13a4051f728a0e87",
111
+ "hash_full_prompts": "e18b288370ab6ae2",
112
+ "hash_input_tokens": "544d800a25dfd777",
113
+ "hash_cont_tokens": "544d800a25dfd777"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 15,
117
+ "padded": 0,
118
+ "non_padded": 15,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 30,
7
+ "job_id": 0,
8
+ "start_time": 190994.241279791,
9
+ "end_time": 191043.871577458,
10
+ "total_evaluation_time_secondes": "49.63029766699765",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
51
+ "hf_subset": "multi_hop_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 34,
87
+ "effective_num_docs": 30,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "1b5afc5f13827f79",
96
+ "hash_full_prompts": "cd8c39c007643835",
97
+ "hash_input_tokens": "79ab129e9a18c6d6",
98
+ "hash_cont_tokens": "79ab129e9a18c6d6"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 30,
102
+ "padded": 0,
103
+ "non_padded": 30,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b18e19e266a5bc51",
111
+ "hash_full_prompts": "1eaa15cbc4a17d04",
112
+ "hash_input_tokens": "05a66e44e190c178",
113
+ "hash_cont_tokens": "05a66e44e190c178"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 30,
117
+ "padded": 0,
118
+ "non_padded": 30,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": null,
6
+ "max_samples": 30,
7
+ "job_id": 0,
8
+ "start_time": 191195.945968041,
9
+ "end_time": 191244.057571,
10
+ "total_evaluation_time_secondes": "48.111602959019365",
11
+ "model_name": "deepseek-ai/DeepSeek-V3-0324",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": "",
15
+ "generation_parameters": {
16
+ "early_stopping": null,
17
+ "repetition_penalty": null,
18
+ "frequency_penalty": null,
19
+ "length_penalty": null,
20
+ "presence_penalty": null,
21
+ "max_new_tokens": null,
22
+ "min_new_tokens": null,
23
+ "seed": null,
24
+ "stop_tokens": null,
25
+ "temperature": null,
26
+ "top_k": null,
27
+ "min_p": null,
28
+ "top_p": null,
29
+ "truncate_prompt": null,
30
+ "response_format": null
31
+ }
32
+ },
33
+ "results": {
34
+ "custom|yourbench|0": {
35
+ "accuracy": 1.0,
36
+ "accuracy_stderr": 0.0
37
+ },
38
+ "all": {
39
+ "accuracy": 1.0,
40
+ "accuracy_stderr": 0.0
41
+ }
42
+ },
43
+ "versions": {
44
+ "custom|yourbench|0": 0
45
+ },
46
+ "config_tasks": {
47
+ "custom|yourbench": {
48
+ "name": "yourbench",
49
+ "prompt_function": "yourbench_prompt",
50
+ "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
51
+ "hf_subset": "multi_hop_questions",
52
+ "metric": [
53
+ {
54
+ "metric_name": [
55
+ "accuracy"
56
+ ],
57
+ "higher_is_better": {
58
+ "accuracy": true
59
+ },
60
+ "category": "7",
61
+ "use_case": "1",
62
+ "sample_level_fn": "compute",
63
+ "corpus_level_fn": {
64
+ "accuracy": "mean"
65
+ }
66
+ }
67
+ ],
68
+ "hf_revision": null,
69
+ "hf_filter": null,
70
+ "hf_avail_splits": [
71
+ "train"
72
+ ],
73
+ "trust_dataset": true,
74
+ "evaluation_splits": [
75
+ "train"
76
+ ],
77
+ "few_shots_split": null,
78
+ "few_shots_select": null,
79
+ "generation_size": 8192,
80
+ "generation_grammar": null,
81
+ "stop_sequence": [],
82
+ "num_samples": null,
83
+ "suite": [
84
+ "custom"
85
+ ],
86
+ "original_num_docs": 34,
87
+ "effective_num_docs": 30,
88
+ "must_remove_duplicate_docs": false,
89
+ "version": 0
90
+ }
91
+ },
92
+ "summary_tasks": {
93
+ "custom|yourbench|0": {
94
+ "hashes": {
95
+ "hash_examples": "1b5afc5f13827f79",
96
+ "hash_full_prompts": "cd8c39c007643835",
97
+ "hash_input_tokens": "79ab129e9a18c6d6",
98
+ "hash_cont_tokens": "79ab129e9a18c6d6"
99
+ },
100
+ "truncated": 0,
101
+ "non_truncated": 30,
102
+ "padded": 0,
103
+ "non_padded": 30,
104
+ "effective_few_shots": 0.0,
105
+ "num_truncated_few_shots": 0
106
+ }
107
+ },
108
+ "summary_general": {
109
+ "hashes": {
110
+ "hash_examples": "b18e19e266a5bc51",
111
+ "hash_full_prompts": "1eaa15cbc4a17d04",
112
+ "hash_input_tokens": "05a66e44e190c178",
113
+ "hash_cont_tokens": "05a66e44e190c178"
114
+ },
115
+ "truncated": 0,
116
+ "non_truncated": 30,
117
+ "padded": 0,
118
+ "non_padded": 30,
119
+ "num_truncated_few_shots": 0
120
+ }
121
+ }
backend/tasks/evaluationTask.py CHANGED
@@ -71,7 +71,7 @@ class EvaluationTask:
71
  from lighteval_task.lighteval_task import create_yourbench_task
72
 
73
  # Create yourbench task
74
- yourbench = create_yourbench_task("{dataset_name}", "single_shot_questions")
75
 
76
  # Define TASKS_TABLE needed by lighteval
77
  TASKS_TABLE = [yourbench]
@@ -86,7 +86,7 @@ TASKS_TABLE = [yourbench]
86
  "custom|yourbench|0|0",
87
  "--custom-tasks",
88
  temp_file_path,
89
- "--max-samples", "15",
90
  "--output-dir", "data/lighteval_results",
91
  # "--save-details",
92
  "--no-push-to-hub"
 
71
  from lighteval_task.lighteval_task import create_yourbench_task
72
 
73
  # Create yourbench task
74
+ yourbench = create_yourbench_task("{dataset_name}", "multi_hop_questions")
75
 
76
  # Define TASKS_TABLE needed by lighteval
77
  TASKS_TABLE = [yourbench]
 
86
  "custom|yourbench|0|0",
87
  "--custom-tasks",
88
  temp_file_path,
89
+ "--max-samples", "30",
90
  "--output-dir", "data/lighteval_results",
91
  # "--save-details",
92
  "--no-push-to-hub"
backend/tasks/get_model_providers.py CHANGED
@@ -1,5 +1,5 @@
1
  from huggingface_hub import model_info
2
- PREFERRED_PROVIDERS = ["sambanova", "novita"]
3
 
4
  def filter_providers(providers):
5
  return [provider for provider in providers if provider in PREFERRED_PROVIDERS]
 
1
  from huggingface_hub import model_info
2
+ PREFERRED_PROVIDERS = ["novita","sambanova"]
3
 
4
  def filter_providers(providers):
5
  return [provider for provider in providers if provider in PREFERRED_PROVIDERS]