Spaces:

yourbench
/

demo

Running on CPU Upgrade

App Files Community

tfrere committed on Mar 28

Commit

f8ec36f

1 Parent(s): 2a8ebbd

update dockerfile

Browse files

Files changed (8) hide show

Dockerfile +2 -5
backend/data/lighteval_results/lighteval_results.json +10 -10
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json +121 -0
backend/tasks/evaluationTask.py +2 -2
backend/tasks/get_model_providers.py +1 -1

Dockerfile CHANGED Viewed

@@ -32,17 +32,14 @@ RUN useradd -m -u 1000 user
 RUN mkdir -p /app/.cache && \
     chown -R user:user /app
-# Copy backend requirements
-COPY backend/pyproject.toml ./
 # Install all dependencies explicitly
 RUN pip install fastapi uvicorn
 # Install project dependencies
 RUN uv pip install -e . --system
-# Copy backend code
-COPY backend/ .
 # Copy frontend server and build
 COPY --from=frontend-build /app/build ./frontend/build
 COPY --from=frontend-build /app/package*.json ./frontend/

 RUN mkdir -p /app/.cache && \
     chown -R user:user /app
+# Copy all backend code first
+COPY backend/ .
 # Install all dependencies explicitly
 RUN pip install fastapi uvicorn
 # Install project dependencies
 RUN uv pip install -e . --system
 # Copy frontend server and build
 COPY --from=frontend-build /app/build ./frontend/build
 COPY --from=frontend-build /app/package*.json ./frontend/

backend/data/lighteval_results/lighteval_results.json CHANGED Viewed

@@ -1,27 +1,27 @@
 [
-  {
-    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
-    "provider": "sambanova",
-    "accuracy": 1.0,
-    "execution_time": 18.800472021102905,
-    "status": "success"
-  },
   {
     "model": "deepseek-ai/DeepSeek-V3-0324",
     "provider": "novita",
     "accuracy": 1.0,
-    "execution_time": 34.95434904098511,
     "status": "success"
   },
   {
-    "model": "Qwen/Qwen2.5-72B-Instruct",
     "provider": "sambanova",
     "accuracy": 0.0,
     "execution_time": 60.0,
     "status": "timeout"
   },
   {
-    "model": "Qwen/QwQ-32B",
     "provider": "sambanova",
     "accuracy": 0.0,
     "execution_time": 60.0,

 [
   {
     "model": "deepseek-ai/DeepSeek-V3-0324",
     "provider": "novita",
     "accuracy": 1.0,
+    "execution_time": 54.32098197937012,
     "status": "success"
   },
   {
+    "model": "Qwen/QwQ-32B",
     "provider": "sambanova",
     "accuracy": 0.0,
     "execution_time": 60.0,
     "status": "timeout"
   },
   {
+    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "provider": "sambanova",
+    "accuracy": 0.0,
+    "execution_time": 60.0,
+    "status": "timeout"
+  },
+  {
+    "model": "Qwen/Qwen2.5-72B-Instruct",
     "provider": "sambanova",
     "accuracy": 0.0,
     "execution_time": 60.0,

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 15,
+    "job_id": 0,
+    "start_time": 190861.972782125,
+    "end_time": 190876.962226916,
+    "total_evaluation_time_secondes": "14.989444790990092",
+    "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
+      "hf_subset": "multi_hop_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 34,
+      "effective_num_docs": 15,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "97803694d4430d2d",
+        "hash_full_prompts": "3125bcda69618d2b",
+        "hash_input_tokens": "58ec870775e406f3",
+        "hash_cont_tokens": "58ec870775e406f3"
+      },
+      "truncated": 0,
+      "non_truncated": 15,
+      "padded": 0,
+      "non_padded": 15,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "13a4051f728a0e87",
+      "hash_full_prompts": "e18b288370ab6ae2",
+      "hash_input_tokens": "544d800a25dfd777",
+      "hash_cont_tokens": "544d800a25dfd777"
+    },
+    "truncated": 0,
+    "non_truncated": 15,
+    "padded": 0,
+    "non_padded": 15,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 15,
+    "job_id": 0,
+    "start_time": 190861.972804458,
+    "end_time": 190894.739973125,
+    "total_evaluation_time_secondes": "32.7671686669928",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
+      "hf_subset": "multi_hop_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 34,
+      "effective_num_docs": 15,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "97803694d4430d2d",
+        "hash_full_prompts": "3125bcda69618d2b",
+        "hash_input_tokens": "58ec870775e406f3",
+        "hash_cont_tokens": "58ec870775e406f3"
+      },
+      "truncated": 0,
+      "non_truncated": 15,
+      "padded": 0,
+      "non_padded": 15,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "13a4051f728a0e87",
+      "hash_full_prompts": "e18b288370ab6ae2",
+      "hash_input_tokens": "544d800a25dfd777",
+      "hash_cont_tokens": "544d800a25dfd777"
+    },
+    "truncated": 0,
+    "non_truncated": 15,
+    "padded": 0,
+    "non_padded": 15,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 30,
+    "job_id": 0,
+    "start_time": 190994.241279791,
+    "end_time": 191043.871577458,
+    "total_evaluation_time_secondes": "49.63029766699765",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
+      "hf_subset": "multi_hop_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 34,
+      "effective_num_docs": 30,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "1b5afc5f13827f79",
+        "hash_full_prompts": "cd8c39c007643835",
+        "hash_input_tokens": "79ab129e9a18c6d6",
+        "hash_cont_tokens": "79ab129e9a18c6d6"
+      },
+      "truncated": 0,
+      "non_truncated": 30,
+      "padded": 0,
+      "non_padded": 30,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b18e19e266a5bc51",
+      "hash_full_prompts": "1eaa15cbc4a17d04",
+      "hash_input_tokens": "05a66e44e190c178",
+      "hash_cont_tokens": "05a66e44e190c178"
+    },
+    "truncated": 0,
+    "non_truncated": 30,
+    "padded": 0,
+    "non_padded": 30,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 30,
+    "job_id": 0,
+    "start_time": 191195.945968041,
+    "end_time": 191244.057571,
+    "total_evaluation_time_secondes": "48.111602959019365",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
+      "hf_subset": "multi_hop_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 34,
+      "effective_num_docs": 30,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "1b5afc5f13827f79",
+        "hash_full_prompts": "cd8c39c007643835",
+        "hash_input_tokens": "79ab129e9a18c6d6",
+        "hash_cont_tokens": "79ab129e9a18c6d6"
+      },
+      "truncated": 0,
+      "non_truncated": 30,
+      "padded": 0,
+      "non_padded": 30,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b18e19e266a5bc51",
+      "hash_full_prompts": "1eaa15cbc4a17d04",
+      "hash_input_tokens": "05a66e44e190c178",
+      "hash_cont_tokens": "05a66e44e190c178"
+    },
+    "truncated": 0,
+    "non_truncated": 30,
+    "padded": 0,
+    "non_padded": 30,
+    "num_truncated_few_shots": 0
+  }
+}

backend/tasks/evaluationTask.py CHANGED Viewed

@@ -71,7 +71,7 @@ class EvaluationTask:
 from lighteval_task.lighteval_task import create_yourbench_task
 # Create yourbench task
-yourbench = create_yourbench_task("{dataset_name}", "single_shot_questions")
 # Define TASKS_TABLE needed by lighteval
 TASKS_TABLE = [yourbench]
@@ -86,7 +86,7 @@ TASKS_TABLE = [yourbench]
             "custom|yourbench|0|0",
             "--custom-tasks",
             temp_file_path,
-            "--max-samples", "15",
             "--output-dir", "data/lighteval_results",
             # "--save-details",
             "--no-push-to-hub"

 from lighteval_task.lighteval_task import create_yourbench_task
 # Create yourbench task
+yourbench = create_yourbench_task("{dataset_name}", "multi_hop_questions")
 # Define TASKS_TABLE needed by lighteval
 TASKS_TABLE = [yourbench]
             "custom|yourbench|0|0",
             "--custom-tasks",
             temp_file_path,
+            "--max-samples", "30",
             "--output-dir", "data/lighteval_results",
             # "--save-details",
             "--no-push-to-hub"

backend/tasks/get_model_providers.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from huggingface_hub import model_info
-PREFERRED_PROVIDERS = ["sambanova", "novita"]
 def filter_providers(providers):
     return [provider for provider in providers if provider in PREFERRED_PROVIDERS]

 from huggingface_hub import model_info
+PREFERRED_PROVIDERS = ["novita","sambanova"]
 def filter_providers(providers):
     return [provider for provider in providers if provider in PREFERRED_PROVIDERS]