Spaces:

yourbench
/

demo

Running on CPU Upgrade

App Files Files Community

tfrere commited on Mar 28

Commit

2a8ebbd

1 Parent(s): ebdfd67

update on tasks

Browse files

Files changed (43) hide show

backend/data/lighteval_results/lighteval_results.json +30 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json +121 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json +121 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json +121 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json +121 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json +121 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json +121 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json +121 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json +121 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json +121 -0
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json +121 -0
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json +121 -0
backend/lighteval_task/__init__.py +3 -0
backend/{tasks/yourbench_lighteval_task.py → lighteval_task/lighteval_task.py} +36 -10
backend/pyproject.toml +6 -0
backend/routes/evaluation.py +42 -31
backend/tasks/createBench.py +1 -83
backend/tasks/createBenchConfigFile.py +4 -4
backend/tasks/evaluationTask.py +144 -405
backend/tasks/get_model_providers.py +29 -0
backend/test_import.py +5 -0
backend/yourbench_simple_demo.egg-info/PKG-INFO +18 -0
backend/yourbench_simple_demo.egg-info/SOURCES.txt +17 -0
backend/yourbench_simple_demo.egg-info/dependency_links.txt +1 -0
backend/yourbench_simple_demo.egg-info/requires.txt +13 -0
backend/yourbench_simple_demo.egg-info/top_level.txt +1 -0
frontend/src/components/BenchmarkDisplay.jsx +24 -21
frontend/src/components/BenchmarkEvaluation.jsx +42 -211
frontend/src/components/BenchmarkGenerator.jsx +2 -2
frontend/src/components/EvaluationDisplay.jsx +50 -46
frontend/src/components/ExternalLinks.jsx +33 -2
frontend/src/config/theme.js +1 -1
test_import.py +5 -0

backend/data/lighteval_results/lighteval_results.json ADDED Viewed

	@@ -0,0 +1,30 @@

+[
+  {
+    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "provider": "sambanova",
+    "accuracy": 1.0,
+    "execution_time": 18.800472021102905,
+    "status": "success"
+  },
+  {
+    "model": "deepseek-ai/DeepSeek-V3-0324",
+    "provider": "novita",
+    "accuracy": 1.0,
+    "execution_time": 34.95434904098511,
+    "status": "success"
+  },
+  {
+    "model": "Qwen/Qwen2.5-72B-Instruct",
+    "provider": "sambanova",
+    "accuracy": 0.0,
+    "execution_time": 60.0,
+    "status": "timeout"
+  },
+  {
+    "model": "Qwen/QwQ-32B",
+    "provider": "sambanova",
+    "accuracy": 0.0,
+    "execution_time": 60.0,
+    "status": "timeout"
+  }
+]

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 186274.866411583,
+    "end_time": 186322.987643416,
+    "total_evaluation_time_secondes": "48.12123183300719",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 186407.701185,
+    "end_time": 186447.883386625,
+    "total_evaluation_time_secondes": "40.18220162499347",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 186521.763833833,
+    "end_time": 186557.476439666,
+    "total_evaluation_time_secondes": "35.71260583298863",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 186704.883209333,
+    "end_time": 186743.215716791,
+    "total_evaluation_time_secondes": "38.332507457991596",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 187518.49620975,
+    "end_time": 187553.120908083,
+    "total_evaluation_time_secondes": "34.62469833297655",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 187690.771319041,
+    "end_time": 187724.908132583,
+    "total_evaluation_time_secondes": "34.136813541990705",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 187785.492066916,
+    "end_time": 187824.287589375,
+    "total_evaluation_time_secondes": "38.79552245899686",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 188300.087538958,
+    "end_time": 188337.230208583,
+    "total_evaluation_time_secondes": "37.142669624998234",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "7e34d82512ce6dfc",
+        "hash_full_prompts": "af7c42c6f40964e1",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "7cdb142c3142312a",
+      "hash_full_prompts": "a2e47b0b68e57792",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 188452.784089458,
+    "end_time": 188490.538178958,
+    "total_evaluation_time_secondes": "37.75408949999837",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 15,
+    "job_id": 0,
+    "start_time": 188674.734532375,
+    "end_time": 188715.337919458,
+    "total_evaluation_time_secondes": "40.60338708298514",
+    "model_name": "Qwen/Qwen2.5-72B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 15,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "35f5eef8199d4521",
+        "hash_full_prompts": "5590bc220414fefb",
+        "hash_input_tokens": "58ec870775e406f3",
+        "hash_cont_tokens": "58ec870775e406f3"
+      },
+      "truncated": 0,
+      "non_truncated": 15,
+      "padded": 0,
+      "non_padded": 15,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "bc7dfdffc5e53476",
+      "hash_full_prompts": "712fd00df902d786",
+      "hash_input_tokens": "544d800a25dfd777",
+      "hash_cont_tokens": "544d800a25dfd777"
+    },
+    "truncated": 0,
+    "non_truncated": 15,
+    "padded": 0,
+    "non_padded": 15,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 15,
+    "job_id": 0,
+    "start_time": 188674.734510208,
+    "end_time": 188690.205653,
+    "total_evaluation_time_secondes": "15.471142791997408",
+    "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 15,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "35f5eef8199d4521",
+        "hash_full_prompts": "5590bc220414fefb",
+        "hash_input_tokens": "58ec870775e406f3",
+        "hash_cont_tokens": "58ec870775e406f3"
+      },
+      "truncated": 0,
+      "non_truncated": 15,
+      "padded": 0,
+      "non_padded": 15,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "bc7dfdffc5e53476",
+      "hash_full_prompts": "712fd00df902d786",
+      "hash_input_tokens": "544d800a25dfd777",
+      "hash_cont_tokens": "544d800a25dfd777"
+    },
+    "truncated": 0,
+    "non_truncated": 15,
+    "padded": 0,
+    "non_padded": 15,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 186274.866369916,
+    "end_time": 186294.792813083,
+    "total_evaluation_time_secondes": "19.926443167001707",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 186407.701222875,
+    "end_time": 186427.871588083,
+    "total_evaluation_time_secondes": "20.170365208003204",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 186521.763754958,
+    "end_time": 186545.585271583,
+    "total_evaluation_time_secondes": "23.821516625001095",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 186704.882684291,
+    "end_time": 186723.820615833,
+    "total_evaluation_time_secondes": "18.937931542022852",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 187518.496174916,
+    "end_time": 187538.752125166,
+    "total_evaluation_time_secondes": "20.255950249993475",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 187690.771119125,
+    "end_time": 187715.172306583,
+    "total_evaluation_time_secondes": "24.40118745798827",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 187785.49207775,
+    "end_time": 187806.982701541,
+    "total_evaluation_time_secondes": "21.4906237910036",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 188300.087685291,
+    "end_time": 188324.829042291,
+    "total_evaluation_time_secondes": "24.7413570000208",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "7e34d82512ce6dfc",
+        "hash_full_prompts": "af7c42c6f40964e1",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "7cdb142c3142312a",
+      "hash_full_prompts": "a2e47b0b68e57792",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 5,
+    "job_id": 0,
+    "start_time": 188452.784059833,
+    "end_time": 188474.450274291,
+    "total_evaluation_time_secondes": "21.666214458004106",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 5,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "abaa6ef1f9715482",
+        "hash_full_prompts": "0b5eb6607b419659",
+        "hash_input_tokens": "bf9d9e969418cff7",
+        "hash_cont_tokens": "bf9d9e969418cff7"
+      },
+      "truncated": 0,
+      "non_truncated": 5,
+      "padded": 0,
+      "non_padded": 5,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b1bf475c2319e3b2",
+      "hash_full_prompts": "d860f90cd7291b63",
+      "hash_input_tokens": "5882dac673b9f859",
+      "hash_cont_tokens": "5882dac673b9f859"
+    },
+    "truncated": 0,
+    "non_truncated": 5,
+    "padded": 0,
+    "non_padded": 5,
+    "num_truncated_few_shots": 0
+  }
+}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 15,
+    "job_id": 0,
+    "start_time": 188674.734458958,
+    "end_time": 188711.276019958,
+    "total_evaluation_time_secondes": "36.54156099999091",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
+      "hf_subset": "single_shot_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 15,
+      "effective_num_docs": 15,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "35f5eef8199d4521",
+        "hash_full_prompts": "5590bc220414fefb",
+        "hash_input_tokens": "58ec870775e406f3",
+        "hash_cont_tokens": "58ec870775e406f3"
+      },
+      "truncated": 0,
+      "non_truncated": 15,
+      "padded": 0,
+      "non_padded": 15,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "bc7dfdffc5e53476",
+      "hash_full_prompts": "712fd00df902d786",
+      "hash_input_tokens": "544d800a25dfd777",
+      "hash_cont_tokens": "544d800a25dfd777"
+    },
+    "truncated": 0,
+    "non_truncated": 15,
+    "padded": 0,
+    "non_padded": 15,
+    "num_truncated_few_shots": 0
+  }
+}

backend/lighteval_task/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .lighteval_task import create_yourbench_task
2	+
3	+ __all__ = ["create_yourbench_task"]

backend/{tasks/yourbench_lighteval_task.py → lighteval_task/lighteval_task.py} RENAMED Viewed

@@ -136,10 +136,26 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
 def process_judge_response_yourbench(response):
     # extract the final answer using regex from the response xml
     try:
         # Essayer d'abord le format XML
-        match = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL)
         if match:
             answer_text = match.group(1).strip()
             # Convertir différents formats possibles en 0 ou 1
@@ -155,14 +171,16 @@ def process_judge_response_yourbench(response):
                 pass
         # Rechercher des mots-clés dans la réponse
-        if re.search(r"\b(correct|vrai|true|yes)\b", response, re.IGNORECASE):
             return 1
-        if re.search(r"\b(incorrect|faux|false|no)\b", response, re.IGNORECASE):
             return 0
-        logger.warning(f"Réponse du juge non reconnue, retournant 0 par défaut: {response[:100]}...")
     except Exception as e:
         logger.error(f"Error processing judge response: {e}")
     return 0
@@ -185,10 +203,18 @@ class JudgeLLMYourBench(JudgeLLM):
         chunks = [formatted_doc.specific["chunks"][0] for formatted_doc in formatted_docs]
         documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
         score, _, _ = self.judge.evaluate_answer_batch(
             questions, predictions, options, golds, chunks=chunks, documents=documents
         )
         metrics = []
         for i in range(len(sample_ids)):
             metrics.append(
@@ -214,17 +240,17 @@ def yourbench_prompt(line, task_name: str = ""):
     return Doc(
         task_name=task_name,
         query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"]),
-        choices=[line["ground_truth_answer"]],
         gold_index=0,
         specific={
-            "question_category": line["question_category"],
-            "kind": line["kind"],
             "estimated_difficulty": line["estimated_difficulty"],
             "document_id": line["document_id"],
-            "question_generating_model": line["question_generating_model"],
-            "chunks": line["chunks"],
             "question": line["question"],
-            "document": line["document"],
         },
     )

 def process_judge_response_yourbench(response):
+    # Si la réponse est un dictionnaire, extraire le contenu
+    if isinstance(response, dict):
+        if "content" in response:
+            response = response["content"]
+        elif "text" in response:
+            response = response["text"]
+        elif "response" in response:
+            response = response["response"]
+        else:
+            # Si on ne trouve pas de champ texte, on prend la première valeur
+            response = str(list(response.values())[0])
+    # Si la réponse est une liste, prendre le premier élément
+    if isinstance(response, list):
+        response = response[0]
     # extract the final answer using regex from the response xml
     try:
         # Essayer d'abord le format XML
+        match = re.search(r"<final_answer>(.*?)</final_answer>", str(response), re.DOTALL)
         if match:
             answer_text = match.group(1).strip()
             # Convertir différents formats possibles en 0 ou 1
                 pass
         # Rechercher des mots-clés dans la réponse
+        if re.search(r"\b(correct|vrai|true|yes)\b", str(response), re.IGNORECASE):
             return 1
+        if re.search(r"\b(incorrect|faux|false|no)\b", str(response), re.IGNORECASE):
             return 0
+        logger.warning(f"Réponse du juge non reconnue, retournant 0 par défaut: {str(response)[:100]}...")
     except Exception as e:
         logger.error(f"Error processing judge response: {e}")
+        logger.error(f"Response type: {type(response)}")
+        logger.error(f"Response content: {response}")
     return 0
         chunks = [formatted_doc.specific["chunks"][0] for formatted_doc in formatted_docs]
         documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
+        # Ajout de logs pour déboguer
+        logger.info(f"Questions: {questions}")
+        logger.info(f"Predictions: {predictions}")
+        logger.info(f"Golds: {golds}")
         score, _, _ = self.judge.evaluate_answer_batch(
             questions, predictions, options, golds, chunks=chunks, documents=documents
         )
+        # Ajout de logs pour déboguer
+        logger.info(f"Scores: {score}")
         metrics = []
         for i in range(len(sample_ids)):
             metrics.append(
     return Doc(
         task_name=task_name,
         query=ZEROSHOT_QA_USER_PROMPT.format(question=line["question"]),
+        choices=[line["self_answer"]],
         gold_index=0,
         specific={
+            "question_category": line["self_assessed_question_type"],
+            "kind": "qa",
             "estimated_difficulty": line["estimated_difficulty"],
             "document_id": line["document_id"],
+            "question_generating_model": line["generating_model"],
+            "chunks": line["citations"],
             "question": line["question"],
+            "document": line["raw_response"],
         },
     )

backend/pyproject.toml CHANGED Viewed

@@ -20,6 +20,9 @@ dependencies = [
     "lighteval[math]>=0.8.0",
     "huggingface-hub>=0.22.0",
     "python-multipart>=0.0.5",
 ]
 [build-system]
@@ -46,3 +49,6 @@ quote-style = "double"
 indent-style = "space"
 skip-magic-trailing-comma = false
 line-ending = "auto"

     "lighteval[math]>=0.8.0",
     "huggingface-hub>=0.22.0",
     "python-multipart>=0.0.5",
+    "fastapi>=0.110.0",
+    "uvicorn>=0.29.0",
+    "pydantic>=2.6.0",
 ]
 [build-system]
 indent-style = "space"
 skip-magic-trailing-comma = false
 line-ending = "auto"
+[tool.setuptools]
+packages = ["lighteval_task"]

backend/routes/evaluation.py CHANGED Viewed

@@ -2,6 +2,9 @@ from fastapi import APIRouter, HTTPException
 from typing import Dict, Any
 import os
 from tasks.evaluationTask import EvaluationTask
 router = APIRouter(tags=["evaluation"])
@@ -41,7 +44,7 @@ async def evaluate_benchmark(data: Dict[str, Any]):
     try:
         # Nom du dataset basé sur l'ID de session
-        dataset_name = f"yourbench_{session_id}"
         # Créer et démarrer une nouvelle tâche d'évaluation
         evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
@@ -105,44 +108,52 @@ async def get_evaluation_results(session_id: str):
     Returns:
         Dictionary with evaluation results
     """
-    # First, check if the task is in memory
-    if session_id in active_evaluation_tasks:
-        evaluation_task = active_evaluation_tasks[session_id]
-        if not evaluation_task.is_task_completed():
-            return {
-                "success": False,
-                "message": "Evaluation is still in progress"
             }
-        if hasattr(evaluation_task, 'results') and evaluation_task.results:
             return {
                 "success": True,
-                "results": evaluation_task.results
             }
-    # If we get here, either the task is not in memory or it doesn't have results
-    # Try to load results from file
-    try:
-        # Construct the path to the results file
-        results_path = f"uploaded_files/{session_id}/lighteval_results/models_comparison.json"
-        # Check if the file exists
-        if not os.path.exists(results_path):
             return {
                 "success": False,
-                "message": "No evaluation results found for this session"
             }
-        # Read the file
-        import json
-        with open(results_path, 'r') as f:
-            results = json.load(f)
-        return {
-            "success": True,
-            "results": results
-        }
     except Exception as e:
         return {
             "success": False,

 from typing import Dict, Any
 import os
 from tasks.evaluationTask import EvaluationTask
+from huggingface_hub import hf_hub_download
+import json
+from datetime import datetime
 router = APIRouter(tags=["evaluation"])
     try:
         # Nom du dataset basé sur l'ID de session
+        dataset_name = f"yourbench/yourbench_{session_id}"
         # Créer et démarrer une nouvelle tâche d'évaluation
         evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
     Returns:
         Dictionary with evaluation results
     """
+    try:
+        # Get organization from environment
+        organization = os.getenv("HF_ORGANIZATION", "yourbench")
+        dataset_name = f"{organization}/yourbench_{session_id}"
+        # Try to load results from the Hub
+        try:
+            results_file = hf_hub_download(
+                repo_id=dataset_name,
+                repo_type="dataset",
+                filename="lighteval_results.json"
+            )
+            with open(results_file) as f:
+                results = json.load(f)
+            # Format results to match the expected format
+            formatted_results = {
+                "metadata": {
+                    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    "total_models_tested": len(results),
+                    "successful_tests": len([r for r in results if r["status"] == "success"])
+                },
+                "models_comparison": [
+                    {
+                        "model_name": result["model"],
+                        "provider": result["provider"],
+                        "success": result["status"] == "success",
+                        "accuracy": result["accuracy"],
+                        "evaluation_time": result["execution_time"],
+                        "error": result["status"] if result["status"] != "success" else None
+                    }
+                    for result in results
+                ]
             }
             return {
                 "success": True,
+                "results": formatted_results
             }
+        except Exception as e:
             return {
                 "success": False,
+                "message": f"Failed to load results from Hub: {str(e)}"
             }
     except Exception as e:
         return {
             "success": False,

backend/tasks/createBench.py CHANGED Viewed

@@ -234,86 +234,4 @@ class CreateBenchTask:
         except Exception as e:
             self._add_log(f"[ERROR] Error starting ingestion process: {str(e)}")
             self.is_completed = True
-    def _simulate_ingestion_process(self) -> None:
-        """
-        Simulate the ingestion process for testing/development
-        This will be removed in production
-        """
-        # This method is just to simulate logs during development
-        # It will be removed in production
-        threading.Thread(target=self._simulate_logs).start()
-    def _simulate_logs(self) -> None:
-        """
-        Simulate logs for testing/development
-        This will be used when yourbench isn't installed or in development mode
-        """
-        # Log simulation (used when yourbench is not available)
-        self._add_log("[INFO] Simulation mode enabled (yourbench is not actually running)")
-        # Get filenames from source directory
-        source_files = []
-        try:
-            with open(self.config_path, 'r') as f:
-                config_yaml = yaml.safe_load(f)
-            source_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("source_documents_dir", "")
-            if source_dir and os.path.exists(source_dir):
-                source_files = [f for f in os.listdir(source_dir)
-                               if os.path.isfile(os.path.join(source_dir, f))]
-        except Exception:
-            source_files = ["document.pdf", "document.txt"]  # Fallback
-        # Create output directory if it doesn't exist
-        output_dir = ""
-        try:
-            output_dir = config_yaml.get("pipeline", {}).get("ingestion", {}).get("output_dir", "")
-            if output_dir:
-                os.makedirs(output_dir, exist_ok=True)
-        except Exception:
-            pass
-        # Simulate file processing
-        time.sleep(1)
-        self._add_log("[INFO] Initializing document ingestion")
-        time.sleep(1.5)
-        self._add_log("[INFO] Loading configuration parameters")
-        time.sleep(1)
-        self._add_log("[INFO] Verifying source files")
-        # Process each file
-        for file in source_files:
-            time.sleep(1.5)
-            self._add_log(f"[INFO] Processing file: {file}")
-            time.sleep(2)
-            self._add_log(f"[INFO] Extracting content from {file}")
-            time.sleep(1.5)
-            self._add_log(f"[INFO] Converting to markdown: {file}")
-            # Create a simulated markdown file if an output directory is defined
-            if output_dir:
-                base_name = os.path.splitext(file)[0]
-                output_file = os.path.join(output_dir, f"{base_name}.md")
-                try:
-                    with open(output_file, 'w') as f:
-                        f.write(f"# {base_name}\n\n")
-                        f.write("This is a markdown document automatically generated by the simulation.\n\n")
-                        f.write("## Section 1\n\n")
-                        f.write("Content of section 1...\n\n")
-                        f.write("## Section 2\n\n")
-                        f.write("Content of section 2...\n\n")
-                    self._add_log(f"[INFO] Markdown file created: {output_file}")
-                except Exception as e:
-                    self._add_log(f"[ERROR] Error creating markdown file: {str(e)}")
-        time.sleep(2)
-        self._add_log("[INFO] Finalizing processing")
-        time.sleep(1)
-        self._add_log("[SUCCESS] Stage completed: ingestion")
-        time.sleep(0.5)
-        self._add_log("[SUCCESS] Ingestion completed successfully")
-        # Mark task as completed
-        self.is_completed = True

         except Exception as e:
             self._add_log(f"[ERROR] Error starting ingestion process: {str(e)}")
             self.is_completed = True

backend/tasks/createBenchConfigFile.py CHANGED Viewed

@@ -145,15 +145,15 @@ class CreateBenchConfigTask:
                         "tau_threshold": 0.8,
                         "h_min": 2,
                         "h_max": 5,
-                        "num_multihops_factor": 2,
                     },
                 },
                 "single_shot_question_generation": {
-                    "run": True,
                     "additional_instructions": "Generate questions to test a curious adult",
                     "chunk_sampling": {
                         "mode": "count",
-                        "value": 5,
                         "random_seed": 123,
                     },
                 },
@@ -167,7 +167,7 @@ class CreateBenchConfigTask:
                     },
                 },
                 "lighteval": {
-                    "run": True,
                 },
             },
         }

                         "tau_threshold": 0.8,
                         "h_min": 2,
                         "h_max": 5,
+                        "num_multihops_factor": 1,
                     },
                 },
                 "single_shot_question_generation": {
+                    "run": False,
                     "additional_instructions": "Generate questions to test a curious adult",
                     "chunk_sampling": {
                         "mode": "count",
+                        "value": 10,
                         "random_seed": 123,
                     },
                 },
                     },
                 },
                 "lighteval": {
+                    "run": False,
                 },
             },
         }

backend/tasks/evaluationTask.py CHANGED Viewed

@@ -1,25 +1,22 @@
 """
-Task to evaluate models on a YourbBench dataset using LightEval
 """
 import os
-import sys
-import json
 import time
 import tempfile
-import asyncio
-import threading
 from pathlib import Path
-from typing import Optional, List, Dict, Any, Tuple
-from loguru import logger
-from huggingface_hub import HfApi, CommitOperationAdd
-from tasks.yourbench_lighteval_task import create_yourbench_task
 class EvaluationTask:
     """
-    Task to evaluate models using LightEval on a YourbBench dataset
     """
     def __init__(self, session_uid: str, dataset_name: str):
@@ -32,440 +29,182 @@ class EvaluationTask:
         """
         self.session_uid = session_uid
         self.dataset_name = dataset_name
-        self.logs: List[str] = []
         self.is_completed = False
-        self.organization = os.getenv("HF_ORGANIZATION", "yourbench")
-        self.results: Dict[str, Any] = {}
-        self.output_dir = f"uploaded_files/{session_uid}/lighteval_results"
-        # Models to evaluate - can be modified to allow customization
-        self.models = [
-            ("Qwen/Qwen2.5-72B-Instruct", "novita"),
-            ("Qwen/QwQ-32B", "novita"),
-        ]
-        self._add_log("[INFO] Initializing evaluation task")
-        self._add_log(f"[INFO] Dataset to evaluate: {self.organization}/{dataset_name}")
-        self._add_log(f"[INFO] Output directory: {self.output_dir}")
-    def _add_log(self, message: str) -> None:
-        """
-        Add a log message to the logs list
-        Args:
-            message: Log message to add
-        """
-        if message not in self.logs:  # Avoid duplicates
-            self.logs.append(message)
-            # Force copy of the list to avoid reference problems
-            self.logs = self.logs.copy()
-            # Record in system logs
-            logger.info(f"[{self.session_uid}] {message}")
-    def get_logs(self) -> List[str]:
-        """
-        Get all logs for this task
-        Returns:
-            List of log messages
-        """
-        return self.logs.copy()  # Retourner une copie pour éviter les problèmes de référence
-    def is_task_completed(self) -> bool:
-        """
-        Check if the task is completed
-        Returns:
-            True if completed, False otherwise
         """
-        return self.is_completed
-    async def _evaluate_model(self, model_info: Tuple[str, str]) -> Dict[str, Any]:
         """
-        Evaluate a specific model
-        Args:
-            model_info: Tuple of (model_name, provider)
-        Returns:
-            Dictionary with evaluation results
-        """
-        model_name, provider = model_info
-        self._add_log(f"[INFO] Starting evaluation for {model_name} with {provider}")
-        # Create output directory
-        os.makedirs(self.output_dir, exist_ok=True)
-        # Define full dataset path
-        dataset_path = f"{self.organization}/{self.dataset_name}"
-        # Create temporary file
         temp_file_path = tempfile.mktemp(suffix=".py")
-        self._add_log(f"[INFO] Creating temporary file for {model_name}: {temp_file_path}")
         with open(temp_file_path, 'w') as temp_file:
             temp_file.write(f"""
-import os
-import sys
-sys.path.append("{os.getcwd()}")
-from tasks.yourbench_lighteval_task import create_yourbench_task
 # Create yourbench task
-yourbench = create_yourbench_task("{dataset_path}", "lighteval")
 # Define TASKS_TABLE needed by lighteval
 TASKS_TABLE = [yourbench]
 """)
-        # Build lighteval command args
         cmd_args = [
             "lighteval",
-            "endpoint",
             "inference-providers",
             f"model={model_name},provider={provider}",
             "custom|yourbench|0|0",
             "--custom-tasks",
             temp_file_path,
-            "--max-samples", "5",
-            "--output-dir", self.output_dir,
-            "--save-details",
             "--no-push-to-hub"
         ]
-        self._add_log(f"[INFO] Running command for {model_name}: {' '.join(cmd_args)}")
-        results = {
-            "model_name": model_name,
-            "provider": provider,
-            "success": False,
-            "error": None,
-            "results": None,
-            "return_code": None
-        }
         try:
-            # Prepare environment with needed tokens
-            env = os.environ.copy()
-            hf_token = os.getenv("HF_TOKEN")
-            if hf_token:
-                env["HF_TOKEN"] = hf_token
-                env["HUGGING_FACE_HUB_TOKEN"] = hf_token
-                env["HF_ORGANIZATION"] = self.organization
-            # Run the process asynchronously
-            process = await asyncio.create_subprocess_exec(
-                *cmd_args,
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE,
-                env=env
-            )
-            # Wait for the process to complete
-            stdout, stderr = await process.communicate()
-            # Store return code
-            exit_code = process.returncode
-            results["return_code"] = exit_code
-            # Log output
-            if stdout:
-                stdout_lines = stdout.decode().strip().split('\n')
-                for line in stdout_lines[:5]:  # Log only first 5 lines
-                    self._add_log(f"[INFO] {model_name} - {line}")
-            # Log errors if any
-            if stderr and exit_code != 0:
-                stderr_lines = stderr.decode().strip().split('\n')
-                for line in stderr_lines[:5]:  # Log only first 5 lines
-                    self._add_log(f"[ERROR] {model_name} - {line}")
-            # Find any JSON result files - LightEval organizes by model name in different ways
-            result_files = []
-            results_dir = Path(self.output_dir) / "results"
-            if results_dir.exists():
-                # Parcourir récursivement tous les répertoires pour trouver des fichiers JSON
-                for json_file in results_dir.glob("**/*.json"):
-                    # Check if the filename or path contains parts of the model name
-                    model_parts = [
-                        model_name,  # Full name
-                        model_name.replace('/', '_'),  # Name with / replaced by _
-                        model_name.split('/')[-1]  # Just the model name without the organization
-                    ]
-                    if any(part in str(json_file) for part in model_parts):
-                        result_files.append(json_file)
-            # Traiter les fichiers de résultats trouvés
-            if result_files:
-                # Prendre le fichier le plus récent
-                result_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
-                latest_result = result_files[0]
-                self._add_log(f"[INFO] {model_name} - Found result file: {latest_result}")
-                try:
-                    with open(latest_result, 'r') as f:
-                        test_results = json.load(f)
-                    # Vérifier si les résultats contiennent les informations essentielles
-                    if (test_results and
-                        isinstance(test_results, dict) and
-                        "results" in test_results and
-                        "all" in test_results["results"]):
-                        # Enregistrer les résultats
-                        results["results"] = test_results
-                        results["success"] = True
-                        # Afficher la précision
-                        accuracy = test_results["results"]["all"]["accuracy"]
-                        accuracy_stderr = test_results["results"]["all"]["accuracy_stderr"]
-                        self._add_log(f"[SUCCESS] {model_name} - Accuracy: {accuracy:.4f} ± {accuracy_stderr:.4f}")
-                    else:
-                        results["error"] = "Incomplete or unexpected result format"
-                        self._add_log(f"[WARNING] {model_name} - Unexpected result format")
-                except (json.JSONDecodeError, KeyError) as e:
-                    results["error"] = f"Error reading results: {str(e)}"
-                    self._add_log(f"[ERROR] {model_name} - {results['error']}")
-            # Si aucun résultat trouvé
-            if not results["success"]:
-                if exit_code == 0:
-                    results["error"] = "Execution completed without error but no results found"
-                    self._add_log(f"[WARNING] {model_name} - {results['error']}")
-                else:
-                    results["error"] = f"Execution error (code: {exit_code})"
-                    self._add_log(f"[ERROR] {model_name} - {results['error']}")
         except Exception as e:
-            results["error"] = f"Exception: {str(e)}"
-            self._add_log(f"[ERROR] Exception during evaluation of {model_name}: {str(e)}")
-        finally:
-            # Delete temporary file
-            try:
-                os.unlink(temp_file_path)
-            except:
-                pass
-        return results
-    async def _run_evaluations(self) -> List[Dict[str, Any]]:
         """
-        Run evaluations for all models
         Returns:
-            List of evaluation results
         """
-        self._add_log(f"[INFO] Starting evaluations for {len(self.models)} models")
-        # Create tasks for each model
-        tasks = [self._evaluate_model(model) for model in self.models]
-        # Run all tasks concurrently and gather results
-        model_results = await asyncio.gather(*tasks, return_exceptions=True)
-        # Process results
-        results = []
-        for i, result in enumerate(model_results):
-            if isinstance(result, Exception):
-                # Handle exception
-                model_name, provider = self.models[i]
-                self._add_log(f"[ERROR] Evaluation failed for {model_name}: {str(result)}")
-                results.append({
-                    "model_name": model_name,
-                    "provider": provider,
-                    "success": False,
-                    "error": str(result),
-                    "results": None,
-                    "return_code": None
-                })
-            else:
-                # Valid result
-                results.append(result)
-        return results
-    def _format_comparison_results(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
         """
-        Format results for easy comparison between models
-        Args:
-            results: List of evaluation results
         Returns:
-            Dictionary with formatted comparison results
         """
-        comparison = {
-            "metadata": {
-                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-                "dataset": f"{self.organization}/{self.dataset_name}",
-                "total_models_tested": len(results),
-                "successful_tests": len([r for r in results if r["success"]])
-            },
-            "models_comparison": []
-        }
-        # Liste des modèles réussis et des modèles échoués
-        successful_models = [r for r in results if r["success"]]
-        failed_models = [r for r in results if not r["success"]]
-        # Trier les modèles réussis par précision (du plus précis au moins précis)
-        if successful_models:
-            sorted_successful = sorted(
-                successful_models,
-                key=lambda x: x["results"]["results"]["all"]["accuracy"],
-                reverse=True  # Du plus grand au plus petit
-            )
-        else:
-            sorted_successful = []
-        # Trier les modèles échoués par nom
-        sorted_failed = sorted(failed_models, key=lambda x: x["model_name"])
-        # Concaténer: d'abord les réussites, puis les échecs
-        sorted_results = sorted_successful + sorted_failed
-        # Créer l'entrée pour chaque modèle
-        for result in sorted_results:
-            model_result = {
-                "model_name": result["model_name"],
-                "provider": result["provider"],
-                "success": result["success"]
-            }
-            if result["success"]:
-                # Ajouter les métriques de précision et temps d'exécution
-                model_result.update({
-                    "accuracy": result["results"]["results"]["all"]["accuracy"],
-                    "accuracy_stderr": result["results"]["results"]["all"]["accuracy_stderr"],
-                    "evaluation_time": float(result["results"]["config_general"]["total_evaluation_time_secondes"])
-                })
-            else:
-                # Ajouter l'erreur
-                model_result["error"] = result.get("error", "Unknown reason")
-            comparison["models_comparison"].append(model_result)
-        return comparison
-    async def _upload_results_to_dataset(self, comparison_results: Dict[str, Any]) -> bool:
         """
-        Upload evaluation results to the HuggingFace dataset
-        Args:
-            comparison_results: The formatted comparison results
         Returns:
-            bool: True if upload succeeded, False otherwise
-        """
-        try:
-            # Create a timestamp for the results file
-            timestamp = time.strftime("%Y%m%d_%H%M%S")
-            result_filename = f"lighteval_results.json"
-            # Create temporary file for upload
-            temp_file_path = tempfile.mktemp(suffix=".json")
-            with open(temp_file_path, 'w') as f:
-                json.dump(comparison_results, f, indent=2)
-            # Initialize HF API
-            hf_token = os.getenv("HF_TOKEN")
-            if not hf_token:
-                self._add_log("[ERROR] HF_TOKEN not found, cannot upload results to dataset")
-                return False
-            api = HfApi(token=hf_token)
-            dataset_id = f"{self.organization}/{self.dataset_name}"
-            # Prepare the file operation
-            operation = CommitOperationAdd(
-                path_in_repo=f"lighteval_results/{result_filename}",
-                path_or_fileobj=temp_file_path
-            )
-            # Upload the file
-            self._add_log(f"[INFO] Uploading results to dataset {dataset_id}")
-            api.create_commit(
-                repo_id=dataset_id,
-                repo_type="dataset",
-                operations=[operation],
-                commit_message=f"Add evaluation results from {timestamp}"
-            )
-            # Cleanup temporary file
-            os.unlink(temp_file_path)
-            self._add_log(f"[SUCCESS] Results uploaded to dataset {dataset_id} at lighteval_results/{result_filename}")
-            return True
-        except Exception as e:
-            self._add_log(f"[ERROR] Failed to upload results to dataset: {str(e)}")
-            return False
-    async def _process_evaluation_results(self, results: List[Dict[str, Any]]) -> None:
-        """
-        Process evaluation results, create summaries and save files
-        Args:
-            results: List of evaluation results
-        """
-        if results:
-            try:
-                # Save detailed results
-                detailed_output_file = f"{self.output_dir}/detailed_results.json"
-                os.makedirs(os.path.dirname(detailed_output_file), exist_ok=True)
-                with open(detailed_output_file, 'w') as f:
-                    json.dump(results, f, indent=2)
-                self._add_log(f"[INFO] Detailed results saved in {detailed_output_file}")
-                # Generate and save comparison results
-                comparison = self._format_comparison_results(results)
-                comparison_file = f"{self.output_dir}/models_comparison.json"
-                with open(comparison_file, 'w') as f:
-                    json.dump(comparison, f, indent=2)
-                self._add_log(f"[INFO] Models comparison saved in {comparison_file}")
-                # Upload results to the dataset
-                await self._upload_results_to_dataset(comparison)
-                # Store results for later access
-                self.results = comparison
-                self._add_log("[SUCCESS] Evaluation completed")
-            except Exception as e:
-                self._add_log(f"[ERROR] Error during evaluation execution: {str(e)}")
-            finally:
-                self.is_completed = True
-    def _async_run(self) -> None:
-        """
-        Run the evaluation asynchronously
         """
-        async def run_async():
-            try:
-                # Run evaluations
-                results = await self._run_evaluations()
-                # Process evaluation results
-                await self._process_evaluation_results(results)
-            except Exception as e:
-                self._add_log(f"[ERROR] Error during evaluation execution: {str(e)}")
-            finally:
-                self.is_completed = True
-        # Create and run the asyncio event loop
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        loop.run_until_complete(run_async())
-        loop.close()
     def run(self) -> None:
         """
-        Run the evaluation task in a separate thread
         """
-        self._add_log("[INFO] Starting evaluation")
-        # Run in a separate thread to not block the main thread
-        thread = threading.Thread(target=self._async_run)
-        thread.daemon = True
-        thread.start()

 """
+Task to run evaluation using lighteval
 """
 import os
 import time
+import subprocess
 import tempfile
 from pathlib import Path
+import concurrent.futures
+from dotenv import load_dotenv
+from datetime import datetime
+import json
+from typing import List, Dict
+from tasks.get_model_providers import get_model_providers
+from huggingface_hub import HfApi
 class EvaluationTask:
     """
+    Task to run evaluation using lighteval
     """
     def __init__(self, session_uid: str, dataset_name: str):
         """
         self.session_uid = session_uid
         self.dataset_name = dataset_name
         self.is_completed = False
+        self.results = []
+        self.hf_api = HfApi()
+    def _save_results_to_hub(self) -> None:
         """
+        Save evaluation results to the dataset on the Hub
         """
+        try:
+            # Create results directory if it doesn't exist
+            results_dir = Path("data/lighteval_results")
+            results_dir.mkdir(parents=True, exist_ok=True)
+            # Save results to JSON file
+            results_file = results_dir / "lighteval_results.json"
+            with open(results_file, "w") as f:
+                json.dump(self.results, f, indent=2)
+            # Push to Hub
+            self.hf_api.upload_file(
+                path_or_fileobj=str(results_file),
+                path_in_repo="lighteval_results.json",
+                repo_id=self.dataset_name,
+                repo_type="dataset",
+                commit_message="Add lighteval evaluation results"
+            )
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Results saved to Hub at {self.dataset_name}/lighteval_results.json")
+        except Exception as e:
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to save results to Hub: {str(e)}")
+    def _run_lighteval(self, model_name: str, provider: str, dataset_name: str) -> dict:
+        start_time = time.time()
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation with {provider} provider for {model_name}")
+        # Create temporary task file
         temp_file_path = tempfile.mktemp(suffix=".py")
         with open(temp_file_path, 'w') as temp_file:
             temp_file.write(f"""
+from lighteval_task.lighteval_task import create_yourbench_task
 # Create yourbench task
+yourbench = create_yourbench_task("{dataset_name}", "single_shot_questions")
 # Define TASKS_TABLE needed by lighteval
 TASKS_TABLE = [yourbench]
 """)
+        # LightEval command
         cmd_args = [
             "lighteval",
+            "endpoint",
             "inference-providers",
             f"model={model_name},provider={provider}",
             "custom|yourbench|0|0",
             "--custom-tasks",
             temp_file_path,
+            "--max-samples", "15",
+            "--output-dir", "data/lighteval_results",
+            # "--save-details",
             "--no-push-to-hub"
         ]
         try:
+            # Run the command with environment variables and timeout of 60 seconds
+            subprocess.run(cmd_args, env=os.environ, timeout=60)
+        except subprocess.TimeoutExpired:
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
+            return {
+                "model": model_name,
+                "provider": provider,
+                "accuracy": 0.0,
+                "execution_time": 60.0,
+                "status": "timeout"
+            }
+        # Calculate execution time
+        execution_time = time.time() - start_time
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")
+        # Clean up
+        os.unlink(temp_file_path)
+        try:
+            # Get results from the output file
+            results_dir = Path("data/lighteval_results/results") / model_name.replace("/", "/")
+            results_file = next(results_dir.glob("results_*.json"))
+            with open(results_file) as f:
+                results = json.load(f)
+                accuracy = results["results"]["all"]["accuracy"]
+            return {
+                "model": model_name,
+                "provider": provider,
+                "accuracy": accuracy,
+                "execution_time": execution_time,
+                "status": "success"
+            }
         except Exception as e:
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
+            return {
+                "model": model_name,
+                "provider": provider,
+                "accuracy": 0.0,
+                "execution_time": execution_time,
+                "status": "parse_error"
+            }
+    def run_parallel(self) -> List[Dict]:
         """
+        Run the evaluation task with multiple models in parallel using ProcessPoolExecutor
         Returns:
+            List of results for each model
         """
+        # Start global timer
+        script_start_time = time.time()
+        # Load environment variables
+        load_dotenv()
+        # Models to evaluate
+        models = [
+            "Qwen/QwQ-32B",
+            "Qwen/Qwen2.5-72B-Instruct",
+            "deepseek-ai/DeepSeek-V3-0324",
+            "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+        ]
+        # Get providers for each model
+        model_providers = get_model_providers(models)
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")
+        # Run evaluations in parallel using ProcessPoolExecutor
+        with concurrent.futures.ProcessPoolExecutor() as executor:
+            futures = [
+                executor.submit(self._run_lighteval, model_name, providers[0], self.dataset_name)
+                for model_name, providers in model_providers
+                if providers  # Only run if providers are available
+            ]
+            self.results = [future.result() for future in concurrent.futures.as_completed(futures)]
+        # Calculate total script execution time
+        total_time = time.time() - script_start_time
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")
+        # Save results to Hub
+        self._save_results_to_hub()
+        # Mark the task as completed
+        self.is_completed = True
+        return self.results
+    def get_logs(self) -> List[str]:
         """
+        Get logs for this task (empty list since we don't track logs anymore)
         Returns:
+            Empty list of logs
         """
+        return []
+    def is_task_completed(self) -> bool:
         """
+        Check if the task is completed
         Returns:
+            True if completed, False otherwise
         """
+        return self.is_completed
     def run(self) -> None:
         """
+        Run the evaluation task (wrapper around run_parallel)
+        Blocks until run_parallel returns and discards its result list;
+        read self.results afterwards to get the per-model outcomes.
         """
+        self.run_parallel()

backend/tasks/get_model_providers.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from huggingface_hub import model_info
+PREFERRED_PROVIDERS = ["sambanova", "novita"]
+def filter_providers(providers):
+    return [provider for provider in providers if provider in PREFERRED_PROVIDERS]
+def get_model_providers(models):
+    results = []
+    for model_name in models:
+        try:
+            info = model_info(model_name, expand="inferenceProviderMapping")
+            providers = filter_providers(info.inference_provider_mapping.keys()) if hasattr(info, "inference_provider_mapping") else []
+            results.append((model_name, providers))
+        except Exception as e:
+            results.append((model_name, []))
+    return results
+if __name__ == "__main__":
+    example_models = [
+        "Qwen/Qwen2.5-72B-Instruct",
+        "meta-llama/Llama-3.3-70B-Instruct",
+        "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+        "Qwen/QwQ-32B",
+        "mistralai/Mistral-Small-24B-Instruct-2501"
+    ]
+    results = get_model_providers(example_models)
+    print(results)

backend/test_import.py ADDED Viewed

	@@ -0,0 +1,5 @@

+try:
+    import lighteval_task
+    print("lighteval_task importé avec succès!")
+except ImportError as e:
+    print(f"Erreur: {e}")

backend/yourbench_simple_demo.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,18 @@

+Metadata-Version: 2.4
+Name: yourbench-simple-demo
+Version: 0.1.0
+Author-email: Sumuk Shashidhar <[email protected]>, Alina Lozovskaia <[email protected]>, Clémentine Fourrier <[email protected]>, Nathan Habib <[email protected]>
+Requires-Python: <3.13,>=3.12
+Requires-Dist: yourbench@ git+https://github.com/huggingface/yourbench.git@main
+Requires-Dist: asyncio>=3.4.3
+Requires-Dist: datasets>=3.3.0
+Requires-Dist: loguru>=0.7.3
+Requires-Dist: python-dotenv>=1.0.1
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: ruff>=0.11.2
+Requires-Dist: lighteval[math]>=0.8.0
+Requires-Dist: huggingface-hub>=0.22.0
+Requires-Dist: python-multipart>=0.0.5
+Requires-Dist: fastapi>=0.110.0
+Requires-Dist: uvicorn>=0.29.0
+Requires-Dist: pydantic>=2.6.0

backend/yourbench_simple_demo.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+README.md
+pyproject.toml
+lighteval_task/__init__.py
+lighteval_task/lighteval_task.py
+tests/test_evaluation.py
+tests/test_hf_upload.py
+tests/test_inference.py
+tests/test_lighteval.py
+tests/test_openai.py
+tests/test_parallel_lighteval.py
+tests/test_provider_parallel_support.py
+tests/test_yourbench_results.py
+yourbench_simple_demo.egg-info/PKG-INFO
+yourbench_simple_demo.egg-info/SOURCES.txt
+yourbench_simple_demo.egg-info/dependency_links.txt
+yourbench_simple_demo.egg-info/requires.txt
+yourbench_simple_demo.egg-info/top_level.txt

backend/yourbench_simple_demo.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

backend/yourbench_simple_demo.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+yourbench@ git+https://github.com/huggingface/yourbench.git@main
+asyncio>=3.4.3
+datasets>=3.3.0
+loguru>=0.7.3
+python-dotenv>=1.0.1
+tqdm>=4.67.1
+ruff>=0.11.2
+lighteval[math]>=0.8.0
+huggingface-hub>=0.22.0
+python-multipart>=0.0.5
+fastapi>=0.110.0
+uvicorn>=0.29.0
+pydantic>=2.6.0

backend/yourbench_simple_demo.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ lighteval_task

frontend/src/components/BenchmarkDisplay.jsx CHANGED Viewed

@@ -99,19 +99,34 @@ const BenchmarkDisplay = ({
           <Typography variant="h6">Benchmark Created Successfully</Typography>
         </Box>
-        <Tooltip title="Download the complete benchmark">
           <Button
-            variant="outlined"
             color="primary"
-            endIcon={
-              isDownloading ? <CircularProgress size={16} /> : <DownloadIcon />
-            }
-            onClick={handleDownloadClick}
-            disabled={isDownloading || !sessionId}
           >
-            {isDownloading ? "Downloading..." : "Download Benchmark"}
           </Button>
-        </Tooltip>
       </Box>
       <Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
@@ -154,18 +169,6 @@ const BenchmarkDisplay = ({
           </Card>
         ))}
       </Box>
-      <Box sx={{ display: "flex", justifyContent: "center", mt: 4 }}>
-        <Button
-          variant="contained"
-          color="primary"
-          size="large"
-          startIcon={<AssessmentIcon />}
-          onClick={handleEvaluationClick}
-        >
-          Start Evaluation
-        </Button>
-      </Box>
     </>
   );
 };

           <Typography variant="h6">Benchmark Created Successfully</Typography>
         </Box>
+        <Box sx={{ display: "flex", gap: 2 }}>
+          <Tooltip title="Download the complete benchmark">
+            <Button
+              variant="outlined"
+              color="primary"
+              endIcon={
+                isDownloading ? (
+                  <CircularProgress size={16} />
+                ) : (
+                  <DownloadIcon />
+                )
+              }
+              onClick={handleDownloadClick}
+              disabled={isDownloading || !sessionId}
+            >
+              {isDownloading ? "Downloading..." : "Download Benchmark"}
+            </Button>
+          </Tooltip>
           <Button
+            variant="contained"
             color="primary"
+            startIcon={<AssessmentIcon />}
+            onClick={handleEvaluationClick}
           >
+            Start Evaluation
           </Button>
+        </Box>
       </Box>
       <Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
           </Card>
         ))}
       </Box>
     </>
   );
 };

frontend/src/components/BenchmarkEvaluation.jsx CHANGED Viewed

@@ -1,55 +1,53 @@
 import React, { useState, useEffect, useRef } from "react";
-import {
-  Box,
-  Typography,
-  CircularProgress,
-  Alert,
-  Paper,
-  Divider,
-  Button,
-} from "@mui/material";
-import AccessTimeIcon from "@mui/icons-material/AccessTime";
-import LogDisplay from "./LogDisplay";
 import { useNavigate } from "react-router-dom";
-// Evaluation steps
-const EVALUATION_STEPS = [
-  "preparation",
-  "model_evaluation",
-  "results_compilation",
 ];
-// Friendly step names for display
-const STEP_LABELS = {
-  preparation: "Preparation",
-  model_evaluation: "Model Evaluation",
-  results_compilation: "Results Compilation",
-};
-/**
- * Component to handle benchmark evaluation and display logs
- *
- * @param {Object} props - Component props
- * @param {string} props.sessionId - Session ID of the benchmark to evaluate
- * @param {Function} props.onComplete - Function to call when evaluation is complete
- * @returns {JSX.Element} Benchmark evaluation component
- */
 const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
-  const [evaluating, setEvaluating] = useState(false);
   const [evaluationComplete, setEvaluationComplete] = useState(false);
-  const [evaluationLogs, setEvaluationLogs] = useState([]);
   const [error, setError] = useState(null);
-  const [currentPhase, setCurrentPhase] = useState("initializing");
-  const [completedSteps, setCompletedSteps] = useState([]);
-  const [activeStep, setActiveStep] = useState(0);
   const [elapsedTime, setElapsedTime] = useState(0);
-  const pollingIntervalRef = useRef(null);
   const timerIntervalRef = useRef(null);
   const startTimeRef = useRef(null);
   const navigate = useNavigate();
   // Start evaluation when component mounts
   useEffect(() => {
     // Set start time
@@ -76,80 +74,6 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
     };
   }, []);
-  // Determine current phase and completed steps from logs
-  useEffect(() => {
-    if (evaluationLogs.length === 0) return;
-    // Check all logs for completed steps
-    const newCompletedSteps = [...completedSteps];
-    let newActiveStep = activeStep;
-    evaluationLogs.forEach((log) => {
-      // Detect completed steps (format: [SUCCESS] Stage completed: step_name)
-      const match = log.match(/\[SUCCESS\] Stage completed: (\w+)/);
-      if (match && match[1]) {
-        const completedStep = match[1].trim();
-        if (
-          EVALUATION_STEPS.includes(completedStep) &&
-          !newCompletedSteps.includes(completedStep)
-        ) {
-          newCompletedSteps.push(completedStep);
-          // Set active step to index of next step
-          const stepIndex = EVALUATION_STEPS.indexOf(completedStep);
-          if (stepIndex >= 0 && stepIndex + 1 > newActiveStep) {
-            newActiveStep = stepIndex + 1;
-            if (newActiveStep >= EVALUATION_STEPS.length) {
-              newActiveStep = EVALUATION_STEPS.length;
-            }
-          }
-        }
-      }
-    });
-    // Update state if there are new completed steps
-    if (newCompletedSteps.length > completedSteps.length) {
-      setCompletedSteps(newCompletedSteps);
-      setActiveStep(newActiveStep);
-    }
-    // Check recent logs to determine current phase
-    const recentLogs = evaluationLogs.slice(-10);
-    // Detect completion conditions
-    const isComplete =
-      recentLogs.some((log) =>
-        log.includes("[SUCCESS] Evaluation completed")
-      ) ||
-      completedSteps.includes("results_compilation") ||
-      newCompletedSteps.includes("results_compilation");
-    if (isComplete) {
-      setCurrentPhase("complete");
-      setEvaluationComplete(true);
-      // Stop polling when evaluation is complete
-      if (pollingIntervalRef.current) {
-        clearInterval(pollingIntervalRef.current);
-      }
-      if (timerIntervalRef.current) {
-        clearInterval(timerIntervalRef.current);
-      }
-      // Notify parent component that evaluation is complete
-      if (onComplete) {
-        onComplete({
-          success: true,
-          sessionId,
-          logs: evaluationLogs,
-        });
-      }
-    } else if (recentLogs.some((log) => log.includes("Comparing models"))) {
-      setCurrentPhase("compiling_results");
-    } else if (recentLogs.some((log) => log.includes("Starting evaluations"))) {
-      setCurrentPhase("evaluating");
-    } else if (recentLogs.some((log) => log.includes("Initialization"))) {
-      setCurrentPhase("preparing");
-    }
-  }, [evaluationLogs, completedSteps, activeStep, sessionId, onComplete]);
   // Format elapsed time as HH:MM:SS
   const formatElapsedTime = () => {
     const hours = Math.floor(elapsedTime / 3600);
@@ -170,13 +94,6 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
       return;
     }
-    setEvaluating(true);
-    setEvaluationLogs([]);
-    setError(null);
-    setCurrentPhase("initializing");
-    setCompletedSteps([]);
-    setActiveStep(0);
     try {
       // Call API to start evaluation
       const response = await fetch("http://localhost:3001/evaluate-benchmark", {
@@ -192,34 +109,15 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
       const result = await response.json();
       if (response.ok) {
-        setEvaluationLogs(result.logs || []);
-        // Set up polling to retrieve more logs
         pollingIntervalRef.current = setInterval(async () => {
-          // Check if we're already done
-          if (evaluationComplete) {
-            clearInterval(pollingIntervalRef.current);
-            return;
-          }
           try {
-            // Call API to get latest logs
             const logsResponse = await fetch(
               `http://localhost:3001/evaluation-logs/${sessionId}`
             );
             if (logsResponse.ok) {
               const logsResult = await logsResponse.json();
-              // Update logs if there are new ones
-              if (
-                logsResult.logs &&
-                logsResult.logs.length > evaluationLogs.length
-              ) {
-                setEvaluationLogs(logsResult.logs);
-              }
-              // Check if evaluation is complete
               if (logsResult.is_completed) {
                 setEvaluationComplete(true);
                 clearInterval(pollingIntervalRef.current);
@@ -227,71 +125,17 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
             }
           } catch (error) {
             console.log("Error polling logs:", error);
-            // Don't stop polling on network errors
           }
-        }, 2000); // Poll every 2 seconds
       } else {
-        // Handle error
-        setEvaluationLogs([`Error: ${result.error || "Unknown error"}`]);
         setError(result.error || "Benchmark evaluation failed");
       }
     } catch (error) {
       console.error("Error starting evaluation:", error);
-      setEvaluationLogs([`Error: ${error.message || "Unknown error"}`]);
       setError("Error connecting to server");
-    } finally {
-      setEvaluating(false);
-    }
-  };
-  // Get title based on current phase
-  const getPhaseTitle = () => {
-    switch (currentPhase) {
-      case "initializing":
-        return "Preparing evaluation...";
-      case "preparing":
-        return "Preparing models...";
-      case "evaluating":
-        return "Evaluating models...";
-      case "compiling_results":
-        return "Compiling results...";
-      case "complete":
-        return "Evaluation completed successfully!";
-      default:
-        return "Processing...";
     }
   };
-  // Get current step info for display
-  const getCurrentStepInfo = () => {
-    const totalSteps = EVALUATION_STEPS.length;
-    const currentStepIndex = activeStep;
-    // If no active step yet
-    if (currentStepIndex === 0 && completedSteps.length === 0) {
-      return `Starting... (0%)`;
-    }
-    // If all steps completed
-    if (currentStepIndex >= totalSteps) {
-      return `Completed (100%)`;
-    }
-    // Calculate percentage
-    const percentage = Math.round((currentStepIndex / totalSteps) * 100);
-    // Get current step name
-    const currentStepName =
-      STEP_LABELS[EVALUATION_STEPS[currentStepIndex]] || "Processing";
-    return `${currentStepName} (${percentage}%)`;
-  };
-  // Function to navigate to results page
-  const viewResults = () => {
-    navigate(`/evaluation-display?session=${sessionId}`);
-  };
   return (
     <Paper
       elevation={3}
@@ -313,29 +157,19 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
       ) : (
         <>
           {evaluationComplete ? (
-            <>
-              <Alert severity="success" sx={{ width: "100%", mb: 3 }}>
-                Evaluation completed successfully!
-              </Alert>
-              <Button
-                variant="contained"
-                color="primary"
-                onClick={viewResults}
-                sx={{ mb: 3 }}
-              >
-                View Results Leaderboard
-              </Button>
-            </>
           ) : (
             <>
               <CircularProgress size={60} sx={{ mb: 2 }} />
               <Typography variant="h6" component="div" gutterBottom>
-                {getPhaseTitle()}
               </Typography>
               {/* Step progress indicator */}
               <Typography variant="body1" color="text.secondary">
-                {getCurrentStepInfo()}
               </Typography>
               {/* Timer display */}
@@ -354,9 +188,6 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
           )}
         </>
       )}
-      {/* Use the LogDisplay component for logs */}
-      <LogDisplay logs={evaluationLogs} height={150} />
     </Paper>
   );
 };

 import React, { useState, useEffect, useRef } from "react";
+import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
 import { useNavigate } from "react-router-dom";
+// Status messages shown while the evaluation runs, each paired with the progress percentage displayed next to it
+const STARTING_MESSAGES = [
+  { message: "Initializing evaluation environment...", progress: 22 },
+  { message: "Starting evaluation process...", progress: 54 },
+  { message: "Evaluating models...", progress: 71 },
+  { message: "Storing evaluation results...", progress: 100 },
 ];
 const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
   const [evaluationComplete, setEvaluationComplete] = useState(false);
   const [error, setError] = useState(null);
   const [elapsedTime, setElapsedTime] = useState(0);
+  const [startingMessageIndex, setStartingMessageIndex] = useState(0);
   const timerIntervalRef = useRef(null);
   const startTimeRef = useRef(null);
+  const startingMessageIntervalRef = useRef(null);
+  const pollingIntervalRef = useRef(null);
   const navigate = useNavigate();
+  // Redirect to the results page for this session as soon as evaluationComplete becomes true
+  useEffect(() => {
+    if (evaluationComplete) {
+      navigate(`/evaluation-display?session=${sessionId}`);
+    }
+  }, [evaluationComplete, sessionId, navigate]);
+  // Advance the status message on a fixed timer, holding on the last entry of STARTING_MESSAGES
+  useEffect(() => {
+    startingMessageIntervalRef.current = setInterval(() => {
+      setStartingMessageIndex((prev) => {
+        if (prev < STARTING_MESSAGES.length - 1) {
+          return prev + 1;
+        }
+        return prev;
+      });
+    }, 20000); // Change message every 20 seconds
+    return () => {
+      if (startingMessageIntervalRef.current) {
+        clearInterval(startingMessageIntervalRef.current);
+      }
+    };
+  }, []);
   // Start evaluation when component mounts
   useEffect(() => {
     // Set start time
     };
   }, []);
   // Format elapsed time as HH:MM:SS
   const formatElapsedTime = () => {
     const hours = Math.floor(elapsedTime / 3600);
       return;
     }
     try {
       // Call API to start evaluation
       const response = await fetch("http://localhost:3001/evaluate-benchmark", {
       const result = await response.json();
       if (response.ok) {
+        // Set up polling to check completion
         pollingIntervalRef.current = setInterval(async () => {
           try {
             const logsResponse = await fetch(
               `http://localhost:3001/evaluation-logs/${sessionId}`
             );
             if (logsResponse.ok) {
               const logsResult = await logsResponse.json();
               if (logsResult.is_completed) {
                 setEvaluationComplete(true);
                 clearInterval(pollingIntervalRef.current);
             }
           } catch (error) {
             console.log("Error polling logs:", error);
           }
+        }, 2000);
       } else {
         setError(result.error || "Benchmark evaluation failed");
       }
     } catch (error) {
       console.error("Error starting evaluation:", error);
       setError("Error connecting to server");
     }
   };
   return (
     <Paper
       elevation={3}
       ) : (
         <>
           {evaluationComplete ? (
+            <Alert severity="success" sx={{ width: "100%", mb: 3 }}>
+              Evaluation completed successfully!
+            </Alert>
           ) : (
             <>
               <CircularProgress size={60} sx={{ mb: 2 }} />
               <Typography variant="h6" component="div" gutterBottom>
+                Benchmark evaluation...
               </Typography>
               {/* Step progress indicator */}
               <Typography variant="body1" color="text.secondary">
+                {`${STARTING_MESSAGES[startingMessageIndex].message} (${STARTING_MESSAGES[startingMessageIndex].progress}%)`}
               </Typography>
               {/* Timer display */}
           )}
         </>
       )}
     </Paper>
   );
 };

frontend/src/components/BenchmarkGenerator.jsx CHANGED Viewed

@@ -288,7 +288,7 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
       case "initializing":
         return "Benchmark generation...";
       case "configuring":
-        return "Generating configuration file...";
       case "benchmarking":
         return "Creating benchmark...";
       case "complete":
@@ -390,7 +390,7 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
       )}
       {/* Use the LogDisplay component */}
-      <LogDisplay logs={generationLogs} height={150} />
     </Paper>
   );
 };

       case "initializing":
         return "Benchmark generation...";
       case "configuring":
+        return "Creating benchmark...";
       case "benchmarking":
         return "Creating benchmark...";
       case "complete":
       )}
       {/* Use the LogDisplay component */}
+      {/* <LogDisplay logs={generationLogs} height={150} /> */}
     </Paper>
   );
 };

frontend/src/components/EvaluationDisplay.jsx CHANGED Viewed

@@ -10,7 +10,7 @@ import {
   TableHead,
   TableRow,
   Alert,
-  LinearProgress,
   Card,
   CardContent,
   Link,
@@ -70,11 +70,20 @@ const EvaluationDisplay = ({ sessionId }) => {
   if (loading) {
     return (
-      <Box sx={{ width: "100%", mt: 4, mb: 4 }}>
         <Typography variant="h5" gutterBottom>
           Loading Evaluation Results...
         </Typography>
-        <LinearProgress />
       </Box>
     );
   }
@@ -127,50 +136,45 @@ const EvaluationDisplay = ({ sessionId }) => {
             </TableRow>
           </TableHead>
           <TableBody>
-            {results.models_comparison.map((model, index) => (
-              <TableRow
-                key={`${model.model_name}-${model.provider}`}
-                sx={{
-                  "&:last-child td, &:last-child th": { border: 0 },
-                  backgroundColor: model.success
-                    ? "inherit"
-                    : "rgba(0, 0, 0, 0.04)",
-                }}
-              >
-                <TableCell>{index + 1}</TableCell>
-                <TableCell component="th" scope="row">
-                  <Link
-                    href={`https://huggingface.co/${model.model_name}`}
-                    target="_blank"
-                    rel="noopener noreferrer"
-                    sx={{
-                      textDecoration: "none",
-                      "&:hover": {
-                        textDecoration: "underline",
-                      },
-                      display: "flex",
-                      alignItems: "center",
-                    }}
-                  >
-                    {model.model_name}
-                    <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
-                  </Link>
-                </TableCell>
-                <TableCell align="center">
-                  {model.success ? formatAccuracy(model.accuracy) : "-"}
-                </TableCell>
-                <TableCell align="center">
-                  {model.success ? formatTime(model.evaluation_time) : "-"}
-                </TableCell>
-                <TableCell align="center">
-                  {model.success ? (
                     <span style={{ color: "green" }}>✓ Success</span>
-                  ) : (
-                    <span style={{ color: "red" }}>✗ Failed</span>
-                  )}
-                </TableCell>
-              </TableRow>
-            ))}
           </TableBody>
         </Table>
       </TableContainer>

   TableHead,
   TableRow,
   Alert,
+  CircularProgress,
   Card,
   CardContent,
   Link,
   if (loading) {
     return (
+      <Box
+        sx={{
+          width: "100%",
+          mt: 4,
+          mb: 4,
+          display: "flex",
+          flexDirection: "column",
+          alignItems: "center",
+        }}
+      >
         <Typography variant="h5" gutterBottom>
           Loading Evaluation Results...
         </Typography>
+        <CircularProgress />
       </Box>
     );
   }
             </TableRow>
           </TableHead>
           <TableBody>
+            {results.models_comparison
+              .filter((model) => model.success)
+              .map((model, index) => (
+                <TableRow
+                  key={`${model.model_name}-${model.provider}`}
+                  sx={{
+                    "&:last-child td, &:last-child th": { border: 0 },
+                  }}
+                >
+                  <TableCell>{index + 1}</TableCell>
+                  <TableCell component="th" scope="row">
+                    <Link
+                      href={`https://huggingface.co/${model.model_name}`}
+                      target="_blank"
+                      rel="noopener noreferrer"
+                      sx={{
+                        textDecoration: "none",
+                        "&:hover": {
+                          textDecoration: "underline",
+                        },
+                        display: "flex",
+                        alignItems: "center",
+                      }}
+                    >
+                      {model.model_name}
+                      <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
+                    </Link>
+                  </TableCell>
+                  <TableCell align="center">
+                    {formatAccuracy(model.accuracy)}
+                  </TableCell>
+                  <TableCell align="center">
+                    {formatTime(model.evaluation_time)}
+                  </TableCell>
+                  <TableCell align="center">
                     <span style={{ color: "green" }}>✓ Success</span>
+                  </TableCell>
+                </TableRow>
+              ))}
           </TableBody>
         </Table>
       </TableContainer>

frontend/src/components/ExternalLinks.jsx CHANGED Viewed

@@ -1,16 +1,31 @@
 import React from "react";
-import { Box, Typography } from "@mui/material";
 import OpenInNewIcon from "@mui/icons-material/OpenInNew";
 const ExternalLinks = () => {
   return (
     <Box
       sx={{
         position: "fixed",
         top: 24,
         right: 24,
         display: "flex",
-        gap: 2,
         alignItems: "center",
         zIndex: 1000,
       }}
@@ -57,6 +72,22 @@ const ExternalLinks = () => {
           <OpenInNewIcon sx={{ fontSize: "0.75rem", ml: 0.5, opacity: 0.6 }} />
         </a>
       </Typography>
     </Box>
   );
 };

 import React from "react";
+import { Box, Typography, IconButton, Tooltip } from "@mui/material";
 import OpenInNewIcon from "@mui/icons-material/OpenInNew";
+import ShareIcon from "@mui/icons-material/Share";
 const ExternalLinks = () => {
+  const handleShare = async () => {
+    try {
+      await navigator.share({
+        title: "YourBench Demo",
+        text: "Check out this benchmark evaluation on YourBench!",
+        url: window.location.href,
+      });
+    } catch (err) {
+      console.log("Error sharing:", err);
+    }
+  };
   return (
     <Box
       sx={{
         position: "fixed",
         top: 24,
+        left: 24,
         right: 24,
+        margin: "auto",
         display: "flex",
+        justifyContent: "space-between",
         alignItems: "center",
         zIndex: 1000,
       }}
           <OpenInNewIcon sx={{ fontSize: "0.75rem", ml: 0.5, opacity: 0.6 }} />
         </a>
       </Typography>
+      <Tooltip title="Share">
+        <IconButton
+          onClick={handleShare}
+          size="small"
+          sx={{
+            ml: 1,
+            color: "inherit",
+            opacity: 0.7,
+            "&:hover": {
+              opacity: 1,
+            },
+          }}
+        >
+          <ShareIcon fontSize="small" />
+        </IconButton>
+      </Tooltip>
     </Box>
   );
 };

frontend/src/config/theme.js CHANGED Viewed

@@ -375,7 +375,7 @@ const getDesignTokens = (mode) => ({
     values: {
       xs: 0,
       sm: 600,
-      md: 900,
       lg: 1240,
       xl: 1536,
     },

     values: {
       xs: 0,
       sm: 600,
+      md: 1100,
       lg: 1240,
       xl: 1536,
     },

test_import.py ADDED Viewed

	@@ -0,0 +1,5 @@

+try:  # smoke check: confirm that the lighteval_task module can be imported
+    import lighteval_task
+    print("lighteval_task importé avec succès!")  # prints "lighteval_task imported successfully!" (French)
+except ImportError as e:
+    print(f"Erreur: {e}")  # prints "Error: <exception text>" (French) instead of raising