Commit ffa4ae8 by tfrere · Parent: 83d60af

add prerendered documents | update filename | refactor
.gitignore CHANGED
@@ -3,6 +3,8 @@
 __pycache__
 .cache/

+ *.egg-info
+
 # dependencies

 frontend/node_modules
backend/main.py CHANGED
@@ -1,8 +1,8 @@
- from fastapi import FastAPI
+ from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 import os
 from dotenv import load_dotenv
- from routes import routers, session_files, active_bench_tasks
+ from routes import routers, session_files, active_bench_tasks, benchmark

 # Load environment variables from .env file
 load_dotenv()
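The import change above pulls the new `benchmark` route module into `main.py` alongside the existing routers. For context, a minimal sketch of how such routers are typically mounted in a FastAPI app; the CORS settings and the exact shape of `routers` here are assumptions, not the actual contents of the file:

```python
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from routes import routers  # assumed to be a list of APIRouter instances

app = FastAPI()

# Allow the frontend to call the API from another origin (placeholder settings).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount every router collected by the routes package, including the benchmark router.
for router in routers:
    app.include_router(router)
```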
backend/old-pyproject.toml DELETED
@@ -1,26 +0,0 @@
- [tool.poetry]
- name = "llm-leaderboard-backend"
- version = "0.1.0"
- description = "Backend for the Open LLM Leaderboard"
- authors = ["Your Name <[email protected]>"]
-
- [tool.poetry.dependencies]
- python = ">=3.12,<3.13"
- fastapi = "^0.115.6"
- huggingface-hub = "0.29.3"
- python-dotenv = "^1.0.1"
- python-multipart = "^0.0.9"
- uvicorn = {extras = ["standard"], version = "^0.27.0"}
- loguru = "^0.7.3"
- lighteval = {version = ">=0.8.0", extras = ["math"]}
- tqdm = "^4.67.1"
- asyncio = "^3.4.3"
- datasets = "^3.3.0"
- yourbench = {git = "https://github.com/huggingface/yourbench.git"}
- tiktoken = "^0.9.0"
- requests = {extras = ["socks"], version = "^2.32.3"}
- httpx-socks = "^0.10.0"
-
- [build-system]
- requires = ["poetry-core>=1.0.0"]
- build-backend = "poetry.core.masonry.api"
backend/poetry.lock DELETED
The diff for this file is too large to render. See raw diff
 
backend/requirements.txt DELETED
@@ -1 +0,0 @@
-
backend/routes/benchmark.py CHANGED
@@ -2,8 +2,8 @@ from fastapi import APIRouter, HTTPException
 from typing import Dict, Any
 import os
 import time
- from tasks.createBenchConfigFile import CreateBenchConfigTask
- from tasks.createBench import CreateBenchTask
+ from tasks.create_bench_config_file import CreateBenchConfigTask
+ from tasks.create_bench import CreateBenchTask

 router = APIRouter(tags=["benchmark"])

backend/routes/evaluation.py CHANGED
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, HTTPException
 from typing import Dict, Any
 import os
- from tasks.evaluationTask import EvaluationTask
+ from tasks.evaluation_task import EvaluationTask
 from huggingface_hub import hf_hub_download
 import json
 from datetime import datetime
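For context, `hf_hub_download` (imported above) is how this route fetches result files from the Hub. A minimal usage sketch; the repo id and filename below are placeholders for illustration, not values taken from this codebase:

```python
from huggingface_hub import hf_hub_download

# Download a single file from a dataset repo and get its local cache path.
# Both identifiers below are placeholder assumptions.
local_path = hf_hub_download(
    repo_id="yourbench/some-dataset",
    filename="results/models_comparison.json",
    repo_type="dataset",
)
print(local_path)
```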
backend/routes/questions.py CHANGED
@@ -37,9 +37,7 @@ async def get_benchmark_questions(session_id: str):
     if single_dataset and len(single_dataset['train']) > 0:
         # Get a random sample (up to 2) from single-shot questions
         sample_indices = random.sample(range(len(single_dataset['train'])), min(2, len(single_dataset['train'])))
-         print(f"Dataset structure: {single_dataset['train'][0].keys()}")
         for idx in sample_indices:
-             print(f"Question {idx} data: {single_dataset['train'][idx]}")
             questions.append({
                 "id": str(idx),
                 "question": single_dataset['train'][idx].get("question", ""),
@@ -58,9 +56,7 @@ async def get_benchmark_questions(session_id: str):
         # Get remaining questions from multi-hop questions
         remaining = 2 - len(questions)
         sample_indices = random.sample(range(len(multi_dataset['train'])), min(remaining, len(multi_dataset['train'])))
-         print(f"Multi-hop dataset structure: {multi_dataset['train'][0].keys()}")
         for idx in sample_indices:
-             print(f"Multi-hop question {idx} data: {multi_dataset['train'][idx]}")
             questions.append({
                 "id": str(idx),
                 "question": multi_dataset['train'][idx].get("question", ""),
backend/routes/upload.py CHANGED
@@ -12,6 +12,29 @@ session_files = {}
 UPLOAD_ROOT = "uploaded_files"
 os.makedirs(UPLOAD_ROOT, exist_ok=True)

+ # Initialize session files dictionary with pre-calculated documents
+ precalculated_docs = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"]
+
+ for doc_id in precalculated_docs:
+     doc_dir = os.path.join(UPLOAD_ROOT, doc_id)
+     if os.path.exists(doc_dir):
+         doc_files_dir = os.path.join(doc_dir, "uploaded_files")
+         if os.path.exists(doc_files_dir):
+             for filename in os.listdir(doc_files_dir):
+                 if filename.endswith((".pdf", ".txt", ".html", ".md")):
+                     file_path = os.path.join(doc_files_dir, filename)
+                     session_files[doc_id] = file_path
+                     print(f"Added pre-calculated document to session_files: {doc_id} -> {file_path}")
+                     break
+         else:
+             # Search directly in the doc_dir
+             for filename in os.listdir(doc_dir):
+                 if filename.endswith((".pdf", ".txt", ".html", ".md")):
+                     file_path = os.path.join(doc_dir, filename)
+                     session_files[doc_id] = file_path
+                     print(f"Added pre-calculated document to session_files: {doc_id} -> {file_path}")
+                     break
+
 @router.post("/upload")
 async def upload_file(file: UploadFile = File(...)):
     """
backend/tasks/{createBench.py → create_bench.py} RENAMED
File without changes
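The pre-calculated document scan added to `backend/routes/upload.py` above has two branches that differ only in which directory they search. A hedged refactoring sketch that factors the scan into one helper; the function name, the sorting, and the placement are assumptions, not part of this commit:

```python
import os

ALLOWED_EXTENSIONS = (".pdf", ".txt", ".html", ".md")

def find_first_document(directory: str) -> str | None:
    """Return the first supported document found in `directory`, or None."""
    if not os.path.isdir(directory):
        return None
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(ALLOWED_EXTENSIONS):
            return os.path.join(directory, filename)
    return None

# Usage sketch: prefer the nested "uploaded_files" folder, fall back to the document root.
# for doc_id in precalculated_docs:
#     doc_dir = os.path.join(UPLOAD_ROOT, doc_id)
#     path = find_first_document(os.path.join(doc_dir, "uploaded_files")) or find_first_document(doc_dir)
#     if path:
#         session_files[doc_id] = path
```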
backend/tasks/{createBenchConfigFile.py → create_bench_config_file.py} RENAMED
@@ -114,7 +114,7 @@ class CreateBenchConfigTask:
     "provider": "novita",
     "api_key": "$HF_TOKEN",
     "max_concurrent_requests": 32,
- },
+ }
 ],

 "model_roles": {
backend/tasks/{evaluationTask.py → evaluation_task.py} RENAMED
File without changes
backend/test_import.py DELETED
@@ -1,5 +0,0 @@
- try:
-     import lighteval_task
-     print("lighteval_task imported successfully!")
- except ImportError as e:
-     print(f"Error: {e}")
backend/tests/test_evaluation.py DELETED
@@ -1,165 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Script to test the evaluation task in standalone mode
4
- """
5
- import os
6
- import sys
7
- import uuid
8
- import json
9
- import time
10
- import argparse
11
- from dotenv import load_dotenv
12
- from pathlib import Path
13
- import traceback
14
-
15
- # Ensure the environment is properly configured
16
- load_dotenv()
17
-
18
- # Add the current directory to the path to import modules
19
- sys.path.append(os.getcwd())
20
- from tasks.evaluationTask import EvaluationTask
21
-
22
-
23
- def setup_environment():
24
- """
25
- Configure the environment for testing
26
- """
27
- # Check if the HF token is defined
28
- hf_token = os.getenv("HF_TOKEN")
29
- if not hf_token:
30
- print("⚠️ The HF_TOKEN is not defined in the environment or .env file")
31
- print(" Please define this variable before continuing.")
32
- sys.exit(1)
33
-
34
- # Set the default organization if not defined
35
- if not os.getenv("HF_ORGANIZATION"):
36
- os.environ["HF_ORGANIZATION"] = "yourbench"
37
- print("ℹ️ The HF_ORGANIZATION variable is not defined, using 'yourbench' as default")
38
-
39
-
40
- def run_standalone_evaluation(dataset_name, models=None, max_wait_time=3600):
41
- """
42
- Run the evaluation task in standalone mode
43
-
44
- Args:
45
- dataset_name: Name of the dataset to evaluate
46
- models: List of models to evaluate (optional)
47
- max_wait_time: Maximum waiting time in seconds
48
- """
49
- # Generate a unique session ID
50
- session_uid = str(uuid.uuid4())
51
- print(f"🔧 Session ID: {session_uid}")
52
-
53
- # Create the evaluation task instance
54
- evaluation_task = EvaluationTask(session_uid, dataset_name)
55
-
56
- # If specific models are provided, use them
57
- if models:
58
- evaluation_task.models = models
59
- print(f"🤖 Using custom models: {models}")
60
-
61
- # Display dataset information
62
- organization = os.getenv("HF_ORGANIZATION", "yourbench")
63
- print(f"📊 Evaluating dataset: {organization}/{dataset_name}")
64
- print(f"💾 Results saved in: {evaluation_task.output_dir}")
65
-
66
- # Start the evaluation task
67
- print("🚀 Starting evaluation...")
68
- evaluation_task.run()
69
-
70
- # Wait for the task to complete while displaying logs
71
- start_time = time.time()
72
- last_log_count = 0
73
-
74
- while not evaluation_task.is_task_completed():
75
- current_logs = evaluation_task.get_logs()
76
-
77
- # Display only new logs
78
- if len(current_logs) > last_log_count:
79
- for log in current_logs[last_log_count:]:
80
- print(f" {log}")
81
- last_log_count = len(current_logs)
82
-
83
- # Check if the maximum time is reached
84
- elapsed_time = time.time() - start_time
85
- if elapsed_time > max_wait_time:
86
- print("⚠️ Maximum waiting time reached, forced stop")
87
- break
88
-
89
- time.sleep(1)
90
-
91
- # Check if results are available
92
- results_file = Path(f"{evaluation_task.output_dir}/models_comparison.json")
93
- if results_file.exists():
94
- try:
95
- with open(results_file, 'r') as f:
96
- results = json.load(f)
97
-
98
- print("\n📈 Evaluation Results:")
99
- print(f" Dataset: {results['metadata']['dataset']}")
100
- print(f" Models tested: {results['metadata']['total_models_tested']}")
101
- print(f" Successful tests: {results['metadata']['successful_tests']}")
102
- print(f" Timestamp: {results['metadata']['timestamp']}")
103
-
104
- if results['metadata']['successful_tests'] > 0:
105
- print("\n📊 Model ranking by accuracy:")
106
- successful_models = [m for m in results['models_comparison'] if m['success']]
107
- for i, model in enumerate(successful_models):
108
- print(f" {i+1}. ✅ {model['model_name']} ({model['provider']})")
109
- print(f" Accuracy: {model['accuracy']:.4f} ± {model['accuracy_stderr']:.4f}")
110
- print(f" Evaluation time: {model['evaluation_time']:.2f}s")
111
-
112
- failed_models = [m for m in results['models_comparison'] if not m['success']]
113
- if failed_models:
114
- print("\n❌ Unevaluated models:")
115
- for i, model in enumerate(failed_models):
116
- print(f" {i+1}. {model['model_name']} ({model['provider']})")
117
- error_msg = model.get('error', 'Unknown reason')
118
- print(f" Reason: {error_msg}")
119
-
120
- # Check detailed results files
121
- detailed_file = Path(f"{evaluation_task.output_dir}/detailed_results.json")
122
- if detailed_file.exists():
123
- print(f"\n📄 Detailed results available in: {detailed_file}")
124
-
125
- # Check raw files
126
- raw_results = list(Path(f"{evaluation_task.output_dir}/results").glob("**/*.json"))
127
- if raw_results:
128
- print(f"\n📁 {len(raw_results)} raw result files available in: {evaluation_task.output_dir}/results")
129
-
130
- print(f"\n✅ Evaluation completed!")
131
- except Exception as e:
132
- print(f"❌ Error reading results: {str(e)}")
133
- print(f" Details: {traceback.format_exc()}")
134
- else:
135
- print(f"❌ No evaluation results found in {results_file}")
136
-
137
-
138
- if __name__ == "__main__":
139
- # Configure the argument parser
140
- parser = argparse.ArgumentParser(description="Test the evaluation task in standalone mode")
141
- parser.add_argument("dataset_name", type=str, help="Name of the dataset to evaluate (without the organization)")
142
- parser.add_argument("--model", action="append", dest="models",
143
- help="Model to evaluate in the format 'name/model,provider'. Can be used multiple times.")
144
- parser.add_argument("--timeout", type=int, default=3600,
145
- help="Maximum waiting time in seconds (default: 3600)")
146
-
147
- args = parser.parse_args()
148
-
149
- # Configure the environment
150
- setup_environment()
151
-
152
- # Transform models into tuples if specified
153
- models_to_evaluate = None
154
- if args.models:
155
- models_to_evaluate = []
156
- for model_spec in args.models:
157
- try:
158
- model_name, provider = model_spec.split(",")
159
- models_to_evaluate.append((model_name, provider))
160
- except ValueError:
161
- print(f"⚠️ Invalid model format: {model_spec}. Use 'name/model,provider'")
162
- sys.exit(1)
163
-
164
- # Run the evaluation
165
- run_standalone_evaluation(args.dataset_name, models_to_evaluate, args.timeout)
backend/tests/test_hf_upload.py DELETED
@@ -1,78 +0,0 @@
- """
- Test script to verify uploads to the Hugging Face Hub.
- This script creates a simple dataset and tries to upload it to the Hub
- using the token and organization defined in the environment variables.
- """
- import os
- import sys
- import datasets
- from huggingface_hub import HfApi, login
- from datasets import Dataset
- from dotenv import load_dotenv
- from pathlib import Path
-
- def test_hf_upload():
-     # Load environment variables from the .env file
-     dotenv_path = Path('.env')
-     load_dotenv(dotenv_path=dotenv_path)
-
-     # Read the token and organization from the environment variables
-     hf_token = os.getenv("HF_TOKEN")
-     org_name = os.getenv("HF_ORGANIZATION", "yourbench")
-
-     if not hf_token:
-         print("Error: the HF_TOKEN variable is not defined in the .env file.")
-         sys.exit(1)
-
-     dataset_name = "test_dataset_upload"
-     repo_id = f"{org_name}/{dataset_name}"
-
-     print(f"Attempting upload to {repo_id} with token {hf_token[:5]}... (token truncated for safety)")
-
-     try:
-         # Log in to the Hugging Face API
-         print("Connecting to the Hugging Face API...")
-         login(token=hf_token)
-         api = HfApi(token=hf_token)
-
-         # Create a simple dataset
-         print("Creating a test dataset...")
-         data = {
-             "text": ["This is a test", "Another example", "Third example"],
-             "label": [1, 0, 1]
-         }
-         dataset = Dataset.from_dict(data)
-
-         # Check whether the repo already exists and delete it if necessary
-         try:
-             api.delete_repo(repo_id=repo_id, repo_type="dataset")
-             print(f"Existing repo {repo_id} deleted.")
-         except Exception:
-             print(f"Repo {repo_id} did not exist yet.")
-
-         # Upload the dataset
-         print(f"Uploading the dataset to {repo_id}...")
-         dataset.push_to_hub(
-             repo_id=repo_id,
-             token=hf_token,
-             private=True,
-             commit_message="Dataset upload test"
-         )
-
-         print(f"Success! Dataset uploaded to https://huggingface.co/datasets/{repo_id}")
-         return True
-
-     except Exception as e:
-         print(f"Error during upload: {str(e)}")
-         print("\nFull traceback:")
-         import traceback
-         traceback.print_exc()
-         return False
-
- if __name__ == "__main__":
-     print("=== Hugging Face Hub upload test ===")
-     success = test_hf_upload()
-     if success:
-         print("\n✅ The test passed! Uploads work correctly.")
-     else:
-         print("\n❌ The test failed. Check the errors above.")
backend/tests/test_inference.py DELETED
@@ -1,84 +0,0 @@
- import time
- import signal
- from huggingface_hub import InferenceClient
-
- # Configuration - models and their providers
- MODELS = [
-     ("Qwen/Qwen2.5-72B-Instruct", "sambanova"),
-     ("meta-llama/Llama-3.3-70B-Instruct", "sambanova"),
-     ("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "sambanova"),
-     ("Qwen/QwQ-32B", "novita"),
-     # ("mistralai/Mistral-Small-24B-Instruct-2501", "novita")
- ]
- QUESTION = "What is the capital of France?"
- TIMEOUT = 10  # seconds
-
-
- class TimeoutException(Exception):
-     pass
-
-
- def timeout_handler(signum, frame):
-     raise TimeoutException("Timeout")
-
-
- def test_model(model, provider):
-     client = InferenceClient(provider=provider)
-
-     # Set up the timeout
-     signal.signal(signal.SIGALRM, timeout_handler)
-     signal.alarm(TIMEOUT)
-
-     start_time = time.time()
-     try:
-         response = client.chat_completion(
-             model=model,
-             messages=[{"role": "user", "content": QUESTION}]
-         )
-         result = response.choices[0].message.content
-         success = True
-     except TimeoutException:
-         result = f"TIMEOUT ({TIMEOUT}s)"
-         success = False
-     except Exception as e:
-         result = str(e)
-         success = False
-     finally:
-         # Disable the alarm
-         signal.alarm(0)
-
-     execution_time = time.time() - start_time
-
-     status = "✅" if success else "❌"
-     print(f"{status} {model} ({provider}) - Time: {execution_time:.2f}s")
-     if success:
-         print(f"  Response: {result[:80]}..." if len(result) > 80 else f"  Response: {result}")
-     else:
-         print(f"  Error: {result}")
-
-     return success, execution_time, result
-
-
- def main():
-     print(f"\nTesting {len(MODELS)} models with their specific providers")
-     print(f"Question: {QUESTION}")
-     print(f"Timeout: {TIMEOUT}s\n")
-
-     results = []
-     for model, provider in MODELS:
-         success, time_taken, response = test_model(model, provider)
-         results.append({
-             "model": model,
-             "provider": provider,
-             "success": success,
-             "time": time_taken
-         })
-
-     print("\n=== SUMMARY ===")
-     for result in results:
-         status = "✅" if result["success"] else "❌"
-         print(f"{status} {result['model']} ({result['provider']}): {result['time']:.2f}s")
-
-
- if __name__ == "__main__":
-     main()
backend/tests/test_lighteval.py DELETED
@@ -1,151 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Script minimal pour tester directement lighteval avec la tâche yourbench
4
- """
5
- import os
6
- import sys
7
- import subprocess
8
- import json
9
- import time
10
- from pathlib import Path
11
- import logging
12
-
13
- # Assurez-vous que l'environnement est correctement configuré
14
- from dotenv import load_dotenv
15
- load_dotenv()
16
-
17
- # Importer le module de définition de tâche yourbench
18
- sys.path.append(os.getcwd())
19
- from tasks.yourbench_lighteval_task import create_yourbench_task
20
-
21
- def run_lighteval_test():
22
- """
23
- Exécuter un test minimal avec lighteval
24
- """
25
- # Parameters
26
- dataset_name = "yourbench_a"
27
- organization = "yourbench"
28
- model_name = "Qwen/Qwen2.5-72B-Instruct"
29
- provider = "novita"
30
- output_dir = f"uploaded_files/test_{provider}/lighteval_results"
31
-
32
- # Créer le répertoire de sortie
33
- os.makedirs(output_dir, exist_ok=True)
34
-
35
- # Définir le chemin d'accès complet au dataset
36
- dataset_path = f"{organization}/{dataset_name}"
37
- print(f"Dataset à évaluer: {dataset_path}")
38
-
39
- # Créer un fichier temporaire
40
- import tempfile
41
- temp_file_path = tempfile.mktemp(suffix=".py")
42
- print(f"Création du fichier temporaire: {temp_file_path}")
43
-
44
- with open(temp_file_path, 'w') as temp_file:
45
- # Écrire le contenu du fichier
46
- temp_file.write(f"""
47
- import os
48
- import sys
49
- import logging
50
- sys.path.append("{os.getcwd()}")
51
-
52
- from tasks.yourbench_lighteval_task import create_yourbench_task
53
-
54
- # Configurer le logging
55
- logging.basicConfig(level=logging.INFO)
56
-
57
- # Créer la tâche yourbench
58
- yourbench = create_yourbench_task("{dataset_path}", "lighteval")
59
-
60
- # Définir la variable TASKS_TABLE dont lighteval a besoin
61
- TASKS_TABLE = [yourbench]
62
- """)
63
-
64
- # Construire la commande lighteval
65
- cmd = [
66
- "lighteval",
67
- "endpoint",
68
- "inference-providers",
69
- f"model={model_name},provider={provider}",
70
- "custom|yourbench|0|0",
71
- "--custom-tasks",
72
- temp_file_path,
73
- "--max-samples", "5", # Seulement 1 échantillon
74
- "--output-dir", output_dir,
75
- "--save-details",
76
- "--no-push-to-hub" # Pas de push pour gagner du temps
77
- ]
78
-
79
- # Afficher la commande
80
- print(f"Exécution de la commande: {' '.join(cmd)}")
81
- print(f"Heure de début: {time.strftime('%H:%M:%S')}")
82
-
83
- # Exécuter la commande
84
- try:
85
- # Exécuter avec capture des sorties
86
- result = subprocess.run(cmd, capture_output=True, text=True)
87
-
88
- # Afficher les résultats
89
- print(f"Code de retour: {result.returncode}")
90
- print("--- SORTIE STANDARD ---")
91
- print(result.stdout)
92
- print("--- ERREUR STANDARD ---")
93
- print(result.stderr)
94
-
95
- # Vérifier si des résultats ont été générés
96
- results_dir = Path(output_dir) / "results"
97
- if results_dir.exists():
98
- print(f"Dossier de résultats créé: {results_dir}")
99
- # Lister les fichiers de résultats
100
- result_files = list(results_dir.glob("**/*.json"))
101
- if result_files:
102
- print(f"Fichiers de résultats trouvés: {result_files}")
103
- # Trier les fichiers par date de modification pour prendre le plus récent
104
- result_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
105
- latest_result = result_files[0]
106
- print(f"Fichier de résultats le plus récent: {latest_result}")
107
-
108
- # Lire le fichier de résultats
109
- with open(latest_result, 'r') as f:
110
- results = json.load(f)
111
- print("Contenu du fichier de résultats:")
112
- print(json.dumps(results, indent=2))
113
-
114
- # Analyse des résultats
115
- print("\n==== ANALYSE DES RÉSULTATS ====")
116
- if "results" in results:
117
- for task_name, task_results in results["results"].items():
118
- print(f"Tâche: {task_name}")
119
- for metric_name, metric_value in task_results.items():
120
- print(f" {metric_name}: {metric_value}")
121
- else:
122
- print("Aucun résultat trouvé dans le fichier JSON")
123
-
124
- # Vérifier les détails
125
- details_dir = Path(output_dir) / "details"
126
- if details_dir.exists():
127
- print(f"\nDossier de détails trouvé: {details_dir}")
128
- model_details_dirs = list(details_dir.glob("**/*"))
129
- if model_details_dirs:
130
- print(f"Dossiers de détails par modèle: {model_details_dirs}")
131
- else:
132
- print("Aucun fichier de résultats trouvé.")
133
- else:
134
- print(f"Aucun dossier de résultats créé.")
135
-
136
- except subprocess.CalledProcessError as e:
137
- print(f"Erreur lors de l'exécution de la commande: {e}")
138
- except Exception as e:
139
- print(f"Exception: {e}")
140
- finally:
141
- # Supprimer le fichier temporaire
142
- try:
143
- os.unlink(temp_file_path)
144
- print(f"Fichier temporaire supprimé: {temp_file_path}")
145
- except:
146
- pass
147
-
148
- print(f"Heure de fin: {time.strftime('%H:%M:%S')}")
149
-
150
- if __name__ == "__main__":
151
- run_lighteval_test()
backend/tests/test_openai.py DELETED
@@ -1,31 +0,0 @@
- import os
- from openai import OpenAI
- from dotenv import load_dotenv
-
- # Load environment variables
- load_dotenv()
-
- def test_openai_connection():
-     try:
-         # Initialize OpenAI client
-         client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
-
-         # Make a simple request
-         response = client.chat.completions.create(
-             model="gpt-3.5-turbo",
-             messages=[
-                 {"role": "user", "content": "Say 'Hello World'"}
-             ]
-         )
-
-         print("✅ OpenAI API connection successful!")
-         print(f"Response: {response.choices[0].message.content}")
-         return True
-
-     except Exception as e:
-         print("❌ OpenAI API connection failed!")
-         print(f"Error: {str(e)}")
-         return False
-
- if __name__ == "__main__":
-     test_openai_connection()
backend/tests/test_parallel_lighteval.py DELETED
@@ -1,278 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Script to run lighteval tests in parallel for multiple models
4
- """
5
- import os
6
- import sys
7
- import json
8
- import time
9
- import tempfile
10
- import asyncio
11
- from pathlib import Path
12
- from typing import Tuple, List, Dict, Any
13
-
14
- # Ensure environment is properly configured
15
- from dotenv import load_dotenv
16
- load_dotenv()
17
-
18
- # Import yourbench task module
19
- sys.path.append(os.getcwd())
20
- from tasks.yourbench_lighteval_task import create_yourbench_task
21
-
22
- # Define models to test
23
- INIT_MODELS = [
24
- # 70B
25
- ("Qwen/Qwen2.5-72B-Instruct", "novita"),
26
- ("meta-llama/Llama-3.3-70B-Instruct", "novita"),
27
- ("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "novita"),
28
- # 20 to 30B
29
- ("Qwen/QwQ-32B", "novita"),
30
- # ("mistralai/Mistral-Small-24B-Instruct-2501", "sambanova"),
31
- ]
32
-
33
- async def run_lighteval_test_for_model(model_info: Tuple[str, str]) -> Dict[str, Any]:
34
- """
35
- Run lighteval test for a specific model
36
- """
37
- model_name, provider = model_info
38
-
39
- # Parameters
40
- dataset_name = "yourbench_a"
41
- organization = "yourbench"
42
- output_dir = f"uploaded_files/test_parallel_{provider}/lighteval_results"
43
-
44
- # Create output directory
45
- os.makedirs(output_dir, exist_ok=True)
46
-
47
- # Define full dataset path
48
- dataset_path = f"{organization}/{dataset_name}"
49
- print(f"Dataset to evaluate for {model_name}: {dataset_path}")
50
-
51
- # Create temporary file
52
- temp_file_path = tempfile.mktemp(suffix=".py")
53
- print(f"Creating temporary file for {model_name}: {temp_file_path}")
54
-
55
- with open(temp_file_path, 'w') as temp_file:
56
- temp_file.write(f"""
57
- import os
58
- import sys
59
- sys.path.append("{os.getcwd()}")
60
-
61
- from tasks.yourbench_lighteval_task import create_yourbench_task
62
-
63
- # Create yourbench task
64
- yourbench = create_yourbench_task("{dataset_path}", "lighteval")
65
-
66
- # Define TASKS_TABLE needed by lighteval
67
- TASKS_TABLE = [yourbench]
68
- """)
69
-
70
- # Build lighteval command args
71
- cmd_args = [
72
- "lighteval",
73
- "endpoint",
74
- "inference-providers",
75
- f"model={model_name},provider={provider}",
76
- "custom|yourbench|0|0",
77
- "--custom-tasks",
78
- temp_file_path,
79
- "--max-samples", "5",
80
- "--output-dir", output_dir,
81
- "--save-details",
82
- "--no-push-to-hub"
83
- ]
84
-
85
- print(f"Running command for {model_name}: {' '.join(cmd_args)}")
86
- print(f"Start time for {model_name}: {time.strftime('%H:%M:%S')}")
87
-
88
- results = {
89
- "model_name": model_name,
90
- "provider": provider,
91
- "success": False,
92
- "error": None,
93
- "results": None,
94
- "return_code": None
95
- }
96
-
97
- try:
98
- # Prepare environment with needed tokens
99
- env = os.environ.copy()
100
- hf_token = os.getenv("HF_TOKEN")
101
- if hf_token:
102
- env["HF_TOKEN"] = hf_token
103
- env["HUGGING_FACE_HUB_TOKEN"] = hf_token
104
- env["HF_ORGANIZATION"] = organization
105
-
106
- # Run the process asynchronously
107
- process = await asyncio.create_subprocess_exec(
108
- *cmd_args,
109
- stdout=asyncio.subprocess.PIPE,
110
- stderr=asyncio.subprocess.PIPE,
111
- env=env
112
- )
113
-
114
- # Wait for the process to complete
115
- stdout, stderr = await process.communicate()
116
-
117
- # Store return code
118
- exit_code = process.returncode
119
- results["return_code"] = exit_code
120
-
121
- # Log some output for debugging
122
- if stdout:
123
- stdout_lines = stdout.decode().strip().split('\n')
124
- if stdout_lines and len(stdout_lines) > 0:
125
- print(f"Output from {model_name}: {stdout_lines[0]}")
126
-
127
- # Check if results were generated
128
- results_dir = Path(output_dir) / "results"
129
- if results_dir.exists():
130
- result_files = list(results_dir.glob("**/*.json"))
131
- if result_files:
132
- # Read the first results file
133
- with open(result_files[0], 'r') as f:
134
- test_results = json.load(f)
135
- results["results"] = test_results
136
- results["success"] = True
137
-
138
- except asyncio.CancelledError:
139
- results["error"] = "Task cancelled"
140
- print(f"Task cancelled for {model_name}")
141
- except Exception as e:
142
- results["error"] = f"Exception: {str(e)}"
143
- print(f"Error running test for {model_name}: {str(e)}")
144
- finally:
145
- # Delete temporary file
146
- try:
147
- os.unlink(temp_file_path)
148
- except:
149
- pass
150
-
151
- print(f"End time for {model_name}: {time.strftime('%H:%M:%S')}")
152
- return results
153
-
154
- async def run_parallel_tests(models: List[Tuple[str, str]]) -> List[Dict[str, Any]]:
155
- """
156
- Run tests in parallel for multiple models using asyncio
157
- """
158
- print(f"Starting parallel tests for {len(models)} models")
159
-
160
- # Create tasks for each model
161
- tasks = [run_lighteval_test_for_model(model) for model in models]
162
-
163
- # Run all tasks concurrently and gather results
164
- model_results = await asyncio.gather(*tasks, return_exceptions=True)
165
-
166
- # Process results
167
- results = []
168
- for i, result in enumerate(model_results):
169
- if isinstance(result, Exception):
170
- # Handle exception
171
- model_name, provider = models[i]
172
- print(f"Test failed for {model_name}: {str(result)}")
173
- results.append({
174
- "model_name": model_name,
175
- "provider": provider,
176
- "success": False,
177
- "error": str(result),
178
- "results": None,
179
- "return_code": None
180
- })
181
- else:
182
- # Valid result
183
- results.append(result)
184
- print(f"Test completed for {result['model_name']}")
185
-
186
- return results
187
-
188
- def format_comparison_results(results: List[Dict[str, Any]]) -> Dict[str, Any]:
189
- """
190
- Format results for easy comparison between models
191
- """
192
- comparison = {
193
- "metadata": {
194
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
195
- "total_models_tested": len(results),
196
- "successful_tests": len([r for r in results if r["success"]])
197
- },
198
- "models_comparison": []
199
- }
200
-
201
- # Sort models by accuracy (if available) or name
202
- sorted_results = sorted(
203
- results,
204
- key=lambda x: (
205
- x["results"]["results"]["all"]["accuracy"] if x["success"] and x["results"] else -1,
206
- x["model_name"]
207
- ),
208
- reverse=True
209
- )
210
-
211
- for result in sorted_results:
212
- model_result = {
213
- "model_name": result["model_name"],
214
- "provider": result["provider"],
215
- "success": result["success"]
216
- }
217
-
218
- if result["success"] and result["results"]:
219
- model_result.update({
220
- "accuracy": result["results"]["results"]["all"]["accuracy"],
221
- "accuracy_stderr": result["results"]["results"]["all"]["accuracy_stderr"],
222
- "evaluation_time": float(result["results"]["config_general"]["total_evaluation_time_secondes"])
223
- })
224
- else:
225
- model_result["error"] = result["error"]
226
-
227
- comparison["models_comparison"].append(model_result)
228
-
229
- return comparison
230
-
231
- async def main_async():
232
- """
233
- Async main function to run parallel tests
234
- """
235
- print("Starting parallel lighteval tests")
236
- start_time = time.time()
237
-
238
- # Run tests in parallel
239
- results = await run_parallel_tests(INIT_MODELS)
240
-
241
- # Save detailed results
242
- detailed_output_file = "parallel_test_detailed_results.json"
243
- with open(detailed_output_file, 'w') as f:
244
- json.dump(results, f, indent=2)
245
-
246
- # Generate and save comparison results
247
- comparison = format_comparison_results(results)
248
- comparison_file = "models_comparison.json"
249
- with open(comparison_file, 'w') as f:
250
- json.dump(comparison, f, indent=2)
251
-
252
- # Print summary
253
- print("\nTest Summary:")
254
- for model in comparison["models_comparison"]:
255
- status = "✅" if model["success"] else "❌"
256
- print(f"{status} {model['model_name']} ({model['provider']})")
257
- if not model["success"]:
258
- print(f" Error: {model['error']}")
259
- else:
260
- print(f" Accuracy: {model['accuracy']:.2%} (±{model['accuracy_stderr']:.2%})")
261
- print(f" Evaluation time: {model['evaluation_time']:.2f}s")
262
-
263
- duration = time.time() - start_time
264
- print(f"\nTotal execution time: {duration:.2f} seconds")
265
- print(f"Detailed results saved to: {detailed_output_file}")
266
- print(f"Comparison results saved to: {comparison_file}")
267
-
268
- def main():
269
- """
270
- Main function to run parallel tests
271
- """
272
- # Create event loop and run the async main
273
- loop = asyncio.get_event_loop()
274
- loop.run_until_complete(main_async())
275
- loop.close()
276
-
277
- if __name__ == "__main__":
278
- main()
backend/tests/test_provider_parallel_support.py DELETED
@@ -1,227 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Script pour tester si un fournisseur d'API supporte réellement les requêtes parallèles
4
- """
5
- import os
6
- import sys
7
- import time
8
- import asyncio
9
- import json
10
- from pathlib import Path
11
- from datetime import datetime
12
-
13
- # Ensure environment is properly configured
14
- from dotenv import load_dotenv
15
- load_dotenv()
16
-
17
- # Définir le modèle et le fournisseur à tester
18
- MODEL_NAME = "Qwen/QwQ-32B"
19
- PROVIDER = "novita"
20
- REQUEST_COUNT = 5 # Nombre de requêtes
21
-
22
- # Liste de questions
23
- PROMPTS = [
24
- "Explain in detail how parallel computing has transformed modern data processing.",
25
- "Describe the fundamental differences between CPU and GPU architectures.",
26
- "Analyze the key challenges in distributed systems design.",
27
- "Discuss the evolution of natural language processing from rule-based systems to modern transformer architectures.",
28
- "Explain the concept of quantum computing and how it differs from classical computing paradigms."
29
- ]
30
-
31
- async def send_request(prompt, request_id=None, show_logs=True):
32
- """Envoie une requête au modèle et mesure le temps d'exécution"""
33
- if show_logs and request_id is not None:
34
- print(f"Démarrage requête {request_id} à {datetime.now().strftime('%H:%M:%S.%f')[:-3]}")
35
-
36
- start_time = time.time()
37
-
38
- cmd_args = [
39
- "curl", "-s",
40
- "-X", "POST",
41
- f"https://api-inference.huggingface.co/models/{MODEL_NAME}",
42
- "-H", f"Authorization: Bearer {os.environ.get('HF_TOKEN')}",
43
- "-H", "Content-Type: application/json",
44
- "-d", json.dumps({
45
- "inputs": prompt,
46
- "parameters": {
47
- "provider": PROVIDER,
48
- "max_new_tokens": 20
49
- }
50
- })
51
- ]
52
-
53
- process = await asyncio.create_subprocess_exec(
54
- *cmd_args,
55
- stdout=asyncio.subprocess.PIPE,
56
- stderr=asyncio.subprocess.PIPE
57
- )
58
-
59
- stdout, stderr = await process.communicate()
60
-
61
- end_time = time.time()
62
- duration = end_time - start_time
63
-
64
- response = stdout.decode("utf-8")
65
- stderr_output = stderr.decode("utf-8")
66
-
67
- # Déterminer le succès
68
- is_success = False
69
- try:
70
- response_json = json.loads(response)
71
- is_success = process.returncode == 0 and isinstance(response_json, list) and "generated_text" in response_json[0]
72
- except json.JSONDecodeError:
73
- is_success = process.returncode == 0 and not ("error" in response.lower())
74
- except Exception:
75
- is_success = process.returncode == 0
76
-
77
- # Extraire message d'erreur si échec
78
- error_message = None
79
- if not is_success:
80
- try:
81
- if "error" in response.lower():
82
- try:
83
- response_json = json.loads(response)
84
- if "error" in response_json:
85
- error_message = response_json["error"]
86
- except:
87
- error_message = f"Erreur non-JSON: {response}"
88
- elif stderr_output:
89
- error_message = stderr_output
90
- else:
91
- error_message = f"Réponse: {response}"
92
- except:
93
- error_message = f"Erreur inconnue. Code: {process.returncode}"
94
-
95
- if show_logs and request_id is not None:
96
- print(f"Fin requête {request_id} à {datetime.now().strftime('%H:%M:%S.%f')[:-3]} (durée: {duration:.2f}s)")
97
- if not is_success:
98
- print(f"ERREUR requête {request_id}: {error_message[:100]}..." if error_message and len(error_message) > 100 else error_message)
99
-
100
- return {
101
- "request_id": request_id,
102
- "prompt": prompt,
103
- "start_time": start_time,
104
- "end_time": end_time,
105
- "duration": duration,
106
- "success": is_success,
107
- "response": response,
108
- "error_message": error_message
109
- }
110
-
111
- async def run_parallel_requests(prompts):
112
- """Exécute les requêtes en parallèle"""
113
- print(f"\n=== Test parallèle: {len(prompts)} requêtes pour {MODEL_NAME} ({PROVIDER}) ===")
114
- print(f"Heure de début: {datetime.now().strftime('%H:%M:%S')}")
115
-
116
- # Synchroniser le démarrage des requêtes
117
- start_event = asyncio.Event()
118
-
119
- async def synchronized_request(prompt, req_id):
120
- await start_event.wait()
121
- return await send_request(prompt, req_id)
122
-
123
- # Créer toutes les tâches
124
- tasks = [asyncio.create_task(synchronized_request(prompts[i], i)) for i in range(len(prompts))]
125
-
126
- # Attendre que toutes les tâches soient prêtes
127
- await asyncio.sleep(1)
128
-
129
- # Lancer toutes les requêtes en même temps
130
- parallel_start_time = time.time()
131
- print(f"Démarrage synchronisé à {datetime.now().strftime('%H:%M:%S.%f')[:-3]}")
132
- start_event.set()
133
-
134
- # Attendre que toutes les tâches se terminent
135
- results = await asyncio.gather(*tasks)
136
- parallel_end_time = time.time()
137
- parallel_duration = parallel_end_time - parallel_start_time
138
-
139
- print(f"Test parallèle terminé en {parallel_duration:.2f}s\n")
140
- return results, parallel_duration
141
-
142
- async def run_sequential_requests(prompts):
143
- """Exécute les mêmes requêtes séquentiellement"""
144
- print(f"\n=== Test séquentiel: {len(prompts)} requêtes pour {MODEL_NAME} ({PROVIDER}) ===")
145
- print(f"Heure de début: {datetime.now().strftime('%H:%M:%S')}")
146
-
147
- sequential_start_time = time.time()
148
- results = []
149
-
150
- for i, prompt in enumerate(prompts):
151
- print(f"Requête séquentielle {i}...")
152
- result = await send_request(prompt, i)
153
- results.append(result)
154
-
155
- sequential_end_time = time.time()
156
- sequential_duration = sequential_end_time - sequential_start_time
157
-
158
- print(f"Test séquentiel terminé en {sequential_duration:.2f}s\n")
159
- return results, sequential_duration
160
-
161
- async def run_tests():
162
- """Exécute les tests parallèles puis séquentiels et compare les résultats"""
163
- global_start = time.time()
164
- prompts = PROMPTS[:REQUEST_COUNT] # Utiliser le nombre de prompts spécifié
165
-
166
- # 1. Test parallèle
167
- parallel_results, parallel_duration = await run_parallel_requests(prompts)
168
-
169
- # 2. Test séquentiel
170
- sequential_results, sequential_duration = await run_sequential_requests(prompts)
171
-
172
- # 3. Analyser les résultats
173
- global_end = time.time()
174
- total_duration = global_end - global_start
175
-
176
- # Calculer les métriques
177
- parallel_success = sum(1 for r in parallel_results if r["success"])
178
- sequential_success = sum(1 for r in sequential_results if r["success"])
179
-
180
- # Calculer le facteur de parallélisme réel (temps séquentiel / temps parallèle)
181
- if parallel_duration > 0:
182
- parallelism_factor = sequential_duration / parallel_duration
183
- else:
184
- parallelism_factor = 0
185
-
186
- # Pourcentage d'amélioration
187
- improvement_percent = (1 - (parallel_duration / sequential_duration)) * 100 if sequential_duration > 0 else 0
188
-
189
- # Afficher le résumé
190
- print("\n====== RÉSUMÉ DES TESTS ======")
191
- print(f"Modèle: {MODEL_NAME}, Provider: {PROVIDER}, Requêtes: {len(prompts)}")
192
- print(f"\nDurée test parallèle: {parallel_duration:.2f}s ({parallel_success}/{len(prompts)} réussies)")
193
- print(f"Durée test séquentiel: {sequential_duration:.2f}s ({sequential_success}/{len(prompts)} réussies)")
194
- print(f"Facteur de parallélisme: {parallelism_factor:.2f}x")
195
- print(f"Amélioration: {improvement_percent:.1f}%")
196
-
197
- if parallelism_factor >= len(prompts) * 0.8:
198
- conclusion = "EXCELLENT parallélisme (proche du théorique maximum)"
199
- elif parallelism_factor >= 2:
200
- conclusion = "BON parallélisme (significativement meilleur que séquentiel)"
201
- elif parallelism_factor >= 1.3:
202
- conclusion = "MOYEN parallélisme (légèrement meilleur que séquentiel)"
203
- else:
204
- conclusion = "FAIBLE ou PAS DE parallélisme (pas d'avantage significatif)"
205
-
206
- print(f"\nConclusion: {conclusion}")
207
-
208
- # Enregistrer les résultats
209
- output_file = f"parallel_test_{PROVIDER}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
210
- with open(output_file, 'w') as f:
211
- json.dump({
212
- "model": MODEL_NAME,
213
- "provider": PROVIDER,
214
- "request_count": len(prompts),
215
- "parallel_duration": parallel_duration,
216
- "sequential_duration": sequential_duration,
217
- "parallelism_factor": parallelism_factor,
218
- "improvement_percent": improvement_percent,
219
- "conclusion": conclusion,
220
- "parallel_results": parallel_results,
221
- "sequential_results": sequential_results
222
- }, f, indent=2)
223
-
224
- print(f"\nRésultats détaillés sauvegardés dans {output_file}")
225
-
226
- if __name__ == "__main__":
227
- asyncio.run(run_tests())
backend/tests/test_yourbench_results.py DELETED
@@ -1,394 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Script pour tester les résultats de Yourbench et vérifier les datasets sur le Hub Hugging Face.
4
- """
5
-
6
- import os
7
- import sys
8
- import json
9
- import argparse
10
- import requests
11
- import tempfile
12
- from datetime import datetime
13
- from typing import Dict, List, Any, Optional, Tuple
14
-
15
- # Vérifier si les bibliothèques nécessaires sont installées
16
- try:
17
- from dotenv import load_dotenv
18
- from huggingface_hub import HfApi, DatasetInfo, ModelInfo
19
- from loguru import logger
20
- import pandas as pd
21
- except ImportError:
22
- print("Installation des dépendances...")
23
- import subprocess
24
- subprocess.run(["pip", "install", "python-dotenv", "huggingface_hub", "loguru", "pandas", "pyarrow"], check=True)
25
- from dotenv import load_dotenv
26
- from huggingface_hub import HfApi, DatasetInfo, ModelInfo
27
- from loguru import logger
28
- import pandas as pd
29
-
30
- # Charger les variables d'environnement depuis .env
31
- load_dotenv()
32
-
33
- # Configuration de la journalisation
34
- logger.remove()
35
- logger.add(sys.stderr, format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>")
36
- logger.add("yourbench_tests.log", rotation="10 MB", retention="1 week")
37
-
38
- def configure_argument_parser() -> argparse.ArgumentParser:
39
- """Configure le parser d'arguments."""
40
- parser = argparse.ArgumentParser(description="Tester les résultats de Yourbench et vérifier les datasets")
41
- parser.add_argument("--dataset", type=str, help="Nom du dataset à vérifier (sans le nom de l'organisation)")
42
- parser.add_argument("--org", type=str, default=os.environ.get("HF_ORGANIZATION", "yourbench"),
43
- help="Organisation Hugging Face (défaut: valeur de HF_ORGANIZATION dans .env ou 'yourbench')")
44
- parser.add_argument("--verbose", "-v", action="store_true", help="Afficher des informations détaillées")
45
- return parser
46
-
47
- class YourbenchTester:
48
- """Classe pour tester les résultats et datasets de Yourbench."""
49
-
50
- def __init__(self, organization: str, verbose: bool = False):
51
- """Initialise le testeur Yourbench.
52
-
53
- Args:
54
- organization: Nom de l'organisation sur Hugging Face
55
- verbose: Afficher des informations détaillées
56
- """
57
- self.organization = organization
58
- self.verbose = verbose
59
- self.hf_token = os.environ.get("HF_TOKEN")
60
-
61
- if not self.hf_token:
62
- logger.error("Variable d'environnement HF_TOKEN non trouvée dans le fichier .env")
63
- sys.exit(1)
64
-
65
- self.api = HfApi(token=self.hf_token)
66
- logger.info(f"Initialisation du testeur pour l'organisation: {organization}")
67
-
68
- def test_dataset_exists(self, dataset_name: str) -> Optional[DatasetInfo]:
69
- """Vérifie si un dataset existe sur le Hub.
70
-
71
- Args:
72
- dataset_name: Nom du dataset à vérifier
73
-
74
- Returns:
75
- Informations sur le dataset s'il existe, None sinon
76
- """
77
- full_dataset_name = f"{self.organization}/{dataset_name}"
78
- logger.info(f"Vérification de l'existence du dataset: {full_dataset_name}")
79
-
80
- try:
81
- dataset_info = self.api.dataset_info(full_dataset_name)
82
- logger.success(f"Dataset {full_dataset_name} trouvé!")
83
-
84
- if self.verbose:
85
- logger.info(f"ID: {dataset_info.id}")
86
- logger.info(f"Dernière modification: {dataset_info.lastModified}")
87
- logger.info(f"SHA: {dataset_info.sha}")
88
-
89
- return dataset_info
90
-
91
- except Exception as e:
92
- logger.error(f"Impossible de trouver le dataset {full_dataset_name}: {str(e)}")
93
- return None
94
-
95
- def analyze_dataset_content(self, dataset_name: str) -> Tuple[bool, Dict[str, Any]]:
96
- """Analyse le contenu d'un dataset.
97
-
98
- Args:
99
- dataset_name: Nom du dataset à analyser
100
-
101
- Returns:
102
- Tuple contenant un booléen indiquant si l'analyse a réussi et un dictionnaire de statistiques
103
- """
104
- full_dataset_name = f"{self.organization}/{dataset_name}"
105
- logger.info(f"Analyse du contenu du dataset: {full_dataset_name}")
106
-
107
- stats = {
108
- "fichiers": 0,
109
- "taille_totale": 0,
110
- "fichiers_json": 0,
111
- "fichiers_parquet": 0,
112
- "a_questions": False,
113
- "nb_questions": 0,
114
- "structure_parquet": {},
115
- "types_documents": set()
116
- }
117
-
118
- try:
119
- # Lister les fichiers dans le dataset
120
- files = self.api.list_repo_files(full_dataset_name, repo_type="dataset")
121
- stats["fichiers"] = len(files)
122
-
123
- if self.verbose:
124
- logger.info(f"Fichiers trouvés dans le dataset: {len(files)}")
125
- for file in files[:10]: # Limiter à 10 fichiers pour éviter un affichage trop verbeux
126
- logger.info(f" - {file}")
127
- if len(files) > 10:
128
- logger.info(f" ... et {len(files) - 10} fichiers supplémentaires")
129
-
130
- # Vérifier la présence de fichiers questions
131
- question_files = [f for f in files if "question" in f.lower() and f.endswith(".json")]
132
- stats["fichiers_json"] = len([f for f in files if f.endswith(".json")])
133
-
134
- # Vérifier les fichiers Parquet qui sont utilisés par Yourbench
135
- parquet_files = [f for f in files if f.endswith(".parquet")]
136
- stats["fichiers_parquet"] = len(parquet_files)
137
-
138
- if parquet_files:
139
- logger.info(f"Fichiers Parquet trouvés: {len(parquet_files)}")
140
-
141
- # Analyser un échantillon de fichiers Parquet
142
- for parquet_file in parquet_files[:3]: # Limiter à 3 fichiers pour l'analyse
143
- category = parquet_file.split('/')[0] if '/' in parquet_file else "unknown"
144
-
145
- logger.info(f"Analyse du fichier Parquet: {parquet_file} (catégorie: {category})")
146
-
147
- try:
148
- # Télécharger le fichier Parquet
149
- temp_file = self.api.hf_hub_download(
150
- repo_id=full_dataset_name,
151
- filename=parquet_file,
152
- repo_type="dataset"
153
- )
154
-
155
- # Lire le fichier Parquet avec pandas
156
- df = pd.read_parquet(temp_file)
157
-
158
- # Ajouter des statistiques
159
- stats["structure_parquet"][category] = {
160
- "colonnes": list(df.columns),
161
- "nb_lignes": len(df),
162
- "exemple": df.iloc[0].to_dict() if len(df) > 0 else {}
163
- }
164
-
165
- # Vérifier si ce fichier contient des questions
166
- if any(col for col in df.columns if "question" in col.lower()):
167
- stats["a_questions"] = True
168
- question_col = next(col for col in df.columns if "question" in col.lower())
169
- stats["nb_questions"] = len(df)
170
-
171
- # Récupérer un exemple de question
172
- if len(df) > 0 and question_col in df.columns:
173
- logger.info(f"Exemple de question: {df[question_col].iloc[0][:100]}...")
174
-
175
- # Identifier les types de documents si disponible
176
- if "doc_type" in df.columns and len(df) > 0:
177
- doc_types = df["doc_type"].unique()
178
- stats["types_documents"].update(doc_types)
179
-
180
- except Exception as e:
181
- logger.warning(f"Erreur lors de l'analyse du fichier {parquet_file}: {str(e)}")
182
-
183
- # Convertir le set en liste pour la sérialisation JSON
184
- stats["types_documents"] = list(stats["types_documents"])
185
-
186
- if question_files:
187
- stats["a_questions"] = True
188
-
189
- # Analyser un fichier de questions pour comprendre sa structure
190
- sample_file = question_files[0]
191
- content = self.api.hf_hub_download(
192
- repo_id=full_dataset_name,
193
- filename=sample_file,
194
- repo_type="dataset"
195
- )
196
-
197
- with open(content, 'r') as f:
198
- data = json.load(f)
199
-
200
- if isinstance(data, list):
201
- stats["nb_questions"] = len(data)
202
- elif isinstance(data, dict) and "questions" in data:
203
- stats["nb_questions"] = len(data["questions"])
204
-
205
- logger.success(f"Fichiers de questions trouvés: {len(question_files)}")
206
- logger.info(f"Exemple de fichier analysé: {sample_file}")
207
- logger.info(f"Nombre de questions trouvées: {stats['nb_questions']}")
208
-
209
- return True, stats
210
-
211
- except Exception as e:
212
- logger.error(f"Erreur lors de l'analyse du dataset {full_dataset_name}: {str(e)}")
213
- return False, stats
214
-
215
- def check_evaluation_results(self, dataset_name: str) -> bool:
216
- """Vérifie s'il existe des résultats d'évaluation pour ce dataset.
217
-
218
- Args:
219
- dataset_name: Nom du dataset à vérifier
220
-
221
- Returns:
222
- True si des résultats d'évaluation existent, False sinon
223
- """
224
- logger.info(f"Recherche de résultats d'évaluation pour le dataset: {dataset_name}")
225
-
226
- try:
227
- # Lister tous les datasets de l'organisation
228
- datasets = self.api.list_datasets(author=self.organization)
229
-
230
- # Chercher les datasets d'évaluation
231
- eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]
232
-
233
- if self.verbose:
234
- logger.info(f"Datasets d'évaluation trouvés: {len(eval_datasets)}")
235
- for ds in eval_datasets[:5]:
236
- logger.info(f" - {ds.id}")
237
-
238
- # Vérifier si le dataset spécifié est mentionné dans les évaluations
239
- for eval_ds in eval_datasets:
240
- try:
241
- # Télécharger le README pour voir si le dataset est mentionné
242
- readme_path = self.api.hf_hub_download(
- repo_id=eval_ds.id,
- filename="README.md",
- repo_type="dataset"
- )
-
- with open(readme_path, 'r') as f:
- readme_content = f.read()
-
- if dataset_name in readme_content:
- logger.success(f"Résultats d'évaluation trouvés dans: {eval_ds.id}")
- return True
- except:
- continue
-
- logger.warning(f"Aucun résultat d'évaluation trouvé pour le dataset: {dataset_name}")
- return False
-
- except Exception as e:
- logger.error(f"Erreur lors de la recherche de résultats d'évaluation: {str(e)}")
- return False
-
- def check_model_performances(self, dataset_name: str) -> Dict[str, float]:
- """Vérifie les performances des modèles sur le dataset spécifié.
-
- Args:
- dataset_name: Nom du dataset à vérifier
-
- Returns:
- Dictionnaire des performances des modèles (model_name -> score)
- """
- logger.info(f"Vérification des performances des modèles sur le dataset: {dataset_name}")
- performances = {}
-
- try:
- # Cette partie est spéculative car nous ne connaissons pas la structure exacte
- # des résultats. Une approche possible serait de chercher des fichiers JSON
- # contenant des métriques dans les datasets d'évaluation.
-
- # Chercher les datasets d'évaluation
- datasets = self.api.list_datasets(author=self.organization)
- eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]
-
- for eval_ds in eval_datasets:
- try:
- files = self.api.list_repo_files(eval_ds.id, repo_type="dataset")
- result_files = [f for f in files if "result" in f.lower() and f.endswith(".json")]
-
- for result_file in result_files:
- file_path = self.api.hf_hub_download(
- repo_id=eval_ds.id,
- filename=result_file,
- repo_type="dataset"
- )
-
- with open(file_path, 'r') as f:
- results = json.load(f)
-
- # Analyse basique des résultats (à adapter selon la structure réelle)
- if "model_name" in results and "metrics" in results:
- model_name = results["model_name"]
- metrics = results["metrics"]
-
- # Prendre la première métrique trouvée comme score
- if metrics and isinstance(metrics, dict):
- first_metric = list(metrics.keys())[0]
- performances[model_name] = metrics[first_metric]
- except:
- continue
-
- if performances:
- logger.success(f"Performances trouvées pour {len(performances)} modèles")
- for model, score in performances.items():
- logger.info(f" - {model}: {score}")
- else:
- logger.warning("Aucune performance de modèle trouvée")
-
- return performances
-
- except Exception as e:
- logger.error(f"Erreur lors de la vérification des performances: {str(e)}")
- return {}
-
- def main():
- """Fonction principale."""
- parser = configure_argument_parser()
- args = parser.parse_args()
-
- if not args.dataset:
- logger.error("Veuillez spécifier un dataset avec --dataset")
- parser.print_help()
- return
-
- # Créer le testeur
- tester = YourbenchTester(args.org, args.verbose)
-
- # 1. Vérifier l'existence du dataset
- dataset_info = tester.test_dataset_exists(args.dataset)
-
- if not dataset_info:
- logger.error(f"Le dataset {args.org}/{args.dataset} n'existe pas ou n'est pas accessible")
- return
-
- # 2. Analyser le contenu du dataset
- success, stats = tester.analyze_dataset_content(args.dataset)
-
- if success:
- logger.info("\n=== Statistiques du dataset ===")
- logger.info(f"Nombre de fichiers: {stats['fichiers']}")
- logger.info(f"Fichiers JSON: {stats['fichiers_json']}")
- logger.info(f"Fichiers Parquet: {stats['fichiers_parquet']}")
- logger.info(f"Contient des questions: {'Oui' if stats['a_questions'] else 'Non'}")
-
- if stats['a_questions']:
- logger.info(f"Nombre de questions: {stats['nb_questions']}")
-
- if 'types_documents' in stats and stats['types_documents']:
- logger.info(f"Types de documents: {', '.join(stats['types_documents'])}")
-
- # Afficher la structure des fichiers Parquet
- if 'structure_parquet' in stats and stats['structure_parquet']:
- logger.info("\n=== Structure des fichiers Parquet ===")
- for category, info in stats['structure_parquet'].items():
- logger.info(f"\nCatégorie: {category}")
- logger.info(f"Nombre de lignes: {info['nb_lignes']}")
- logger.info(f"Colonnes: {', '.join(info['colonnes'])}")
-
- if args.verbose and 'exemple' in info and info['exemple']:
- logger.info("\nExemple de ligne:")
- for key, value in info['exemple'].items():
- # Tronquer les valeurs trop longues
- if isinstance(value, str) and len(value) > 100:
- value = value[:100] + "..."
- logger.info(f" {key}: {value}")
-
- # 3. Vérifier s'il existe des résultats d'évaluation
- has_evaluations = tester.check_evaluation_results(args.dataset)
-
- if has_evaluations:
- # 4. Vérifier les performances des modèles
- performances = tester.check_model_performances(args.dataset)
-
- if performances:
- logger.info("\n=== Classement des modèles ===")
- # Trier les modèles par score (du plus élevé au plus bas)
- sorted_models = sorted(performances.items(), key=lambda x: x[1], reverse=True)
- for i, (model, score) in enumerate(sorted_models, 1):
- logger.info(f"{i}. {model}: {score:.4f}")
-
- logger.success("Test terminé !")
-
- if __name__ == "__main__":
- main()
backend/yourbench_simple_demo.egg-info/SOURCES.txt CHANGED
@@ -2,14 +2,6 @@ README.md
  pyproject.toml
  lighteval_task/__init__.py
  lighteval_task/lighteval_task.py
- tests/test_evaluation.py
- tests/test_hf_upload.py
- tests/test_inference.py
- tests/test_lighteval.py
- tests/test_openai.py
- tests/test_parallel_lighteval.py
- tests/test_provider_parallel_support.py
- tests/test_yourbench_results.py
  yourbench_simple_demo.egg-info/PKG-INFO
  yourbench_simple_demo.egg-info/SOURCES.txt
  yourbench_simple_demo.egg-info/dependency_links.txt

  pyproject.toml
  lighteval_task/__init__.py
  lighteval_task/lighteval_task.py
  yourbench_simple_demo.egg-info/PKG-INFO
  yourbench_simple_demo.egg-info/SOURCES.txt
  yourbench_simple_demo.egg-info/dependency_links.txt
frontend/src/components/BenchmarkCreateForm.jsx CHANGED
@@ -1,48 +1,26 @@
- import React, { useState, useRef, useEffect } from "react";
  import {
  Box,
  Paper,
  Typography,
  CircularProgress,
- Alert,
  Button,
- Stepper,
- Step,
- StepLabel,
  } from "@mui/material";
- import { useLocation } from "react-router-dom";
  import CloudUploadIcon from "@mui/icons-material/CloudUpload";
  import AutoFixHighIcon from "@mui/icons-material/AutoFixHigh";
- import AuthContainer from "./shared/AuthContainer";
  import { useThemeMode } from "../hooks/useThemeMode";
  import getTheme from "../config/theme";
  import API_CONFIG from "../config/api";

  /**
- * Component to display a stepper with three steps: Login, Upload File, and Generate
- *
- * @param {Object} props - Component props
- * @param {number} props.activeStep - Current active step (0-based index)
- * @returns {JSX.Element} Stepper component
- */
- const StepsDisplay = ({ activeStep }) => {
- const steps = ["Login", "Upload File", "Generate"];
-
- return (
- <Box sx={{ width: "100%", mb: 4 }}>
- <Stepper activeStep={activeStep} alternativeLabel>
- {steps.map((label) => (
- <Step key={label}>
- <StepLabel>{label}</StepLabel>
- </Step>
- ))}
- </Stepper>
- </Box>
- );
- };
-
- /**
- * Component for creating a new benchmark, including authentication, file upload, and generation initiation
  *
  * @param {Object} props - Component props
  * @param {Function} props.onStartGeneration - Callback when generation starts with sessionId
@@ -54,31 +32,36 @@ function BenchmarkCreateForm({ onStartGeneration }) {
  const [isDragging, setIsDragging] = useState(false);
  const [uploadStatus, setUploadStatus] = useState(null);
  const [isLoading, setIsLoading] = useState(false);
- const [activeStep, setActiveStep] = useState(0);
  const [sessionId, setSessionId] = useState(null);
  const fileInputRef = useRef(null);
- const location = useLocation();
-
- // Check if we're coming back from an OAuth redirect
- useEffect(() => {
- // If we have code in URL parameters, it's an OAuth callback
- const params = new URLSearchParams(window.location.search);
- if (params.has("code")) {
- console.log("Detected OAuth callback, cleaning URL");

- // Remove the query parameters from the URL without reloading
- window.history.replaceState({}, document.title, window.location.pathname);

- // Check if we have auth data in localStorage after a brief delay to let OAuth process complete
- setTimeout(() => {
- const storedAuth = localStorage.getItem("hf_oauth");
- if (storedAuth) {
- console.log("Found auth data after redirect, refreshing UI state");
- setActiveStep(1); // Move to next step if authenticated
- }
- }, 1000);
- }
- }, [location]);

  const handleDragOver = (e) => {
  e.preventDefault();
@@ -97,7 +80,7 @@ function BenchmarkCreateForm({ onStartGeneration }) {
  const file = e.target.files[0];
  if (!file) return;

- // Vérifier si c'est un PDF, TXT, HTML ou MD
  if (
  !file.name.endsWith(".pdf") &&
  !file.name.endsWith(".txt") &&
@@ -117,6 +100,8 @@ function BenchmarkCreateForm({ onStartGeneration }) {
  const handleFileUpload = async (file) => {
  setIsLoading(true);
  setUploadStatus(null);

  try {
  const formData = new FormData();
@@ -134,20 +119,22 @@ function BenchmarkCreateForm({ onStartGeneration }) {
  success: true,
  message: `File ${result.filename} uploaded successfully`,
  });
- // Store the session ID for the benchmark generation
  setSessionId(result.session_id);
- setActiveStep(2); // Advance to Generate step after successful upload
  } else {
  setUploadStatus({
  success: false,
  message: result.error || "Upload failed",
  });
  }
  } catch (error) {
  setUploadStatus({
  success: false,
  message: "Server connection error",
  });
  } finally {
  setIsLoading(false);
  }
@@ -163,7 +150,7 @@ function BenchmarkCreateForm({ onStartGeneration }) {
  return;
  }

- // Vérifier si c'est un PDF, TXT, HTML ou MD
  if (
  !file.name.endsWith(".pdf") &&
  !file.name.endsWith(".txt") &&
@@ -180,114 +167,175 @@ function BenchmarkCreateForm({ onStartGeneration }) {
  handleFileUpload(file);
  };

  const handleGenerateClick = () => {
  if (onStartGeneration && sessionId) {
- onStartGeneration(sessionId);
  }
  };

  return (
- <>
- <StepsDisplay activeStep={activeStep} />

- {/* Authentication step */}
- {activeStep === 0 && (
- <AuthContainer
- actionText="use this demo"
- onSuccess={() => setActiveStep(1)}
- />
- )}

- {/* File upload step */}
- {activeStep === 1 && (
- <Paper
- elevation={3}
- sx={{
- p: 4,
- mt: 3,
- mb: 3,
- border: isDragging
  ? `2px dashed ${theme.palette.primary.main}`
- : "2px dashed #ccc",
- backgroundColor: isDragging ? "rgba(0, 0, 0, 0.05)" : "transparent",
- display: "flex",
- flexDirection: "column",
- alignItems: "center",
- justifyContent: "center",
- minHeight: 200,
- cursor: "pointer",
- transition: "all 0.3s ease",
- }}
- onDragOver={handleDragOver}
- onDragLeave={handleDragLeave}
- onDrop={handleDrop}
- onClick={handleClick}
- >
- <input
- type="file"
- ref={fileInputRef}
- onChange={handleFileChange}
- accept=".pdf,.txt,.html,.md"
- style={{ display: "none" }}
- />
- <CloudUploadIcon
- sx={{ fontSize: 60, color: "text.secondary", mb: 1 }}
- />
- <Typography variant="h6" component="div" gutterBottom>
- Drag and drop your file here or click to browse
- </Typography>
- <Typography variant="body2" color="text.secondary">
- Accepted formats: PDF, TXT, HTML, MD
- </Typography>

- {isLoading && (
- <Box sx={{ mt: 2 }}>
- <CircularProgress size={30} />
- </Box>
- )}

- {uploadStatus && (
- <Alert
- severity={uploadStatus.success ? "success" : "error"}
- sx={{ mt: 2, width: "100%" }}
- >
- {uploadStatus.message}
- </Alert>
- )}
- </Paper>
- )}

- {/* Generate button step */}
- {activeStep === 2 && (
- <Paper
- elevation={3}
- sx={{
- p: 4,
- mt: 3,
- display: "flex",
- flexDirection: "column",
- alignItems: "center",
- justifyContent: "center",
- minHeight: 200,
- }}
  >
- <AutoFixHighIcon
- sx={{ fontSize: 60, color: "text.secondary", mb: 1 }}
- />
- <Typography variant="h6" component="div" gutterBottom>
- Ready to generate your benchmark
- </Typography>
- <Button
- variant="contained"
- color="primary"
- onClick={handleGenerateClick}
- sx={{ mt: 2 }}
- >
- Generate Benchmark
- </Button>
- </Paper>
- )}
- </>
  );
  }

+ import React, { useState, useRef } from "react";
  import {
  Box,
  Paper,
  Typography,
  CircularProgress,
  Button,
+ Snackbar,
+ Alert,
+ Grid,
  } from "@mui/material";
  import CloudUploadIcon from "@mui/icons-material/CloudUpload";
  import AutoFixHighIcon from "@mui/icons-material/AutoFixHigh";
+ import InsertDriveFileIcon from "@mui/icons-material/InsertDriveFile";
+ import DescriptionIcon from "@mui/icons-material/Description";
+ import ArticleIcon from "@mui/icons-material/Article";
+ import MenuBookIcon from "@mui/icons-material/MenuBook";
  import { useThemeMode } from "../hooks/useThemeMode";
  import getTheme from "../config/theme";
  import API_CONFIG from "../config/api";

  /**
+ * Component for creating a new benchmark, including file upload and generation initiation
  *
  * @param {Object} props - Component props
  * @param {Function} props.onStartGeneration - Callback when generation starts with sessionId

  const [isDragging, setIsDragging] = useState(false);
  const [uploadStatus, setUploadStatus] = useState(null);
  const [isLoading, setIsLoading] = useState(false);
  const [sessionId, setSessionId] = useState(null);
+ const [openSnackbar, setOpenSnackbar] = useState(false);
+ const [selectedDocument, setSelectedDocument] = useState(null);
+ const [isDefaultDocument, setIsDefaultDocument] = useState(false);
  const fileInputRef = useRef(null);

+ const defaultDocuments = [
+ {
+ id: "the-bitter-lesson",
+ name: "The Bitter Lesson",
+ icon: <ArticleIcon sx={{ fontSize: 40 }} />,
+ description: "A seminal paper on AI development by Rich Sutton",
+ },
+ {
+ id: "hurricane-faq",
+ name: "Hurricane FAQ",
+ icon: <DescriptionIcon sx={{ fontSize: 40 }} />,
+ description: "Frequently asked questions about hurricanes",
+ },
+ {
+ id: "pokemon-guide",
+ name: "Pokemon Guide",
+ icon: <MenuBookIcon sx={{ fontSize: 40 }} />,
+ description: "A comprehensive guide to Pokemon",
+ },
+ ];

+ const handleCloseSnackbar = () => {
+ setOpenSnackbar(false);
+ };

  const handleDragOver = (e) => {
  e.preventDefault();

  const file = e.target.files[0];
  if (!file) return;

+ // Check if it's a PDF, TXT, HTML or MD
  if (
  !file.name.endsWith(".pdf") &&
  !file.name.endsWith(".txt") &&

  const handleFileUpload = async (file) => {
  setIsLoading(true);
  setUploadStatus(null);
+ setIsDefaultDocument(false);
+ setSelectedDocument(null);

  try {
  const formData = new FormData();

  success: true,
  message: `File ${result.filename} uploaded successfully`,
  });
+ setOpenSnackbar(true);
  setSessionId(result.session_id);
+ setSelectedDocument({ name: file.name });
  } else {
  setUploadStatus({
  success: false,
  message: result.error || "Upload failed",
  });
+ setOpenSnackbar(true);
  }
  } catch (error) {
  setUploadStatus({
  success: false,
  message: "Server connection error",
  });
+ setOpenSnackbar(true);
  } finally {
  setIsLoading(false);
  }

  return;
  }

+ // Check if it's a PDF, TXT, HTML or MD
  if (
  !file.name.endsWith(".pdf") &&
  !file.name.endsWith(".txt") &&

  handleFileUpload(file);
  };

+ const handleDefaultDocClick = (doc) => {
+ setSelectedDocument(doc);
+ setSessionId(doc.id);
+ setIsDefaultDocument(true);
+ };
+
  const handleGenerateClick = () => {
  if (onStartGeneration && sessionId) {
+ onStartGeneration(sessionId, isDefaultDocument);
  }
  };

  return (
+ <Box sx={{ mt: -2 }}>
+ <Typography
+ variant="subtitle1"
+ component="div"
+ align="center"
+ sx={{ mb: 2, color: "text.secondary" }}
+ >
+ Choose a sample document
+ </Typography>

+ <Grid container spacing={2} sx={{ mb: 2 }}>
+ {defaultDocuments.map((doc) => (
+ <Grid item xs={12} md={4} key={doc.id}>
+ <Box
+ elevation={2}
+ sx={{
+ p: 2,
+ display: "flex",
+ flexDirection: "column",
+ borderRadius: 1.5,
+ alignItems: "center",
+ cursor: "pointer",
+ transition: "all 0.2s ease",
+ height: "100%",
+ // border: "2px solid rgba(0, 0, 0, 0.1)",
+ border:
+ selectedDocument?.id === doc.id
+ ? `2px solid ${theme.palette.primary.main}`
+ : "2px solid rgba(0, 0, 0, 0.1)",
+ "&:hover": {
+ transform: "translateY(-2px)",
+ boxShadow: 3,
+ },
+ }}
+ onClick={() => handleDefaultDocClick(doc)}
+ >
+ <Box sx={{ color: "primary.main", mb: 1 }}>{doc.icon}</Box>
+ <Typography variant="subtitle1" component="div" gutterBottom>
+ {doc.name}
+ </Typography>
+ <Typography
+ variant="body2"
+ color="text.secondary"
+ align="center"
+ sx={{ flexGrow: 1 }}
+ >
+ {doc.description}
+ </Typography>
+ </Box>
+ </Grid>
+ ))}
+ </Grid>

+ <Typography
+ variant="subtitle1"
+ component="div"
+ align="center"
+ sx={{ mb: 2, color: "text.secondary" }}
+ >
+ Or upload your own ...
+ </Typography>
+
+ <Box
+ sx={{
+ p: 4,
+ mt: 2,
+ mb: 2,
+ borderRadius: 1.5,
+ border:
+ selectedDocument?.name && !isDefaultDocument
+ ? `2px solid ${theme.palette.primary.main}`
+ : isDragging
  ? `2px dashed ${theme.palette.primary.main}`
+ : "2px dashed rgba(0, 0, 0, 0.16)",
+ backgroundColor: isDragging ? "rgba(0, 0, 0, 0.05)" : "transparent",
+ display: "flex",
+ flexDirection: "column",
+ alignItems: "center",
+ justifyContent: "center",
+ minHeight: 180,
+ cursor: "pointer",
+ transition: "all 0.3s ease",
+ }}
+ onDragOver={handleDragOver}
+ onDragLeave={handleDragLeave}
+ onDrop={handleDrop}
+ onClick={handleClick}
+ >
+ <input
+ type="file"
+ ref={fileInputRef}
+ onChange={handleFileChange}
+ accept=".pdf,.txt,.html,.md"
+ style={{ display: "none" }}
+ />
+ {selectedDocument?.name && !isDefaultDocument ? (
+ <>
+ <InsertDriveFileIcon
+ sx={{ fontSize: 50, color: "primary.main", mb: 1 }}
+ />
+ <Typography variant="h6" component="div" gutterBottom>
+ {selectedDocument.name}
+ </Typography>
+ <Typography variant="body2" color="text.secondary">
+ Click to upload a different file
+ </Typography>
+ </>
+ ) : (
+ <>
+ <CloudUploadIcon
+ sx={{ fontSize: 50, color: "primary.main", mb: 1 }}
+ />
+ <Typography variant="h6" component="div" gutterBottom>
+ Drag and drop your file here or click to browse
+ </Typography>
+ <Typography variant="body2" color="text.secondary">
+ Accepted formats: PDF, TXT, HTML, MD
+ </Typography>
+ </>
+ )}

+ {isLoading && (
+ <Box sx={{ mt: 2 }}>
+ <CircularProgress size={30} />
+ </Box>
+ )}
+ </Box>

+ <Box sx={{ display: "flex", justifyContent: "center" }}>
+ <Button
+ variant="contained"
+ color="primary"
+ onClick={handleGenerateClick}
+ startIcon={<AutoFixHighIcon />}
+ disabled={!sessionId}
+ sx={{ mt: 2 }}
+ >
+ Generate Benchmark
+ </Button>
+ </Box>

+ <Snackbar
+ open={openSnackbar}
+ autoHideDuration={6000}
+ onClose={handleCloseSnackbar}
+ anchorOrigin={{ vertical: "bottom", horizontal: "right" }}
+ >
+ <Alert
+ onClose={handleCloseSnackbar}
+ severity={uploadStatus?.success ? "success" : "error"}
+ sx={{ width: "100%" }}
  >
+ {uploadStatus?.message}
+ </Alert>
+ </Snackbar>
+ </Box>
  );
  }
frontend/src/components/BenchmarkEvaluation.jsx CHANGED
@@ -3,15 +3,28 @@ import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
  import { useNavigate, useSearchParams } from "react-router-dom";
  import API_CONFIG from "../config/api";

  // Starting messages with their timing
  const STARTING_MESSAGES = [
- { message: "Initializing evaluation environment...", progress: 22 },
- { message: "Starting evaluation process...", progress: 54 },
- { message: "Evaluating models...", progress: 71 },
- { message: "Storing evaluation results...", progress: 100 },
  ];

- const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
  const [evaluationComplete, setEvaluationComplete] = useState(false);
  const [error, setError] = useState(null);
  const [elapsedTime, setElapsedTime] = useState(0);
@@ -21,6 +34,7 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
  const startTimeRef = useRef(null);
  const startingMessageIntervalRef = useRef(null);
  const pollingIntervalRef = useRef(null);

  const navigate = useNavigate();

@@ -33,21 +47,26 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {

  // Add effect to handle starting messages
  useEffect(() => {
- startingMessageIntervalRef.current = setInterval(() => {
- setStartingMessageIndex((prev) => {
- if (prev < STARTING_MESSAGES.length - 1) {
- return prev + 1;
- }
- return prev;
- });
- }, 20000); // Change message every 20 seconds

  return () => {
  if (startingMessageIntervalRef.current) {
  clearInterval(startingMessageIntervalRef.current);
  }
  };
- }, []);

  // Start evaluation when component mounts
  useEffect(() => {
@@ -62,7 +81,11 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
  setElapsedTime(timeElapsed);
  }, 1000);

- startEvaluation();

  // Clean up intervals on unmount
  return () => {
@@ -72,8 +95,25 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
  if (timerIntervalRef.current) {
  clearInterval(timerIntervalRef.current);
  }
  };
- }, []);

  // Format elapsed time as HH:MM:SS
  const formatElapsedTime = () => {

  import { useNavigate, useSearchParams } from "react-router-dom";
  import API_CONFIG from "../config/api";

+ // Temps de simulation en millisecondes pour les documents précalculés
+ const SIMULATION_DURATION = 20000; // 20 secondes
+
+ // Intervalle de changement des messages pour les documents standards vs précalculés
+ const MESSAGE_CHANGE_INTERVAL = {
+ DEFAULT: 20000, // 20 secondes pour documents standards
+ PRECALCULATED: 5000, // 5 secondes pour documents précalculés
+ };
+
  // Starting messages with their timing
  const STARTING_MESSAGES = [
+ { message: "Initializing evaluation environment...", progress: 0 },
+ { message: "Starting evaluation process...", progress: 27 },
+ { message: "Evaluating models...", progress: 54 },
+ { message: "Storing evaluation results...", progress: 84 },
  ];

+ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
+ const [searchParams] = useSearchParams();
+ const isDefault =
+ isDefaultDocument ||
+ ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"].includes(sessionId);
  const [evaluationComplete, setEvaluationComplete] = useState(false);
  const [error, setError] = useState(null);
  const [elapsedTime, setElapsedTime] = useState(0);

  const startTimeRef = useRef(null);
  const startingMessageIntervalRef = useRef(null);
  const pollingIntervalRef = useRef(null);
+ const simulationTimeoutRef = useRef(null);

  const navigate = useNavigate();

  // Add effect to handle starting messages
  useEffect(() => {
+ startingMessageIntervalRef.current = setInterval(
+ () => {
+ setStartingMessageIndex((prev) => {
+ if (prev < STARTING_MESSAGES.length - 1) {
+ return prev + 1;
+ }
+ return prev;
+ });
+ },
+ isDefault
+ ? MESSAGE_CHANGE_INTERVAL.PRECALCULATED
+ : MESSAGE_CHANGE_INTERVAL.DEFAULT
+ );

  return () => {
  if (startingMessageIntervalRef.current) {
  clearInterval(startingMessageIntervalRef.current);
  }
  };
+ }, [isDefault]);

  // Start evaluation when component mounts
  useEffect(() => {

  setElapsedTime(timeElapsed);
  }, 1000);

+ if (isDefault) {
+ simulateEvaluation();
+ } else {
+ startEvaluation();
+ }

  // Clean up intervals on unmount
  return () => {

  if (timerIntervalRef.current) {
  clearInterval(timerIntervalRef.current);
  }
+ if (simulationTimeoutRef.current) {
+ clearTimeout(simulationTimeoutRef.current);
+ }
  };
+ }, [isDefault]);
+
+ // Simulate the evaluation process for pre-calculated documents
+ const simulateEvaluation = () => {
+ // Complete after 20 seconds
+ simulationTimeoutRef.current = setTimeout(() => {
+ setEvaluationComplete(true);
+
+ if (startingMessageIntervalRef.current) {
+ clearInterval(startingMessageIntervalRef.current);
+ }
+
+ setStartingMessageIndex(STARTING_MESSAGES.length - 1); // Set to last message
+ }, SIMULATION_DURATION);
+ };

  // Format elapsed time as HH:MM:SS
  const formatElapsedTime = () => {
frontend/src/components/BenchmarkGenerator.jsx CHANGED
@@ -6,6 +6,9 @@ import LogDisplay from "./LogDisplay";
  import { useNavigate, useSearchParams } from "react-router-dom";
  import API_CONFIG from "../config/api";

  // Define all benchmark steps in sequence
  const BENCHMARK_STEPS = [
  "ingestion",
@@ -28,15 +31,39 @@ const STEP_LABELS = {
  lighteval: "LightEval",
  };

  /**
  * Component to handle benchmark generation and display logs
  *
  * @param {Object} props - Component props
  * @param {string} props.sessionId - The session ID for the uploaded file
  * @param {Function} props.onComplete - Function to call when generation is complete
  * @returns {JSX.Element} Benchmark generator component
  */
- const BenchmarkGenerator = ({ sessionId, onComplete }) => {
  const [generating, setGenerating] = useState(false);
  const [generationComplete, setGenerationComplete] = useState(false);
  const [generationLogs, setGenerationLogs] = useState([]);
@@ -55,6 +82,9 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
  // Reference for starting time
  const startTimeRef = useRef(null);

  // Start generation on component mount
  useEffect(() => {
  // Set start time
@@ -68,7 +98,11 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
  setElapsedTime(timeElapsed);
  }, 1000);

- generateBenchmark();

  // Clean up the polling interval and timer when the component unmounts
  return () => {
@@ -78,8 +112,56 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
  if (timerIntervalRef.current) {
  clearInterval(timerIntervalRef.current);
  }
  };
- }, []);

  // Determine the current phase and completed steps based on logs
  useEffect(() => {
@@ -116,6 +198,9 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
  setActiveStep(newActiveStep);
  }

  // Check the latest logs to determine the current phase
  const recentLogs = generationLogs.slice(-10); // Check more logs

@@ -157,7 +242,14 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
  ) {
  setCurrentPhase("configuring");
  }
- }, [generationLogs, completedSteps, activeStep, sessionId, onComplete]);

  const generateBenchmark = async () => {
  if (!sessionId) {

  import { useNavigate, useSearchParams } from "react-router-dom";
  import API_CONFIG from "../config/api";

+ // Temps de simulation en millisecondes pour les documents précalculés
+ const SIMULATION_DURATION = 20000; // 20 secondes
+
  // Define all benchmark steps in sequence
  const BENCHMARK_STEPS = [
  "ingestion",

  lighteval: "LightEval",
  };

+ // Simulated log messages for pre-calculated documents
+ const SIMULATED_LOGS = [
+ "[INFO] Initializing benchmark generation...",
+ "[INFO] Generating base configuration file...",
+ "[SUCCESS] Stage completed: ingestion",
+ "[INFO] Processing document content for upload...",
+ "[SUCCESS] Stage completed: upload_ingest_to_hub",
+ "[INFO] Generating document summary...",
+ "[SUCCESS] Stage completed: summarization",
+ "[INFO] Chunking content for better analysis...",
+ "[SUCCESS] Stage completed: chunking",
+ "[INFO] Generating single-shot questions...",
+ "[SUCCESS] Stage completed: single_shot_question_generation",
+ "[INFO] Creating multi-hop questions from content...",
+ "[SUCCESS] Stage completed: multi_hop_question_generation",
+ "[INFO] Running LightEval for benchmark validation...",
+ "[SUCCESS] Stage completed: lighteval",
+ "[SUCCESS] Ingestion process completed successfully",
+ ];
+
  /**
  * Component to handle benchmark generation and display logs
  *
  * @param {Object} props - Component props
  * @param {string} props.sessionId - The session ID for the uploaded file
+ * @param {boolean} props.isDefaultDocument - Whether this is a pre-calculated document
  * @param {Function} props.onComplete - Function to call when generation is complete
  * @returns {JSX.Element} Benchmark generator component
  */
+ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
+ const [searchParams] = useSearchParams();
+ const isDefault =
+ searchParams.get("isDefault") === "true" || isDefaultDocument;
  const [generating, setGenerating] = useState(false);
  const [generationComplete, setGenerationComplete] = useState(false);
  const [generationLogs, setGenerationLogs] = useState([]);

  // Reference for starting time
  const startTimeRef = useRef(null);

+ // Simulation interval reference
+ const simulationIntervalRef = useRef(null);
+
  // Start generation on component mount
  useEffect(() => {
  // Set start time

  setElapsedTime(timeElapsed);
  }, 1000);

+ if (isDefault) {
+ simulateGeneration();
+ } else {
+ generateBenchmark();
+ }

  // Clean up the polling interval and timer when the component unmounts
  return () => {

  if (timerIntervalRef.current) {
  clearInterval(timerIntervalRef.current);
  }
+ if (simulationIntervalRef.current) {
+ clearInterval(simulationIntervalRef.current);
+ }
  };
+ }, [isDefault]);
+
+ // Simulate the benchmark generation for pre-calculated documents
+ const simulateGeneration = () => {
+ setGenerating(true);
+ setGenerationLogs([]);
+ setError(null);
+ setCurrentPhase("initializing");
+ setCompletedSteps([]);
+ setActiveStep(0);
+
+ // Timing variables for simulation
+ const totalSteps = SIMULATED_LOGS.length;
+ const totalDuration = SIMULATION_DURATION; // 20 seconds
+ const intervalPerStep = totalDuration / totalSteps;
+ let currentStep = 0;
+
+ // Function to add next log message
+ const addNextLog = () => {
+ if (currentStep < SIMULATED_LOGS.length) {
+ const newLogs = [...generationLogs, SIMULATED_LOGS[currentStep]];
+ setGenerationLogs(newLogs);
+ currentStep++;
+
+ // Check if completed
+ if (currentStep >= SIMULATED_LOGS.length) {
+ // Simulation complete
+ setTimeout(() => {
+ setCurrentPhase("complete");
+ setGenerationComplete(true);
+ clearInterval(simulationIntervalRef.current);
+ if (onComplete) {
+ onComplete({
+ success: true,
+ sessionId,
+ logs: newLogs,
+ });
+ }
+ }, 1000);
+ }
+ }
+ };
+
+ // Start simulation
+ simulationIntervalRef.current = setInterval(addNextLog, intervalPerStep);
+ };

  // Determine the current phase and completed steps based on logs
  useEffect(() => {

  setActiveStep(newActiveStep);
  }

+ // Skip the rest of the log processing if we're simulating
+ if (isDefault) return;
+
  // Check the latest logs to determine the current phase
  const recentLogs = generationLogs.slice(-10); // Check more logs

  ) {
  setCurrentPhase("configuring");
  }
+ }, [
+ generationLogs,
+ completedSteps,
+ activeStep,
+ sessionId,
+ onComplete,
+ isDefault,
+ ]);

  const generateBenchmark = async () => {
  if (!sessionId) {
frontend/src/components/EvaluationDisplay.jsx CHANGED
@@ -14,9 +14,77 @@ import {
  Card,
  CardContent,
  Link,
  } from "@mui/material";
  import OpenInNewIcon from "@mui/icons-material/OpenInNew";
  import CheckCircleIcon from "@mui/icons-material/CheckCircle";

  const EvaluationDisplay = ({ sessionId }) => {
  const [results, setResults] = useState(null);
  const [loading, setLoading] = useState(true);
@@ -60,7 +128,23 @@ const EvaluationDisplay = ({ sessionId }) => {

  // Format accuracy as percentage
  const formatAccuracy = (value) => {
- return `${(value * 100).toFixed(2)}%`;
  };

  // Format evaluation time
@@ -125,14 +209,35 @@ const EvaluationDisplay = ({ sessionId }) => {
  boxShadow: "0 2px 4px rgba(0,0,0,0.05)",
  }}
  >
- <Table sx={{ minWidth: 650 }}>
  <TableHead>
- <TableRow>
- <TableCell>Rank</TableCell>
  <TableCell>Model</TableCell>
- <TableCell align="center">Accuracy</TableCell>
- <TableCell align="center">Eval Time</TableCell>
- <TableCell align="center">Status</TableCell>
  </TableRow>
  </TableHead>
  <TableBody>
@@ -142,35 +247,88 @@ const EvaluationDisplay = ({ sessionId }) => {
  <TableRow
  key={`${model.model_name}-${model.provider}`}
  sx={{
- "&:last-child td, &:last-child th": { border: 0 },
  }}
  >
- <TableCell>{index + 1}</TableCell>
  <TableCell component="th" scope="row">
- <Link
- href={`https://huggingface.co/${model.model_name}`}
- target="_blank"
- rel="noopener noreferrer"
  sx={{
- textDecoration: "none",
- "&:hover": {
- textDecoration: "underline",
- },
  display: "flex",
  alignItems: "center",
  }}
  >
- {model.model_name}
- <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
- </Link>
- </TableCell>
- <TableCell align="center">
- {formatAccuracy(model.accuracy)}
  </TableCell>
- <TableCell align="center">
  {formatTime(model.evaluation_time)}
  </TableCell>
- <TableCell align="center">
  <span style={{ color: "green" }}>✓ Success</span>
  </TableCell>
  </TableRow>

  Card,
  CardContent,
  Link,
+ Tooltip,
  } from "@mui/material";
  import OpenInNewIcon from "@mui/icons-material/OpenInNew";
  import CheckCircleIcon from "@mui/icons-material/CheckCircle";
+
+ // Styles pour les médailles
+ const MEDAL_STYLES = {
+ 1: {
+ color: "#B58A1B",
+ background: "linear-gradient(135deg, #FFF7E0 0%, #FFD700 100%)",
+ borderColor: "rgba(212, 160, 23, 0.35)",
+ shadowColor: "rgba(212, 160, 23, 0.8)",
+ },
+ 2: {
+ color: "#667380",
+ background: "linear-gradient(135deg, #FFFFFF 0%, #D8E3ED 100%)",
+ borderColor: "rgba(124, 139, 153, 0.35)",
+ shadowColor: "rgba(124, 139, 153, 0.8)",
+ },
+ 3: {
+ color: "#B85C2F",
+ background: "linear-gradient(135deg, #FDF0E9 0%, #FFBC8C 100%)",
+ borderColor: "rgba(204, 108, 61, 0.35)",
+ shadowColor: "rgba(204, 108, 61, 0.8)",
+ },
+ default: {
+ color: "text.primary",
+ background: "transparent",
+ borderColor: "transparent",
+ shadowColor: "transparent",
+ },
+ };
+
+ // Fonction pour obtenir le style de médaille en fonction du rang
+ const getMedalStyle = (rank) => {
+ if (rank <= 3) {
+ const medalStyle = MEDAL_STYLES[rank];
+ return {
+ color: medalStyle.color,
+ fontWeight: 900,
+ fontFamily: '"Inter", -apple-system, sans-serif',
+ width: "24px",
+ height: "24px",
+ background: medalStyle.background,
+ border: "1px solid",
+ borderColor: medalStyle.borderColor,
+ borderRadius: "50%",
+ display: "flex",
+ alignItems: "center",
+ justifyContent: "center",
+ fontSize: "0.95rem",
+ lineHeight: 1,
+ padding: 0,
+ boxShadow: `1px 1px 0 ${medalStyle.shadowColor}`,
+ marginRight: "8px",
+ };
+ }
+ // Pour les rangs > 3, même dimensions mais transparent
+ return {
+ color: "text.primary",
+ fontWeight: rank <= 10 ? 600 : 400,
+ width: "24px",
+ height: "24px",
+ display: "flex",
+ alignItems: "center",
+ justifyContent: "center",
+ fontSize: "0.95rem",
+ marginRight: "8px",
+ };
+ };
+
  const EvaluationDisplay = ({ sessionId }) => {
  const [results, setResults] = useState(null);
  const [loading, setLoading] = useState(true);

  // Format accuracy as percentage
  const formatAccuracy = (value) => {
+ return `${(value * 100).toFixed(2)}\u2009%`;
+ };
+
+ // Fonction pour obtenir une couleur en fonction du score (rouge au vert)
+ const getColorForScore = (score) => {
+ // Convertir en pourcentage (0-100)
+ const percent = score * 100;
+
+ // Calcul de la couleur: rouge (0%) à vert (100%)
+ // Rouge diminue, vert augmente
+ const red = Math.max(
+ 0,
+ Math.min(255, Math.round(255 * (1 - percent / 100)))
+ );
+ const green = Math.max(0, Math.min(255, Math.round(255 * (percent / 100))));
+
+ return `rgb(${red}, ${green}, 0)`;
  };

  // Format evaluation time

  boxShadow: "0 2px 4px rgba(0,0,0,0.05)",
  }}
  >
+ <Table
+ sx={{
+ minWidth: 650,
+ "& .MuiTableCell-root": {
+ borderRight: "1px solid rgba(224, 224, 224, 1)",
+ borderBottom: "1px solid rgba(224, 224, 224, 1)",
+ "&:last-child": {
+ borderRight: "none",
+ },
+ },
+ "& .MuiTableRow-root:last-child .MuiTableCell-root": {
+ borderBottom: "1px solid rgba(224, 224, 224, 1)",
+ },
+ }}
+ >
  <TableHead>
+ <TableRow
+ sx={{
+ "& .MuiTableCell-root": {
+ fontWeight: "bold",
+ backgroundColor: "rgba(0, 0, 0, 0.02)",
+ },
+ }}
+ >
+ <TableCell width="80px">Rank</TableCell>
  <TableCell>Model</TableCell>
+ <TableCell align="left">Accuracy</TableCell>
+ <TableCell align="left">Eval Time</TableCell>
+ <TableCell align="right">Status</TableCell>
  </TableRow>
  </TableHead>
  <TableBody>

  <TableRow
  key={`${model.model_name}-${model.provider}`}
  sx={{
+ "&:nth-of-type(even)": {
+ backgroundColor: "rgba(0, 0, 0, 0.02)",
+ },
  }}
  >
+ <TableCell>
+ <Box sx={{ display: "flex", alignItems: "center" }}>
+ <Box sx={getMedalStyle(index + 1)}>{index + 1}</Box>
+ </Box>
+ </TableCell>
  <TableCell component="th" scope="row">
+ <Tooltip title={model.model_name} placement="top">
+ <Link
+ href={`https://huggingface.co/${model.model_name}`}
+ target="_blank"
+ rel="noopener noreferrer"
+ sx={{
+ textDecoration: "none",
+ "&:hover": {
+ textDecoration: "underline",
+ },
+ display: "flex",
+ alignItems: "center",
+ }}
+ >
+ {model.model_name.length > 20
+ ? `${model.model_name.substring(0, 20)}...`
+ : model.model_name}
+ <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
+ </Link>
+ </Tooltip>
+ </TableCell>
+ <TableCell
+ align="left"
+ sx={{
+ padding: 0,
+ position: "relative",
+ overflow: "hidden",
+ }}
+ >
+ <Box
  sx={{
+ position: "absolute",
+ width: "100%",
+ height: "100%",
+ left: 0,
+ top: 0,
  display: "flex",
  alignItems: "center",
+ justifyContent: "flex-start",
+ pl: 2,
  }}
  >
+ <Box
+ sx={{
+ position: "absolute",
+ left: 0,
+ top: 0,
+ height: "100%",
+ width: `${model.accuracy * 100}%`,
+ backgroundColor: getColorForScore(model.accuracy),
+ opacity: 0.2,
+ zIndex: 0,
+ }}
+ />
+ <Typography
+ sx={{
+ position: "relative",
+ zIndex: 1,
+ fontWeight: model.accuracy > 0.7 ? "bold" : "normal",
+ py: 1.5,
+ textAlign: "left",
+ }}
+ >
+ {formatAccuracy(model.accuracy)}
+ </Typography>
+ </Box>
  </TableCell>
+ <TableCell align="left">
  {formatTime(model.evaluation_time)}
  </TableCell>
+ <TableCell align="right">
  <span style={{ color: "green" }}>✓ Success</span>
  </TableCell>
  </TableRow>
frontend/src/components/Intro.jsx CHANGED
@@ -1,21 +1,28 @@
  import React from "react";
- import { Box } from "@mui/material";
  import HFLogo from "./Logo/HFLogo";

  const Intro = () => (
- <Box sx={{ textAlign: "center", mb: 8 }}>
  <Box
  sx={{ height: "60px", mb: 4, display: "flex", justifyContent: "center" }}
  >
  <HFLogo />
  </Box>
- <h1>Yourbench Demo</h1>
- <p>
  YourBench is an <b>open-source framework</b> for generating{" "}
  <b>domain-specific benchmarks</b> in a <b>zero-shot</b> manner. It aims to
  keep your large language models on their toes—even as new data sources,
  domains, and knowledge demands evolve.
- </p>
  </Box>
  );

  import React from "react";
+ import { Box, Typography } from "@mui/material";
  import HFLogo from "./Logo/HFLogo";

  const Intro = () => (
+ <Box sx={{ textAlign: "center", mb: 4 }}>
  <Box
  sx={{ height: "60px", mb: 4, display: "flex", justifyContent: "center" }}
  >
  <HFLogo />
  </Box>
+ <Typography
+ variant="h4"
+ component="h1"
+ gutterBottom
+ sx={{ fontWeight: 800 }}
+ >
+ Yourbench Demo
+ </Typography>
+ <Typography variant="body1" sx={{ maxWidth: "800px", mx: "auto" }}>
  YourBench is an <b>open-source framework</b> for generating{" "}
  <b>domain-specific benchmarks</b> in a <b>zero-shot</b> manner. It aims to
  keep your large language models on their toes—even as new data sources,
  domains, and knowledge demands evolve.
+ </Typography>
  </Box>
  );
frontend/src/pages/BenchmarkDisplayPage.jsx CHANGED
@@ -81,7 +81,16 @@ function BenchmarkDisplayPage() {

  const handleStartEvaluation = () => {
  console.log("Starting evaluation with session ID:", sessionId);
- navigate(`/benchmark-evaluation?session=${sessionId}`);
  };

  const defaultSampleQuestions = [

  const handleStartEvaluation = () => {
  console.log("Starting evaluation with session ID:", sessionId);
+ const isDefault = [
+ "the-bitter-lesson",
+ "hurricane-faq",
+ "pokemon-guide",
+ ].includes(sessionId);
+ navigate(
+ `/benchmark-evaluation?session=${sessionId}&isDefault=${
+ isDefault ? "true" : "false"
+ }`
+ );
  };

  const defaultSampleQuestions = [
frontend/src/pages/BenchmarkEvaluationPage.jsx CHANGED
@@ -8,6 +8,10 @@ function BenchmarkEvaluationPage() {
  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const sessionId = searchParams.get("session");
  const [isValidSession, setIsValidSession] = useState(true);
  const [isLoading, setIsLoading] = useState(true);

@@ -20,6 +24,12 @@ function BenchmarkEvaluationPage() {
  return;
  }

  const checkSession = async () => {
  try {
  const response = await fetch(
@@ -41,10 +51,11 @@ function BenchmarkEvaluationPage() {
  };

  checkSession();
- }, [sessionId]);

  const handleEvaluationComplete = (result) => {
  console.log("Évaluation terminée:", result);
  };

  if (!isValidSession) {
@@ -69,6 +80,7 @@ function BenchmarkEvaluationPage() {
  ) : (
  <BenchmarkEvaluation
  sessionId={sessionId}
  onComplete={handleEvaluationComplete}
  />
  )}

  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const sessionId = searchParams.get("session");
+ const isDefaultFromUrl = searchParams.get("isDefault") === "true";
+ const isDefault =
+ isDefaultFromUrl ||
+ ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"].includes(sessionId);
  const [isValidSession, setIsValidSession] = useState(true);
  const [isLoading, setIsLoading] = useState(true);

  return;
  }

+ // Si c'est un document précalculé, on le considère comme valide directement
+ if (isDefault) {
+ setIsLoading(false);
+ return;
+ }
+
  const checkSession = async () => {
  try {
  const response = await fetch(

  };

  checkSession();
+ }, [sessionId, isDefault]);

  const handleEvaluationComplete = (result) => {
  console.log("Évaluation terminée:", result);
+ // La redirection est gérée par le composant BenchmarkEvaluation
  };

  if (!isValidSession) {

  ) : (
  <BenchmarkEvaluation
  sessionId={sessionId}
+ isDefaultDocument={isDefault}
  onComplete={handleEvaluationComplete}
  />
  )}
frontend/src/pages/BenchmarkGenerationPage.jsx CHANGED
@@ -8,6 +8,7 @@ function BenchmarkGenerationPage() {
  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const sessionId = searchParams.get("session");
  const [isValidSession, setIsValidSession] = useState(true);

  useEffect(() => {
@@ -32,6 +33,7 @@ function BenchmarkGenerationPage() {
  <Intro />
  <BenchmarkGenerator
  sessionId={sessionId}
  onComplete={handleGenerationComplete}
  />
  </>

  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const sessionId = searchParams.get("session");
+ const isDefault = searchParams.get("isDefault") === "true";
  const [isValidSession, setIsValidSession] = useState(true);

  useEffect(() => {

  <Intro />
  <BenchmarkGenerator
  sessionId={sessionId}
+ isDefaultDocument={isDefault}
  onComplete={handleGenerationComplete}
  />
  </>
frontend/src/pages/HomePage.jsx CHANGED
@@ -7,8 +7,12 @@ import BenchmarkCreateForm from "../components/BenchmarkCreateForm";
  function HomePage() {
  const navigate = useNavigate();

- const handleStartGeneration = (sid) => {
- navigate(`/benchmark-generation?session=${sid}`);
  };

  return (

  function HomePage() {
  const navigate = useNavigate();

+ const handleStartGeneration = (sid, isDefaultDocument) => {
+ navigate(
+ `/benchmark-generation?session=${sid}&isDefault=${
+ isDefaultDocument ? "true" : "false"
+ }`
+ );
  };

  return (