Spaces: Running on CPU Upgrade
add prerendered documents | update filename | refactor
Files changed:
- .gitignore +2 -0
- backend/main.py +2 -2
- backend/old-pyproject.toml +0 -26
- backend/poetry.lock +0 -0
- backend/requirements.txt +0 -1
- backend/routes/benchmark.py +2 -2
- backend/routes/evaluation.py +1 -1
- backend/routes/questions.py +0 -4
- backend/routes/upload.py +23 -0
- backend/tasks/{createBench.py → create_bench.py} +0 -0
- backend/tasks/{createBenchConfigFile.py → create_bench_config_file.py} +1 -1
- backend/tasks/{evaluationTask.py → evaluation_task.py} +0 -0
- backend/test_import.py +0 -5
- backend/tests/test_evaluation.py +0 -165
- backend/tests/test_hf_upload.py +0 -78
- backend/tests/test_inference.py +0 -84
- backend/tests/test_lighteval.py +0 -151
- backend/tests/test_openai.py +0 -31
- backend/tests/test_parallel_lighteval.py +0 -278
- backend/tests/test_provider_parallel_support.py +0 -227
- backend/tests/test_yourbench_results.py +0 -394
- backend/yourbench_simple_demo.egg-info/SOURCES.txt +0 -8
- frontend/src/components/BenchmarkCreateForm.jsx +199 -151
- frontend/src/components/BenchmarkEvaluation.jsx +56 -16
- frontend/src/components/BenchmarkGenerator.jsx +96 -4
- frontend/src/components/EvaluationDisplay.jsx +183 -25
- frontend/src/components/Intro.jsx +12 -5
- frontend/src/pages/BenchmarkDisplayPage.jsx +10 -1
- frontend/src/pages/BenchmarkEvaluationPage.jsx +13 -1
- frontend/src/pages/BenchmarkGenerationPage.jsx +2 -0
- frontend/src/pages/HomePage.jsx +6 -2
.gitignore
CHANGED
@@ -3,6 +3,8 @@
 __pycache__
 .cache/
 
+*.egg-info
+
 # dependencies
 
 frontend/node_modules
backend/main.py
CHANGED
@@ -1,8 +1,8 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 import os
 from dotenv import load_dotenv
-from routes import routers, session_files, active_bench_tasks
+from routes import routers, session_files, active_bench_tasks, benchmark
 
 # Load environment variables from .env file
 load_dotenv()
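For reference, `routers` is the collection this file already imports from the routes package, and the diff only extends that import (adding `benchmark` plus the upload-related FastAPI symbols). A minimal, illustrative sketch of how such a collection of routers is typically mounted on a FastAPI app; the registration code itself is not shown in this diff, so treat the details below as an assumption rather than the Space's actual code:

from fastapi import FastAPI
from routes import routers  # assumed to be an iterable of APIRouter instances

app = FastAPI()

# Mount every router collected by the routes package on the application.
for router in routers:
    app.include_router(router)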
backend/old-pyproject.toml
DELETED
@@ -1,26 +0,0 @@
-[tool.poetry]
-name = "llm-leaderboard-backend"
-version = "0.1.0"
-description = "Backend for the Open LLM Leaderboard"
-authors = ["Your Name <[email protected]>"]
-
-[tool.poetry.dependencies]
-python = ">=3.12,<3.13"
-fastapi = "^0.115.6"
-huggingface-hub = "0.29.3"
-python-dotenv = "^1.0.1"
-python-multipart = "^0.0.9"
-uvicorn = {extras = ["standard"], version = "^0.27.0"}
-loguru = "^0.7.3"
-lighteval = {version = ">=0.8.0", extras = ["math"]}
-tqdm = "^4.67.1"
-asyncio = "^3.4.3"
-datasets = "^3.3.0"
-yourbench = {git = "https://github.com/huggingface/yourbench.git"}
-tiktoken = "^0.9.0"
-requests = {extras = ["socks"], version = "^2.32.3"}
-httpx-socks = "^0.10.0"
-
-[build-system]
-requires = ["poetry-core>=1.0.0"]
-build-backend = "poetry.core.masonry.api"
backend/poetry.lock
DELETED
The diff for this file is too large to render.
See raw diff
backend/requirements.txt
DELETED
@@ -1 +0,0 @@
-
backend/routes/benchmark.py
CHANGED
@@ -2,8 +2,8 @@ from fastapi import APIRouter, HTTPException
 from typing import Dict, Any
 import os
 import time
-from tasks.
-from tasks.
+from tasks.create_bench_config_file import CreateBenchConfigTask
+from tasks.create_bench import CreateBenchTask
 
 router = APIRouter(tags=["benchmark"])
 
backend/routes/evaluation.py
CHANGED
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, HTTPException
 from typing import Dict, Any
 import os
-from tasks.
+from tasks.evaluation_task import EvaluationTask
 from huggingface_hub import hf_hub_download
 import json
 from datetime import datetime
backend/routes/questions.py
CHANGED
@@ -37,9 +37,7 @@ async def get_benchmark_questions(session_id: str):
     if single_dataset and len(single_dataset['train']) > 0:
         # Get a random sample (up to 2) from single-shot questions
         sample_indices = random.sample(range(len(single_dataset['train'])), min(2, len(single_dataset['train'])))
-        print(f"Dataset structure: {single_dataset['train'][0].keys()}")
         for idx in sample_indices:
-            print(f"Question {idx} data: {single_dataset['train'][idx]}")
             questions.append({
                 "id": str(idx),
                 "question": single_dataset['train'][idx].get("question", ""),
@@ -58,9 +56,7 @@ async def get_benchmark_questions(session_id: str):
         # Get remaining questions from multi-hop questions
         remaining = 2 - len(questions)
         sample_indices = random.sample(range(len(multi_dataset['train'])), min(remaining, len(multi_dataset['train'])))
-        print(f"Multi-hop dataset structure: {multi_dataset['train'][0].keys()}")
         for idx in sample_indices:
-            print(f"Multi-hop question {idx} data: {multi_dataset['train'][idx]}")
             questions.append({
                 "id": str(idx),
                 "question": multi_dataset['train'][idx].get("question", ""),
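The removed lines in each hunk were temporary debug prints of the dataset structure. If that introspection is ever needed again, a hedged alternative (not part of this commit, and reusing the route's own `single_dataset` and `idx` variables) is to emit it through the standard logging module at debug level so it stays silent in production:

import logging

logger = logging.getLogger(__name__)

# Only emitted when the application is configured for DEBUG-level logging.
logger.debug("Dataset structure: %s", single_dataset["train"][0].keys())
logger.debug("Question %s data: %s", idx, single_dataset["train"][idx])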
backend/routes/upload.py
CHANGED
@@ -12,6 +12,29 @@ session_files = {}
 UPLOAD_ROOT = "uploaded_files"
 os.makedirs(UPLOAD_ROOT, exist_ok=True)
 
+# Initialize session files dictionary with pre-calculated documents
+precalculated_docs = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"]
+
+for doc_id in precalculated_docs:
+    doc_dir = os.path.join(UPLOAD_ROOT, doc_id)
+    if os.path.exists(doc_dir):
+        doc_files_dir = os.path.join(doc_dir, "uploaded_files")
+        if os.path.exists(doc_files_dir):
+            for filename in os.listdir(doc_files_dir):
+                if filename.endswith((".pdf", ".txt", ".html", ".md")):
+                    file_path = os.path.join(doc_files_dir, filename)
+                    session_files[doc_id] = file_path
+                    print(f"Added pre-calculated document to session_files: {doc_id} -> {file_path}")
+                    break
+        else:
+            # Search directly in the doc_dir
+            for filename in os.listdir(doc_dir):
+                if filename.endswith((".pdf", ".txt", ".html", ".md")):
+                    file_path = os.path.join(doc_dir, filename)
+                    session_files[doc_id] = file_path
+                    print(f"Added pre-calculated document to session_files: {doc_id} -> {file_path}")
+                    break
+
 @router.post("/upload")
 async def upload_file(file: UploadFile = File(...)):
     """
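The block added above runs once at import time and maps each pre-calculated document id to the first matching file it finds, preferring a nested "uploaded_files" folder and falling back to the document folder itself. As a minimal sketch under the same directory-layout assumption (the helper name below is illustrative, not part of the commit), the lookup can also be written as a small function, which keeps the import-time side effect short and makes the scan easy to unit-test:

import os

DOC_EXTENSIONS = (".pdf", ".txt", ".html", ".md")

def find_precalculated_file(upload_root: str, doc_id: str) -> str | None:
    """Return the first matching document file for doc_id, or None if nothing is found."""
    doc_dir = os.path.join(upload_root, doc_id)
    if not os.path.isdir(doc_dir):
        return None
    # Prefer the nested "uploaded_files" folder, then fall back to the document folder itself.
    nested = os.path.join(doc_dir, "uploaded_files")
    search_dir = nested if os.path.isdir(nested) else doc_dir
    for filename in sorted(os.listdir(search_dir)):
        if filename.endswith(DOC_EXTENSIONS):
            return os.path.join(search_dir, filename)
    return None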
backend/tasks/{createBench.py → create_bench.py}
RENAMED
File without changes
backend/tasks/{createBenchConfigFile.py → create_bench_config_file.py}
RENAMED
@@ -114,7 +114,7 @@ class CreateBenchConfigTask:
 "provider": "novita",
 "api_key": "$HF_TOKEN",
 "max_concurrent_requests": 32,
-}
+}
 ],
 
 "model_roles": {
backend/tasks/{evaluationTask.py → evaluation_task.py}
RENAMED
File without changes
backend/test_import.py
DELETED
@@ -1,5 +0,0 @@
-try:
-    import lighteval_task
-    print("lighteval_task importé avec succès!")
-except ImportError as e:
-    print(f"Erreur: {e}")
backend/tests/test_evaluation.py
DELETED
@@ -1,165 +0,0 @@
Removed a 165-line standalone script ("Script to test the evaluation task in standalone mode"). It loaded the environment with python-dotenv, required HF_TOKEN (defaulting HF_ORGANIZATION to "yourbench"), and imported EvaluationTask from the old tasks.evaluationTask module. Its run_standalone_evaluation() generated a session UUID, ran EvaluationTask(session_uid, dataset_name) with optional custom models, streamed the task's logs until completion or a timeout, then read models_comparison.json (plus detailed_results.json and raw result files) from the task's output directory and printed a ranking of successfully evaluated models by accuracy alongside any failed models. The CLI took a positional dataset_name, repeatable --model "name/model,provider" options, and --timeout (default 3600 seconds).
backend/tests/test_hf_upload.py
DELETED
@@ -1,78 +0,0 @@
Removed a 78-line script (French-language comments and messages) that verified dataset uploads to the Hugging Face Hub: it loaded HF_TOKEN and HF_ORGANIZATION (default "yourbench") from .env, logged in via huggingface_hub, built a three-row test Dataset with Dataset.from_dict, deleted any existing {org}/test_dataset_upload dataset repository, pushed the dataset privately with push_to_hub, and printed the resulting dataset URL on success or the full traceback on failure.
backend/tests/test_inference.py
DELETED
@@ -1,84 +0,0 @@
Removed an 84-line script (French-language comments and messages) that smoke-tested chat completions for a fixed list of model/provider pairs (Qwen/Qwen2.5-72B-Instruct, meta-llama/Llama-3.3-70B-Instruct, and deepseek-ai/DeepSeek-R1-Distill-Llama-70B on sambanova; Qwen/QwQ-32B on novita) through huggingface_hub.InferenceClient. Each model was asked "What is the capital of France?" under a 10-second SIGALRM timeout, and the script printed per-model success, latency, and the response or error, followed by a summary.
backend/tests/test_lighteval.py
DELETED
@@ -1,151 +0,0 @@
Removed a 151-line script (French-language comments and messages) that exercised lighteval directly against the yourbench task. It wrote a temporary custom-task file wrapping tasks.yourbench_lighteval_task.create_yourbench_task("yourbench/yourbench_a", "lighteval"), then ran lighteval endpoint inference-providers with model=Qwen/Qwen2.5-72B-Instruct,provider=novita, the "custom|yourbench|0|0" task, --custom-tasks pointing at the temporary file, --max-samples 5, --save-details, and --no-push-to-hub via subprocess. Afterwards it printed the command's stdout/stderr, loaded the most recent results JSON from the output directory, reported the per-task metrics, checked the details directory, and deleted the temporary file.
backend/tests/test_openai.py
DELETED
@@ -1,31 +0,0 @@
-import os
-from openai import OpenAI
-from dotenv import load_dotenv
-
-# Load environment variables
-load_dotenv()
-
-def test_openai_connection():
-    try:
-        # Initialize OpenAI client
-        client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
-
-        # Make a simple request
-        response = client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[
-                {"role": "user", "content": "Say 'Hello World'"}
-            ]
-        )
-
-        print("✅ OpenAI API connection successful!")
-        print(f"Response: {response.choices[0].message.content}")
-        return True
-
-    except Exception as e:
-        print("❌ OpenAI API connection failed!")
-        print(f"Error: {str(e)}")
-        return False
-
-if __name__ == "__main__":
-    test_openai_connection()
backend/tests/test_parallel_lighteval.py
DELETED
@@ -1,278 +0,0 @@
Removed a 278-line script that ran lighteval evaluations for several models in parallel with asyncio. For each (model, provider) pair in INIT_MODELS (Qwen/Qwen2.5-72B-Instruct, meta-llama/Llama-3.3-70B-Instruct, deepseek-ai/DeepSeek-R1-Distill-Llama-70B, and Qwen/QwQ-32B, all on novita) it generated a temporary custom-task file around create_yourbench_task("yourbench/yourbench_a", "lighteval"), launched lighteval endpoint inference-providers with --max-samples 5 as an async subprocess with the HF tokens in its environment, and collected the per-model results with asyncio.gather. It then wrote parallel_test_detailed_results.json and models_comparison.json (models sorted by accuracy, with accuracy_stderr and evaluation time) and printed a per-model success summary and the total runtime.
backend/tests/test_provider_parallel_support.py
DELETED
@@ -1,227 +0,0 @@
Removed a 227-line script (French-language comments and messages) that checked whether an inference provider actually handles parallel requests. It sent five fixed prompts to Qwen/QwQ-32B on novita through curl calls to the Hugging Face Inference API (Authorization: Bearer HF_TOKEN, max_new_tokens 20), once concurrently (all requests released together via an asyncio.Event) and once sequentially, timed both runs, computed a parallelism factor (sequential duration / parallel duration) and an improvement percentage, printed a verdict ranging from excellent to weak/no parallelism, and saved the detailed results to parallel_test_{provider}_{timestamp}.json.
backend/tests/test_yourbench_results.py
DELETED
@@ -1,394 +0,0 @@
Removed a 394-line script (French-language comments and messages) for checking Yourbench results and datasets on the Hugging Face Hub. The portion rendered in this view installs its dependencies on demand (python-dotenv, huggingface_hub, loguru, pandas, pyarrow), configures loguru logging to stderr and to yourbench_tests.log (10 MB rotation, one-week retention), exposes a CLI with --dataset, --org (defaulting to HF_ORGANIZATION or "yourbench"), and --verbose, and defines a YourbenchTester class: test_dataset_exists() verifies that a dataset exists on the Hub, and analyze_dataset_content() lists the repository's files, counts JSON and Parquet files, downloads sample Parquet shards with hf_hub_download, inspects their columns with pandas for question fields and document types, and accumulates the statistics into a summary dictionary. The remainder of the file is not rendered in this view.
|
110 |
-
"fichiers_json": 0,
|
111 |
-
"fichiers_parquet": 0,
|
112 |
-
"a_questions": False,
|
113 |
-
"nb_questions": 0,
|
114 |
-
"structure_parquet": {},
|
115 |
-
"types_documents": set()
|
116 |
-
}
|
117 |
-
|
118 |
-
try:
|
119 |
-
# Lister les fichiers dans le dataset
|
120 |
-
files = self.api.list_repo_files(full_dataset_name, repo_type="dataset")
|
121 |
-
stats["fichiers"] = len(files)
|
122 |
-
|
123 |
-
if self.verbose:
|
124 |
-
logger.info(f"Fichiers trouvés dans le dataset: {len(files)}")
|
125 |
-
for file in files[:10]: # Limiter à 10 fichiers pour éviter un affichage trop verbeux
|
126 |
-
logger.info(f" - {file}")
|
127 |
-
if len(files) > 10:
|
128 |
-
logger.info(f" ... et {len(files) - 10} fichiers supplémentaires")
|
129 |
-
|
130 |
-
# Vérifier la présence de fichiers questions
|
131 |
-
question_files = [f for f in files if "question" in f.lower() and f.endswith(".json")]
|
132 |
-
stats["fichiers_json"] = len([f for f in files if f.endswith(".json")])
|
133 |
-
|
134 |
-
# Vérifier les fichiers Parquet qui sont utilisés par Yourbench
|
135 |
-
parquet_files = [f for f in files if f.endswith(".parquet")]
|
136 |
-
stats["fichiers_parquet"] = len(parquet_files)
|
137 |
-
|
138 |
-
if parquet_files:
|
139 |
-
logger.info(f"Fichiers Parquet trouvés: {len(parquet_files)}")
|
140 |
-
|
141 |
-
# Analyser un échantillon de fichiers Parquet
|
142 |
-
for parquet_file in parquet_files[:3]: # Limiter à 3 fichiers pour l'analyse
|
143 |
-
category = parquet_file.split('/')[0] if '/' in parquet_file else "unknown"
|
144 |
-
|
145 |
-
logger.info(f"Analyse du fichier Parquet: {parquet_file} (catégorie: {category})")
|
146 |
-
|
147 |
-
try:
|
148 |
-
# Télécharger le fichier Parquet
|
149 |
-
temp_file = self.api.hf_hub_download(
|
150 |
-
repo_id=full_dataset_name,
|
151 |
-
filename=parquet_file,
|
152 |
-
repo_type="dataset"
|
153 |
-
)
|
154 |
-
|
155 |
-
# Lire le fichier Parquet avec pandas
|
156 |
-
df = pd.read_parquet(temp_file)
|
157 |
-
|
158 |
-
# Ajouter des statistiques
|
159 |
-
stats["structure_parquet"][category] = {
|
160 |
-
"colonnes": list(df.columns),
|
161 |
-
"nb_lignes": len(df),
|
162 |
-
"exemple": df.iloc[0].to_dict() if len(df) > 0 else {}
|
163 |
-
}
|
164 |
-
|
165 |
-
# Vérifier si ce fichier contient des questions
|
166 |
-
if any(col for col in df.columns if "question" in col.lower()):
|
167 |
-
stats["a_questions"] = True
|
168 |
-
question_col = next(col for col in df.columns if "question" in col.lower())
|
169 |
-
stats["nb_questions"] = len(df)
|
170 |
-
|
171 |
-
# Récupérer un exemple de question
|
172 |
-
if len(df) > 0 and question_col in df.columns:
|
173 |
-
logger.info(f"Exemple de question: {df[question_col].iloc[0][:100]}...")
|
174 |
-
|
175 |
-
# Identifier les types de documents si disponible
|
176 |
-
if "doc_type" in df.columns and len(df) > 0:
|
177 |
-
doc_types = df["doc_type"].unique()
|
178 |
-
stats["types_documents"].update(doc_types)
|
179 |
-
|
180 |
-
except Exception as e:
|
181 |
-
logger.warning(f"Erreur lors de l'analyse du fichier {parquet_file}: {str(e)}")
|
182 |
-
|
183 |
-
# Convertir le set en liste pour la sérialisation JSON
|
184 |
-
stats["types_documents"] = list(stats["types_documents"])
|
185 |
-
|
186 |
-
if question_files:
|
187 |
-
stats["a_questions"] = True
|
188 |
-
|
189 |
-
# Analyser un fichier de questions pour comprendre sa structure
|
190 |
-
sample_file = question_files[0]
|
191 |
-
content = self.api.hf_hub_download(
|
192 |
-
repo_id=full_dataset_name,
|
193 |
-
filename=sample_file,
|
194 |
-
repo_type="dataset"
|
195 |
-
)
|
196 |
-
|
197 |
-
with open(content, 'r') as f:
|
198 |
-
data = json.load(f)
|
199 |
-
|
200 |
-
if isinstance(data, list):
|
201 |
-
stats["nb_questions"] = len(data)
|
202 |
-
elif isinstance(data, dict) and "questions" in data:
|
203 |
-
stats["nb_questions"] = len(data["questions"])
|
204 |
-
|
205 |
-
logger.success(f"Fichiers de questions trouvés: {len(question_files)}")
|
206 |
-
logger.info(f"Exemple de fichier analysé: {sample_file}")
|
207 |
-
logger.info(f"Nombre de questions trouvées: {stats['nb_questions']}")
|
208 |
-
|
209 |
-
return True, stats
|
210 |
-
|
211 |
-
except Exception as e:
|
212 |
-
logger.error(f"Erreur lors de l'analyse du dataset {full_dataset_name}: {str(e)}")
|
213 |
-
return False, stats
|
214 |
-
|
215 |
-
def check_evaluation_results(self, dataset_name: str) -> bool:
|
216 |
-
"""Vérifie s'il existe des résultats d'évaluation pour ce dataset.
|
217 |
-
|
218 |
-
Args:
|
219 |
-
dataset_name: Nom du dataset à vérifier
|
220 |
-
|
221 |
-
Returns:
|
222 |
-
True si des résultats d'évaluation existent, False sinon
|
223 |
-
"""
|
224 |
-
logger.info(f"Recherche de résultats d'évaluation pour le dataset: {dataset_name}")
|
225 |
-
|
226 |
-
try:
|
227 |
-
# Lister tous les datasets de l'organisation
|
228 |
-
datasets = self.api.list_datasets(author=self.organization)
|
229 |
-
|
230 |
-
# Chercher les datasets d'évaluation
|
231 |
-
eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]
|
232 |
-
|
233 |
-
if self.verbose:
|
234 |
-
logger.info(f"Datasets d'évaluation trouvés: {len(eval_datasets)}")
|
235 |
-
for ds in eval_datasets[:5]:
|
236 |
-
logger.info(f" - {ds.id}")
|
237 |
-
|
238 |
-
# Vérifier si le dataset spécifié est mentionné dans les évaluations
|
239 |
-
for eval_ds in eval_datasets:
|
240 |
-
try:
|
241 |
-
# Télécharger le README pour voir si le dataset est mentionné
|
242 |
-
readme_path = self.api.hf_hub_download(
|
243 |
-
repo_id=eval_ds.id,
|
244 |
-
filename="README.md",
|
245 |
-
repo_type="dataset"
|
246 |
-
)
|
247 |
-
|
248 |
-
with open(readme_path, 'r') as f:
|
249 |
-
readme_content = f.read()
|
250 |
-
|
251 |
-
if dataset_name in readme_content:
|
252 |
-
logger.success(f"Résultats d'évaluation trouvés dans: {eval_ds.id}")
|
253 |
-
return True
|
254 |
-
except:
|
255 |
-
continue
|
256 |
-
|
257 |
-
logger.warning(f"Aucun résultat d'évaluation trouvé pour le dataset: {dataset_name}")
|
258 |
-
return False
|
259 |
-
|
260 |
-
except Exception as e:
|
261 |
-
logger.error(f"Erreur lors de la recherche de résultats d'évaluation: {str(e)}")
|
262 |
-
return False
|
263 |
-
|
264 |
-
def check_model_performances(self, dataset_name: str) -> Dict[str, float]:
|
265 |
-
"""Vérifie les performances des modèles sur le dataset spécifié.
|
266 |
-
|
267 |
-
Args:
|
268 |
-
dataset_name: Nom du dataset à vérifier
|
269 |
-
|
270 |
-
Returns:
|
271 |
-
Dictionnaire des performances des modèles (model_name -> score)
|
272 |
-
"""
|
273 |
-
logger.info(f"Vérification des performances des modèles sur le dataset: {dataset_name}")
|
274 |
-
performances = {}
|
275 |
-
|
276 |
-
try:
|
277 |
-
# Cette partie est spéculative car nous ne connaissons pas la structure exacte
|
278 |
-
# des résultats. Une approche possible serait de chercher des fichiers JSON
|
279 |
-
# contenant des métriques dans les datasets d'évaluation.
|
280 |
-
|
281 |
-
# Chercher les datasets d'évaluation
|
282 |
-
datasets = self.api.list_datasets(author=self.organization)
|
283 |
-
eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]
|
284 |
-
|
285 |
-
for eval_ds in eval_datasets:
|
286 |
-
try:
|
287 |
-
files = self.api.list_repo_files(eval_ds.id, repo_type="dataset")
|
288 |
-
result_files = [f for f in files if "result" in f.lower() and f.endswith(".json")]
|
289 |
-
|
290 |
-
for result_file in result_files:
|
291 |
-
file_path = self.api.hf_hub_download(
|
292 |
-
repo_id=eval_ds.id,
|
293 |
-
filename=result_file,
|
294 |
-
repo_type="dataset"
|
295 |
-
)
|
296 |
-
|
297 |
-
with open(file_path, 'r') as f:
|
298 |
-
results = json.load(f)
|
299 |
-
|
300 |
-
# Analyse basique des résultats (à adapter selon la structure réelle)
|
301 |
-
if "model_name" in results and "metrics" in results:
|
302 |
-
model_name = results["model_name"]
|
303 |
-
metrics = results["metrics"]
|
304 |
-
|
305 |
-
# Prendre la première métrique trouvée comme score
|
306 |
-
if metrics and isinstance(metrics, dict):
|
307 |
-
first_metric = list(metrics.keys())[0]
|
308 |
-
performances[model_name] = metrics[first_metric]
|
309 |
-
except:
|
310 |
-
continue
|
311 |
-
|
312 |
-
if performances:
|
313 |
-
logger.success(f"Performances trouvées pour {len(performances)} modèles")
|
314 |
-
for model, score in performances.items():
|
315 |
-
logger.info(f" - {model}: {score}")
|
316 |
-
else:
|
317 |
-
logger.warning("Aucune performance de modèle trouvée")
|
318 |
-
|
319 |
-
return performances
|
320 |
-
|
321 |
-
except Exception as e:
|
322 |
-
logger.error(f"Erreur lors de la vérification des performances: {str(e)}")
|
323 |
-
return {}
|
324 |
-
|
325 |
-
def main():
|
326 |
-
"""Fonction principale."""
|
327 |
-
parser = configure_argument_parser()
|
328 |
-
args = parser.parse_args()
|
329 |
-
|
330 |
-
if not args.dataset:
|
331 |
-
logger.error("Veuillez spécifier un dataset avec --dataset")
|
332 |
-
parser.print_help()
|
333 |
-
return
|
334 |
-
|
335 |
-
# Créer le testeur
|
336 |
-
tester = YourbenchTester(args.org, args.verbose)
|
337 |
-
|
338 |
-
# 1. Vérifier l'existence du dataset
|
339 |
-
dataset_info = tester.test_dataset_exists(args.dataset)
|
340 |
-
|
341 |
-
if not dataset_info:
|
342 |
-
logger.error(f"Le dataset {args.org}/{args.dataset} n'existe pas ou n'est pas accessible")
|
343 |
-
return
|
344 |
-
|
345 |
-
# 2. Analyser le contenu du dataset
|
346 |
-
success, stats = tester.analyze_dataset_content(args.dataset)
|
347 |
-
|
348 |
-
if success:
|
349 |
-
logger.info("\n=== Statistiques du dataset ===")
|
350 |
-
logger.info(f"Nombre de fichiers: {stats['fichiers']}")
|
351 |
-
logger.info(f"Fichiers JSON: {stats['fichiers_json']}")
|
352 |
-
logger.info(f"Fichiers Parquet: {stats['fichiers_parquet']}")
|
353 |
-
logger.info(f"Contient des questions: {'Oui' if stats['a_questions'] else 'Non'}")
|
354 |
-
|
355 |
-
if stats['a_questions']:
|
356 |
-
logger.info(f"Nombre de questions: {stats['nb_questions']}")
|
357 |
-
|
358 |
-
if 'types_documents' in stats and stats['types_documents']:
|
359 |
-
logger.info(f"Types de documents: {', '.join(stats['types_documents'])}")
|
360 |
-
|
361 |
-
# Afficher la structure des fichiers Parquet
|
362 |
-
if 'structure_parquet' in stats and stats['structure_parquet']:
|
363 |
-
logger.info("\n=== Structure des fichiers Parquet ===")
|
364 |
-
for category, info in stats['structure_parquet'].items():
|
365 |
-
logger.info(f"\nCatégorie: {category}")
|
366 |
-
logger.info(f"Nombre de lignes: {info['nb_lignes']}")
|
367 |
-
logger.info(f"Colonnes: {', '.join(info['colonnes'])}")
|
368 |
-
|
369 |
-
if args.verbose and 'exemple' in info and info['exemple']:
|
370 |
-
logger.info("\nExemple de ligne:")
|
371 |
-
for key, value in info['exemple'].items():
|
372 |
-
# Tronquer les valeurs trop longues
|
373 |
-
if isinstance(value, str) and len(value) > 100:
|
374 |
-
value = value[:100] + "..."
|
375 |
-
logger.info(f" {key}: {value}")
|
376 |
-
|
377 |
-
# 3. Vérifier s'il existe des résultats d'évaluation
|
378 |
-
has_evaluations = tester.check_evaluation_results(args.dataset)
|
379 |
-
|
380 |
-
if has_evaluations:
|
381 |
-
# 4. Vérifier les performances des modèles
|
382 |
-
performances = tester.check_model_performances(args.dataset)
|
383 |
-
|
384 |
-
if performances:
|
385 |
-
logger.info("\n=== Classement des modèles ===")
|
386 |
-
# Trier les modèles par score (du plus élevé au plus bas)
|
387 |
-
sorted_models = sorted(performances.items(), key=lambda x: x[1], reverse=True)
|
388 |
-
for i, (model, score) in enumerate(sorted_models, 1):
|
389 |
-
logger.info(f"{i}. {model}: {score:.4f}")
|
390 |
-
|
391 |
-
logger.success("Test terminé !")
|
392 |
-
|
393 |
-
if __name__ == "__main__":
|
394 |
-
main()
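For reference, the deleted file above exposed a small argparse CLI (--dataset, --org, --verbose). A minimal sketch of the same flow, written here only as an illustration (the import no longer resolves after this commit, and "my-dataset" is a hypothetical dataset name):

# Illustrative only: mirrors main() of the removed backend/tests/test_yourbench_results.py.
# Requires HF_TOKEN in the environment; the module itself is deleted by this commit.
from tests.test_yourbench_results import YourbenchTester

tester = YourbenchTester(organization="yourbench", verbose=True)

if tester.test_dataset_exists("my-dataset"):                     # dataset reachable on the Hub?
    ok, stats = tester.analyze_dataset_content("my-dataset")     # file and question statistics
    if ok and tester.check_evaluation_results("my-dataset"):
        scores = tester.check_model_performances("my-dataset")   # model_name -> first metric found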
backend/yourbench_simple_demo.egg-info/SOURCES.txt
CHANGED
@@ -2,14 +2,6 @@ README.md
 pyproject.toml
 lighteval_task/__init__.py
 lighteval_task/lighteval_task.py
-tests/test_evaluation.py
-tests/test_hf_upload.py
-tests/test_inference.py
-tests/test_lighteval.py
-tests/test_openai.py
-tests/test_parallel_lighteval.py
-tests/test_provider_parallel_support.py
-tests/test_yourbench_results.py
 yourbench_simple_demo.egg-info/PKG-INFO
 yourbench_simple_demo.egg-info/SOURCES.txt
 yourbench_simple_demo.egg-info/dependency_links.txt
frontend/src/components/BenchmarkCreateForm.jsx
CHANGED
@@ -1,48 +1,26 @@
-import React, { useState, useRef
+import React, { useState, useRef } from "react";
 import {
   Box,
   Paper,
   Typography,
   CircularProgress,
-  Alert,
   Button,
+  Snackbar,
+  Alert,
+  Grid,
 } from "@mui/material";
-import { useLocation } from "react-router-dom";
 import CloudUploadIcon from "@mui/icons-material/CloudUpload";
 import AutoFixHighIcon from "@mui/icons-material/AutoFixHigh";
-import
+import InsertDriveFileIcon from "@mui/icons-material/InsertDriveFile";
+import DescriptionIcon from "@mui/icons-material/Description";
+import ArticleIcon from "@mui/icons-material/Article";
+import MenuBookIcon from "@mui/icons-material/MenuBook";
 import { useThemeMode } from "../hooks/useThemeMode";
 import getTheme from "../config/theme";
 import API_CONFIG from "../config/api";

 /**
- * Component
- *
- * @param {Object} props - Component props
- * @param {number} props.activeStep - Current active step (0-based index)
- * @returns {JSX.Element} Stepper component
- */
-const StepsDisplay = ({ activeStep }) => {
-  const steps = ["Login", "Upload File", "Generate"];
-
-  return (
-    <Box sx={{ width: "100%", mb: 4 }}>
-      <Stepper activeStep={activeStep} alternativeLabel>
-        {steps.map((label) => (
-          <Step key={label}>
-            <StepLabel>{label}</StepLabel>
-          </Step>
-        ))}
-      </Stepper>
-    </Box>
-  );
-};
-
-/**
- * Component for creating a new benchmark, including authentication, file upload, and generation initiation
+ * Component for creating a new benchmark, including file upload and generation initiation
  *
  * @param {Object} props - Component props
  * @param {Function} props.onStartGeneration - Callback when generation starts with sessionId
@@ -54,31 +32,36 @@ function BenchmarkCreateForm({ onStartGeneration }) {
   const [isDragging, setIsDragging] = useState(false);
   const [uploadStatus, setUploadStatus] = useState(null);
   const [isLoading, setIsLoading] = useState(false);
-  const [activeStep, setActiveStep] = useState(0);
   const [sessionId, setSessionId] = useState(null);
+  const [openSnackbar, setOpenSnackbar] = useState(false);
+  const [selectedDocument, setSelectedDocument] = useState(null);
+  const [isDefaultDocument, setIsDefaultDocument] = useState(false);
   const fileInputRef = useRef(null);
-  const location = useLocation();
-
-  // Check if we're coming back from an OAuth redirect
-  useEffect(() => {
-    // If we have code in URL parameters, it's an OAuth callback
-    const params = new URLSearchParams(window.location.search);
-    if (params.has("code")) {
-      console.log("Detected OAuth callback, cleaning URL");
-        if (storedAuth) {
-          console.log("Found auth data after redirect, refreshing UI state");
-          setActiveStep(1); // Move to next step if authenticated
-        }
-      }, 1000);
-    }
-  }, [location]);
+
+  const defaultDocuments = [
+    {
+      id: "the-bitter-lesson",
+      name: "The Bitter Lesson",
+      icon: <ArticleIcon sx={{ fontSize: 40 }} />,
+      description: "A seminal paper on AI development by Rich Sutton",
+    },
+    {
+      id: "hurricane-faq",
+      name: "Hurricane FAQ",
+      icon: <DescriptionIcon sx={{ fontSize: 40 }} />,
+      description: "Frequently asked questions about hurricanes",
+    },
+    {
+      id: "pokemon-guide",
+      name: "Pokemon Guide",
+      icon: <MenuBookIcon sx={{ fontSize: 40 }} />,
+      description: "A comprehensive guide to Pokemon",
+    },
+  ];
+
+  const handleCloseSnackbar = () => {
+    setOpenSnackbar(false);
+  };

   const handleDragOver = (e) => {
     e.preventDefault();
@@ -97,7 +80,7 @@ function BenchmarkCreateForm({ onStartGeneration }) {
     const file = e.target.files[0];
     if (!file) return;

-    //
+    // Check if it's a PDF, TXT, HTML or MD
     if (
       !file.name.endsWith(".pdf") &&
       !file.name.endsWith(".txt") &&
@@ -117,6 +100,8 @@ function BenchmarkCreateForm({ onStartGeneration }) {
   const handleFileUpload = async (file) => {
     setIsLoading(true);
     setUploadStatus(null);
+    setIsDefaultDocument(false);
+    setSelectedDocument(null);

     try {
       const formData = new FormData();
@@ -134,20 +119,22 @@ function BenchmarkCreateForm({ onStartGeneration }) {
           success: true,
           message: `File ${result.filename} uploaded successfully`,
         });
-
+        setOpenSnackbar(true);
         setSessionId(result.session_id);
-
+        setSelectedDocument({ name: file.name });
       } else {
         setUploadStatus({
           success: false,
           message: result.error || "Upload failed",
         });
+        setOpenSnackbar(true);
       }
     } catch (error) {
       setUploadStatus({
         success: false,
         message: "Server connection error",
       });
+      setOpenSnackbar(true);
     } finally {
       setIsLoading(false);
     }
@@ -163,7 +150,7 @@ function BenchmarkCreateForm({ onStartGeneration }) {
       return;
     }

-    //
+    // Check if it's a PDF, TXT, HTML or MD
     if (
       !file.name.endsWith(".pdf") &&
       !file.name.endsWith(".txt") &&
@@ -180,114 +167,175 @@ function BenchmarkCreateForm({ onStartGeneration }) {
     handleFileUpload(file);
   };

+  const handleDefaultDocClick = (doc) => {
+    setSelectedDocument(doc);
+    setSessionId(doc.id);
+    setIsDefaultDocument(true);
+  };
+
   const handleGenerateClick = () => {
     if (onStartGeneration && sessionId) {
-      onStartGeneration(sessionId);
+      onStartGeneration(sessionId, isDefaultDocument);
     }
   };

   return (
-    <
-      {
+    <Box sx={{ mt: -2 }}>
+      <Typography
+        variant="subtitle1"
+        component="div"
+        align="center"
+        sx={{ mb: 2, color: "text.secondary" }}
+      >
+        Choose a sample document
+      </Typography>
+
+      <Grid container spacing={2} sx={{ mb: 2 }}>
+        {defaultDocuments.map((doc) => (
+          <Grid item xs={12} md={4} key={doc.id}>
+            <Box
+              elevation={2}
+              sx={{
+                p: 2,
+                display: "flex",
+                flexDirection: "column",
+                borderRadius: 1.5,
+                alignItems: "center",
+                cursor: "pointer",
+                transition: "all 0.2s ease",
+                height: "100%",
+                // border: "2px solid rgba(0, 0, 0, 0.1)",
+                border:
+                  selectedDocument?.id === doc.id
+                    ? `2px solid ${theme.palette.primary.main}`
+                    : "2px solid rgba(0, 0, 0, 0.1)",
+                "&:hover": {
+                  transform: "translateY(-2px)",
+                  boxShadow: 3,
+                },
+              }}
+              onClick={() => handleDefaultDocClick(doc)}
+            >
+              <Box sx={{ color: "primary.main", mb: 1 }}>{doc.icon}</Box>
+              <Typography variant="subtitle1" component="div" gutterBottom>
+                {doc.name}
+              </Typography>
+              <Typography
+                variant="body2"
+                color="text.secondary"
+                align="center"
+                sx={{ flexGrow: 1 }}
+              >
+                {doc.description}
+              </Typography>
+            </Box>
+          </Grid>
+        ))}
+      </Grid>
+
+      <Typography
+        variant="subtitle1"
+        component="div"
+        align="center"
+        sx={{ mb: 2, color: "text.secondary" }}
+      >
+        Or upload your own ...
+      </Typography>
+
+      <Box
+        sx={{
+          p: 4,
+          mt: 2,
+          mb: 2,
+          borderRadius: 1.5,
+          border:
+            selectedDocument?.name && !isDefaultDocument
+              ? `2px solid ${theme.palette.primary.main}`
+              : isDragging
              ? `2px dashed ${theme.palette.primary.main}`
-              : "2px dashed
+              : "2px dashed rgba(0, 0, 0, 0.16)",
+          backgroundColor: isDragging ? "rgba(0, 0, 0, 0.05)" : "transparent",
+          display: "flex",
+          flexDirection: "column",
+          alignItems: "center",
-          justifyContent: "center",
-          minHeight: 200,
-        }}
+          justifyContent: "center",
+          minHeight: 180,
+          cursor: "pointer",
+          transition: "all 0.3s ease",
+        }}
+        onDragOver={handleDragOver}
+        onDragLeave={handleDragLeave}
+        onDrop={handleDrop}
+        onClick={handleClick}
       >
+        <input
+          type="file"
+          ref={fileInputRef}
+          onChange={handleFileChange}
+          accept=".pdf,.txt,.html,.md"
+          style={{ display: "none" }}
+        />
+        {selectedDocument?.name && !isDefaultDocument ? (
+          <>
+            <InsertDriveFileIcon
+              sx={{ fontSize: 50, color: "primary.main", mb: 1 }}
+            />
+            <Typography variant="h6" component="div" gutterBottom>
+              {selectedDocument.name}
+            </Typography>
+            <Typography variant="body2" color="text.secondary">
+              Click to upload a different file
+            </Typography>
+          </>
+        ) : (
+          <>
+            <CloudUploadIcon
+              sx={{ fontSize: 50, color: "primary.main", mb: 1 }}
+            />
+            <Typography variant="h6" component="div" gutterBottom>
+              Drag and drop your file here or click to browse
+            </Typography>
+            <Typography variant="body2" color="text.secondary">
+              Accepted formats: PDF, TXT, HTML, MD
+            </Typography>
+          </>
+        )}
+
+        {isLoading && (
+          <Box sx={{ mt: 2 }}>
+            <CircularProgress size={30} />
+          </Box>
+        )}
+      </Box>
+
+      <Box sx={{ display: "flex", justifyContent: "center" }}>
+        <Button
+          variant="contained"
+          color="primary"
+          onClick={handleGenerateClick}
+          startIcon={<AutoFixHighIcon />}
+          disabled={!sessionId}
+          sx={{ mt: 2 }}
+        >
+          Generate Benchmark
+        </Button>
+      </Box>
-            Ready to generate your benchmark
-          </Typography>
-          <Button
-            variant="contained"
-            color="primary"
-            onClick={handleGenerateClick}
-            sx={{ mt: 2 }}
-          >
-            Generate Benchmark
-          </Button>
-        </Paper>
-      )}
-    </>
+
+      <Snackbar
+        open={openSnackbar}
+        autoHideDuration={6000}
+        onClose={handleCloseSnackbar}
+        anchorOrigin={{ vertical: "bottom", horizontal: "right" }}
+      >
+        <Alert
+          onClose={handleCloseSnackbar}
+          severity={uploadStatus?.success ? "success" : "error"}
+          sx={{ width: "100%" }}
+        >
+          {uploadStatus?.message}
+        </Alert>
+      </Snackbar>
+    </Box>
   );
 }
frontend/src/components/BenchmarkEvaluation.jsx
CHANGED
@@ -3,15 +3,28 @@ import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
 import { useNavigate, useSearchParams } from "react-router-dom";
 import API_CONFIG from "../config/api";

+// Temps de simulation en millisecondes pour les documents précalculés
+const SIMULATION_DURATION = 20000; // 20 secondes
+
+// Intervalle de changement des messages pour les documents standards vs précalculés
+const MESSAGE_CHANGE_INTERVAL = {
+  DEFAULT: 20000, // 20 secondes pour documents standards
+  PRECALCULATED: 5000, // 5 secondes pour documents précalculés
+};
+
 // Starting messages with their timing
 const STARTING_MESSAGES = [
-  { message: "Initializing evaluation environment...", progress:
-  { message: "Starting evaluation process...", progress:
-  { message: "Evaluating models...", progress:
-  { message: "Storing evaluation results...", progress:
+  { message: "Initializing evaluation environment...", progress: 0 },
+  { message: "Starting evaluation process...", progress: 27 },
+  { message: "Evaluating models...", progress: 54 },
+  { message: "Storing evaluation results...", progress: 84 },
 ];

-const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
+const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
+  const [searchParams] = useSearchParams();
+  const isDefault =
+    isDefaultDocument ||
+    ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"].includes(sessionId);
   const [evaluationComplete, setEvaluationComplete] = useState(false);
   const [error, setError] = useState(null);
   const [elapsedTime, setElapsedTime] = useState(0);
@@ -21,6 +34,7 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
   const startTimeRef = useRef(null);
   const startingMessageIntervalRef = useRef(null);
   const pollingIntervalRef = useRef(null);
+  const simulationTimeoutRef = useRef(null);

   const navigate = useNavigate();

@@ -33,21 +47,26 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {

   // Add effect to handle starting messages
   useEffect(() => {
-    startingMessageIntervalRef.current = setInterval(
+    startingMessageIntervalRef.current = setInterval(
+      () => {
+        setStartingMessageIndex((prev) => {
+          if (prev < STARTING_MESSAGES.length - 1) {
+            return prev + 1;
+          }
+          return prev;
+        });
+      },
+      isDefault
+        ? MESSAGE_CHANGE_INTERVAL.PRECALCULATED
+        : MESSAGE_CHANGE_INTERVAL.DEFAULT
+    );

     return () => {
       if (startingMessageIntervalRef.current) {
         clearInterval(startingMessageIntervalRef.current);
       }
     };
-  }, []);
+  }, [isDefault]);

   // Start evaluation when component mounts
   useEffect(() => {
@@ -62,7 +81,11 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
       setElapsedTime(timeElapsed);
     }, 1000);

+    if (isDefault) {
+      simulateEvaluation();
+    } else {
+      startEvaluation();
+    }

     // Clean up intervals on unmount
     return () => {
@@ -72,8 +95,25 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
       if (timerIntervalRef.current) {
         clearInterval(timerIntervalRef.current);
       }
+      if (simulationTimeoutRef.current) {
+        clearTimeout(simulationTimeoutRef.current);
+      }
     };
-  }, []);
+  }, [isDefault]);
+
+  // Simulate the evaluation process for pre-calculated documents
+  const simulateEvaluation = () => {
+    // Complete after 20 seconds
+    simulationTimeoutRef.current = setTimeout(() => {
+      setEvaluationComplete(true);
+
+      if (startingMessageIntervalRef.current) {
+        clearInterval(startingMessageIntervalRef.current);
+      }
+
+      setStartingMessageIndex(STARTING_MESSAGES.length - 1); // Set to last message
+    }, SIMULATION_DURATION);
+  };

   // Format elapsed time as HH:MM:SS
   const formatElapsedTime = () => {
frontend/src/components/BenchmarkGenerator.jsx
CHANGED
@@ -6,6 +6,9 @@ import LogDisplay from "./LogDisplay";
 import { useNavigate, useSearchParams } from "react-router-dom";
 import API_CONFIG from "../config/api";

+// Temps de simulation en millisecondes pour les documents précalculés
+const SIMULATION_DURATION = 20000; // 20 secondes
+
 // Define all benchmark steps in sequence
 const BENCHMARK_STEPS = [
   "ingestion",
@@ -28,15 +31,39 @@ const STEP_LABELS = {
   lighteval: "LightEval",
 };

+// Simulated log messages for pre-calculated documents
+const SIMULATED_LOGS = [
+  "[INFO] Initializing benchmark generation...",
+  "[INFO] Generating base configuration file...",
+  "[SUCCESS] Stage completed: ingestion",
+  "[INFO] Processing document content for upload...",
+  "[SUCCESS] Stage completed: upload_ingest_to_hub",
+  "[INFO] Generating document summary...",
+  "[SUCCESS] Stage completed: summarization",
+  "[INFO] Chunking content for better analysis...",
+  "[SUCCESS] Stage completed: chunking",
+  "[INFO] Generating single-shot questions...",
+  "[SUCCESS] Stage completed: single_shot_question_generation",
+  "[INFO] Creating multi-hop questions from content...",
+  "[SUCCESS] Stage completed: multi_hop_question_generation",
+  "[INFO] Running LightEval for benchmark validation...",
+  "[SUCCESS] Stage completed: lighteval",
+  "[SUCCESS] Ingestion process completed successfully",
+];
+
 /**
  * Component to handle benchmark generation and display logs
  *
  * @param {Object} props - Component props
  * @param {string} props.sessionId - The session ID for the uploaded file
+ * @param {boolean} props.isDefaultDocument - Whether this is a pre-calculated document
  * @param {Function} props.onComplete - Function to call when generation is complete
  * @returns {JSX.Element} Benchmark generator component
  */
-const BenchmarkGenerator = ({ sessionId, onComplete }) => {
+const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
+  const [searchParams] = useSearchParams();
+  const isDefault =
+    searchParams.get("isDefault") === "true" || isDefaultDocument;
   const [generating, setGenerating] = useState(false);
   const [generationComplete, setGenerationComplete] = useState(false);
   const [generationLogs, setGenerationLogs] = useState([]);
@@ -55,6 +82,9 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
   // Reference for starting time
   const startTimeRef = useRef(null);

+  // Simulation interval reference
+  const simulationIntervalRef = useRef(null);
+
   // Start generation on component mount
   useEffect(() => {
     // Set start time
@@ -68,7 +98,11 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
       setElapsedTime(timeElapsed);
     }, 1000);

+    if (isDefault) {
+      simulateGeneration();
+    } else {
+      generateBenchmark();
+    }

     // Clean up the polling interval and timer when the component unmounts
     return () => {
@@ -78,8 +112,56 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
       if (timerIntervalRef.current) {
         clearInterval(timerIntervalRef.current);
       }
+      if (simulationIntervalRef.current) {
+        clearInterval(simulationIntervalRef.current);
+      }
     };
-  }, []);
+  }, [isDefault]);
+
+  // Simulate the benchmark generation for pre-calculated documents
+  const simulateGeneration = () => {
+    setGenerating(true);
+    setGenerationLogs([]);
+    setError(null);
+    setCurrentPhase("initializing");
+    setCompletedSteps([]);
+    setActiveStep(0);
+
+    // Timing variables for simulation
+    const totalSteps = SIMULATED_LOGS.length;
+    const totalDuration = SIMULATION_DURATION; // 20 seconds
+    const intervalPerStep = totalDuration / totalSteps;
+    let currentStep = 0;
+
+    // Function to add next log message
+    const addNextLog = () => {
+      if (currentStep < SIMULATED_LOGS.length) {
+        const newLogs = [...generationLogs, SIMULATED_LOGS[currentStep]];
+        setGenerationLogs(newLogs);
+        currentStep++;
+
+        // Check if completed
+        if (currentStep >= SIMULATED_LOGS.length) {
+          // Simulation complete
+          setTimeout(() => {
+            setCurrentPhase("complete");
+            setGenerationComplete(true);
+            clearInterval(simulationIntervalRef.current);
+            if (onComplete) {
+              onComplete({
+                success: true,
+                sessionId,
+                logs: newLogs,
+              });
+            }
+          }, 1000);
+        }
+      }
+    };
+
+    // Start simulation
+    simulationIntervalRef.current = setInterval(addNextLog, intervalPerStep);
+  };

   // Determine the current phase and completed steps based on logs
   useEffect(() => {
@@ -116,6 +198,9 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
       setActiveStep(newActiveStep);
     }

+    // Skip the rest of the log processing if we're simulating
+    if (isDefault) return;
+
     // Check the latest logs to determine the current phase
     const recentLogs = generationLogs.slice(-10); // Check more logs

@@ -157,7 +242,14 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
     ) {
       setCurrentPhase("configuring");
     }
-  }, [
+  }, [
+    generationLogs,
+    completedSteps,
+    activeStep,
+    sessionId,
+    onComplete,
+    isDefault,
+  ]);

   const generateBenchmark = async () => {
     if (!sessionId) {
|
frontend/src/components/EvaluationDisplay.jsx
CHANGED
@@ -14,9 +14,77 @@ import {
   Card,
   CardContent,
   Link,
+  Tooltip,
 } from "@mui/material";
 import OpenInNewIcon from "@mui/icons-material/OpenInNew";
 import CheckCircleIcon from "@mui/icons-material/CheckCircle";
+
+// Styles pour les médailles
+const MEDAL_STYLES = {
+  1: {
+    color: "#B58A1B",
+    background: "linear-gradient(135deg, #FFF7E0 0%, #FFD700 100%)",
+    borderColor: "rgba(212, 160, 23, 0.35)",
+    shadowColor: "rgba(212, 160, 23, 0.8)",
+  },
+  2: {
+    color: "#667380",
+    background: "linear-gradient(135deg, #FFFFFF 0%, #D8E3ED 100%)",
+    borderColor: "rgba(124, 139, 153, 0.35)",
+    shadowColor: "rgba(124, 139, 153, 0.8)",
+  },
+  3: {
+    color: "#B85C2F",
+    background: "linear-gradient(135deg, #FDF0E9 0%, #FFBC8C 100%)",
+    borderColor: "rgba(204, 108, 61, 0.35)",
+    shadowColor: "rgba(204, 108, 61, 0.8)",
+  },
+  default: {
+    color: "text.primary",
+    background: "transparent",
+    borderColor: "transparent",
+    shadowColor: "transparent",
+  },
+};
+
+// Fonction pour obtenir le style de médaille en fonction du rang
+const getMedalStyle = (rank) => {
+  if (rank <= 3) {
+    const medalStyle = MEDAL_STYLES[rank];
+    return {
+      color: medalStyle.color,
+      fontWeight: 900,
+      fontFamily: '"Inter", -apple-system, sans-serif',
+      width: "24px",
+      height: "24px",
+      background: medalStyle.background,
+      border: "1px solid",
+      borderColor: medalStyle.borderColor,
+      borderRadius: "50%",
+      display: "flex",
+      alignItems: "center",
+      justifyContent: "center",
+      fontSize: "0.95rem",
+      lineHeight: 1,
+      padding: 0,
+      boxShadow: `1px 1px 0 ${medalStyle.shadowColor}`,
+      marginRight: "8px",
+    };
+  }
+  // Pour les rangs > 3, même dimensions mais transparent
+  return {
+    color: "text.primary",
+    fontWeight: rank <= 10 ? 600 : 400,
+    width: "24px",
+    height: "24px",
+    display: "flex",
+    alignItems: "center",
+    justifyContent: "center",
+    fontSize: "0.95rem",
+    marginRight: "8px",
+  };
+};
+
 const EvaluationDisplay = ({ sessionId }) => {
   const [results, setResults] = useState(null);
   const [loading, setLoading] = useState(true);
@@ -60,7 +128,23 @@ const EvaluationDisplay = ({ sessionId }) => {

   // Format accuracy as percentage
   const formatAccuracy = (value) => {
-    return `${(value * 100).toFixed(2)}%`;
+    return `${(value * 100).toFixed(2)}\u2009%`;
+  };
+
+  // Fonction pour obtenir une couleur en fonction du score (rouge au vert)
+  const getColorForScore = (score) => {
+    // Convertir en pourcentage (0-100)
+    const percent = score * 100;
+
+    // Calcul de la couleur: rouge (0%) à vert (100%)
+    // Rouge diminue, vert augmente
+    const red = Math.max(
+      0,
+      Math.min(255, Math.round(255 * (1 - percent / 100)))
+    );
+    const green = Math.max(0, Math.min(255, Math.round(255 * (percent / 100))));
+
+    return `rgb(${red}, ${green}, 0)`;
   };

   // Format evaluation time
@@ -125,14 +209,35 @@ const EvaluationDisplay = ({ sessionId }) => {
             boxShadow: "0 2px 4px rgba(0,0,0,0.05)",
           }}
         >
-          <Table
+          <Table
+            sx={{
+              minWidth: 650,
+              "& .MuiTableCell-root": {
+                borderRight: "1px solid rgba(224, 224, 224, 1)",
+                borderBottom: "1px solid rgba(224, 224, 224, 1)",
+                "&:last-child": {
+                  borderRight: "none",
+                },
+              },
+              "& .MuiTableRow-root:last-child .MuiTableCell-root": {
+                borderBottom: "1px solid rgba(224, 224, 224, 1)",
+              },
+            }}
+          >
             <TableHead>
-              <TableRow
+              <TableRow
+                sx={{
+                  "& .MuiTableCell-root": {
+                    fontWeight: "bold",
+                    backgroundColor: "rgba(0, 0, 0, 0.02)",
+                  },
+                }}
+              >
+                <TableCell width="80px">Rank</TableCell>
                 <TableCell>Model</TableCell>
-                <TableCell align="
-                <TableCell align="
-                <TableCell align="
+                <TableCell align="left">Accuracy</TableCell>
+                <TableCell align="left">Eval Time</TableCell>
+                <TableCell align="right">Status</TableCell>
               </TableRow>
             </TableHead>
             <TableBody>
@@ -142,35 +247,88 @@ const EvaluationDisplay = ({ sessionId }) => {
                 <TableRow
                   key={`${model.model_name}-${model.provider}`}
                   sx={{
-                    "&:
+                    "&:nth-of-type(even)": {
+                      backgroundColor: "rgba(0, 0, 0, 0.02)",
+                    },
                   }}
                 >
-                  <TableCell>
+                  <TableCell>
+                    <Box sx={{ display: "flex", alignItems: "center" }}>
+                      <Box sx={getMedalStyle(index + 1)}>{index + 1}</Box>
+                    </Box>
+                  </TableCell>
                   <TableCell component="th" scope="row">
-                    <
+                    <Tooltip title={model.model_name} placement="top">
+                      <Link
+                        href={`https://huggingface.co/${model.model_name}`}
+                        target="_blank"
+                        rel="noopener noreferrer"
+                        sx={{
+                          textDecoration: "none",
+                          "&:hover": {
+                            textDecoration: "underline",
+                          },
+                          display: "flex",
+                          alignItems: "center",
+                        }}
+                      >
+                        {model.model_name.length > 20
+                          ? `${model.model_name.substring(0, 20)}...`
+                          : model.model_name}
+                        <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
+                      </Link>
+                    </Tooltip>
+                  </TableCell>
+                  <TableCell
+                    align="left"
+                    sx={{
+                      padding: 0,
+                      position: "relative",
+                      overflow: "hidden",
+                    }}
+                  >
+                    <Box
                       sx={{
-                        "
+                        position: "absolute",
+                        width: "100%",
+                        height: "100%",
+                        left: 0,
+                        top: 0,
                         display: "flex",
                         alignItems: "center",
+                        justifyContent: "flex-start",
+                        pl: 2,
                       }}
                     >
+                      <Box
+                        sx={{
+                          position: "absolute",
+                          left: 0,
+                          top: 0,
+                          height: "100%",
+                          width: `${model.accuracy * 100}%`,
+                          backgroundColor: getColorForScore(model.accuracy),
+                          opacity: 0.2,
+                          zIndex: 0,
+                        }}
+                      />
+                      <Typography
+                        sx={{
+                          position: "relative",
+                          zIndex: 1,
+                          fontWeight: model.accuracy > 0.7 ? "bold" : "normal",
+                          py: 1.5,
+                          textAlign: "left",
+                        }}
+                      >
+                        {formatAccuracy(model.accuracy)}
+                      </Typography>
+                    </Box>
                   </TableCell>
-                  <TableCell align="
+                  <TableCell align="left">
                     {formatTime(model.evaluation_time)}
                   </TableCell>
-                  <TableCell align="
+                  <TableCell align="right">
                     <span style={{ color: "green" }}>✓ Success</span>
                   </TableCell>
                 </TableRow>
|
frontend/src/components/Intro.jsx
CHANGED
@@ -1,21 +1,28 @@
 import React from "react";
-import { Box } from "@mui/material";
+import { Box, Typography } from "@mui/material";
 import HFLogo from "./Logo/HFLogo";

 const Intro = () => (
-  <Box sx={{ textAlign: "center", mb:
+  <Box sx={{ textAlign: "center", mb: 4 }}>
     <Box
       sx={{ height: "60px", mb: 4, display: "flex", justifyContent: "center" }}
     >
       <HFLogo />
     </Box>
-    <
+    <Typography
+      variant="h4"
+      component="h1"
+      gutterBottom
+      sx={{ fontWeight: 800 }}
+    >
+      Yourbench Demo
+    </Typography>
+    <Typography variant="body1" sx={{ maxWidth: "800px", mx: "auto" }}>
       YourBench is an <b>open-source framework</b> for generating{" "}
       <b>domain-specific benchmarks</b> in a <b>zero-shot</b> manner. It aims to
       keep your large language models on their toes—even as new data sources,
       domains, and knowledge demands evolve.
-    </
+    </Typography>
   </Box>
 );
|
frontend/src/pages/BenchmarkDisplayPage.jsx
CHANGED
@@ -81,7 +81,16 @@ function BenchmarkDisplayPage() {

   const handleStartEvaluation = () => {
     console.log("Starting evaluation with session ID:", sessionId);
+    const isDefault = [
+      "the-bitter-lesson",
+      "hurricane-faq",
+      "pokemon-guide",
+    ].includes(sessionId);
+    navigate(
+      `/benchmark-evaluation?session=${sessionId}&isDefault=${
+        isDefault ? "true" : "false"
+      }`
+    );
   };

   const defaultSampleQuestions = [
|
frontend/src/pages/BenchmarkEvaluationPage.jsx
CHANGED
@@ -8,6 +8,10 @@ function BenchmarkEvaluationPage() {
   const navigate = useNavigate();
   const [searchParams] = useSearchParams();
   const sessionId = searchParams.get("session");
+  const isDefaultFromUrl = searchParams.get("isDefault") === "true";
+  const isDefault =
+    isDefaultFromUrl ||
+    ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"].includes(sessionId);
   const [isValidSession, setIsValidSession] = useState(true);
   const [isLoading, setIsLoading] = useState(true);

@@ -20,6 +24,12 @@ function BenchmarkEvaluationPage() {
       return;
     }

+    // Si c'est un document précalculé, on le considère comme valide directement
+    if (isDefault) {
+      setIsLoading(false);
+      return;
+    }
+
     const checkSession = async () => {
       try {
         const response = await fetch(
@@ -41,10 +51,11 @@ function BenchmarkEvaluationPage() {
     };

     checkSession();
-  }, [sessionId]);
+  }, [sessionId, isDefault]);

   const handleEvaluationComplete = (result) => {
     console.log("Évaluation terminée:", result);
+    // La redirection est gérée par le composant BenchmarkEvaluation
   };

   if (!isValidSession) {
@@ -69,6 +80,7 @@ function BenchmarkEvaluationPage() {
       ) : (
         <BenchmarkEvaluation
           sessionId={sessionId}
+          isDefaultDocument={isDefault}
           onComplete={handleEvaluationComplete}
         />
       )}
|
frontend/src/pages/BenchmarkGenerationPage.jsx
CHANGED
@@ -8,6 +8,7 @@ function BenchmarkGenerationPage() {
   const navigate = useNavigate();
   const [searchParams] = useSearchParams();
   const sessionId = searchParams.get("session");
+  const isDefault = searchParams.get("isDefault") === "true";
   const [isValidSession, setIsValidSession] = useState(true);

   useEffect(() => {
@@ -32,6 +33,7 @@ function BenchmarkGenerationPage() {
       <Intro />
       <BenchmarkGenerator
         sessionId={sessionId}
+        isDefaultDocument={isDefault}
         onComplete={handleGenerationComplete}
       />
     </>
|
frontend/src/pages/HomePage.jsx
CHANGED
@@ -7,8 +7,12 @@ import BenchmarkCreateForm from "../components/BenchmarkCreateForm";
 function HomePage() {
   const navigate = useNavigate();

-  const handleStartGeneration = (sid) => {
-    navigate(
+  const handleStartGeneration = (sid, isDefaultDocument) => {
+    navigate(
+      `/benchmark-generation?session=${sid}&isDefault=${
+        isDefaultDocument ? "true" : "false"
+      }`
+    );
   };

   return (