tfrere committed
Commit 4fb52f5 · 1 Parent(s): 0e34dc4

add model provider switching to eval

backend/benchmark_results.json DELETED
@@ -1,139 +0,0 @@
-{
-  "timestamp": "2025-04-01T10:30:15.307581",
-  "models": {
-    "Qwen/Qwen2.5-72B-Instruct": [
-      {
-        "provider": "sambanova",
-        "total_time": 21.616381883621216,
-        "success_rate": 1.0,
-        "average_time": 4.323276376724243
-      },
-      {
-        "provider": "together",
-        "total_time": 21.84441828727722,
-        "success_rate": 1.0,
-        "average_time": 4.368883657455444
-      },
-      {
-        "provider": "nebius",
-        "total_time": 22.003292322158813,
-        "success_rate": 1.0,
-        "average_time": 4.400658464431762
-      },
-      {
-        "provider": "fireworks-ai",
-        "total_time": 22.086440563201904,
-        "success_rate": 1.0,
-        "average_time": 4.417288112640381
-      },
-      {
-        "provider": "novita",
-        "total_time": 22.16641402244568,
-        "success_rate": 1.0,
-        "average_time": 4.433282804489136
-      },
-      {
-        "provider": "hf-inference",
-        "total_time": 22.41838788986206,
-        "success_rate": 1.0,
-        "average_time": 4.483677577972412
-      },
-      {
-        "provider": "hyperbolic",
-        "total_time": 23.555410146713257,
-        "success_rate": 1.0,
-        "average_time": 4.711082029342651
-      }
-    ],
-    "meta-llama/Llama-3.3-70B-Instruct": [
-      {
-        "provider": "novita",
-        "total_time": 28.36034393310547,
-        "success_rate": 1.0,
-        "average_time": 5.672068786621094
-      },
-      {
-        "provider": "fireworks-ai",
-        "total_time": 31.595482110977173,
-        "success_rate": 1.0,
-        "average_time": 6.319096422195434
-      },
-      {
-        "provider": "sambanova",
-        "total_time": 31.845455646514893,
-        "success_rate": 1.0,
-        "average_time": 6.369091129302978
-      },
-      {
-        "provider": "nebius",
-        "total_time": 31.963874578475952,
-        "success_rate": 1.0,
-        "average_time": 6.39277491569519
-      },
-      {
-        "provider": "hyperbolic",
-        "total_time": 35.02063775062561,
-        "success_rate": 1.0,
-        "average_time": 7.004127550125122
-      },
-      {
-        "provider": "together",
-        "total_time": 36.88544726371765,
-        "success_rate": 1.0,
-        "average_time": 7.3770894527435305
-      },
-      {
-        "provider": "hf-inference",
-        "total_time": 37.26896572113037,
-        "success_rate": 1.0,
-        "average_time": 7.453793144226074
-      },
-      {
-        "provider": "cerebras",
-        "total_time": 37.70701003074646,
-        "success_rate": 1.0,
-        "average_time": 7.541402006149292
-      }
-    ],
-    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": null,
-    "Qwen/QwQ-32B": [
-      {
-        "provider": "sambanova",
-        "total_time": 25.050092935562134,
-        "success_rate": 1.0,
-        "average_time": 5.010018587112427
-      },
-      {
-        "provider": "novita",
-        "total_time": 25.061633110046387,
-        "success_rate": 1.0,
-        "average_time": 5.012326622009278
-      },
-      {
-        "provider": "hyperbolic",
-        "total_time": 25.363604307174683,
-        "success_rate": 1.0,
-        "average_time": 5.072720861434936
-      },
-      {
-        "provider": "nebius",
-        "total_time": 25.37495517730713,
-        "success_rate": 1.0,
-        "average_time": 5.074991035461426
-      },
-      {
-        "provider": "hf-inference",
-        "total_time": 25.41055965423584,
-        "success_rate": 1.0,
-        "average_time": 5.082111930847168
-      },
-      {
-        "provider": "fireworks-ai",
-        "total_time": 25.595581769943237,
-        "success_rate": 1.0,
-        "average_time": 5.119116353988647
-      }
-    ],
-    "mistralai/Mistral-Small-24B-Instruct-2501": null
-  }
-}
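
Results with this shape are easy to post-process. A minimal sketch, assuming a local copy of a file with the structure shown above, that picks the fastest provider per model (models with no usable benchmark are stored as null):

    import json

    with open("benchmark_results.json") as f:
        data = json.load(f)

    for model, runs in data["models"].items():
        if not runs:  # models with no usable benchmark are stored as null
            print(f"{model}: no provider results")
            continue
        best = min(runs, key=lambda r: r["average_time"])
        print(f"{model}: fastest provider is {best['provider']} "
              f"({best['average_time']:.2f}s avg, {best['success_rate']:.0%} success)")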
backend/results.json DELETED
The diff for this file is too large to render. See raw diff
 
backend/routes/cleanup.py CHANGED
@@ -2,28 +2,38 @@ from fastapi import APIRouter, HTTPException
 import os
 import shutil
 from .upload import session_files
+import logging
 
 router = APIRouter(tags=["cleanup"])
 
-# Dossier racine pour les uploads
+# Root directory for uploads
 UPLOAD_ROOT = "uploaded_files"
 
-# Liste des documents de base qui ne doivent pas être supprimés
+# List of base documents that should not be deleted
 BASE_DOCUMENTS = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"]
 
 @router.delete("/cleanup-session/{session_id}")
 async def cleanup_session(session_id: str):
     """
-    Supprime le dossier de session après que l'utilisateur a visualisé les résultats d'évaluation.
-    Ne supprime pas les documents de base.
+    Removes the session directory after the user has viewed the evaluation results.
+    Does not remove base documents.
+    In development mode, does nothing and returns a log message.
 
     Args:
-        session_id: ID de la session à supprimer
+        session_id: ID of the session to delete
 
     Returns:
-        Dictionary avec statut et message
+        Dictionary with status and message
     """
-    # Vérifier si le session_id existe et n'est pas un document de base
+    # Check if we are in development mode
+    if os.environ.get("ENVIRONEMENT", "").lower() == "development":
+        logging.info(f"[DEV MODE] Cleanup called for session: {session_id} - No action taken in development mode")
+        return {
+            "success": True,
+            "message": f"Development mode - cleanup skipped for session: {session_id}"
+        }
+
+    # Check if the session_id exists and is not a base document
     if session_id in BASE_DOCUMENTS:
         return {
             "success": False,
@@ -32,7 +42,7 @@ async def cleanup_session(session_id: str):
 
     session_dir = os.path.join(UPLOAD_ROOT, session_id)
 
-    # Vérifier si le dossier existe
+    # Check if the directory exists
     if not os.path.exists(session_dir):
         return {
             "success": False,
@@ -40,11 +50,11 @@ async def cleanup_session(session_id: str):
     }
 
     try:
-        # Supprimer la référence du fichier de session
+        # Remove the session file reference
        if session_id in session_files:
            del session_files[session_id]
 
-        # Supprimer le dossier de session
+        # Remove the session directory
        shutil.rmtree(session_dir)
 
        return {
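
The new guard reads the ENVIRONEMENT variable (spelled exactly as in the code), so the server process must set that exact name for the skip to trigger. A minimal client-side sketch, assuming a hypothetical local base URL and session id, of how the route behaves:

    import requests

    # Sketch only: the base URL and session id below are hypothetical.
    BASE_URL = "http://localhost:8000"
    SESSION_ID = "my-session-id"

    # If the *server* process was started with ENVIRONEMENT=development (note the
    # spelling used in cleanup.py), deletion is skipped and a success message is
    # returned; otherwise uploaded_files/<session_id> is removed, and base
    # documents are always refused.
    resp = requests.delete(f"{BASE_URL}/cleanup-session/{SESSION_ID}")
    print(resp.status_code, resp.json())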
backend/tasks/evaluation_task.py CHANGED
@@ -13,11 +13,12 @@ import json
 import shutil
 from typing import List, Dict
 from tasks.get_model_providers import get_model_providers
+from tasks.get_available_model_provider import get_available_model_provider
 from huggingface_hub import HfApi
 import asyncio
 
 # Valeur par défaut du timeout
-DEFAULT_EVALUATION_TIMEOUT = 60.0 # 1 minute par défaut
+DEFAULT_EVALUATION_TIMEOUT = 70.0 # 1 minute par défaut
 
 class EvaluationTask:
     """
@@ -40,11 +41,36 @@ class EvaluationTask:
         self.results = []
         self.hf_api = HfApi()
         self.timeout = timeout if timeout is not None else DEFAULT_EVALUATION_TIMEOUT
+        self.current_step = "initializing"
+        self.completed_steps = []
 
         # Nettoyer les anciens résultats si demandé
         if clean_old_results:
             self.clean_old_results()
 
+    def update_step(self, step: str) -> None:
+        """
+        Update the current step and completed steps
+
+        Args:
+            step: Name of the step to update
+        """
+        self.current_step = step
+        if step not in self.completed_steps:
+            self.completed_steps.append(step)
+
+    def get_progress(self) -> Dict:
+        """
+        Get the current progress of the task
+
+        Returns:
+            Dictionary containing current step and completed steps
+        """
+        return {
+            "current_step": self.current_step,
+            "completed_steps": self.completed_steps
+        }
+
     def clean_old_results(self) -> None:
         """
         Clean old evaluation results to avoid confusion
@@ -239,57 +265,73 @@ TASKS_TABLE = [yourbench]
         models = [
             "Qwen/QwQ-32B",
             "Qwen/Qwen2.5-72B-Instruct",
-            "deepseek-ai/DeepSeek-V3-0324",
+            "meta-llama/Llama-3.3-70B-Instruct",
             "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+            "mistralai/Mistral-Small-24B-Instruct-2501",
         ]
 
-        # Get providers for each model
-        model_providers = get_model_providers(models)
+        # Step 1: Check available providers for each model
+        self.update_step("finding_available_model_providers")
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Checking available providers for models...")
+
+        model_providers = {}
+        for model in models:
+            provider = get_available_model_provider(model, verbose=True)
+            if provider:
+                model_providers[model] = provider
+            else:
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] No available provider found for {model}")
 
-        print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")
+        if not model_providers:
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] No models with available providers found")
+            return
+
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Found providers for {len(model_providers)} models")
+
+        # Step 2: Run evaluations in parallel
+        self.update_step("starting_evaluation_process")
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation process...")
+
+        # Step 3: Evaluate models
+        self.update_step("evaluating_models")
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluating models...")
 
-        # Run evaluations in parallel using asyncio
         tasks = []
-        for model_name, providers in model_providers:
-            if providers:  # Only run if providers are available
-                tasks.append(self._run_lighteval(model_name, providers[0]))
+        for model, provider in model_providers.items():
+            tasks.append(self._run_lighteval(model, provider))
 
-        self.results = await asyncio.gather(*tasks)
-
-        # Calculate total script execution time
-        total_time = time.time() - script_start_time
-        print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")
+        # Run all evaluations in parallel
+        results = await asyncio.gather(*tasks)
 
-        # Cleanup intermediate results if they exist
-        if os.path.exists("data/lighteval_results"):
-            print(f"[{datetime.now().strftime('%H:%M:%S')}] Cleaning up intermediate results")
-            try:
-                # Recursively delete intermediate results
-                import shutil
-                shutil.rmtree("data/lighteval_results", ignore_errors=True)
-            except Exception as e:
-                print(f"[{datetime.now().strftime('%H:%M:%S')}] Warning: Failed to clean up intermediate results: {str(e)}")
+        # Filter out failed evaluations
+        self.results = [r for r in results if r["status"] == "success"]
 
-        # Save final results to Hub (only once)
+        # Step 4: Save results
+        self.update_step("storing_evaluation_results")
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Storing evaluation results...")
        self._save_results_to_hub()
 
-        # Mark the task as completed
+        # Mark task as completed
        self.is_completed = True
+        self.update_step("completed")
+
+        total_time = time.time() - script_start_time
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation completed in {total_time:.2f}s")
 
     def get_logs(self) -> List[str]:
         """
-        Get logs for this task (empty list since we don't track logs anymore)
+        Get the logs of the task
 
         Returns:
-            Empty list of logs
+            List of log messages
         """
-        return []
+        return self.logs if hasattr(self, "logs") else []
 
     def is_task_completed(self) -> bool:
         """
         Check if the task is completed
 
         Returns:
-            True if completed, False otherwise
+            True if the task is completed, False otherwise
         """
         return self.is_completed
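
The heart of the change is that each model is now paired with the first provider reported as available before any lighteval run starts, and failed runs are filtered out after asyncio.gather. A self-contained sketch of that pattern; the two helpers below are stand-ins (their real counterparts are tasks.get_available_model_provider and EvaluationTask._run_lighteval, whose bodies are not shown in this diff):

    import asyncio
    from typing import Dict, List, Optional

    def get_available_model_provider(model: str, verbose: bool = False) -> Optional[str]:
        # Stand-in: pretend every model is served by "novita". The real helper probes
        # inference providers and returns the first one that responds for this model.
        if verbose:
            print(f"Checking providers for {model}...")
        return "novita"

    async def run_lighteval(model: str, provider: str) -> Dict:
        # Stand-in for EvaluationTask._run_lighteval, which launches the real benchmark.
        await asyncio.sleep(0)
        return {"model": model, "provider": provider, "status": "success"}

    async def evaluate(models: List[str]) -> List[Dict]:
        # Pair each model with the first provider that is currently available,
        # skipping models that nobody serves right now.
        model_providers = {}
        for model in models:
            provider = get_available_model_provider(model, verbose=True)
            if provider:
                model_providers[model] = provider

        # Run all evaluations concurrently and keep only the successful ones.
        results = await asyncio.gather(
            *(run_lighteval(m, p) for m, p in model_providers.items())
        )
        return [r for r in results if r["status"] == "success"]

    if __name__ == "__main__":
        print(asyncio.run(evaluate(["Qwen/QwQ-32B", "Qwen/Qwen2.5-72B-Instruct"])))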
frontend/src/components/BenchmarkEvaluation.jsx CHANGED
@@ -4,20 +4,21 @@ import { useNavigate, useSearchParams } from "react-router-dom";
 import API_CONFIG from "../config/api";
 
 // Temps de simulation en millisecondes pour les documents précalculés
-const SIMULATION_DURATION = 60000; // 20 secondes
+const SIMULATION_DURATION = 70000; // 20 secondes
 
 // Intervalle de changement des messages pour les documents standards vs précalculés
 const MESSAGE_CHANGE_INTERVAL = {
-  DEFAULT: 20000, // 20 secondes pour documents standards
-  PRECALCULATED: 5000, // 5 secondes pour documents précalculés
+  DEFAULT: 25000, // 20 secondes pour documents standards
+  PRECALCULATED: 25000, // 5 secondes pour documents précalculés
 };
 
 // Starting messages with their timing
 const STARTING_MESSAGES = [
-  { message: "Initializing evaluation environment...", step: 1, totalSteps: 4 },
-  { message: "Starting evaluation process...", step: 2, totalSteps: 4 },
-  { message: "Evaluating models...", step: 3, totalSteps: 4 },
-  { message: "Storing evaluation results...", step: 4, totalSteps: 4 },
+  { message: "Initializing evaluation environment...", step: 1, totalSteps: 5 },
+  { message: "Finding available model providers...", step: 2, totalSteps: 5 },
+  { message: "Starting evaluation process...", step: 3, totalSteps: 5 },
+  { message: "Evaluating models...", step: 4, totalSteps: 5 },
+  { message: "Storing evaluation results...", step: 5, totalSteps: 5 },
 ];
 
 const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
@@ -29,6 +30,7 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
   const [error, setError] = useState(null);
   const [elapsedTime, setElapsedTime] = useState(0);
   const [startingMessageIndex, setStartingMessageIndex] = useState(0);
+  const [evaluationStarted, setEvaluationStarted] = useState(false);
 
   const timerIntervalRef = useRef(null);
   const startTimeRef = useRef(null);
@@ -86,7 +88,8 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
      if (
        document.visibilityState === "visible" &&
        !isDefault &&
-        !evaluationComplete
+        !evaluationComplete &&
+        evaluationStarted // Vérifier si l'évaluation a déjà commencé
      ) {
        console.log("Page became visible, checking evaluation status...");
        // Force une nouvelle requête pour récupérer l'état d'évaluation
@@ -140,7 +143,10 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
    if (isDefault) {
      simulateEvaluation();
    } else {
-      startEvaluation();
+      // Démarrer l'évaluation seulement si elle n'a pas déjà été lancée
+      if (!evaluationStarted) {
+        startEvaluation();
+      }
    }
 
    // Clean up intervals on unmount
@@ -156,7 +162,7 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
      }
      document.removeEventListener("visibilitychange", handleVisibilityChange);
    };
-  }, [isDefault, sessionId, evaluationComplete]);
+  }, [isDefault, sessionId, evaluationComplete, evaluationStarted]);
 
  // Simulate the evaluation process for pre-calculated documents
  const simulateEvaluation = () => {
@@ -192,6 +198,9 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
      return;
    }
 
+    // Marquer que l'évaluation a commencé
+    setEvaluationStarted(true);
+
    try {
      // Call API to start evaluation
      const response = await fetch(
@@ -307,7 +316,7 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
            fontWeight: 500,
          }}
        >
-          Estimated time: ~1 min
+          Estimated time ~ 1min 30s
        </Typography>
      </Box>
 
 
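The five frontend STARTING_MESSAGES mirror the step names the backend now records through EvaluationTask.update_step(). The wiring between the two is not part of this diff, so the correspondence sketched below is illustrative only:

    # Sketch only: illustrative mapping between backend step names (from
    # evaluation_task.py above) and the frontend messages shown to the user.
    STEP_TO_MESSAGE = {
        "initializing": "Initializing evaluation environment...",
        "finding_available_model_providers": "Finding available model providers...",
        "starting_evaluation_process": "Starting evaluation process...",
        "evaluating_models": "Evaluating models...",
        "storing_evaluation_results": "Storing evaluation results...",
    }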
frontend/src/components/BenchmarkGenerator.jsx CHANGED
@@ -18,9 +18,6 @@ const BENCHMARK_STEPS = [
   "summarization",
   "chunking",
   "single_shot_question_generation",
-  "evaluation_provider_check",
-  "evaluation",
-  "evaluation_saving_results",
 ];
 
 // Step labels for display (more user-friendly names)