add get available model provider to benchmark generation
backend/results.json  ADDED
(The diff for this file is too large to render.)
backend/routes/benchmark.py  CHANGED

@@ -162,40 +162,53 @@ class UnifiedBenchmarkTask:
             self.config_task = CreateBenchConfigTask(session_uid=self.session_uid)
 
             # Execute the configuration task
-            config_path = self.config_task.run(file_path=file_path)
-
-            # Get configuration logs
-            config_logs = self.config_task.get_logs()
-            for log in config_logs:
-                self._add_log(log)
-
-            # Mark configuration step as completed
-            if "[SUCCESS] Stage completed: config_generation" not in self.logs:
-                self._add_log("[SUCCESS] Stage completed: configuration")
-
-            # Step 2: Benchmark
-            self._add_log("[INFO] Starting benchmark process")
-            self.bench_task = CreateBenchTask(session_uid=self.session_uid, config_path=config_path)
-
-            # Run the benchmark task
-            self.bench_task.run()
-
-            # Wait for the benchmark task to complete
-            while not self.bench_task.is_task_completed():
-                # Get new logs and add them
-                bench_logs = self.bench_task.get_logs()
-                for log in bench_logs:
-                    self._add_log(log)
-                time.sleep(1)
-
-            # Get final logs
-            final_logs = self.bench_task.get_logs()
-            for log in final_logs:
-                self._add_log(log)
-
-            # Mark as completed
-            self.is_completed = True
-            self._add_log("[SUCCESS] Benchmark process completed successfully")
+            try:
+                config_path = self.config_task.run(file_path=file_path)
+
+                # Get configuration logs
+                config_logs = self.config_task.get_logs()
+                for log in config_logs:
+                    self._add_log(log)
+
+                # Mark configuration step as completed
+                if "[SUCCESS] Stage completed: config_generation" not in self.logs:
+                    self._add_log("[SUCCESS] Stage completed: configuration")
+
+                # Step 2: Benchmark
+                self._add_log("[INFO] Starting benchmark process")
+                self.bench_task = CreateBenchTask(session_uid=self.session_uid, config_path=config_path)
+
+                # Run the benchmark task
+                self.bench_task.run()
+
+                # Wait for the benchmark task to complete
+                while not self.bench_task.is_task_completed():
+                    # Get new logs and add them
+                    bench_logs = self.bench_task.get_logs()
+                    for log in bench_logs:
+                        self._add_log(log)
+                    time.sleep(1)
+
+                # Get final logs
+                final_logs = self.bench_task.get_logs()
+                for log in final_logs:
+                    self._add_log(log)
+
+                # Mark as completed
+                self.is_completed = True
+                self._add_log("[SUCCESS] Benchmark process completed successfully")
+
+            except Exception as config_error:
+                error_msg = str(config_error)
+                # Log detailed error
+                self._add_log(f"[ERROR] Configuration failed: {error_msg}")
+
+                # Check if it's a provider error and provide a more user-friendly message
+                if "Required models not available" in error_msg:
+                    self._add_log("[ERROR] Some required models are not available at the moment. Please try again later.")
+
+                # Mark as completed with error
+                self.is_completed = True
 
         except Exception as e:
             self._add_log(f"[ERROR] Benchmark process failed: {str(e)}")
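The configuration step is now wrapped in its own try/except so that a missing provider surfaces as a readable log entry instead of a generic failure. A minimal sketch of exercising that failure path outside the route, using CreateBenchConfigTask from the file below (the session id and file path are placeholders):

from tasks.create_bench_config_file import CreateBenchConfigTask

task = CreateBenchConfigTask(session_uid="demo-session")  # placeholder session id
try:
    config_path = task.run(file_path="uploaded_files/demo/example.md")  # placeholder path
except RuntimeError as e:
    # generate_base_config raises RuntimeError when no provider is available for a
    # required model; the route above maps it onto "[ERROR] Configuration failed: ...".
    print(f"Configuration failed: {e}")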
backend/tasks/create_bench_config_file.py  CHANGED

@@ -13,6 +13,8 @@ from typing import Optional, Dict, Any, List, Tuple
 from loguru import logger
 from huggingface_hub import HfApi
 
+from tasks.get_available_model_provider import get_available_model_provider
+
 
 class CreateBenchConfigTask:
     """

@@ -76,6 +78,26 @@ class CreateBenchConfigTask:
             self._add_log(f"[ERROR] {error_msg}")
             raise RuntimeError(error_msg)
 
+    def get_model_provider(self, model_name: str) -> Optional[str]:
+        """
+        Get the available provider for a model
+
+        Args:
+            model_name: Name of the model to check
+
+        Returns:
+            Available provider or None if none found
+        """
+        self._add_log(f"[INFO] Finding available provider for {model_name}")
+        provider = get_available_model_provider(model_name, verbose=True)
+
+        if provider:
+            self._add_log(f"[INFO] Found provider for {model_name}: {provider}")
+            return provider
+        else:
+            self._add_log(f"[WARNING] No available provider found for {model_name}")
+            return None
+
     def generate_base_config(self, hf_org: str, hf_dataset_name: str) -> Dict[str, Any]:
         """
         Create the base configuration dictionary

@@ -94,6 +116,39 @@ class CreateBenchConfigTask:
         if not hf_token:
             raise RuntimeError("HF_TOKEN environment variable is not defined")
 
+        # Get providers for models
+        model_list = []
+
+        # Define required models
+        required_models = [
+            "Qwen/Qwen2.5-72B-Instruct"
+        ]
+
+        # Track found models
+        found_models = set()
+
+        for model_name in required_models:
+            provider = self.get_model_provider(model_name)
+            if provider:
+                model_list.append({
+                    "model_name": model_name,
+                    "provider": provider,
+                    "api_key": "$HF_TOKEN",
+                    "max_concurrent_requests": 32,
+                })
+                found_models.add(model_name)
+
+        # Check if all required models are available
+        if len(found_models) < len(required_models):
+            missing_models = set(required_models) - found_models
+            missing_models_str = ", ".join(missing_models)
+            error_msg = f"Required models not available: {missing_models_str}. Cannot proceed with benchmark."
+            self._add_log(f"[ERROR] {error_msg}")
+            raise RuntimeError(error_msg)
+
+        # Mark provider check stage as completed
+        self._add_log("[SUCCESS] Stage completed: provider_check")
+
         return {
             "hf_configuration": {
                 "token": "$HF_TOKEN",  # Use the token directly from the environment

@@ -102,23 +157,10 @@ class CreateBenchConfigTask:
                 "hf_dataset_name": hf_dataset_name,
                 "concat_if_exist": False,
             },
-            "model_list": [
-                {
-                    "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
-                    "provider": "novita",
-                    "api_key": "$HF_TOKEN",
-                    "max_concurrent_requests": 32,
-                },
-                {
-                    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-                    "provider": "novita",
-                    "api_key": "$HF_TOKEN",
-                    "max_concurrent_requests": 32,
-                }
-            ],
+            "model_list": model_list,
 
             "model_roles": {
-                "ingestion": ["Qwen/Qwen2.5-
+                "ingestion": ["Qwen/Qwen2.5-72B-Instruct"],
                 "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
                 "chunking": ["intfloat/multilingual-e5-large-instruct"],
                 "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],

@@ -229,6 +271,9 @@ class CreateBenchConfigTask:
 
         time.sleep(0.8)  # Simulate delay
 
+        # Log the start of finding providers
+        self._add_log("[INFO] Finding available providers for models...")
+
         # Generate and save the configuration
         config = self.generate_base_config(org_name, dataset_name)
 
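For reference, when a provider is found for every required model, the loop above appends one entry per model, so the generated configuration ends up with a model_list of roughly this shape (the provider value depends on what get_available_model_provider returns at run time; "novita" is only an illustration):

model_list = [
    {
        "model_name": "Qwen/Qwen2.5-72B-Instruct",
        "provider": "novita",  # whichever provider passed the availability check
        "api_key": "$HF_TOKEN",
        "max_concurrent_requests": 32,
    },
]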
backend/tasks/get_available_model_provider.py  ADDED

@@ -0,0 +1,208 @@
import os
import logging
import json
from huggingface_hub import model_info, InferenceClient
from dotenv import load_dotenv

# Define preferred providers
PREFERRED_PROVIDERS = ["sambanova", "novita"]

def filter_providers(providers):
    """Filter providers to only include preferred ones."""
    return [provider for provider in providers if provider in PREFERRED_PROVIDERS]

def prioritize_providers(providers):
    """Prioritize preferred providers, keeping all others."""
    preferred = [provider for provider in providers if provider in PREFERRED_PROVIDERS]
    non_preferred = [provider for provider in providers if provider not in PREFERRED_PROVIDERS]
    return preferred + non_preferred

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def is_vision_model(model_name: str) -> bool:
    """
    Check if the model is a vision model based on its name

    Args:
        model_name: Name of the model

    Returns:
        True if it's a vision model, False otherwise
    """
    vision_indicators = ["-VL-", "vision", "clip", "image"]
    return any(indicator in model_name.lower() for indicator in vision_indicators)

def get_test_payload(model_name: str) -> dict:
    """
    Get the appropriate test payload based on model type

    Args:
        model_name: Name of the model

    Returns:
        Dictionary containing the test payload
    """
    # We're only testing text models now
    return {
        "inputs": "Hello",
        "parameters": {
            "max_new_tokens": 5
        }
    }

def test_provider(model_name: str, provider: str, verbose: bool = False) -> bool:
    """
    Test if a specific provider is available for a model using InferenceClient

    Args:
        model_name: Name of the model
        provider: Provider to test
        verbose: Whether to log detailed information

    Returns:
        True if the provider is available, False otherwise
    """
    try:
        # Load environment variables
        load_dotenv()

        # Get HF token from environment
        hf_token = os.environ.get("HF_TOKEN")
        if not hf_token:
            raise ValueError("HF_TOKEN not defined in environment")

        if verbose:
            logger.info(f"Testing provider {provider} for model {model_name}")

        # Initialize the InferenceClient with the specific provider
        client = InferenceClient(
            model=model_name,
            token=hf_token,
            provider=provider,
            timeout=10  # Increased timeout to allow model loading
        )

        try:
            # Use the chat completions method for testing
            response = client.chat_completion(
                messages=[{"role": "user", "content": "Hello"}],
                max_tokens=5
            )

            if verbose:
                logger.info(f"Provider {provider} is available for {model_name}")
            return True

        except Exception as e:
            if verbose:
                error_message = str(e)
                logger.error(f"Error with provider {provider}: {error_message}")

                # Log specific error types if we can identify them
                if "status_code=429" in error_message:
                    logger.warning(f"Provider {provider} rate limited. You may need to wait or upgrade your plan.")
                elif "status_code=401" in error_message:
                    logger.warning(f"Authentication failed for provider {provider}. Check your token.")
                elif "status_code=503" in error_message:
                    logger.warning(f"Provider {provider} service unavailable. Model may be loading or provider is down.")
                elif "timed out" in error_message.lower():
                    logger.error(f"Timeout error with provider {provider} - request timed out after 10 seconds")
            return False

    except Exception as e:
        if verbose:
            logger.error(f"Error in test_provider: {str(e)}")
        return False

def get_available_model_provider(model_name, verbose=False):
    """
    Get the first available provider for a given model.

    Args:
        model_name: Name of the model on the Hub
        verbose: Whether to log detailed information

    Returns:
        First available provider or None if none are available
    """
    try:
        # Load environment variables
        load_dotenv()

        # Get HF token from environment
        hf_token = os.environ.get("HF_TOKEN")
        if not hf_token:
            raise ValueError("HF_TOKEN not defined in environment")

        # Get providers for the model and prioritize them
        info = model_info(model_name, expand="inferenceProviderMapping")
        if not hasattr(info, "inference_provider_mapping"):
            if verbose:
                logger.info(f"No inference providers found for {model_name}")
            return None

        providers = list(info.inference_provider_mapping.keys())
        if not providers:
            if verbose:
                logger.info(f"Empty list of providers for {model_name}")
            return None

        # Prioritize providers
        providers = prioritize_providers(providers)

        if verbose:
            logger.info(f"Available providers for {model_name}: {', '.join(providers)}")

        # Test each provider
        for provider in providers:
            if test_provider(model_name, provider, verbose):
                return provider

        return None

    except Exception as e:
        if verbose:
            logger.error(f"Error in get_available_model_provider: {str(e)}")
        return None

if __name__ == "__main__":
    # # Example usage with verbose mode enabled
    # model = "Qwen/Qwen2.5-72B-Instruct"

    # # Test sambanova provider
    # print("\nTesting sambanova provider:")
    # sambanova_available = test_provider(model, "sambanova", verbose=True)
    # print(f"sambanova available: {sambanova_available}")

    # # Test novita provider
    # print("\nTesting novita provider:")
    # novita_available = test_provider(model, "novita", verbose=True)
    # print(f"novita available: {novita_available}")

    # # Test automatic provider selection
    # print("\nTesting automatic provider selection:")
    # provider = get_available_model_provider(model, verbose=True)
    # print(f"Selected provider: {provider}")

    models = [
        "Qwen/QwQ-32B",
        "Qwen/Qwen2.5-72B-Instruct",
        "meta-llama/Llama-3.3-70B-Instruct",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
        "mistralai/Mistral-Small-24B-Instruct-2501",
    ]

    providers = []

    for model in models:
        provider = get_available_model_provider(model, verbose=True)
        providers.append(provider)

    print(f"Providers {len(providers)}: {providers}")

    # print("\nTesting novita provider:")
    # novita_available = test_provider("deepseek-ai/DeepSeek-V3-0324", "novita", verbose=True)
    # print(f"novita available: {novita_available}")
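A short usage sketch for the module above, assuming HF_TOKEN is set in the environment and the backend directory is on the import path (the model name is just an example taken from the __main__ block):

from tasks.get_available_model_provider import get_available_model_provider, test_provider

# Query the Hub's inference provider mapping and return the first provider that
# answers a small chat-completion probe, preferring sambanova and novita.
provider = get_available_model_provider("Qwen/Qwen2.5-72B-Instruct", verbose=True)
print(f"Selected provider: {provider}")

# A single provider can also be probed directly.
if provider is not None:
    available = test_provider("Qwen/Qwen2.5-72B-Instruct", provider, verbose=True)
    print(f"{provider} available: {available}")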
backend/tests/test_provider_rate_limits.py  ADDED

@@ -0,0 +1,272 @@
#!/usr/bin/env python
"""
Script to test rate limits of Hugging Face Inference API providers.
Spams requests to a model/provider and collects error messages.

Usage: python test_provider_rate_limits.py --model "model_name" --provider "provider_name" --requests 50
"""

import argparse
import json
import time
import os
import requests
import sys
import logging
from concurrent.futures import ThreadPoolExecutor
from collections import Counter
from typing import Dict, List, Tuple
from dotenv import load_dotenv

# Add parent directory to path to import from tasks
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from tasks.get_available_model_provider import prioritize_providers

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("rate_limit_test")

# Default model to test
DEFAULT_MODEL = "meta-llama/Llama-3.3-70B-Instruct"

def send_request(model: str, provider: str, token: str, request_id: int) -> Dict:
    """
    Send a single request to the model with the given provider.

    Args:
        model: Model name
        provider: Provider name
        token: HF token
        request_id: ID for this request

    Returns:
        Dictionary with request info and result
    """
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }

    payload = {
        "inputs": f"Request {request_id}: Hello, what do you thing about the future of AI? And divide me 10 by {request_id}",
        "parameters": {
            "max_new_tokens": 10000,
            "provider": provider
        }
    }

    api_url = f"https://api-inference.huggingface.co/models/{model}"

    start_time = time.time()
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=15)
        end_time = time.time()

        result = {
            "request_id": request_id,
            "status_code": response.status_code,
            "time_taken": end_time - start_time,
            "headers": dict(response.headers),
            "success": response.status_code == 200,
        }

        if response.status_code != 200:
            try:
                error_data = response.json()
                if isinstance(error_data, dict) and "error" in error_data:
                    result["error_message"] = error_data["error"]
                else:
                    result["error_message"] = str(error_data)
            except:
                result["error_message"] = response.text

        return result

    except Exception as e:
        end_time = time.time()
        return {
            "request_id": request_id,
            "status_code": 0,
            "time_taken": end_time - start_time,
            "success": False,
            "error_message": str(e)
        }

def run_rate_limit_test(model: str, provider: str = None, num_requests: int = 50,
                        max_workers: int = 10, delay: float = 0.1) -> List[Dict]:
    """
    Run a rate limit test by sending multiple requests to the specified model/provider.

    Args:
        model: Model to test
        provider: Provider to test (if None, will use first available)
        num_requests: Number of requests to send
        max_workers: Maximum number of concurrent workers
        delay: Delay between batches of requests

    Returns:
        List of results for each request
    """
    # Load environment variables
    load_dotenv()

    # Get HF token
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        logger.error("HF_TOKEN not defined in environment")
        return []

    # If provider not specified, get first available
    if not provider:
        from tasks.get_available_model_provider import get_available_model_provider
        provider = get_available_model_provider(model)
        if not provider:
            logger.error(f"No available provider found for {model}")
            return []

    logger.info(f"Testing rate limits for {model} with provider: {provider}")
    logger.info(f"Sending {num_requests} requests with {max_workers} concurrent workers")

    # Send requests in parallel
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_id = {
            executor.submit(send_request, model, provider, hf_token, i): i
            for i in range(num_requests)
        }

        completed = 0
        for future in future_to_id:
            result = future.result()
            results.append(result)

            completed += 1
            if completed % 10 == 0:
                logger.info(f"Completed {completed}/{num_requests} requests")

            # Add a small delay periodically to avoid overwhelming the API
            if completed % max_workers == 0:
                time.sleep(delay)

    return results

def analyze_results(results: List[Dict]) -> Dict:
    """
    Analyze the results of the rate limit test.

    Args:
        results: List of request results

    Returns:
        Dictionary with analysis
    """
    total_requests = len(results)
    successful = sum(1 for r in results if r["success"])
    failed = total_requests - successful

    # Count different error messages
    error_messages = Counter(r.get("error_message") for r in results if not r["success"])

    # Calculate timing statistics
    times = [r["time_taken"] for r in results]
    avg_time = sum(times) / len(times) if times else 0

    # Check for rate limiting headers
    rate_limit_headers = set()
    for r in results:
        if "headers" in r:
            for header in r["headers"]:
                if "rate" in header.lower() or "limit" in header.lower():
                    rate_limit_headers.add(header)

    return {
        "total_requests": total_requests,
        "successful_requests": successful,
        "failed_requests": failed,
        "success_rate": successful / total_requests if total_requests > 0 else 0,
        "average_time": avg_time,
        "error_messages": dict(error_messages),
        "rate_limit_headers": list(rate_limit_headers)
    }

def display_results(results: List[Dict], analysis: Dict) -> None:
    """
    Display the results of the rate limit test.

    Args:
        results: List of request results
        analysis: Analysis of results
    """
    print("\n" + "="*80)
    print(f"RATE LIMIT TEST RESULTS")
    print("="*80)

    print(f"\nTotal Requests: {analysis['total_requests']}")
    print(f"Successful: {analysis['successful_requests']} ({analysis['success_rate']*100:.1f}%)")
    print(f"Failed: {analysis['failed_requests']}")
    print(f"Average Time: {analysis['average_time']:.3f} seconds")

    if analysis["rate_limit_headers"]:
        print("\nRate Limit Headers Found:")
        for header in analysis["rate_limit_headers"]:
            print(f"  - {header}")

    if analysis["error_messages"]:
        print("\nError Messages:")
        for msg, count in analysis["error_messages"].items():
            print(f"  - [{count} occurrences] {msg}")

    # Print sample of headers from a failed request
    failed_requests = [r for r in results if not r["success"]]
    if failed_requests:
        print("\nSample Headers from a Failed Request:")
        for header, value in failed_requests[0].get("headers", {}).items():
            print(f"  {header}: {value}")

def main():
    """
    Main entry point for the script.
    """
    parser = argparse.ArgumentParser(description="Test rate limits of Hugging Face Inference API providers.")
    parser.add_argument("--model", type=str, default=DEFAULT_MODEL, help="Name of the model to test")
    parser.add_argument("--provider", type=str, help="Name of the provider to test (if not specified, will use first available)")
    parser.add_argument("--requests", type=int, default=50, help="Number of requests to send")
    parser.add_argument("--workers", type=int, default=10, help="Maximum number of concurrent workers")
    parser.add_argument("--delay", type=float, default=0.1, help="Delay between batches of requests")
    parser.add_argument("--output", type=str, help="Path to save results as JSON (optional)")

    args = parser.parse_args()

    # Run the test
    results = run_rate_limit_test(
        model=args.model,
        provider=args.provider,
        num_requests=args.requests,
        max_workers=args.workers,
        delay=args.delay
    )

    if not results:
        logger.error("Test failed to run properly")
        return

    # Analyze the results
    analysis = analyze_results(results)

    # Display the results
    display_results(results, analysis)

    # Save results if requested
    if args.output:
        with open(args.output, "w") as f:
            json.dump({
                "results": results,
                "analysis": analysis
            }, f, indent=2)
        logger.info(f"Results saved to {args.output}")

if __name__ == "__main__":
    main()
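Besides the command line shown in the docstring, the script's functions can be driven programmatically; a small sketch, assuming backend/tests is on the import path and HF_TOKEN is set (the model and request count are arbitrary choices):

from test_provider_rate_limits import run_rate_limit_test, analyze_results, display_results

# Send a small burst of requests and summarize status codes, timings and error messages.
results = run_rate_limit_test(
    model="meta-llama/Llama-3.3-70B-Instruct",  # DEFAULT_MODEL in the script
    provider=None,        # let the script pick the first available provider
    num_requests=20,
    max_workers=5,
)
if results:
    analysis = analyze_results(results)
    display_results(results, analysis)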
frontend/src/components/BenchmarkGenerator.jsx  CHANGED

@@ -12,21 +12,29 @@ const SIMULATION_DURATION = 120000; // 20 secondes
 // Define all benchmark steps in sequence
 const BENCHMARK_STEPS = [
   "configuration",
+  "provider_check",
   "ingestion",
   "upload_ingest_to_hub",
   "summarization",
   "chunking",
   "single_shot_question_generation",
+  "evaluation_provider_check",
+  "evaluation",
+  "evaluation_saving_results",
 ];
 
 // Step labels for display (more user-friendly names)
 const STEP_LABELS = {
   configuration: "Configuration",
+  provider_check: "Finding providers",
   ingestion: "Ingestion",
   upload_ingest_to_hub: "Upload to Hub",
   summarization: "Summarization",
   chunking: "Chunking",
   single_shot_question_generation: "Question generation",
+  evaluation_provider_check: "Checking evaluation providers",
+  evaluation: "Running evaluations",
+  evaluation_saving_results: "Saving evaluation results",
 };
 
 // Simulated log messages for pre-calculated documents

@@ -34,6 +42,8 @@ const SIMULATED_LOGS = [
   "[INFO] Initializing benchmark generation...",
   "[INFO] Generating base configuration file...",
   "[SUCCESS] Stage completed: configuration",
+  "[INFO] Finding available providers for models...",
+  "[SUCCESS] Stage completed: provider_check",
   "[INFO] Starting ingestion process...",
   "[SUCCESS] Stage completed: ingestion",
   "[INFO] Processing document content for upload...",