#!/usr/bin/env python3
"""
Script to test the evaluation task in standalone mode
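
Example usage (the script filename, dataset name, and model spec below are
illustrative only, not prescribed by the project):

    python test_evaluation_standalone.py my_dataset \
        --model "meta-llama/Llama-3.1-8B-Instruct,hf-inference" \
        --timeout 1800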
"""
import os
import sys
import uuid
import json
import time
import argparse
from dotenv import load_dotenv
from pathlib import Path
import traceback

# Ensure the environment is properly configured
load_dotenv()

# Add the current directory to the path to import modules
sys.path.append(os.getcwd())
from tasks.evaluationTask import EvaluationTask


def setup_environment():
    """
    Configure the environment for testing
    """
    # Check if the HF token is defined
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        print("⚠️  The HF_TOKEN is not defined in the environment or .env file")
        print("    Please define this variable before continuing.")
        sys.exit(1)
    
    # Set the default organization if not defined
    if not os.getenv("HF_ORGANIZATION"):
        os.environ["HF_ORGANIZATION"] = "yourbench"
        print("ℹ️  The HF_ORGANIZATION variable is not defined, using 'yourbench' as default")


def run_standalone_evaluation(dataset_name, models=None, max_wait_time=3600):
    """
    Run the evaluation task in standalone mode
    
    Args:
        dataset_name: Name of the dataset to evaluate
        models: List of models to evaluate (optional)
        max_wait_time: Maximum waiting time in seconds
    """
    # Generate a unique session ID
    session_uid = str(uuid.uuid4())
    print(f"πŸ”§ Session ID: {session_uid}")
    
    # Create the evaluation task instance
    evaluation_task = EvaluationTask(session_uid, dataset_name)
    
    # If specific models are provided, use them
    if models:
        evaluation_task.models = models
        print(f"πŸ€– Using custom models: {models}")
    
    # Display dataset information
    organization = os.getenv("HF_ORGANIZATION", "yourbench")
    print(f"πŸ“Š Evaluating dataset: {organization}/{dataset_name}")
    print(f"πŸ’Ύ Results saved in: {evaluation_task.output_dir}")
    
    # Start the evaluation task
    print("πŸš€ Starting evaluation...")
    evaluation_task.run()
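    # run() is assumed to start the evaluation without blocking; completion is
    # polled below via is_task_completed() and the logs via get_logs().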
    
    # Wait for the task to complete while displaying logs
    start_time = time.time()
    last_log_count = 0
    
    while not evaluation_task.is_task_completed():
        current_logs = evaluation_task.get_logs()
        
        # Display only new logs
        if len(current_logs) > last_log_count:
            for log in current_logs[last_log_count:]:
                print(f"  {log}")
            last_log_count = len(current_logs)
        
        # Check if the maximum time is reached
        elapsed_time = time.time() - start_time
        if elapsed_time > max_wait_time:
            print("⚠️  Maximum waiting time reached, forced stop")
            break
        
        time.sleep(1)
    
    # Check if results are available
    results_file = Path(f"{evaluation_task.output_dir}/models_comparison.json")
    if results_file.exists():
        try:
            with open(results_file, 'r') as f:
                results = json.load(f)
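            # Fields relied on below: results['metadata'] provides dataset,
            # total_models_tested, successful_tests and timestamp, while
            # results['models_comparison'] is a list of per-model entries with
            # model_name, provider, success, accuracy, accuracy_stderr,
            # evaluation_time and an optional 'error' message.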
            
            print("\nπŸ“ˆ Evaluation Results:")
            print(f"  Dataset: {results['metadata']['dataset']}")
            print(f"  Models tested: {results['metadata']['total_models_tested']}")
            print(f"  Successful tests: {results['metadata']['successful_tests']}")
            print(f"  Timestamp: {results['metadata']['timestamp']}")
            
            if results['metadata']['successful_tests'] > 0:
                print("\nπŸ“Š Model ranking by accuracy:")
                successful_models = [m for m in results['models_comparison'] if m['success']]
                for i, model in enumerate(successful_models):
                    print(f"  {i+1}. βœ… {model['model_name']} ({model['provider']})")
                    print(f"     Accuracy: {model['accuracy']:.4f} Β± {model['accuracy_stderr']:.4f}")
                    print(f"     Evaluation time: {model['evaluation_time']:.2f}s")
            
            failed_models = [m for m in results['models_comparison'] if not m['success']]
            if failed_models:
                print("\n❌ Unevaluated models:")
                for i, model in enumerate(failed_models):
                    print(f"  {i+1}. {model['model_name']} ({model['provider']})")
                    error_msg = model.get('error', 'Unknown reason')
                    print(f"     Reason: {error_msg}")
            
            # Check detailed results files
            detailed_file = Path(f"{evaluation_task.output_dir}/detailed_results.json")
            if detailed_file.exists():
                print(f"\nπŸ“„ Detailed results available in: {detailed_file}")
            
            # Check raw files
            raw_results = list(Path(f"{evaluation_task.output_dir}/results").glob("**/*.json"))
            if raw_results:
                print(f"\nπŸ“ {len(raw_results)} raw result files available in: {evaluation_task.output_dir}/results")
            
            print(f"\nβœ… Evaluation completed!")
        except Exception as e:
            print(f"❌ Error reading results: {str(e)}")
            print(f"   Details: {traceback.format_exc()}")
    else:
        print(f"❌ No evaluation results found in {results_file}")


if __name__ == "__main__":
    # Configure the argument parser
    parser = argparse.ArgumentParser(description="Test the evaluation task in standalone mode")
    parser.add_argument("dataset_name", type=str, help="Name of the dataset to evaluate (without the organization)")
    parser.add_argument("--model", action="append", dest="models", 
                        help="Model to evaluate in the format 'name/model,provider'. Can be used multiple times.")
    parser.add_argument("--timeout", type=int, default=3600, 
                        help="Maximum waiting time in seconds (default: 3600)")
    
    args = parser.parse_args()
    
    # Configure the environment
    setup_environment()
    
    # Transform models into tuples if specified
    models_to_evaluate = None
    if args.models:
        models_to_evaluate = []
        for model_spec in args.models:
            try:
                model_name, provider = model_spec.split(",")
                models_to_evaluate.append((model_name, provider))
            except ValueError:
                print(f"⚠️  Invalid model format: {model_spec}. Use 'name/model,provider'")
                sys.exit(1)
    
    # Run the evaluation
    run_standalone_evaluation(args.dataset_name, models_to_evaluate, args.timeout)