#!/usr/bin/env python3
"""
Script to test the evaluation task in standalone mode
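
Example usage (the script filename, dataset name, and model spec below are
illustrative only, not prescribed by the project):

    python test_evaluation_standalone.py my_dataset \
        --model "meta-llama/Llama-3.1-8B-Instruct,hf-inference" \
        --timeout 1800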
"""
import os
import sys
import uuid
import json
import time
import argparse
from dotenv import load_dotenv
from pathlib import Path
import traceback

# Ensure the environment is properly configured
load_dotenv()

# Add the current directory to the path to import modules
sys.path.append(os.getcwd())
from tasks.evaluationTask import EvaluationTask


def setup_environment():
    """
    Configure the environment for testing
    """
    # Check if the HF token is defined
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        print("⚠️  The HF_TOKEN is not defined in the environment or .env file")
        print("    Please define this variable before continuing.")
        sys.exit(1)
    
    # Set the default organization if not defined
    if not os.getenv("HF_ORGANIZATION"):
        os.environ["HF_ORGANIZATION"] = "yourbench"
        print("ℹ️  The HF_ORGANIZATION variable is not defined, using 'yourbench' as default")


def run_standalone_evaluation(dataset_name, models=None, max_wait_time=3600):
    """
    Run the evaluation task in standalone mode
    
    Args:
        dataset_name: Name of the dataset to evaluate
        models: List of models to evaluate (optional)
        max_wait_time: Maximum waiting time in seconds
    """
    # Generate a unique session ID
    session_uid = str(uuid.uuid4())
    print(f"πŸ”§ Session ID: {session_uid}")
    
    # Create the evaluation task instance
    evaluation_task = EvaluationTask(session_uid, dataset_name)
    
    # If specific models are provided, use them
    if models:
        evaluation_task.models = models
        print(f"πŸ€– Using custom models: {models}")
    
    # Display dataset information
    organization = os.getenv("HF_ORGANIZATION", "yourbench")
    print(f"πŸ“Š Evaluating dataset: {organization}/{dataset_name}")
    print(f"πŸ’Ύ Results saved in: {evaluation_task.output_dir}")
    
    # Start the evaluation task
    print("πŸš€ Starting evaluation...")
    evaluation_task.run()
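    # run() is assumed to start the evaluation without blocking; completion is
    # polled below via is_task_completed() and the logs via get_logs().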
    
    # Wait for the task to complete while displaying logs
    start_time = time.time()
    last_log_count = 0
    
    while not evaluation_task.is_task_completed():
        current_logs = evaluation_task.get_logs()
        
        # Display only new logs
        if len(current_logs) > last_log_count:
            for log in current_logs[last_log_count:]:
                print(f"  {log}")
            last_log_count = len(current_logs)
        
        # Check if the maximum time is reached
        elapsed_time = time.time() - start_time
        if elapsed_time > max_wait_time:
            print("⚠️  Maximum waiting time reached, forced stop")
            break
        
        time.sleep(1)
    
    # Check if results are available
    results_file = Path(f"{evaluation_task.output_dir}/models_comparison.json")
    if results_file.exists():
        try:
            with open(results_file, 'r') as f:
                results = json.load(f)
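            # Fields relied on below: results['metadata'] provides dataset,
            # total_models_tested, successful_tests and timestamp, while
            # results['models_comparison'] is a list of per-model entries with
            # model_name, provider, success, accuracy, accuracy_stderr,
            # evaluation_time and an optional 'error' message.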
            
            print("\nπŸ“ˆ Evaluation Results:")
            print(f"  Dataset: {results['metadata']['dataset']}")
            print(f"  Models tested: {results['metadata']['total_models_tested']}")
            print(f"  Successful tests: {results['metadata']['successful_tests']}")
            print(f"  Timestamp: {results['metadata']['timestamp']}")
            
            if results['metadata']['successful_tests'] > 0:
                print("\nπŸ“Š Model ranking by accuracy:")
                successful_models = [m for m in results['models_comparison'] if m['success']]
                for i, model in enumerate(successful_models):
                    print(f"  {i+1}. βœ… {model['model_name']} ({model['provider']})")
                    print(f"     Accuracy: {model['accuracy']:.4f} Β± {model['accuracy_stderr']:.4f}")
                    print(f"     Evaluation time: {model['evaluation_time']:.2f}s")
            
            failed_models = [m for m in results['models_comparison'] if not m['success']]
            if failed_models:
                print("\n❌ Unevaluated models:")
                for i, model in enumerate(failed_models):
                    print(f"  {i+1}. {model['model_name']} ({model['provider']})")
                    error_msg = model.get('error', 'Unknown reason')
                    print(f"     Reason: {error_msg}")
            
            # Check detailed results files
            detailed_file = Path(f"{evaluation_task.output_dir}/detailed_results.json")
            if detailed_file.exists():
                print(f"\nπŸ“„ Detailed results available in: {detailed_file}")
            
            # Check raw files
            raw_results = list(Path(f"{evaluation_task.output_dir}/results").glob("**/*.json"))
            if raw_results:
                print(f"\nπŸ“ {len(raw_results)} raw result files available in: {evaluation_task.output_dir}/results")
            
            print(f"\nβœ… Evaluation completed!")
        except Exception as e:
            print(f"❌ Error reading results: {str(e)}")
            print(f"   Details: {traceback.format_exc()}")
    else:
        print(f"❌ No evaluation results found in {results_file}")


if __name__ == "__main__":
    # Configure the argument parser
    parser = argparse.ArgumentParser(description="Test the evaluation task in standalone mode")
    parser.add_argument("dataset_name", type=str, help="Name of the dataset to evaluate (without the organization)")
    parser.add_argument("--model", action="append", dest="models", 
                        help="Model to evaluate in the format 'name/model,provider'. Can be used multiple times.")
    parser.add_argument("--timeout", type=int, default=3600, 
                        help="Maximum waiting time in seconds (default: 3600)")
    
    args = parser.parse_args()
    
    # Configure the environment
    setup_environment()
    
    # Transform models into tuples if specified
    models_to_evaluate = None
    if args.models:
        models_to_evaluate = []
        for model_spec in args.models:
            try:
                model_name, provider = model_spec.split(",")
                models_to_evaluate.append((model_name, provider))
            except ValueError:
                print(f"⚠️  Invalid model format: {model_spec}. Use 'name/model,provider'")
                sys.exit(1)
    
    # Run the evaluation
    run_standalone_evaluation(args.dataset_name, models_to_evaluate, args.timeout)