import concurrent.futures
import json
import os
import subprocess
import tempfile
import time
from datetime import datetime
from pathlib import Path

from dotenv import load_dotenv

from tools.lighteval.get_model_providers import get_model_providers

def run_lighteval(model_name: str, provider: str) -> dict:
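    """Run lighteval for a single model/provider pair and return a summary dict
    (model, provider, accuracy, execution_time, status)."""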
    start_time = time.time()
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation with {provider} provider for {model_name}")
    
    # Create temporary task file (NamedTemporaryFile avoids the race-prone, deprecated tempfile.mktemp)
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as temp_file:
        temp_file_path = temp_file.name
        temp_file.write("""
from lighteval_task.lighteval_task import create_yourbench_task

# Create yourbench task
yourbench = create_yourbench_task("yourbench/yourbench_test", "single_shot_questions")

# Define TASKS_TABLE needed by lighteval
TASKS_TABLE = [yourbench]
""")

    # LightEval command
    cmd_args = [
        "lighteval",
        "endpoint",
        "inference-providers",
        f"model={model_name},provider={provider}",
        "custom|yourbench|0|0",
        "--custom-tasks",
        temp_file_path,
        "--max-samples", "3",
        "--output-dir", "data/lighteval_results",
        # "--save-details",
        "--no-push-to-hub"
    ]

    try:
        # Run the command with environment variables and timeout of 60 seconds
        subprocess.run(cmd_args, env=os.environ, timeout=60)
    except subprocess.TimeoutExpired:
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
        os.unlink(temp_file_path)  # Clean up the temporary task file on the timeout path too
        return {
            "model": model_name,
            "provider": provider,
            "accuracy": 0.0,
            "execution_time": 60.0,
            "status": "timeout"
        }

    # Calculate execution time
    execution_time = time.time() - start_time
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")

    # Clean up
    os.unlink(temp_file_path)

    try:
        # Get the most recent results file for this model (results are nested under the org/model path)
        results_dir = Path("data/lighteval_results/results") / model_name
        results_file = max(results_dir.glob("results_*.json"), key=lambda p: p.stat().st_mtime)
        
        with open(results_file) as f:
            results = json.load(f)
            accuracy = results["results"]["all"]["accuracy"]

        return {
            "model": model_name,
            "provider": provider,
            "accuracy": accuracy,
            "execution_time": execution_time,
            "status": "success"
        }
    except Exception as e:
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
        return {
            "model": model_name,
            "provider": provider,
            "accuracy": 0.0,
            "execution_time": execution_time,
            "status": "parse_error"
        }

def main():
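    """Evaluate all configured models in parallel and print a summary of the results."""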
    # Start global timer
    script_start_time = time.time()
    
    # Load environment variables
    load_dotenv()

    # Models to evaluate
    models = [
        "Qwen/QwQ-32B",
        "Qwen/Qwen2.5-72B-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    ]

    # Get providers for each model
    model_providers = get_model_providers(models)
    
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")
    
    # Run evaluations in parallel using ProcessPoolExecutor
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(run_lighteval, model_name, providers[0]) 
            for model_name, providers in model_providers 
            if providers  # Only run if providers are available
        ]
        results = [future.result() for future in concurrent.futures.as_completed(futures)]

    # Calculate total script execution time
    total_time = time.time() - script_start_time
    print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")

    # Print results (as_completed yields them in completion order, not submission order)
    print("\nResults:")
    print("-" * 80)
    for result in results:
        print(f"Model: {result['model']}")
        print(f"Provider: {result['provider']}")
        print(f"Accuracy: {result['accuracy']:.2f}")
        print(f"Execution time: {result['execution_time']:.2f}s")
        print("-" * 80)

if __name__ == "__main__":
    main()