File size: 3,446 Bytes
4864926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Initialize the leaderboard with specific models and compute their p-values.

This module ensures only the specified models are included in the leaderboard
and their model trace p-values are computed.
"""

import os
import json
import sys
from src.evaluation.model_trace_eval import compute_model_trace_p_value
from src.envs import EVAL_RESULTS_PATH

# Whitelist of HuggingFace model identifiers permitted on the leaderboard.
# Read by is_model_allowed() / get_allowed_models() and iterated by
# initialize_allowed_models() when creating placeholder result files.
ALLOWED_MODELS = [
    "lmsys/vicuna-7b-v1.5",
    "ibm-granite/granite-7b-base", 
    "EleutherAI/llemma_7b"
]

def create_model_result_file(model_name, precision="float16"):
    """
    Create a placeholder result file for a model if one does not already exist.

    The file records the model's config and a null perplexity entry that is
    populated later when the model is actually evaluated. (No p-value is
    computed here, despite the module importing compute_model_trace_p_value.)

    Args:
        model_name: HuggingFace model identifier (e.g. "org/name").
        precision: Precision label used in the filename and config dtype.

    Returns:
        str | None: Path to the result file, or None if writing failed.
    """
    sys.stderr.write(f"\nπŸ”§ CREATING RESULT FILE FOR: {model_name}\n")
    sys.stderr.flush()
    
    # Create the results directory if it doesn't exist
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    
    # Map both "/" and "-" to "_" so the name is filesystem-safe.
    safe_name = model_name.replace("/", "_").replace("-", "_")
    result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json")
    
    sys.stderr.write(f"πŸ“ Result file path: {result_file}\n")
    sys.stderr.flush()
    
    # Idempotent: never clobber an existing file that may hold real results.
    if os.path.exists(result_file):
        sys.stderr.write(f"βœ… Result file already exists: {result_file}\n")
        sys.stderr.flush()
        return result_file
    
    # Basic result structure; perplexity is filled in by a later evaluation.
    result_data = {
        "config": {
            "model_dtype": f"torch.{precision}",
            "model_name": model_name,
            "model_sha": "main"
        },
        "results": {
            "perplexity": {
                "perplexity": None  # Will be populated when user tests
            }
        }
    }
    
    # Best-effort write: log the failure and signal it with None instead of
    # raising, so one bad model doesn't abort batch initialization.
    try:
        with open(result_file, 'w', encoding="utf-8") as f:
            json.dump(result_data, f, indent=2)
    except OSError as e:
        sys.stderr.write(f"❌ Failed to create result file: {e}\n")
        sys.stderr.flush()
        return None
    
    sys.stderr.write(f"βœ… Created result file: {result_file}\n")
    sys.stderr.flush()
    return result_file

def initialize_allowed_models():
    """
    Create result files for every model in ALLOWED_MODELS.

    Failures for individual models are logged and skipped so one bad model
    does not stop the rest.

    Returns:
        list: Paths of the result files that were created (or already existed).
    """
    sys.stderr.write("\nπŸš€ INITIALIZING ALLOWED MODELS\n")
    sys.stderr.write(f"πŸ“‹ Models to initialize: {ALLOWED_MODELS}\n")
    sys.stderr.flush()

    created = []
    for name in ALLOWED_MODELS:
        try:
            path = create_model_result_file(name)
        except Exception as exc:
            # Log and move on to the next model.
            sys.stderr.write(f"❌ Failed to initialize {name}: {exc}\n")
            sys.stderr.flush()
        else:
            if path:
                created.append(path)

    sys.stderr.write(f"βœ… Initialized {len(created)} model result files\n")
    sys.stderr.flush()

    return created

def is_model_allowed(model_name):
    """
    Report whether a model appears in the allowed list.

    Args:
        model_name: HuggingFace model identifier

    Returns:
        bool: True if model is allowed
    """
    return any(model_name == allowed for allowed in ALLOWED_MODELS)

def get_allowed_models():
    """
    Return the allowed model names as a new list.

    Returns:
        list: Independent copy of the allowed model names, so callers can
        mutate the result without affecting ALLOWED_MODELS.
    """
    return list(ALLOWED_MODELS)