File size: 6,441 Bytes
ed2eb44
 
 
 
 
 
06d4ee9
ed2eb44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06d4ee9
 
 
 
 
 
 
ed2eb44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06d4ee9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed2eb44
 
06d4ee9
ed2eb44
06d4ee9
ed2eb44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06d4ee9
 
 
ed2eb44
 
 
 
06d4ee9
ed2eb44
06d4ee9
ed2eb44
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Data loading and processing utilities for the leaderboard application.
"""
import json
import math

import pandas as pd

from src.utils.config import model_categories
from src.utils.task_mapping import get_display_name

def load_metric_data(file_path):
    """
    Load metric data from a JSON file

    The file is always decoded as UTF-8 so the result does not depend on the
    locale of the machine running the application. A missing file or a file
    that is not valid JSON is reported on stdout and an empty dictionary is
    returned instead of raising.

    Args:
        file_path (str): Path to the JSON file containing metric data

    Returns:
        dict: The parsed JSON content, or an empty dict if the file is
            missing or cannot be parsed as JSON
    """
    try:
        # Explicit encoding: the default for open() is platform dependent,
        # which breaks non-ASCII model/task names on non-UTF-8 locales.
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return {}
    except json.JSONDecodeError:
        print(f"Error: File {file_path} is not a valid JSON file.")
        return {}

def process_data(metric_data):
    """
    Process the metric data into a pandas DataFrame

    The input is read as ``{task: {model: value}}`` and pivoted so that each
    model becomes a row (the DataFrame index) and each task becomes a column.
    Missing model/task combinations are filled with the placeholder '-'.
    Column names are normalised (hyphens/underscores to spaces, title case)
    and then passed through ``get_display_name``. A 'Model Type' column is
    added from ``model_categories``; models not listed there get "Unknown".

    Args:
        metric_data (dict): Dictionary containing the metric data

    Returns:
        pandas.DataFrame: DataFrame containing the processed data
    """
    # model name -> {raw task name: metric value}. A dict gives O(1)
    # membership checks and preserves the first-seen order of the models.
    model_data = {}
    for task in metric_data:
        task_results = metric_data[task]
        for model in task_results:
            model_data.setdefault(model, {})[task] = task_results[model]

    # One row per model, one column per task
    df = pd.DataFrame.from_dict(model_data, orient='index')

    # Replace NaN values (model has no result for a task) with '-'
    df.fillna('-', inplace=True)

    # First convert raw task names to standard format (spaces instead of hyphens/underscores)
    df.columns = [
        task.replace("-", " ").replace("_", " ").title() for task in df.columns
    ]

    # Then apply our display name mapping
    df = df.rename(columns={col: get_display_name(col) for col in df.columns})

    # Add a model type column to the dataframe
    df['Model Type'] = df.index.map(lambda x: model_categories.get(x, "Unknown"))

    return df

def _coerce_score(value):
    """Return ``value`` as a finite float, or None if it is not a usable number."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        # Placeholders such as '-', malformed text such as '1-2', None, ...
        return None
    # NaN/inf would silently poison the average, so treat them as missing
    return number if math.isfinite(number) else None


def calculate_selected_overall(row, selected_tasks):
    """
    Calculate overall average for selected tasks

    Each selected value is converted with ``float()``; values that cannot be
    converted (e.g. the '-' placeholder) or that are NaN/infinite are skipped
    rather than raising, so one bad cell never breaks the whole row.

    Args:
        row (pandas.Series): Row of data (any mapping indexable by task name)
        selected_tasks (list): List of task names to include in the average

    Returns:
        float or str: The calculated average or '-' if no numeric values

    Raises:
        KeyError: If a selected task is not present in ``row``
    """
    numeric_values = []
    for task in selected_tasks:
        score = _coerce_score(row[task])
        if score is not None:
            numeric_values.append(score)

    # Calculate average if there are numeric values
    if not numeric_values:
        return '-'
    return sum(numeric_values) / len(numeric_values)

def filter_and_prepare_data(df, selected_tasks, selected_model_types):
    """
    Filter and prepare data based on selections

    Keeps only rows whose 'Model Type' is in ``selected_model_types``, adds a
    'Selected Overall' column with the mean of the selected task columns,
    ranks the non-human rows by that mean (1 = best), gives rows of type
    'Human' the rank '-', and returns everything sorted by 'Selected Overall'
    in descending order with a 'Model Name' column copied from the index.

    Task cells that are not numeric (such as the '-' placeholder used for
    missing results) are ignored when averaging; a row with no numeric cell
    at all gets NaN and is sorted last.

    Args:
        df (pandas.DataFrame): The original DataFrame (not modified)
        selected_tasks (list): List of selected task names
        selected_model_types (list): List of selected model types

    Returns:
        pandas.DataFrame: Filtered and prepared DataFrame

    Raises:
        KeyError: If 'Model Type' or a selected task is not a column of ``df``
    """
    # Filter the dataframe based on selected model types. Work on an explicit
    # copy so the column added below never writes through to a slice of df.
    filtered_df = df[df['Model Type'].isin(selected_model_types)].copy()

    # Calculate the average for selected tasks only. Columns containing the
    # '-' placeholder are object dtype, and averaging them directly raises on
    # recent pandas, so coerce every cell to a number (non-numeric -> NaN).
    task_scores = filtered_df[selected_tasks]
    if len(task_scores.columns):
        task_scores = task_scores.apply(pd.to_numeric, errors='coerce')
    filtered_df['Selected Overall'] = task_scores.mean(axis=1)

    # Separate human entries from other models for ranking
    is_human = filtered_df['Model Type'] == 'Human'
    human_df = filtered_df[is_human].copy()
    non_human_df = filtered_df[~is_human]

    # Sort non-human models by Selected Overall and add rank
    non_human_df = non_human_df.sort_values('Selected Overall', ascending=False)
    non_human_df.insert(0, 'Rank', range(1, len(non_human_df) + 1))

    # Add rank for human (use '-' to indicate not ranked)
    human_df.insert(0, 'Rank', '-')

    # Combine dataframes - put humans at appropriate position based on score
    combined_df = pd.concat([non_human_df, human_df])
    combined_df = combined_df.sort_values('Selected Overall', ascending=False)

    # Add a Model Name column that shows the index (actual model name)
    combined_df['Model Name'] = combined_df.index

    return combined_df

def _format_metric_value(value):
    """Render a float with one decimal place ('-' for NaN); pass anything else through."""
    if isinstance(value, float):
        return '-' if pd.isna(value) else f"{value:.1f}"
    return value


def format_display_dataframe(filtered_df, selected_tasks):
    """
    Create and format the display DataFrame for the leaderboard table

    The result contains 'Rank' (ranks 1-3 decorated with medals), 'Agent'
    (renamed from 'Model Name'), 'Model Type', then 'Metric Average' (renamed
    from 'Selected Overall') and every selected task that exists in
    ``filtered_df``. Float values are rendered with one decimal place on a
    per-value basis, so a column that mixes numbers with the '-' placeholder
    is formatted the same way as a purely numeric one. NaN is shown as '-'.

    Args:
        filtered_df (pandas.DataFrame): The filtered DataFrame
        selected_tasks (list): List of selected task names

    Returns:
        tuple: (pandas.DataFrame, list) - The display DataFrame and the metric columns
    """
    # Create a fixed display DataFrame with only the model info
    display_df = filtered_df[['Rank', 'Model Name', 'Model Type']].copy()

    # Format the rank column with medals; other ranks (and '-') become plain text
    medal_ranks = {1: "🥇 1", 2: "🥈 2", 3: "🥉 3"}
    display_df['Rank'] = display_df['Rank'].apply(lambda x: medal_ranks.get(x, str(x)))

    # Rename 'Model Name' to 'Agent'
    display_df = display_df.rename(columns={"Model Name": "Agent"})

    # Add metrics columns (Selected Overall and individual tasks)
    metric_columns = ['Selected Overall'] + selected_tasks
    for col in metric_columns:
        if col in filtered_df.columns:
            # Format value by value instead of by column dtype: a single
            # placeholder makes the whole column object dtype, which used to
            # leave its floats unrounded.
            display_df[col] = filtered_df[col].apply(_format_metric_value)

    # Rename "Selected Overall" to "Metric Average" in display_df
    if "Selected Overall" in display_df.columns:
        display_df = display_df.rename(columns={"Selected Overall": "Metric Average"})
        # Also update the metric_columns list to reflect the rename
        metric_columns = ['Metric Average'] + selected_tasks

    return display_df, metric_columns