Commit 06d4ee9
Parent(s): 697ae1d
Updating table
- Assests/MLRC_Bench_overview.png +0 -0
- src/app.py +74 -10
- src/components/filters.py +71 -22
- src/components/header.py +2 -8
- src/components/leaderboard.py +154 -37
- src/components/tasks.py +10 -8
- src/data/metrics/absolute_improvement_to_baseline.json +56 -0
- src/data/metrics/{margin_to_human.json → relative_improvement_to_human.json} +6 -0
- src/data/processors.py +4 -0
- src/utils/config.py +28 -10
- src/utils/data_loader.py +30 -9
- src/utils/task_mapping.py +39 -0
Assests/MLRC_Bench_overview.png
CHANGED
src/app.py
CHANGED
@@ -18,7 +18,7 @@ from src.utils.data_loader import (
 from src.styles.base import load_all_styles

 # Import components
-from src.components.header import render_page_header
+from src.components.header import render_page_header
 from src.components.filters import (
     initialize_session_state,
     render_metric_selection,
@@ -40,6 +40,37 @@ def setup_page():

     # Load all styles
     load_all_styles()
+
+    # Force dark mode using custom CSS
+    st.markdown("""
+    <style>
+    /* Force dark mode regardless of browser settings */
+    .stApp {
+        background-color: #1a202c !important;
+        color: #e2e8f0 !important;
+    }
+    /* Override Streamlit's default styling to ensure dark mode */
+    .stTextInput, .stSelectbox, .stMultiselect {
+        background-color: #2d3748 !important;
+        color: #e2e8f0 !important;
+    }
+    .stButton>button {
+        background-color: #4a5568 !important;
+        color: #e2e8f0 !important;
+    }
+    /* Override header and text colors */
+    h1, h2, h3, h4, h5, h6, p, span, div {
+        color: #e2e8f0 !important;
+    }
+    /* Ensure tab styling is consistent */
+    .stTabs [data-baseweb="tab-list"] {
+        background-color: #1a202c !important;
+    }
+    .stTabs [data-baseweb="tab"] {
+        color: #e2e8f0 !important;
+    }
+    </style>
+    """, unsafe_allow_html=True)

 def main():
     """
@@ -51,9 +82,9 @@ def main():
     # Render header
     render_page_header()

-    # Load data
-    metric_data = load_metric_data(metrics_config[
+    # Load primary metric data (first metric in config)
+    primary_metric = list(metrics_config.keys())[0]
+    metric_data = load_metric_data(metrics_config[primary_metric]["file"])
     df = process_data(metric_data)

     # Initialize session state
@@ -65,20 +96,54 @@ def main():
     # Tab 1: Leaderboard
     with tabs[0]:
         # Render filter components
+        selected_metrics = render_metric_selection()
+
+        # Continue with other filters
         selected_tasks = render_task_selection(df)
         selected_model_types = render_model_type_selection(df)

         # Render leaderboard if selections are valid
         if selected_tasks and selected_model_types:
+            # Load the primary metric data first (always the first in selected_metrics)
+            primary_metric = selected_metrics[0]
+            primary_metric_data = load_metric_data(metrics_config[primary_metric]["file"])
+            primary_df = process_data(primary_metric_data)
+
+            # Filter and prepare data for primary metric
+            filtered_df = filter_and_prepare_data(primary_df, selected_tasks, selected_model_types)

             # Format data for display
             display_df, metric_columns = format_display_dataframe(filtered_df, selected_tasks)

+            # If additional metrics are selected, add their data too
+            all_metric_columns = metric_columns.copy()
+
+            for metric in selected_metrics[1:]:
+                metric_info = metrics_config[metric]
+                metric_data = load_metric_data(metric_info["file"])
+                metric_df = process_data(metric_data)
+
+                # Process and merge the additional metric data
+                metric_filtered_df = filter_and_prepare_data(metric_df, selected_tasks, selected_model_types)
+                metric_display_df, _ = format_display_dataframe(metric_filtered_df, selected_tasks)
+
+                # Create a meaningful prefix for this metric
+                if metric == "Absolute Improvement to Baseline":
+                    prefix = "Abs"
+                else:
+                    # Use first word of each part of the metric name
+                    prefix = "".join([word[0] for word in metric.split()]).upper()
+
+                # Combine the dataframes - keep only metric columns from metric_display_df
+                for col in metric_columns:
+                    if col in metric_display_df.columns:
+                        # Add columns with metric prefix
+                        display_df[f"{prefix}: {col}"] = metric_display_df[col]
+                        # Add to the list of all metric columns
+                        all_metric_columns.append(f"{prefix}: {col}")
+
             # Render the leaderboard table
-            render_leaderboard_table(display_df,
+            render_leaderboard_table(display_df, all_metric_columns, primary_metric)
         else:
             # Show empty state
             render_empty_state()
@@ -88,8 +153,7 @@ def main():
     # Render task descriptions
     render_task_descriptions()

-    render_footer()
+    # Footer removed per user request

 if __name__ == "__main__":
     main()
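The prefixing behaviour introduced above is easiest to see in isolation. A minimal sketch of the rule, with illustrative metric and column names only (this is not code from the commit):

```python
# Sketch of the prefix rule main() applies to secondary-metric columns.
def metric_prefix(metric: str) -> str:
    if metric == "Absolute Improvement to Baseline":
        return "Abs"  # fixed short prefix, as in the diff above
    # otherwise: first letter of each word of the metric name, upper-cased
    return "".join(word[0] for word in metric.split()).upper()

print(metric_prefix("Absolute Improvement to Baseline"))  # Abs
# A secondary-metric column such as "LLM Merging" is then stored in display_df
# as "Abs: LLM Merging" and appended to all_metric_columns.
```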
src/components/filters.py
CHANGED
@@ -12,12 +12,14 @@ def initialize_session_state(df):
         df (pandas.DataFrame): The DataFrame with model data
     """
     # Initialize session states
-    if '
+    if 'selected_metrics' not in st.session_state:
+        # Start with the first metric always selected
+        primary_metric = list(metrics_config.keys())[0]
+        st.session_state.selected_metrics = [primary_metric]

     if 'selected_tasks' not in st.session_state:
-        st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
+        # Select all tasks by default, excluding Model Type
+        st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]

     if 'selected_model_types' not in st.session_state:
         # Ensure all model types are selected by default
@@ -28,23 +30,43 @@ def render_metric_selection():
     Render the metric selection component

     Returns:
+        list: Selected metrics
     """
-    st.markdown("### Select
+    st.markdown("### Select Metrics")

+    # Get metric names
+    all_metrics = list(metrics_config.keys())
+    primary_metric = all_metrics[0]  # First metric is primary
+    secondary_metrics = all_metrics[1:]  # Rest are secondary
+
+    # Always select the primary metric
+    if primary_metric not in st.session_state.selected_metrics:
+        st.session_state.selected_metrics.append(primary_metric)
+
+    # Create columns based on number of metrics
+    num_cols = len(all_metrics)
+    cols = st.columns(num_cols)
+
+    # Primary metric first (always selected and can't be deselected)
+    with cols[0]:
+        button_label = f"✓ {primary_metric}"
+        st.button(button_label, key=f"metric_{primary_metric}", type="primary", disabled=True)
+
+    # Secondary metrics that can be toggled
+    for i, metric in enumerate(secondary_metrics):
+        with cols[i+1]:
+            is_selected = metric in st.session_state.selected_metrics
             button_label = f"✓ {metric}" if is_selected else metric
             button_type = "primary" if is_selected else "secondary"

             if st.button(button_label, key=f"metric_{metric}", type=button_type):
+                if is_selected:
+                    st.session_state.selected_metrics.remove(metric)
+                else:
+                    st.session_state.selected_metrics.append(metric)
                 st.rerun() # Force UI update

-    return st.session_state.
+    return st.session_state.selected_metrics

 def render_task_selection(df):
     """
@@ -61,14 +83,33 @@ def render_task_selection(df):
     # Extract task columns (exclude Model Type and Overall)
     all_tasks = [col for col in df.columns if col not in ['Model Type']]

-    num_cols = 3
-    task_rows = [all_tasks[i:i+num_cols] for i in range(0, len(all_tasks), num_cols)]
+    # Determine number of columns based on screen width
+    num_cols = 3 # Default for medium screens
+
+    # Create task buttons in a fixed number of columns with balanced width
+    task_groups = [all_tasks[i:i+num_cols] for i in range(0, len(all_tasks), num_cols)]
+
+    # Custom CSS for button styling
+    st.markdown("""
+    <style>
+    /* Make buttons same width in their columns and centered */
+    .stButton > button {
+        width: 100%;
+        max-width: 300px;
+        margin: 0 auto;
+        display: block;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+
+    # Display buttons in rows
+    for group in task_groups:
+        # Create columns with equal width
         cols = st.columns(num_cols)
+
+        # Add buttons to each column
+        for i, task in enumerate(group):
+            if i < len(cols): # Ensure we don't exceed available columns
                 with cols[i]:
                     is_selected = task in st.session_state.selected_tasks
                     button_label = f"✓ {task}" if is_selected else task
@@ -95,12 +136,20 @@ def render_model_type_selection(df):
     """
     st.markdown("### Select Model Types")

+    # Get unique model types
     model_types = df['Model Type'].unique().tolist()
-    model_type_cols = st.columns(len(model_types))

+    # Determine number of columns - up to 4 columns max depending on number of model types
+    num_cols = min(len(model_types), 4)
+
+    # Create columns
+    cols = st.columns(num_cols)
+
+    # Add a button for each model type
     for i, model_type in enumerate(model_types):
+        col_idx = i % num_cols # Determine which column to place the button in
+
+        with cols[col_idx]:
             is_selected = model_type in st.session_state.selected_model_types
             button_label = f"✓ {model_type}" if is_selected else model_type
             button_type = "primary" if is_selected else "secondary"
src/components/header.py
CHANGED
@@ -31,11 +31,5 @@ def render_footer():
     """
     Render the page footer
     """
-        <div class="footer">
-            <p>© 2023 Model Capability Leaderboard • Made with Streamlit • Contact: [email protected]</p>
-        </div>
-        """,
-        unsafe_allow_html=True
-    )
+    # Footer content removed per user request
+    pass
src/components/leaderboard.py
CHANGED
@@ -4,19 +4,43 @@ Leaderboard table components for the leaderboard application.
 import streamlit as st
 from src.data.processors import get_model_type_style, get_rank_style

-def render_leaderboard_table(display_df, metric_columns):
+def render_leaderboard_table(display_df, metric_columns, primary_metric):
     """
     Render the custom HTML leaderboard table

     Args:
         display_df (pandas.DataFrame): The DataFrame with the display data
         metric_columns (list): List of metric column names
+        primary_metric (str): The name of the primary metric
     """
     from src.components.header import render_section_header
+    from src.utils.config import metrics_config

     # Display model ranking header without the box
     render_section_header("Model Rankings")

+    # Detect if we have multiple metrics (columns with metric prefixes)
+    has_multiple_metrics = any(":" in col for col in metric_columns)
+
+    # Group columns by metric if multiple metrics are present
+    metric_groups = {}
+    if has_multiple_metrics:
+        # Primary metric columns (no prefix)
+        primary_cols = [col for col in metric_columns if ":" not in col]
+        metric_groups[primary_metric] = primary_cols
+
+        # Other metrics
+        for col in metric_columns:
+            if ":" in col:
+                prefix, metric_name = col.split(": ", 1)
+                full_metric_name = next((m for m in metrics_config if m.startswith(prefix)), prefix)
+                if full_metric_name not in metric_groups:
+                    metric_groups[full_metric_name] = []
+                metric_groups[full_metric_name].append(col)
+    else:
+        # Single metric
+        metric_groups[primary_metric] = metric_columns
+
     # Start building the HTML table structure
     html_table = """
     <div class="fixed-table-container">
@@ -25,12 +49,13 @@ def render_leaderboard_table(display_df, metric_columns):
         <thead>
             <tr class="header-row">
                 <th class="fixed-column first-fixed-column" rowspan="2">Rank</th>
-                <th class="fixed-column second-fixed-column" rowspan="2"
-                <th class="model-type-cell" rowspan="2">Model Type</th>
+                <th class="fixed-column second-fixed-column" rowspan="2" style="text-align: center;">Agent</th>
+                <th class="model-type-cell" rowspan="2" style="text-align: center;">Model Type</th>
     """

-    # Add
+    # Add metric headers for each metric group
+    for metric_name, cols in metric_groups.items():
+        html_table += f'<th colspan="{len(cols)}" class="metric-header" style="text-align: center;">{metric_name}</th>'

     # Continue the table structure
     html_table += """
@@ -38,10 +63,13 @@ def render_leaderboard_table(display_df, metric_columns):
             <tr class="sub-header">
     """

-    # Add individual column headers for metrics
-    for
+    # Add individual column headers for all metrics
+    for metric_name, cols in metric_groups.items():
+        for col in cols:
+            # Extract the actual column name if it has a prefix
+            display_name = col.split(": ", 1)[-1] if ":" in col else col
+            column_class = "overall-cell" if display_name == "Metric Average" else "metric-cell"
+            html_table += f'<th class="{column_class}" style="text-align: center;">{display_name}</th>'

     # Close the header and start the body
     html_table += """
@@ -53,13 +81,20 @@ def render_leaderboard_table(display_df, metric_columns):
     # Add the data rows
     for i, (idx, row) in enumerate(display_df.iterrows()):
         # Define background colors to ensure consistency
+        # Special background for human row
+        is_human_row = row["Agent"] == "Top Human in Competition"
+        if is_human_row:
+            row_bg = "#2a1e37" # Purple-ish dark background for human row
+            row_style = f'style="background-color: {row_bg}; box-shadow: 0 0 5px #f472b6;"'
+        else:
+            row_bg = "#0a0a0a" if i % 2 == 0 else "#111111"
+            row_style = f'style="background-color: {row_bg};"'

         # Start the row
-        html_table += f'<tr class="table-row">'
+        html_table += f'<tr class="table-row" {row_style}>'

         # Add Rank with medal styling and consistent background
-        rank_style =
+        rank_style = "" # Don't set background at cell level
         rank_styles = get_rank_style(row["Rank"])
         for style_key, style_value in rank_styles.items():
             rank_style += f"{style_key}: {style_value};"
@@ -67,11 +102,11 @@ def render_leaderboard_table(display_df, metric_columns):
         html_table += f'<td class="fixed-column first-fixed-column" style="{rank_style}">{row["Rank"]}</td>'

         # Model name fixed column with consistent background
-        html_table += f'<td class="fixed-column second-fixed-column" title="{row["
+        html_table += f'<td class="fixed-column second-fixed-column" title="{row["Agent"]}" style="font-weight: 500; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; text-align: center;">{row["Agent"]}</td>'

         # Model type cell
         model_type = row["Model Type"]
-        type_style = f"
+        type_style = f"text-align: center;"
         model_type_styles = get_model_type_style(model_type)
         for style_key, style_value in model_type_styles.items():
             if style_value:
@@ -79,22 +114,30 @@ def render_leaderboard_table(display_df, metric_columns):

         html_table += f'<td class="table-cell model-type-cell" style="{type_style}">{model_type}</td>'

-        # Add metric values with minimal styling
-        for col in
-            # Simple styling based on positive/negative values
-            try:
-                value = float(str(row[col]).replace(',', ''))
-                if value > 0:
-                    cell_class += " positive-value"
-                elif value < 0:
-                    cell_class += " negative-value"
-            except:
-                pass
+        # Add metric values with minimal styling for all columns
+        all_metric_columns = [col for group in metric_groups.values() for col in group]
+        for col in all_metric_columns:
+            display_name = col.split(": ", 1)[-1] if ":" in col else col
+            cell_class = "table-cell overall-cell" if display_name == "Metric Average" else "table-cell metric-cell"

+            # Check if column exists in the row (it should)
+            if col in row:
+                value_text = row[col]
+
+                # Simple styling based on positive/negative values
+                try:
+                    value = float(str(row[col]).replace(',', ''))
+                    if value > 0:
+                        cell_class += " positive-value"
+                    elif value < 0:
+                        cell_class += " negative-value"
+                except:
+                    pass
+
+                html_table += f'<td class="{cell_class}">{value_text}</td>'
+            else:
+                # If column doesn't exist (shouldn't happen), add empty cell
+                html_table += f'<td class="{cell_class}">-</td>'

         html_table += "</tr>"

@@ -106,16 +149,90 @@ def render_leaderboard_table(display_df, metric_columns):
     </div>
     """

-    # Add
+    # Add styling for metrics section
+    metrics_css = """
+    <style>
+    .metric-definitions {
+        margin-top: 30px;
+        padding-top: 20px;
+        border-top: 1px solid #333;
+    }
+    .metric-definition {
+        background-color: #1a1a1a;
+        border-radius: 8px;
+        padding: 12px 16px;
+        margin-bottom: 16px;
+    }
+    .metric-definition h4 {
+        margin-top: 0;
+        color: #a5b4fc;
+    }
+    .metric-definition p {
+        margin-bottom: 0;
+        color: #e2e8f0;
+    }
+    </style>
     """

-    st.markdown(
+    # Build a clean HTML string for the metrics section
+    metrics_html = '<div class="metric-definitions">'
+
+    # Add each metric definition
+    for metric_name, metric_info in metrics_config.items():
+        metric_description = metric_info.get('description', '')
+
+        # Special handling for Relative Improvement to Human to show formula
+        if metric_name == "Relative Improvement to Human":
+            formula_html = """
+            <div style="margin: 15px 0;">
+                <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
+                <div style="background-color: #111; padding: 15px; border-radius: 5px; text-align: center; margin-bottom: 15px;">
+                    Relative Improvement to Human = max<sub>all runs</sub>((P<sub>agent</sub> - P<sub>baseline</sub>) / (P<sub>human</sub> - P<sub>baseline</sub>)) × 100%
+                </div>
+                <p style="margin-top: 10px; font-weight: 500;">Where:</p>
+                <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
+                    <li style="margin-bottom: 5px;">P<sub>agent</sub> is the agent's test performance</li>
+                    <li style="margin-bottom: 5px;">P<sub>baseline</sub> is the baseline test performance</li>
+                    <li style="margin-bottom: 5px;">P<sub>human</sub> is the human performance benchmark</li>
+                    <li style="margin-bottom: 5px;">The maximum is taken across all experimental runs for a given task-model pair</li>
+                </ul>
+            </div>
+            """
+
+            # Add the metric definition with the formula
+            metrics_html += f'<div class="metric-definition"><h4>{metric_name}</h4><p>{metric_description}</p>{formula_html}</div>'
+        # Special handling for Absolute Improvement to Baseline to show formula
+        elif metric_name == "Absolute Improvement to Baseline":
+            formula_html = """
+            <div style="margin: 15px 0;">
+                <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
+                <div style="background-color: #111; padding: 15px; border-radius: 5px; text-align: center; margin-bottom: 15px;">
+                    Absolute Improvement to Baseline = max<sub>all runs</sub>((P<sub>agent</sub> - P<sub>baseline</sub>) / P<sub>baseline</sub>) × 100%
+                </div>
+                <p style="margin-top: 10px; font-weight: 500;">Where:</p>
+                <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
+                    <li style="margin-bottom: 5px;">P<sub>agent</sub> is the agent's test performance</li>
+                    <li style="margin-bottom: 5px;">P<sub>baseline</sub> is the baseline test performance</li>
+                    <li style="margin-bottom: 5px;">The maximum is taken across all experimental runs for a given task-model pair</li>
+                </ul>
+            </div>
+            """
+
+            # Add the metric definition with the formula
+            metrics_html += f'<div class="metric-definition"><h4>{metric_name}</h4><p>{metric_description}</p>{formula_html}</div>'
+        else:
+            # Regular metric without formula
+            metrics_html += f'<div class="metric-definition"><h4>{metric_name}</h4><p>{metric_description}</p></div>'
+
+    # Close the metric definitions container
+    metrics_html += '</div>'
+
+    # Display the styling and HTML separately for maximum control
+    st.markdown(html_table, unsafe_allow_html=True)
+    st.markdown(metrics_css, unsafe_allow_html=True)

+    # Render the metrics definitions
+    st.markdown(metrics_html, unsafe_allow_html=True)

 def render_empty_state():
     """
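The header-grouping step above can be traced on its own. A small sketch with illustrative column names (the real columns come from format_display_dataframe; only the grouping logic mirrors the commit):

```python
# Standalone trace of the metric-grouping logic in render_leaderboard_table.
metrics_config = {"Relative Improvement to Human": {}, "Absolute Improvement to Baseline": {}}
primary_metric = "Relative Improvement to Human"
metric_columns = ["LLM Merging", "Machine Unlearning",
                  "Abs: LLM Merging", "Abs: Machine Unlearning"]  # illustrative names

metric_groups = {primary_metric: [c for c in metric_columns if ":" not in c]}
for col in metric_columns:
    if ":" in col:
        prefix, _ = col.split(": ", 1)
        # Map the short prefix back to a full metric name from metrics_config
        full = next((m for m in metrics_config if m.startswith(prefix)), prefix)
        metric_groups.setdefault(full, []).append(col)

print(metric_groups)
# {'Relative Improvement to Human': ['LLM Merging', 'Machine Unlearning'],
#  'Absolute Improvement to Baseline': ['Abs: LLM Merging', 'Abs: Machine Unlearning']}
```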
src/components/tasks.py
CHANGED
@@ -3,6 +3,7 @@ Task description components for the leaderboard application.
 """
 import streamlit as st
 from src.utils.config import tasks_info
+from src.utils.task_mapping import get_display_name, get_original_name

 def render_task_descriptions():
     """
@@ -51,8 +52,8 @@ def render_task_descriptions():
         </div>
     """, unsafe_allow_html=True)

-    # Task links mapping
+    # Task links mapping - using original task names
+    original_task_links = {
         "Backdoor Trigger Recovery": "https://www.llmagentsafetycomp24.com/tracks/#backdoor_model",
         "Machine Unlearning": "https://unlearning-challenge.github.io/",
         "Perception Temporal Action Loc": "https://ptchallenge-workshop.github.io",
@@ -60,6 +61,9 @@ def render_task_descriptions():
         "Meta Learning": "https://metalearning.chalearn.org/",
         "Llm Merging": "https://llm-merging.github.io"
     }
+
+    # Update links mapping to use display names as keys
+    task_links = {get_display_name(task): link for task, link in original_task_links.items()}

     # Create two columns
     col1, col2 = st.columns(2)
@@ -73,9 +77,8 @@ def render_task_descriptions():
             link = task_links.get(task, "#")
             st.markdown(f"""
            <a href="{link}" target="_blank" style="text-decoration: none; color: inherit;">
-                <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
-                    <div class="task-title">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
-                    <div class="task-description">{description}</div>
+                <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s; padding: 12px; margin-bottom: 15px; height: auto;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
+                    <div class="task-title" style="text-align: center;">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
                 </div>
             </a>
             """, unsafe_allow_html=True)
@@ -85,9 +88,8 @@ def render_task_descriptions():
             link = task_links.get(task, "#")
             st.markdown(f"""
            <a href="{link}" target="_blank" style="text-decoration: none; color: inherit;">
-                <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
-                    <div class="task-title">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
-                    <div class="task-description">{description}</div>
+                <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s; padding: 12px; margin-bottom: 15px; height: auto;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
+                    <div class="task-title" style="text-align: center;">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
                 </div>
             </a>
             """, unsafe_allow_html=True)
src/data/metrics/absolute_improvement_to_baseline.json
ADDED
@@ -0,0 +1,56 @@
{
    "perception_temporal_action_loc": {
        "MLAB (claude-3-5-sonnet-v2)": 2.222443094482299,
        "Top Human in Competition": 284.55703321316366,
        "MLAB (gemini-exp-1206)": -1.34633272895098,
        "MLAB (o3-mini)": 0.8724822663469414,
        "MLAB (gpt-4o)": 0.9384906166574135,
        "MLAB (llama3-1-405b-instruct)": 1.474927454740455,
        "CoI-Agent (o1) + MLAB (gpt-4o)": 0.9888962417416385
    },
    "llm-merging": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": -0.6756756689645764,
        "Top Human in Competition": 68.24324325461103,
        "MLAB (claude-3-5-sonnet-v2)": 3.3783783853634035,
        "MLAB (gemini-exp-1206)": 3.3783783853634035,
        "MLAB (o3-mini)": -0.6756756689645764,
        "MLAB (gpt-4o)": 1.3513513581994137,
        "MLAB (llama3-1-405b-instruct)": -0.6756756689645764
    },
    "meta-learning": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": 5.424978139166417,
        "Top Human in Competition": 304.53435579895256,
        "MLAB (claude-3-5-sonnet-v2)": 5.424978139166417,
        "MLAB (gemini-exp-1206)": 5.424978139166417,
        "MLAB (o3-mini)": -14.923192223926499,
        "MLAB (gpt-4o)": 5.424978139166417,
        "MLAB (llama3-1-405b-instruct)": 5.424978139166417
    },
    "product-recommendation": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6021227441680528,
        "Top Human in Competition": 412.59793394031675,
        "MLAB (claude-3-5-sonnet-v2)": 12.283606772997718,
        "MLAB (gemini-exp-1206)": 0.6021227441680528,
        "MLAB (o3-mini)": 0.6035316323448103,
        "MLAB (gpt-4o)": 2.6400767209619422,
        "MLAB (llama3-1-405b-instruct)": -2.9066701147102995e-09
    },
    "machine_unlearning": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": 7.318484292638537,
        "Top Human in Competition": 61.85258904854873,
        "MLAB (claude-3-5-sonnet-v2)": -58.58540153334969,
        "MLAB (gemini-exp-1206)": 3.4837676447981045,
        "MLAB (o3-mini)": 2.2414490971518704,
        "MLAB (gpt-4o)": -11.131587250139926,
        "MLAB (llama3-1-405b-instruct)": 3.8409541040677597
    },
    "backdoor-trigger-recovery": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": 38.252918051116,
        "Top Human in Competition": 621.2635313337943,
        "MLAB (claude-3-5-sonnet-v2)": 247.90785034564928,
        "MLAB (gemini-exp-1206)": 80.40937239150493,
        "MLAB (o3-mini)": 38.75953643366491,
        "MLAB (gpt-4o)": 64.52832837042699,
        "MLAB (llama3-1-405b-instruct)": 71.70765816958271
    }
}
src/data/metrics/{margin_to_human.json → relative_improvement_to_human.json}
RENAMED
@@ -1,6 +1,7 @@
 {
     "perception_temporal_action_loc": {
         "MLAB (claude-3-5-sonnet-v2)": 0.7810185077440877,
+        "Top Human in Competition": 100.0,
         "MLAB (gemini-exp-1206)": -0.4731328246392113,
         "MLAB (o3-mini)": 0.3066106841553126,
         "MLAB (gpt-4o)": 0.3298075630252947,
@@ -9,6 +10,7 @@
     },
     "llm-merging": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": -0.9900989999019761,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": 4.950495058915793,
         "MLAB (gemini-exp-1206)": 4.950495058915793,
         "MLAB (o3-mini)": -0.9900989999019761,
@@ -17,6 +19,7 @@
     },
     "meta-learning": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": 1.781401026144938,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": 1.781401026144938,
         "MLAB (gemini-exp-1206)": 1.781401026144938,
         "MLAB (o3-mini)": -4.900331256476853,
@@ -25,6 +28,7 @@
     },
     "product-recommendation": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": 0.1459345029718814,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": 2.9771372473170388,
         "MLAB (gemini-exp-1206)": 0.1459345029718814,
         "MLAB (o3-mini)": 0.1462759705510577,
@@ -33,6 +37,7 @@
     },
     "machine_unlearning": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": 11.832138969791846,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": -94.71778374121965,
         "MLAB (gemini-exp-1206)": 5.632371576335568,
         "MLAB (o3-mini)": 3.623856546073656,
@@ -41,6 +46,7 @@
     },
     "backdoor-trigger-recovery": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": 6.1572772457753295,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": 39.903815022493674,
         "MLAB (gemini-exp-1206)": 12.94287662739089,
         "MLAB (o3-mini)": 6.238823700218141,
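A note on the added entries: every "Top Human in Competition" value in this file is 100.0, which is what the Relative Improvement to Human formula gives when the agent performance is the human benchmark itself (a presumption consistent with the formula shown in leaderboard.py). A tiny check with made-up scores:

```python
# Why the human rows come out as exactly 100.0 under this metric (illustrative numbers).
p_baseline, p_human = 0.50, 0.80
p_agent = p_human  # the "Top Human in Competition" entry is the human benchmark itself
relative = (p_agent - p_baseline) / (p_human - p_baseline) * 100
print(relative)  # 100.0, independent of the particular baseline and human scores
```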
src/data/processors.py
CHANGED
@@ -42,6 +42,8 @@ def get_model_type_style(model_type):
         return {'color': '#93c5fd'} # Brighter blue
     elif model_type == "Closed Source":
         return {'color': '#cbd5e1'} # Lighter gray
+    elif model_type == "Human":
+        return {'color': '#f472b6', 'font-weight': '600'} # Pink with emphasis for Human
     else:
         return {'color': ''}

@@ -61,6 +63,8 @@ def get_rank_style(rank):
         return {'color': 'silver', 'font-weight': '700', 'font-size': '16px'}
     elif "🥉" in str(rank):
         return {'color': '#cd7f32', 'font-weight': '700', 'font-size': '16px'}
+    elif str(rank) == "-":
+        return {'color': '#f472b6', 'font-style': 'italic'} # Style for non-ranked (human)
     else:
         return {}
src/utils/config.py
CHANGED
@@ -1,5 +1,8 @@
 # Theme and configuration settings for the Model Capability Leaderboard application

+# Import task mapping
+from src.utils.task_mapping import task_display_names
+
 # Theme colors - using dark mode by default
 dark_theme = {
     'bg_color': '#1a202c',
@@ -46,12 +49,19 @@ app_config = {

 # Metrics configuration
 metrics_config = {
-        "file": "src/data/metrics/
-        "description": "
+    "Relative Improvement to Human": {
+        "file": "src/data/metrics/relative_improvement_to_human.json",
+        "description": "Measures how much of the performance gap between baseline and human the agent has closed. Calculated as: (Agent performance - Baseline) / (Human - Baseline) × 100%.",
         "min_value": -100, # Approximate, adjust as needed
         "max_value": 50, # Approximate, adjust as needed
         "color_map": "RdYlGn"
+    },
+    "Absolute Improvement to Baseline": {
+        "file": "src/data/metrics/absolute_improvement_to_baseline.json",
+        "description": "Measures the percentage improvement over the baseline performance. Calculated as: (Agent performance - Baseline) / Baseline × 100%.",
+        "min_value": -100, # Approximate, adjust as needed
+        "max_value": 100, # Approximate, adjust as needed
+        "color_map": "RdYlGn"
     }
     # Future metrics can be added here
     # "Another Metric": {
@@ -70,16 +80,24 @@ model_categories = {
     "MLAB (o3-mini)": "Closed Source",
     "MLAB (gpt-4o)": "Closed Source",
     "MLAB (llama3-1-405b-instruct)": "Open Weights",
-    "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source"
+    "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
+    "Human": "Human",
+    "Top Human in Competition": "Human"
     # More models would be added here as needed
 }

 # Task descriptions
 tasks_info = {
-    "Perception Temporal Action Loc"
+    task_display_names.get("Perception Temporal Action Loc", "Temporal Action Localisation"):
+        "Testing the model's ability to understand and localize actions within temporal sequences of events.",
+    task_display_names.get("Llm Merging", "LLM Merging"):
+        "Assessing the capability to effectively merge knowledge from multiple language models.",
+    task_display_names.get("Meta Learning", "Meta Learning"):
+        "Evaluating the model's ability to learn how to learn - adapting quickly to new tasks.",
+    task_display_names.get("Product Recommendation", "Next Product Recommendation"):
+        "Testing the model's ability to recommend relevant products based on user preferences and behavior.",
+    task_display_names.get("Machine Unlearning", "Machine Unlearning"):
+        "Evaluating how well models can 'unlearn' specific information when required.",
+    task_display_names.get("Backdoor Trigger Recovery", "Backdoor Trigger Recovery"):
+        "Testing resilience against backdoor attacks and ability to recover from triggered behaviors."
 }
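The two metric descriptions above translate directly into per-run formulas. A sketch with made-up scores (the leaderboard values then take the maximum over runs, per the formulas rendered in leaderboard.py):

```python
# Sketch of the two metrics described in metrics_config (illustrative scores only).
def relative_improvement_to_human(p_agent, p_baseline, p_human):
    # (Agent performance - Baseline) / (Human - Baseline) × 100%
    return (p_agent - p_baseline) / (p_human - p_baseline) * 100

def absolute_improvement_to_baseline(p_agent, p_baseline):
    # (Agent performance - Baseline) / Baseline × 100%
    return (p_agent - p_baseline) / p_baseline * 100

p_agent, p_baseline, p_human = 0.55, 0.50, 0.80
print(round(relative_improvement_to_human(p_agent, p_baseline, p_human), 1))  # 16.7
print(round(absolute_improvement_to_baseline(p_agent, p_baseline), 1))        # 10.0
```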
src/utils/data_loader.py
CHANGED
@@ -4,6 +4,7 @@ Data loading and processing utilities for the leaderboard application.
 import pandas as pd
 import json
 from src.utils.config import model_categories
+from src.utils.task_mapping import get_display_name

 def load_metric_data(file_path):
     """
@@ -56,8 +57,13 @@ def process_data(metric_data):
     # Replace NaN values with '-'
     df.fillna('-', inplace=True)

+    # First convert raw task names to standard format (spaces instead of hyphens/underscores)
+    standardized_columns = [task.replace("-", " ").replace("_", " ").title() for task in df.columns]
+    df.columns = standardized_columns
+
+    # Then apply our display name mapping
+    display_name_columns = {col: get_display_name(col) for col in df.columns}
+    df = df.rename(columns=display_name_columns)

     # Add a model type column to the dataframe
     df['Model Type'] = df.index.map(lambda x: model_categories.get(x, "Unknown"))
@@ -108,14 +114,26 @@ def filter_and_prepare_data(df, selected_tasks, selected_model_types):
     selected_tasks_df = filtered_df[selected_tasks]
     filtered_df['Selected Overall'] = selected_tasks_df.mean(axis=1)

+    # Separate human entries from other models for ranking
+    is_human = filtered_df['Model Type'] == 'Human'
+    human_df = filtered_df[is_human]
+    non_human_df = filtered_df[~is_human]
+
+    # Sort non-human models by Selected Overall and add rank
+    non_human_df = non_human_df.sort_values('Selected Overall', ascending=False)
+    non_human_df.insert(0, 'Rank', range(1, len(non_human_df) + 1))
+
+    # Add rank for human (use '-' to indicate not ranked)
+    human_df.insert(0, 'Rank', '-')
+
+    # Combine dataframes - put humans at appropriate position based on score
+    combined_df = pd.concat([non_human_df, human_df])
+    combined_df = combined_df.sort_values('Selected Overall', ascending=False)

     # Add a Model Name column that shows the index (actual model name)
+    combined_df['Model Name'] = combined_df.index

-    return
+    return combined_df

 def format_display_dataframe(filtered_df, selected_tasks):
     """
@@ -135,13 +153,16 @@ def format_display_dataframe(filtered_df, selected_tasks):
     medal_ranks = {1: "🥇 1", 2: "🥈 2", 3: "🥉 3"}
     display_df['Rank'] = display_df['Rank'].apply(lambda x: medal_ranks.get(x, str(x)))

+    # Rename 'Model Name' to 'Agent'
+    display_df = display_df.rename(columns={"Model Name": "Agent"})
+
     # Add metrics columns (Selected Overall and individual tasks)
     metric_columns = ['Selected Overall'] + selected_tasks
     for col in metric_columns:
         if col in filtered_df.columns:
-            # Format numeric columns to
+            # Format numeric columns to 1 decimal place
             if filtered_df[col].dtype in ['float64', 'float32']:
-                display_df[col] = filtered_df[col].apply(lambda x: f"{x:.
+                display_df[col] = filtered_df[col].apply(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
             else:
                 display_df[col] = filtered_df[col]
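The two-step column renaming added to process_data can be traced with one real key from the metric JSON files; the small mapping dict below mirrors task_display_names from src/utils/task_mapping.py:

```python
# Trace of the column-renaming pipeline added to process_data.
raw_key = "perception_temporal_action_loc"  # key as stored in the metric JSON files
standardized = raw_key.replace("-", " ").replace("_", " ").title()
print(standardized)  # "Perception Temporal Action Loc"

task_display_names = {"Perception Temporal Action Loc": "Temporal Action Localisation"}
print(task_display_names.get(standardized, standardized))  # "Temporal Action Localisation"
```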
src/utils/task_mapping.py
ADDED
@@ -0,0 +1,39 @@
"""
Task name mapping utilities for displaying user-friendly task names.
"""

# Mapping from original task names to display names
task_display_names = {
    "Perception Temporal Action Loc": "Temporal Action Localisation",
    "Llm Merging": "LLM Merging",
    "Meta Learning": "Meta Learning",
    "Product Recommendation": "Next Product Recommendation",
    "Machine Unlearning": "Machine Unlearning",
    "Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
}

def get_display_name(task_name):
    """
    Get the display name for a task

    Args:
        task_name (str): The original task name

    Returns:
        str: The display name for the task
    """
    return task_display_names.get(task_name, task_name)

def get_original_name(display_name):
    """
    Get the original task name for a display name

    Args:
        display_name (str): The display name

    Returns:
        str: The original task name
    """
    # Create a reverse mapping
    reverse_mapping = {v: k for k, v in task_display_names.items()}
    return reverse_mapping.get(display_name, display_name)
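A short usage check for the new helpers, assuming the package is importable from the app root:

```python
from src.utils.task_mapping import get_display_name, get_original_name

assert get_display_name("Llm Merging") == "LLM Merging"
assert get_original_name("LLM Merging") == "Llm Merging"
# Names outside the mapping pass through unchanged:
assert get_display_name("Some New Task") == "Some New Task"
```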