Armeddinosaur committed
Commit 06d4ee9 · 1 Parent(s): 697ae1d

Updating table

Assests/MLRC_Bench_overview.png CHANGED
src/app.py CHANGED
@@ -18,7 +18,7 @@ from src.utils.data_loader import (
18
  from src.styles.base import load_all_styles
19
 
20
  # Import components
21
- from src.components.header import render_page_header, render_footer
22
  from src.components.filters import (
23
  initialize_session_state,
24
  render_metric_selection,
@@ -40,6 +40,37 @@ def setup_page():
40
 
41
  # Load all styles
42
  load_all_styles()
 
 
43
 
44
  def main():
45
  """
@@ -51,9 +82,9 @@ def main():
51
  # Render header
52
  render_page_header()
53
 
54
- # Load data
55
- current_metric = list(metrics_config.keys())[0]
56
- metric_data = load_metric_data(metrics_config[current_metric]["file"])
57
  df = process_data(metric_data)
58
 
59
  # Initialize session state
@@ -65,20 +96,54 @@ def main():
65
  # Tab 1: Leaderboard
66
  with tabs[0]:
67
  # Render filter components
68
- selected_metric = render_metric_selection()
 
 
69
  selected_tasks = render_task_selection(df)
70
  selected_model_types = render_model_type_selection(df)
71
 
72
  # Render leaderboard if selections are valid
73
  if selected_tasks and selected_model_types:
74
- # Filter and prepare data
75
- filtered_df = filter_and_prepare_data(df, selected_tasks, selected_model_types)
 
 
76
 
77
  # Format data for display
78
  display_df, metric_columns = format_display_dataframe(filtered_df, selected_tasks)
79
 
 
 
80
  # Render the leaderboard table
81
- render_leaderboard_table(display_df, metric_columns)
82
  else:
83
  # Show empty state
84
  render_empty_state()
@@ -88,8 +153,7 @@ def main():
88
  # Render task descriptions
89
  render_task_descriptions()
90
 
91
- # Render footer
92
- render_footer()
93
 
94
  if __name__ == "__main__":
95
  main()
 
18
  from src.styles.base import load_all_styles
19
 
20
  # Import components
21
+ from src.components.header import render_page_header
22
  from src.components.filters import (
23
  initialize_session_state,
24
  render_metric_selection,
 
40
 
41
  # Load all styles
42
  load_all_styles()
43
+
44
+ # Force dark mode using custom CSS
45
+ st.markdown("""
46
+ <style>
47
+ /* Force dark mode regardless of browser settings */
48
+ .stApp {
49
+ background-color: #1a202c !important;
50
+ color: #e2e8f0 !important;
51
+ }
52
+ /* Override Streamlit's default styling to ensure dark mode */
53
+ .stTextInput, .stSelectbox, .stMultiselect {
54
+ background-color: #2d3748 !important;
55
+ color: #e2e8f0 !important;
56
+ }
57
+ .stButton>button {
58
+ background-color: #4a5568 !important;
59
+ color: #e2e8f0 !important;
60
+ }
61
+ /* Override header and text colors */
62
+ h1, h2, h3, h4, h5, h6, p, span, div {
63
+ color: #e2e8f0 !important;
64
+ }
65
+ /* Ensure tab styling is consistent */
66
+ .stTabs [data-baseweb="tab-list"] {
67
+ background-color: #1a202c !important;
68
+ }
69
+ .stTabs [data-baseweb="tab"] {
70
+ color: #e2e8f0 !important;
71
+ }
72
+ </style>
73
+ """, unsafe_allow_html=True)
74
 
75
  def main():
76
  """
 
82
  # Render header
83
  render_page_header()
84
 
85
+ # Load primary metric data (first metric in config)
86
+ primary_metric = list(metrics_config.keys())[0]
87
+ metric_data = load_metric_data(metrics_config[primary_metric]["file"])
88
  df = process_data(metric_data)
89
 
90
  # Initialize session state
 
96
  # Tab 1: Leaderboard
97
  with tabs[0]:
98
  # Render filter components
99
+ selected_metrics = render_metric_selection()
100
+
101
+ # Continue with other filters
102
  selected_tasks = render_task_selection(df)
103
  selected_model_types = render_model_type_selection(df)
104
 
105
  # Render leaderboard if selections are valid
106
  if selected_tasks and selected_model_types:
107
+ # Load the primary metric data first (always the first in selected_metrics)
108
+ primary_metric = selected_metrics[0]
109
+ primary_metric_data = load_metric_data(metrics_config[primary_metric]["file"])
110
+ primary_df = process_data(primary_metric_data)
111
+
112
+ # Filter and prepare data for primary metric
113
+ filtered_df = filter_and_prepare_data(primary_df, selected_tasks, selected_model_types)
114
 
115
  # Format data for display
116
  display_df, metric_columns = format_display_dataframe(filtered_df, selected_tasks)
117
 
118
+ # If additional metrics are selected, add their data too
119
+ all_metric_columns = metric_columns.copy()
120
+
121
+ for metric in selected_metrics[1:]:
122
+ metric_info = metrics_config[metric]
123
+ metric_data = load_metric_data(metric_info["file"])
124
+ metric_df = process_data(metric_data)
125
+
126
+ # Process and merge the additional metric data
127
+ metric_filtered_df = filter_and_prepare_data(metric_df, selected_tasks, selected_model_types)
128
+ metric_display_df, _ = format_display_dataframe(metric_filtered_df, selected_tasks)
129
+
130
+ # Create a meaningful prefix for this metric
131
+ if metric == "Absolute Improvement to Baseline":
132
+ prefix = "Abs"
133
+ else:
134
+ # Use the first letter of each word in the metric name
135
+ prefix = "".join([word[0] for word in metric.split()]).upper()
136
+
137
+ # Combine the dataframes - keep only metric columns from metric_display_df
138
+ for col in metric_columns:
139
+ if col in metric_display_df.columns:
140
+ # Add columns with metric prefix
141
+ display_df[f"{prefix}: {col}"] = metric_display_df[col]
142
+ # Add to the list of all metric columns
143
+ all_metric_columns.append(f"{prefix}: {col}")
144
+
145
  # Render the leaderboard table
146
+ render_leaderboard_table(display_df, all_metric_columns, primary_metric)
147
  else:
148
  # Show empty state
149
  render_empty_state()
 
153
  # Render task descriptions
154
  render_task_descriptions()
155
 
156
+ # Footer removed per user request
 
157
 
158
  if __name__ == "__main__":
159
  main()
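
Reviewer note: the new loop in main() derives a short prefix for each secondary metric so its columns can sit next to the primary metric's columns. A minimal sketch of that rule outside Streamlit (the column names below are illustrative, not from the benchmark data):

```python
# Sketch of the column-prefixing rule applied to secondary metrics in main().
# "Absolute Improvement to Baseline" is special-cased to "Abs"; any other
# secondary metric falls back to the upper-cased initials of its words.
def metric_prefix(metric: str) -> str:
    if metric == "Absolute Improvement to Baseline":
        return "Abs"
    return "".join(word[0] for word in metric.split()).upper()

columns = ["Selected Overall", "LLM Merging", "Meta Learning"]  # example columns
prefix = metric_prefix("Absolute Improvement to Baseline")
print([f"{prefix}: {col}" for col in columns])
# ['Abs: Selected Overall', 'Abs: LLM Merging', 'Abs: Meta Learning']
```
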
src/components/filters.py CHANGED
@@ -12,12 +12,14 @@ def initialize_session_state(df):
12
  df (pandas.DataFrame): The DataFrame with model data
13
  """
14
  # Initialize session states
15
- if 'selected_metric' not in st.session_state:
16
- st.session_state.selected_metric = list(metrics_config.keys())[0]
 
 
17
 
18
  if 'selected_tasks' not in st.session_state:
19
- # Default to first 3 tasks, excluding Model Type
20
- st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']][:3]
21
 
22
  if 'selected_model_types' not in st.session_state:
23
  # Ensure all model types are selected by default
@@ -28,23 +30,43 @@ def render_metric_selection():
28
  Render the metric selection component
29
 
30
  Returns:
31
- str: Selected metric
32
  """
33
- st.markdown("### Select Metric")
34
 
35
- # Create more compact metric buttons with clear selection indicators
36
- metric_cols = st.columns(len(metrics_config))
37
- for i, metric in enumerate(metrics_config.keys()):
38
- with metric_cols[i]:
39
- is_selected = st.session_state.selected_metric == metric
 
 
40
  button_label = f"✓ {metric}" if is_selected else metric
41
  button_type = "primary" if is_selected else "secondary"
42
 
43
  if st.button(button_label, key=f"metric_{metric}", type=button_type):
44
- st.session_state.selected_metric = metric
 
 
45
  st.rerun() # Force UI update
46
 
47
- return st.session_state.selected_metric
48
 
49
  def render_task_selection(df):
50
  """
@@ -61,14 +83,33 @@ def render_task_selection(df):
61
  # Extract task columns (exclude Model Type and Overall)
62
  all_tasks = [col for col in df.columns if col not in ['Model Type']]
63
 
64
- # Create task buttons in rows of 3
65
- num_cols = 3
66
- task_rows = [all_tasks[i:i+num_cols] for i in range(0, len(all_tasks), num_cols)]
67
 
68
- for row in task_rows:
 
 
69
  cols = st.columns(num_cols)
70
- for i, task in enumerate(row):
71
- if i < len(row):
 
 
72
  with cols[i]:
73
  is_selected = task in st.session_state.selected_tasks
74
  button_label = f"✓ {task}" if is_selected else task
@@ -95,12 +136,20 @@ def render_model_type_selection(df):
95
  """
96
  st.markdown("### Select Model Types")
97
 
98
- # Create model type buttons
99
  model_types = df['Model Type'].unique().tolist()
100
- model_type_cols = st.columns(len(model_types))
101
 
 
 
 
102
  for i, model_type in enumerate(model_types):
103
- with model_type_cols[i]:
 
 
104
  is_selected = model_type in st.session_state.selected_model_types
105
  button_label = f"✓ {model_type}" if is_selected else model_type
106
  button_type = "primary" if is_selected else "secondary"
 
12
  df (pandas.DataFrame): The DataFrame with model data
13
  """
14
  # Initialize session states
15
+ if 'selected_metrics' not in st.session_state:
16
+ # Start with the first metric always selected
17
+ primary_metric = list(metrics_config.keys())[0]
18
+ st.session_state.selected_metrics = [primary_metric]
19
 
20
  if 'selected_tasks' not in st.session_state:
21
+ # Select all tasks by default, excluding Model Type
22
+ st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
23
 
24
  if 'selected_model_types' not in st.session_state:
25
  # Ensure all model types are selected by default
 
30
  Render the metric selection component
31
 
32
  Returns:
33
+ list: Selected metrics
34
  """
35
+ st.markdown("### Select Metrics")
36
 
37
+ # Get metric names
38
+ all_metrics = list(metrics_config.keys())
39
+ primary_metric = all_metrics[0] # First metric is primary
40
+ secondary_metrics = all_metrics[1:] # Rest are secondary
41
+
42
+ # Always select the primary metric
43
+ if primary_metric not in st.session_state.selected_metrics:
44
+ st.session_state.selected_metrics.append(primary_metric)
45
+
46
+ # Create columns based on number of metrics
47
+ num_cols = len(all_metrics)
48
+ cols = st.columns(num_cols)
49
+
50
+ # Primary metric first (always selected and can't be deselected)
51
+ with cols[0]:
52
+ button_label = f"✓ {primary_metric}"
53
+ st.button(button_label, key=f"metric_{primary_metric}", type="primary", disabled=True)
54
+
55
+ # Secondary metrics that can be toggled
56
+ for i, metric in enumerate(secondary_metrics):
57
+ with cols[i+1]:
58
+ is_selected = metric in st.session_state.selected_metrics
59
  button_label = f"✓ {metric}" if is_selected else metric
60
  button_type = "primary" if is_selected else "secondary"
61
 
62
  if st.button(button_label, key=f"metric_{metric}", type=button_type):
63
+ if is_selected:
64
+ st.session_state.selected_metrics.remove(metric)
65
+ else:
66
+ st.session_state.selected_metrics.append(metric)
67
  st.rerun() # Force UI update
68
 
69
+ return st.session_state.selected_metrics
70
 
71
  def render_task_selection(df):
72
  """
 
83
  # Extract task columns (exclude Model Type and Overall)
84
  all_tasks = [col for col in df.columns if col not in ['Model Type']]
85
 
86
+ # Determine number of columns based on screen width
87
+ num_cols = 3 # Default for medium screens
 
88
 
89
+ # Create task buttons in a fixed number of columns with balanced width
90
+ task_groups = [all_tasks[i:i+num_cols] for i in range(0, len(all_tasks), num_cols)]
91
+
92
+ # Custom CSS for button styling
93
+ st.markdown("""
94
+ <style>
95
+ /* Make buttons same width in their columns and centered */
96
+ .stButton > button {
97
+ width: 100%;
98
+ max-width: 300px;
99
+ margin: 0 auto;
100
+ display: block;
101
+ }
102
+ </style>
103
+ """, unsafe_allow_html=True)
104
+
105
+ # Display buttons in rows
106
+ for group in task_groups:
107
+ # Create columns with equal width
108
  cols = st.columns(num_cols)
109
+
110
+ # Add buttons to each column
111
+ for i, task in enumerate(group):
112
+ if i < len(cols): # Ensure we don't exceed available columns
113
  with cols[i]:
114
  is_selected = task in st.session_state.selected_tasks
115
  button_label = f"✓ {task}" if is_selected else task
 
136
  """
137
  st.markdown("### Select Model Types")
138
 
139
+ # Get unique model types
140
  model_types = df['Model Type'].unique().tolist()
 
141
 
142
+ # Determine number of columns - up to 4 columns max depending on number of model types
143
+ num_cols = min(len(model_types), 4)
144
+
145
+ # Create columns
146
+ cols = st.columns(num_cols)
147
+
148
+ # Add a button for each model type
149
  for i, model_type in enumerate(model_types):
150
+ col_idx = i % num_cols # Determine which column to place the button in
151
+
152
+ with cols[col_idx]:
153
  is_selected = model_type in st.session_state.selected_model_types
154
  button_label = f"✓ {model_type}" if is_selected else model_type
155
  button_type = "primary" if is_selected else "secondary"
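
Reviewer note: with this change render_metric_selection returns a list rather than a single string, and the first metric in metrics_config can never be deselected. A small sketch of that invariant in plain Python (no Streamlit), assuming the two metrics defined in src/utils/config.py:

```python
# Toggle behaviour of st.session_state.selected_metrics, sketched without Streamlit.
metric_keys = ["Relative Improvement to Human", "Absolute Improvement to Baseline"]
selected_metrics = [metric_keys[0]]  # the primary metric is pre-selected

def toggle(metric: str) -> None:
    if metric == metric_keys[0]:
        return  # primary metric is pinned (its button is disabled in the UI)
    if metric in selected_metrics:
        selected_metrics.remove(metric)
    else:
        selected_metrics.append(metric)

toggle("Absolute Improvement to Baseline")
print(selected_metrics)
# ['Relative Improvement to Human', 'Absolute Improvement to Baseline']
```
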
src/components/header.py CHANGED
@@ -31,11 +31,5 @@ def render_footer():
31
  """
32
  Render the page footer
33
  """
34
- st.markdown(
35
- """
36
- <div class="footer">
37
- <p>© 2023 Model Capability Leaderboard • Made with Streamlit • Contact: [email protected]</p>
38
- </div>
39
- """,
40
- unsafe_allow_html=True
41
- )
 
31
  """
32
  Render the page footer
33
  """
34
+ # Footer content removed per user request
35
+ pass
 
 
 
src/components/leaderboard.py CHANGED
@@ -4,19 +4,43 @@ Leaderboard table components for the leaderboard application.
4
  import streamlit as st
5
  from src.data.processors import get_model_type_style, get_rank_style
6
 
7
- def render_leaderboard_table(display_df, metric_columns):
8
  """
9
  Render the custom HTML leaderboard table
10
 
11
  Args:
12
  display_df (pandas.DataFrame): The DataFrame with the display data
13
  metric_columns (list): List of metric column names
 
14
  """
15
  from src.components.header import render_section_header
 
16
 
17
  # Display model ranking header without the box
18
  render_section_header("Model Rankings")
19
 
 
 
20
  # Start building the HTML table structure
21
  html_table = """
22
  <div class="fixed-table-container">
@@ -25,12 +49,13 @@ def render_leaderboard_table(display_df, metric_columns):
25
  <thead>
26
  <tr class="header-row">
27
  <th class="fixed-column first-fixed-column" rowspan="2">Rank</th>
28
- <th class="fixed-column second-fixed-column" rowspan="2">Model + Scaffolding</th>
29
- <th class="model-type-cell" rowspan="2">Model Type</th>
30
  """
31
 
32
- # Add the metric header
33
- html_table += f'<th colspan="{len(metric_columns)}" class="metric-header">Margin To Human</th>'
 
34
 
35
  # Continue the table structure
36
  html_table += """
@@ -38,10 +63,13 @@ def render_leaderboard_table(display_df, metric_columns):
38
  <tr class="sub-header">
39
  """
40
 
41
- # Add individual column headers for metrics
42
- for col in metric_columns:
43
- column_class = "overall-cell" if col == "Metric Average" else "metric-cell"
44
- html_table += f'<th class="{column_class}">{col}</th>'
 
 
 
45
 
46
  # Close the header and start the body
47
  html_table += """
@@ -53,13 +81,20 @@ def render_leaderboard_table(display_df, metric_columns):
53
  # Add the data rows
54
  for i, (idx, row) in enumerate(display_df.iterrows()):
55
  # Define background colors to ensure consistency
56
- row_bg = "#0a0a0a" if i % 2 == 0 else "#111111"
 
57
 
58
  # Start the row
59
- html_table += f'<tr class="table-row">'
60
 
61
  # Add Rank with medal styling and consistent background
62
- rank_style = f"background-color: {row_bg};" # Add row background to fixed columns
63
  rank_styles = get_rank_style(row["Rank"])
64
  for style_key, style_value in rank_styles.items():
65
  rank_style += f"{style_key}: {style_value};"
@@ -67,11 +102,11 @@ def render_leaderboard_table(display_df, metric_columns):
67
  html_table += f'<td class="fixed-column first-fixed-column" style="{rank_style}">{row["Rank"]}</td>'
68
 
69
  # Model name fixed column with consistent background
70
- html_table += f'<td class="fixed-column second-fixed-column" title="{row["Model Name"]}" style="background-color: {row_bg}; font-weight: 500; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; text-align: center;">{row["Model Name"]}</td>'
71
 
72
  # Model type cell
73
  model_type = row["Model Type"]
74
- type_style = f"background-color: {row_bg};"
75
  model_type_styles = get_model_type_style(model_type)
76
  for style_key, style_value in model_type_styles.items():
77
  if style_value:
@@ -79,22 +114,30 @@ def render_leaderboard_table(display_df, metric_columns):
79
 
80
  html_table += f'<td class="table-cell model-type-cell" style="{type_style}">{model_type}</td>'
81
 
82
- # Add metric values with minimal styling
83
- for col in metric_columns:
84
- cell_class = "table-cell overall-cell" if col == "Metric Average" else "table-cell metric-cell"
85
- value_text = row[col]
86
-
87
- # Simple styling based on positive/negative values
88
- try:
89
- value = float(str(row[col]).replace(',', ''))
90
- if value > 0:
91
- cell_class += " positive-value"
92
- elif value < 0:
93
- cell_class += " negative-value"
94
- except:
95
- pass
96
 
97
- html_table += f'<td class="{cell_class}" style="background-color: {row_bg};">{value_text}</td>'
 
 
98
 
99
  html_table += "</tr>"
100
 
@@ -106,16 +149,90 @@ def render_leaderboard_table(display_df, metric_columns):
106
  </div>
107
  """
108
 
109
- # Add metric definition below the table
110
- metric_definition = """
111
- <div class="metric-definition">
112
- <h4>Margin to Human</h4>
113
- <p> This metric measures what percentage of the top 1 human-to-baseline performance gap an agent can close on challenging Machine Learning Research Competition problems. For example, if the baseline is 100, top human performance is 200, and the agent scores 110, the agent has closed 10% of the gap between baseline and top human performance. Higher percentages indicate models that more effectively approach top human-level research capabilities.</p>
114
- </div>
 
 
115
  """
 
 
116
 
117
- # Display the custom HTML table and metric definition
118
- st.markdown(html_table + metric_definition, unsafe_allow_html=True)
119
 
120
  def render_empty_state():
121
  """
 
4
  import streamlit as st
5
  from src.data.processors import get_model_type_style, get_rank_style
6
 
7
+ def render_leaderboard_table(display_df, metric_columns, primary_metric):
8
  """
9
  Render the custom HTML leaderboard table
10
 
11
  Args:
12
  display_df (pandas.DataFrame): The DataFrame with the display data
13
  metric_columns (list): List of metric column names
14
+ primary_metric (str): The name of the primary metric
15
  """
16
  from src.components.header import render_section_header
17
+ from src.utils.config import metrics_config
18
 
19
  # Display model ranking header without the box
20
  render_section_header("Model Rankings")
21
 
22
+ # Detect if we have multiple metrics (columns with metric prefixes)
23
+ has_multiple_metrics = any(":" in col for col in metric_columns)
24
+
25
+ # Group columns by metric if multiple metrics are present
26
+ metric_groups = {}
27
+ if has_multiple_metrics:
28
+ # Primary metric columns (no prefix)
29
+ primary_cols = [col for col in metric_columns if ":" not in col]
30
+ metric_groups[primary_metric] = primary_cols
31
+
32
+ # Other metrics
33
+ for col in metric_columns:
34
+ if ":" in col:
35
+ prefix, metric_name = col.split(": ", 1)
36
+ full_metric_name = next((m for m in metrics_config if m.startswith(prefix)), prefix)
37
+ if full_metric_name not in metric_groups:
38
+ metric_groups[full_metric_name] = []
39
+ metric_groups[full_metric_name].append(col)
40
+ else:
41
+ # Single metric
42
+ metric_groups[primary_metric] = metric_columns
43
+
44
  # Start building the HTML table structure
45
  html_table = """
46
  <div class="fixed-table-container">
 
49
  <thead>
50
  <tr class="header-row">
51
  <th class="fixed-column first-fixed-column" rowspan="2">Rank</th>
52
+ <th class="fixed-column second-fixed-column" rowspan="2" style="text-align: center;">Agent</th>
53
+ <th class="model-type-cell" rowspan="2" style="text-align: center;">Model Type</th>
54
  """
55
 
56
+ # Add metric headers for each metric group
57
+ for metric_name, cols in metric_groups.items():
58
+ html_table += f'<th colspan="{len(cols)}" class="metric-header" style="text-align: center;">{metric_name}</th>'
59
 
60
  # Continue the table structure
61
  html_table += """
 
63
  <tr class="sub-header">
64
  """
65
 
66
+ # Add individual column headers for all metrics
67
+ for metric_name, cols in metric_groups.items():
68
+ for col in cols:
69
+ # Extract the actual column name if it has a prefix
70
+ display_name = col.split(": ", 1)[-1] if ":" in col else col
71
+ column_class = "overall-cell" if display_name == "Metric Average" else "metric-cell"
72
+ html_table += f'<th class="{column_class}" style="text-align: center;">{display_name}</th>'
73
 
74
  # Close the header and start the body
75
  html_table += """
 
81
  # Add the data rows
82
  for i, (idx, row) in enumerate(display_df.iterrows()):
83
  # Define background colors to ensure consistency
84
+ # Special background for human row
85
+ is_human_row = row["Agent"] == "Top Human in Competition"
86
+ if is_human_row:
87
+ row_bg = "#2a1e37" # Purple-ish dark background for human row
88
+ row_style = f'style="background-color: {row_bg}; box-shadow: 0 0 5px #f472b6;"'
89
+ else:
90
+ row_bg = "#0a0a0a" if i % 2 == 0 else "#111111"
91
+ row_style = f'style="background-color: {row_bg};"'
92
 
93
  # Start the row
94
+ html_table += f'<tr class="table-row" {row_style}>'
95
 
96
  # Add Rank with medal styling and consistent background
97
+ rank_style = "" # Don't set background at cell level
98
  rank_styles = get_rank_style(row["Rank"])
99
  for style_key, style_value in rank_styles.items():
100
  rank_style += f"{style_key}: {style_value};"
 
102
  html_table += f'<td class="fixed-column first-fixed-column" style="{rank_style}">{row["Rank"]}</td>'
103
 
104
  # Model name fixed column with consistent background
105
+ html_table += f'<td class="fixed-column second-fixed-column" title="{row["Agent"]}" style="font-weight: 500; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; text-align: center;">{row["Agent"]}</td>'
106
 
107
  # Model type cell
108
  model_type = row["Model Type"]
109
+ type_style = f"text-align: center;"
110
  model_type_styles = get_model_type_style(model_type)
111
  for style_key, style_value in model_type_styles.items():
112
  if style_value:
 
114
 
115
  html_table += f'<td class="table-cell model-type-cell" style="{type_style}">{model_type}</td>'
116
 
117
+ # Add metric values with minimal styling for all columns
118
+ all_metric_columns = [col for group in metric_groups.values() for col in group]
119
+ for col in all_metric_columns:
120
+ display_name = col.split(": ", 1)[-1] if ":" in col else col
121
+ cell_class = "table-cell overall-cell" if display_name == "Metric Average" else "table-cell metric-cell"
 
 
122
 
123
+ # Check if column exists in the row (it should)
124
+ if col in row:
125
+ value_text = row[col]
126
+
127
+ # Simple styling based on positive/negative values
128
+ try:
129
+ value = float(str(row[col]).replace(',', ''))
130
+ if value > 0:
131
+ cell_class += " positive-value"
132
+ elif value < 0:
133
+ cell_class += " negative-value"
134
+ except:
135
+ pass
136
+
137
+ html_table += f'<td class="{cell_class}">{value_text}</td>'
138
+ else:
139
+ # If column doesn't exist (shouldn't happen), add empty cell
140
+ html_table += f'<td class="{cell_class}">-</td>'
141
 
142
  html_table += "</tr>"
143
 
 
149
  </div>
150
  """
151
 
152
+ # Add styling for metrics section
153
+ metrics_css = """
154
+ <style>
155
+ .metric-definitions {
156
+ margin-top: 30px;
157
+ padding-top: 20px;
158
+ border-top: 1px solid #333;
159
+ }
160
+ .metric-definition {
161
+ background-color: #1a1a1a;
162
+ border-radius: 8px;
163
+ padding: 12px 16px;
164
+ margin-bottom: 16px;
165
+ }
166
+ .metric-definition h4 {
167
+ margin-top: 0;
168
+ color: #a5b4fc;
169
+ }
170
+ .metric-definition p {
171
+ margin-bottom: 0;
172
+ color: #e2e8f0;
173
+ }
174
+ </style>
175
  """
176
+
177
+ # Build a clean HTML string for the metrics section
178
+ metrics_html = '<div class="metric-definitions">'
179
+
180
+ # Add each metric definition
181
+ for metric_name, metric_info in metrics_config.items():
182
+ metric_description = metric_info.get('description', '')
183
+
184
+ # Special handling for Relative Improvement to Human to show formula
185
+ if metric_name == "Relative Improvement to Human":
186
+ formula_html = """
187
+ <div style="margin: 15px 0;">
188
+ <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
189
+ <div style="background-color: #111; padding: 15px; border-radius: 5px; text-align: center; margin-bottom: 15px;">
190
+ Relative Improvement to Human = max<sub>all runs</sub>((P<sub>agent</sub> - P<sub>baseline</sub>) / (P<sub>human</sub> - P<sub>baseline</sub>)) × 100%
191
+ </div>
192
+ <p style="margin-top: 10px; font-weight: 500;">Where:</p>
193
+ <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
194
+ <li style="margin-bottom: 5px;">P<sub>agent</sub> is the agent's test performance</li>
195
+ <li style="margin-bottom: 5px;">P<sub>baseline</sub> is the baseline test performance</li>
196
+ <li style="margin-bottom: 5px;">P<sub>human</sub> is the human performance benchmark</li>
197
+ <li style="margin-bottom: 5px;">The maximum is taken across all experimental runs for a given task-model pair</li>
198
+ </ul>
199
+ </div>
200
+ """
201
+
202
+ # Add the metric definition with the formula
203
+ metrics_html += f'<div class="metric-definition"><h4>{metric_name}</h4><p>{metric_description}</p>{formula_html}</div>'
204
+ # Special handling for Absolute Improvement to Baseline to show formula
205
+ elif metric_name == "Absolute Improvement to Baseline":
206
+ formula_html = """
207
+ <div style="margin: 15px 0;">
208
+ <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
209
+ <div style="background-color: #111; padding: 15px; border-radius: 5px; text-align: center; margin-bottom: 15px;">
210
+ Absolute Improvement to Baseline = max<sub>all runs</sub>((P<sub>agent</sub> - P<sub>baseline</sub>) / P<sub>baseline</sub>) × 100%
211
+ </div>
212
+ <p style="margin-top: 10px; font-weight: 500;">Where:</p>
213
+ <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
214
+ <li style="margin-bottom: 5px;">P<sub>agent</sub> is the agent's test performance</li>
215
+ <li style="margin-bottom: 5px;">P<sub>baseline</sub> is the baseline test performance</li>
216
+ <li style="margin-bottom: 5px;">The maximum is taken across all experimental runs for a given task-model pair</li>
217
+ </ul>
218
+ </div>
219
+ """
220
+
221
+ # Add the metric definition with the formula
222
+ metrics_html += f'<div class="metric-definition"><h4>{metric_name}</h4><p>{metric_description}</p>{formula_html}</div>'
223
+ else:
224
+ # Regular metric without formula
225
+ metrics_html += f'<div class="metric-definition"><h4>{metric_name}</h4><p>{metric_description}</p></div>'
226
+
227
+ # Close the metric definitions container
228
+ metrics_html += '</div>'
229
+
230
+ # Display the styling and HTML separately for maximum control
231
+ st.markdown(html_table, unsafe_allow_html=True)
232
+ st.markdown(metrics_css, unsafe_allow_html=True)
233
 
234
+ # Render the metrics definitions
235
+ st.markdown(metrics_html, unsafe_allow_html=True)
236
 
237
  def render_empty_state():
238
  """
src/components/tasks.py CHANGED
@@ -3,6 +3,7 @@ Task description components for the leaderboard application.
3
  """
4
  import streamlit as st
5
  from src.utils.config import tasks_info
 
6
 
7
  def render_task_descriptions():
8
  """
@@ -51,8 +52,8 @@ def render_task_descriptions():
51
  </div>
52
  """, unsafe_allow_html=True)
53
 
54
- # Task links mapping
55
- task_links = {
56
  "Backdoor Trigger Recovery": "https://www.llmagentsafetycomp24.com/tracks/#backdoor_model",
57
  "Machine Unlearning": "https://unlearning-challenge.github.io/",
58
  "Perception Temporal Action Loc": "https://ptchallenge-workshop.github.io",
@@ -60,6 +61,9 @@ def render_task_descriptions():
60
  "Meta Learning": "https://metalearning.chalearn.org/",
61
  "Llm Merging": "https://llm-merging.github.io"
62
  }
 
 
 
63
 
64
  # Create two columns
65
  col1, col2 = st.columns(2)
@@ -73,9 +77,8 @@ def render_task_descriptions():
73
  link = task_links.get(task, "#")
74
  st.markdown(f"""
75
  <a href="{link}" target="_blank" style="text-decoration: none; color: inherit;">
76
- <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
77
- <div class="task-title">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
78
- <div class="task-description">{description}</div>
79
  </div>
80
  </a>
81
  """, unsafe_allow_html=True)
@@ -85,9 +88,8 @@ def render_task_descriptions():
85
  link = task_links.get(task, "#")
86
  st.markdown(f"""
87
  <a href="{link}" target="_blank" style="text-decoration: none; color: inherit;">
88
- <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
89
- <div class="task-title">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
90
- <div class="task-description">{description}</div>
91
  </div>
92
  </a>
93
  """, unsafe_allow_html=True)
 
3
  """
4
  import streamlit as st
5
  from src.utils.config import tasks_info
6
+ from src.utils.task_mapping import get_display_name, get_original_name
7
 
8
  def render_task_descriptions():
9
  """
 
52
  </div>
53
  """, unsafe_allow_html=True)
54
 
55
+ # Task links mapping - using original task names
56
+ original_task_links = {
57
  "Backdoor Trigger Recovery": "https://www.llmagentsafetycomp24.com/tracks/#backdoor_model",
58
  "Machine Unlearning": "https://unlearning-challenge.github.io/",
59
  "Perception Temporal Action Loc": "https://ptchallenge-workshop.github.io",
 
61
  "Meta Learning": "https://metalearning.chalearn.org/",
62
  "Llm Merging": "https://llm-merging.github.io"
63
  }
64
+
65
+ # Update links mapping to use display names as keys
66
+ task_links = {get_display_name(task): link for task, link in original_task_links.items()}
67
 
68
  # Create two columns
69
  col1, col2 = st.columns(2)
 
77
  link = task_links.get(task, "#")
78
  st.markdown(f"""
79
  <a href="{link}" target="_blank" style="text-decoration: none; color: inherit;">
80
+ <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s; padding: 12px; margin-bottom: 15px; height: auto;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
81
+ <div class="task-title" style="text-align: center;">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
 
82
  </div>
83
  </a>
84
  """, unsafe_allow_html=True)
 
88
  link = task_links.get(task, "#")
89
  st.markdown(f"""
90
  <a href="{link}" target="_blank" style="text-decoration: none; color: inherit;">
91
+ <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s; padding: 12px; margin-bottom: 15px; height: auto;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
92
+ <div class="task-title" style="text-align: center;">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
 
93
  </div>
94
  </a>
95
  """, unsafe_allow_html=True)
src/data/metrics/absolute_improvement_to_baseline.json ADDED
@@ -0,0 +1,56 @@
 
 
1
+ {
2
+ "perception_temporal_action_loc": {
3
+ "MLAB (claude-3-5-sonnet-v2)": 2.222443094482299,
4
+ "Top Human in Competition": 284.55703321316366,
5
+ "MLAB (gemini-exp-1206)": -1.34633272895098,
6
+ "MLAB (o3-mini)": 0.8724822663469414,
7
+ "MLAB (gpt-4o)": 0.9384906166574135,
8
+ "MLAB (llama3-1-405b-instruct)": 1.474927454740455,
9
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 0.9888962417416385
10
+ },
11
+ "llm-merging": {
12
+ "CoI-Agent (o1) + MLAB (gpt-4o)": -0.6756756689645764,
13
+ "Top Human in Competition": 68.24324325461103,
14
+ "MLAB (claude-3-5-sonnet-v2)": 3.3783783853634035,
15
+ "MLAB (gemini-exp-1206)": 3.3783783853634035,
16
+ "MLAB (o3-mini)": -0.6756756689645764,
17
+ "MLAB (gpt-4o)": 1.3513513581994137,
18
+ "MLAB (llama3-1-405b-instruct)": -0.6756756689645764
19
+ },
20
+ "meta-learning": {
21
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 5.424978139166417,
22
+ "Top Human in Competition": 304.53435579895256,
23
+ "MLAB (claude-3-5-sonnet-v2)": 5.424978139166417,
24
+ "MLAB (gemini-exp-1206)": 5.424978139166417,
25
+ "MLAB (o3-mini)": -14.923192223926499,
26
+ "MLAB (gpt-4o)": 5.424978139166417,
27
+ "MLAB (llama3-1-405b-instruct)": 5.424978139166417
28
+ },
29
+ "product-recommendation": {
30
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6021227441680528,
31
+ "Top Human in Competition": 412.59793394031675,
32
+ "MLAB (claude-3-5-sonnet-v2)": 12.283606772997718,
33
+ "MLAB (gemini-exp-1206)": 0.6021227441680528,
34
+ "MLAB (o3-mini)": 0.6035316323448103,
35
+ "MLAB (gpt-4o)": 2.6400767209619422,
36
+ "MLAB (llama3-1-405b-instruct)": -2.9066701147102995e-09
37
+ },
38
+ "machine_unlearning": {
39
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 7.318484292638537,
40
+ "Top Human in Competition": 61.85258904854873,
41
+ "MLAB (claude-3-5-sonnet-v2)": -58.58540153334969,
42
+ "MLAB (gemini-exp-1206)": 3.4837676447981045,
43
+ "MLAB (o3-mini)": 2.2414490971518704,
44
+ "MLAB (gpt-4o)": -11.131587250139926,
45
+ "MLAB (llama3-1-405b-instruct)": 3.8409541040677597
46
+ },
47
+ "backdoor-trigger-recovery": {
48
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 38.252918051116,
49
+ "Top Human in Competition": 621.2635313337943,
50
+ "MLAB (claude-3-5-sonnet-v2)": 247.90785034564928,
51
+ "MLAB (gemini-exp-1206)": 80.40937239150493,
52
+ "MLAB (o3-mini)": 38.75953643366491,
53
+ "MLAB (gpt-4o)": 64.52832837042699,
54
+ "MLAB (llama3-1-405b-instruct)": 71.70765816958271
55
+ }
56
+ }
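
For orientation, each metric file maps raw task keys to per-agent scores. Assuming load_metric_data simply parses the JSON, the frame that process_data receives looks like this sketch:

```python
import json
import pandas as pd

# Assumption: load_metric_data returns the parsed JSON shown above.
with open("src/data/metrics/absolute_improvement_to_baseline.json") as f:
    metric_data = json.load(f)

df = pd.DataFrame(metric_data)  # columns = raw task keys, index = agent names
print(df.loc["Top Human in Competition", "llm-merging"])  # 68.24324325461103
```
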
src/data/metrics/{margin_to_human.json → relative_improvement_to_human.json} RENAMED
@@ -1,6 +1,7 @@
1
  {
2
  "perception_temporal_action_loc": {
3
  "MLAB (claude-3-5-sonnet-v2)": 0.7810185077440877,
 
4
  "MLAB (gemini-exp-1206)": -0.4731328246392113,
5
  "MLAB (o3-mini)": 0.3066106841553126,
6
  "MLAB (gpt-4o)": 0.3298075630252947,
@@ -9,6 +10,7 @@
9
  },
10
  "llm-merging": {
11
  "CoI-Agent (o1) + MLAB (gpt-4o)": -0.9900989999019761,
 
12
  "MLAB (claude-3-5-sonnet-v2)": 4.950495058915793,
13
  "MLAB (gemini-exp-1206)": 4.950495058915793,
14
  "MLAB (o3-mini)": -0.9900989999019761,
@@ -17,6 +19,7 @@
17
  },
18
  "meta-learning": {
19
  "CoI-Agent (o1) + MLAB (gpt-4o)": 1.781401026144938,
 
20
  "MLAB (claude-3-5-sonnet-v2)": 1.781401026144938,
21
  "MLAB (gemini-exp-1206)": 1.781401026144938,
22
  "MLAB (o3-mini)": -4.900331256476853,
@@ -25,6 +28,7 @@
25
  },
26
  "product-recommendation": {
27
  "CoI-Agent (o1) + MLAB (gpt-4o)": 0.1459345029718814,
 
28
  "MLAB (claude-3-5-sonnet-v2)": 2.9771372473170388,
29
  "MLAB (gemini-exp-1206)": 0.1459345029718814,
30
  "MLAB (o3-mini)": 0.1462759705510577,
@@ -33,6 +37,7 @@
33
  },
34
  "machine_unlearning": {
35
  "CoI-Agent (o1) + MLAB (gpt-4o)": 11.832138969791846,
 
36
  "MLAB (claude-3-5-sonnet-v2)": -94.71778374121965,
37
  "MLAB (gemini-exp-1206)": 5.632371576335568,
38
  "MLAB (o3-mini)": 3.623856546073656,
@@ -41,6 +46,7 @@
41
  },
42
  "backdoor-trigger-recovery": {
43
  "CoI-Agent (o1) + MLAB (gpt-4o)": 6.1572772457753295,
 
44
  "MLAB (claude-3-5-sonnet-v2)": 39.903815022493674,
45
  "MLAB (gemini-exp-1206)": 12.94287662739089,
46
  "MLAB (o3-mini)": 6.238823700218141,
 
1
  {
2
  "perception_temporal_action_loc": {
3
  "MLAB (claude-3-5-sonnet-v2)": 0.7810185077440877,
4
+ "Top Human in Competition": 100.0,
5
  "MLAB (gemini-exp-1206)": -0.4731328246392113,
6
  "MLAB (o3-mini)": 0.3066106841553126,
7
  "MLAB (gpt-4o)": 0.3298075630252947,
 
10
  },
11
  "llm-merging": {
12
  "CoI-Agent (o1) + MLAB (gpt-4o)": -0.9900989999019761,
13
+ "Top Human in Competition": 100.0,
14
  "MLAB (claude-3-5-sonnet-v2)": 4.950495058915793,
15
  "MLAB (gemini-exp-1206)": 4.950495058915793,
16
  "MLAB (o3-mini)": -0.9900989999019761,
 
19
  },
20
  "meta-learning": {
21
  "CoI-Agent (o1) + MLAB (gpt-4o)": 1.781401026144938,
22
+ "Top Human in Competition": 100.0,
23
  "MLAB (claude-3-5-sonnet-v2)": 1.781401026144938,
24
  "MLAB (gemini-exp-1206)": 1.781401026144938,
25
  "MLAB (o3-mini)": -4.900331256476853,
 
28
  },
29
  "product-recommendation": {
30
  "CoI-Agent (o1) + MLAB (gpt-4o)": 0.1459345029718814,
31
+ "Top Human in Competition": 100.0,
32
  "MLAB (claude-3-5-sonnet-v2)": 2.9771372473170388,
33
  "MLAB (gemini-exp-1206)": 0.1459345029718814,
34
  "MLAB (o3-mini)": 0.1462759705510577,
 
37
  },
38
  "machine_unlearning": {
39
  "CoI-Agent (o1) + MLAB (gpt-4o)": 11.832138969791846,
40
+ "Top Human in Competition": 100.0,
41
  "MLAB (claude-3-5-sonnet-v2)": -94.71778374121965,
42
  "MLAB (gemini-exp-1206)": 5.632371576335568,
43
  "MLAB (o3-mini)": 3.623856546073656,
 
46
  },
47
  "backdoor-trigger-recovery": {
48
  "CoI-Agent (o1) + MLAB (gpt-4o)": 6.1572772457753295,
49
+ "Top Human in Competition": 100.0,
50
  "MLAB (claude-3-5-sonnet-v2)": 39.903815022493674,
51
  "MLAB (gemini-exp-1206)": 12.94287662739089,
52
  "MLAB (o3-mini)": 6.238823700218141,
src/data/processors.py CHANGED
@@ -42,6 +42,8 @@ def get_model_type_style(model_type):
42
  return {'color': '#93c5fd'} # Brighter blue
43
  elif model_type == "Closed Source":
44
  return {'color': '#cbd5e1'} # Lighter gray
 
 
45
  else:
46
  return {'color': ''}
47
 
@@ -61,6 +63,8 @@ def get_rank_style(rank):
61
  return {'color': 'silver', 'font-weight': '700', 'font-size': '16px'}
62
  elif "🥉" in str(rank):
63
  return {'color': '#cd7f32', 'font-weight': '700', 'font-size': '16px'}
 
 
64
  else:
65
  return {}
66
 
 
42
  return {'color': '#93c5fd'} # Brighter blue
43
  elif model_type == "Closed Source":
44
  return {'color': '#cbd5e1'} # Lighter gray
45
+ elif model_type == "Human":
46
+ return {'color': '#f472b6', 'font-weight': '600'} # Pink with emphasis for Human
47
  else:
48
  return {'color': ''}
49
 
 
63
  return {'color': 'silver', 'font-weight': '700', 'font-size': '16px'}
64
  elif "🥉" in str(rank):
65
  return {'color': '#cd7f32', 'font-weight': '700', 'font-size': '16px'}
66
+ elif str(rank) == "-":
67
+ return {'color': '#f472b6', 'font-style': 'italic'} # Style for non-ranked (human)
68
  else:
69
  return {}
70
 
src/utils/config.py CHANGED
@@ -1,5 +1,8 @@
1
  # Theme and configuration settings for the Model Capability Leaderboard application
2
 
 
 
 
3
  # Theme colors - using dark mode by default
4
  dark_theme = {
5
  'bg_color': '#1a202c',
@@ -46,12 +49,19 @@ app_config = {
46
 
47
  # Metrics configuration
48
  metrics_config = {
49
- "Margin to Human": {
50
- "file": "src/data/metrics/margin_to_human.json",
51
- "description": "Performance on Machine Learning Research Challenges. Higher values indicate better research capabilities.",
52
  "min_value": -100, # Approximate, adjust as needed
53
  "max_value": 50, # Approximate, adjust as needed
54
  "color_map": "RdYlGn"
 
 
 
55
  }
56
  # Future metrics can be added here
57
  # "Another Metric": {
@@ -70,16 +80,24 @@ model_categories = {
70
  "MLAB (o3-mini)": "Closed Source",
71
  "MLAB (gpt-4o)": "Closed Source",
72
  "MLAB (llama3-1-405b-instruct)": "Open Weights",
73
- "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source"
 
 
74
  # More models would be added here as needed
75
  }
76
 
77
  # Task descriptions
78
  tasks_info = {
79
- "Perception Temporal Action Loc": "Testing the model's ability to understand and localize actions within temporal sequences of events.",
80
- "Llm Merging": "Assessing the capability to effectively merge knowledge from multiple language models.",
81
- "Meta Learning": "Evaluating the model's ability to learn how to learn - adapting quickly to new tasks.",
82
- "Product Recommendation": "Testing the model's ability to recommend relevant products based on user preferences and behavior.",
83
- "Machine Unlearning": "Evaluating how well models can 'unlearn' specific information when required.",
84
- "Backdoor Trigger Recovery": "Testing resilience against backdoor attacks and ability to recover from triggered behaviors."
 
 
85
  }
 
1
  # Theme and configuration settings for the Model Capability Leaderboard application
2
 
3
+ # Import task mapping
4
+ from src.utils.task_mapping import task_display_names
5
+
6
  # Theme colors - using dark mode by default
7
  dark_theme = {
8
  'bg_color': '#1a202c',
 
49
 
50
  # Metrics configuration
51
  metrics_config = {
52
+ "Relative Improvement to Human": {
53
+ "file": "src/data/metrics/relative_improvement_to_human.json",
54
+ "description": "Measures how much of the performance gap between baseline and human the agent has closed. Calculated as: (Agent performance - Baseline) / (Human - Baseline) × 100%.",
55
  "min_value": -100, # Approximate, adjust as needed
56
  "max_value": 50, # Approximate, adjust as needed
57
  "color_map": "RdYlGn"
58
+ },
59
+ "Absolute Improvement to Baseline": {
60
+ "file": "src/data/metrics/absolute_improvement_to_baseline.json",
61
+ "description": "Measures the percentage improvement over the baseline performance. Calculated as: (Agent performance - Baseline) / Baseline × 100%.",
62
+ "min_value": -100, # Approximate, adjust as needed
63
+ "max_value": 100, # Approximate, adjust as needed
64
+ "color_map": "RdYlGn"
65
  }
66
  # Future metrics can be added here
67
  # "Another Metric": {
 
80
  "MLAB (o3-mini)": "Closed Source",
81
  "MLAB (gpt-4o)": "Closed Source",
82
  "MLAB (llama3-1-405b-instruct)": "Open Weights",
83
+ "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
84
+ "Human": "Human",
85
+ "Top Human in Competition": "Human"
86
  # More models would be added here as needed
87
  }
88
 
89
  # Task descriptions
90
  tasks_info = {
91
+ task_display_names.get("Perception Temporal Action Loc", "Temporal Action Localisation"):
92
+ "Testing the model's ability to understand and localize actions within temporal sequences of events.",
93
+ task_display_names.get("Llm Merging", "LLM Merging"):
94
+ "Assessing the capability to effectively merge knowledge from multiple language models.",
95
+ task_display_names.get("Meta Learning", "Meta Learning"):
96
+ "Evaluating the model's ability to learn how to learn - adapting quickly to new tasks.",
97
+ task_display_names.get("Product Recommendation", "Next Product Recommendation"):
98
+ "Testing the model's ability to recommend relevant products based on user preferences and behavior.",
99
+ task_display_names.get("Machine Unlearning", "Machine Unlearning"):
100
+ "Evaluating how well models can 'unlearn' specific information when required.",
101
+ task_display_names.get("Backdoor Trigger Recovery", "Backdoor Trigger Recovery"):
102
+ "Testing resilience against backdoor attacks and ability to recover from triggered behaviors."
103
  }
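
As a cross-reference, these two config entries are consumed in src/app.py by loading each metric's file and processing it, roughly:

```python
# Mirrors the per-metric calls made in src/app.py.
from src.utils.config import metrics_config
from src.utils.data_loader import load_metric_data, process_data

for metric_name, info in metrics_config.items():
    metric_df = process_data(load_metric_data(info["file"]))
    print(metric_name, info["file"], metric_df.shape)
```
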
src/utils/data_loader.py CHANGED
@@ -4,6 +4,7 @@ Data loading and processing utilities for the leaderboard application.
4
  import pandas as pd
5
  import json
6
  from src.utils.config import model_categories
 
7
 
8
  def load_metric_data(file_path):
9
  """
@@ -56,8 +57,13 @@ def process_data(metric_data):
56
  # Replace NaN values with '-'
57
  df.fillna('-', inplace=True)
58
 
59
- # Rename the columns to more readable format
60
- df.columns = [task.replace("-", " ").replace("_", " ").title() for task in df.columns]
 
 
61
 
62
  # Add a model type column to the dataframe
63
  df['Model Type'] = df.index.map(lambda x: model_categories.get(x, "Unknown"))
@@ -108,14 +114,26 @@ def filter_and_prepare_data(df, selected_tasks, selected_model_types):
108
  selected_tasks_df = filtered_df[selected_tasks]
109
  filtered_df['Selected Overall'] = selected_tasks_df.mean(axis=1)
110
 
111
- # Sort by Selected Overall and add rank
112
- filtered_df = filtered_df.sort_values('Selected Overall', ascending=False)
113
- filtered_df.insert(0, 'Rank', range(1, len(filtered_df) + 1))
 
 
114
 
115
  # Add a Model Name column that shows the index (actual model name)
116
- filtered_df['Model Name'] = filtered_df.index
117
 
118
- return filtered_df
119
 
120
  def format_display_dataframe(filtered_df, selected_tasks):
121
  """
@@ -135,13 +153,16 @@ def format_display_dataframe(filtered_df, selected_tasks):
135
  medal_ranks = {1: "🥇 1", 2: "🥈 2", 3: "🥉 3"}
136
  display_df['Rank'] = display_df['Rank'].apply(lambda x: medal_ranks.get(x, str(x)))
137
 
 
 
 
138
  # Add metrics columns (Selected Overall and individual tasks)
139
  metric_columns = ['Selected Overall'] + selected_tasks
140
  for col in metric_columns:
141
  if col in filtered_df.columns:
142
- # Format numeric columns to 3 decimal places
143
  if filtered_df[col].dtype in ['float64', 'float32']:
144
- display_df[col] = filtered_df[col].apply(lambda x: f"{x:.3f}" if isinstance(x, (int, float)) else x)
145
  else:
146
  display_df[col] = filtered_df[col]
147
 
 
4
  import pandas as pd
5
  import json
6
  from src.utils.config import model_categories
7
+ from src.utils.task_mapping import get_display_name
8
 
9
  def load_metric_data(file_path):
10
  """
 
57
  # Replace NaN values with '-'
58
  df.fillna('-', inplace=True)
59
 
60
+ # First convert raw task names to standard format (spaces instead of hyphens/underscores)
61
+ standardized_columns = [task.replace("-", " ").replace("_", " ").title() for task in df.columns]
62
+ df.columns = standardized_columns
63
+
64
+ # Then apply our display name mapping
65
+ display_name_columns = {col: get_display_name(col) for col in df.columns}
66
+ df = df.rename(columns=display_name_columns)
67
 
68
  # Add a model type column to the dataframe
69
  df['Model Type'] = df.index.map(lambda x: model_categories.get(x, "Unknown"))
 
114
  selected_tasks_df = filtered_df[selected_tasks]
115
  filtered_df['Selected Overall'] = selected_tasks_df.mean(axis=1)
116
 
117
+ # Separate human entries from other models for ranking
118
+ is_human = filtered_df['Model Type'] == 'Human'
119
+ human_df = filtered_df[is_human]
120
+ non_human_df = filtered_df[~is_human]
121
+
122
+ # Sort non-human models by Selected Overall and add rank
123
+ non_human_df = non_human_df.sort_values('Selected Overall', ascending=False)
124
+ non_human_df.insert(0, 'Rank', range(1, len(non_human_df) + 1))
125
+
126
+ # Add rank for human (use '-' to indicate not ranked)
127
+ human_df.insert(0, 'Rank', '-')
128
+
129
+ # Combine dataframes - put humans at appropriate position based on score
130
+ combined_df = pd.concat([non_human_df, human_df])
131
+ combined_df = combined_df.sort_values('Selected Overall', ascending=False)
132
 
133
  # Add a Model Name column that shows the index (actual model name)
134
+ combined_df['Model Name'] = combined_df.index
135
 
136
+ return combined_df
137
 
138
  def format_display_dataframe(filtered_df, selected_tasks):
139
  """
 
153
  medal_ranks = {1: "🥇 1", 2: "🥈 2", 3: "🥉 3"}
154
  display_df['Rank'] = display_df['Rank'].apply(lambda x: medal_ranks.get(x, str(x)))
155
 
156
+ # Rename 'Model Name' to 'Agent'
157
+ display_df = display_df.rename(columns={"Model Name": "Agent"})
158
+
159
  # Add metrics columns (Selected Overall and individual tasks)
160
  metric_columns = ['Selected Overall'] + selected_tasks
161
  for col in metric_columns:
162
  if col in filtered_df.columns:
163
+ # Format numeric columns to 1 decimal place
164
  if filtered_df[col].dtype in ['float64', 'float32']:
165
+ display_df[col] = filtered_df[col].apply(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
166
  else:
167
  display_df[col] = filtered_df[col]
168
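
Reviewer note: the ranking change above keeps human rows out of the numeric ranking but still orders them by score. A toy sketch of the behaviour (values are made up):

```python
import pandas as pd

df = pd.DataFrame(
    {"Selected Overall": [12.0, 100.0, 3.5],
     "Model Type": ["Closed Source", "Human", "Open Weights"]},
    index=["MLAB (claude-3-5-sonnet-v2)", "Top Human in Competition", "MLAB (gpt-4o)"],
)

is_human = df["Model Type"] == "Human"
ranked = df[~is_human].sort_values("Selected Overall", ascending=False)
ranked.insert(0, "Rank", range(1, len(ranked) + 1))
humans = df[is_human].copy()
humans.insert(0, "Rank", "-")  # humans are shown but not ranked

combined = pd.concat([ranked, humans]).sort_values("Selected Overall", ascending=False)
print(combined["Rank"].tolist())  # ['-', 1, 2] -> human row first, agents ranked below
```
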
 
src/utils/task_mapping.py ADDED
@@ -0,0 +1,39 @@
 
 
1
+ """
2
+ Task name mapping utilities for displaying user-friendly task names.
3
+ """
4
+
5
+ # Mapping from original task names to display names
6
+ task_display_names = {
7
+ "Perception Temporal Action Loc": "Temporal Action Localisation",
8
+ "Llm Merging": "LLM Merging",
9
+ "Meta Learning": "Meta Learning",
10
+ "Product Recommendation": "Next Product Recommendation",
11
+ "Machine Unlearning": "Machine Unlearning",
12
+ "Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
13
+ }
14
+
15
+ def get_display_name(task_name):
16
+ """
17
+ Get the display name for a task
18
+
19
+ Args:
20
+ task_name (str): The original task name
21
+
22
+ Returns:
23
+ str: The display name for the task
24
+ """
25
+ return task_display_names.get(task_name, task_name)
26
+
27
+ def get_original_name(display_name):
28
+ """
29
+ Get the original task name for a display name
30
+
31
+ Args:
32
+ display_name (str): The display name
33
+
34
+ Returns:
35
+ str: The original task name
36
+ """
37
+ # Create a reverse mapping
38
+ reverse_mapping = {v: k for k, v in task_display_names.items()}
39
+ return reverse_mapping.get(display_name, display_name)
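
Usage sketch for the new helpers (assuming the package is importable from the project root):

```python
from src.utils.task_mapping import get_display_name, get_original_name

print(get_display_name("Perception Temporal Action Loc"))  # Temporal Action Localisation
print(get_original_name("Next Product Recommendation"))    # Product Recommendation
print(get_display_name("Some Unmapped Task"))              # falls back to the input unchanged
```
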