Commit 06d4ee9
Parent(s): 697ae1d
Updating table
- Assests/MLRC_Bench_overview.png +0 -0
- src/app.py +74 -10
- src/components/filters.py +71 -22
- src/components/header.py +2 -8
- src/components/leaderboard.py +154 -37
- src/components/tasks.py +10 -8
- src/data/metrics/absolute_improvement_to_baseline.json +56 -0
- src/data/metrics/{margin_to_human.json → relative_improvement_to_human.json} +6 -0
- src/data/processors.py +4 -0
- src/utils/config.py +28 -10
- src/utils/data_loader.py +30 -9
- src/utils/task_mapping.py +39 -0
Assests/MLRC_Bench_overview.png
CHANGED
src/app.py
CHANGED
@@ -18,7 +18,7 @@ from src.utils.data_loader import (
 from src.styles.base import load_all_styles

 # Import components
-from src.components.header import render_page_header
+from src.components.header import render_page_header
 from src.components.filters import (
     initialize_session_state,
     render_metric_selection,
@@ -40,6 +40,37 @@ def setup_page():

     # Load all styles
     load_all_styles()
+
+    # Force dark mode using custom CSS
+    st.markdown("""
+    <style>
+    /* Force dark mode regardless of browser settings */
+    .stApp {
+        background-color: #1a202c !important;
+        color: #e2e8f0 !important;
+    }
+    /* Override Streamlit's default styling to ensure dark mode */
+    .stTextInput, .stSelectbox, .stMultiselect {
+        background-color: #2d3748 !important;
+        color: #e2e8f0 !important;
+    }
+    .stButton>button {
+        background-color: #4a5568 !important;
+        color: #e2e8f0 !important;
+    }
+    /* Override header and text colors */
+    h1, h2, h3, h4, h5, h6, p, span, div {
+        color: #e2e8f0 !important;
+    }
+    /* Ensure tab styling is consistent */
+    .stTabs [data-baseweb="tab-list"] {
+        background-color: #1a202c !important;
+    }
+    .stTabs [data-baseweb="tab"] {
+        color: #e2e8f0 !important;
+    }
+    </style>
+    """, unsafe_allow_html=True)

 def main():
     """
@@ -51,9 +82,9 @@ def main():
     # Render header
     render_page_header()

-    # Load data
-    metric_data = load_metric_data(metrics_config[
+    # Load primary metric data (first metric in config)
+    primary_metric = list(metrics_config.keys())[0]
+    metric_data = load_metric_data(metrics_config[primary_metric]["file"])
     df = process_data(metric_data)

     # Initialize session state
@@ -65,20 +96,54 @@ def main():
     # Tab 1: Leaderboard
     with tabs[0]:
         # Render filter components
+        selected_metrics = render_metric_selection()
+
+        # Continue with other filters
         selected_tasks = render_task_selection(df)
         selected_model_types = render_model_type_selection(df)

         # Render leaderboard if selections are valid
         if selected_tasks and selected_model_types:
+            # Load the primary metric data first (always the first in selected_metrics)
+            primary_metric = selected_metrics[0]
+            primary_metric_data = load_metric_data(metrics_config[primary_metric]["file"])
+            primary_df = process_data(primary_metric_data)
+
+            # Filter and prepare data for primary metric
+            filtered_df = filter_and_prepare_data(primary_df, selected_tasks, selected_model_types)

             # Format data for display
             display_df, metric_columns = format_display_dataframe(filtered_df, selected_tasks)

+            # If additional metrics are selected, add their data too
+            all_metric_columns = metric_columns.copy()
+
+            for metric in selected_metrics[1:]:
+                metric_info = metrics_config[metric]
+                metric_data = load_metric_data(metric_info["file"])
+                metric_df = process_data(metric_data)
+
+                # Process and merge the additional metric data
+                metric_filtered_df = filter_and_prepare_data(metric_df, selected_tasks, selected_model_types)
+                metric_display_df, _ = format_display_dataframe(metric_filtered_df, selected_tasks)
+
+                # Create a meaningful prefix for this metric
+                if metric == "Absolute Improvement to Baseline":
+                    prefix = "Abs"
+                else:
+                    # Use first word of each part of the metric name
+                    prefix = "".join([word[0] for word in metric.split()]).upper()
+
+                # Combine the dataframes - keep only metric columns from metric_display_df
+                for col in metric_columns:
+                    if col in metric_display_df.columns:
+                        # Add columns with metric prefix
+                        display_df[f"{prefix}: {col}"] = metric_display_df[col]
+                        # Add to the list of all metric columns
+                        all_metric_columns.append(f"{prefix}: {col}")
+
             # Render the leaderboard table
-            render_leaderboard_table(display_df,
+            render_leaderboard_table(display_df, all_metric_columns, primary_metric)
         else:
             # Show empty state
             render_empty_state()
@@ -88,8 +153,7 @@ def main():
     # Render task descriptions
     render_task_descriptions()

-    render_footer()
+    # Footer removed per user request

 if __name__ == "__main__":
     main()
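The prefixing behaviour introduced above is easiest to see in isolation. A minimal sketch of the rule, with illustrative metric and column names only (this is not code from the commit):

```python
# Sketch of the prefix rule main() applies to secondary-metric columns.
def metric_prefix(metric: str) -> str:
    if metric == "Absolute Improvement to Baseline":
        return "Abs"  # fixed short prefix, as in the diff above
    # otherwise: first letter of each word of the metric name, upper-cased
    return "".join(word[0] for word in metric.split()).upper()

print(metric_prefix("Absolute Improvement to Baseline"))  # Abs
# A secondary-metric column such as "LLM Merging" is then stored in display_df
# as "Abs: LLM Merging" and appended to all_metric_columns.
```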
src/components/filters.py
CHANGED
@@ -12,12 +12,14 @@ def initialize_session_state(df):
         df (pandas.DataFrame): The DataFrame with model data
     """
     # Initialize session states
-    if '
+    if 'selected_metrics' not in st.session_state:
+        # Start with the first metric always selected
+        primary_metric = list(metrics_config.keys())[0]
+        st.session_state.selected_metrics = [primary_metric]

     if 'selected_tasks' not in st.session_state:
-        st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
+        # Select all tasks by default, excluding Model Type
+        st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]

     if 'selected_model_types' not in st.session_state:
         # Ensure all model types are selected by default
@@ -28,23 +30,43 @@ def render_metric_selection():
     Render the metric selection component

     Returns:
+        list: Selected metrics
     """
-    st.markdown("### Select
+    st.markdown("### Select Metrics")

+    # Get metric names
+    all_metrics = list(metrics_config.keys())
+    primary_metric = all_metrics[0]  # First metric is primary
+    secondary_metrics = all_metrics[1:]  # Rest are secondary
+
+    # Always select the primary metric
+    if primary_metric not in st.session_state.selected_metrics:
+        st.session_state.selected_metrics.append(primary_metric)
+
+    # Create columns based on number of metrics
+    num_cols = len(all_metrics)
+    cols = st.columns(num_cols)
+
+    # Primary metric first (always selected and can't be deselected)
+    with cols[0]:
+        button_label = f"✓ {primary_metric}"
+        st.button(button_label, key=f"metric_{primary_metric}", type="primary", disabled=True)
+
+    # Secondary metrics that can be toggled
+    for i, metric in enumerate(secondary_metrics):
+        with cols[i+1]:
+            is_selected = metric in st.session_state.selected_metrics
             button_label = f"✓ {metric}" if is_selected else metric
             button_type = "primary" if is_selected else "secondary"

             if st.button(button_label, key=f"metric_{metric}", type=button_type):
+                if is_selected:
+                    st.session_state.selected_metrics.remove(metric)
+                else:
+                    st.session_state.selected_metrics.append(metric)
                 st.rerun() # Force UI update

-    return st.session_state.
+    return st.session_state.selected_metrics

 def render_task_selection(df):
     """
@@ -61,14 +83,33 @@ def render_task_selection(df):
     # Extract task columns (exclude Model Type and Overall)
     all_tasks = [col for col in df.columns if col not in ['Model Type']]

-    num_cols = 3
-    task_rows = [all_tasks[i:i+num_cols] for i in range(0, len(all_tasks), num_cols)]
+    # Determine number of columns based on screen width
+    num_cols = 3 # Default for medium screens
+
+    # Create task buttons in a fixed number of columns with balanced width
+    task_groups = [all_tasks[i:i+num_cols] for i in range(0, len(all_tasks), num_cols)]
+
+    # Custom CSS for button styling
+    st.markdown("""
+    <style>
+    /* Make buttons same width in their columns and centered */
+    .stButton > button {
+        width: 100%;
+        max-width: 300px;
+        margin: 0 auto;
+        display: block;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+
+    # Display buttons in rows
+    for group in task_groups:
+        # Create columns with equal width
         cols = st.columns(num_cols)
+
+        # Add buttons to each column
+        for i, task in enumerate(group):
+            if i < len(cols): # Ensure we don't exceed available columns
                 with cols[i]:
                     is_selected = task in st.session_state.selected_tasks
                     button_label = f"✓ {task}" if is_selected else task
@@ -95,12 +136,20 @@ def render_model_type_selection(df):
     """
     st.markdown("### Select Model Types")

+    # Get unique model types
     model_types = df['Model Type'].unique().tolist()
-    model_type_cols = st.columns(len(model_types))

+    # Determine number of columns - up to 4 columns max depending on number of model types
+    num_cols = min(len(model_types), 4)
+
+    # Create columns
+    cols = st.columns(num_cols)
+
+    # Add a button for each model type
     for i, model_type in enumerate(model_types):
+        col_idx = i % num_cols # Determine which column to place the button in
+
+        with cols[col_idx]:
             is_selected = model_type in st.session_state.selected_model_types
             button_label = f"✓ {model_type}" if is_selected else model_type
             button_type = "primary" if is_selected else "secondary"
src/components/header.py
CHANGED
@@ -31,11 +31,5 @@ def render_footer():
     """
     Render the page footer
     """
-        <div class="footer">
-            <p>© 2023 Model Capability Leaderboard • Made with Streamlit • Contact: [email protected]</p>
-        </div>
-        """,
-        unsafe_allow_html=True
-    )
+    # Footer content removed per user request
+    pass
src/components/leaderboard.py
CHANGED
@@ -4,19 +4,43 @@ Leaderboard table components for the leaderboard application.
 import streamlit as st
 from src.data.processors import get_model_type_style, get_rank_style

-def render_leaderboard_table(display_df, metric_columns):
+def render_leaderboard_table(display_df, metric_columns, primary_metric):
     """
     Render the custom HTML leaderboard table

     Args:
         display_df (pandas.DataFrame): The DataFrame with the display data
         metric_columns (list): List of metric column names
+        primary_metric (str): The name of the primary metric
     """
     from src.components.header import render_section_header
+    from src.utils.config import metrics_config

     # Display model ranking header without the box
     render_section_header("Model Rankings")

+    # Detect if we have multiple metrics (columns with metric prefixes)
+    has_multiple_metrics = any(":" in col for col in metric_columns)
+
+    # Group columns by metric if multiple metrics are present
+    metric_groups = {}
+    if has_multiple_metrics:
+        # Primary metric columns (no prefix)
+        primary_cols = [col for col in metric_columns if ":" not in col]
+        metric_groups[primary_metric] = primary_cols
+
+        # Other metrics
+        for col in metric_columns:
+            if ":" in col:
+                prefix, metric_name = col.split(": ", 1)
+                full_metric_name = next((m for m in metrics_config if m.startswith(prefix)), prefix)
+                if full_metric_name not in metric_groups:
+                    metric_groups[full_metric_name] = []
+                metric_groups[full_metric_name].append(col)
+    else:
+        # Single metric
+        metric_groups[primary_metric] = metric_columns
+
     # Start building the HTML table structure
     html_table = """
     <div class="fixed-table-container">
@@ -25,12 +49,13 @@ def render_leaderboard_table(display_df, metric_columns):
         <thead>
             <tr class="header-row">
                 <th class="fixed-column first-fixed-column" rowspan="2">Rank</th>
-                <th class="fixed-column second-fixed-column" rowspan="2"
-                <th class="model-type-cell" rowspan="2">Model Type</th>
+                <th class="fixed-column second-fixed-column" rowspan="2" style="text-align: center;">Agent</th>
+                <th class="model-type-cell" rowspan="2" style="text-align: center;">Model Type</th>
     """

-    # Add
+    # Add metric headers for each metric group
+    for metric_name, cols in metric_groups.items():
+        html_table += f'<th colspan="{len(cols)}" class="metric-header" style="text-align: center;">{metric_name}</th>'

     # Continue the table structure
     html_table += """
@@ -38,10 +63,13 @@ def render_leaderboard_table(display_df, metric_columns):
             <tr class="sub-header">
     """

-    # Add individual column headers for metrics
-    for
+    # Add individual column headers for all metrics
+    for metric_name, cols in metric_groups.items():
+        for col in cols:
+            # Extract the actual column name if it has a prefix
+            display_name = col.split(": ", 1)[-1] if ":" in col else col
+            column_class = "overall-cell" if display_name == "Metric Average" else "metric-cell"
+            html_table += f'<th class="{column_class}" style="text-align: center;">{display_name}</th>'

     # Close the header and start the body
     html_table += """
@@ -53,13 +81,20 @@ def render_leaderboard_table(display_df, metric_columns):
     # Add the data rows
     for i, (idx, row) in enumerate(display_df.iterrows()):
         # Define background colors to ensure consistency
+        # Special background for human row
+        is_human_row = row["Agent"] == "Top Human in Competition"
+        if is_human_row:
+            row_bg = "#2a1e37" # Purple-ish dark background for human row
+            row_style = f'style="background-color: {row_bg}; box-shadow: 0 0 5px #f472b6;"'
+        else:
+            row_bg = "#0a0a0a" if i % 2 == 0 else "#111111"
+            row_style = f'style="background-color: {row_bg};"'

         # Start the row
-        html_table += f'<tr class="table-row">'
+        html_table += f'<tr class="table-row" {row_style}>'

         # Add Rank with medal styling and consistent background
-        rank_style =
+        rank_style = "" # Don't set background at cell level
         rank_styles = get_rank_style(row["Rank"])
         for style_key, style_value in rank_styles.items():
             rank_style += f"{style_key}: {style_value};"
@@ -67,11 +102,11 @@ def render_leaderboard_table(display_df, metric_columns):
         html_table += f'<td class="fixed-column first-fixed-column" style="{rank_style}">{row["Rank"]}</td>'

         # Model name fixed column with consistent background
-        html_table += f'<td class="fixed-column second-fixed-column" title="{row["
+        html_table += f'<td class="fixed-column second-fixed-column" title="{row["Agent"]}" style="font-weight: 500; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; text-align: center;">{row["Agent"]}</td>'

         # Model type cell
         model_type = row["Model Type"]
-        type_style = f"
+        type_style = f"text-align: center;"
         model_type_styles = get_model_type_style(model_type)
         for style_key, style_value in model_type_styles.items():
             if style_value:
@@ -79,22 +114,30 @@ def render_leaderboard_table(display_df, metric_columns):

         html_table += f'<td class="table-cell model-type-cell" style="{type_style}">{model_type}</td>'

-        # Add metric values with minimal styling
-        for col in
-            # Simple styling based on positive/negative values
-            try:
-                value = float(str(row[col]).replace(',', ''))
-                if value > 0:
-                    cell_class += " positive-value"
-                elif value < 0:
-                    cell_class += " negative-value"
-            except:
-                pass
+        # Add metric values with minimal styling for all columns
+        all_metric_columns = [col for group in metric_groups.values() for col in group]
+        for col in all_metric_columns:
+            display_name = col.split(": ", 1)[-1] if ":" in col else col
+            cell_class = "table-cell overall-cell" if display_name == "Metric Average" else "table-cell metric-cell"

+            # Check if column exists in the row (it should)
+            if col in row:
+                value_text = row[col]
+
+                # Simple styling based on positive/negative values
+                try:
+                    value = float(str(row[col]).replace(',', ''))
+                    if value > 0:
+                        cell_class += " positive-value"
+                    elif value < 0:
+                        cell_class += " negative-value"
+                except:
+                    pass
+
+                html_table += f'<td class="{cell_class}">{value_text}</td>'
+            else:
+                # If column doesn't exist (shouldn't happen), add empty cell
+                html_table += f'<td class="{cell_class}">-</td>'

         html_table += "</tr>"

@@ -106,16 +149,90 @@ def render_leaderboard_table(display_df, metric_columns):
     </div>
     """

-    # Add
+    # Add styling for metrics section
+    metrics_css = """
+    <style>
+    .metric-definitions {
+        margin-top: 30px;
+        padding-top: 20px;
+        border-top: 1px solid #333;
+    }
+    .metric-definition {
+        background-color: #1a1a1a;
+        border-radius: 8px;
+        padding: 12px 16px;
+        margin-bottom: 16px;
+    }
+    .metric-definition h4 {
+        margin-top: 0;
+        color: #a5b4fc;
+    }
+    .metric-definition p {
+        margin-bottom: 0;
+        color: #e2e8f0;
+    }
+    </style>
     """

-    st.markdown(
+    # Build a clean HTML string for the metrics section
+    metrics_html = '<div class="metric-definitions">'
+
+    # Add each metric definition
+    for metric_name, metric_info in metrics_config.items():
+        metric_description = metric_info.get('description', '')
+
+        # Special handling for Relative Improvement to Human to show formula
+        if metric_name == "Relative Improvement to Human":
+            formula_html = """
+            <div style="margin: 15px 0;">
+                <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
+                <div style="background-color: #111; padding: 15px; border-radius: 5px; text-align: center; margin-bottom: 15px;">
+                    Relative Improvement to Human = max<sub>all runs</sub>((P<sub>agent</sub> - P<sub>baseline</sub>) / (P<sub>human</sub> - P<sub>baseline</sub>)) × 100%
+                </div>
+                <p style="margin-top: 10px; font-weight: 500;">Where:</p>
+                <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
+                    <li style="margin-bottom: 5px;">P<sub>agent</sub> is the agent's test performance</li>
+                    <li style="margin-bottom: 5px;">P<sub>baseline</sub> is the baseline test performance</li>
+                    <li style="margin-bottom: 5px;">P<sub>human</sub> is the human performance benchmark</li>
+                    <li style="margin-bottom: 5px;">The maximum is taken across all experimental runs for a given task-model pair</li>
+                </ul>
+            </div>
+            """
+
+            # Add the metric definition with the formula
+            metrics_html += f'<div class="metric-definition"><h4>{metric_name}</h4><p>{metric_description}</p>{formula_html}</div>'
+        # Special handling for Absolute Improvement to Baseline to show formula
+        elif metric_name == "Absolute Improvement to Baseline":
+            formula_html = """
+            <div style="margin: 15px 0;">
+                <p style="margin-bottom: 10px; font-weight: 500;">Formula:</p>
+                <div style="background-color: #111; padding: 15px; border-radius: 5px; text-align: center; margin-bottom: 15px;">
+                    Absolute Improvement to Baseline = max<sub>all runs</sub>((P<sub>agent</sub> - P<sub>baseline</sub>) / P<sub>baseline</sub>) × 100%
+                </div>
+                <p style="margin-top: 10px; font-weight: 500;">Where:</p>
+                <ul style="list-style-type: disc; padding-left: 25px; margin-top: 8px;">
+                    <li style="margin-bottom: 5px;">P<sub>agent</sub> is the agent's test performance</li>
+                    <li style="margin-bottom: 5px;">P<sub>baseline</sub> is the baseline test performance</li>
+                    <li style="margin-bottom: 5px;">The maximum is taken across all experimental runs for a given task-model pair</li>
+                </ul>
+            </div>
+            """
+
+            # Add the metric definition with the formula
+            metrics_html += f'<div class="metric-definition"><h4>{metric_name}</h4><p>{metric_description}</p>{formula_html}</div>'
+        else:
+            # Regular metric without formula
+            metrics_html += f'<div class="metric-definition"><h4>{metric_name}</h4><p>{metric_description}</p></div>'
+
+    # Close the metric definitions container
+    metrics_html += '</div>'
+
+    # Display the styling and HTML separately for maximum control
+    st.markdown(html_table, unsafe_allow_html=True)
+    st.markdown(metrics_css, unsafe_allow_html=True)

+    # Render the metrics definitions
+    st.markdown(metrics_html, unsafe_allow_html=True)

 def render_empty_state():
     """
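The header-grouping step above can be traced on its own. A small sketch with illustrative column names (the real columns come from format_display_dataframe; only the grouping logic mirrors the commit):

```python
# Standalone trace of the metric-grouping logic in render_leaderboard_table.
metrics_config = {"Relative Improvement to Human": {}, "Absolute Improvement to Baseline": {}}
primary_metric = "Relative Improvement to Human"
metric_columns = ["LLM Merging", "Machine Unlearning",
                  "Abs: LLM Merging", "Abs: Machine Unlearning"]  # illustrative names

metric_groups = {primary_metric: [c for c in metric_columns if ":" not in c]}
for col in metric_columns:
    if ":" in col:
        prefix, _ = col.split(": ", 1)
        # Map the short prefix back to a full metric name from metrics_config
        full = next((m for m in metrics_config if m.startswith(prefix)), prefix)
        metric_groups.setdefault(full, []).append(col)

print(metric_groups)
# {'Relative Improvement to Human': ['LLM Merging', 'Machine Unlearning'],
#  'Absolute Improvement to Baseline': ['Abs: LLM Merging', 'Abs: Machine Unlearning']}
```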
src/components/tasks.py
CHANGED
@@ -3,6 +3,7 @@ Task description components for the leaderboard application.
 """
 import streamlit as st
 from src.utils.config import tasks_info
+from src.utils.task_mapping import get_display_name, get_original_name

 def render_task_descriptions():
     """
@@ -51,8 +52,8 @@ def render_task_descriptions():
         </div>
     """, unsafe_allow_html=True)

-    # Task links mapping
+    # Task links mapping - using original task names
+    original_task_links = {
         "Backdoor Trigger Recovery": "https://www.llmagentsafetycomp24.com/tracks/#backdoor_model",
         "Machine Unlearning": "https://unlearning-challenge.github.io/",
         "Perception Temporal Action Loc": "https://ptchallenge-workshop.github.io",
@@ -60,6 +61,9 @@ def render_task_descriptions():
         "Meta Learning": "https://metalearning.chalearn.org/",
         "Llm Merging": "https://llm-merging.github.io"
     }
+
+    # Update links mapping to use display names as keys
+    task_links = {get_display_name(task): link for task, link in original_task_links.items()}

     # Create two columns
     col1, col2 = st.columns(2)
@@ -73,9 +77,8 @@ def render_task_descriptions():
             link = task_links.get(task, "#")
             st.markdown(f"""
            <a href="{link}" target="_blank" style="text-decoration: none; color: inherit;">
-                <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
-                    <div class="task-title">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
-                    <div class="task-description">{description}</div>
+                <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s; padding: 12px; margin-bottom: 15px; height: auto;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
+                    <div class="task-title" style="text-align: center;">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
                 </div>
             </a>
             """, unsafe_allow_html=True)
@@ -85,9 +88,8 @@ def render_task_descriptions():
             link = task_links.get(task, "#")
             st.markdown(f"""
            <a href="{link}" target="_blank" style="text-decoration: none; color: inherit;">
-                <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
-                    <div class="task-title">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
-                    <div class="task-description">{description}</div>
+                <div class="task-card" style="cursor: pointer; transition: transform 0.2s, box-shadow 0.2s; padding: 12px; margin-bottom: 15px; height: auto;" onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 8px 15px rgba(0, 0, 0, 0.2)';" onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 6px rgba(0, 0, 0, 0.15)';">
+                    <div class="task-title" style="text-align: center;">{task} <span style="font-size: 14px; opacity: 0.7;">🔗</span></div>
                 </div>
             </a>
             """, unsafe_allow_html=True)
src/data/metrics/absolute_improvement_to_baseline.json
ADDED
@@ -0,0 +1,56 @@
{
    "perception_temporal_action_loc": {
        "MLAB (claude-3-5-sonnet-v2)": 2.222443094482299,
        "Top Human in Competition": 284.55703321316366,
        "MLAB (gemini-exp-1206)": -1.34633272895098,
        "MLAB (o3-mini)": 0.8724822663469414,
        "MLAB (gpt-4o)": 0.9384906166574135,
        "MLAB (llama3-1-405b-instruct)": 1.474927454740455,
        "CoI-Agent (o1) + MLAB (gpt-4o)": 0.9888962417416385
    },
    "llm-merging": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": -0.6756756689645764,
        "Top Human in Competition": 68.24324325461103,
        "MLAB (claude-3-5-sonnet-v2)": 3.3783783853634035,
        "MLAB (gemini-exp-1206)": 3.3783783853634035,
        "MLAB (o3-mini)": -0.6756756689645764,
        "MLAB (gpt-4o)": 1.3513513581994137,
        "MLAB (llama3-1-405b-instruct)": -0.6756756689645764
    },
    "meta-learning": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": 5.424978139166417,
        "Top Human in Competition": 304.53435579895256,
        "MLAB (claude-3-5-sonnet-v2)": 5.424978139166417,
        "MLAB (gemini-exp-1206)": 5.424978139166417,
        "MLAB (o3-mini)": -14.923192223926499,
        "MLAB (gpt-4o)": 5.424978139166417,
        "MLAB (llama3-1-405b-instruct)": 5.424978139166417
    },
    "product-recommendation": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6021227441680528,
        "Top Human in Competition": 412.59793394031675,
        "MLAB (claude-3-5-sonnet-v2)": 12.283606772997718,
        "MLAB (gemini-exp-1206)": 0.6021227441680528,
        "MLAB (o3-mini)": 0.6035316323448103,
        "MLAB (gpt-4o)": 2.6400767209619422,
        "MLAB (llama3-1-405b-instruct)": -2.9066701147102995e-09
    },
    "machine_unlearning": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": 7.318484292638537,
        "Top Human in Competition": 61.85258904854873,
        "MLAB (claude-3-5-sonnet-v2)": -58.58540153334969,
        "MLAB (gemini-exp-1206)": 3.4837676447981045,
        "MLAB (o3-mini)": 2.2414490971518704,
        "MLAB (gpt-4o)": -11.131587250139926,
        "MLAB (llama3-1-405b-instruct)": 3.8409541040677597
    },
    "backdoor-trigger-recovery": {
        "CoI-Agent (o1) + MLAB (gpt-4o)": 38.252918051116,
        "Top Human in Competition": 621.2635313337943,
        "MLAB (claude-3-5-sonnet-v2)": 247.90785034564928,
        "MLAB (gemini-exp-1206)": 80.40937239150493,
        "MLAB (o3-mini)": 38.75953643366491,
        "MLAB (gpt-4o)": 64.52832837042699,
        "MLAB (llama3-1-405b-instruct)": 71.70765816958271
    }
}
src/data/metrics/{margin_to_human.json → relative_improvement_to_human.json}
RENAMED
@@ -1,6 +1,7 @@
 {
     "perception_temporal_action_loc": {
         "MLAB (claude-3-5-sonnet-v2)": 0.7810185077440877,
+        "Top Human in Competition": 100.0,
         "MLAB (gemini-exp-1206)": -0.4731328246392113,
         "MLAB (o3-mini)": 0.3066106841553126,
         "MLAB (gpt-4o)": 0.3298075630252947,
@@ -9,6 +10,7 @@
     },
     "llm-merging": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": -0.9900989999019761,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": 4.950495058915793,
         "MLAB (gemini-exp-1206)": 4.950495058915793,
         "MLAB (o3-mini)": -0.9900989999019761,
@@ -17,6 +19,7 @@
     },
     "meta-learning": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": 1.781401026144938,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": 1.781401026144938,
         "MLAB (gemini-exp-1206)": 1.781401026144938,
         "MLAB (o3-mini)": -4.900331256476853,
@@ -25,6 +28,7 @@
     },
     "product-recommendation": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": 0.1459345029718814,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": 2.9771372473170388,
         "MLAB (gemini-exp-1206)": 0.1459345029718814,
         "MLAB (o3-mini)": 0.1462759705510577,
@@ -33,6 +37,7 @@
     },
     "machine_unlearning": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": 11.832138969791846,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": -94.71778374121965,
         "MLAB (gemini-exp-1206)": 5.632371576335568,
         "MLAB (o3-mini)": 3.623856546073656,
@@ -41,6 +46,7 @@
     },
     "backdoor-trigger-recovery": {
         "CoI-Agent (o1) + MLAB (gpt-4o)": 6.1572772457753295,
+        "Top Human in Competition": 100.0,
         "MLAB (claude-3-5-sonnet-v2)": 39.903815022493674,
         "MLAB (gemini-exp-1206)": 12.94287662739089,
         "MLAB (o3-mini)": 6.238823700218141,
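A note on the added entries: every "Top Human in Competition" value in this file is 100.0, which is what the Relative Improvement to Human formula gives when the agent performance is the human benchmark itself (a presumption consistent with the formula shown in leaderboard.py). A tiny check with made-up scores:

```python
# Why the human rows come out as exactly 100.0 under this metric (illustrative numbers).
p_baseline, p_human = 0.50, 0.80
p_agent = p_human  # the "Top Human in Competition" entry is the human benchmark itself
relative = (p_agent - p_baseline) / (p_human - p_baseline) * 100
print(relative)  # 100.0, independent of the particular baseline and human scores
```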
src/data/processors.py
CHANGED
@@ -42,6 +42,8 @@ def get_model_type_style(model_type):
         return {'color': '#93c5fd'} # Brighter blue
     elif model_type == "Closed Source":
         return {'color': '#cbd5e1'} # Lighter gray
+    elif model_type == "Human":
+        return {'color': '#f472b6', 'font-weight': '600'} # Pink with emphasis for Human
     else:
         return {'color': ''}

@@ -61,6 +63,8 @@ def get_rank_style(rank):
         return {'color': 'silver', 'font-weight': '700', 'font-size': '16px'}
     elif "🥉" in str(rank):
         return {'color': '#cd7f32', 'font-weight': '700', 'font-size': '16px'}
+    elif str(rank) == "-":
+        return {'color': '#f472b6', 'font-style': 'italic'} # Style for non-ranked (human)
     else:
         return {}
src/utils/config.py
CHANGED
@@ -1,5 +1,8 @@
 # Theme and configuration settings for the Model Capability Leaderboard application

+# Import task mapping
+from src.utils.task_mapping import task_display_names
+
 # Theme colors - using dark mode by default
 dark_theme = {
     'bg_color': '#1a202c',
@@ -46,12 +49,19 @@ app_config = {

 # Metrics configuration
 metrics_config = {
-        "file": "src/data/metrics/
-        "description": "
+    "Relative Improvement to Human": {
+        "file": "src/data/metrics/relative_improvement_to_human.json",
+        "description": "Measures how much of the performance gap between baseline and human the agent has closed. Calculated as: (Agent performance - Baseline) / (Human - Baseline) × 100%.",
         "min_value": -100, # Approximate, adjust as needed
         "max_value": 50, # Approximate, adjust as needed
         "color_map": "RdYlGn"
+    },
+    "Absolute Improvement to Baseline": {
+        "file": "src/data/metrics/absolute_improvement_to_baseline.json",
+        "description": "Measures the percentage improvement over the baseline performance. Calculated as: (Agent performance - Baseline) / Baseline × 100%.",
+        "min_value": -100, # Approximate, adjust as needed
+        "max_value": 100, # Approximate, adjust as needed
+        "color_map": "RdYlGn"
     }
     # Future metrics can be added here
     # "Another Metric": {
@@ -70,16 +80,24 @@ model_categories = {
     "MLAB (o3-mini)": "Closed Source",
     "MLAB (gpt-4o)": "Closed Source",
     "MLAB (llama3-1-405b-instruct)": "Open Weights",
-    "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source"
+    "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
+    "Human": "Human",
+    "Top Human in Competition": "Human"
     # More models would be added here as needed
 }

 # Task descriptions
 tasks_info = {
-    "Perception Temporal Action Loc"
+    task_display_names.get("Perception Temporal Action Loc", "Temporal Action Localisation"):
+        "Testing the model's ability to understand and localize actions within temporal sequences of events.",
+    task_display_names.get("Llm Merging", "LLM Merging"):
+        "Assessing the capability to effectively merge knowledge from multiple language models.",
+    task_display_names.get("Meta Learning", "Meta Learning"):
+        "Evaluating the model's ability to learn how to learn - adapting quickly to new tasks.",
+    task_display_names.get("Product Recommendation", "Next Product Recommendation"):
+        "Testing the model's ability to recommend relevant products based on user preferences and behavior.",
+    task_display_names.get("Machine Unlearning", "Machine Unlearning"):
+        "Evaluating how well models can 'unlearn' specific information when required.",
+    task_display_names.get("Backdoor Trigger Recovery", "Backdoor Trigger Recovery"):
+        "Testing resilience against backdoor attacks and ability to recover from triggered behaviors."
 }
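The two metric descriptions above translate directly into per-run formulas. A sketch with made-up scores (the leaderboard values then take the maximum over runs, per the formulas rendered in leaderboard.py):

```python
# Sketch of the two metrics described in metrics_config (illustrative scores only).
def relative_improvement_to_human(p_agent, p_baseline, p_human):
    # (Agent performance - Baseline) / (Human - Baseline) × 100%
    return (p_agent - p_baseline) / (p_human - p_baseline) * 100

def absolute_improvement_to_baseline(p_agent, p_baseline):
    # (Agent performance - Baseline) / Baseline × 100%
    return (p_agent - p_baseline) / p_baseline * 100

p_agent, p_baseline, p_human = 0.55, 0.50, 0.80
print(round(relative_improvement_to_human(p_agent, p_baseline, p_human), 1))  # 16.7
print(round(absolute_improvement_to_baseline(p_agent, p_baseline), 1))        # 10.0
```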
src/utils/data_loader.py
CHANGED
@@ -4,6 +4,7 @@ Data loading and processing utilities for the leaderboard application.
 import pandas as pd
 import json
 from src.utils.config import model_categories
+from src.utils.task_mapping import get_display_name

 def load_metric_data(file_path):
     """
@@ -56,8 +57,13 @@ def process_data(metric_data):
     # Replace NaN values with '-'
     df.fillna('-', inplace=True)

+    # First convert raw task names to standard format (spaces instead of hyphens/underscores)
+    standardized_columns = [task.replace("-", " ").replace("_", " ").title() for task in df.columns]
+    df.columns = standardized_columns
+
+    # Then apply our display name mapping
+    display_name_columns = {col: get_display_name(col) for col in df.columns}
+    df = df.rename(columns=display_name_columns)

     # Add a model type column to the dataframe
     df['Model Type'] = df.index.map(lambda x: model_categories.get(x, "Unknown"))
@@ -108,14 +114,26 @@ def filter_and_prepare_data(df, selected_tasks, selected_model_types):
     selected_tasks_df = filtered_df[selected_tasks]
     filtered_df['Selected Overall'] = selected_tasks_df.mean(axis=1)

+    # Separate human entries from other models for ranking
+    is_human = filtered_df['Model Type'] == 'Human'
+    human_df = filtered_df[is_human]
+    non_human_df = filtered_df[~is_human]
+
+    # Sort non-human models by Selected Overall and add rank
+    non_human_df = non_human_df.sort_values('Selected Overall', ascending=False)
+    non_human_df.insert(0, 'Rank', range(1, len(non_human_df) + 1))
+
+    # Add rank for human (use '-' to indicate not ranked)
+    human_df.insert(0, 'Rank', '-')
+
+    # Combine dataframes - put humans at appropriate position based on score
+    combined_df = pd.concat([non_human_df, human_df])
+    combined_df = combined_df.sort_values('Selected Overall', ascending=False)

     # Add a Model Name column that shows the index (actual model name)
+    combined_df['Model Name'] = combined_df.index

-    return
+    return combined_df

 def format_display_dataframe(filtered_df, selected_tasks):
     """
@@ -135,13 +153,16 @@ def format_display_dataframe(filtered_df, selected_tasks):
     medal_ranks = {1: "🥇 1", 2: "🥈 2", 3: "🥉 3"}
     display_df['Rank'] = display_df['Rank'].apply(lambda x: medal_ranks.get(x, str(x)))

+    # Rename 'Model Name' to 'Agent'
+    display_df = display_df.rename(columns={"Model Name": "Agent"})
+
     # Add metrics columns (Selected Overall and individual tasks)
     metric_columns = ['Selected Overall'] + selected_tasks
     for col in metric_columns:
         if col in filtered_df.columns:
-            # Format numeric columns to
+            # Format numeric columns to 1 decimal place
             if filtered_df[col].dtype in ['float64', 'float32']:
-                display_df[col] = filtered_df[col].apply(lambda x: f"{x:.
+                display_df[col] = filtered_df[col].apply(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
             else:
                 display_df[col] = filtered_df[col]
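The two-step column renaming added to process_data can be traced with one real key from the metric JSON files; the small mapping dict below mirrors task_display_names from src/utils/task_mapping.py:

```python
# Trace of the column-renaming pipeline added to process_data.
raw_key = "perception_temporal_action_loc"  # key as stored in the metric JSON files
standardized = raw_key.replace("-", " ").replace("_", " ").title()
print(standardized)  # "Perception Temporal Action Loc"

task_display_names = {"Perception Temporal Action Loc": "Temporal Action Localisation"}
print(task_display_names.get(standardized, standardized))  # "Temporal Action Localisation"
```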
src/utils/task_mapping.py
ADDED
@@ -0,0 +1,39 @@
"""
Task name mapping utilities for displaying user-friendly task names.
"""

# Mapping from original task names to display names
task_display_names = {
    "Perception Temporal Action Loc": "Temporal Action Localisation",
    "Llm Merging": "LLM Merging",
    "Meta Learning": "Meta Learning",
    "Product Recommendation": "Next Product Recommendation",
    "Machine Unlearning": "Machine Unlearning",
    "Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
}

def get_display_name(task_name):
    """
    Get the display name for a task

    Args:
        task_name (str): The original task name

    Returns:
        str: The display name for the task
    """
    return task_display_names.get(task_name, task_name)

def get_original_name(display_name):
    """
    Get the original task name for a display name

    Args:
        display_name (str): The display name

    Returns:
        str: The original task name
    """
    # Create a reverse mapping
    reverse_mapping = {v: k for k, v in task_display_names.items()}
    return reverse_mapping.get(display_name, display_name)
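A short usage check for the new helpers, assuming the package is importable from the app root:

```python
from src.utils.task_mapping import get_display_name, get_original_name

assert get_display_name("Llm Merging") == "LLM Merging"
assert get_original_name("LLM Merging") == "Llm Merging"
# Names outside the mapping pass through unchanged:
assert get_display_name("Some New Task") == "Some New Task"
```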