"""
Main entry point for the Model Capability Leaderboard application.
"""
import streamlit as st

# Import configuration
from src.utils.config import app_config, metrics_config
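# Based on how they are used below: metrics_config maps each metric display name to a
# dict containing a "file" path, and app_config provides 'title', 'layout', and
# 'initial_sidebar_state'.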

# Import data functions
from src.utils.data_loader import (
    load_metric_data, 
    process_data, 
    filter_and_prepare_data,
    format_display_dataframe
)

# Import styles
from src.styles.base import load_all_styles

# Import components
from src.components.header import render_page_header
from src.components.filters import (
    initialize_session_state, 
    render_metric_selection, 
    render_task_selection
)
from src.components.leaderboard import render_leaderboard_table, render_empty_state
from src.components.tasks import render_task_descriptions

def setup_page():
    """
    Set up the Streamlit page configuration
    """
    st.set_page_config(
        page_title=app_config['title'],
        layout=app_config['layout'],
        initial_sidebar_state=app_config['initial_sidebar_state']
    )
    
    # Load all styles
    load_all_styles()
    
    # Force dark mode using custom CSS
    st.markdown("""
    <style>
    /* Force dark mode regardless of browser settings */
    .stApp {
        background-color: #1a202c !important;
        color: #e2e8f0 !important;
    }
    /* Override Streamlit's default styling to ensure dark mode */
    .stTextInput, .stSelectbox, .stMultiselect {
        background-color: #2d3748 !important;
        color: #e2e8f0 !important;
    }
    .stButton>button {
        background-color: #4a5568 !important;
        color: #e2e8f0 !important;
    }
    /* Override header and text colors */
    h1, h2, h3, h4, h5, h6, p, span, div {
        color: #e2e8f0 !important;
    }
    /* Ensure tab styling is consistent */
    .stTabs [data-baseweb="tab-list"] {
        background-color: #1a202c !important;
    }
    .stTabs [data-baseweb="tab"] {
        color: #e2e8f0 !important;
    }
    </style>
    """, unsafe_allow_html=True)

def main():
    """
    Main application function
    """
    # Set up page
    setup_page()
    
    # Render header
    render_page_header()
    
    # Load primary metric data (first metric in config)
    primary_metric = list(metrics_config.keys())[0]
    metric_data = load_metric_data(metrics_config[primary_metric]["file"])
    df = process_data(metric_data)
    
    # Initialize session state
    initialize_session_state(df)
    
    # Create tabs
    tabs = st.tabs(["📊 Leaderboard", "📑 Benchmark Details"])
    
    # Tab 1: Leaderboard
    with tabs[0]:
        # Render filter components
        selected_metrics = render_metric_selection()
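        # selected_metrics is assumed to contain at least one entry; its first element
        # is treated as the primary metric further down.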
        
        # Continue with other filters
        selected_tasks = render_task_selection(df)
        
        # Render leaderboard if selections are valid
        if selected_tasks:
            # Load the primary metric data first (always the first in selected_metrics)
            primary_metric = selected_metrics[0]
            primary_metric_data = load_metric_data(metrics_config[primary_metric]["file"])
            primary_df = process_data(primary_metric_data)
            
            # Filter and prepare data for primary metric
            filtered_df = filter_and_prepare_data(primary_df, selected_tasks, st.session_state.selected_model_types)
            
            # Format data for display
            display_df, metric_columns = format_display_dataframe(filtered_df, selected_tasks)
            
            # If additional metrics are selected, add their data too
            all_metric_columns = metric_columns.copy()
            
            for metric in selected_metrics[1:]:
                metric_info = metrics_config[metric]
                metric_data = load_metric_data(metric_info["file"])
                metric_df = process_data(metric_data)
                
                # Process and merge the additional metric data
                metric_filtered_df = filter_and_prepare_data(metric_df, selected_tasks, st.session_state.selected_model_types)
                metric_display_df, _ = format_display_dataframe(metric_filtered_df, selected_tasks)
                
                # Create a meaningful prefix for this metric
                if metric == "Absolute Improvement to Baseline":
                    prefix = "Abs"
                else:
                    # Build an acronym from the first letter of each word in the metric name
                    prefix = "".join([word[0] for word in metric.split()]).upper()
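                    # e.g. a hypothetical metric named "Weighted Task Score" would get the prefix "WTS"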
                
                # Combine the dataframes - keep only metric columns from metric_display_df
                for col in metric_columns:
                    if col in metric_display_df.columns:
                        # Add columns with metric prefix
                        display_df[f"{prefix}: {col}"] = metric_display_df[col]
                        # Add to the list of all metric columns
                        all_metric_columns.append(f"{prefix}: {col}")
            
            # Render the leaderboard table
            render_leaderboard_table(display_df, all_metric_columns, primary_metric)
        else:
            # Show empty state
            render_empty_state()
    
    # Tab 2: Benchmark Details
    with tabs[1]:
        # Render task descriptions
        render_task_descriptions()
    
    # Footer removed per user request

if __name__ == "__main__":
    main()