File size: 1,626 Bytes
87248a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1ac14e
 
 
 
87248a0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import streamlit as st
import pandas as pd
from utils.style import style_zero_context

@st.cache_data
def load_data():
    """Read the zero-context leaderboard CSV into a DataFrame.

    The table is loaded from ``data/zero_context.csv``. A ``"Row Color"``
    column, when present, is removed before the frame is returned.
    The result is wrapped by ``st.cache_data``.

    Returns:
        The loaded DataFrame without a ``"Row Color"`` column.
    """
    frame = pd.read_csv("data/zero_context.csv")
    # errors="ignore" turns the drop into a no-op when the column is absent.
    return frame.drop(columns=["Row Color"], errors="ignore")

def show():
    """Render the Zero Context leaderboard page.

    Writes the page title, loads the leaderboard data, passes it through
    ``style_zero_context``, displays the result with ``st.dataframe`` using
    per-column display settings, and finally prints the benchmark notes.
    """
    st.title("Zero Context Leaderboard")

    leaderboard = style_zero_context(load_data())

    # Per-column display settings, keyed by the column headers.
    column_settings = {
        "Model": st.column_config.TextColumn(width="large"),
    }
    # Plain numeric columns differ only in their printf-style format.
    for header, number_format in (
        ("Symbolic", "%.2f"),
        ("Medium", "%.2f"),
        ("Hard", "%.2f"),
        ("1st<50% op", "%.0f"),
        ("1st<10% op", "%.0f"),
        ("Avg. Acc op≤30", "%.4f"),
    ):
        column_settings[header] = st.column_config.NumberColumn(
            format=number_format
        )
    # The average column additionally carries a help tooltip.
    column_settings["Average↑"] = st.column_config.NumberColumn(
        format="%.2f",
        help="Average across all subsets"
    )

    # No custom sorting widgets here; the table is handed straight to
    # st.dataframe.
    st.dataframe(
        leaderboard,
        use_container_width=True,
        hide_index=True,
        height=800,
        column_config=column_settings,
    )

    # Explanatory notes shown beneath the table.
    st.markdown("""
    **Benchmark Details**:
    - Evaluated on Symbolic, Medium, and Hard subtasks.
    - Area Under Curve Metrics is Used to Compare between LLM Performance.
    - AUC is calculated using np.trapz function.
    """)