# File size: 1,626 Bytes
# 87248a0 a1ac14e 87248a0
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
import streamlit as st
import pandas as pd
from utils.style import style_zero_context
@st.cache_data
def load_data():
    """Read the Zero Context leaderboard CSV into a DataFrame.

    The table is read from ``data/zero_context.csv``. If the file carries a
    ``Row Color`` column, that column is removed before the frame is
    returned. The function is wrapped in ``st.cache_data``.

    Returns:
        The loaded DataFrame without a ``Row Color`` column.
    """
    frame = pd.read_csv("data/zero_context.csv")

    # Nothing to strip when the CSV has no "Row Color" column.
    if "Row Color" not in frame.columns:
        return frame

    return frame.drop(columns=["Row Color"])
def show():
    """Render the Zero Context leaderboard page.

    Emits, in order: the page title, the styled leaderboard table, and a
    short markdown note describing the benchmark. Returns nothing.
    """
    st.title("Zero Context Leaderboard")

    # No custom sort controls: the styled frame is handed to st.dataframe
    # unchanged and any sorting is left to that widget.
    leaderboard = style_zero_context(load_data())

    # Per-column display settings (printf-style number formats).
    number_column = st.column_config.NumberColumn
    column_settings = {
        "Model": st.column_config.TextColumn(width="large"),
        "Symbolic": number_column(format="%.2f"),
        "Medium": number_column(format="%.2f"),
        "Hard": number_column(format="%.2f"),
        "1st<50% op": number_column(format="%.0f"),
        "1st<10% op": number_column(format="%.0f"),
        "Avg. Acc op≤30": number_column(format="%.4f"),
        "Average↑": number_column(
            format="%.2f",
            help="Average across all subsets",
        ),
    }

    st.dataframe(
        leaderboard,
        use_container_width=True,
        hide_index=True,
        height=800,
        column_config=column_settings,
    )

    # Explanatory text shown under the table.
    benchmark_notes = (
        "\n"
        "**Benchmark Details**:\n"
        "- Evaluated on Symbolic, Medium, and Hard subtasks.\n"
        "- Area Under Curve Metrics is Used to Compare between LLM Performance.\n"
        "- AUC is calculated using np.trapz function.\n"
    )
    st.markdown(benchmark_notes)