"""Streamlit page that renders the Zero Context leaderboard table."""

import streamlit as st
import pandas as pd
from utils.style import style_zero_context

# Explanatory text shown under the table. Each bullet sits on its own line so
# Markdown renders it as a list rather than one run-on paragraph.
_BENCHMARK_DETAILS_MD = (
    "**Benchmark Details**:\n"
    "- Evaluated on Symbolic, Medium, and Hard subtasks.\n"
    "- Area Under Curve Metrics is Used to Compare between LLM Performance.\n"
    "- AUC is calculated using np.trapz function.\n"
)


@st.cache_data
def load_data() -> pd.DataFrame:
    """Read the leaderboard CSV and return it without the "Row Color" column.

    The file is read from ``data/zero_context.csv`` (relative to the current
    working directory). The result is cached by ``st.cache_data``.

    Returns:
        The CSV contents as a DataFrame, minus the "Row Color" column when
        that column is present.
    """
    df = pd.read_csv("data/zero_context.csv")
    # errors="ignore" makes the drop a no-op when the column is absent, so no
    # explicit membership check (or in-place mutation) is needed.
    return df.drop(columns=["Row Color"], errors="ignore")


def show() -> None:
    """Render the "Zero Context Leaderboard" page.

    Draws the page title, the styled leaderboard table, and a short Markdown
    note describing the benchmark.
    """
    st.title("Zero Context Leaderboard")

    raw_df = load_data()

    # No manual sort controls (selectbox / checkboxes): sorting is left to
    # st.dataframe itself.
    # NOTE(review): style_zero_context is defined in utils.style; presumably it
    # returns a styled view of the frame - confirm against that module.
    styled_df = style_zero_context(raw_df)

    st.dataframe(
        styled_df,
        use_container_width=True,
        hide_index=True,
        height=800,
        column_config={
            "Model": st.column_config.TextColumn(width="large"),
            "Symbolic": st.column_config.NumberColumn(format="%.2f"),
            "Medium": st.column_config.NumberColumn(format="%.2f"),
            "Hard": st.column_config.NumberColumn(format="%.2f"),
            "1st<50% op": st.column_config.NumberColumn(format="%.0f"),
            "1st<10% op": st.column_config.NumberColumn(format="%.0f"),
            "Avg. Acc op≤30": st.column_config.NumberColumn(format="%.4f"),
            "Average↑": st.column_config.NumberColumn(
                format="%.2f",
                help="Average across all subsets",
            ),
        },
    )

    st.markdown(_BENCHMARK_DETAILS_MD)