atlas5301
update format and adjust name
ccfe614
raw
history blame
1.78 kB
import streamlit as st
import pandas as pd
from utils.style import style_zero_context
@st.cache_data
def load_data() -> pd.DataFrame:
    """Read the zero-context leaderboard CSV.

    The file is read from ``data/zero_context.csv``. A "Row Color"
    column, when present, is removed before the frame is returned.

    Returns:
        The leaderboard table without a "Row Color" column.
    """
    table = pd.read_csv("data/zero_context.csv")
    # errors="ignore" makes the drop a no-op when the column is absent.
    return table.drop(columns=["Row Color"], errors="ignore")
def show() -> None:
    """Render the "Zero Noise Leaderboard" page.

    Draws the page title, loads the leaderboard via ``load_data``, passes
    it through ``style_zero_context`` and displays the result with
    ``st.dataframe``, followed by a markdown legend.
    """
    st.title("Zero Noise Leaderboard")

    leaderboard = load_data()

    # No manual sorting widgets (selectbox, checkboxes): sorting is left
    # to st.dataframe itself.
    styled = style_zero_context(leaderboard)

    # Per-column display settings, keyed by the column names used here.
    number = st.column_config.NumberColumn
    columns = {
        "Model": st.column_config.TextColumn(width="large"),
        "Symbolic": number(format="%.2f"),
        "Medium": number(format="%.2f"),
        "Hard": number(format="%.2f"),
        "1st<50% op": number(format="%.0f"),
        "1st<10% op": number(format="%.0f"),
        "Avg. Acc op≤30": number(format="%.4f"),
        "Average↑": number(
            format="%.2f",
            help="Average across all subsets",
        ),
    }

    # 35 per data row plus one extra row (presumably the header), so the
    # table height grows with the number of rows.
    table_height = 35 * (len(leaderboard) + 1)

    st.dataframe(
        styled,
        use_container_width=True,
        hide_index=True,
        height=table_height,
        column_config=columns,
    )

    # Legend and benchmark notes shown below the table.
    notes = (
        "\n"
        "**Colors**:\n"
        "- Yellow: reasoning model\n"
        "- Green: linear attention hybrid model\n"
        "- Blue: SSM-hybrid model\n"
        "**Benchmark Details**:\n"
        "- Evaluated on Symbolic, Medium, and Hard subtasks.\n"
        "- Area Under Curve(AUC) Metrics is Used to Compare between LLM Performance.\n"
        "- AUC is calculated using np.trapz function.\n"
    )
    st.markdown(notes)