import streamlit as st | |
import pandas as pd | |
from utils.style import style_long_context | |
def load_data() -> pd.DataFrame:
    """Read the long-context leaderboard CSV into a DataFrame.

    The file path is relative, so it is resolved against the current
    working directory of the running process.

    Returns:
        The parsed contents of ``data/long_context.csv``.
    """
    leaderboard = pd.read_csv("data/long_context.csv")
    return leaderboard
def show():
    """Render the "Long Context Leaderboard" page.

    Writes the page title, loads the leaderboard via ``load_data()``,
    passes it through ``style_long_context`` (imported from
    ``utils.style``), displays the result in an ``st.dataframe`` table
    with per-column formatting, and finishes with a Markdown legend
    describing the context lengths and the benchmark.

    Returns:
        None. The function only emits Streamlit elements.
    """
    st.title("Long Context Leaderboard")

    # Load and style data
    df = load_data()
    styled_df = style_long_context(df)

    # Display the dataframe with built-in sort on column click
    st.dataframe(
        styled_df,
        use_container_width=True,
        height=600,
        hide_index=True,
        column_config={
            "Model": st.column_config.TextColumn(width="large"),
            "8K": st.column_config.NumberColumn(format="%.2f"),
            "16K": st.column_config.NumberColumn(format="%.2f"),
            "32K": st.column_config.NumberColumn(format="%.2f"),
            "Average↑": st.column_config.NumberColumn(
                format="%.2f",
                help="Average across all context lengths",
            ),
        },
    )

    # Explanatory legend shown under the table. The blank lines between the
    # bold headings and the bullet lists are deliberate: without one, the
    # "**Benchmark Details**:" line is parsed as a lazy continuation of the
    # preceding list item and gets folded into the "32K" bullet.
    st.markdown("""
    **Context Lengths**:

    - 8K: 8,000 tokens
    - 16K: 16,000 tokens
    - 32K: 32,000 tokens

    **Benchmark Details**:

    - Evaluated on Symbolic, Medium, and Hard subtasks
    - AUC scores aggregated across context lengths
    - Larger context evaluations limited by compute constraints and model performance
    """)