import streamlit as st
import pandas as pd

from utils.style import style_long_context


@st.cache_data
def load_data():
    # Cache the CSV read so Streamlit reruns don't reload the file from disk.
    return pd.read_csv("data/long_context.csv")
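

# Illustrative layout of data/long_context.csv, inferred from the
# column_config in show() below (placeholder values, not real scores):
#
#   Model,8K,16K,32K,Average↑
#   <model name>,<score>,<score>,<score>,<score>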


def show():
    st.title("Long Context Leaderboard")

    # Load the cached leaderboard data and apply the shared table styling.
    df = load_data()
    styled_df = style_long_context(df)

    st.dataframe(
        styled_df,
        use_container_width=True,
        height=600,
        hide_index=True,
        column_config={
            "Model": st.column_config.TextColumn(width="large"),
            "8K": st.column_config.NumberColumn(format="%.2f"),
            "16K": st.column_config.NumberColumn(format="%.2f"),
            "32K": st.column_config.NumberColumn(format="%.2f"),
            "Average↑": st.column_config.NumberColumn(
                format="%.2f",
                help="Average across all context lengths",
            ),
        },
    )

    # The string is kept flush-left so the Markdown isn't indented into a
    # code block when rendered.
    st.markdown("""
**Context Lengths**:
- 8K: 8,000 tokens
- 16K: 16,000 tokens
- 32K: 32,000 tokens

**Benchmark Details**:
- Evaluated on Symbolic, Medium, and Hard subtasks
- AUC scores aggregated across context lengths
- Larger context evaluations limited by compute constraints
- Scores normalized across task complexities
""")