import streamlit as st
import pandas as pd
from utils.style import style_long_context


@st.cache_data  # Cache the CSV load so Streamlit reruns don't re-read the file
def load_data():
    df = pd.read_csv("data/long_context.csv")
    df.dropna(inplace=True)  # Drop rows with any missing values
    return df
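

# `style_long_context` is imported from utils/style.py, which is not part of
# this file. The sketch below is a hypothetical stand-in showing how the row
# colors described in the legend inside show() (yellow/green/blue by model
# family) could be produced with a pandas Styler. It assumes a "Type" column
# that this file does not confirm; the real implementation may differ.
def _example_style_long_context(df: pd.DataFrame):
    colors = {
        "reasoning": "background-color: #fff3b0",                # yellow
        "linear attention hybrid": "background-color: #d4f7d4",  # green
        "SSM hybrid": "background-color: #d6e9ff",               # blue
    }

    def color_row(row):
        # One CSS string per cell; an empty string leaves the cell unstyled.
        css = colors.get(row.get("Type", ""), "")
        return [css] * len(row)

    return df.style.apply(color_row, axis=1)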


def show():
    st.title("Long Context Leaderboard")

    # Load and style data
    df = load_data()
    styled_df = style_long_context(df)

    # Display the dataframe with built-in sort on column click; the height
    # allots roughly 35 px per data row plus one header row
    st.dataframe(
        styled_df,
        use_container_width=True,
        height=35 * (len(df) + 1),
        hide_index=True,
        column_config={
            "Model": st.column_config.TextColumn(width="large"),
            "8K": st.column_config.NumberColumn(format="%.2f"),
            "16K": st.column_config.NumberColumn(format="%.2f"),
            "32K": st.column_config.NumberColumn(format="%.2f"),
            "Average↑": st.column_config.NumberColumn(
                format="%.2f",
                help="Average across all context lengths",
            ),
        },
    )
    # Explanatory text shown below the table
    st.markdown("""
**Context Lengths**:
- 8K: 8,000 tokens
- 16K: 16,000 tokens
- 32K: 32,000 tokens

**Colors**:
- Yellow: reasoning model
- Green: linear attention hybrid model
- Blue: SSM hybrid model

**Benchmark Details**:
- Evaluated on Symbolic, Medium, and Hard subtasks.
- Area under the curve (AUC) is used to compare performance across models.
- AUC is calculated with NumPy's `np.trapz` function.
- AUC scores are aggregated across context lengths.
- Evaluations at larger context lengths are limited by compute constraints and model performance.
""")