import streamlit as st
import pandas as pd

from utils.style import style_long_context


def load_data():
    """Load leaderboard results, dropping rows with missing values."""
    df = pd.read_csv("data/long_context.csv")
    df.dropna(inplace=True)  # Drop rows with any missing values
    return df


def show():
    st.title("Long Context Leaderboard")

    # Load and style data
    df = load_data()
    styled_df = style_long_context(df)

    # Display the dataframe with built-in sort on column click
    st.dataframe(
        styled_df,
        use_container_width=True,
        height=35 * (len(df) + 1),  # ~35 px per row, plus one header row
        hide_index=True,
        column_config={
            "Model": st.column_config.TextColumn(width="large"),
            "8K": st.column_config.NumberColumn(format="%.2f"),
            "16K": st.column_config.NumberColumn(format="%.2f"),
            "32K": st.column_config.NumberColumn(format="%.2f"),
            "Average↑": st.column_config.NumberColumn(
                format="%.2f",
                help="Average across all context lengths",
            ),
        },
    )

    # Explanatory text for the legend and benchmark setup
    st.markdown("""
**Context Lengths**:
- 8K: 8,000 tokens
- 16K: 16,000 tokens
- 32K: 32,000 tokens

**Colors**:
- Yellow: reasoning model
- Green: linear attention hybrid model
- Blue: SSM hybrid model

**Benchmark Details**:
- Evaluated on the Symbolic, Medium, and Hard subtasks.
- Area under the curve (AUC) is used to compare model performance.
- AUC is calculated with the `np.trapz` function.
- AUC scores are aggregated across context lengths.
- Evaluations at larger context lengths are limited by compute constraints and model performance.
""")