"""Streamlit page that renders the Long Context leaderboard."""

import pandas as pd
import streamlit as st

from utils.style import style_long_context

# Location of the leaderboard data, relative to the app's working directory.
_DATA_PATH = "data/long_context.csv"

# Explanatory legend shown beneath the leaderboard table. Kept at column 0 so
# the markdown bullet lists are never mistaken for an indented code block.
_LEGEND_MD = """
**Context Lengths**:
- 8K: 8,000 tokens
- 16K: 16,000 tokens
- 32K: 32,000 tokens

**Colors**:
- Yellow: reasoning model
- Green: linear attention hybrid model
- Blue: SSM hybrid model

**Benchmark Details**:
- Evaluated on Symbolic, Medium, and Hard subtasks.
- Area Under Curve(AUC) Metrics is Used to Compare between LLM Performance.
- AUC is calculated using np.trapz function.
- AUC scores aggregated across context lengths.
- Larger context evaluations limited by compute constraints and model performance.
"""


@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the leaderboard CSV and drop incomplete rows.

    The result is memoized by ``st.cache_data``, so the file is only read
    again when Streamlit invalidates the cache.

    Returns:
        The contents of ``data/long_context.csv`` with every row that
        contains at least one missing value removed.
    """
    return pd.read_csv(_DATA_PATH).dropna()


def show() -> None:
    """Render the Long Context leaderboard page.

    Draws the page title, the styled leaderboard table and the legend that
    explains context lengths, row colors and benchmark methodology.
    """
    st.title("Long Context Leaderboard")

    # Load and style data.
    df = load_data()
    styled_df = style_long_context(df)

    # NOTE(review): style_long_context presumably already returns an HTML
    # string (no extra to_html() call is made here), which is why it is
    # rendered through st.markdown with raw HTML enabled rather than via
    # st.dataframe / st.html -- confirm against utils.style.
    # Because the markup is injected unescaped, the CSV must stay trusted.
    st.markdown(styled_df, unsafe_allow_html=True)

    # Explanatory text below the table.
    st.markdown(_LEGEND_MD)