Spaces:

InfiniAILab
/

GSM-Infinite-Leaderboard

Running

GSM-Infinite-Leaderboard / pages /long_context.py

atlas5301

update format and adjust name

ccfe614 5 months ago

1.69 kB

	import streamlit as st
	import pandas as pd
	from utils.style import style_long_context

	@st.cache_data
	def load_data():
	    """Read the long-context results CSV and return only its complete rows.

	    The file is read from ``data/long_context.csv``. The function is
	    decorated with ``st.cache_data``.
	    """
	    # Any row containing at least one missing value is discarded.
	    return pd.read_csv("data/long_context.csv").dropna()

	def show():
	    """Render the Long Context Leaderboard page: title, table, and notes."""
	    st.title("Long Context Leaderboard")

	    # Fetch the results and run them through the project's styling helper.
	    results = load_data()
	    styled_results = style_long_context(results)

	    # Every score column is shown with two decimals; the average column
	    # additionally carries a help tooltip.
	    two_decimals = "%.2f"
	    columns = {"Model": st.column_config.TextColumn(width="large")}
	    for label in ("8K", "16K", "32K"):
	        columns[label] = st.column_config.NumberColumn(format=two_decimals)
	    columns["Average↑"] = st.column_config.NumberColumn(
	        format=two_decimals,
	        help="Average across all context lengths",
	    )

	    # 35 per data row plus one extra row's worth (presumably the header),
	    # so the table height follows the number of rows.
	    table_height = 35 * (len(results) + 1)

	    # The dataframe widget offers built-in sorting on column click.
	    st.dataframe(
	        styled_results,
	        use_container_width=True,
	        height=table_height,
	        hide_index=True,
	        column_config=columns,
	    )

	    # Explanatory notes displayed beneath the table.
	    st.markdown("""
	Context Lengths:
	- 8K: 8,000 tokens
	- 16K: 16,000 tokens
	- 32K: 32,000 tokens

	Colors:
	- Yellow: reasoning model
	- Green: linear attention hybrid model
	- Blue: SSM hybrid model

	Benchmark Details:
	- Evaluated on Symbolic, Medium, and Hard subtasks.
	- Area Under Curve(AUC) Metrics is Used to Compare between LLM Performance.
	- AUC is calculated using np.trapz function.
	- AUC scores aggregated across context lengths.
	- Larger context evaluations limited by compute constraints and model performance.
	""")