Spaces:

launch
/

ExpertLongBench

Running

App Files Files Community

ExpertLongBench / src /streamlit_app.py

shezamunir

Update src/streamlit_app.py

4cfc3d6 verified 29 days ago

raw

history blame

2.88 kB

	import streamlit as st
	import pandas as pd

	# ─── Page config ──────────────────────────────────────────────────────────────
	# Browser tab title and wide layout for the leaderboard page.
	st.set_page_config(
	    layout="wide",
	    page_title="ExpertLongBench Leaderboard",
	)

	# ─── Load data ────────────────────────────────────────────────────────────────
	@st.cache_data
	def load_data(path: str = "src/models.json") -> pd.DataFrame:
	    """Load leaderboard scores and derive the average and rank columns.

	    Reads a JSON Lines file (one record per line) that must contain the
	    score columns ``T1`` .. ``T11``. Any other columns are passed through
	    unchanged.

	    Args:
	        path: Location of the JSON Lines file.

	    Returns:
	        The loaded DataFrame with two kinds of added columns:
	        ``Avg`` is the row mean of the task scores, rounded to one decimal;
	        ``<col>_rank`` is an integer rank for each task and for ``Avg``,
	        where 1 is the highest score and ties share the lowest rank number.
	    """
	    df = pd.read_json(path, lines=True)
	    score_cols = [f"T{i}" for i in range(1, 12)]
	    # Avg is rounded before ranking, so models tied at the displayed
	    # precision also share a rank.
	    df["Avg"] = df[score_cols].mean(axis=1).round(1)
	    # Compute rank per column (1 = best). Missing scores are ranked last
	    # rather than left as NaN, which would make the integer cast raise.
	    for col in [*score_cols, "Avg"]:
	        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
	        df[f"{col}_rank"] = ranks.astype(int)
	    return df

	df = load_data()

	# Largest rank number per score column; the table uses it to normalise
	# each cell's rank when choosing its background shade.
	score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
	_rank_maxima = df[[f"{col}_rank" for col in score_cols]].max()
	max_ranks = {col: _rank_maxima[f"{col}_rank"] for col in score_cols}

	# ─── Tabs ──────────────────────────────────────────────────────────────────────
	tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])


	def _rank_cell_style(rank, max_rank):
	    """Return the inline CSS for a score cell from its rank within a column.

	    Rank 1 is shaded light green (182, 243, 182) and bolded; the shade fades
	    linearly to white (255, 255, 255) at ``max_rank``.
	    """
	    # `or 1` avoids a zero division when max_rank is 1.
	    norm = 1 - (rank - 1) / ((max_rank - 1) or 1)
	    r = int(255 - norm * (255 - 182))
	    g = int(255 - norm * (255 - 243))
	    # Blue is interpolated like red so the best cell is green; a constant
	    # 255 here would give light blue (182, 243, 255) instead.
	    b = int(255 - norm * (255 - 182))
	    bold = "font-weight:bold;" if rank == 1 else ""
	    return f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"


	with tab1:
	    st.markdown("Leaderboard: higher scores shaded green; best models bolded.")
	    # Build the raw HTML table as a list of fragments and join once.
	    # NOTE(review): cell values are interpolated unescaped; that is fine for
	    # a trusted local data file, but confirm the source.
	    cols = ["Model"] + [f"T{i}" for i in range(1, 12)] + ["Avg"]
	    parts = [
	        "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
	    ]
	    # header
	    header_cells = "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols)
	    parts.append(f"<tr>{header_cells}</tr>")
	    # rows
	    for _, row in df.iterrows():
	        parts.append("<tr>")
	        for col in cols:
	            val = row[col]
	            if col == "Model":
	                parts.append(
	                    f"<td style='padding:6px; text-align:left;'>{val}</td>"
	                )
	            else:
	                style = _rank_cell_style(int(row[f"{col}_rank"]), max_ranks[col])
	                parts.append(f"<td style='{style}'>{val}</td>")
	        parts.append("</tr>")
	    parts.append("</table>")
	    html = "".join(parts)
	    st.markdown(html, unsafe_allow_html=True)

	with tab2:
	    st.markdown("### Benchmark Details")
	    # NOTE(review): this text describes VERIFACT / FACTRBENCH while the page
	    # is titled "ExpertLongBench Leaderboard" — confirm it is the intended copy.
	    details_text = (
	        "VERIFACT is a factuality evaluation framework for long‑form LLM outputs. "
	        "FACTRBENCH provides reference fact sets and external evidence across real‑world prompts."
	    )
	    st.write(details_text)