# Spaces: Running
import streamlit as st | |
import pandas as pd | |
# ─── Page config ──────────────────────────────────────────────────────────────
# Configure the page title and request the wide layout before anything renders.
st.set_page_config(
    page_title="ExpertLongBench Leaderboard",
    layout="wide",
)
# ─── Load data ────────────────────────────────────────────────────────────────
def load_data(path="src/models.json"):
    """Load the leaderboard records and derive average and rank columns.

    Reads a JSON Lines file (one record per line) that provides the score
    columns ``T1`` .. ``T11`` and adds:

    * ``Avg`` -- the row-wise mean of the eleven scores, rounded to one
      decimal place.
    * ``<col>_rank`` for every score column and for ``Avg`` -- an integer
      rank where 1 is the highest value and tied values share the smallest
      rank of their group.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        The loaded ``pandas.DataFrame`` with the derived columns appended.
    """
    df = pd.read_json(path, lines=True)
    score_cols = [f"T{i}" for i in range(1, 12)]
    df["Avg"] = df[score_cols].mean(axis=1).round(1)

    # Compute rank per column (1 = best).
    for col in score_cols + ["Avg"]:
        # na_option="bottom" ranks a missing score last instead of producing
        # a NaN rank, which the integer cast below cannot represent.
        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
        df[f"{col}_rank"] = ranks.astype(int)
    return df
df = load_data()

# Columns that carry a rank: the eleven task scores followed by their average.
score_cols = [*(f"T{i}" for i in range(1, 12)), "Avg"]

# Largest rank seen in each column; used to scale the cell shading.
max_ranks = {name: df[name + "_rank"].max() for name in score_cols}
# ─── Tabs ─────────────────────────────────────────────────────────────────────
def _rank_to_rgb(rank, worst_rank):
    """Map a rank to a background colour between pale green and white.

    Rank 1 yields (182, 243, 182); ``worst_rank`` yields (255, 255, 255);
    ranks in between are interpolated linearly.
    """
    # "or 1" avoids dividing by zero when every row shares rank 1.
    strength = 1 - (rank - 1) / ((worst_rank - 1) or 1)
    red = int(255 - strength * (255 - 182))
    green = int(255 - strength * (255 - 243))
    # Blue must fade together with red; leaving it at 255 tints the best
    # cells light blue instead of green.
    blue = int(255 - strength * (255 - 182))
    return red, green, blue


def _leaderboard_html(frame, columns, worst_ranks):
    """Render ``frame`` as an HTML table with rank-shaded score cells.

    The "Model" column is emitted as plain left-aligned text. Every other
    entry of ``columns`` needs a ``<column>_rank`` column in ``frame`` and a
    matching key in ``worst_ranks``; its cell is shaded by rank and bolded
    when the rank is 1.
    """
    parts = ["<table style='border-collapse:collapse; width:100%; font-size:14px;'>"]

    # Header row.
    header_cells = "".join(f"<th style='padding:6px;'>{name}</th>" for name in columns)
    parts.append("<tr>" + header_cells + "</tr>")

    # One table row per data row.
    for _, record in frame.iterrows():
        parts.append("<tr>")
        for name in columns:
            value = record[name]
            if name == "Model":
                # NOTE(review): the value is interpolated unescaped and later
                # rendered with unsafe_allow_html=True; escape it if the data
                # file can ever hold untrusted text.
                parts.append(f"<td style='padding:6px; text-align:left;'>{value}</td>")
                continue
            rank = int(record[f"{name}_rank"])
            red, green, blue = _rank_to_rgb(rank, worst_ranks[name])
            bold = "font-weight:bold;" if rank == 1 else ""
            style = f"background-color:rgb({red},{green},{blue}); padding:6px; {bold}"
            parts.append(f"<td style='{style}'>{value}</td>")
        parts.append("</tr>")

    parts.append("</table>")
    # Joining once avoids repeated string concatenation inside the loops.
    return "".join(parts)


tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

with tab1:
    st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")

    # Build the raw HTML table and hand it to Streamlit as-is.
    cols = ["Model"] + [f"T{i}" for i in range(1, 12)] + ["Avg"]
    html = _leaderboard_html(df, cols, max_ranks)
    st.markdown(html, unsafe_allow_html=True)
with tab2:
    st.markdown("### Benchmark Details")
    # The hyphens below are plain ASCII: the previous text contained a
    # mis-decoded character that showed up as "β" in the rendered page.
    st.write(
        "VERIFACT is a factuality evaluation framework for long-form LLM outputs. "
        "FACTRBENCH provides reference fact sets and external evidence across real-world prompts."
    )