import streamlit as st
import pandas as pd

# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(page_title="ExpertLongBench Leaderboard", layout="wide")

# ─── Load data ────────────────────────────────────────────────────────────────
@st.cache_data
def load_data(path="src/models.json"):
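    # models.json is expected to be JSON Lines (one record per line) carrying a
    # "Model" name plus per-task scores T1..T11, e.g. (illustrative record,
    # assumed schema): {"Model": "GPT-4o", "T1": 41.2, ..., "T11": 29.8}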
    df = pd.read_json(path, lines=True)
    score_cols = [f"T{i}" for i in range(1, 12)]
    df["Avg"] = df[score_cols].mean(axis=1).round(1)
    # Compute rank per column (1 = best)
    for col in score_cols + ["Avg"]:
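        # method="min" assigns tied scores the same, best-possible rank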
        df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
    return df

df = load_data()

# Precompute max ranks for color scaling
score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}

# ─── Tabs ──────────────────────────────────────────────────────────────────────
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

with tab1:
    st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
    # Build raw HTML table
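    # (raw HTML so each cell can carry its own background colour and bold weight)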
    cols = ["Model"] + [f"T{i}" for i in range(1,12)] + ["Avg"]
    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
    # header
    html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
    # rows
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            val = row[col]
            if col == "Model":
                html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
            else:
                rank = int(row[f"{col}_rank"])
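                # norm is 1.0 for the best rank and 0.0 for the worst;
                # the "or 1" guards against division by zero when all models tie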
                norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
                # interpolate green (182,243,182) → white (255,255,255)
                r = int(255 - norm*(255-182))
                g = int(255 - norm*(255-243))
                b = int(255 - norm*(255-182))
                bold = "font-weight:bold;" if rank == 1 else ""
                style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
                html += f"<td style='{style}'>{val}</td>"
        html += "</tr>"
    html += "</table>"
    st.markdown(html, unsafe_allow_html=True)

with tab2:
    st.markdown("### Benchmark Details")
    st.write(
        "VERIFACT is a factuality evaluation framework for long‑form LLM outputs. "
        "FACTRBENCH provides reference fact sets and external evidence across real‑world prompts."
    )