# ExpertLongBench / VERIFACT leaderboard — Streamlit app
import base64
from io import BytesIO
from pathlib import Path

import pandas as pd
import streamlit as st
# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(page_title="ExpertLongBench Leaderboard", layout="wide")

# Display logo: read the PNG file's bytes directly and inline them as a
# base64 data URI. (The previous version routed the image through
# PIL.Image + BytesIO, but PIL was never imported — a NameError — and
# re-encoding an already-PNG file is redundant anyway.)
img_data = base64.b64encode(Path("./expertlongbench.png").read_bytes()).decode("utf-8")
st.markdown(
    f"""
<div class="logo-container" style="display:flex; justify-content: center;">
    <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
</div>
""",
    unsafe_allow_html=True,
)
# Page header: title plus links / version line.
# Fixes: a stray markdown "# " at the start of the links line rendered as a
# literal "#", and the emoji were mojibake (garbled multibyte characters).
st.markdown(
    '''
<div class="header">
<br/>
<p style="font-size:22px;">
VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
</p>
<p style="font-size:20px;">
📄 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
⏱️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>April 2025</strong>
</p>
</div>
''',
    unsafe_allow_html=True,
)
# ─── Load data ────────────────────────────────────────────────────────────────
@st.cache_data
def load_data(path="src/models.json"):
    """Load per-task model scores from a JSON-lines file.

    Adds an ``Avg`` column (mean of T1..T11, rounded to 1 decimal) and, for
    every score column plus ``Avg``, a ``<col>_rank`` column where rank 1 is
    the best (highest) score; ties share the lowest rank (method="min").
    """
    frame = pd.read_json(path, lines=True)
    task_cols = [f"T{n}" for n in range(1, 12)]
    frame["Avg"] = frame[task_cols].mean(axis=1).round(1)
    for name in [*task_cols, "Avg"]:
        frame[f"{name}_rank"] = frame[name].rank(ascending=False, method="min").astype(int)
    return frame
df = load_data()

# Worst (largest) rank per column — used to normalise the cell shading below.
score_cols = [*(f"T{i}" for i in range(1, 12)), "Avg"]
max_ranks = {c: df[f"{c}_rank"].max() for c in score_cols}
# ─── Tabs ─────────────────────────────────────────────────────────────────────
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
with tab1:
    # Leaderboard: higher scores shaded green; best (rank 1) cells bolded.
    # Build a raw HTML table so each cell can carry its own background colour.
    cols = ["Model"] + [f"T{i}" for i in range(1, 12)] + ["Avg"]
    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
    # header row
    html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
    # one row per model
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            val = row[col]
            if col == "Model":
                html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
            else:
                rank = int(row[f"{col}_rank"])
                # norm is 1.0 for the best rank, 0.0 for the worst; the
                # "or 1" guards against division by zero when every model
                # ties at rank 1 in this column.
                norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
                # Interpolate white (255,255,255) -> green (182,243,182).
                # BUG FIX: blue was previously pinned at 255, which made the
                # shading blue-ish instead of the intended green.
                r = int(255 - norm * (255 - 182))
                g = int(255 - norm * (255 - 243))
                b = int(255 - norm * (255 - 182))
                bold = "font-weight:bold;" if rank == 1 else ""
                style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
                html += f"<td style='{style}'>{val}</td>"
        html += "</tr>"
    html += "</table>"
    st.markdown(html, unsafe_allow_html=True)
with tab2:
    # Static benchmark description. Fixes mojibake in the user-facing text:
    # "longβform"/"realβworld" were garbled hyphens.
    st.markdown("### Benchmark Details")
    st.write(
        "VERIFACT is a factuality evaluation framework for long-form LLM outputs. "
        "FACTRBENCH provides reference fact sets and external evidence across real-world prompts."
    )