# ExpertLongBench / VERIFACT leaderboard — Streamlit app
import base64
from io import BytesIO
from pathlib import Path

import pandas as pd
import streamlit as st
# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(page_title="ExpertLongBench Leaderboard", layout="wide")

# Display logo: read the PNG file's bytes directly and inline them as a
# base64 data URI. (The previous version routed the image through
# PIL.Image + BytesIO, but PIL was never imported — a NameError — and
# re-encoding an already-PNG file is redundant anyway.)
img_data = base64.b64encode(Path("./expertlongbench.png").read_bytes()).decode("utf-8")
st.markdown(
    f"""
<div class="logo-container" style="display:flex; justify-content: center;">
    <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
</div>
""",
    unsafe_allow_html=True,
)
# Page header: title plus links / version line.
# Fixes: a stray markdown "# " at the start of the links line rendered as a
# literal "#", and the emoji were mojibake (garbled multibyte characters).
st.markdown(
    '''
<div class="header">
<br/>
<p style="font-size:22px;">
VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
</p>
<p style="font-size:20px;">
📄 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
⏱️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>April 2025</strong>
</p>
</div>
''',
    unsafe_allow_html=True,
)
# ─── Load data ────────────────────────────────────────────────────────────────
@st.cache_data
def load_data(path="src/models.json"):
    """Load per-task model scores from a JSON-lines file.

    Adds an ``Avg`` column (mean of T1..T11, rounded to 1 decimal) and, for
    every score column plus ``Avg``, a ``<col>_rank`` column where rank 1 is
    the best (highest) score; ties share the lowest rank (method="min").
    """
    frame = pd.read_json(path, lines=True)
    task_cols = [f"T{n}" for n in range(1, 12)]
    frame["Avg"] = frame[task_cols].mean(axis=1).round(1)
    for name in [*task_cols, "Avg"]:
        frame[f"{name}_rank"] = frame[name].rank(ascending=False, method="min").astype(int)
    return frame
df = load_data()

# Worst (largest) rank per column — used to normalise the cell shading below.
score_cols = [*(f"T{i}" for i in range(1, 12)), "Avg"]
max_ranks = {c: df[f"{c}_rank"].max() for c in score_cols}
# ─── Tabs ─────────────────────────────────────────────────────────────────────
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
with tab1:
    # Leaderboard: higher scores shaded green; best (rank 1) cells bolded.
    # Build a raw HTML table so each cell can carry its own background colour.
    cols = ["Model"] + [f"T{i}" for i in range(1, 12)] + ["Avg"]
    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"
    # header row
    html += "<tr>" + "".join(f"<th style='padding:6px;'>{col}</th>" for col in cols) + "</tr>"
    # one row per model
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            val = row[col]
            if col == "Model":
                html += f"<td style='padding:6px; text-align:left;'>{val}</td>"
            else:
                rank = int(row[f"{col}_rank"])
                # norm is 1.0 for the best rank, 0.0 for the worst; the
                # "or 1" guards against division by zero when every model
                # ties at rank 1 in this column.
                norm = 1 - (rank - 1) / ((max_ranks[col] - 1) or 1)
                # Interpolate white (255,255,255) -> green (182,243,182).
                # BUG FIX: blue was previously pinned at 255, which made the
                # shading blue-ish instead of the intended green.
                r = int(255 - norm * (255 - 182))
                g = int(255 - norm * (255 - 243))
                b = int(255 - norm * (255 - 182))
                bold = "font-weight:bold;" if rank == 1 else ""
                style = f"background-color:rgb({r},{g},{b}); padding:6px; {bold}"
                html += f"<td style='{style}'>{val}</td>"
        html += "</tr>"
    html += "</table>"
    st.markdown(html, unsafe_allow_html=True)
with tab2:
    # Static benchmark description. Fixes mojibake in the user-facing text:
    # "longβform"/"realβworld" were garbled hyphens.
    st.markdown("### Benchmark Details")
    st.write(
        "VERIFACT is a factuality evaluation framework for long-form LLM outputs. "
        "FACTRBENCH provides reference fact sets and external evidence across real-world prompts."
    )