import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# Set up page config
st.set_page_config(
    page_title="VeriFact Leaderboard",
    layout="wide"
)

# Load the images used on the page
image = Image.open("test.png")
logo_image = Image.open("./factrbench.png")

# Display the logo as an inline base64-encoded image
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
st.markdown(
    f"""
    <div style="text-align: center;">
        <img src="data:image/png;base64,{img_data}" alt="FACTRBENCH logo" width="400"/>
    </div>
    """,
    unsafe_allow_html=True
)
# Load the data
data_path = "verifact_data.csv"
df = pd.read_csv(data_path)
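# Expected columns: model, tier, FactBench, Reddit, Overall
# (tier holds the metric name: Precision, Recall, or F1)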
# Assign ranks within each tier
df['rank'] = df.groupby('tier')['Overall'].rank(
    ascending=False, method='min').astype(int)
df.fillna('-', inplace=True)
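# Remember the original CSV row order within each tier for the unsorted view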
df['original_order'] = df.groupby('tier').cumcount()
# Tabs
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
# Tab 1: Leaderboard
with tab1:
    st.markdown('### Metrics Explanation')
st.markdown("""
🎯 Precision measures the ratio of correct facts among all extracted facts.
🔎 Recall assesses how many reference facts are covered by model outputs.
⚖️ F1 balances precision and recall for comprehensive factual evaluation.
This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.
""", unsafe_allow_html=True)
    tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
    selected_tier = st.selectbox('Select metric:', tiers)

    if selected_tier != 'All Metrics':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    sort_by_factuality = st.checkbox('Sort by overall score', value=True)
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
    else:
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
    # Build the leaderboard table as raw HTML so the metric column can span rows
    html = '<table>'
    html += (
        '<tr>'
        + ('<th>Metric</th>' if selected_tier == 'All Metrics' else '')
        + '<th>Rank</th><th>Model</th><th>FactBench</th><th>Reddit</th><th>Overall</th>'
        + '</tr>'
    )
    tier_counts = updated_filtered_df['tier'].value_counts().to_dict()
    tier_rowspan_tracker = {}
    for _, row in updated_filtered_df.iterrows():
        html += '<tr>'
        if selected_tier == 'All Metrics':
            # Emit the metric cell once per tier, spanning all of its rows
            if row['tier'] not in tier_rowspan_tracker:
                rowspan = tier_counts[row['tier']]
                html += f'<td rowspan="{rowspan}">{row["tier"]}</td>'
                tier_rowspan_tracker[row['tier']] = True
        html += (
            f'<td>{row["rank"]}</td><td>{row["model"]}</td>'
            f'<td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td>'
        )
        html += '</tr>'
    html += '</table>'
    st.markdown(html, unsafe_allow_html=True)
# Tab 2: Benchmark Details
with tab2:
    # Embed the overview figure as an inline base64-encoded image
    buffered_img = BytesIO()
    image.save(buffered_img, format="PNG")
    image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
    st.markdown(
        f'''
        <div style="text-align: center;">
            <img src="data:image/png;base64,{image_data}" alt="VeriFact overview" style="max-width: 100%;"/>
        </div>
        ''',
        unsafe_allow_html=True
    )
    st.markdown('### What is VERIFACT?')
    st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and by capturing inter-sentence dependencies.")

    st.markdown('### What is FACTRBENCH?')
    st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, enabling evaluation of both precision and recall across 1,096 real-world prompts from FactBench and Reddit.")

    st.markdown('### Key Findings')
    st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall, which highlights the need to consider both.")