import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# Set up page config
st.set_page_config(
    page_title="VeriFact Leaderboard",
    layout="wide"
)
# Load the images
image = Image.open("test.png")
logo_image = Image.open("./factrbench.png")
# Display logo
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

st.markdown(
    f"""
    <div class="logo-container" style="display:flex; justify-content: center;">
        <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
    </div>
    """,
    unsafe_allow_html=True
)
st.markdown(
    '''
    <div class="header">
        <br/>
        <p style="font-size:22px;">
            VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
        </p>
        <p style="font-size:20px;">
            📄 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
            ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>April 2025</strong>
        </p>
    </div>
    ''',
    unsafe_allow_html=True
)
# Load the data
data_path = "verifact_data.csv"
df = pd.read_csv(data_path)

# Assign ranks within each tier (ties share the lowest rank)
df['rank'] = df.groupby('tier')['Overall'].rank(
    ascending=False, method='min').astype(int)
df.fillna('-', inplace=True)
# Preserve the original row order within each tier for the unsorted view
df['original_order'] = df.groupby('tier').cumcount()

# Tabs
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
# Tab 1: Leaderboard
with tab1:
    st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
    st.markdown("""
        <div class="metric" style="font-size:16px;">
        <p>
        <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
        <strong> 🔍 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
        <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
        This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
        </p>
        </div>
    """, unsafe_allow_html=True)

    tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
    selected_tier = st.selectbox('Select metric:', tiers)

    if selected_tier != 'All Metrics':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    sort_by_factuality = st.checkbox('Sort by overall score', value=True)
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
    else:
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
    # Build the leaderboard table as raw HTML so the tier column can span rows
    html = '''
    <div style="width: 60%; margin: 0 auto;">
    <table style="width: 100%;">
    '''
    html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>FactBench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"

    tier_counts = updated_filtered_df['tier'].value_counts().to_dict()
    tier_rowspan_tracker = {}
    for _, row in updated_filtered_df.iterrows():
        html += '<tr>'
        if selected_tier == 'All Metrics':
            # Emit the tier cell only once per tier, spanning all of that tier's rows
            if row['tier'] not in tier_rowspan_tracker:
                rowspan = tier_counts[row['tier']]
                html += f'<td rowspan="{rowspan}" style="vertical-align: middle;">{row["tier"]}</td>'
                tier_rowspan_tracker[row['tier']] = True
        html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
    html += '</tbody></table></div>'

    st.markdown(html, unsafe_allow_html=True)
# Tab 2: Benchmark Details
with tab2:
    buffered_img = BytesIO()
    image.save(buffered_img, format="PNG")
    image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")

    st.markdown(f'''<div style="text-align:center; width:65%; margin:0 auto;">
        <img src="data:image/png;base64,{image_data}" style="width:100%; height:auto;" />
    </div>''', unsafe_allow_html=True)
    st.markdown('### What is VERIFACT?')
    st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and by capturing inter-sentence dependencies.")

    st.markdown('### What is FACTRBENCH?')
    st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, enabling both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")

    st.markdown('### Key Findings')
    st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always imply high recall, highlighting the need to consider both.")