import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# Set up page config
st.set_page_config(
    page_title="VeriFact Leaderboard",
    layout="wide"
)

# Load the benchmark overview figure (shown in the Details tab) and the logo
image = Image.open("test.png")
logo_image = Image.open("./factrbench.png")

# Encode the logo as base64 so it can be embedded inline and centered via HTML
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

st.markdown(
    f"""
    <div class="logo-container" style="display:flex; justify-content: center;">
        <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
    </div>
    """,
    unsafe_allow_html=True
)

st.markdown(
    '''
    <div class="header">
        <br/>
        <p style="font-size:22px;">
        VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
        </p>
        <p style="font-size:20px;">
            📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> <br/>
            ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>April 2025</strong>
        </p>
    </div>
    ''',
    unsafe_allow_html=True
)

# Load the data
data_path = "verifact_data.csv"
df = pd.read_csv(data_path)
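# Expected columns, inferred from how the data is used below:
# 'tier' (metric name), 'model', 'FactBench', 'Reddit', 'Overall'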

# Assign ranks within each tier (rank 1 = highest Overall in that tier);
# method='min' gives tied models the same rank
df['rank'] = df.groupby('tier')['Overall'].rank(
    ascending=False, method='min').astype(int)

# Display missing values as '-' (assumes 'Overall' itself is never missing,
# since it is used for ranking above and for sorting below)
df.fillna('-', inplace=True)
# Record each row's original position within its tier for the unsorted view
df['original_order'] = df.groupby('tier').cumcount()

# Tabs
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

# Tab 1: Leaderboard
with tab1:
    st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)

    st.markdown("""
        <div class="metric" style="font-size:16px;">
            <p>
            <strong> 🎯 Precision </strong> measures the fraction of extracted facts that are correct. <br>
            <strong> 🔎 Recall </strong> measures how many of the reference facts are covered by the model's output. <br>
            <strong> ⚖️ F1 </strong> is the harmonic mean of precision and recall, balancing both in a single score.<br>
            This leaderboard benchmarks models on FACTRBENCH, a challenging dataset of real-world prompts with verified reference fact sets.<br>
            </p>
        </div>
    """, unsafe_allow_html=True)

    # The CSV's 'tier' column stores the metric name ('Precision', 'Recall', 'F1')
    tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
    selected_tier = st.selectbox('Select metric:', tiers)

    if selected_tier != 'All Metrics':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    sort_by_factuality = st.checkbox('Sort by overall score', value=True)
    if sort_by_factuality:
        # Within each tier, order models from highest to lowest Overall score
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
    else:
        # Keep the original CSV order within each tier
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])

    # Wrap the table in a centered, fixed-width container
    html = '''
        <div style="width: 60%; margin: 0 auto;">
        <table style="width: 100%;">
        '''
    html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"

    # When showing all metrics, the first row of each tier also emits a merged
    # (rowspan) cell labeling the tier; track which tiers have emitted theirs
    tier_counts = updated_filtered_df['tier'].value_counts().to_dict()
    tier_rowspan_tracker = {}

    for _, row in updated_filtered_df.iterrows():
        html += '<tr>'
        if selected_tier == 'All Metrics':
            if row['tier'] not in tier_rowspan_tracker:
                rowspan = tier_counts[row['tier']]
                html += f'<td rowspan="{rowspan}" style="vertical-align: middle;">{row["tier"]}</td>'
                tier_rowspan_tracker[row['tier']] = True
        html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'

    html += '</tbody></table></div>'
    st.markdown(html, unsafe_allow_html=True)

# Tab 2: Benchmark Details
with tab2:
    # Encode the overview figure as base64 for centered HTML embedding
    buffered_img = BytesIO()
    image.save(buffered_img, format="PNG")
    image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")

    st.markdown(f'''<div style="text-align:center; width:65%; margin:0 auto;">
        <img src="data:image/png;base64,{image_data}" style="width:100%; height:auto;" />
    </div>''', unsafe_allow_html=True)

    st.markdown('### What is VERIFACT?')
    st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")

    st.markdown('### What is FACTRBENCH?')
    st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")

    st.markdown('### Key Findings')
    st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall β€” highlighting the need to consider both.")