# factrbench
#
# Running
#
# File size: 23,424 Bytes

# import streamlit as st
# import pandas as pd
# from PIL import Image
# import base64
# from io import BytesIO

# # Set up page config
# st.set_page_config(
#     page_title="VeriFact Leaderboard",
#     layout="wide"
# )

# # load header
# with open("_header.md", "r") as f:
#     HEADER_MD = f.read()

# # Load the image
# image = Image.open("test.png")
# logo_image = Image.open("./factrbench.png")

# # Custom CSS for the page
# st.markdown(
#     """
#     <style>
#     @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');

#     html, body, [class*="css"] {
#         font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
#         background-color: #f9f9f9;  /* Light grey background */
#     }

#     .title {
#         font-size: 42px;
#         font-weight: bold;
#         text-align: center;
#         color: #333;
#         margin-bottom: 5px;
#     }

#     .description {
#         font-size: 22px;
#         text-align: center;
#         margin-bottom: 30px;
#         color: #555;
#     }

#     .header, .metric {
#         align-items: left;
#         font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
#         margin-bottom: 20px;
#     }

#     .container {
#         max-width: 1000px;  
#         margin: 0 auto;  
#         padding: 5px;
#     }

#     table {
#         width: 100%;
#         border-collapse: collapse;
#         border-radius: 10px;
#         overflow: hidden;
#     }

#     th, td {
#         padding: 8px;
#         text-align: center;
#         border: 1px solid #ddd;
#         font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
#         font-size: 16px;
#         transition: background-color 0.3s;
#     }

#     th {
#         background-color: #f2f2f2;
#         font-weight: bold;
#     }

#     td:hover {
#         background-color: #eaeaea;
#     }
#     </style>
#     """,
#     unsafe_allow_html=True
# )

# # Display title and description
# st.markdown('<div class="container">', unsafe_allow_html=True)
# # st.image(logo_image, output_format="PNG", width=200)

# # Convert the image to base64
# buffered = BytesIO()
# logo_image.save(buffered, format="PNG")
# img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
# st.markdown(
#     f"""
#     <style>
#     .logo-container {{
#         display: flex;
#         justify-content: flex-start;  /* Aligns to the left */
#     }}
#     .logo-container img {{
#         width: 50%;  /* Adjust this to control the width, e.g., 50% of container width */
#         margin: 0 auto; 
#         max-width: 700px;  /* Set a maximum width */
#         background-color: transparent;
#     }}
#     </style>
#     <div class="logo-container">
#         <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
#     </div>
#     """,
#     unsafe_allow_html=True
# )

# # header_md_text = HEADER_MD # make some parameters later
# # gr.Markdown(header_md_text, elem_classes="markdown-text") 

# st.markdown(
#     '''
#     <div class="header">
#         <br/>
#         <p style="font-size:22px;">
#         VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
#         </p>
#         <p style="font-size:20px;">
#             # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> 
#             ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
#         </p>
#     </div>
#     ''',
#     unsafe_allow_html=True
# )


# # st.markdown('<div class="title">VeriFact Leaderboard</div>',
# #             unsafe_allow_html=True)
# # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
# st.markdown('</div>', unsafe_allow_html=True)

# # Load the data
# data_path = "verifact_data.csv"
# df = pd.read_csv(data_path)

# # Assign ranks within each tier based on the 'Overall' score
# df['rank'] = df.groupby('tier')['Overall'].rank(
#     ascending=False, method='min').astype(int)

# # Replace NaN values with '-'
# df.fillna('-', inplace=True)

# df['original_order'] = df.groupby('tier').cumcount()

# # Create tabs
# st.markdown("""
#     <style>
#         .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
#             font-size: 20px;
#         }
#     </style>
# """, unsafe_allow_html=True)

# tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

# # Tab 1: Leaderboard
# with tab1:
#     # df['original_order'] = df.groupby('tier').cumcount()
#     # print(df['original_order'])
    
#     # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
#     st.markdown('<div class="tab-content">', unsafe_allow_html=True)

#     st.markdown("""
#     <div class="metric" style="font-size:20px; font-weight: bold;">
#     Metrics Explanation
#     </div>
#     """, unsafe_allow_html=True)

#     st.markdown("""
#     <div class="metric" style="font-size:16px;">
#         <br/>
#         <p>
#         <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).  
#         </p>
#         <p>
#         🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
#         </p>
#     </div>
#     """,
#     unsafe_allow_html=True
#     )
    
#     st.markdown("""
#     <style>
#         /* Selectbox text */
#         div[data-baseweb="select"] > div {
#             font-size: 20px;
#         }
        
#         /* Dropdown options */
#         div[role="listbox"] ul li {
#             font-size: 20px !important;
#         }
        
#         /* Checkbox label */
#         .stCheckbox label p {
#             font-size: 20px !important;
#         }
        
#         /* Selectbox label */
#         .stSelectbox label p {
#             font-size: 20px !important;
#         }
#     </style>
# """, unsafe_allow_html=True)
    
#     # Dropdown menu to filter tiers
#     tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
#     selected_tier = st.selectbox('Select metric:', tiers)

#     # Filter the data based on the selected tier
#     if selected_tier != 'All Metrics':
#         filtered_df = df[df['tier'] == selected_tier]
#     else:
#         filtered_df = df

#     sort_by_factuality = st.checkbox('Sort by overall score')

#     # Sort the dataframe based on Factuality Score if the checkbox is selected
#     if sort_by_factuality:
#         updated_filtered_df = filtered_df.sort_values(
#             by=['tier', 'Overall'], ascending=[True, False]
#         )
#     else:
#         updated_filtered_df = filtered_df.sort_values(
#             by=['tier', 'original_order']
#         )

#     # Create HTML for the table
#     if selected_tier == 'All Metrics':
#         html = '''
#         <table>
#             <thead>
#                 <tr>
#                     <th>Metric</th>
#                     <th>Rank</th>
#                     <th>Model</th>
#                     <th>Factbench</th>
#                     <th>Reddit</th>
#                     <th>Overall</th>
#                 </tr>
#             </thead>
#             <tbody>
#         '''
#     else:
#         html = '''
#         <table>
#             <thead>
#                 <tr>
#                     <th>Rank</th>
#                     <th>Model</th>
#                     <th>Factbench</th>
#                     <th>Reddit</th>
#                     <th>Overall</th>
#                 </tr>
#             </thead>
#             <tbody>
#         '''

#     # Generate the rows of the table
#     current_tier = None
#     for i, row in updated_filtered_df.iterrows():
#         html += '<tr>'

#         # Only display the 'Metric' column if 'All Metrics' is selected
#         if selected_tier == 'All Metrics':
#             if row['tier'] != current_tier:
#                 current_tier = row['tier']
#                 html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'

#         # Fill in model and scores
#         html += f'''
#             <td>{row['rank']}</td>
#             <td>{row['model']}</td>
#             <td>{row['FactBench']}</td>
#             <td>{row['Reddit']}</td>
#             <td>{row['Overall']}</td>
#         </tr>
#     '''

#     # Close the table
#     html += '''
#     </table>
#     '''

#     # Display the table
#     st.markdown(html, unsafe_allow_html=True)

#     st.markdown('</div>', unsafe_allow_html=True)

# # Tab 2: Details
# with tab2:
#     st.markdown('<div class="tab-content">', unsafe_allow_html=True)

#     # st.markdown('<div class="title"></div>',
#     #             unsafe_allow_html=True)
#     st.image(image, use_column_width=True)

#     st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
#     st.write(
#         "Language models (LMs) are widely used by an increasing number of users, "
#         "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
#         "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
#         "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
#     )

#     st.markdown('### Content Categorization')
#     st.write(
#         "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
#         "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
#         "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
#     )

#     st.markdown('### Hallucination Prompts & FactBench Dataset')
#     st.write(
#         "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
#         "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
#         "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
#         "regularly updated with new prompts."
#     )

#     st.markdown('</div>', unsafe_allow_html=True)

# # # Tab 3: Links
# # with tab3:
# #     st.markdown('<div class="tab-content">', unsafe_allow_html=True)

# #     st.markdown('<div class="title">Submit your model information on our Github</div>',
# #                 unsafe_allow_html=True)

# #     st.markdown(
# #         '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
# #     st.markdown(
# #         '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')

# #     st.markdown('</div>', unsafe_allow_html=True)


# import streamlit as st
# import pandas as pd
# from PIL import Image
# import base64
# from io import BytesIO

# # Set up page config
# st.set_page_config(
#     page_title="VeriFact Leaderboard",
#     layout="wide"
# )

# # load header
# with open("_header.md", "r") as f:
#     HEADER_MD = f.read()

# # Load the image
# image = Image.open("test.png")
# logo_image = Image.open("./factrbench.png")

# # Custom CSS for the page
# st.markdown(
#     """
#     <style>
#     @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');

#     html, body, [class*="css"] {
#         font-family: 'Arial', sans-serif;
#         background-color: #f9f9f9;
#     }

#     .title {
#         font-size: 42px;
#         font-weight: bold;
#         text-align: center;
#         color: #333;
#         margin-bottom: 5px;
#     }

#     .description {
#         font-size: 22px;
#         text-align: center;
#         margin-bottom: 30px;
#         color: #555;
#     }

#     .header, .metric {
#         align-items: left;
#         margin-bottom: 20px;
#     }

#     .container {
#         max-width: 1000px;  
#         margin: 0 auto;  
#         padding: 5px;
#     }

#     table {
#         width: 100%;
#         border-collapse: collapse;
#         border-radius: 10px;
#         overflow: hidden;
#     }

#     th, td {
#         padding: 8px;
#         text-align: center;
#         border: 1px solid #ddd;
#         font-size: 16px;
#         transition: background-color 0.3s;
#     }

#     th {
#         background-color: #f2f2f2;
#         font-weight: bold;
#     }

#     td:hover {
#         background-color: #eaeaea;
#     }
#     </style>
#     """,
#     unsafe_allow_html=True
# )

# # Display logo
# buffered = BytesIO()
# logo_image.save(buffered, format="PNG")
# img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

# st.markdown(
#     f"""
#     <div class="logo-container" style="display:flex; justify-content: flex-start;">
#         <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
#     </div>
#     """,
#     unsafe_allow_html=True
# )

# st.markdown(
#     '''
#     <div class="header">
#         <br/>
#         <p style="font-size:22px;">
#         VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
#         </p>
#         <p style="font-size:20px;">
#             # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> 
#             ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
#         </p>
#     </div>
#     ''',
#     unsafe_allow_html=True
# )

# # Load the data
# data_path = "verifact_data.csv"
# df = pd.read_csv(data_path)

# # Assign ranks within each tier
# df['rank'] = df.groupby('tier')['Overall'].rank(
#     ascending=False, method='min').astype(int)

# df.fillna('-', inplace=True)
# df['original_order'] = df.groupby('tier').cumcount()

# # Tabs
# tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

# # Tab 1: Leaderboard
# with tab1:
#     st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)

#     st.markdown("""
#         <div class="metric" style="font-size:16px;">
#             <p>
#             <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.  
#             🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
#             </p>
#         </div>
#     """, unsafe_allow_html=True)

#     tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
#     selected_tier = st.selectbox('Select metric:', tiers)

#     if selected_tier != 'All Metrics':
#         filtered_df = df[df['tier'] == selected_tier]
#     else:
#         filtered_df = df

#     sort_by_factuality = st.checkbox('Sort by overall score')
#     if sort_by_factuality:
#         updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
#     else:
#         updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])

#     # Shrink the table: wrap it in a container and cap its maximum width
#     html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
#     html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"

#     current_tier = None
#     for _, row in updated_filtered_df.iterrows():
#         html += '<tr>'
#         if selected_tier == 'All Metrics' and row['tier'] != current_tier:
#             current_tier = row['tier']
#             html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
#         html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'

#     html += '</tbody></table></div>'
#     st.markdown(html, unsafe_allow_html=True)

# # Tab 2: Benchmark Details
# with tab2:
#     # Display the image centered
#     buffered_img = BytesIO()
#     image.save(buffered_img, format="PNG")
#     image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")

#     st.markdown(f'''<div style="text-align:center;">
#         <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
#     </div>''', unsafe_allow_html=True)

#     st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
#     st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")

#     st.markdown('### Content Categorization')
#     st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")

#     st.markdown('### Hallucination Prompts & FactBench Dataset')
#     st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")


import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# Page-level Streamlit settings: page title and wide layout.
st.set_page_config(page_title="VeriFact Leaderboard", layout="wide")

# Open both images up front: `image` is the figure shown later in the
# "Benchmark Details" tab, `logo_image` is the banner rendered just below.
image = Image.open("test.png")
logo_image = Image.open("./factrbench.png")

# Serialise the logo to PNG in memory and base64-encode it so it can be
# inlined as a data URI inside raw HTML.
logo_png = BytesIO()
logo_image.save(logo_png, format="PNG")
logo_b64 = base64.b64encode(logo_png.getvalue()).decode("utf-8")

# Flex container that centers the logo (50% wide, capped at 700px).
logo_html = (
    '\n'
    '    <div class="logo-container" style="display:flex; justify-content: center;">\n'
    f'        <img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>\n'
    '    </div>\n'
    '    '
)
st.markdown(logo_html, unsafe_allow_html=True)

# Header banner: the paper title followed by a links / version / model-count
# line, emitted as raw HTML.
# NOTE(review): the "# " in front of the Paper link is part of the string sent
# to the page, and all three hrefs are empty — confirm whether that line is
# still a placeholder.
header_html = (
    '\n'
    '    <div class="header">\n'
    '        <br/>\n'
    '        <p style="font-size:22px;">\n'
    '        VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts\n'
    '        </p>\n'
    '        <p style="font-size:20px;">\n'
    '            # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> \n'
    '            ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>\n'
    '        </p>\n'
    '    </div>\n'
    '    '
)
st.markdown(header_html, unsafe_allow_html=True)

# Load the leaderboard data. The columns read by this script are 'tier',
# 'model', 'FactBench', 'Reddit' and 'Overall'.
data_path = "verifact_data.csv"
df = pd.read_csv(data_path)

# Rank models within each tier by 'Overall', highest score first; with
# method='min', tied scores share the smallest rank of their group.
tier_ranks = df.groupby('tier')['Overall'].rank(ascending=False, method='min')

# rank() leaves NaN where 'Overall' is missing, and a blanket astype(int)
# raises on NaN. Convert value by value instead and use the same '-'
# placeholder that the fillna() below applies to every other missing cell.
df['rank'] = tier_ranks.map(lambda r: '-' if pd.isna(r) else int(r))

# Show missing cells as '-' in the rendered table.
df.fillna('-', inplace=True)

# Remember each row's position within its tier so the CSV order can be
# restored when the table is not sorted by score.
df['original_order'] = df.groupby('tier').cumcount()

def _leaderboard_table_html(table_df, show_metric_column):
    """Build the leaderboard as an HTML table string.

    Args:
        table_df: Rows to render, ordered so that rows sharing a 'tier' are
            adjacent. Reads the columns 'tier', 'rank', 'model', 'FactBench',
            'Reddit' and 'Overall'.
        show_metric_column: If true, add a leading "Metric" column whose cell
            is emitted once per tier and spans that tier's rows.

    Returns:
        The table markup wrapped in a centered, 60%-wide container.
    """
    # The metric cell has to span exactly the rows of its tier; a fixed
    # rowspan produces a malformed table whenever a tier has a different
    # number of rows.
    rows_per_tier = table_df['tier'].value_counts().to_dict()

    parts = [
        '\n'
        '        <div style="width: 60%; margin: 0 auto;">\n'
        '        <table style="width: 100%;">\n'
        '        ',
        '<thead><tr>',
    ]
    if show_metric_column:
        parts.append('<th>Metric</th>')
    parts.append(
        '<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th>'
        '<th>Overall</th></tr></thead><tbody>'
    )

    # NOTE: cell values are interpolated as-is (no HTML escaping).
    current_tier = None
    for _, row in table_df.iterrows():
        parts.append('<tr>')
        if show_metric_column and row['tier'] != current_tier:
            # First row of a new tier: emit the metric label once.
            current_tier = row['tier']
            parts.append(
                f'<td rowspan="{rows_per_tier[current_tier]}" '
                f'style="vertical-align: middle;">{current_tier}</td>'
            )
        parts.append(
            f'<td>{row["rank"]}</td><td>{row["model"]}</td>'
            f'<td>{row["FactBench"]}</td><td>{row["Reddit"]}</td>'
            f'<td>{row["Overall"]}</td></tr>'
        )

    parts.append('</tbody></table></div>')
    return ''.join(parts)


# Tabs
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

# Tab 1: Leaderboard
with tab1:
    st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)

    st.markdown("""
        <div class="metric" style="font-size:16px;">
            <p>
            <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
            <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
            <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
            This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
            </p>
        </div>
    """, unsafe_allow_html=True)

    # The selectable "metrics" are the values of the 'tier' column.
    tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
    selected_tier = st.selectbox('Select metric:', tiers)
    show_all_metrics = selected_tier == 'All Metrics'

    filtered_df = df if show_all_metrics else df[df['tier'] == selected_tier]

    sort_by_factuality = st.checkbox('Sort by overall score')
    if sort_by_factuality:
        # Missing scores were replaced by the '-' placeholder, which cannot be
        # compared with numbers. Sort on a numeric copy of 'Overall' instead,
        # so placeholder rows end up last within their tier.
        updated_filtered_df = (
            filtered_df
            .assign(_overall_sort=pd.to_numeric(filtered_df['Overall'], errors='coerce'))
            .sort_values(by=['tier', '_overall_sort'], ascending=[True, False])
            .drop(columns='_overall_sort')
        )
    else:
        # Keep the order in which the rows appear in the CSV within each tier.
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])

    html = _leaderboard_table_html(updated_filtered_df, show_all_metrics)
    st.markdown(html, unsafe_allow_html=True)

# Tab 2: Benchmark Details
with tab2:
    # Re-encode the figure as PNG in memory and inline it as a base64 data URI.
    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    encoded_figure = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

    # Centered wrapper at 65% of the available width; the image fills it.
    figure_html = (
        '<div style="text-align:center; width:65%; margin:0 auto;">\n'
        f'        <img src="data:image/png;base64,{encoded_figure}" style="width:100%; height:auto;" />\n'
        '    </div>'
    )
    st.markdown(figure_html, unsafe_allow_html=True)

    # (heading, paragraph) pairs rendered in order below the figure.
    sections = (
        (
            '### What is VERIFACT?',
            "VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.",
        ),
        (
            '### What is FACTRBENCH?',
            "FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.",
        ),
        (
            '### Key Findings',
            "VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.",
        ),
    )
    for heading, paragraph in sections:
        st.markdown(heading)
        st.write(paragraph)