import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# Set up page config
st.set_page_config(
    page_title="VeriFact Leaderboard",
    layout="wide"
)

# Load the images used on the page
image = Image.open("test.png")
logo_image = Image.open("./factrbench.png")

# Display the logo as an inline base64-encoded image
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
st.markdown(
    f"""
    <div style="text-align: center;">
        <img src="data:image/png;base64,{img_data}" alt="FACTRBENCH logo" width="400"/>
    </div>
    """,
    unsafe_allow_html=True
)
# Load the data
data_path = "verifact_data.csv"
df = pd.read_csv(data_path)
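# Expected columns: model, tier, FactBench, Reddit, Overall
# (tier holds the metric name: Precision, Recall, or F1)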
# Assign ranks within each tier
df['rank'] = df.groupby('tier')['Overall'].rank(
    ascending=False, method='min').astype(int)
df.fillna('-', inplace=True)
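# Remember the original CSV row order within each tier for the unsorted view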
df['original_order'] = df.groupby('tier').cumcount()
# Tabs
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
# Tab 1: Leaderboard
with tab1:
    st.markdown('### Metrics Explanation')
st.markdown("""
🎯 Precision measures the ratio of correct facts among all extracted facts.
🔎 Recall assesses how many reference facts are covered by model outputs.
⚖️ F1 balances precision and recall for comprehensive factual evaluation.
This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.
""", unsafe_allow_html=True)
    tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
    selected_tier = st.selectbox('Select metric:', tiers)

    if selected_tier != 'All Metrics':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    sort_by_factuality = st.checkbox('Sort by overall score', value=True)
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
    else:
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
    # Build the leaderboard table as raw HTML so the metric column can span rows
    html = '<table>'
    html += (
        '<tr>'
        + ('<th>Metric</th>' if selected_tier == 'All Metrics' else '')
        + '<th>Rank</th><th>Model</th><th>FactBench</th><th>Reddit</th><th>Overall</th>'
        + '</tr>'
    )
    tier_counts = updated_filtered_df['tier'].value_counts().to_dict()
    tier_rowspan_tracker = {}
    for _, row in updated_filtered_df.iterrows():
        html += '<tr>'
        if selected_tier == 'All Metrics':
            # Emit the metric cell once per tier, spanning all of its rows
            if row['tier'] not in tier_rowspan_tracker:
                rowspan = tier_counts[row['tier']]
                html += f'<td rowspan="{rowspan}">{row["tier"]}</td>'
                tier_rowspan_tracker[row['tier']] = True
        html += (
            f'<td>{row["rank"]}</td><td>{row["model"]}</td>'
            f'<td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td>'
        )
        html += '</tr>'
    html += '</table>'
    st.markdown(html, unsafe_allow_html=True)
# Tab 2: Benchmark Details
with tab2:
    # Embed the overview figure as an inline base64-encoded image
    buffered_img = BytesIO()
    image.save(buffered_img, format="PNG")
    image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
    st.markdown(
        f'''
        <div style="text-align: center;">
            <img src="data:image/png;base64,{image_data}" alt="VeriFact overview" style="max-width: 100%;"/>
        </div>
        ''',
        unsafe_allow_html=True
    )
    st.markdown('### What is VERIFACT?')
    st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and by capturing inter-sentence dependencies.")

    st.markdown('### What is FACTRBENCH?')
    st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, enabling evaluation of both precision and recall across 1,096 real-world prompts from FactBench and Reddit.")

    st.markdown('### Key Findings')
    st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall, which highlights the need to consider both.")