import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# Set up page config
st.set_page_config(
    page_title="VeriFact Leaderboard",
    layout="wide"
)
# Load the images
image = Image.open("test.png")
logo_image = Image.open("./factrbench.png")
# Display logo
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

st.markdown(
    f"""
    <div class="logo-container" style="display:flex; justify-content: center;">
        <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
    </div>
    """,
    unsafe_allow_html=True
)
st.markdown(
    '''
    <div class="header">
        <br/>
        <p style="font-size:22px;">
            VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
        </p>
        <p style="font-size:20px;">
            📄 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
            ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>April 2025</strong>
        </p>
    </div>
    ''',
    unsafe_allow_html=True
)
# Load the data
data_path = "verifact_data.csv"
df = pd.read_csv(data_path)

# Assign ranks within each tier (ties share the lowest rank)
df['rank'] = df.groupby('tier')['Overall'].rank(
    ascending=False, method='min').astype(int)
df.fillna('-', inplace=True)
# Preserve the original row order within each tier for the unsorted view
df['original_order'] = df.groupby('tier').cumcount()

# Tabs
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
# Tab 1: Leaderboard
with tab1:
    st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
    st.markdown("""
        <div class="metric" style="font-size:16px;">
        <p>
        <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
        <strong> 🔍 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
        <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
        This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
        </p>
        </div>
    """, unsafe_allow_html=True)

    tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
    selected_tier = st.selectbox('Select metric:', tiers)

    if selected_tier != 'All Metrics':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    sort_by_factuality = st.checkbox('Sort by overall score', value=True)
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
    else:
        updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
    # Build the leaderboard table as raw HTML so the tier column can span rows
    html = '''
    <div style="width: 60%; margin: 0 auto;">
    <table style="width: 100%;">
    '''
    html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>FactBench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"

    tier_counts = updated_filtered_df['tier'].value_counts().to_dict()
    tier_rowspan_tracker = {}
    for _, row in updated_filtered_df.iterrows():
        html += '<tr>'
        if selected_tier == 'All Metrics':
            # Emit the tier cell only once per tier, spanning all of that tier's rows
            if row['tier'] not in tier_rowspan_tracker:
                rowspan = tier_counts[row['tier']]
                html += f'<td rowspan="{rowspan}" style="vertical-align: middle;">{row["tier"]}</td>'
                tier_rowspan_tracker[row['tier']] = True
        html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
    html += '</tbody></table></div>'

    st.markdown(html, unsafe_allow_html=True)
# Tab 2: Benchmark Details
with tab2:
    buffered_img = BytesIO()
    image.save(buffered_img, format="PNG")
    image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")

    st.markdown(f'''<div style="text-align:center; width:65%; margin:0 auto;">
        <img src="data:image/png;base64,{image_data}" style="width:100%; height:auto;" />
    </div>''', unsafe_allow_html=True)
    st.markdown('### What is VERIFACT?')
    st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and by capturing inter-sentence dependencies.")

    st.markdown('### What is FACTRBENCH?')
    st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, enabling both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")

    st.markdown('### Key Findings')
    st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always imply high recall, highlighting the need to consider both.")