import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# Set up page config
st.set_page_config(
    page_title="FactBench Leaderboard",
    layout="wide"
)
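# Note: st.set_page_config must be the first Streamlit command in the script
# and may only be called once.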

# Load the images used on the page
image = Image.open("factEvalSteps.png")
logo_image = Image.open("Factbench_logo.png")

# Custom CSS for the page
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');

    html, body, [class*="css"] {
        font-family: 'Courier Prime', monospace;
        background-color: #f9f9f9; /* Light grey background */
    }
    .title {
        font-size: 42px;
        font-weight: bold;
        text-align: center;
        color: #333;
        margin-bottom: 5px;
    }

    .description {
        font-size: 22px;
        text-align: center;
        margin-bottom: 30px;
        color: #555;
    }

    .container {
        max-width: 1000px;
        margin: 0 auto;
        padding: 20px;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        border-radius: 10px;
        overflow: hidden;
    }

    th, td {
        padding: 8px;
        text-align: center;
        border: 1px solid #ddd;
        font-size: 14px;
        transition: background-color 0.3s;
    }

    th {
        background-color: #f2f2f2;
        font-weight: bold;
    }

    td:hover {
        background-color: #eaeaea;
    }
    </style>
    """,
    unsafe_allow_html=True
)

# Display title and description
st.markdown('<div class="container">', unsafe_allow_html=True)

# Convert the logo to base64 so it can be embedded in the custom HTML below
# (st.image cannot be styled with the CSS classes defined here)
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

st.markdown(
    f"""
    <style>
    .logo-container {{
        display: flex;
        justify-content: flex-start; /* Aligns to the left */
    }}
    .logo-container img {{
        width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
        max-width: 300px; /* Set a maximum width */
    }}
    </style>
    <div class="logo-container">
        <img src="data:image/png;base64,{img_data}" alt="FactBench Leaderboard Logo">
    </div>
    """,
    unsafe_allow_html=True
)
# st.markdown('<div class="title">FactBench Leaderboard</div>',
#             unsafe_allow_html=True)
st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</div>',
            unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)

# Load the data
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)
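# Columns expected in tiered_models_data.csv (inferred from usage below):
# tier, model, factuality_score, hallucination_score, avg_tokens,
# avg_factual_units, avg_undecidable_units, avg_unsupported_units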

# Assign ranks within each tier based on factuality_score
df['rank'] = df.groupby('tier')['factuality_score'].rank(
    ascending=False, method='min').astype(int)
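# method='min' gives tied models the same (best) rank,
# e.g. scores [0.92, 0.92, 0.85] -> ranks [1, 1, 3]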

# Replace NaN values with '-'
df.fillna('-', inplace=True)
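
# Record each model's CSV row order within its tier so the default
# (unsorted) view can preserve it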
df['original_order'] = df.groupby('tier').cumcount()

# Create tabs
tab1, tab2, tab3 = st.tabs(
    ["Leaderboard", "Benchmark Details", "Submit your models"])

# Tab 1: Leaderboard
with tab1:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('## Metric Explanation')
    st.markdown('@Farima populate here')

    # Dropdown menu to filter tiers
    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
    selected_tier = st.selectbox('Select Tier:', tiers)

    # Filter the data based on the selected tier
    if selected_tier != 'All Tiers':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    sort_by_factuality = st.checkbox('Sort by Factuality Score')

    # Sort the dataframe by factuality score if the checkbox is selected
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'factuality_score'], ascending=[True, False]
        )
    else:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'original_order']
        )

    # Create HTML for the table (include a Tier column only for 'All Tiers')
    if selected_tier == 'All Tiers':
        html = '''
        <table>
            <thead>
                <tr>
                    <th>Tier</th>
                    <th>Rank</th>
                    <th>Model</th>
                    <th>Factuality Score</th>
                    <th>Hallucination Score</th>
                    <th># Tokens</th>
                    <th># Factual</th>
                    <th># Undecidable</th>
                    <th># Unsupported</th>
                </tr>
            </thead>
            <tbody>
        '''
    else:
        html = '''
        <table>
            <thead>
                <tr>
                    <th>Rank</th>
                    <th>Model</th>
                    <th>Factuality Score</th>
                    <th>Hallucination Score</th>
                    <th># Tokens</th>
                    <th># Factual</th>
                    <th># Undecidable</th>
                    <th># Unsupported</th>
                </tr>
            </thead>
            <tbody>
        '''

    # Generate the rows of the table
    def fmt(value):
        # Format numeric cells to two decimals; pass the '-' placeholder
        # (from fillna above) through unchanged
        return f'{value:.2f}' if isinstance(value, float) else value

    tier_sizes = updated_filtered_df['tier'].value_counts()
    current_tier = None
    for _, row in updated_filtered_df.iterrows():
        html += '<tr>'
        # Only display the 'Tier' cell once per tier, spanning all of its rows
        if selected_tier == 'All Tiers':
            if row['tier'] != current_tier:
                current_tier = row['tier']
                html += (f'<td rowspan="{tier_sizes[current_tier]}" '
                         f'style="vertical-align: middle;">{current_tier}</td>')
        # Fill in model and scores
        html += f'''
            <td>{row['rank']}</td>
            <td>{row['model']}</td>
            <td>{row['factuality_score']}</td>
            <td>{row['hallucination_score']}</td>
            <td>{row['avg_tokens']}</td>
            <td>{row['avg_factual_units']}</td>
            <td>{fmt(row['avg_undecidable_units'])}</td>
            <td>{fmt(row['avg_unsupported_units'])}</td>
        </tr>
        '''

    # Close the table
    html += '''
            </tbody>
        </table>
    '''

    # Display the table
    st.markdown(html, unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)

# Tab 2: Details
with tab2:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
    st.markdown('<div class="title">Benchmark Details</div>',
                unsafe_allow_html=True)
    st.image(image, use_column_width=True)

    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
    st.write(
        "Language models (LMs) are widely used by an increasing number of users, "
        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
    )

    st.markdown('### Content Categorization')
    st.write(
        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
    )

    st.markdown('### Hallucination Prompts & FactBench Dataset')
    st.write(
        "Using VERIFY, we identify 'hallucination prompts' across diverse topics: prompts that elicit the "
        "highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of "
        "985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in "
        "real-world LM interactions and is regularly updated with new prompts."
    )
    st.markdown('</div>', unsafe_allow_html=True)

# Tab 3: Links
with tab3:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
    st.markdown('<div class="title">Submit your model information on our GitHub</div>',
                unsafe_allow_html=True)
    st.markdown(
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
    st.markdown(
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
    st.markdown('</div>', unsafe_allow_html=True)
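
# To preview the app locally (filename assumed; use this script's actual name):
#   streamlit run app.py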