import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# Set up page config
st.set_page_config(
    page_title="FactBench Leaderboard",
    layout="wide"
)
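# Note: st.set_page_config must be the first Streamlit command in the script
# and may only be called once.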

# Load the images used on the page
image = Image.open("factEvalSteps.png")
logo_image = Image.open("Factbench_logo.png")

# Custom CSS for the page
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');

    html, body, [class*="css"] {
        font-family: 'Courier Prime', monospace;
        background-color: #f9f9f9; /* Light grey background */
    }
    .title {
        font-size: 42px;
        font-weight: bold;
        text-align: center;
        color: #333;
        margin-bottom: 5px;
    }

    .description {
        font-size: 22px;
        text-align: center;
        margin-bottom: 30px;
        color: #555;
    }

    .container {
        max-width: 1000px;
        margin: 0 auto;
        padding: 20px;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        border-radius: 10px;
        overflow: hidden;
    }

    th, td {
        padding: 8px;
        text-align: center;
        border: 1px solid #ddd;
        font-size: 14px;
        transition: background-color 0.3s;
    }

    th {
        background-color: #f2f2f2;
        font-weight: bold;
    }

    td:hover {
        background-color: #eaeaea;
    }
    </style>
    """,
    unsafe_allow_html=True
)

# Display title and description
st.markdown('<div class="container">', unsafe_allow_html=True)

# Convert the logo to base64 so it can be embedded in the custom HTML below
# (st.image cannot be styled with the CSS classes defined here)
buffered = BytesIO()
logo_image.save(buffered, format="PNG")
img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

st.markdown(
    f"""
    <style>
    .logo-container {{
        display: flex;
        justify-content: flex-start; /* Aligns to the left */
    }}
    .logo-container img {{
        width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
        max-width: 300px; /* Set a maximum width */
    }}
    </style>
    <div class="logo-container">
        <img src="data:image/png;base64,{img_data}" alt="FactBench Leaderboard Logo">
    </div>
    """,
    unsafe_allow_html=True
)
# st.markdown('<div class="title">FactBench Leaderboard</div>',
#             unsafe_allow_html=True)
st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</div>',
            unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)

# Load the data
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)
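# Columns expected in tiered_models_data.csv (inferred from usage below):
# tier, model, factuality_score, hallucination_score, avg_tokens,
# avg_factual_units, avg_undecidable_units, avg_unsupported_units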

# Assign ranks within each tier based on factuality_score
df['rank'] = df.groupby('tier')['factuality_score'].rank(
    ascending=False, method='min').astype(int)
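# method='min' gives tied models the same (best) rank,
# e.g. scores [0.92, 0.92, 0.85] -> ranks [1, 1, 3]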

# Replace NaN values with '-'
df.fillna('-', inplace=True)
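
# Record each model's CSV row order within its tier so the default
# (unsorted) view can preserve it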
df['original_order'] = df.groupby('tier').cumcount()

# Create tabs
tab1, tab2, tab3 = st.tabs(
    ["Leaderboard", "Benchmark Details", "Submit your models"])

# Tab 1: Leaderboard
with tab1:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('## Metric Explanation')
    st.markdown('@Farima populate here')

    # Dropdown menu to filter tiers
    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
    selected_tier = st.selectbox('Select Tier:', tiers)

    # Filter the data based on the selected tier
    if selected_tier != 'All Tiers':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    sort_by_factuality = st.checkbox('Sort by Factuality Score')

    # Sort the dataframe by factuality score if the checkbox is selected
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'factuality_score'], ascending=[True, False]
        )
    else:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'original_order']
        )

    # Create HTML for the table (include a Tier column only for 'All Tiers')
    if selected_tier == 'All Tiers':
        html = '''
        <table>
            <thead>
                <tr>
                    <th>Tier</th>
                    <th>Rank</th>
                    <th>Model</th>
                    <th>Factuality Score</th>
                    <th>Hallucination Score</th>
                    <th># Tokens</th>
                    <th># Factual</th>
                    <th># Undecidable</th>
                    <th># Unsupported</th>
                </tr>
            </thead>
            <tbody>
        '''
    else:
        html = '''
        <table>
            <thead>
                <tr>
                    <th>Rank</th>
                    <th>Model</th>
                    <th>Factuality Score</th>
                    <th>Hallucination Score</th>
                    <th># Tokens</th>
                    <th># Factual</th>
                    <th># Undecidable</th>
                    <th># Unsupported</th>
                </tr>
            </thead>
            <tbody>
        '''

    # Generate the rows of the table
    def fmt(value):
        # Format numeric cells to two decimals; pass the '-' placeholder
        # (from fillna above) through unchanged
        return f'{value:.2f}' if isinstance(value, float) else value

    tier_sizes = updated_filtered_df['tier'].value_counts()
    current_tier = None
    for _, row in updated_filtered_df.iterrows():
        html += '<tr>'
        # Only display the 'Tier' cell once per tier, spanning all of its rows
        if selected_tier == 'All Tiers':
            if row['tier'] != current_tier:
                current_tier = row['tier']
                html += (f'<td rowspan="{tier_sizes[current_tier]}" '
                         f'style="vertical-align: middle;">{current_tier}</td>')
        # Fill in model and scores
        html += f'''
            <td>{row['rank']}</td>
            <td>{row['model']}</td>
            <td>{row['factuality_score']}</td>
            <td>{row['hallucination_score']}</td>
            <td>{row['avg_tokens']}</td>
            <td>{row['avg_factual_units']}</td>
            <td>{fmt(row['avg_undecidable_units'])}</td>
            <td>{fmt(row['avg_unsupported_units'])}</td>
        </tr>
        '''

    # Close the table
    html += '''
            </tbody>
        </table>
    '''

    # Display the table
    st.markdown(html, unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)

# Tab 2: Details
with tab2:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
    st.markdown('<div class="title">Benchmark Details</div>',
                unsafe_allow_html=True)
    st.image(image, use_column_width=True)

    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
    st.write(
        "Language models (LMs) are widely used by an increasing number of users, "
        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
    )

    st.markdown('### Content Categorization')
    st.write(
        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
    )

    st.markdown('### Hallucination Prompts & FactBench Dataset')
    st.write(
        "Using VERIFY, we identify 'hallucination prompts' across diverse topics: prompts that elicit the "
        "highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of "
        "985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in "
        "real-world LM interactions and is regularly updated with new prompts."
    )
    st.markdown('</div>', unsafe_allow_html=True)

# Tab 3: Links
with tab3:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
    st.markdown('<div class="title">Submit your model information on our GitHub</div>',
                unsafe_allow_html=True)
    st.markdown(
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
    st.markdown(
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
    st.markdown('</div>', unsafe_allow_html=True)
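
# To preview the app locally (filename assumed; use this script's actual name):
#   streamlit run app.py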