import streamlit as st
import pandas as pd
from PIL import Image

# Set up page config
st.set_page_config(
    page_title="FactBench Leaderboard",
    layout="wide",  # Layout remains wide, but content will be centered
)

# Load the image
image = Image.open("factEvalSteps.png")
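# Note: Image.open resolves this relative path against the current working directory,
# so this assumes the app is launched (e.g. `streamlit run app.py`) from the folder
# that contains factEvalSteps.png.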

# Custom CSS for the page
st.markdown(
    """
<style>
@import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');

html, body, [class*="css"] {
    font-family: 'Courier Prime', monospace;
}

.title {
    font-size: 42px;
    font-weight: bold;
    text-align: center;
    color: #333;
    margin-bottom: 5px;
}

.description {
    font-size: 22px;
    text-align: center;
    margin-bottom: 30px;
    color: #555;
}

.container {
    max-width: 1000px;  /* Set a max-width for the container */
    margin: 0 auto;     /* Center the container */
    padding: 20px;
}

table {
    width: 100%;
    border-collapse: collapse;
    border-radius: 10px;
    overflow: hidden;
}

th, td {
    padding: 8px;
    text-align: center;
    border: 1px solid #ddd;
    font-size: 14px;
    transition: background-color 0.3s;
}

th {
    background-color: #f2f2f2;
    font-weight: bold;
}

td:hover {
    background-color: #eaeaea;
}
</style>
""",
    unsafe_allow_html=True
)

# Display title and description
st.markdown('<div class="container">', unsafe_allow_html=True)
st.markdown('<div class="title">FactBench</div>',
            unsafe_allow_html=True)
st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</div>',
            unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)

# Load the data
# data_path = "factbench_data.csv"
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)
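# The leaderboard below assumes tiered_models_data.csv provides at least the columns
# referenced later: tier, model, factuality_score, hallucination_score, avg_tokens,
# avg_factual_units, avg_undecidable_units, avg_unsupported_units, and one
# "prompt_categories.<category name>" column per prompt category shown in the table.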

# Create tabs
tab1, tab2, tab3 = st.tabs(
    ["Leaderboard", "Benchmark Details", "Submit your models"])

# Tab 1: Leaderboard (earlier tier-grouped table, kept for reference;
# superseded by the implementation below)
# with tab1:
#     st.markdown('<div class="title">Leaderboard</div>',
#                 unsafe_allow_html=True)
#     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
#
#     # Dropdown menu to filter tiers
#     tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
#     selected_tier = st.selectbox('Select Tier:', tiers)
#
#     # Filter the data based on the selected tier
#     if selected_tier != 'All Tiers':
#         filtered_df = df[df['Tier'] == selected_tier]
#     else:
#         filtered_df = df
#
#     # Create HTML for the table
#     html = '''
#     <table>
#       <thead>
#         <tr>
#           <th>Tier</th>
#           <th>Model</th>
#           <th>FactScore</th>
#           <th>SAFE</th>
#           <th>Factcheck-GPT</th>
#           <th>VERIFY</th>
#         </tr>
#       </thead>
#       <tbody>
#     '''
#
#     # Generate the rows of the table
#     current_tier = None
#     for i, row in filtered_df.iterrows():
#         if row['Tier'] != current_tier:
#             if current_tier is not None:
#                 # Close the previous tier row
#                 html += '</tr>'
#             current_tier = row['Tier']
#             html += f'<tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
#         else:
#             html += '<tr>'
#
#         # Fill in model and scores
#         html += f'''
#           <td>{row['Model']}</td>
#           <td>{row['FactScore']:.2f}</td>
#           <td>{row['SAFE']:.2f}</td>
#           <td>{row['Factcheck-GPT']:.2f}</td>
#           <td>{row['VERIFY']:.2f}</td>
#         </tr>
#         '''
#
#     # Close the last row and table tags
#     html += '''
#       </tbody>
#     </table>
#     '''
#
#     # Display the table
#     st.markdown(html, unsafe_allow_html=True)
#     st.markdown('</div>', unsafe_allow_html=True)
df['rank'] = df['factuality_score'].rank(
    ascending=False, method='min').astype(int)
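# Ranks (1 = highest factuality_score; ties share the lower number via method='min')
# are computed over the full dataframe before any tier filtering, so each model keeps
# its global rank even when a single tier is selected in the leaderboard below.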

with tab1:
    st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    # Dropdown menu to filter tiers
    tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
    selected_tier = st.selectbox('Select Tier:', tiers)
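    # These labels are assumed to match the values stored in the CSV's `tier` column
    # exactly; if they differ, the filter below returns an empty table.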

    # Filter the data based on the selected tier
    if selected_tier != 'All Tiers':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    # Earlier radio-button sorting (ascending/descending), kept for reference
    # sort_order = st.radio('Sort by Factuality Score:',
    #                       ('Ascending', 'Descending'))
    #
    # # Sort the dataframe based on Factuality Score
    # if sort_order == 'Ascending':
    #     filtered_df = filtered_df.sort_values(
    #         by='factuality_score', ascending=True)
    # else:
    #     filtered_df = filtered_df.sort_values(
    #         by='factuality_score', ascending=False)

    # Option to sort by Factuality Score in descending order
    sort_by_factuality = st.checkbox('Sort by Factuality Score')

    # Sort the dataframe based on Factuality Score if the checkbox is selected
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by='factuality_score', ascending=False)
    else:
        updated_filtered_df = filtered_df

    # Create HTML for the table
    html = '''
<table>
  <thead>
    <tr>
      <th>Rank</th>
      <th>Tier</th>
      <th>Model</th>
      <th>Factuality Score</th>
      <th>Hallucination Score</th>
      <th>Avg Tokens</th>
      <th>Avg Factual Units</th>
      <th>Avg Undecidable Units</th>
      <th>Avg Unsupported Units</th>
      <th>Factual Recall</th>
      <th>Conceptual Understanding</th>
      <th>Procedural Execution</th>
      <th>Comparative Analysis</th>
      <th>Recommendations and Insights</th>
      <th>Domain-Specific Knowledge</th>
      <th>Temporal Context</th>
    </tr>
  </thead>
  <tbody>
'''

    # Generate the rows of the table
    current_tier = None
    for _, row in updated_filtered_df.iterrows():
        # Earlier tier-grouping logic with a rowspan cell, kept for reference:
        # if row['tier'] != current_tier:
        #     if current_tier is not None:
        #         html += '</tr>'
        #     current_tier = row['tier']
        #     # 7 models per tier; change this number when more models are added
        #     html += f'<tr><td rowspan="7" style="vertical-align: middle;">{current_tier}</td>'
        # else:
        #     html += '<tr>'
        html += '<tr>'

        # Fill in model and scores
        html += f'''
  <td>{row['rank']}</td>
  <td>{row['tier']}</td>
  <td>{row['model']}</td>
  <td>{row['factuality_score']:.2f}</td>
  <td>{row['hallucination_score']:.2f}</td>
  <td>{row['avg_tokens']:.2f}</td>
  <td>{row['avg_factual_units']:.2f}</td>
  <td>{row['avg_undecidable_units']:.2f}</td>
  <td>{row['avg_unsupported_units']:.2f}</td>
  <td>{row['prompt_categories.Factual Recall']:.2f}</td>
  <td>{row['prompt_categories.Conceptual Understanding']:.2f}</td>
  <td>{row['prompt_categories.Procedural Execution']:.2f}</td>
  <td>{row['prompt_categories.Comparative Analysis']:.2f}</td>
  <td>{row['prompt_categories.Recommendations and Insights']:.2f}</td>
  <td>{row['prompt_categories.Domain-Specific Knowledge']:.2f}</td>
  <td>{row['prompt_categories.Temporal Context']:.2f}</td>
</tr>
'''

    # Close the body and table tags
    html += '''
  </tbody>
</table>
'''

    # Display the table
    st.markdown(html, unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)

# Tab 2: Details
with tab2:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
    st.markdown('<div class="title">Benchmark Details</div>',
                unsafe_allow_html=True)
    st.image(image, use_column_width=True)
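    # Note: use_column_width is deprecated in newer Streamlit releases; on recent
    # versions use_container_width=True may be the preferred argument (version-dependent).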

    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
    st.write(
        "Language models (LMs) are widely used by an increasing number of users, "
        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
    )

    st.markdown('### Content Categorization')
    st.write(
        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
    )

    st.markdown('### Hallucination Prompts & FactBench Dataset')
    st.write(
        "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
        "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
        "regularly updated with new prompts."
    )
    st.markdown('</div>', unsafe_allow_html=True)

# Tab 3: Links
with tab3:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
    st.markdown('<div class="title">Submit your model information on our Github</div>',
                unsafe_allow_html=True)
    st.markdown(
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
    st.markdown(
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
    st.markdown('</div>', unsafe_allow_html=True)