factrbench

Running

App Files Files Community

shezamunir committed on Oct 23, 2024

Commit

0aa9325

1 Parent(s): 05b28bf

updated app according to edit suggestions

Browse files

Files changed (2) hide show

app.py +89 -131
tiered_models_data.csv +21 -21

app.py CHANGED Viewed

@@ -37,13 +37,11 @@ st.markdown(
     }
     .container {
-        max-width: 1000px;  /* Set a max-width for the container */
-        margin: 0 auto;  /* Center the container */
         padding: 20px;
     }
     table {
         width: 100%;
         border-collapse: collapse;
@@ -81,86 +79,31 @@ st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</di
 st.markdown('</div>', unsafe_allow_html=True)
 # Load the data
-# data_path = "factbench_data.csv"
 data_path = "tiered_models_data.csv"
 df = pd.read_csv(data_path)
 # Create tabs
 tab1, tab2, tab3 = st.tabs(
     ["Leaderboard", "Benchmark Details", "Submit your models"])
 # Tab 1: Leaderboard
-# with tab1:
-#     st.markdown('<div class="title">Leaderboard</div>',
-#                 unsafe_allow_html=True)
-#     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
-#     # Dropdown menu to filter tiers
-#     tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
-#     selected_tier = st.selectbox('Select Tier:', tiers)
-#     # Filter the data based on the selected tier
-#     if selected_tier != 'All Tiers':
-#         filtered_df = df[df['Tier'] == selected_tier]
-#     else:
-#         filtered_df = df
-#     # Create HTML for the table
-#     html = '''
-#     <table>
-#         <thead>
-#             <tr>
-#                 <th>Tier</th>
-#                 <th>Model</th>
-#                 <th>FactScore</th>
-#                 <th>SAFE</th>
-#                 <th>Factcheck-GPT</th>
-#                 <th>VERIFY</th>
-#             </tr>
-#         </thead>
-#         <tbody>
-#     '''
-#     # Generate the rows of the table
-#     current_tier = None
-#     for i, row in filtered_df.iterrows():
-#         if row['Tier'] != current_tier:
-#             if current_tier is not None:
-#                 # Close the previous tier row
-#                 html += '    </tr>'
-#             current_tier = row['Tier']
-#             html += f'    <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
-#         else:
-#             html += '    <tr>'
-#         # Fill in model and scores
-#         html += f'''
-#             <td>{row['Model']}</td>
-#             <td>{row['FactScore']:.2f}</td>
-#             <td>{row['SAFE']:.2f}</td>
-#             <td>{row['Factcheck-GPT']:.2f}</td>
-#             <td>{row['VERIFY']:.2f}</td>
-#         </tr>
-#     '''
-#     # Close the last row and table tags
-#     html += '''
-#     </table>
-#     '''
-#     # Display the table
-#     st.markdown(html, unsafe_allow_html=True)
-#     st.markdown('</div>', unsafe_allow_html=True)
-df['rank'] = df['factuality_score'].rank(
-    ascending=False, method='min').astype(int)
 with tab1:
     st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
     # Dropdown menu to filter tiers
-    tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
     selected_tier = st.selectbox('Select Tier:', tiers)
     # Filter the data based on the selected tier
@@ -168,84 +111,98 @@ with tab1:
         filtered_df = df[df['tier'] == selected_tier]
     else:
         filtered_df = df
-    # Add sorting functionality for Factuality Score
-    # sort_order = st.radio('Sort by Factuality Score:',
-    #                       ('Ascending', 'Descending'))
-    # # Sort the dataframe based on Factuality Score
-    # if sort_order == 'Ascending':
-    #     filtered_df = filtered_df.sort_values(
-    #         by='factuality_score', ascending=True)
-    # else:
-    #     filtered_df = filtered_df.sort_values(
-    #         by='factuality_score', ascending=False)
-    # Option to sort by Factuality Score in ascending order
     sort_by_factuality = st.checkbox('Sort by Factuality Score')
     # Sort the dataframe based on Factuality Score if the checkbox is selected
     if sort_by_factuality:
         updated_filtered_df = filtered_df.sort_values(
-            by='factuality_score', ascending=False)
     else:
-        updated_filtered_df = filtered_df
     # Create HTML for the table
-    html = '''
-    <table>
-        <thead>
-            <tr>
-                <th>Rank</th>
-                <th>Tier</th>
-                <th>Model</th>
-                <th>Factuality Score</th>
-                <th>Hallucination Score</th>
-                <th>Avg Tokens</th>
-                <th>Avg Factual Units</th>
-                <th>Avg Undecidable Units</th>
-                <th>Avg Unsupported Units</th>
-                <th>Factual Recall</th>
-                <th>Conceptual Understanding</th>
-                <th>Procedural Execution</th>
-                <th>Comparative Analysis</th>
-                <th>Recommendations and Insights</th>
-                <th>Domain-Specific Knowledge</th>
-                <th>Temporal Context</th>
-            </tr>
-        </thead>
-        <tbody>
-    '''
     # Generate the rows of the table
     current_tier = None
     for i, row in updated_filtered_df.iterrows():
-        # if row['tier'] != current_tier:
-        #     if current_tier is not None:
-        #         html += '    </tr>'
-        #     current_tier = row['tier']
-        #     # 7 models, change this number when more models
-        #     html += f'    <tr><td rowspan="7" style="vertical-align: middle;">{current_tier}</td>'
-        # else:
-        #     html += '    <tr>'
-        html += '    <tr>'
         # Fill in model and scores
         html += f'''
             <td>{row['rank']}</td>
-            <td>{row['tier']}</td>
             <td>{row['model']}</td>
-            <td>{row['factuality_score']:.2f}</td>
-            <td>{row['hallucination_score']:.2f}</td>
-            <td>{row['avg_tokens']:.2f}</td>
-            <td>{row['avg_factual_units']:.2f}</td>
             <td>{row['avg_undecidable_units']:.2f}</td>
             <td>{row['avg_unsupported_units']:.2f}</td>
-            <td>{row['prompt_categories.Factual Recall']:.2f}</td>
-            <td>{row['prompt_categories.Conceptual Understanding']:.2f}</td>
-            <td>{row['prompt_categories.Procedural Execution']:.2f}</td>
-            <td>{row['prompt_categories.Comparative Analysis']:.2f}</td>
-            <td>{row['prompt_categories.Recommendations and Insights']:.2f}</td>
-            <td>{row['prompt_categories.Domain-Specific Knowledge']:.2f}</td>
-            <td>{row['prompt_categories.Temporal Context']:.2f}</td>
         </tr>
     '''
@@ -258,6 +215,7 @@ with tab1:
     st.markdown(html, unsafe_allow_html=True)
     st.markdown('</div>', unsafe_allow_html=True)
 # Tab 2: Details
 with tab2:
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)

     }
     .container {
+        max-width: 1000px;
+        margin: 0 auto;
         padding: 20px;
     }
     table {
         width: 100%;
         border-collapse: collapse;
 st.markdown('</div>', unsafe_allow_html=True)
 # Load the data
 data_path = "tiered_models_data.csv"
 df = pd.read_csv(data_path)
+# Assign ranks within each tier based on factuality_score
+df['rank'] = df.groupby('tier')['factuality_score'].rank(
+    ascending=False, method='min').astype(int)
+# Replace NaN values with '-'
+df.fillna('-', inplace=True)
+df['original_order'] = df.groupby('tier').cumcount()
 # Create tabs
 tab1, tab2, tab3 = st.tabs(
     ["Leaderboard", "Benchmark Details", "Submit your models"])
 # Tab 1: Leaderboard
 with tab1:
+    # df['original_order'] = df.groupby('tier').cumcount()
+    # print(df['original_order'])
     st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
     # Dropdown menu to filter tiers
+    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
     selected_tier = st.selectbox('Select Tier:', tiers)
     # Filter the data based on the selected tier
         filtered_df = df[df['tier'] == selected_tier]
     else:
         filtered_df = df
     sort_by_factuality = st.checkbox('Sort by Factuality Score')
     # Sort the dataframe based on Factuality Score if the checkbox is selected
     if sort_by_factuality:
         updated_filtered_df = filtered_df.sort_values(
+            by=['tier', 'factuality_score'], ascending=[True, False]
+        )
     else:
+        updated_filtered_df = filtered_df.sort_values(
+            by=['tier', 'original_order']
+        )
     # Create HTML for the table
+    if selected_tier == 'All Tiers':
+        html = '''
+        <table>
+            <thead>
+                <tr>
+                    <th>Tier</th>
+                    <th>Rank</th>
+                    <th>Model</th>
+                    <th>Factuality Score</th>
+                    <th>Factual Recall</th>
+                    <th>Conceptual Understanding</th>
+                    <th>Procedural Execution</th>
+                    <th>Comparative Analysis</th>
+                    <th>Recommendations and Insights</th>
+                    <th>Domain-Specific Knowledge</th>
+                    <th>Temporal Context</th>
+                    <th>Hallucination Score</th>
+                    <th># Tokens</th>
+                    <th># Factual</th>
+                    <th># Undecidable</th>
+                    <th># Unsupported</th>
+                </tr>
+            </thead>
+            <tbody>
+        '''
+    else:
+        html = '''
+        <table>
+            <thead>
+                <tr>
+                    <th>Rank</th>
+                    <th>Model</th>
+                    <th>Factuality Score</th>
+                    <th>Factual Recall</th>
+                    <th>Conceptual Understanding</th>
+                    <th>Procedural Execution</th>
+                    <th>Comparative Analysis</th>
+                    <th>Recommendations and Insights</th>
+                    <th>Domain-Specific Knowledge</th>
+                    <th>Temporal Context</th>
+                    <th>Hallucination Score</th>
+                    <th># Tokens</th>
+                    <th># Factual</th>
+                    <th># Undecidable</th>
+                    <th># Unsupported</th>
+                </tr>
+            </thead>
+            <tbody>
+        '''
     # Generate the rows of the table
     current_tier = None
     for i, row in updated_filtered_df.iterrows():
+        html += '<tr>'
+        # Only display the 'Tier' column if 'All Tiers' is selected
+        if selected_tier == 'All Tiers':
+            if row['tier'] != current_tier:
+                current_tier = row['tier']
+                html += f'<td rowspan="7" style="vertical-align: middle;">{current_tier}</td>'
         # Fill in model and scores
         html += f'''
             <td>{row['rank']}</td>
             <td>{row['model']}</td>
+            <td>{row['factuality_score']}</td>
+            <td>{row['prompt_categories.Factual Recall']}</td>
+            <td>{row['prompt_categories.Conceptual Understanding']}</td>
+            <td>{row['prompt_categories.Procedural Execution']}</td>
+            <td>{row['prompt_categories.Comparative Analysis']}</td>
+            <td>{row['prompt_categories.Recommendations and Insights']}</td>
+            <td>{row['prompt_categories.Domain-Specific Knowledge']}</td>
+            <td>{row['prompt_categories.Temporal Context']}</td>
+            <td>{row['hallucination_score']}</td>
+            <td>{row['avg_tokens']}</td>
+            <td>{row['avg_factual_units']}</td>
             <td>{row['avg_undecidable_units']:.2f}</td>
             <td>{row['avg_unsupported_units']:.2f}</td>
         </tr>
     '''
     st.markdown(html, unsafe_allow_html=True)
     st.markdown('</div>', unsafe_allow_html=True)
 # Tab 2: Details
 with tab2:
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)

tiered_models_data.csv CHANGED Viewed

@@ -1,23 +1,23 @@
 tier,model,factuality_score,hallucination_score,avg_tokens,avg_factual_units,avg_undecidable_units,avg_unsupported_units,prompt_categories.Factual Recall,prompt_categories.Conceptual Understanding,prompt_categories.Procedural Execution,prompt_categories.Comparative Analysis,prompt_categories.Recommendations and Insights,prompt_categories.Domain-Specific Knowledge,prompt_categories.Temporal Context
-Tier 1: Easy,gpt4-o,75.69,0.64,561.72,23.91,4.61,1.01,76.49,78.49,66.14,76.13,76.3,75.91,69.52
-Tier 1: Easy,gemini,73.81,0.68,516.41,22.23,4.47,1.12,73.35,79.39,66.7,72.44,73.64,74.31,71.42
-Tier 1: Easy,llama3.1_70B_instruct,70.01,0.89,531.35,27.09,5.67,2.13,68.99,75.38,64.73,70.34,70.03,70.64,56.61
-Tier 1: Easy,llama3.1_405B_instruct,68.64,0.93,550.74,26.6,6.15,2.19,66.07,74.67,65.88,70.18,68.29,70.91,49.97
-Tier 1: Easy,claude-3.5-sonnet,74.95,0.65,395.77,22.64,4.03,1.19,74.84,77.74,69.55,74.87,75.3,76.4,64.19
-Tier 1: Easy,commandR+,73.15,0.71,440.93,23.55,4.51,1.4,69.41,80.24,68.98,74.36,73.53,73.02,66.43
-Tier 1: Easy,mistral-large-2,75.19,0.67,485.58,23.21,4.09,1.36,75.87,78.32,63.98,77.17,75.5,76.38,65.8
-Tier 2: Moderate,gpt4-o,80.72,0.5,624.67,24.42,3.59,0.89,80.06,84.33,72.83,79.75,81.5,81.1,70.02
-Tier 2: Moderate,gemini,78.02,0.57,565.97,22.16,3.71,0.97,74.13,81.74,73.13,77.32,78.37,80.04,68.03
-Tier 2: Moderate,llama3.1_70B_instruct,75.76,0.71,607.44,25.35,4.33,1.76,63.87,77.92,72.94,78.67,79.56,76.83,47.71
-Tier 2: Moderate,llama3.1_405B_instruct,75.05,0.7,599.3,25.24,4.74,1.41,67.96,78.09,68.51,76.16,77.31,76.25,65.43
-Tier 2: Moderate,claude-3.5-sonnet,79.92,0.54,414.32,22.15,3.32,1.09,75.88,83.52,77.39,79.31,81.06,78.81,72.47
-Tier 2: Moderate,commandR+,80.71,0.52,483.32,24.1,3.17,1.09,73.49,85.46,75.6,82.97,82.12,81.61,58.49
-Tier 2: Moderate,mistral-large-2,79.97,0.52,528.44,22.65,3.21,1.02,77.21,81.23,75.2,81.24,80.86,82.03,63.63
-Tier 3: Hard,gpt4-o,91.63,0.26,640.84,29.29,2.01,0.53,94.31,93.62,82.98,89.19,91.86,94.12
-Tier 3: Hard,gemini,89.86,0.31,551.81,25.6,1.88,0.71,92.61,90.34,83.32,87.39,90.93,95.23
-Tier 3: Hard,llama3.1_70B_instruct,89.3,0.33,607.75,31.38,2.08,0.83,75.5,91.75,83.61,87.11,93.03,93.08
-Tier 3: Hard,llama3.1_405B_instruct,86.57,0.4,599.87,30.12,2.88,0.85,79.58,88.92,75.23,85.11,89.2,90.21,100.0
-Tier 3: Hard,claude-3.5-sonnet,89.61,0.3,411.2,26.72,1.49,0.81,89.85,92.45,75.13,86.48,91.46,91.97,100.0
-Tier 3: Hard,commandR+,91.65,0.25,499.06,27.95,1.57,0.54,87.71,91.8,87.16,89.79,94.12,93.85,100.0
-Tier 3: Hard,mistral-large-2,92.0,0.25,523.57,27.8,1.8,0.55,92.96,92.33,90.58,89.41,92.81,92.41,100.0

 tier,model,factuality_score,hallucination_score,avg_tokens,avg_factual_units,avg_undecidable_units,avg_unsupported_units,prompt_categories.Factual Recall,prompt_categories.Conceptual Understanding,prompt_categories.Procedural Execution,prompt_categories.Comparative Analysis,prompt_categories.Recommendations and Insights,prompt_categories.Domain-Specific Knowledge,prompt_categories.Temporal Context
+Tier 1: Hard,GPT4-o,75.69,0.64,561.72,23.91,4.61,1.01,76.49,78.49,66.14,76.13,76.3,75.91,69.52
+Tier 1: Hard,Gemini1.5-Pro,73.81,0.68,516.41,22.23,4.47,1.12,73.35,79.39,66.7,72.44,73.64,74.31,71.42
+Tier 1: Hard,Llama3.1-70B-Instruct,70.01,0.89,531.35,27.09,5.67,2.13,68.99,75.38,64.73,70.34,70.03,70.64,56.61
+Tier 1: Hard,Llama3.1-405B-Instruct,68.64,0.93,550.74,26.6,6.15,2.19,66.07,74.67,65.88,70.18,68.29,70.91,49.97
+Tier 1: Hard,Claude-3.5-Sonnet,74.95,0.65,395.77,22.64,4.03,1.19,74.84,77.74,69.55,74.87,75.3,76.4,64.19
+Tier 1: Hard,CommandR+,73.15,0.71,440.93,23.55,4.51,1.4,69.41,80.24,68.98,74.36,73.53,73.02,66.43
+Tier 1: Hard,Mistral-Large-2,75.19,0.67,485.58,23.21,4.09,1.36,75.87,78.32,63.98,77.17,75.5,76.38,65.8
+Tier 2: Moderate,GPT4-o,80.72,0.5,624.67,24.42,3.59,0.89,80.06,84.33,72.83,79.75,81.5,81.1,70.02
+Tier 2: Moderate,Gemini1.5-Pro,78.02,0.57,565.97,22.16,3.71,0.97,74.13,81.74,73.13,77.32,78.37,80.04,68.03
+Tier 2: Moderate,Llama3.1-70B-Instruct,75.76,0.71,607.44,25.35,4.33,1.76,63.87,77.92,72.94,78.67,79.56,76.83,47.71
+Tier 2: Moderate,Llama3.1-405B-Instruct,75.05,0.7,599.3,25.24,4.74,1.41,67.96,78.09,68.51,76.16,77.31,76.25,65.43
+Tier 2: Moderate,Claude-3.5-Sonnet,79.92,0.54,414.32,22.15,3.32,1.09,75.88,83.52,77.39,79.31,81.06,78.81,72.47
+Tier 2: Moderate,CommandR+,80.71,0.52,483.32,24.1,3.17,1.09,73.49,85.46,75.6,82.97,82.12,81.61,58.49
+Tier 2: Moderate,Mistral-Large-2,79.97,0.52,528.44,22.65,3.21,1.02,77.21,81.23,75.2,81.24,80.86,82.03,63.63
+Tier 3: Easy,GPT4-o,91.63,0.26,640.84,29.29,2.01,0.53,94.31,93.62,82.98,89.19,91.86,94.12
+Tier 3: Easy,Gemini1.5-Pro,89.86,0.31,551.81,25.6,1.88,0.71,92.61,90.34,83.32,87.39,90.93,95.23
+Tier 3: Easy,Llama3.1-70B-Instruct,89.3,0.33,607.75,31.38,2.08,0.83,75.5,91.75,83.61,87.11,93.03,93.08
+Tier 3: Easy,Llama3.1-405B-Instruct,86.57,0.4,599.87,30.12,2.88,0.85,79.58,88.92,75.23,85.11,89.2,90.21,100.0
+Tier 3: Easy,Claude-3.5-Sonnet,89.61,0.3,411.2,26.72,1.49,0.81,89.85,92.45,75.13,86.48,91.46,91.97,100.0
+Tier 3: Easy,CommandR+,91.65,0.25,499.06,27.95,1.57,0.54,87.71,91.8,87.16,89.79,94.12,93.85,100.0
+Tier 3: Easy,Mistral-Large-2,92.0,0.25,523.57,27.8,1.8,0.55,92.96,92.33,90.58,89.41,92.81,92.41,100.0