Spaces:
Running
Running
Commit
·
0aa9325
1
Parent(s):
05b28bf
updated app according to edit suggestions
Browse files- app.py +89 -131
- tiered_models_data.csv +21 -21
app.py
CHANGED
|
@@ -37,13 +37,11 @@ st.markdown(
|
|
| 37 |
}
|
| 38 |
|
| 39 |
.container {
|
| 40 |
-
max-width: 1000px;
|
| 41 |
-
margin: 0 auto;
|
| 42 |
padding: 20px;
|
| 43 |
}
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
table {
|
| 48 |
width: 100%;
|
| 49 |
border-collapse: collapse;
|
|
@@ -81,86 +79,31 @@ st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</di
|
|
| 81 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 82 |
|
| 83 |
# Load the data
|
| 84 |
-
# data_path = "factbench_data.csv"
|
| 85 |
data_path = "tiered_models_data.csv"
|
| 86 |
df = pd.read_csv(data_path)
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
# Create tabs
|
| 89 |
tab1, tab2, tab3 = st.tabs(
|
| 90 |
["Leaderboard", "Benchmark Details", "Submit your models"])
|
| 91 |
|
| 92 |
# Tab 1: Leaderboard
|
| 93 |
-
# with tab1:
|
| 94 |
-
# st.markdown('<div class="title">Leaderboard</div>',
|
| 95 |
-
# unsafe_allow_html=True)
|
| 96 |
-
# st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
| 97 |
-
|
| 98 |
-
# # Dropdown menu to filter tiers
|
| 99 |
-
# tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
|
| 100 |
-
# selected_tier = st.selectbox('Select Tier:', tiers)
|
| 101 |
-
|
| 102 |
-
# # Filter the data based on the selected tier
|
| 103 |
-
# if selected_tier != 'All Tiers':
|
| 104 |
-
# filtered_df = df[df['Tier'] == selected_tier]
|
| 105 |
-
# else:
|
| 106 |
-
# filtered_df = df
|
| 107 |
-
|
| 108 |
-
# # Create HTML for the table
|
| 109 |
-
# html = '''
|
| 110 |
-
# <table>
|
| 111 |
-
# <thead>
|
| 112 |
-
# <tr>
|
| 113 |
-
# <th>Tier</th>
|
| 114 |
-
# <th>Model</th>
|
| 115 |
-
# <th>FactScore</th>
|
| 116 |
-
# <th>SAFE</th>
|
| 117 |
-
# <th>Factcheck-GPT</th>
|
| 118 |
-
# <th>VERIFY</th>
|
| 119 |
-
# </tr>
|
| 120 |
-
# </thead>
|
| 121 |
-
# <tbody>
|
| 122 |
-
# '''
|
| 123 |
-
|
| 124 |
-
# # Generate the rows of the table
|
| 125 |
-
# current_tier = None
|
| 126 |
-
# for i, row in filtered_df.iterrows():
|
| 127 |
-
# if row['Tier'] != current_tier:
|
| 128 |
-
# if current_tier is not None:
|
| 129 |
-
# # Close the previous tier row
|
| 130 |
-
# html += ' </tr>'
|
| 131 |
-
# current_tier = row['Tier']
|
| 132 |
-
# html += f' <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
|
| 133 |
-
# else:
|
| 134 |
-
# html += ' <tr>'
|
| 135 |
-
|
| 136 |
-
# # Fill in model and scores
|
| 137 |
-
# html += f'''
|
| 138 |
-
# <td>{row['Model']}</td>
|
| 139 |
-
# <td>{row['FactScore']:.2f}</td>
|
| 140 |
-
# <td>{row['SAFE']:.2f}</td>
|
| 141 |
-
# <td>{row['Factcheck-GPT']:.2f}</td>
|
| 142 |
-
# <td>{row['VERIFY']:.2f}</td>
|
| 143 |
-
# </tr>
|
| 144 |
-
# '''
|
| 145 |
-
|
| 146 |
-
# # Close the last row and table tags
|
| 147 |
-
# html += '''
|
| 148 |
-
# </table>
|
| 149 |
-
# '''
|
| 150 |
-
|
| 151 |
-
# # Display the table
|
| 152 |
-
# st.markdown(html, unsafe_allow_html=True)
|
| 153 |
-
|
| 154 |
-
# st.markdown('</div>', unsafe_allow_html=True)
|
| 155 |
-
df['rank'] = df['factuality_score'].rank(
|
| 156 |
-
ascending=False, method='min').astype(int)
|
| 157 |
-
|
| 158 |
with tab1:
|
|
|
|
|
|
|
| 159 |
st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
|
| 160 |
st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
| 161 |
|
| 162 |
# Dropdown menu to filter tiers
|
| 163 |
-
tiers = ['All Tiers', 'Tier 1:
|
| 164 |
selected_tier = st.selectbox('Select Tier:', tiers)
|
| 165 |
|
| 166 |
# Filter the data based on the selected tier
|
|
@@ -168,84 +111,98 @@ with tab1:
|
|
| 168 |
filtered_df = df[df['tier'] == selected_tier]
|
| 169 |
else:
|
| 170 |
filtered_df = df
|
| 171 |
-
|
| 172 |
-
# sort_order = st.radio('Sort by Factuality Score:',
|
| 173 |
-
# ('Ascending', 'Descending'))
|
| 174 |
-
|
| 175 |
-
# # Sort the dataframe based on Factuality Score
|
| 176 |
-
# if sort_order == 'Ascending':
|
| 177 |
-
# filtered_df = filtered_df.sort_values(
|
| 178 |
-
# by='factuality_score', ascending=True)
|
| 179 |
-
# else:
|
| 180 |
-
# filtered_df = filtered_df.sort_values(
|
| 181 |
-
# by='factuality_score', ascending=False)
|
| 182 |
-
# Option to sort by Factuality Score in ascending order
|
| 183 |
sort_by_factuality = st.checkbox('Sort by Factuality Score')
|
| 184 |
|
| 185 |
# Sort the dataframe based on Factuality Score if the checkbox is selected
|
| 186 |
if sort_by_factuality:
|
| 187 |
updated_filtered_df = filtered_df.sort_values(
|
| 188 |
-
by='factuality_score', ascending=False
|
|
|
|
| 189 |
else:
|
| 190 |
-
updated_filtered_df = filtered_df
|
|
|
|
|
|
|
| 191 |
|
| 192 |
# Create HTML for the table
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
<
|
| 196 |
-
<
|
| 197 |
-
<
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
# Generate the rows of the table
|
| 219 |
current_tier = None
|
| 220 |
for i, row in updated_filtered_df.iterrows():
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
#
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
html += ' <tr>'
|
| 231 |
# Fill in model and scores
|
| 232 |
html += f'''
|
| 233 |
<td>{row['rank']}</td>
|
| 234 |
-
<td>{row['tier']}</td>
|
| 235 |
<td>{row['model']}</td>
|
| 236 |
-
<td>{row['factuality_score']
|
| 237 |
-
<td>{row['
|
| 238 |
-
<td>{row['
|
| 239 |
-
<td>{row['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
<td>{row['avg_undecidable_units']:.2f}</td>
|
| 241 |
<td>{row['avg_unsupported_units']:.2f}</td>
|
| 242 |
-
<td>{row['prompt_categories.Factual Recall']:.2f}</td>
|
| 243 |
-
<td>{row['prompt_categories.Conceptual Understanding']:.2f}</td>
|
| 244 |
-
<td>{row['prompt_categories.Procedural Execution']:.2f}</td>
|
| 245 |
-
<td>{row['prompt_categories.Comparative Analysis']:.2f}</td>
|
| 246 |
-
<td>{row['prompt_categories.Recommendations and Insights']:.2f}</td>
|
| 247 |
-
<td>{row['prompt_categories.Domain-Specific Knowledge']:.2f}</td>
|
| 248 |
-
<td>{row['prompt_categories.Temporal Context']:.2f}</td>
|
| 249 |
</tr>
|
| 250 |
'''
|
| 251 |
|
|
@@ -258,6 +215,7 @@ with tab1:
|
|
| 258 |
st.markdown(html, unsafe_allow_html=True)
|
| 259 |
|
| 260 |
st.markdown('</div>', unsafe_allow_html=True)
|
|
|
|
| 261 |
# Tab 2: Details
|
| 262 |
with tab2:
|
| 263 |
st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
|
|
|
| 37 |
}
|
| 38 |
|
| 39 |
.container {
|
| 40 |
+
max-width: 1000px;
|
| 41 |
+
margin: 0 auto;
|
| 42 |
padding: 20px;
|
| 43 |
}
|
| 44 |
|
|
|
|
|
|
|
| 45 |
table {
|
| 46 |
width: 100%;
|
| 47 |
border-collapse: collapse;
|
|
|
|
| 79 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 80 |
|
| 81 |
# Load the data
|
|
|
|
| 82 |
data_path = "tiered_models_data.csv"
|
| 83 |
df = pd.read_csv(data_path)
|
| 84 |
|
| 85 |
+
# Assign ranks within each tier based on factuality_score
|
| 86 |
+
df['rank'] = df.groupby('tier')['factuality_score'].rank(
|
| 87 |
+
ascending=False, method='min').astype(int)
|
| 88 |
+
|
| 89 |
+
# Replace NaN values with '-'
|
| 90 |
+
df.fillna('-', inplace=True)
|
| 91 |
+
|
| 92 |
+
df['original_order'] = df.groupby('tier').cumcount()
|
| 93 |
+
|
| 94 |
# Create tabs
|
| 95 |
tab1, tab2, tab3 = st.tabs(
|
| 96 |
["Leaderboard", "Benchmark Details", "Submit your models"])
|
| 97 |
|
| 98 |
# Tab 1: Leaderboard
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
with tab1:
|
| 100 |
+
# df['original_order'] = df.groupby('tier').cumcount()
|
| 101 |
+
# print(df['original_order'])
|
| 102 |
st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
|
| 103 |
st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
| 104 |
|
| 105 |
# Dropdown menu to filter tiers
|
| 106 |
+
tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
|
| 107 |
selected_tier = st.selectbox('Select Tier:', tiers)
|
| 108 |
|
| 109 |
# Filter the data based on the selected tier
|
|
|
|
| 111 |
filtered_df = df[df['tier'] == selected_tier]
|
| 112 |
else:
|
| 113 |
filtered_df = df
|
| 114 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
sort_by_factuality = st.checkbox('Sort by Factuality Score')
|
| 116 |
|
| 117 |
# Sort the dataframe based on Factuality Score if the checkbox is selected
|
| 118 |
if sort_by_factuality:
|
| 119 |
updated_filtered_df = filtered_df.sort_values(
|
| 120 |
+
by=['tier', 'factuality_score'], ascending=[True, False]
|
| 121 |
+
)
|
| 122 |
else:
|
| 123 |
+
updated_filtered_df = filtered_df.sort_values(
|
| 124 |
+
by=['tier', 'original_order']
|
| 125 |
+
)
|
| 126 |
|
| 127 |
# Create HTML for the table
|
| 128 |
+
if selected_tier == 'All Tiers':
|
| 129 |
+
html = '''
|
| 130 |
+
<table>
|
| 131 |
+
<thead>
|
| 132 |
+
<tr>
|
| 133 |
+
<th>Tier</th>
|
| 134 |
+
<th>Rank</th>
|
| 135 |
+
<th>Model</th>
|
| 136 |
+
<th>Factuality Score</th>
|
| 137 |
+
<th>Factual Recall</th>
|
| 138 |
+
<th>Conceptual Understanding</th>
|
| 139 |
+
<th>Procedural Execution</th>
|
| 140 |
+
<th>Comparative Analysis</th>
|
| 141 |
+
<th>Recommendations and Insights</th>
|
| 142 |
+
<th>Domain-Specific Knowledge</th>
|
| 143 |
+
<th>Temporal Context</th>
|
| 144 |
+
<th>Hallucination Score</th>
|
| 145 |
+
<th># Tokens</th>
|
| 146 |
+
<th># Factual</th>
|
| 147 |
+
<th># Undecidable</th>
|
| 148 |
+
<th># Unsupported</th>
|
| 149 |
+
</tr>
|
| 150 |
+
</thead>
|
| 151 |
+
<tbody>
|
| 152 |
+
'''
|
| 153 |
+
else:
|
| 154 |
+
html = '''
|
| 155 |
+
<table>
|
| 156 |
+
<thead>
|
| 157 |
+
<tr>
|
| 158 |
+
<th>Rank</th>
|
| 159 |
+
<th>Model</th>
|
| 160 |
+
<th>Factuality Score</th>
|
| 161 |
+
<th>Factual Recall</th>
|
| 162 |
+
<th>Conceptual Understanding</th>
|
| 163 |
+
<th>Procedural Execution</th>
|
| 164 |
+
<th>Comparative Analysis</th>
|
| 165 |
+
<th>Recommendations and Insights</th>
|
| 166 |
+
<th>Domain-Specific Knowledge</th>
|
| 167 |
+
<th>Temporal Context</th>
|
| 168 |
+
<th>Hallucination Score</th>
|
| 169 |
+
<th># Tokens</th>
|
| 170 |
+
<th># Factual</th>
|
| 171 |
+
<th># Undecidable</th>
|
| 172 |
+
<th># Unsupported</th>
|
| 173 |
+
</tr>
|
| 174 |
+
</thead>
|
| 175 |
+
<tbody>
|
| 176 |
+
'''
|
| 177 |
|
| 178 |
# Generate the rows of the table
|
| 179 |
current_tier = None
|
| 180 |
for i, row in updated_filtered_df.iterrows():
|
| 181 |
+
html += '<tr>'
|
| 182 |
+
|
| 183 |
+
# Only display the 'Tier' column if 'All Tiers' is selected
|
| 184 |
+
if selected_tier == 'All Tiers':
|
| 185 |
+
if row['tier'] != current_tier:
|
| 186 |
+
current_tier = row['tier']
|
| 187 |
+
html += f'<td rowspan="7" style="vertical-align: middle;">{current_tier}</td>'
|
| 188 |
+
|
|
|
|
|
|
|
| 189 |
# Fill in model and scores
|
| 190 |
html += f'''
|
| 191 |
<td>{row['rank']}</td>
|
|
|
|
| 192 |
<td>{row['model']}</td>
|
| 193 |
+
<td>{row['factuality_score']}</td>
|
| 194 |
+
<td>{row['prompt_categories.Factual Recall']}</td>
|
| 195 |
+
<td>{row['prompt_categories.Conceptual Understanding']}</td>
|
| 196 |
+
<td>{row['prompt_categories.Procedural Execution']}</td>
|
| 197 |
+
<td>{row['prompt_categories.Comparative Analysis']}</td>
|
| 198 |
+
<td>{row['prompt_categories.Recommendations and Insights']}</td>
|
| 199 |
+
<td>{row['prompt_categories.Domain-Specific Knowledge']}</td>
|
| 200 |
+
<td>{row['prompt_categories.Temporal Context']}</td>
|
| 201 |
+
<td>{row['hallucination_score']}</td>
|
| 202 |
+
<td>{row['avg_tokens']}</td>
|
| 203 |
+
<td>{row['avg_factual_units']}</td>
|
| 204 |
<td>{row['avg_undecidable_units']:.2f}</td>
|
| 205 |
<td>{row['avg_unsupported_units']:.2f}</td>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
</tr>
|
| 207 |
'''
|
| 208 |
|
|
|
|
| 215 |
st.markdown(html, unsafe_allow_html=True)
|
| 216 |
|
| 217 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 218 |
+
|
| 219 |
# Tab 2: Details
|
| 220 |
with tab2:
|
| 221 |
st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
tiered_models_data.csv
CHANGED
|
@@ -1,23 +1,23 @@
|
|
| 1 |
tier,model,factuality_score,hallucination_score,avg_tokens,avg_factual_units,avg_undecidable_units,avg_unsupported_units,prompt_categories.Factual Recall,prompt_categories.Conceptual Understanding,prompt_categories.Procedural Execution,prompt_categories.Comparative Analysis,prompt_categories.Recommendations and Insights,prompt_categories.Domain-Specific Knowledge,prompt_categories.Temporal Context
|
| 2 |
-
Tier 1:
|
| 3 |
-
Tier 1:
|
| 4 |
-
Tier 1:
|
| 5 |
-
Tier 1:
|
| 6 |
-
Tier 1:
|
| 7 |
-
Tier 1:
|
| 8 |
-
Tier 1:
|
| 9 |
-
Tier 2: Moderate,
|
| 10 |
-
Tier 2: Moderate,
|
| 11 |
-
Tier 2: Moderate,
|
| 12 |
-
Tier 2: Moderate,
|
| 13 |
-
Tier 2: Moderate,
|
| 14 |
-
Tier 2: Moderate,
|
| 15 |
-
Tier 2: Moderate,
|
| 16 |
-
Tier 3:
|
| 17 |
-
Tier 3:
|
| 18 |
-
Tier 3:
|
| 19 |
-
Tier 3:
|
| 20 |
-
Tier 3:
|
| 21 |
-
Tier 3:
|
| 22 |
-
Tier 3:
|
| 23 |
|
|
|
|
| 1 |
tier,model,factuality_score,hallucination_score,avg_tokens,avg_factual_units,avg_undecidable_units,avg_unsupported_units,prompt_categories.Factual Recall,prompt_categories.Conceptual Understanding,prompt_categories.Procedural Execution,prompt_categories.Comparative Analysis,prompt_categories.Recommendations and Insights,prompt_categories.Domain-Specific Knowledge,prompt_categories.Temporal Context
|
| 2 |
+
Tier 1: Hard,GPT4-o,75.69,0.64,561.72,23.91,4.61,1.01,76.49,78.49,66.14,76.13,76.3,75.91,69.52
|
| 3 |
+
Tier 1: Hard,Gemini1.5-Pro,73.81,0.68,516.41,22.23,4.47,1.12,73.35,79.39,66.7,72.44,73.64,74.31,71.42
|
| 4 |
+
Tier 1: Hard,Llama3.1-70B-Instruct,70.01,0.89,531.35,27.09,5.67,2.13,68.99,75.38,64.73,70.34,70.03,70.64,56.61
|
| 5 |
+
Tier 1: Hard,Llama3.1-405B-Instruct,68.64,0.93,550.74,26.6,6.15,2.19,66.07,74.67,65.88,70.18,68.29,70.91,49.97
|
| 6 |
+
Tier 1: Hard,Claude-3.5-Sonnet,74.95,0.65,395.77,22.64,4.03,1.19,74.84,77.74,69.55,74.87,75.3,76.4,64.19
|
| 7 |
+
Tier 1: Hard,CommandR+,73.15,0.71,440.93,23.55,4.51,1.4,69.41,80.24,68.98,74.36,73.53,73.02,66.43
|
| 8 |
+
Tier 1: Hard,Mistral-Large-2,75.19,0.67,485.58,23.21,4.09,1.36,75.87,78.32,63.98,77.17,75.5,76.38,65.8
|
| 9 |
+
Tier 2: Moderate,GPT4-o,80.72,0.5,624.67,24.42,3.59,0.89,80.06,84.33,72.83,79.75,81.5,81.1,70.02
|
| 10 |
+
Tier 2: Moderate,Gemini1.5-Pro,78.02,0.57,565.97,22.16,3.71,0.97,74.13,81.74,73.13,77.32,78.37,80.04,68.03
|
| 11 |
+
Tier 2: Moderate,Llama3.1-70B-Instruct,75.76,0.71,607.44,25.35,4.33,1.76,63.87,77.92,72.94,78.67,79.56,76.83,47.71
|
| 12 |
+
Tier 2: Moderate,Llama3.1-405B-Instruct,75.05,0.7,599.3,25.24,4.74,1.41,67.96,78.09,68.51,76.16,77.31,76.25,65.43
|
| 13 |
+
Tier 2: Moderate,Claude-3.5-Sonnet,79.92,0.54,414.32,22.15,3.32,1.09,75.88,83.52,77.39,79.31,81.06,78.81,72.47
|
| 14 |
+
Tier 2: Moderate,CommandR+,80.71,0.52,483.32,24.1,3.17,1.09,73.49,85.46,75.6,82.97,82.12,81.61,58.49
|
| 15 |
+
Tier 2: Moderate,Mistral-Large-2,79.97,0.52,528.44,22.65,3.21,1.02,77.21,81.23,75.2,81.24,80.86,82.03,63.63
|
| 16 |
+
Tier 3: Easy,GPT4-o,91.63,0.26,640.84,29.29,2.01,0.53,94.31,93.62,82.98,89.19,91.86,94.12
|
| 17 |
+
Tier 3: Easy,Gemini1.5-Pro,89.86,0.31,551.81,25.6,1.88,0.71,92.61,90.34,83.32,87.39,90.93,95.23
|
| 18 |
+
Tier 3: Easy,Llama3.1-70B-Instruct,89.3,0.33,607.75,31.38,2.08,0.83,75.5,91.75,83.61,87.11,93.03,93.08
|
| 19 |
+
Tier 3: Easy,Llama3.1-405B-Instruct,86.57,0.4,599.87,30.12,2.88,0.85,79.58,88.92,75.23,85.11,89.2,90.21,100.0
|
| 20 |
+
Tier 3: Easy,Claude-3.5-Sonnet,89.61,0.3,411.2,26.72,1.49,0.81,89.85,92.45,75.13,86.48,91.46,91.97,100.0
|
| 21 |
+
Tier 3: Easy,CommandR+,91.65,0.25,499.06,27.95,1.57,0.54,87.71,91.8,87.16,89.79,94.12,93.85,100.0
|
| 22 |
+
Tier 3: Easy,Mistral-Large-2,92.0,0.25,523.57,27.8,1.8,0.55,92.96,92.33,90.58,89.41,92.81,92.41,100.0
|
| 23 |
|