Commit e4f522a · Parent(s): 61cd432
Optimization of the leaderboard

Files changed:
- app.py: +74 -42
- data/summary_data.csv: +6 -6
app.py CHANGED

@@ -42,15 +42,15 @@ df_avg_latency = load_data(AVG_LATENCY_FILE)
 df_p99_latency = load_data(P99_LATENCY_FILE)
 print("Data loading complete.")
 
-# --- *** NEW: Convert Costs to Cents *** ---
+# --- *** NEW: Convert Costs to USD Cents *** ---
 COST_COLUMN_SUMMARY = 'Costs (USD)' # IMPORTANT: Check this matches your summary_data.csv header EXACTLY
-NEW_COST_COLUMN_SUMMARY = 'Avg Cost (Cents)' # This is the new name we'll use
+NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)' # This is the new name we'll use
 
 # Convert summary cost
 if not df_summary.empty and COST_COLUMN_SUMMARY in df_summary.columns:
-    df_summary[COST_COLUMN_SUMMARY] = pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100
+    df_summary[COST_COLUMN_SUMMARY] = (pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100).round(3) # <-- ADDED .round(3)
     df_summary.rename(columns={COST_COLUMN_SUMMARY: NEW_COST_COLUMN_SUMMARY}, inplace=True)
-    print(f"Converted '{COST_COLUMN_SUMMARY}' to Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
+    print(f"Converted '{COST_COLUMN_SUMMARY}' to $ Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
 else:
     print(f"Warning: Column '{COST_COLUMN_SUMMARY}' not found in df_summary for conversion.")
 
@@ -61,8 +61,8 @@ if not df_cost.empty:
     cost_cols = [col for col in df_cost.columns if col != model_col_name]
     for col in cost_cols:
         # Handle potential non-numeric data gracefully before multiplying
-        df_cost[col] = pd.to_numeric(df_cost[col], errors='coerce') * 100
-    print("Converted cost breakdown columns to Cents in df_cost.")
+        df_cost[col] = (pd.to_numeric(df_cost[col], errors='coerce') * 100).round(3) # <-- ADDED .round(3)
+    print("Converted cost breakdown columns to $ Cents in df_cost.")
 # --- *** End of Cost Conversion *** ---
 
 # Rename columns for clarity if needed (example for summary)
@@ -75,22 +75,34 @@ try:
         # 'Avg Answer Duration (sec)': 'Avg Latency (s)',
         # 'P99 Answer Duration (sec)': 'P99 Latency (s)'
     })
-    # Select and reorder columns for the main table
-    summary_cols_display = ['Model', '
+    # Select and reorder columns for the main table - REMOVED BENCHMARK COLUMNS
+    summary_cols_display = ['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)']
     # Filter to only columns that actually exist after loading and renaming
     summary_cols_display = [col for col in summary_cols_display if col in df_summary.columns]
-    df_summary_display = df_summary[summary_cols_display]
+    df_summary_display = df_summary[summary_cols_display].copy() # Use .copy() to avoid SettingWithCopyWarning
 
-    #
-
-
-
+    # Select columns for the new benchmark comparison table
+    benchmark_cols = ['Model', 'AutoBench', 'Chatbot Ar.', 'AAI Index', 'MMLU Index']
+    benchmark_cols = [col for col in benchmark_cols if col in df_summary.columns] # Filter existing
+    df_benchmark_display = df_summary[benchmark_cols].copy() # Use .copy()
+
+    # Ensure AutoBench score is numeric for sorting BOTH display tables
+    if 'AutoBench' in df_summary_display.columns:
+        df_summary_display['AutoBench'] = pd.to_numeric(df_summary_display['AutoBench'], errors='coerce')
+        df_summary_display.sort_values(by='AutoBench', ascending=False, inplace=True) # Use inplace=True
+    else:
+        print("Warning: 'AutoBench' column not found for sorting summary table.")
+
+    if 'AutoBench' in df_benchmark_display.columns:
+        df_benchmark_display['AutoBench'] = pd.to_numeric(df_benchmark_display['AutoBench'], errors='coerce')
+        df_benchmark_display.sort_values(by='AutoBench', ascending=False, inplace=True) # Use inplace=True
     else:
-        print("Warning: '
+        print("Warning: 'AutoBench' column not found for sorting benchmark table.")
 
 except KeyError as e:
-    print(f"Error preparing
-    df_summary_display = df_summary # Fallback
+    print(f"Error preparing display columns: Missing key {e}. Check CSV headers and rename mapping.")
+    df_summary_display = df_summary.copy() # Fallback
+    df_benchmark_display = pd.DataFrame() # Fallback to empty for benchmark table
 
 
 # --- Build Gradio App ---
@@ -98,35 +110,55 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# AutoBench LLM Leaderboard")
     gr.Markdown(
         "Interactive leaderboard for AutoBench, where LLMs rank LLMs' responses. "
-        "Includes performance, cost, and latency metrics
+        "Includes performance, cost, and latency metrics.\n"
         "More info: [AutoBench Blog Post](https://huggingface.co/blog/PeterKruger/autobench)"
     )
 
     # --- Tab 1: Overall Ranking ---
     with gr.Tab("Overall Ranking"):
        gr.Markdown("## Overall Model Performance")
-
+        # REMOVED benchmark correlations from Markdown
+        gr.Markdown("Models ranked by AutoBench score. Lower cost ($ Cents) and latency (s) are better.")
        # Check if df_summary_display has data before rendering
        if not df_summary_display.empty:
+            # Create a copy specifically for this tab's display and rename the column
+            df_overall_rank_display = df_summary_display.copy()
+            if 'AutoBench' in df_overall_rank_display.columns:
+                df_overall_rank_display.rename(columns={'AutoBench': 'Rank'}, inplace=True)
+
            gr.DataFrame(
-
-
+                df_overall_rank_display, # Pass the renamed DF
+                # Adjust datatype length based on potentially fewer columns
+                datatype=['str'] + ['number'] * (len(df_overall_rank_display.columns) - 1),
                interactive=True, # Allows sorting
                # height=600 # Adjust height as needed
            )
        else:
            gr.Markdown("_(Summary data failed to load or is empty. Please check `summary_data.csv`)_")
 
-    # --- Tab
+    # --- NEW Tab 1.5: Benchmark Comparison ---
+    with gr.Tab("Benchmark Comparison"):
+        gr.Markdown("## Benchmark Comparison")
+        gr.Markdown("Comparison of AutoBench scores with other popular benchmarks (Chatbot Arena, Artificial Analysis Index, MMLU Index). Models sorted by AutoBench score.")
+        if not df_benchmark_display.empty:
+            gr.DataFrame(
+                df_benchmark_display,
+                datatype=['str'] + ['number'] * (len(df_benchmark_display.columns) - 1),
+                interactive=True # Allow sorting
+            )
+        else:
+            gr.Markdown("_(Benchmark comparison data could not be prepared. Check `summary_data.csv` for 'Chatbot Ar.', 'AAI Index', 'MMLU Index' columns.)_")
+
+    # --- Tab 2: Performance Plots ---
     with gr.Tab("Performance Plots"):
         gr.Markdown("## Performance Visualizations")
         gr.Markdown("Exploring relationships between AutoBench Rank, Latency, and Cost.")
 
         # Scatter Plot 1 (using summary data)
         gr.Markdown("### Rank vs. Average Cost")
-        if not df_summary.empty and '
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and NEW_COST_COLUMN_SUMMARY in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df = df_summary.dropna(subset=['
+            plot_df = df_summary.dropna(subset=['AutoBench', NEW_COST_COLUMN_SUMMARY, 'Model']).copy()
             plot_df[NEW_COST_COLUMN_SUMMARY] = pd.to_numeric(plot_df[NEW_COST_COLUMN_SUMMARY], errors='coerce')
             plot_df = plot_df.dropna(subset=[NEW_COST_COLUMN_SUMMARY]) # Drop if cost conversion failed
 
@@ -134,12 +166,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
                 fig_cost = px.scatter(
                     plot_df,
                     x=NEW_COST_COLUMN_SUMMARY,
-                    y="
+                    y="AutoBench",
                     text="Model", # Show model name near point
                     log_x=True, # Use log scale for cost
-                    title="AutoBench Rank vs. Average Cost per Response (
-                    labels={'
-                    hover_data=['Model', '
+                    title="AutoBench Rank vs. Average Cost per Response ($ Cents - Log Scale)",
+                    labels={'AutoBench': 'AutoBench Rank', NEW_COST_COLUMN_SUMMARY: 'Avg Cost ($ Cents) - Log Scale'},
+                    hover_data=['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)'] # Show details on hover
                 )
                 fig_cost.update_traces(textposition='top center')
                 fig_cost.update_layout(
@@ -160,15 +192,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
                 )
                 gr.Plot(fig_cost)
             else:
-                gr.Markdown("_(Insufficient valid data for Rank vs Cost plot. Check '
+                gr.Markdown("_(Insufficient valid data for Rank vs Cost plot. Check 'AutoBench' and NEW_COST_COLUMN_SUMMARY columns in `summary_data.csv`)_")
         else:
             gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Cost plot)_")
 
         # Plot 2: Rank vs Average Latency
         gr.Markdown("### Rank vs. Average Latency")
-        if not df_summary.empty and '
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'Avg Answer Duration (sec)' in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df_avg_latency = df_summary.dropna(subset=['
+            plot_df_avg_latency = df_summary.dropna(subset=['AutoBench', 'Avg Answer Duration (sec)', 'Model']).copy()
             plot_df_avg_latency['Avg Answer Duration (sec)'] = pd.to_numeric(plot_df_avg_latency['Avg Answer Duration (sec)'], errors='coerce')
             plot_df_avg_latency = plot_df_avg_latency.dropna(subset=['Avg Answer Duration (sec)']) # Drop if conversion failed
 
@@ -176,27 +208,27 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
                 fig_avg_latency = px.scatter(
                     plot_df_avg_latency,
                     x="Avg Answer Duration (sec)",
-                    y="
+                    y="AutoBench",
                     text="Model",
                     log_x=True, # Use log scale for latency - adjust if not desired
                     title="AutoBench Rank vs. Average Latency (Log Scale)",
-                    labels={'
-                    hover_data=['Model', '
+                    labels={'AutoBench': 'AutoBench Rank', 'Avg Answer Duration (sec)': 'Avg Latency (s) - Log Scale'},
+                    hover_data=['Model', 'AutoBench', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
                 )
                 fig_avg_latency.update_traces(textposition='top center')
                 fig_avg_latency.update_layout(xaxis_title="Avg Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
                 gr.Plot(fig_avg_latency)
             else:
-                gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check '
+                gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check 'AutoBench' and 'Avg Answer Duration (sec)' columns in `summary_data.csv`)_")
         else:
             gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Avg Latency plot)_")
 
 
         # Plot 3: Rank vs P99 Latency
         gr.Markdown("### Rank vs. P99 Latency")
-        if not df_summary.empty and '
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'P99 Answer Duration (sec)' in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df_p99_latency = df_summary.dropna(subset=['
+            plot_df_p99_latency = df_summary.dropna(subset=['AutoBench', 'P99 Answer Duration (sec)', 'Model']).copy()
             plot_df_p99_latency['P99 Answer Duration (sec)'] = pd.to_numeric(plot_df_p99_latency['P99 Answer Duration (sec)'], errors='coerce')
             plot_df_p99_latency = plot_df_p99_latency.dropna(subset=['P99 Answer Duration (sec)']) # Drop if conversion failed
 
@@ -204,18 +236,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
                 fig_p99_latency = px.scatter(
                     plot_df_p99_latency,
                     x="P99 Answer Duration (sec)",
-                    y="
+                    y="AutoBench",
                     text="Model",
                     log_x=True, # Use log scale for latency - adjust if not desired
                     title="AutoBench Rank vs. P99 Latency (Log Scale)",
-                    labels={'
-                    hover_data=['Model', '
+                    labels={'AutoBench': 'AutoBench Rank', 'P99 Answer Duration (sec)': 'P99 Latency (s) - Log Scale'},
+                    hover_data=['Model', 'AutoBench', 'P99 Answer Duration (sec)', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
                 )
                 fig_p99_latency.update_traces(textposition='top center')
                 fig_p99_latency.update_layout(xaxis_title="P99 Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
                 gr.Plot(fig_p99_latency)
             else:
-                gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check '
+                gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check 'AutoBench' and 'P99 Answer Duration (sec)' columns in `summary_data.csv`)_")
         else:
             gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs P99 Latency plot)_")
 
@@ -224,7 +256,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
         gr.Markdown("## Performance vs. Cost/Latency Trade-offs")
 
         # Cost Breakdown Table
-        gr.Markdown("### Cost Breakdown per Domain (
+        gr.Markdown("### Cost Breakdown per Domain ($ Cents/Response)") # <-- MODIFIED
         if not df_cost.empty:
             # Make model name the first column if it exists
             if 'model_name' in df_cost.columns:
@@ -293,7 +325,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
 * **Avg Cost (USD Cents/response):** Estimated average cost to generate one response based on model provider pricing (input+output tokens). Lower is better.
 * **Avg Latency (s):** Average time taken by the model to generate a response. Lower is better.
 * **P99 Latency (s):** The 99th percentile of response time, indicating worst-case latency. Lower is better.
-* **
+* **Chatbot Arena / Artificial Analysis Intelligence Index / MMLU:** Scores from other well-known benchmarks for comparison (where available).
 
 ### Data
 This leaderboard reflects a run completed on April 23, 2025. Models included recently released models such as o4-mini, Gpt-4.1-mini, Gemini 2.5 Pro Preview, Claude 3.7 Sonnet:thikning, etc..
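For reference, a minimal standalone sketch of the pandas pattern this commit applies (USD-to-cents conversion with rounding, column rename, and sorting by AutoBench score). The toy DataFrame and model names below are illustrative only; they are not the Space's actual load_data pipeline.

import pandas as pd

# Toy frame standing in for df_summary (illustrative values only)
df = pd.DataFrame({
    'Model': ['model-a', 'model-b'],              # hypothetical model names
    'AutoBench': ['4.2', '3.9'],                  # scores may load as strings
    'Costs (USD)': [0.01133934, 0.00182703],      # per-response cost in USD
})

# USD -> USD cents, rounded to 3 decimals, then rename the column (as in the commit)
df['Costs (USD)'] = (pd.to_numeric(df['Costs (USD)'], errors='coerce') * 100).round(3)
df = df.rename(columns={'Costs (USD)': 'Avg Cost ($ Cents)'})

# Coerce the score to numeric and sort the display table, best first
df['AutoBench'] = pd.to_numeric(df['AutoBench'], errors='coerce')
df = df.sort_values(by='AutoBench', ascending=False)

print(df)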
data/summary_data.csv CHANGED

@@ -1,7 +1,7 @@
-Model,
-claude-3.5-haiku-20241022,3.99,1237,34740,0.634,0.00182703,10.
+Model,AutoBench,Chatbot Ar.,AAI Index,MMLU Index,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec)
+claude-3.5-haiku-20241022,3.99,1237,34740,0.634,0.00182703,10.8,17.98
 claude-3.7-sonnet,4.2,1293,48150,0.803,0.01133934,15.53,32.86
-claude-3.7-sonnet:thinking,4.39,1303,57390,0.837,0.0431979,45.
+claude-3.7-sonnet:thinking,4.39,1303,57390,0.837,0.0431979,45.8,82.6
 deepSeek-R1,4.26,1358,60220,0.844,0.00515901,84.77,223.47
 deepSeek-V3,4.09,1318,45580,0.752,0.00094273,34.57,106.53
 deepSeek-V3-0324,4.16,1372,53240,0.819,0.00102168,42.28,140.54
@@ -13,7 +13,7 @@ gpt-4o-mini,4,1272,35680,0.648,0.00038653,12.17,21.75
 grok-2-1212,4.1,1288,39230,0.709,0.00847157,11.74,23.32
 grok-3-beta,4.34,1402,50630,0.799,0.01694996,33.94,69.79
 llama-3.1-Nemotron-70B-Instruct-HF,4.18,1269,37280,,0.00038647,25.04,48.74
-llama-3.3-70B-Instruct,4.02,1257,41110,0.713,0.00035565,31.03,73.
+llama-3.3-70B-Instruct,4.02,1257,41110,0.713,0.00035565,31.03,73.7
 llama-3_1-Nemotron-Ultra-253B-v1,4.26,,,0.69,0.0031635,43.84,94.45
 llama-4-Maverick-17B-128E-Instruct-FP8,4,1271,50530,0.809,0.00067195,9.76,23.11
 llama-4-Scout-17B-16E-Instruct,4,,42990,0.752,0.000477,8.49,13.82
@@ -22,5 +22,5 @@ mistral-small-24b-instruct-2501,3.88,1217,35280,0.652,0.00012061,13.99,29.62
 nova-lite-v1,3.89,1217,32530,0.59,0.00015889,5.22,12.47
 nova-pro-v1,3.83,1245,37080,0.691,0.0013758,5.65,9.93
 o3-mini-2025-01-31,4.26,1305,62860,0.791,0.00612595,10.69,23.67
-o4-mini-2025-04-16,4.57,,69830,0.832,0.00792985,19.
-qwen-plus,4.17,1310,,,0.00094732,34.73,66.
+o4-mini-2025-04-16,4.57,,69830,0.832,0.00792985,19.1,52.3
+qwen-plus,4.17,1310,,,0.00094732,34.73,66.7
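A quick way to sanity-check the updated summary_data.csv header against the columns app.py now expects. This is a sketch: the Space's load_data helper is not shown in this diff, so plain pandas.read_csv is assumed here.

import pandas as pd

df_summary = pd.read_csv('data/summary_data.csv')

expected = {
    'Model', 'AutoBench', 'Chatbot Ar.', 'AAI Index', 'MMLU Index',
    'Costs (USD)', 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)',
}
missing = expected - set(df_summary.columns)
if missing:
    print(f"Warning: summary_data.csv is missing columns: {sorted(missing)}")
else:
    print(f"Loaded {len(df_summary)} models with all expected columns.")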