PeterKruger committed
Commit e4f522a · 1 Parent(s): 61cd432

optimization of the leaderboard

Files changed (2):
  1. app.py +74 -42
  2. data/summary_data.csv +6 -6
app.py CHANGED
@@ -42,15 +42,15 @@ df_avg_latency = load_data(AVG_LATENCY_FILE)
 df_p99_latency = load_data(P99_LATENCY_FILE)
 print("Data loading complete.")
 
-# --- *** NEW: Convert Costs to Cents *** ---
+# --- *** NEW: Convert Costs to USD Cents *** ---
 COST_COLUMN_SUMMARY = 'Costs (USD)' # IMPORTANT: Check this matches your summary_data.csv header EXACTLY
-NEW_COST_COLUMN_SUMMARY = 'Avg Cost (Cents)' # This is the new name we'll use
+NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)' # This is the new name we'll use
 
 # Convert summary cost
 if not df_summary.empty and COST_COLUMN_SUMMARY in df_summary.columns:
-    df_summary[COST_COLUMN_SUMMARY] = pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100
+    df_summary[COST_COLUMN_SUMMARY] = (pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100).round(3) # <-- ADDED .round(3)
     df_summary.rename(columns={COST_COLUMN_SUMMARY: NEW_COST_COLUMN_SUMMARY}, inplace=True)
-    print(f"Converted '{COST_COLUMN_SUMMARY}' to Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
+    print(f"Converted '{COST_COLUMN_SUMMARY}' to $ Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
 else:
     print(f"Warning: Column '{COST_COLUMN_SUMMARY}' not found in df_summary for conversion.")
 
@@ -61,8 +61,8 @@ if not df_cost.empty:
     cost_cols = [col for col in df_cost.columns if col != model_col_name]
     for col in cost_cols:
         # Handle potential non-numeric data gracefully before multiplying
-        df_cost[col] = pd.to_numeric(df_cost[col], errors='coerce') * 100
-    print("Converted cost breakdown columns to Cents in df_cost.")
+        df_cost[col] = (pd.to_numeric(df_cost[col], errors='coerce') * 100).round(3) # <-- ADDED .round(3)
+    print("Converted cost breakdown columns to $ Cents in df_cost.")
 # --- *** End of Cost Conversion *** ---
 
 # Rename columns for clarity if needed (example for summary)
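For reference, a minimal standalone sketch of the conversion these two hunks apply: the USD cost is coerced to numeric, scaled to cents, rounded to three decimals, and the column is renamed. The sample values below are hypothetical.

```python
import pandas as pd

# Hypothetical rows; only the cost column matters for this sketch.
df_summary = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Costs (USD)": [0.00182703, "n/a"],  # a non-numeric value to show errors='coerce'
})

COST_COLUMN_SUMMARY = "Costs (USD)"
NEW_COST_COLUMN_SUMMARY = "Avg Cost ($ Cents)"

# USD -> cents, coercing bad values to NaN and rounding to 3 decimals, as in the commit.
df_summary[COST_COLUMN_SUMMARY] = (
    pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors="coerce") * 100
).round(3)
df_summary.rename(columns={COST_COLUMN_SUMMARY: NEW_COST_COLUMN_SUMMARY}, inplace=True)

print(df_summary)  # model-a -> 0.183 cents, model-b -> NaN
```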
@@ -75,22 +75,34 @@ try:
         # 'Avg Answer Duration (sec)': 'Avg Latency (s)',
         # 'P99 Answer Duration (sec)': 'P99 Latency (s)'
     })
-    # Select and reorder columns for the main table
-    summary_cols_display = ['Model', 'AB', 'CBA', 'AAII', 'MMLU', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)'] # <-- MODIFIED
+    # Select and reorder columns for the main table - REMOVED BENCHMARK COLUMNS
+    summary_cols_display = ['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)']
     # Filter to only columns that actually exist after loading and renaming
     summary_cols_display = [col for col in summary_cols_display if col in df_summary.columns]
-    df_summary_display = df_summary[summary_cols_display]
+    df_summary_display = df_summary[summary_cols_display].copy() # Use .copy() to avoid SettingWithCopyWarning
 
-    # Ensure AB score is numeric for sorting
-    if 'AB' in df_summary_display.columns:
-        df_summary_display['AB'] = pd.to_numeric(df_summary_display['AB'], errors='coerce')
-        df_summary_display = df_summary_display.sort_values(by='AB', ascending=False)
+    # Select columns for the new benchmark comparison table
+    benchmark_cols = ['Model', 'AutoBench', 'Chatbot Ar.', 'AAI Index', 'MMLU Index']
+    benchmark_cols = [col for col in benchmark_cols if col in df_summary.columns] # Filter existing
+    df_benchmark_display = df_summary[benchmark_cols].copy() # Use .copy()
+
+    # Ensure AutoBench score is numeric for sorting BOTH display tables
+    if 'AutoBench' in df_summary_display.columns:
+        df_summary_display['AutoBench'] = pd.to_numeric(df_summary_display['AutoBench'], errors='coerce')
+        df_summary_display.sort_values(by='AutoBench', ascending=False, inplace=True) # Use inplace=True
+    else:
+        print("Warning: 'AutoBench' column not found for sorting summary table.")
+
+    if 'AutoBench' in df_benchmark_display.columns:
+        df_benchmark_display['AutoBench'] = pd.to_numeric(df_benchmark_display['AutoBench'], errors='coerce')
+        df_benchmark_display.sort_values(by='AutoBench', ascending=False, inplace=True) # Use inplace=True
     else:
-        print("Warning: 'AB' column not found for sorting summary table.")
+        print("Warning: 'AutoBench' column not found for sorting benchmark table.")
 
 except KeyError as e:
-    print(f"Error preparing summary display columns: Missing key {e}. Check CSV headers and rename mapping.")
-    df_summary_display = df_summary # Fallback to raw loaded data
+    print(f"Error preparing display columns: Missing key {e}. Check CSV headers and rename mapping.")
+    df_summary_display = df_summary.copy() # Fallback
+    df_benchmark_display = pd.DataFrame() # Fallback to empty for benchmark table
 
 
 # --- Build Gradio App ---
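The `.copy()` calls added above let the subsequent in-place coercion and sorting operate on an independent frame instead of a slice of `df_summary`, which is what triggers pandas' SettingWithCopyWarning. A minimal sketch of the benchmark-table preparation step, using the new CSV column names; the rows are hypothetical.

```python
import pandas as pd

# Hypothetical rows using the renamed columns from summary_data.csv.
df_summary = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "AutoBench": ["4.2", "3.9", "4.5"],   # strings on purpose, as CSV loading can yield
    "Chatbot Ar.": [1293, 1217, None],
    "AAI Index": [48150, 32530, 69830],
    "MMLU Index": [0.803, 0.59, 0.832],
})

benchmark_cols = ["Model", "AutoBench", "Chatbot Ar.", "AAI Index", "MMLU Index"]
benchmark_cols = [c for c in benchmark_cols if c in df_summary.columns]

# Explicit .copy() so the in-place edits below work on an independent frame.
df_benchmark_display = df_summary[benchmark_cols].copy()
df_benchmark_display["AutoBench"] = pd.to_numeric(df_benchmark_display["AutoBench"], errors="coerce")
df_benchmark_display.sort_values(by="AutoBench", ascending=False, inplace=True)

print(df_benchmark_display)  # model-c (4.5) first, then model-a, then model-b
```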
@@ -98,35 +110,55 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# AutoBench LLM Leaderboard")
     gr.Markdown(
         "Interactive leaderboard for AutoBench, where LLMs rank LLMs' responses. "
-        "Includes performance, cost, and latency metrics.\n"
+        "Includes performance, cost, and latency metrics.\\n"
         "More info: [AutoBench Blog Post](https://huggingface.co/blog/PeterKruger/autobench)"
     )
 
     # --- Tab 1: Overall Ranking ---
     with gr.Tab("Overall Ranking"):
         gr.Markdown("## Overall Model Performance")
-        gr.Markdown("Models ranked by AutoBench score. Correlations: AB vs CBA: 83.74%, AB vs AAII: 72.49%. (Lower cost [Cents]/latency is better).") # <-- MODIFIED
+        # REMOVED benchmark correlations from Markdown
+        gr.Markdown("Models ranked by AutoBench score. Lower cost ($ Cents) and latency (s) are better.")
         # Check if df_summary_display has data before rendering
         if not df_summary_display.empty:
+            # Create a copy specifically for this tab's display and rename the column
+            df_overall_rank_display = df_summary_display.copy()
+            if 'AutoBench' in df_overall_rank_display.columns:
+                df_overall_rank_display.rename(columns={'AutoBench': 'Rank'}, inplace=True)
+
             gr.DataFrame(
-                df_summary_display,
-                datatype=['str'] + ['number'] * (len(df_summary_display.columns) - 1), # Assume first col is text, rest numbers
+                df_overall_rank_display, # Pass the renamed DF
+                # Adjust datatype length based on potentially fewer columns
+                datatype=['str'] + ['number'] * (len(df_overall_rank_display.columns) - 1),
                 interactive=True, # Allows sorting
                 # height=600 # Adjust height as needed
             )
         else:
             gr.Markdown("_(Summary data failed to load or is empty. Please check `summary_data.csv`)_")
 
-    # --- Tab 2: Performance Plots ---
+    # --- NEW Tab 1.5: Benchmark Comparison ---
+    with gr.Tab("Benchmark Comparison"):
+        gr.Markdown("## Benchmark Comparison")
+        gr.Markdown("Comparison of AutoBench scores with other popular benchmarks (Chatbot Arena, Artificial Analysis Index, MMLU Index). Models sorted by AutoBench score.")
+        if not df_benchmark_display.empty:
+            gr.DataFrame(
+                df_benchmark_display,
+                datatype=['str'] + ['number'] * (len(df_benchmark_display.columns) - 1),
+                interactive=True # Allow sorting
+            )
+        else:
+            gr.Markdown("_(Benchmark comparison data could not be prepared. Check `summary_data.csv` for 'Chatbot Ar.', 'AAI Index', 'MMLU Index' columns.)_")
+
+    # --- Tab 2: Performance Plots ---
     with gr.Tab("Performance Plots"):
         gr.Markdown("## Performance Visualizations")
         gr.Markdown("Exploring relationships between AutoBench Rank, Latency, and Cost.")
 
         # Scatter Plot 1 (using summary data)
         gr.Markdown("### Rank vs. Average Cost")
-        if not df_summary.empty and 'AB' in df_summary.columns and NEW_COST_COLUMN_SUMMARY in df_summary.columns:
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and NEW_COST_COLUMN_SUMMARY in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df = df_summary.dropna(subset=['AB', NEW_COST_COLUMN_SUMMARY, 'Model']).copy()
+            plot_df = df_summary.dropna(subset=['AutoBench', NEW_COST_COLUMN_SUMMARY, 'Model']).copy()
             plot_df[NEW_COST_COLUMN_SUMMARY] = pd.to_numeric(plot_df[NEW_COST_COLUMN_SUMMARY], errors='coerce')
             plot_df = plot_df.dropna(subset=[NEW_COST_COLUMN_SUMMARY]) # Drop if cost conversion failed
 
@@ -134,12 +166,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
                 fig_cost = px.scatter(
                     plot_df,
                     x=NEW_COST_COLUMN_SUMMARY,
-                    y="AB",
+                    y="AutoBench",
                     text="Model", # Show model name near point
                     log_x=True, # Use log scale for cost
-                    title="AutoBench Rank vs. Average Cost per Response (USD Cents - Log Scale)",
-                    labels={'AB': 'AutoBench Rank', NEW_COST_COLUMN_SUMMARY: 'Avg Cost (USD Cents) - Log Scale'},
-                    hover_data=['Model', 'AB', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)'] # Show details on hover
+                    title="AutoBench Rank vs. Average Cost per Response ($ Cents - Log Scale)",
+                    labels={'AutoBench': 'AutoBench Rank', NEW_COST_COLUMN_SUMMARY: 'Avg Cost ($ Cents) - Log Scale'},
+                    hover_data=['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)'] # Show details on hover
                 )
                 fig_cost.update_traces(textposition='top center')
                 fig_cost.update_layout(
@@ -160,15 +192,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
                 )
                 gr.Plot(fig_cost)
             else:
-                gr.Markdown("_(Insufficient valid data for Rank vs Cost plot. Check 'AB' and NEW_COST_COLUMN_SUMMARY columns in `summary_data.csv`)_")
+                gr.Markdown("_(Insufficient valid data for Rank vs Cost plot. Check 'AutoBench' and NEW_COST_COLUMN_SUMMARY columns in `summary_data.csv`)_")
         else:
             gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Cost plot)_")
 
         # Plot 2: Rank vs Average Latency
         gr.Markdown("### Rank vs. Average Latency")
-        if not df_summary.empty and 'AB' in df_summary.columns and 'Avg Answer Duration (sec)' in df_summary.columns:
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'Avg Answer Duration (sec)' in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df_avg_latency = df_summary.dropna(subset=['AB', 'Avg Answer Duration (sec)', 'Model']).copy()
+            plot_df_avg_latency = df_summary.dropna(subset=['AutoBench', 'Avg Answer Duration (sec)', 'Model']).copy()
             plot_df_avg_latency['Avg Answer Duration (sec)'] = pd.to_numeric(plot_df_avg_latency['Avg Answer Duration (sec)'], errors='coerce')
             plot_df_avg_latency = plot_df_avg_latency.dropna(subset=['Avg Answer Duration (sec)']) # Drop if conversion failed
 
@@ -176,27 +208,27 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
                 fig_avg_latency = px.scatter(
                     plot_df_avg_latency,
                     x="Avg Answer Duration (sec)",
-                    y="AB",
+                    y="AutoBench",
                     text="Model",
                     log_x=True, # Use log scale for latency - adjust if not desired
                     title="AutoBench Rank vs. Average Latency (Log Scale)",
-                    labels={'AB': 'AutoBench Rank', 'Avg Answer Duration (sec)': 'Avg Latency (s) - Log Scale'},
-                    hover_data=['Model', 'AB', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
+                    labels={'AutoBench': 'AutoBench Rank', 'Avg Answer Duration (sec)': 'Avg Latency (s) - Log Scale'},
+                    hover_data=['Model', 'AutoBench', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
                 )
                 fig_avg_latency.update_traces(textposition='top center')
                 fig_avg_latency.update_layout(xaxis_title="Avg Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
                 gr.Plot(fig_avg_latency)
             else:
-                gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check 'AB' and 'Avg Answer Duration (sec)' columns in `summary_data.csv`)_")
+                gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check 'AutoBench' and 'Avg Answer Duration (sec)' columns in `summary_data.csv`)_")
         else:
             gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Avg Latency plot)_")
 
 
         # Plot 3: Rank vs P99 Latency
         gr.Markdown("### Rank vs. P99 Latency")
-        if not df_summary.empty and 'AB' in df_summary.columns and 'P99 Answer Duration (sec)' in df_summary.columns:
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'P99 Answer Duration (sec)' in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df_p99_latency = df_summary.dropna(subset=['AB', 'P99 Answer Duration (sec)', 'Model']).copy()
+            plot_df_p99_latency = df_summary.dropna(subset=['AutoBench', 'P99 Answer Duration (sec)', 'Model']).copy()
             plot_df_p99_latency['P99 Answer Duration (sec)'] = pd.to_numeric(plot_df_p99_latency['P99 Answer Duration (sec)'], errors='coerce')
             plot_df_p99_latency = plot_df_p99_latency.dropna(subset=['P99 Answer Duration (sec)']) # Drop if conversion failed
 
@@ -204,18 +236,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
                 fig_p99_latency = px.scatter(
                     plot_df_p99_latency,
                     x="P99 Answer Duration (sec)",
-                    y="AB",
+                    y="AutoBench",
                     text="Model",
                     log_x=True, # Use log scale for latency - adjust if not desired
                     title="AutoBench Rank vs. P99 Latency (Log Scale)",
-                    labels={'AB': 'AutoBench Rank', 'P99 Answer Duration (sec)': 'P99 Latency (s) - Log Scale'},
-                    hover_data=['Model', 'AB', 'P99 Answer Duration (sec)', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
+                    labels={'AutoBench': 'AutoBench Rank', 'P99 Answer Duration (sec)': 'P99 Latency (s) - Log Scale'},
+                    hover_data=['Model', 'AutoBench', 'P99 Answer Duration (sec)', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
                 )
                 fig_p99_latency.update_traces(textposition='top center')
                 fig_p99_latency.update_layout(xaxis_title="P99 Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
                 gr.Plot(fig_p99_latency)
             else:
-                gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check 'AB' and 'P99 Answer Duration (sec)' columns in `summary_data.csv`)_")
+                gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check 'AutoBench' and 'P99 Answer Duration (sec)' columns in `summary_data.csv`)_")
         else:
             gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs P99 Latency plot)_")
 
@@ -224,7 +256,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
         gr.Markdown("## Performance vs. Cost/Latency Trade-offs")
 
         # Cost Breakdown Table
-        gr.Markdown("### Cost Breakdown per Domain (USD Cents/Response)") # <-- MODIFIED
+        gr.Markdown("### Cost Breakdown per Domain ($ Cents/Response)") # <-- MODIFIED
         if not df_cost.empty:
             # Make model name the first column if it exists
             if 'model_name' in df_cost.columns:
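Taken together, the UI hunks above arrange the leaderboard into separate tabs: Overall Ranking, the new Benchmark Comparison, Performance Plots, and this trade-offs view. A minimal, self-contained sketch of that tab layout with toy DataFrames; the widget arguments mirror the diff, but the data and variable names here are hypothetical.

```python
import gradio as gr
import pandas as pd

# Toy stand-ins for df_summary_display / df_benchmark_display.
df_rank = pd.DataFrame({"Model": ["model-a", "model-b"], "Rank": [4.5, 4.2]})
df_bench = pd.DataFrame({"Model": ["model-a", "model-b"], "AutoBench": [4.5, 4.2], "MMLU Index": [0.83, 0.80]})

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# AutoBench LLM Leaderboard")
    with gr.Tab("Overall Ranking"):
        # First column is text, the rest are numbers; interactive=True enables sorting.
        gr.DataFrame(df_rank, datatype=["str"] + ["number"] * (len(df_rank.columns) - 1), interactive=True)
    with gr.Tab("Benchmark Comparison"):
        gr.DataFrame(df_bench, datatype=["str"] + ["number"] * (len(df_bench.columns) - 1), interactive=True)

if __name__ == "__main__":
    demo.launch()
```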
@@ -293,7 +325,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     * **Avg Cost (USD Cents/response):** Estimated average cost to generate one response based on model provider pricing (input+output tokens). Lower is better.
     * **Avg Latency (s):** Average time taken by the model to generate a response. Lower is better.
     * **P99 Latency (s):** The 99th percentile of response time, indicating worst-case latency. Lower is better.
-    * **CBA / AAII / MMLU:** Scores from other well-known benchmarks for comparison (where available).
+    * **Chatbot Arena / Artificial Analysis Intelligence Index / MMLU:** Scores from other well-known benchmarks for comparison (where available).
 
     ### Data
     This leaderboard reflects a run completed on April 23, 2025. The run included recently released models such as o4-mini, Gpt-4.1-mini, Gemini 2.5 Pro Preview, Claude 3.7 Sonnet:thinking, etc.
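The three Rank vs. cost/latency plots changed above all follow the same pattern: a log-scale Plotly scatter of the AutoBench score against a cost or latency column, with model names as point labels. A minimal sketch with made-up values; the column names follow the new header.

```python
import pandas as pd
import plotly.express as px

# Hypothetical points; in the app these rows come from df_summary.
plot_df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "AutoBench": [4.5, 4.2, 3.9],
    "Avg Cost ($ Cents)": [0.793, 1.134, 0.016],
})

fig = px.scatter(
    plot_df,
    x="Avg Cost ($ Cents)",
    y="AutoBench",
    text="Model",   # label each point with the model name
    log_x=True,     # costs span orders of magnitude, so use a log axis
    title="AutoBench Rank vs. Average Cost per Response ($ Cents - Log Scale)",
)
fig.update_traces(textposition="top center")
fig.update_layout(width=1000, height=800)
fig.show()
```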
 
data/summary_data.csv CHANGED
@@ -1,7 +1,7 @@
-Model,AB,CBA,AAII,MMLU,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec)
-claude-3.5-haiku-20241022,3.99,1237,34740,0.634,0.00182703,10.80,17.98
+Model,AutoBench,Chatbot Ar.,AAI Index,MMLU Index,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec)
+claude-3.5-haiku-20241022,3.99,1237,34740,0.634,0.00182703,10.8,17.98
 claude-3.7-sonnet,4.2,1293,48150,0.803,0.01133934,15.53,32.86
-claude-3.7-sonnet:thinking,4.39,1303,57390,0.837,0.0431979,45.80,82.60
+claude-3.7-sonnet:thinking,4.39,1303,57390,0.837,0.0431979,45.8,82.6
 deepSeek-R1,4.26,1358,60220,0.844,0.00515901,84.77,223.47
 deepSeek-V3,4.09,1318,45580,0.752,0.00094273,34.57,106.53
 deepSeek-V3-0324,4.16,1372,53240,0.819,0.00102168,42.28,140.54
@@ -13,7 +13,7 @@ gpt-4o-mini,4,1272,35680,0.648,0.00038653,12.17,21.75
 grok-2-1212,4.1,1288,39230,0.709,0.00847157,11.74,23.32
 grok-3-beta,4.34,1402,50630,0.799,0.01694996,33.94,69.79
 llama-3.1-Nemotron-70B-Instruct-HF,4.18,1269,37280,,0.00038647,25.04,48.74
-llama-3.3-70B-Instruct,4.02,1257,41110,0.713,0.00035565,31.03,73.70
+llama-3.3-70B-Instruct,4.02,1257,41110,0.713,0.00035565,31.03,73.7
 llama-3_1-Nemotron-Ultra-253B-v1,4.26,,,0.69,0.0031635,43.84,94.45
 llama-4-Maverick-17B-128E-Instruct-FP8,4,1271,50530,0.809,0.00067195,9.76,23.11
 llama-4-Scout-17B-16E-Instruct,4,,42990,0.752,0.000477,8.49,13.82
@@ -22,5 +22,5 @@ mistral-small-24b-instruct-2501,3.88,1217,35280,0.652,0.00012061,13.99,29.62
 nova-lite-v1,3.89,1217,32530,0.59,0.00015889,5.22,12.47
 nova-pro-v1,3.83,1245,37080,0.691,0.0013758,5.65,9.93
 o3-mini-2025-01-31,4.26,1305,62860,0.791,0.00612595,10.69,23.67
-o4-mini-2025-04-16,4.57,,69830,0.832,0.00792985,19.10,52.30
-qwen-plus,4.17,1310,,,0.00094732,34.73,66.70
+o4-mini-2025-04-16,4.57,,69830,0.832,0.00792985,19.1,52.3
+qwen-plus,4.17,1310,,,0.00094732,34.73,66.7
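Because this commit renames the CSV header (AB → AutoBench, CBA → Chatbot Ar., AAII → AAI Index, MMLU → MMLU Index), any code still reading the old column names will silently see them as missing. A small illustrative check, not part of the app, that could be run after loading; the file path matches the repo layout.

```python
import pandas as pd

SUMMARY_FILE = "data/summary_data.csv"
EXPECTED_COLUMNS = [
    "Model", "AutoBench", "Chatbot Ar.", "AAI Index", "MMLU Index",
    "Costs (USD)", "Avg Answer Duration (sec)", "P99 Answer Duration (sec)",
]

df = pd.read_csv(SUMMARY_FILE)
missing = [c for c in EXPECTED_COLUMNS if c not in df.columns]
if missing:
    print(f"Warning: summary_data.csv is missing expected columns: {missing}")
else:
    print(f"Loaded {len(df)} rows with the new header.")
```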
 