import copy

import numpy as np
import pandas as pd
import plotly.express as px
from plotly.graph_objs import Figure

from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS, external_eval_results, NUMERIC_INTERVALS
from src.leaderboard.filter_models import FLAGGED_MODELS
from src.leaderboard.read_evals import EvalResult


def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Generates, for every metric, a DataFrame with the running maximum scores up to each date.

    :param raw_data: A list of EvalResult entries containing metric scores and dates.
    :return: A dict mapping each metric column to a DataFrame of the best scores per date.
    """
    # Step 1: Build the DataFrame, ensure 'date' is in datetime format and sort by it.
    # The EvalResult dataclass columns are used so the frame is well-formed even if raw_data is empty.
    raw_data = copy.deepcopy(raw_data)
    for external_row in external_eval_results:
        raw_data.append(EvalResult(**external_row))
    results_df = pd.DataFrame(raw_data, columns=EvalResult.__dataclass_fields__.keys())

    # Convert 'date' to datetime, then to a simple date string such as 2025-04-26
    results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
    results_df["date"] = results_df["date"].dt.strftime("%Y-%m-%d")
    results_df.sort_values(by="date", inplace=True)

    # Step 2: Initialize the scores dictionary
    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}

    # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
        current_max = 0
        last_date = ""
        column = task.col_name
        for _, row in results_df.iterrows():
            current_model = row["full_model"]
            # We ignore models that are flagged/no longer on the hub/not finished
            to_ignore = (
                not row["still_on_hub"]
                or row["flagged"]
                or current_model in FLAGGED_MODELS
                or row["status"] != "FINISHED"
            )
            if to_ignore:
                continue

            current_date = row["date"]
            if task.benchmark == "Average":
                current_score = np.mean(list(row["results"].values()))
            else:
                if task.benchmark not in row["results"]:
                    continue
                current_score = row["results"][task.benchmark]

            if current_score > current_max:
                if current_date == last_date and len(scores[column]) > 0:
                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
                else:
                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
                current_max = current_score
                last_date = current_date

    # Step 4: Return all dictionaries as DataFrames
    return {k: pd.DataFrame(v, columns=["model", "date", "score"]) for k, v in scores.items()}
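

# Illustrative sketch (comments only, not executed): create_scores_df returns one DataFrame per
# benchmark column plus the average, each holding the running best score per date. It assumes
# `raw_data` is the list[EvalResult] loaded elsewhere by the app:
#
#     scores = create_scores_df(raw_data)
#     scores[AutoEvalColumn.average.name]
#     #          model        date  score     <- rows and values below are hypothetical
#     # 0  org/model-a  2024-01-03   41.2
#     # 1  org/model-b  2024-02-17   47.9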


def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Transforms the per-metric scores DataFrames into a single long-format DataFrame suitable for plotting.

    :param scores_df: A dict mapping each metric column to a DataFrame of scores and dates.
    :return: A new DataFrame reshaped for plotting purposes.
    """
    # Initialize the list to store DataFrames
    dfs = []

    # Iterate over the columns and create a new DataFrame for each of them
    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
        d = scores_df[col].reset_index(drop=True)
        d["task"] = col
        dfs.append(d)

    # Concatenate all the created DataFrames
    concat_df = pd.concat(dfs, ignore_index=True)

    # Sort values by 'date'
    concat_df.sort_values(by="date", inplace=True)
    concat_df.reset_index(drop=True, inplace=True)
    return concat_df
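

# Illustrative sketch (comments only, not executed): create_plot_df stacks the per-metric frames
# into one long-format DataFrame with a "task" column, which is what plotly express needs to draw
# one coloured line per metric:
#
#     plot_df = create_plot_df(create_scores_df(raw_data))
#     #          model        date  score       task   <- rows and values are hypothetical
#     # 0  org/model-a  2024-01-03   38.4  HellaSwag
#     # 1  org/model-a  2024-01-03   41.2    Average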


def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
    """
    Create a Plotly figure object with lines representing different metrics
    and horizontal dotted lines representing human baselines.

    :param df: The DataFrame containing the metric values, names, and dates.
    :param metrics: A list of strings representing the names of the metrics
        to be included in the plot.
    :param title: A string representing the title of the plot.
    :return: A Plotly figure object with lines representing metrics and
        horizontal dotted lines representing human baselines.
    """
    # Filter the DataFrame based on the specified metrics
    df = df[df["task"].isin(metrics)]

    # Filter the human baselines based on the specified metrics
    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics and v is not None}

    # Create a line figure using plotly express with specified markers and custom data
    fig = px.line(
        df,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )

    # Update the hovertemplate for a better hover interaction experience
    fig.update_traces(
        hovertemplate="<br>".join(
            [
                "Model Name: %{customdata[2]}",
                "Metric Name: %{customdata[0]}",
                "Date: %{x}",
                "Metric Value: %{y}",
            ]
        )
    )

    # Update the range of the y-axis
    # fig.update_layout(yaxis_range=[0, 100])

    # Map each metric name to its line color in the figure
    metric_color_mapping = {}
    for trace in fig.data:
        metric_color_mapping[trace.name] = trace.line.color

    # Iterate over filtered human baselines and add horizontal lines to the figure
    for metric, value in filtered_human_baselines.items():
        color = metric_color_mapping.get(metric, "blue")  # Retrieve color from mapping; default to blue if not found
        location = "top left" if metric == "HellaSwag" else "bottom left"  # Set annotation position
        # Add horizontal line with matched color and positioned annotation
        fig.add_hline(
            y=value,
            line_dash="dot",
            annotation_text=f"{metric} human baseline",
            annotation_position=location,
            annotation_font_size=10,
            annotation_font_color=color,
            line_color=color,
        )

    return fig
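

# Illustrative usage sketch (comments only, not executed), assuming `raw_data` is the
# list[EvalResult] the leaderboard app loads elsewhere:
#
#     plot_df = create_plot_df(create_scores_df(raw_data))
#     fig = create_metric_plot_obj(plot_df, BENCHMARK_COLS, title="Benchmark scores over time")
#     fig.show()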


def create_lat_score_mem_plot_obj(leaderboard_df):
    copy_df = leaderboard_df.copy()
    copy_df = copy_df[~(copy_df[AutoEvalColumn.dummy.name].isin(["baseline", "human_baseline"]))]

    # Columns exposed in the hover data of the scatter plot
    SCORE_MEMORY_LATENCY_DATA = [
        AutoEvalColumn.dummy.name,
        AutoEvalColumn.average.name,
        AutoEvalColumn.params.name,
        AutoEvalColumn.architecture.name,
        "Evaluation Time (min)",
    ]

    copy_df["LLM Average Score"] = copy_df[AutoEvalColumn.average.name]
    copy_df["Evaluation Time (min)"] = copy_df[AutoEvalColumn.eval_time.name] / 60

    # Bucket the parameter count into a handful of discrete marker sizes
    # copy_df["size"] = copy_df[AutoEvalColumn.params.name]
    copy_df["size"] = copy_df[AutoEvalColumn.params.name].apply(lambda x: 0.5 if 0 <= x < 0.8 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 0.8 if 0.8 <= x < 2 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 1.5 if 2 <= x < 5 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 2.0 if 5 <= x < 10 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 3.0 if 10 <= x < 35 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 4.0 if 35 <= x < 60 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 6.0 if 60 <= x < 90 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 8.0 if x >= 90 else x)

    fig = px.scatter(
        copy_df,
        x="Evaluation Time (min)",
        y="LLM Average Score",
        size="size",
        color=AutoEvalColumn.architecture.name,
        custom_data=SCORE_MEMORY_LATENCY_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
        log_x=True,
    )
    fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
        )
    )
    fig.update_layout(
        title={
            "text": "Eval Time vs. Score vs. #Params",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Time To Evaluate (min)",
        yaxis_title="LLM Average Score",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )
    return fig
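

# A minimal, equivalent sketch of the size bucketing above, kept only to document the bucket
# boundaries (parameter counts presumably in billions); `_marker_size` is illustrative and is
# not called by the plotting code in this module.
def _marker_size(params: float) -> float:
    """Map a parameter count to the discrete marker size used in the scatter plot (sketch only)."""
    buckets = [(0.8, 0.5), (2, 0.8), (5, 1.5), (10, 2.0), (35, 3.0), (60, 4.0), (90, 6.0)]
    for upper_bound, size in buckets:
        if params < upper_bound:
            return size
    return 8.0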


def create_top_n_models_comparison_plot(leaderboard_df: pd.DataFrame, top_n: int = 5, size_filter: str = None) -> Figure:
    """
    Creates a grouped bar chart comparing the performance of the top N models across all metrics.

    :param leaderboard_df: DataFrame containing the leaderboard data.
    :param top_n: The number of top models to include in the comparison (default is 5).
    :param size_filter: If provided, only include models of this specific size category.
    :return: A Plotly figure object representing the comparison plot.
    """
    # Ensure BENCHMARK_COLS contains the correct metric column names
    metric_cols = BENCHMARK_COLS

    # Filter out non-model rows (like baseline or human) and select relevant columns
    models_df = leaderboard_df[~leaderboard_df[AutoEvalColumn.dummy.name].isin(["baseline", "human_baseline"])].copy()

    # Add size group information to the DataFrame
    models_df["size_group"] = models_df[AutoEvalColumn.params.name].apply(
        lambda x: next((k for k, v in NUMERIC_INTERVALS.items() if x in v), "?")
    )

    # Filter by size category if specified
    if size_filter and size_filter != "All Sizes":
        models_df = models_df[models_df["size_group"] == size_filter]
        if models_df.empty:
            # If no models match the size filter, return an empty figure with a message
            fig = px.bar(
                x=["No Data"],
                y=[0],
                title=f"No models found in the {size_filter} size category",
            )
            fig.update_layout(
                xaxis_title="",
                yaxis_title="",
                showlegend=False,
            )
            return fig

    # Sort models by average score and select the top N
    top_models_df = models_df.nlargest(top_n, AutoEvalColumn.average.name)

    # Select only the necessary columns: model name and metric scores
    plot_data = top_models_df[[AutoEvalColumn.dummy.name] + metric_cols]

    # Melt the DataFrame to a long format suitable for plotting:
    # 'id_vars' specifies the column(s) to keep as identifiers,
    # 'value_vars' specifies the columns to unpivot,
    # 'var_name' names the new column holding the original column names (metrics),
    # 'value_name' names the new column holding the values (scores).
    melted_df = pd.melt(
        plot_data,
        id_vars=[AutoEvalColumn.dummy.name],
        value_vars=metric_cols,
        var_name="Metric",
        value_name="Score",
    )

    # Clip scores so they stay within a reasonable range (0-100)
    melted_df["Score"] = melted_df["Score"].apply(lambda x: min(max(x, 0), 100))

    # Create the grouped bar chart
    fig = px.bar(
        melted_df,
        x="Metric",
        y="Score",
        color=AutoEvalColumn.dummy.name,  # Group bars by model name
        barmode="group",  # Display bars side-by-side for each metric
        title=f"Top {top_n} Models Comparison Across Metrics",
        labels={AutoEvalColumn.dummy.name: "Model"},  # Rename legend title
        custom_data=[AutoEvalColumn.dummy.name, "Metric", "Score"],  # Data for hover
        range_y=[0, 100],  # Force the y-axis range to 0-100
    )

    # Update the hovertemplate
    fig.update_traces(
        hovertemplate="<br>".join(
            [
                "Model: %{customdata[0]}",
                "Metric: %{customdata[1]}",
                "Score: %{customdata[2]:.2f}",  # Format score to 2 decimal places
                "<extra></extra>",  # Remove the default trace info
            ]
        )
    )

    # Extend the title with the size filter information if applicable
    title_text = f"Top {top_n} Models Comparison Across Metrics"
    if size_filter and size_filter != "All Sizes":
        title_text += f" ({size_filter} Models)"

    # Calculate an appropriate y-axis range based on the data
    min_score = melted_df["Score"].min()
    max_score = melted_df["Score"].max()
    # Start the y-axis at 0 unless all scores are high
    y_min = 40 if min_score > 50 else 0
    # Leave headroom above the highest bar for the annotations
    y_max = 100 if max_score < 95 else 105

    # Adjust the layout for better readability
    fig.update_layout(
        title={
            "text": title_text,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Metric",
        yaxis_title="Score (%)",
        legend_title="Model",
        yaxis=dict(
            range=[y_min, y_max],  # Set the y-axis range dynamically
            constrain="domain",  # Constrain the axis to the domain
            constraintoward="top",  # Constrain toward the top
        ),
        width=1600,
        height=450,
    )

    # Define a shape icon for each model, by rank
    shape_icons = {
        0: "triangle-up",  # First model
        1: "square",       # Second model
        2: "circle",       # Third model
        3: "diamond",      # Fourth model
        4: "star",         # Fifth model
        5: "pentagon",     # Sixth model
        6: "hexagon",      # Seventh model
        7: "cross",        # Eighth model
        8: "x",            # Ninth model
        9: "hourglass",    # Tenth model
    }

    # Get the average score for each model
    model_averages = {}
    for model in top_models_df[AutoEvalColumn.dummy.name].unique():
        try:
            model_averages[model] = top_models_df.loc[
                top_models_df[AutoEvalColumn.dummy.name] == model, AutoEvalColumn.average.name
            ].values[0]
        except (IndexError, KeyError):
            # If the average score is not available, use None
            model_averages[model] = None

    # Add shape symbols to the legend and annotate each bar with its score and a shape icon
    for i, bar in enumerate(fig.data):
        model_name = bar.name
        model_index = list(top_models_df[AutoEvalColumn.dummy.name].unique()).index(model_name) % len(shape_icons)
        icon_shape = shape_icons[model_index]

        # Update the name in the legend to include the shape symbol
        shape_symbol = get_symbol_for_shape(icon_shape)
        fig.data[i].name = f"{shape_symbol} {model_name}"

        # For each bar in this trace
        for j, (x, y) in enumerate(zip(bar.x, bar.y)):
            # Use the actual bar score instead of the model average
            score_text = f"<b>{y:.1f}</b>"

            # Calculate the exact position for the annotation.
            # Plotly's grouped bar charts place bars at fixed offsets within each group,
            # so we reproduce those offsets here.
            num_models = len(top_models_df[AutoEvalColumn.dummy.name].unique())
            # The total width allocated to all bars in a group
            total_group_width = 0.8
            # Width of each individual bar
            bar_width = total_group_width / num_models
            # Offset of this specific bar within its group:
            # i is the model index within the group (0 is the first model, etc.)
            # and the group is centered at x, so we shift relative to that center.
            offset = (i - (num_models - 1) / 2) * bar_width

            # Add the score text directly above its bar
            fig.add_annotation(
                x=x,
                y=y + 2,  # Position slightly above the bar
                text=score_text,  # Display the actual bar score
                showarrow=False,
                font=dict(
                    size=10,
                    color=bar.marker.color,  # Match the bar color
                ),
                opacity=0.9,
                xshift=offset * 130,  # Multiplier tuned to center the annotation over the bar
            )
            # Add the shape icon just below the top of the bar
            fig.add_annotation(
                x=x,
                y=y - 3,  # Position just below the bar top, under the score text
                text=get_symbol_for_shape(icon_shape),  # Convert the shape name to a symbol
                showarrow=False,
                font=dict(
                    size=14,
                    color="black",  # Black so the icon stays readable on any bar color
                ),
                opacity=0.9,
                xshift=offset * 130,  # Multiplier tuned to center the annotation over the bar
            )

    return fig
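

# Illustrative usage sketch (comments only, not executed); `leaderboard_df` is the leaderboard
# DataFrame built elsewhere in the app, and "~3B" stands in for one of the NUMERIC_INTERVALS labels:
#
#     fig = create_top_n_models_comparison_plot(leaderboard_df, top_n=3, size_filter="~3B")
#     fig.show()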


def get_symbol_for_shape(shape_name):
    """Convert a shape name to a symbol character that can be used in annotations."""
    symbols = {
        "triangle-up": "▲",
        "square": "■",
        "circle": "●",
        "diamond": "◆",
        "star": "★",
        "pentagon": "⬟",
        "hexagon": "⬢",
        "cross": "✚",
        "x": "✖",
        "hourglass": "⧗",
    }
    return symbols.get(shape_name, "●")  # Default to a circle if the shape is not found
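

# Quick sanity check (illustrative only):
#
#     >>> get_symbol_for_shape("star")
#     '★'
#     >>> get_symbol_for_shape("unknown-shape")
#     '●'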