import copy

import pandas as pd
import numpy as np
import plotly.express as px
from plotly.graph_objs import Figure

from src.leaderboard.filter_models import FLAGGED_MODELS
from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS, external_eval_results, NUMERIC_INTERVALS
from src.leaderboard.read_evals import EvalResult



def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Generates, for every metric, a DataFrame of the running maximum score up to each date.

    :param raw_data: A list of EvalResult objects containing metric scores and dates.
    :return: A dict mapping each metric column name to a DataFrame of the best score per date.
    """
    # Step 1: Build the results DataFrame, normalize the dates, and sort by date

    # Create a DataFrame with the EvalResult dataclass columns, even if raw_data is empty
    raw_data = copy.deepcopy(raw_data)
    for external_row in external_eval_results:
        raw_data.append(EvalResult(**external_row))
    results_df = pd.DataFrame(raw_data, columns=EvalResult.__dataclass_fields__.keys())

    # Parse dates (mixed formats, UTC) and normalize them to plain "YYYY-MM-DD" strings
    results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
    results_df["date"] = results_df["date"].dt.strftime("%Y-%m-%d")
    results_df.sort_values(by="date", inplace=True)

    # Step 2: Initialize the scores dictionary
    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}

    # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
        current_max = 0
        last_date = ""
        column = task.col_name
        for _, row in results_df.iterrows():
            current_model = row["full_model"]
            # We ignore models that are flagged/no longer on the hub/not finished 
            to_ignore = not row["still_on_hub"] or row["flagged"] or current_model in FLAGGED_MODELS or row["status"] != "FINISHED"
            if to_ignore:
                continue

            current_date = row["date"]
            if task.benchmark == "Average":
                current_score = np.mean(list(row["results"].values()))
            else:
                if task.benchmark not in row["results"]:
                    continue
                current_score = row["results"][task.benchmark]

            if current_score > current_max:
                if current_date == last_date and len(scores[column]) > 0:
                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
                else:
                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
                current_max = current_score
                last_date = current_date

    # Step 4: Return all dictionaries as DataFrames
    return {k: pd.DataFrame(v, columns=["model", "date", "score"]) for k, v in scores.items()}
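# Usage sketch (hypothetical): `raw_data` would be the list of EvalResult objects loaded
# elsewhere (e.g. by src.leaderboard.read_evals). Each returned DataFrame holds the running
# best score per date for one metric column.
#
#     scores = create_scores_df(raw_data)
#     best_average_over_time = scores[AutoEvalColumn.average.name]
#     print(best_average_over_time.tail())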


def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Transforms the per-metric score DataFrames into a single long-format DataFrame suitable for plotting.

    :param scores_df: A dict mapping each metric column name to a DataFrame of scores and dates.
    :return: A single concatenated DataFrame, sorted by date, with a "task" column identifying the metric.
    """
    # Initialize the list to store DataFrames
    dfs = []

    # Iterate over the cols and create a new DataFrame for each column
    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
        d = scores_df[col].reset_index(drop=True)
        d["task"] = col
        dfs.append(d)

    # Concatenate all the created DataFrames
    concat_df = pd.concat(dfs, ignore_index=True)

    # Sort values by 'date'
    concat_df.sort_values(by="date", inplace=True)
    concat_df.reset_index(drop=True, inplace=True)
    return concat_df
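# Usage sketch (hypothetical), chaining from create_scores_df: the resulting long-format
# frame has one row per (date, task) best score, which is what px.line expects below.
#
#     plot_df = create_plot_df(create_scores_df(raw_data))
#     print(plot_df[["date", "task", "score", "model"]].head())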


def create_metric_plot_obj(
    df: pd.DataFrame, metrics: list[str], title: str
) -> Figure:
    """
    Create a Plotly figure object with lines representing different metrics
    and horizontal dotted lines representing human baselines.

    :param df: The DataFrame containing the metric values, names, and dates.
    :param metrics: A list of strings representing the names of the metrics
                    to be included in the plot.
    :param title: A string representing the title of the plot.
    :return: A Plotly figure object with lines representing metrics and
             horizontal dotted lines representing human baselines.
    """

    # Filter the DataFrame based on the specified metrics
    df = df[df["task"].isin(metrics)]

    # Filter the human baselines based on the specified metrics
    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics if v is not None}

    # Create a line figure using plotly express with specified markers and custom data
    fig = px.line(
        df,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )

    # Update hovertemplate for better hover interaction experience
    fig.update_traces(
        hovertemplate="<br>".join(
            [
                "Model Name: %{customdata[2]}",
                "Metric Name: %{customdata[0]}",
                "Date: %{x}",
                "Metric Value: %{y}",
            ]
        )
    )

    # The y-axis range is left automatic; uncomment the next line to pin it to [0, 100]
    # fig.update_layout(yaxis_range=[0, 100])

    # Create a dictionary to hold the color mapping for each metric
    metric_color_mapping = {}

    # Map each metric name to its color in the figure
    for trace in fig.data:
        metric_color_mapping[trace.name] = trace.line.color

    # Iterate over filtered human baselines and add horizontal lines to the figure
    for metric, value in filtered_human_baselines.items():
        color = metric_color_mapping.get(metric, "blue")  # Retrieve color from mapping; default to blue if not found
        location = "top left" if metric == "HellaSwag" else "bottom left"  # Set annotation position
        # Add horizontal line with matched color and positioned annotation
        fig.add_hline(
            y=value,
            line_dash="dot",
            annotation_text=f"{metric} human baseline",
            annotation_position=location,
            annotation_font_size=10,
            annotation_font_color=color,
            line_color=color,
        )

    return fig
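# Usage sketch (hypothetical): plot the running best average score over time; a dotted
# horizontal line is added for any selected metric that has a human baseline defined.
#
#     fig = create_metric_plot_obj(
#         create_plot_df(create_scores_df(raw_data)),
#         metrics=[AutoEvalColumn.average.name],
#         title="Best average score over time",
#     )
#     fig.show()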

def create_lat_score_mem_plot_obj(leaderboard_df: pd.DataFrame) -> Figure:
    """
    Creates a scatter plot of evaluation time vs. average score, with marker size reflecting model size.

    :param leaderboard_df: DataFrame containing the leaderboard data.
    :return: A Plotly figure object.
    """
    copy_df = leaderboard_df.copy()
    # Drop the baseline rows so only real models are plotted
    copy_df = copy_df[~(copy_df[AutoEvalColumn.dummy.name].isin(["baseline", "human_baseline"]))]

    # Columns exposed in the hover tooltip
    SCORE_MEMORY_LATENCY_DATA = [
        AutoEvalColumn.dummy.name,
        AutoEvalColumn.average.name,
        AutoEvalColumn.params.name,
        AutoEvalColumn.architecture.name,
        "Evaluation Time (min)"
    ]

    copy_df["LLM Average Score"] = copy_df[AutoEvalColumn.average.name]
    copy_df["Evaluation Time (min)"] = copy_df[AutoEvalColumn.eval_time.name] / 60

    #copy_df["size"] = copy_df[AutoEvalColumn.params.name]
    copy_df["size"] = copy_df[AutoEvalColumn.params.name].apply(lambda x: 0.5 if 0 <= x < 0.8 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 0.8 if 0.8 <= x < 2 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 1.5 if 2 <= x < 5 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 2.0 if 5 <= x < 10 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 3.0 if 10 <= x < 35 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 4.0 if 35 <= x < 60 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 6.0 if 60 <= x < 90 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 8.0 if x >= 90 else x)

    fig = px.scatter(
        copy_df,
        x="Evaluation Time (min)",
        y="LLM Average Score",
        size="size",
        color=AutoEvalColumn.architecture.name,
        custom_data=SCORE_MEMORY_LATENCY_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
        log_x=True
    )
    fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
        )
    )
    fig.update_layout(
        title={
            "text": "Eval Time vs. Score vs. #Params",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Time To Evaluate (min)",
        yaxis_title="LLM Average Score",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )

    return fig
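# Usage sketch (hypothetical): `leaderboard_df` is the rendered leaderboard DataFrame,
# i.e. it already contains the AutoEvalColumn display columns referenced above.
#
#     fig = create_lat_score_mem_plot_obj(leaderboard_df)
#     fig.show()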

def create_top_n_models_comparison_plot(leaderboard_df: pd.DataFrame, top_n: int = 5, size_filter: str | None = None) -> Figure:
    """
    Creates a grouped bar chart comparing the performance of the top N models across all metrics.

    :param leaderboard_df: DataFrame containing the leaderboard data.
    :param top_n: The number of top models to include in the comparison (default is 5).
    :param size_filter: If provided, only include models of this specific size category.
    :return: A Plotly figure object representing the comparison plot.
    """
    # Ensure BENCHMARK_COLS contains the correct metric column names
    metric_cols = BENCHMARK_COLS

    # Filter out non-model rows (like baseline or human) and select relevant columns
    models_df = leaderboard_df[~leaderboard_df[AutoEvalColumn.dummy.name].isin(["baseline", "human_baseline"])].copy()
    
    # Add size group information to the DataFrame
    models_df['size_group'] = models_df[AutoEvalColumn.params.name].apply(
        lambda x: next((k for k, v in NUMERIC_INTERVALS.items() if x in v), '?')
    )
    
    # Filter by size category if specified
    if size_filter and size_filter != 'All Sizes':
        models_df = models_df[models_df['size_group'] == size_filter]
        if models_df.empty:
            # If no models match the size filter, return an empty figure with a message
            fig = px.bar(
                x=["No Data"],
                y=[0],
                title=f"No models found in the {size_filter} size category"
            )
            fig.update_layout(
                xaxis_title="",
                yaxis_title="",
                showlegend=False
            )
            return fig
    
    # Sort models by average score and select the top N
    top_models_df = models_df.nlargest(top_n, AutoEvalColumn.average.name)

    # Select only the necessary columns: model name and metric scores
    plot_data = top_models_df[[AutoEvalColumn.dummy.name] + metric_cols]

    # Melt the DataFrame to long format suitable for plotting
    # 'id_vars' specifies the column(s) to keep as identifiers
    # 'value_vars' specifies the columns to unpivot
    # 'var_name' is the name for the new column containing the original column names (metrics)
    # 'value_name' is the name for the new column containing the values (scores)
    melted_df = pd.melt(
        plot_data,
        id_vars=[AutoEvalColumn.dummy.name],
        value_vars=metric_cols,
        var_name="Metric",
        value_name="Score",
    )
    
    # Clamp scores to the valid 0-100 range
    melted_df['Score'] = melted_df['Score'].clip(lower=0, upper=100)
    
    # Create the grouped bar chart
    fig = px.bar(
        melted_df,
        x="Metric",
        y="Score",
        color=AutoEvalColumn.dummy.name,  # Group bars by model name
        barmode="group",  # Display bars side-by-side for each metric
        title=f"Top {top_n} Models Comparison Across Metrics",
        labels={AutoEvalColumn.dummy.name: "Model"}, # Rename legend title
        custom_data=[AutoEvalColumn.dummy.name, "Metric", "Score"], # Data for hover
        range_y=[0, 100],  # Initial y-axis range; refined dynamically in update_layout below
    )

    # Update hovertemplate
    fig.update_traces(
        hovertemplate="<br>".join(
            [
                "Model: %{customdata[0]}",
                "Metric: %{customdata[1]}",
                "Score: %{customdata[2]:.2f}", # Format score to 2 decimal places
                "<extra></extra>", # Remove the default trace info
            ]
        )
    )

    # Create title with size filter information if applicable
    title_text = f"Top {top_n} Models Comparison Across Metrics"
    if size_filter and size_filter != 'All Sizes':
        title_text += f" ({size_filter} Models)"
    
    # Calculate appropriate y-axis range based on the data
    min_score = melted_df['Score'].min()
    max_score = melted_df['Score'].max()
    
    # Set y-axis minimum (start at 0 unless all scores are high)
    y_min = 40 if min_score > 50 else 0
    
    # Set y-axis maximum (ensure there's room for annotations)
    y_max = 100 if max_score < 95 else 105
    
    # Optional: Adjust layout for better readability
    fig.update_layout(
        title={
            "text": title_text,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Metric",
        yaxis_title="Score (%)",
        legend_title="Model",
        yaxis=dict(
            range=[y_min, y_max],  # Set y-axis range dynamically
            constrain="domain",  # Constrain the axis to the domain
            constraintoward="top"  # Constrain toward the top
        ),
        width=1600,
        height=450,
    )
    
    # Define shape icons for each model
    shape_icons = {
        0: "triangle-up",    # First model gets triangle
        1: "square",         # Second model gets square
        2: "circle",         # Third model gets circle
        3: "diamond",        # Fourth model gets diamond
        4: "star",           # Fifth model gets star
        5: "pentagon",       # Sixth model gets pentagon
        6: "hexagon",        # Seventh model gets hexagon
        7: "cross",          # Eighth model gets cross
        8: "x",              # Ninth model gets x
        9: "hourglass",      # Tenth model gets hourglass
    }
    
    # Add shapes to the legend and annotations with icons for each bar
    for i, bar in enumerate(fig.data):
        model_name = bar.name
        model_index = list(top_models_df[AutoEvalColumn.dummy.name].unique()).index(model_name) % len(shape_icons)
        icon_shape = shape_icons[model_index]
        
        # Update the name in the legend to include the shape symbol
        shape_symbol = get_symbol_for_shape(icon_shape)
        fig.data[i].name = f"{shape_symbol} {model_name}"
        
        # For each bar in this trace
        for j, (x, y) in enumerate(zip(bar.x, bar.y)):
            # Use the actual bar score instead of the average
            score_text = f"<b>{y:.1f}</b>"
                
            # Calculate the exact position for the annotation
            # Plotly's grouped bar charts position bars at specific offsets
            # We need to match these offsets exactly
            num_models = len(top_models_df[AutoEvalColumn.dummy.name].unique())
            
            # The total width allocated for all bars in a group
            total_group_width = 0.8
            
            # Width of each individual bar
            bar_width = total_group_width / num_models
            
            # Calculate the offset for this specific bar within its group
            # i represents which model in the group (0 is the first model, etc.)
            # Center of the group is at x, so we need to adjust from there
            offset = (i - (num_models-1)/2) * bar_width
            
            # Add score text directly above its bar
            fig.add_annotation(
                x=x,
                y=y + 2,  # Position slightly above the bar
                text=score_text,  # Display the actual bar score
                showarrow=False,
                font=dict(
                    size=10,
                    color=bar.marker.color  # Match the bar color
                ),
                opacity=0.9,
                xshift=offset * 130  # Adjust the multiplier to better center the annotation
            )
            
            # Add the shape icon above the score
            fig.add_annotation(
                x=x,
                y=y - 3,  # Position just below the bar top, under the score text
                text=get_symbol_for_shape(icon_shape),  # Convert shape name to symbol
                showarrow=False,
                font=dict(
                    size=14,
                    color="black"  # Match the bar color
                ),
                opacity=0.9,
                xshift=offset * 130  # Adjust the multiplier to better center the annotation
            )

    return fig
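# Usage sketch (hypothetical): compare the five best models overall, or restrict the
# comparison to a single size bucket by passing one of NUMERIC_INTERVALS' keys.
#
#     fig = create_top_n_models_comparison_plot(leaderboard_df, top_n=5)
#     some_bucket = next(iter(NUMERIC_INTERVALS))  # any size-bucket label
#     small_fig = create_top_n_models_comparison_plot(leaderboard_df, top_n=3, size_filter=some_bucket)
#     fig.show()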

def get_symbol_for_shape(shape_name):
    """Convert shape name to a symbol character that can be used in annotations."""
    symbols = {
        "triangle-up": "β–²",
        "square": "β– ",
        "circle": "●",
        "diamond": "β—†",
        "star": "β˜…",
        "pentagon": "⬟",
        "hexagon": "β¬’",
        "cross": "✚",
        "x": "βœ–",
        "hourglass": "β§—"
    }
    return symbols.get(shape_name, "●")  # Default to circle if shape not found