Spaces:

andrewrreed
/

closed-vs-open-arena-elo

Runtime error

File size: 5,830 Bytes

import pickle

import pandas as pd
import gradio as gr
import plotly.express as px

from utils import (
    KEY_TO_CATEGORY_NAME,
    PROPRIETARY_LICENSES,
    download_latest_data_from_space,
)

# with gr.NO_RELOAD:
###################
### Load Data
###################

# gather ELO data
latest_elo_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)

with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

arena_dfs = {}
for k in KEY_TO_CATEGORY_NAME.keys():
    if k not in elo_results:
        continue
    arena_dfs[KEY_TO_CATEGORY_NAME[k]] = elo_results[k]["leaderboard_table_df"]

# gather open llm leaderboard data
latest_leaderboard_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

###################
### Prepare Data
###################

# merge leaderboard data with ELO data
merged_dfs = {}
for k, v in arena_dfs.items():
    merged_dfs[k] = (
        pd.merge(arena_dfs[k], leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )

# add release dates into the merged data
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
for k, v in merged_dfs.items():
    merged_dfs[k] = pd.merge(
        merged_dfs[k], release_date_mapping[["key", "Release Date"]], on="key"
    )

df = merged_dfs["Overall"]
df["License"] = df["License"].apply(
    lambda x: "Proprietary LLM" if x in PROPRIETARY_LICENSES else "Open LLM"
)
df["Release Date"] = pd.to_datetime(df["Release Date"])
df["Month-Year"] = df["Release Date"].dt.to_period("M")
df["rating"] = df["rating"].round()


###################
### Plot Data
###################

date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
min_elo_score = df["rating"].min().round()
max_elo_score = df["rating"].max().round()
upper_models_per_month = int(
    df.groupby(["Month-Year", "License"])["rating"].apply(lambda x: x.count()).max()
)


def build_plot(min_score, max_models_per_month, toggle_annotations):

    filtered_df = df[(df["rating"] >= min_score)]
    filtered_df = (
        filtered_df.groupby(["Month-Year", "License"])
        .apply(lambda x: x.nlargest(max_models_per_month, "rating"))
        .reset_index(drop=True)
    )

    fig = px.scatter(
        filtered_df,
        x="Release Date",
        y="rating",
        color="License",
        hover_name="Model",
        hover_data=["Organization", "License"],
        trendline="ols",
        title=f"Proprietary vs Open LLMs (LMSYS Arena ELO as of {date_updated})",
        labels={"rating": "Arena ELO", "Release Date": "Release Date"},
        height=700,
        template="seaborn",
    )

    fig.update_traces(marker=dict(size=10, opacity=0.6))

    if toggle_annotations:
        # get the points to annotate (only the highest rated model per month per license)
        idx_to_annotate = filtered_df.groupby(["Month-Year", "License"])[
            "rating"
        ].idxmax()
        points_to_annotate_df = filtered_df.loc[idx_to_annotate]

        for i, row in points_to_annotate_df.iterrows():
            fig.add_annotation(
                x=row["Release Date"],
                y=row["rating"],
                text=row["Model"],
                showarrow=True,
                arrowhead=0,
            )

    return fig


with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.sky,
        secondary_hue=gr.themes.colors.green,
        font=[
            gr.themes.GoogleFont("Open Sans"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        ],
    )
) as demo:
    gr.Markdown(
        """
        <div style="text-align: center; max-width: 650px; margin: auto;">
            <h1 style="font-weight: 900; margin-top: 5px;">🔬 Progress Tracker: Proprietary vs Open LLMs
            </h1>
            <p style="text-align: left; margin-top: 10px; margin-bottom: 10px; line-height: 20px;">
            This app visualizes the progress of proprietary and open-source LLMs in the LMSYS Arena ELO leaderboard. The idea is inspired by <a href="https://www.linkedin.com/posts/maxime-labonne_arena-elo-graph-updated-with-new-models-activity-7187062633735368705-u2jB?utm_source=share&utm_medium=member_desktop">this great work</a> from <a href="https://huggingface.co/mlabonne/">Maxime Labonne</a>.
            </p>
        </div>
        """
    )
    with gr.Row():
        min_score = gr.Slider(
            minimum=min_elo_score,
            maximum=max_elo_score,
            value=800,
            step=50,
            label="Minimum ELO Score",
        )
        max_models_per_month = gr.Slider(
            value=upper_models_per_month,
            minimum=1,
            maximum=upper_models_per_month,
            step=1,
            label="Max Models per Month (per License)",
        )
        toggle_annotations = gr.Radio(
            choices=[True, False], label="Overlay Best Model Name", value=False
        )

    # Show plot
    plot = gr.Plot()
    demo.load(
        fn=build_plot,
        inputs=[min_score, max_models_per_month, toggle_annotations],
        outputs=plot,
    )
    min_score.change(
        fn=build_plot,
        inputs=[min_score, max_models_per_month, toggle_annotations],
        outputs=plot,
    )
    max_models_per_month.change(
        fn=build_plot,
        inputs=[min_score, max_models_per_month, toggle_annotations],
        outputs=plot,
    )
    toggle_annotations.change(
        fn=build_plot,
        inputs=[min_score, max_models_per_month, toggle_annotations],
        outputs=plot,
    )

demo.launch()
# if __name__ == "__main__":