In [1]:
import os
import pickle

import pandas as pd
from huggingface_hub import HfFileSystem, hf_hub_download

  from .autonotebook import tqdm as notebook_tqdm


## Prepare data

In [2]:
from typing import Literal


def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Download the most recent data file of one type from a Hugging Face Space.

    Top-level files of the Space matching ``*.<file_type>`` are listed, the one
    whose trailing ``_<date>`` filename component compares highest is selected,
    and that file is fetched with ``hf_hub_download``.

    Args:
        repo_id (str): The ID of the repository space, e.g. ``"org/space-name"``.
        file_type (Literal["pkl", "csv"]): Extension of the data file to download.

    Returns:
        str: The local file path of the downloaded data file.

    Raises:
        IndexError: If the Space contains no file with the requested extension.
            (The exception type matches what indexing an empty list raised
            before; only the message is more descriptive.)
    """

    def extract_date(filename: str) -> str:
        # "<dir>/<prefix>_<date>.<ext>" -> "<date>"
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    files = fs.glob(f"spaces/{repo_id}/*.{file_type}")
    if not files:
        raise IndexError(
            f"No '*.{file_type}' files found in space '{repo_id}'; "
            "cannot determine the latest data file."
        )

    # Date suffixes are compared as plain strings.
    # NOTE(review): assumes a zero-padded, lexicographically sortable format
    # such as YYYYMMDD -- confirm against the actual file names in the Space.
    latest_file = max(files, key=extract_date)

    return hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )

In [3]:
# Fetch the newest leaderboard table (csv) and the newest ELO results (pkl)
# from the same Space; the csv is requested first, then the pkl.
latest_leaderboard_file_local, latest_elo_file_local = (
    download_latest_data_from_space(
        repo_id="lmsys/chatbot-arena-leaderboard", file_type=extension
    )
    for extension in ("csv", "pkl")
)

In [4]:
# Load the ELO results and keep one leaderboard table per category.

# Keys used inside the ELO results object -> display name of the category.
key_to_category_name = {
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}

# Display name of the category -> longer human-readable explanation.
cat_name_to_explanation = {
    "Overall": "Overall Questions",
    "Coding": "Coding: whether conversation contains code snippets",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
}

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# this file is downloaded from a remote Space, so only run this against a
# source you trust.
with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

# Categories missing from the results are skipped; the order follows
# key_to_category_name.
arena_dfs = {
    category_name: elo_results[result_key]["leaderboard_table_df"]
    for result_key, category_name in key_to_category_name.items()
    if result_key in elo_results
}

In [5]:
# List the category display names for which an ELO table was found.
arena_dfs.keys()

dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])

In [6]:
# Preview the leaderboard table of the "Overall" category (results key "full").
arena_dfs["Overall"]

Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
RWKV-4-Raven-14B,928.451251,26.146415,937.017097,919.444359,5129,82
alpaca-13b,908.084359,18.598539,915.348707,900.602847,6111,86
bard-jan-24-gemini-pro,1208.712877,7.975296,1213.331583,1203.004139,12387,6
chatglm-6b,886.873429,19.813751,894.785321,878.677878,5195,87
chatglm2-6b,933.337288,33.939472,944.493496,921.470740,2880,82
...,...,...,...,...,...,...
wizardlm-70b,1108.552744,8.988005,1114.390689,1102.745236,8867,29
yi-34b-chat,1111.132640,7.801741,1115.356993,1105.658254,13177,29
zephyr-7b-alpha,1043.084267,45.472021,1054.269954,1027.602171,1901,57
zephyr-7b-beta,1054.416300,11.094606,1060.265072,1047.790509,11924,55


In [7]:
# Read the downloaded leaderboard CSV (per-model metadata) into a DataFrame.
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

In [8]:
# Preview the leaderboard metadata; the "key" column holds the model identifier
# that is later matched against the index of the ELO tables.
leaderboard_df

Unnamed: 0,key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link
0,wizardlm-30b,WizardLM-30B,7.01,0.587,2023/6,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-30B-V1.0
1,vicuna-13b-16k,Vicuna-13B-16k,6.92,0.545,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5-16k
2,wizardlm-13b-v1.1,WizardLM-13B-v1.1,6.76,0.500,2023/7,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.1
3,tulu-30b,Tulu-30B,6.43,0.581,2023/6,Non-commercial,AllenAI/UW,https://huggingface.co/allenai/tulu-30b
4,guanaco-65b,Guanaco-65B,6.41,0.621,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-65b...
...,...,...,...,...,...,...,...,...
101,llama-3-70b-instruct,Llama-3-70b-Instruct,-,0.820,2023/12,Llama 3 Community,Meta,https://llama.meta.com/llama3/
102,llama-3-8b-instruct,Llama-3-8b-Instruct,-,0.684,2023/3,Llama 3 Community,Meta,https://llama.meta.com/llama3/
103,gemini-1.5-pro-api-0409-preview,Gemini 1.5 Pro API-0409-Preview,-,0.819,2023/11,Proprietary,Google,https://blog.google/technology/ai/google-gemin...
104,phi-3-mini-128k-instruct,Phi-3-Mini-128k-Instruct,-,0.681,2023/10,MIT,Microsoft,https://azure.microsoft.com/en-us/blog/introdu...


In [9]:
# Re-check the available categories before merging (same listing as earlier).
arena_dfs.keys()

dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])

In [10]:
# Join every per-category ELO table with the leaderboard metadata.
# The ELO tables are indexed by model key, so their index is matched against
# leaderboard_df["key"]. pd.merge defaults to an inner join, so models present
# on only one side are dropped. Rows are ordered from highest to lowest rating.
merged_dfs = {
    category_name: (
        elo_table.merge(leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    for category_name, elo_table in arena_dfs.items()
}

In [11]:
# Preview the merged "Overall" table (ELO statistics plus leaderboard metadata).
merged_dfs["Overall"]

Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking,key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link
0,1258.815279,3.258132,1262.796713,1256.000508,35931,1,gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09,-,-,2023/12,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-4-...
1,1252.684886,1.799233,1254.748391,1249.873417,73547,2,gpt-4-1106-preview,GPT-4-1106-preview,9.32,-,2023/4,Proprietary,OpenAI,https://openai.com/blog/new-models-and-develop...
2,1250.926206,2.018201,1253.851885,1248.166034,80997,2,claude-3-opus-20240229,Claude 3 Opus,-,0.868,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
3,1249.618395,3.233129,1252.956497,1246.247080,39482,2,gemini-1.5-pro-api-0409-preview,Gemini 1.5 Pro API-0409-Preview,-,0.819,2023/11,Proprietary,Google,https://blog.google/technology/ai/google-gemin...
4,1246.777591,1.942477,1249.979712,1244.305362,67354,2,gpt-4-0125-preview,GPT-4-0125-preview,-,-,2023/12,Proprietary,OpenAI,https://openai.com/blog/new-models-and-develop...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,886.873429,19.813751,894.785321,878.677878,5195,87,chatglm-6b,ChatGLM-6B,4.50,0.361,2023/3,Non-commercial,Tsinghua,https://huggingface.co/THUDM/chatglm-6b
88,876.929108,27.115855,887.355529,866.860534,4521,88,fastchat-t5-3b,FastChat-T5-3B,3.04,0.477,2023/4,Apache 2.0,LMSYS,https://huggingface.co/lmsys/fastchat-t5-3b-v1.0
89,848.932568,36.961459,859.103936,837.364341,3461,90,stablelm-tuned-alpha-7b,StableLM-Tuned-Alpha-7B,2.75,0.244,2023/4,CC-BY-NC-SA-4.0,Stability AI,https://huggingface.co/stabilityai/stablelm-tu...
90,826.647332,30.156414,837.335988,816.370788,3666,91,dolly-v2-12b,Dolly-V2-12B,3.28,0.257,2023/4,MIT,Databricks,https://huggingface.co/databricks/dolly-v2-12b


### Manually map release dates (tedious, hand-maintained step)

In [12]:
# Scaffold for the hand-filled mapping: one row per model with its key and
# display name, plus an empty "Release Date" column.
t = merged_dfs["Overall"][["key", "Model"]].assign(**{"Release Date": ""})

In [13]:
# Load the manually curated model-key -> release-date table.
# NOTE(review): the path is relative, so the JSON file must sit in the
# notebook's working directory.
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")

In [14]:
# Preview the release-date mapping (columns: key, Model, Release Date).
release_date_mapping

Unnamed: 0,key,Model,Release Date
0,gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09,2024-04-09
1,gpt-4-1106-preview,GPT-4-1106-preview,2023-11-06
2,claude-3-opus-20240229,Claude 3 Opus,2024-02-29
3,gemini-1.5-pro-api-0409-preview,Gemini 1.5 Pro API-0409-Preview,2024-04-09
4,gpt-4-0125-preview,GPT-4-0125-preview,2024-01-25
...,...,...,...
86,chatglm-6b,ChatGLM-6B,2023-03-13
87,fastchat-t5-3b,FastChat-T5-3B,2023-04-27
88,stablelm-tuned-alpha-7b,StableLM-Tuned-Alpha-7B,2023-04-19
89,dolly-v2-12b,Dolly-V2-12B,2023-04-12


In [15]:
# Only the join key and the release date are needed for the merge below.
release_date_mapping[["key", "Release Date"]]

Unnamed: 0,key,Release Date
0,gpt-4-turbo-2024-04-09,2024-04-09
1,gpt-4-1106-preview,2023-11-06
2,claude-3-opus-20240229,2024-02-29
3,gemini-1.5-pro-api-0409-preview,2024-04-09
4,gpt-4-0125-preview,2024-01-25
...,...,...
86,chatglm-6b,2023-03-13
87,fastchat-t5-3b,2023-04-27
88,stablelm-tuned-alpha-7b,2023-04-19
89,dolly-v2-12b,2023-04-12


In [16]:
# Add release dates to every merged category table, joining on the model key.
# pd.merge defaults to an inner join, so models without an entry in
# release_date_mapping are dropped from the result.
#
# Any existing "Release Date" column is dropped first so the cell can be re-run
# safely; without that, a second run would produce "Release Date_x" /
# "Release Date_y" columns instead of a single "Release Date".
for category_name, category_df in list(merged_dfs.items()):
    merged_dfs[category_name] = pd.merge(
        category_df.drop(columns="Release Date", errors="ignore"),
        release_date_mapping[["key", "Release Date"]],
        on="key",
    )

In [17]:
# Confirm that "Release Date" is now part of the merged table's columns.
merged_dfs["Overall"].columns

Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',
       'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',
       'Knowledge cutoff date', 'License', 'Organization', 'Link',
       'Release Date'],
      dtype='object')

In [77]:
def format_data(df, proprietary_licenses=None):
    """
    Return a formatted copy of a merged leaderboard table, ready for plotting.

    The input frame is left untouched. Previously the columns were overwritten
    in place, which silently changed the caller's tables and made a second
    license mapping classify every model as "Open LLM".

    Args:
        df (pd.DataFrame): Table with at least the columns "License",
            "Release Date" and "rating".
        proprietary_licenses (iterable of str, optional): License names that
            count as proprietary. When omitted, the notebook-level
            ``PROPRIETARY_LICENSES`` is used if it has been defined, otherwise
            ``("Proprietary", "Non-commercial")``, so the function also works
            on a fresh kernel before that constant's cell has run.

    Returns:
        pd.DataFrame: A new frame with a fresh 0..n-1 index in which
            - "License" is collapsed to "Proprietary LLM" / "Open LLM",
            - "Release Date" is parsed with ``pd.to_datetime``,
            - "Month-Year" holds the monthly period of the release date,
            - "rating" is rounded to the nearest integer value.
    """
    if proprietary_licenses is None:
        proprietary_licenses = globals().get(
            "PROPRIETARY_LICENSES", ("Proprietary", "Non-commercial")
        )
    # An already-collapsed "Proprietary LLM" label maps to itself, so feeding
    # a previously formatted table through again is a no-op for this column.
    closed_labels = [*proprietary_licenses, "Proprietary LLM"]

    formatted = df.copy()
    formatted["License"] = formatted["License"].apply(
        lambda x: "Proprietary LLM" if x in closed_labels else "Open LLM"
    )
    formatted["Release Date"] = pd.to_datetime(formatted["Release Date"])
    formatted["Month-Year"] = formatted["Release Date"].dt.to_period("M")
    formatted["rating"] = formatted["rating"].round()
    return formatted.reset_index(drop=True)


# Apply format_data to every category table, keeping the same category names as keys.
merged_dfs2 = {k: format_data(v) for k, v in merged_dfs.items()}

In [81]:
# For each category, print the size of the largest (release date, license)
# group, followed by a blank line.
# NOTE(review): the grouping uses the exact "Release Date", not "Month-Year";
# confirm which one the "upper_models_per_month" setting is meant to reflect.
for k, df in merged_dfs2.items():
    print(int(df.groupby(["Release Date", "License"])["rating"].size().max()))
    print()

5
5
5
5
5
5
5
5
5


## Build plot

In [76]:
# Hard-coded per-category settings, keyed by the category display names.
# Each entry stores the minimum and maximum ELO score and an upper bound on
# the number of models per month.
# NOTE(review): the numbers are literals; presumably copied from an earlier
# exploratory run -- verify before relying on them after a data refresh.
# NOTE(review): this rebinds `t`, which earlier in the notebook named the
# release-date scaffold DataFrame.
t = {
    "Overall": {
        "min_elo_score": 804.0,
        "max_elo_score": 1259.0,
        "upper_models_per_month": 5,
    },
    "Coding": {
        "min_elo_score": 672.0,
        "max_elo_score": 1270.0,
        "upper_models_per_month": 5,
    },
    "Longer Query": {
        "min_elo_score": 796.0,
        "max_elo_score": 1273.0,
        "upper_models_per_month": 5,
    },
    "English": {
        "min_elo_score": 783.0,
        "max_elo_score": 1246.0,
        "upper_models_per_month": 5,
    },
    "Chinese": {
        "min_elo_score": 753.0,
        "max_elo_score": 1325.0,
        "upper_models_per_month": 5,
    },
    "French": {
        "min_elo_score": 694.0,
        "max_elo_score": 1268.0,
        "upper_models_per_month": 5,
    },
    "Exclude Ties": {
        "min_elo_score": 654.0,
        "max_elo_score": 1334.0,
        "upper_models_per_month": 5,
    },
    "Exclude Short Query (< 5 tokens)": {
        "min_elo_score": 796.0,
        "max_elo_score": 1264.0,
        "upper_models_per_month": 5,
    },
    "Exclude Refusal": {
        "min_elo_score": 795.0,
        "max_elo_score": 1264.0,
        "upper_models_per_month": 5,
    },
}

In [None]:
# Schema sketch only (the angle-bracket placeholders are not valid Python, so
# this is kept as a comment to let the notebook run top to bottom).
# Each per-category settings entry has the shape:
#
# o = {
#     "min_elo_score": <minimum>,
#     "max_elo_score": <maximum>,
#     "upper_models_per_month": <maximum>,
# }

In [49]:
# License names that are treated as proprietary when collapsing "License".
PROPRIETARY_LICENSES = [
    "Proprietary",
    "Non-commercial",
]

# Work on a copy so that merged_dfs["Overall"] keeps its raw license names;
# mutating it in place made this cell non-repeatable (a second run classified
# every model as "Open LLM").
df = merged_dfs["Overall"].copy()

# An already-collapsed "Proprietary LLM" label is accepted as well, so the
# mapping gives the same result if the source table was formatted before.
df["License"] = df["License"].apply(
    lambda x: (
        "Proprietary LLM"
        if x in PROPRIETARY_LICENSES or x == "Proprietary LLM"
        else "Open LLM"
    )
)

# The plot colours and hovers by "license_binary", but no cell created that
# column. The displayed outputs show it carrying the same two labels as
# "License", so it is derived from it here.
df["license_binary"] = df["License"]

df["Release Date"] = pd.to_datetime(df["Release Date"])

In [57]:
# Bucket every release date into its calendar month (monthly pandas Period).
df["Month-Year"] = df["Release Date"].dt.to_period("M")

In [66]:
# Largest number of rated models that share one (month, license) group.
df.groupby(["Month-Year", "License"])["rating"].count().max()

8

In [69]:
# Inspect the models whose release month is January 2024.
df[df["Month-Year"] == "2024-01"]

Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking,key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link,Release Date,license_binary,Month-Year
4,1246.777591,1.942477,1249.979712,1244.305362,67354,2,gpt-4-0125-preview,GPT-4-0125-preview,-,-,2023/12,Proprietary LLM,OpenAI,https://openai.com/blog/new-models-and-develop...,2024-01-25,Proprietary LLM,2024-01
32,1111.13264,7.801741,1115.356993,1105.658254,13177,29,yi-34b-chat,Yi-34B-Chat,-,0.735,2023/6,Open LLM,01 AI,https://huggingface.co/01-ai/Yi-34B-Chat,2024-01-23,Open LLM,2024-01
36,1107.12981,2.419182,1110.056188,1104.002581,47220,32,gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125,-,-,2021/9,Proprietary LLM,OpenAI,https://platform.openai.com/docs/models/gpt-3-...,2024-01-25,Proprietary LLM,2024-01
39,1098.527455,6.400166,1103.343592,1093.903695,14159,36,openchat-3.5-0106,OpenChat-3.5-0106,7.8,0.658,2024/1,Open LLM,OpenChat,https://huggingface.co/openchat/openchat-3.5-0106,2024-01-06,Open LLM,2024-01
43,1087.307758,18.314258,1094.532598,1078.413814,3980,40,nous-hermes-2-mixtral-8x7b-dpo,Nous-Hermes-2-Mixtral-8x7B-DPO,-,-,2024/1,Open LLM,NousResearch,https://huggingface.co/NousResearch/Nous-Herme...,2024-01-13,Open LLM,2024-01
60,1047.927688,60.707225,1061.952116,1034.283514,1321,55,codellama-70b-instruct,CodeLlama-70B-instruct,-,-,2024/1,Open LLM,Meta,https://huggingface.co/codellama/CodeLlama-70b-hf,2024-01-29,Open LLM,2024-01


In [70]:
# Top three models by rating for each (month, license) group in January 2024.
# Sorting and then taking the head of every group replaces
# groupby(...).apply(nlargest): pandas 2.2 deprecates apply operating on the
# grouping columns. Groups appear in key order and rows within a group run from
# highest to lowest rating, as before.
(
    df[df["Month-Year"] == "2024-01"]
    .sort_values(["Month-Year", "License", "rating"], ascending=[True, True, False])
    .groupby(["Month-Year", "License"])
    .head(3)
    .reset_index(drop=True)
)





Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking,key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link,Release Date,license_binary,Month-Year
0,1111.13264,7.801741,1115.356993,1105.658254,13177,29,yi-34b-chat,Yi-34B-Chat,-,0.735,2023/6,Open LLM,01 AI,https://huggingface.co/01-ai/Yi-34B-Chat,2024-01-23,Open LLM,2024-01
1,1098.527455,6.400166,1103.343592,1093.903695,14159,36,openchat-3.5-0106,OpenChat-3.5-0106,7.8,0.658,2024/1,Open LLM,OpenChat,https://huggingface.co/openchat/openchat-3.5-0106,2024-01-06,Open LLM,2024-01
2,1087.307758,18.314258,1094.532598,1078.413814,3980,40,nous-hermes-2-mixtral-8x7b-dpo,Nous-Hermes-2-Mixtral-8x7B-DPO,-,-,2024/1,Open LLM,NousResearch,https://huggingface.co/NousResearch/Nous-Herme...,2024-01-13,Open LLM,2024-01
3,1246.777591,1.942477,1249.979712,1244.305362,67354,2,gpt-4-0125-preview,GPT-4-0125-preview,-,-,2023/12,Proprietary LLM,OpenAI,https://openai.com/blog/new-models-and-develop...,2024-01-25,Proprietary LLM,2024-01
4,1107.12981,2.419182,1110.056188,1104.002581,47220,32,gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125,-,-,2021/9,Proprietary LLM,OpenAI,https://platform.openai.com/docs/models/gpt-3-...,2024-01-25,Proprietary LLM,2024-01


In [51]:
# List the column labels of the working frame used for plotting.
df.keys()

Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',
       'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',
       'Knowledge cutoff date', 'License', 'Organization', 'Link',
       'Release Date', 'license_binary'],
      dtype='object')

In [56]:
import plotly.express as px
import plotly.graph_objects as go

# Plotting
# Scatter of Arena ELO rating against release date, one point per model,
# coloured by the "license_binary" column (which must exist in df).
# NOTE(review): the date in the title is hard-coded; update it when the data
# is refreshed.
fig = px.scatter(
    df,
    x="Release Date",
    y="rating",
    color="license_binary",
    hover_name="Model",
    hover_data=[
        "Release Date",
        "Organization",
        "License",
        "license_binary",
    ],
    title="Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)",
    labels={"rating": "Arena ELO", "Release Date": "Release Date"},
    template="plotly_white",
)
# Use the same marker size for every trace.
fig.update_traces(marker=dict(size=8))

# Display the plot
fig.show()

In [37]:
# Debug check: show the type of the figure object created above.
type(fig)

plotly.graph_objs._figure.Figure