from typing import Literal


def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Download the most recent data file of the given type from a Hugging Face Space.

    Candidate files are found by globbing ``spaces/{repo_id}/*.{file_type}``, i.e.
    only files at the root of the Space. "Most recent" means the file whose stem
    ends with the greatest ``_``-separated token when compared as a string
    (presumably a ``YYYYMMDD`` date stamp -- verify against the Space's naming).

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): Extension of the data file to download.

    Returns:
        str: The local file path of the downloaded data file.

    Raises:
        FileNotFoundError: If the Space contains no file with that extension.
    """

    def extract_date(filename):
        # "spaces/org/name/elo_results_20240426.pkl" -> "20240426"
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    data_file_path = f"spaces/{repo_id}/*.{file_type}"
    files = fs.glob(data_file_path)
    if not files:
        # Fail with a message that names the pattern instead of a bare IndexError.
        raise FileNotFoundError(
            f"No '*.{file_type}' files found in space '{repo_id}' "
            f"(pattern: {data_file_path})"
        )

    # max() returns the first maximal element, which is the same file that
    # sorted(..., reverse=True)[0] would pick, without sorting the whole list.
    latest_file = max(files, key=extract_date)

    return hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )
# Load and prepare ELO data.
# Keys found in the ELO results object, mapped to the category names used as
# keys of `arena_dfs`.
key_to_category_name = {
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}
# Longer human-readable description for each category name.
cat_name_to_explanation = {
    "Overall": "Overall Questions",
    "Coding": "Coding: whether conversation contains code snippets",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
}

# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# This file was downloaded from a remote Space, so only load it if that source
# is trusted.
with open(latest_elo_file_local, "rb") as elo_file:
    elo_results = pickle.load(elo_file)

# One leaderboard table per category; categories absent from the results are
# skipped.
arena_dfs = {
    category_name: elo_results[result_key]["leaderboard_table_df"]
    for result_key, category_name in key_to_category_name.items()
    if result_key in elo_results
}
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ratingvariancerating_q975rating_q025num_battlesfinal_ranking
RWKV-4-Raven-14B928.45125126.146415937.017097919.444359512982
alpaca-13b908.08435918.598539915.348707900.602847611186
bard-jan-24-gemini-pro1208.7128777.9752961213.3315831203.004139123876
chatglm-6b886.87342919.813751894.785321878.677878519587
chatglm2-6b933.33728833.939472944.493496921.470740288082
.....................
wizardlm-70b1108.5527448.9880051114.3906891102.745236886729
yi-34b-chat1111.1326407.8017411115.3569931105.6582541317729
zephyr-7b-alpha1043.08426745.4720211054.2699541027.602171190157
zephyr-7b-beta1054.41630011.0946061060.2650721047.7905091192455
zephyr-orpo-141b-A35b-v0.11128.81633716.9643851134.8626801119.183571520722
\n", - "

92 rows × 6 columns

\n", - "
" - ], - "text/plain": [ - " rating variance rating_q975 rating_q025 \\\n", - "RWKV-4-Raven-14B 928.451251 26.146415 937.017097 919.444359 \n", - "alpaca-13b 908.084359 18.598539 915.348707 900.602847 \n", - "bard-jan-24-gemini-pro 1208.712877 7.975296 1213.331583 1203.004139 \n", - "chatglm-6b 886.873429 19.813751 894.785321 878.677878 \n", - "chatglm2-6b 933.337288 33.939472 944.493496 921.470740 \n", - "... ... ... ... ... \n", - "wizardlm-70b 1108.552744 8.988005 1114.390689 1102.745236 \n", - "yi-34b-chat 1111.132640 7.801741 1115.356993 1105.658254 \n", - "zephyr-7b-alpha 1043.084267 45.472021 1054.269954 1027.602171 \n", - "zephyr-7b-beta 1054.416300 11.094606 1060.265072 1047.790509 \n", - "zephyr-orpo-141b-A35b-v0.1 1128.816337 16.964385 1134.862680 1119.183571 \n", - "\n", - " num_battles final_ranking \n", - "RWKV-4-Raven-14B 5129 82 \n", - "alpaca-13b 6111 86 \n", - "bard-jan-24-gemini-pro 12387 6 \n", - "chatglm-6b 5195 87 \n", - "chatglm2-6b 2880 82 \n", - "... ... ... \n", - "wizardlm-70b 8867 29 \n", - "yi-34b-chat 13177 29 \n", - "zephyr-7b-alpha 1901 57 \n", - "zephyr-7b-beta 11924 55 \n", - "zephyr-orpo-141b-A35b-v0.1 5207 22 \n", - "\n", - "[92 rows x 6 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "arena_dfs[\"Overall\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# load and prepare Leaderboard data\n", - "leaderboard_df = pd.read_csv(latest_leaderboard_file_local)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
keyModelMT-bench (score)MMLUKnowledge cutoff dateLicenseOrganizationLink
0wizardlm-30bWizardLM-30B7.010.5872023/6Non-commercialMicrosofthttps://huggingface.co/WizardLM/WizardLM-30B-V1.0
1vicuna-13b-16kVicuna-13B-16k6.920.5452023/7Llama 2 CommunityLMSYShttps://huggingface.co/lmsys/vicuna-13b-v1.5-16k
2wizardlm-13b-v1.1WizardLM-13B-v1.16.760.5002023/7Non-commercialMicrosofthttps://huggingface.co/WizardLM/WizardLM-13B-V1.1
3tulu-30bTulu-30B6.430.5812023/6Non-commercialAllenAI/UWhttps://huggingface.co/allenai/tulu-30b
4guanaco-65bGuanaco-65B6.410.6212023/5Non-commercialUWhttps://huggingface.co/timdettmers/guanaco-65b...
...........................
101llama-3-70b-instructLlama-3-70b-Instruct-0.8202023/12Llama 3 CommunityMetahttps://llama.meta.com/llama3/
102llama-3-8b-instructLlama-3-8b-Instruct-0.6842023/3Llama 3 CommunityMetahttps://llama.meta.com/llama3/
103gemini-1.5-pro-api-0409-previewGemini 1.5 Pro API-0409-Preview-0.8192023/11ProprietaryGooglehttps://blog.google/technology/ai/google-gemin...
104phi-3-mini-128k-instructPhi-3-Mini-128k-Instruct-0.6812023/10MITMicrosofthttps://azure.microsoft.com/en-us/blog/introdu...
105snowflake-arctic-instructSnowflake Arctic Instruct-0.6732024/4Apache 2.0Snowflakehttps://www.snowflake.com/blog/arctic-open-eff...
\n", - "

106 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " key Model \\\n", - "0 wizardlm-30b WizardLM-30B \n", - "1 vicuna-13b-16k Vicuna-13B-16k \n", - "2 wizardlm-13b-v1.1 WizardLM-13B-v1.1 \n", - "3 tulu-30b Tulu-30B \n", - "4 guanaco-65b Guanaco-65B \n", - ".. ... ... \n", - "101 llama-3-70b-instruct Llama-3-70b-Instruct \n", - "102 llama-3-8b-instruct Llama-3-8b-Instruct \n", - "103 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n", - "104 phi-3-mini-128k-instruct Phi-3-Mini-128k-Instruct \n", - "105 snowflake-arctic-instruct Snowflake Arctic Instruct \n", - "\n", - " MT-bench (score) MMLU Knowledge cutoff date License \\\n", - "0 7.01 0.587 2023/6 Non-commercial \n", - "1 6.92 0.545 2023/7 Llama 2 Community \n", - "2 6.76 0.500 2023/7 Non-commercial \n", - "3 6.43 0.581 2023/6 Non-commercial \n", - "4 6.41 0.621 2023/5 Non-commercial \n", - ".. ... ... ... ... \n", - "101 - 0.820 2023/12 Llama 3 Community \n", - "102 - 0.684 2023/3 Llama 3 Community \n", - "103 - 0.819 2023/11 Proprietary \n", - "104 - 0.681 2023/10 MIT \n", - "105 - 0.673 2024/4 Apache 2.0 \n", - "\n", - " Organization Link \n", - "0 Microsoft https://huggingface.co/WizardLM/WizardLM-30B-V1.0 \n", - "1 LMSYS https://huggingface.co/lmsys/vicuna-13b-v1.5-16k \n", - "2 Microsoft https://huggingface.co/WizardLM/WizardLM-13B-V1.1 \n", - "3 AllenAI/UW https://huggingface.co/allenai/tulu-30b \n", - "4 UW https://huggingface.co/timdettmers/guanaco-65b... \n", - ".. ... ... \n", - "101 Meta https://llama.meta.com/llama3/ \n", - "102 Meta https://llama.meta.com/llama3/ \n", - "103 Google https://blog.google/technology/ai/google-gemin... \n", - "104 Microsoft https://azure.microsoft.com/en-us/blog/introdu... \n", - "105 Snowflake https://www.snowflake.com/blog/arctic-open-eff... 
# Merge ELO and Leaderboard data.
# Each arena table is joined on its index against the leaderboard's "key"
# column (pandas' default inner join, so rows without a match on both sides are
# dropped), then ordered from highest to lowest rating with a fresh 0..n-1 index.
merged_dfs = {
    category: (
        arena_df.merge(leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    for category, arena_df in arena_dfs.items()
}
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ratingvariancerating_q975rating_q025num_battlesfinal_rankingkeyModelMT-bench (score)MMLUKnowledge cutoff dateLicenseOrganizationLink
01258.8152793.2581321262.7967131256.000508359311gpt-4-turbo-2024-04-09GPT-4-Turbo-2024-04-09--2023/12ProprietaryOpenAIhttps://platform.openai.com/docs/models/gpt-4-...
11252.6848861.7992331254.7483911249.873417735472gpt-4-1106-previewGPT-4-1106-preview9.32-2023/4ProprietaryOpenAIhttps://openai.com/blog/new-models-and-develop...
21250.9262062.0182011253.8518851248.166034809972claude-3-opus-20240229Claude 3 Opus-0.8682023/8ProprietaryAnthropichttps://www.anthropic.com/news/claude-3-family
31249.6183953.2331291252.9564971246.247080394822gemini-1.5-pro-api-0409-previewGemini 1.5 Pro API-0409-Preview-0.8192023/11ProprietaryGooglehttps://blog.google/technology/ai/google-gemin...
41246.7775911.9424771249.9797121244.305362673542gpt-4-0125-previewGPT-4-0125-preview--2023/12ProprietaryOpenAIhttps://openai.com/blog/new-models-and-develop...
.............................................
87886.87342919.813751894.785321878.677878519587chatglm-6bChatGLM-6B4.500.3612023/3Non-commercialTsinghuahttps://huggingface.co/THUDM/chatglm-6b
88876.92910827.115855887.355529866.860534452188fastchat-t5-3bFastChat-T5-3B3.040.4772023/4Apache 2.0LMSYShttps://huggingface.co/lmsys/fastchat-t5-3b-v1.0
89848.93256836.961459859.103936837.364341346190stablelm-tuned-alpha-7bStableLM-Tuned-Alpha-7B2.750.2442023/4CC-BY-NC-SA-4.0Stability AIhttps://huggingface.co/stabilityai/stablelm-tu...
90826.64733230.156414837.335988816.370788366691dolly-v2-12bDolly-V2-12B3.280.2572023/4MITDatabrickshttps://huggingface.co/databricks/dolly-v2-12b
91804.35632944.756983815.161492790.879536253892llama-13bLLaMA-13B2.610.4702023/2Non-commercialMetahttps://arxiv.org/abs/2302.13971
\n", - "

92 rows × 14 columns

\n", - "
" - ], - "text/plain": [ - " rating variance rating_q975 rating_q025 num_battles \\\n", - "0 1258.815279 3.258132 1262.796713 1256.000508 35931 \n", - "1 1252.684886 1.799233 1254.748391 1249.873417 73547 \n", - "2 1250.926206 2.018201 1253.851885 1248.166034 80997 \n", - "3 1249.618395 3.233129 1252.956497 1246.247080 39482 \n", - "4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n", - ".. ... ... ... ... ... \n", - "87 886.873429 19.813751 894.785321 878.677878 5195 \n", - "88 876.929108 27.115855 887.355529 866.860534 4521 \n", - "89 848.932568 36.961459 859.103936 837.364341 3461 \n", - "90 826.647332 30.156414 837.335988 816.370788 3666 \n", - "91 804.356329 44.756983 815.161492 790.879536 2538 \n", - "\n", - " final_ranking key \\\n", - "0 1 gpt-4-turbo-2024-04-09 \n", - "1 2 gpt-4-1106-preview \n", - "2 2 claude-3-opus-20240229 \n", - "3 2 gemini-1.5-pro-api-0409-preview \n", - "4 2 gpt-4-0125-preview \n", - ".. ... ... \n", - "87 87 chatglm-6b \n", - "88 88 fastchat-t5-3b \n", - "89 90 stablelm-tuned-alpha-7b \n", - "90 91 dolly-v2-12b \n", - "91 92 llama-13b \n", - "\n", - " Model MT-bench (score) MMLU \\\n", - "0 GPT-4-Turbo-2024-04-09 - - \n", - "1 GPT-4-1106-preview 9.32 - \n", - "2 Claude 3 Opus - 0.868 \n", - "3 Gemini 1.5 Pro API-0409-Preview - 0.819 \n", - "4 GPT-4-0125-preview - - \n", - ".. ... ... ... \n", - "87 ChatGLM-6B 4.50 0.361 \n", - "88 FastChat-T5-3B 3.04 0.477 \n", - "89 StableLM-Tuned-Alpha-7B 2.75 0.244 \n", - "90 Dolly-V2-12B 3.28 0.257 \n", - "91 LLaMA-13B 2.61 0.470 \n", - "\n", - " Knowledge cutoff date License Organization \\\n", - "0 2023/12 Proprietary OpenAI \n", - "1 2023/4 Proprietary OpenAI \n", - "2 2023/8 Proprietary Anthropic \n", - "3 2023/11 Proprietary Google \n", - "4 2023/12 Proprietary OpenAI \n", - ".. ... ... ... 
\n", - "87 2023/3 Non-commercial Tsinghua \n", - "88 2023/4 Apache 2.0 LMSYS \n", - "89 2023/4 CC-BY-NC-SA-4.0 Stability AI \n", - "90 2023/4 MIT Databricks \n", - "91 2023/2 Non-commercial Meta \n", - "\n", - " Link \n", - "0 https://platform.openai.com/docs/models/gpt-4-... \n", - "1 https://openai.com/blog/new-models-and-develop... \n", - "2 https://www.anthropic.com/news/claude-3-family \n", - "3 https://blog.google/technology/ai/google-gemin... \n", - "4 https://openai.com/blog/new-models-and-develop... \n", - ".. ... \n", - "87 https://huggingface.co/THUDM/chatglm-6b \n", - "88 https://huggingface.co/lmsys/fastchat-t5-3b-v1.0 \n", - "89 https://huggingface.co/stabilityai/stablelm-tu... \n", - "90 https://huggingface.co/databricks/dolly-v2-12b \n", - "91 https://arxiv.org/abs/2302.13971 \n", - "\n", - "[92 rows x 14 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "merged_dfs[\"Overall\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Manually map release dates - MEH." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "t = merged_dfs[\"Overall\"].loc[:, [\"key\", \"Model\"]]\n", - "t[\"Release Date\"] = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "release_date_mapping = pd.read_json(\"release_date_mapping.json\", orient=\"records\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
keyModelRelease Date
0gpt-4-turbo-2024-04-09GPT-4-Turbo-2024-04-092024-04-09
1gpt-4-1106-previewGPT-4-1106-preview2023-11-06
2claude-3-opus-20240229Claude 3 Opus2024-02-29
3gemini-1.5-pro-api-0409-previewGemini 1.5 Pro API-0409-Preview2024-04-09
4gpt-4-0125-previewGPT-4-0125-preview2024-01-25
............
86chatglm-6bChatGLM-6B2023-03-13
87fastchat-t5-3bFastChat-T5-3B2023-04-27
88stablelm-tuned-alpha-7bStableLM-Tuned-Alpha-7B2023-04-19
89dolly-v2-12bDolly-V2-12B2023-04-12
90llama-13bLLaMA-13B2023-02-27
\n", - "

91 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " key Model \\\n", - "0 gpt-4-turbo-2024-04-09 GPT-4-Turbo-2024-04-09 \n", - "1 gpt-4-1106-preview GPT-4-1106-preview \n", - "2 claude-3-opus-20240229 Claude 3 Opus \n", - "3 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n", - "4 gpt-4-0125-preview GPT-4-0125-preview \n", - ".. ... ... \n", - "86 chatglm-6b ChatGLM-6B \n", - "87 fastchat-t5-3b FastChat-T5-3B \n", - "88 stablelm-tuned-alpha-7b StableLM-Tuned-Alpha-7B \n", - "89 dolly-v2-12b Dolly-V2-12B \n", - "90 llama-13b LLaMA-13B \n", - "\n", - " Release Date \n", - "0 2024-04-09 \n", - "1 2023-11-06 \n", - "2 2024-02-29 \n", - "3 2024-04-09 \n", - "4 2024-01-25 \n", - ".. ... \n", - "86 2023-03-13 \n", - "87 2023-04-27 \n", - "88 2023-04-19 \n", - "89 2023-04-12 \n", - "90 2023-02-27 \n", - "\n", - "[91 rows x 3 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "release_date_mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
keyRelease Date
0gpt-4-turbo-2024-04-092024-04-09
1gpt-4-1106-preview2023-11-06
2claude-3-opus-202402292024-02-29
3gemini-1.5-pro-api-0409-preview2024-04-09
4gpt-4-0125-preview2024-01-25
.........
86chatglm-6b2023-03-13
87fastchat-t5-3b2023-04-27
88stablelm-tuned-alpha-7b2023-04-19
89dolly-v2-12b2023-04-12
90llama-13b2023-02-27
\n", - "

91 rows × 2 columns

\n", - "
# Raw license names that are grouped under "Proprietary LLM"; every other
# license is labelled "Open LLM". Defined next to format_data so the function
# does not depend on a cell that appears later in the notebook.
PROPRIETARY_LICENSES = [
    "Proprietary",
    "Non-commercial",
]


def format_data(df, proprietary_licenses=None):
    """
    Return a plot-ready copy of a merged leaderboard frame.

    The input frame is left untouched. Calling the function again on its own
    output gives the same result: licenses that already carry one of the two
    final labels are kept as they are rather than being relabelled.

    Args:
        df (pd.DataFrame): Frame with "License", "Release Date" and "rating"
            columns.
        proprietary_licenses (Iterable[str] | None): Raw license names to label
            "Proprietary LLM". Defaults to the module-level PROPRIETARY_LICENSES.

    Returns:
        pd.DataFrame: A new frame with a reset 0..n-1 index where
            - "License" is either "Proprietary LLM" or "Open LLM",
            - "Release Date" is parsed with pd.to_datetime,
            - "Month-Year" is the monthly period of "Release Date",
            - "rating" is rounded to the nearest whole number.
    """
    if proprietary_licenses is None:
        proprietary_licenses = PROPRIETARY_LICENSES

    proprietary_label, open_label = "Proprietary LLM", "Open LLM"

    def to_label(license_name):
        # Already formatted (e.g. the frame went through this function before):
        # keep the label, otherwise "Proprietary LLM" would turn into "Open LLM".
        if license_name in (proprietary_label, open_label):
            return license_name
        return proprietary_label if license_name in proprietary_licenses else open_label

    release_dates = pd.to_datetime(df["Release Date"])
    # assign() builds a new frame, so the caller's data is never mutated.
    formatted = df.assign(
        **{
            "License": df["License"].map(to_label),
            "Release Date": release_dates,
            "Month-Year": release_dates.dt.to_period("M"),
            "rating": df["rating"].round(),
        }
    )
    return formatted.reset_index(drop=True)
# Raw license names that are grouped under "Proprietary LLM"; every other
# license is labelled "Open LLM".
PROPRIETARY_LICENSES = [
    "Proprietary",
    "Non-commercial",
]

# Work on a copy: relabelling the shared merged_dfs["Overall"] frame in place
# would change what every other cell that reads merged_dfs sees, and
# relabelling an already-relabelled frame turns every model into "Open LLM".
df = merged_dfs["Overall"].copy()
df["License"] = df["License"].apply(
    lambda x: "Proprietary LLM" if x in PROPRIETARY_LICENSES else "Open LLM"
)
df["Release Date"] = pd.to_datetime(df["Release Date"])
df["Month-Year"] = df["Release Date"].dt.to_period("M")

# Largest number of rated models in any single (month, license group) pair.
df.groupby(["Month-Year", "License"])["rating"].count().max()
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ratingvariancerating_q975rating_q025num_battlesfinal_rankingkeyModelMT-bench (score)MMLUKnowledge cutoff dateLicenseOrganizationLinkRelease Datelicense_binaryMonth-Year
41246.7775911.9424771249.9797121244.305362673542gpt-4-0125-previewGPT-4-0125-preview--2023/12Proprietary LLMOpenAIhttps://openai.com/blog/new-models-and-develop...2024-01-25Proprietary LLM2024-01
321111.1326407.8017411115.3569931105.6582541317729yi-34b-chatYi-34B-Chat-0.7352023/6Open LLM01 AIhttps://huggingface.co/01-ai/Yi-34B-Chat2024-01-23Open LLM2024-01
361107.1298102.4191821110.0561881104.0025814722032gpt-3.5-turbo-0125GPT-3.5-Turbo-0125--2021/9Proprietary LLMOpenAIhttps://platform.openai.com/docs/models/gpt-3-...2024-01-25Proprietary LLM2024-01
391098.5274556.4001661103.3435921093.9036951415936openchat-3.5-0106OpenChat-3.5-01067.80.6582024/1Open LLMOpenChathttps://huggingface.co/openchat/openchat-3.5-01062024-01-06Open LLM2024-01
431087.30775818.3142581094.5325981078.413814398040nous-hermes-2-mixtral-8x7b-dpoNous-Hermes-2-Mixtral-8x7B-DPO--2024/1Open LLMNousResearchhttps://huggingface.co/NousResearch/Nous-Herme...2024-01-13Open LLM2024-01
601047.92768860.7072251061.9521161034.283514132155codellama-70b-instructCodeLlama-70B-instruct--2024/1Open LLMMetahttps://huggingface.co/codellama/CodeLlama-70b-hf2024-01-29Open LLM2024-01
\n", - "
" - ], - "text/plain": [ - " rating variance rating_q975 rating_q025 num_battles \\\n", - "4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n", - "32 1111.132640 7.801741 1115.356993 1105.658254 13177 \n", - "36 1107.129810 2.419182 1110.056188 1104.002581 47220 \n", - "39 1098.527455 6.400166 1103.343592 1093.903695 14159 \n", - "43 1087.307758 18.314258 1094.532598 1078.413814 3980 \n", - "60 1047.927688 60.707225 1061.952116 1034.283514 1321 \n", - "\n", - " final_ranking key \\\n", - "4 2 gpt-4-0125-preview \n", - "32 29 yi-34b-chat \n", - "36 32 gpt-3.5-turbo-0125 \n", - "39 36 openchat-3.5-0106 \n", - "43 40 nous-hermes-2-mixtral-8x7b-dpo \n", - "60 55 codellama-70b-instruct \n", - "\n", - " Model MT-bench (score) MMLU \\\n", - "4 GPT-4-0125-preview - - \n", - "32 Yi-34B-Chat - 0.735 \n", - "36 GPT-3.5-Turbo-0125 - - \n", - "39 OpenChat-3.5-0106 7.8 0.658 \n", - "43 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n", - "60 CodeLlama-70B-instruct - - \n", - "\n", - " Knowledge cutoff date License Organization \\\n", - "4 2023/12 Proprietary LLM OpenAI \n", - "32 2023/6 Open LLM 01 AI \n", - "36 2021/9 Proprietary LLM OpenAI \n", - "39 2024/1 Open LLM OpenChat \n", - "43 2024/1 Open LLM NousResearch \n", - "60 2024/1 Open LLM Meta \n", - "\n", - " Link Release Date \\\n", - "4 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n", - "32 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n", - "36 https://platform.openai.com/docs/models/gpt-3-... 2024-01-25 \n", - "39 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n", - "43 https://huggingface.co/NousResearch/Nous-Herme... 
2024-01-13 \n", - "60 https://huggingface.co/codellama/CodeLlama-70b-hf 2024-01-29 \n", - "\n", - " license_binary Month-Year \n", - "4 Proprietary LLM 2024-01 \n", - "32 Open LLM 2024-01 \n", - "36 Proprietary LLM 2024-01 \n", - "39 Open LLM 2024-01 \n", - "43 Open LLM 2024-01 \n", - "60 Open LLM 2024-01 " - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"Month-Year\"] == \"2024-01\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/w0/6t9rxkj97rv47l9sc0q22yth0000gn/T/ipykernel_7726/1725500526.py:1: DeprecationWarning:\n", - "\n", - "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ratingvariancerating_q975rating_q025num_battlesfinal_rankingkeyModelMT-bench (score)MMLUKnowledge cutoff dateLicenseOrganizationLinkRelease Datelicense_binaryMonth-Year
01111.1326407.8017411115.3569931105.6582541317729yi-34b-chatYi-34B-Chat-0.7352023/6Open LLM01 AIhttps://huggingface.co/01-ai/Yi-34B-Chat2024-01-23Open LLM2024-01
11098.5274556.4001661103.3435921093.9036951415936openchat-3.5-0106OpenChat-3.5-01067.80.6582024/1Open LLMOpenChathttps://huggingface.co/openchat/openchat-3.5-01062024-01-06Open LLM2024-01
21087.30775818.3142581094.5325981078.413814398040nous-hermes-2-mixtral-8x7b-dpoNous-Hermes-2-Mixtral-8x7B-DPO--2024/1Open LLMNousResearchhttps://huggingface.co/NousResearch/Nous-Herme...2024-01-13Open LLM2024-01
31246.7775911.9424771249.9797121244.305362673542gpt-4-0125-previewGPT-4-0125-preview--2023/12Proprietary LLMOpenAIhttps://openai.com/blog/new-models-and-develop...2024-01-25Proprietary LLM2024-01
41107.1298102.4191821110.0561881104.0025814722032gpt-3.5-turbo-0125GPT-3.5-Turbo-0125--2021/9Proprietary LLMOpenAIhttps://platform.openai.com/docs/models/gpt-3-...2024-01-25Proprietary LLM2024-01
\n", - "
" - ], - "text/plain": [ - " rating variance rating_q975 rating_q025 num_battles \\\n", - "0 1111.132640 7.801741 1115.356993 1105.658254 13177 \n", - "1 1098.527455 6.400166 1103.343592 1093.903695 14159 \n", - "2 1087.307758 18.314258 1094.532598 1078.413814 3980 \n", - "3 1246.777591 1.942477 1249.979712 1244.305362 67354 \n", - "4 1107.129810 2.419182 1110.056188 1104.002581 47220 \n", - "\n", - " final_ranking key \\\n", - "0 29 yi-34b-chat \n", - "1 36 openchat-3.5-0106 \n", - "2 40 nous-hermes-2-mixtral-8x7b-dpo \n", - "3 2 gpt-4-0125-preview \n", - "4 32 gpt-3.5-turbo-0125 \n", - "\n", - " Model MT-bench (score) MMLU \\\n", - "0 Yi-34B-Chat - 0.735 \n", - "1 OpenChat-3.5-0106 7.8 0.658 \n", - "2 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n", - "3 GPT-4-0125-preview - - \n", - "4 GPT-3.5-Turbo-0125 - - \n", - "\n", - " Knowledge cutoff date License Organization \\\n", - "0 2023/6 Open LLM 01 AI \n", - "1 2024/1 Open LLM OpenChat \n", - "2 2024/1 Open LLM NousResearch \n", - "3 2023/12 Proprietary LLM OpenAI \n", - "4 2021/9 Proprietary LLM OpenAI \n", - "\n", - " Link Release Date \\\n", - "0 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n", - "1 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n", - "2 https://huggingface.co/NousResearch/Nous-Herme... 2024-01-13 \n", - "3 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n", - "4 https://platform.openai.com/docs/models/gpt-3-... 
2024-01-25 \n", - "\n", - " license_binary Month-Year \n", - "0 Open LLM 2024-01 \n", - "1 Open LLM 2024-01 \n", - "2 Open LLM 2024-01 \n", - "3 Proprietary LLM 2024-01 \n", - "4 Proprietary LLM 2024-01 " - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"Month-Year\"] == \"2024-01\"].groupby([\"Month-Year\", \"License\"]).apply(\n", - " lambda x: x.nlargest(3, \"rating\")\n", - ").reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',\n", - " 'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',\n", - " 'Knowledge cutoff date', 'License', 'Organization', 'Link',\n", - " 'Release Date', 'license_binary'],\n", - " dtype='object')" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "customdata": [ - [ - "OpenAI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "OpenAI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Anthropic", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Google", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "OpenAI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Google", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Anthropic", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "OpenAI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Anthropic", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "OpenAI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Mistral", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Reka AI", - "Proprietary LLM", - "Proprietary 
LLM" - ], - [ - "Anthropic", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Mistral", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Reka AI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Google", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Anthropic", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Mistral", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "OpenAI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Anthropic", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Google", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Anthropic", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "OpenAI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "OpenAI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "LMSYS", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Perplexity AI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "OpenAI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Perplexity AI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "UW", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Google", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "UC Berkeley", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Nomic AI", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Stanford", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Tsinghua", - "Proprietary LLM", - "Proprietary LLM" - ], - [ - "Meta", - "Proprietary LLM", - "Proprietary LLM" - ] - ], - "hovertemplate": "%{hovertext}

license_binary=%{customdata[2]}
Release Date=%{x}
Arena ELO=%{y}
Organization=%{customdata[0]}
License=%{customdata[1]}", - "hovertext": [ - "GPT-4-Turbo-2024-04-09", - "GPT-4-1106-preview", - "Claude 3 Opus", - "Gemini 1.5 Pro API-0409-Preview", - "GPT-4-0125-preview", - "Bard (Gemini Pro)", - "Claude 3 Sonnet", - "GPT-4-0314", - "Claude 3 Haiku", - "GPT-4-0613", - "Mistral-Large-2402", - "Reka-Flash-21B-online", - "Claude-1", - "Mistral Medium", - "Reka-Flash-21B", - "Gemini Pro (Dev API)", - "Claude-2.0", - "Mistral-Next", - "GPT-3.5-Turbo-0613", - "Claude-2.1", - "Gemini Pro", - "Claude-Instant-1", - "GPT-3.5-Turbo-0314", - "GPT-3.5-Turbo-0125", - "Vicuna-33B", - "pplx-70b-online", - "GPT-3.5-Turbo-1106", - "pplx-7b-online", - "Guanaco-33B", - "PaLM-Chat-Bison-001", - "Koala-13B", - "GPT4All-13B-Snoozy", - "Alpaca-13B", - "ChatGLM-6B", - "LLaMA-13B" - ], - "legendgroup": "Proprietary LLM", - "marker": { - "color": "#636efa", - "size": 8, - "symbol": "circle" - }, - "mode": "markers", - "name": "Proprietary LLM", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - "2024-04-09T00:00:00", - "2023-11-06T00:00:00", - "2024-02-29T00:00:00", - "2024-04-09T00:00:00", - "2024-01-25T00:00:00", - "2024-02-01T00:00:00", - "2024-02-29T00:00:00", - "2024-03-14T00:00:00", - "2024-03-07T00:00:00", - "2023-06-13T00:00:00", - "2024-02-24T00:00:00", - "2024-02-26T00:00:00", - "2023-03-14T00:00:00", - "2023-12-11T00:00:00", - "2024-02-26T00:00:00", - "2023-12-13T00:00:00", - "2023-07-11T00:00:00", - "2024-02-17T00:00:00", - "2023-06-13T00:00:00", - "2023-11-21T00:00:00", - "2023-12-13T00:00:00", - "2023-03-14T00:00:00", - "2024-03-14T00:00:00", - "2024-01-25T00:00:00", - "2023-06-21T00:00:00", - "2023-11-29T00:00:00", - "2023-11-06T00:00:00", - "2023-11-29T00:00:00", - "2023-05-22T00:00:00", - "2023-07-10T00:00:00", - "2023-04-03T00:00:00", - "2023-04-24T00:00:00", - "2023-03-13T00:00:00", - "2023-03-13T00:00:00", - "2023-02-27T00:00:00" - ], - "xaxis": "x", - "y": [ - 1258.8152791324715, - 1252.6848856241577, - 1250.9262064295565, - 
1249.6183945401244, - 1246.7775913509702, - 1208.7128773784577, - 1201.2654981955752, - 1189.557977031121, - 1180.8870022256567, - 1165.279013874706, - 1157.2129636222178, - 1153.368015144387, - 1150.6246111849628, - 1148.003325470259, - 1147.136619289767, - 1135.7254379948201, - 1132.3083987521873, - 1126.6887059695398, - 1119.8996424050451, - 1119.0708879096221, - 1115.3213731540973, - 1110.3806845414053, - 1108.9125926100855, - 1107.1298100300314, - 1093.8870113925889, - 1075.4285458870645, - 1072.711340370162, - 1043.3909111518306, - 1034.3952377983876, - 1009.7116452193085, - 969.48148016344, - 938.8924300511185, - 908.0843590844727, - 886.8734292498528, - 804.3563285706291 - ], - "yaxis": "y" - }, - { - "customdata": [ - [ - "Meta", - "Open LLM", - "Open LLM" - ], - [ - "Cohere", - "Open LLM", - "Open LLM" - ], - [ - "Meta", - "Open LLM", - "Open LLM" - ], - [ - "Alibaba", - "Open LLM", - "Open LLM" - ], - [ - "Cohere", - "Open LLM", - "Open LLM" - ], - [ - "Mistral", - "Open LLM", - "Open LLM" - ], - [ - "Alibaba", - "Open LLM", - "Open LLM" - ], - [ - "HuggingFace", - "Open LLM", - "Open LLM" - ], - [ - "Nexusflow", - "Open LLM", - "Open LLM" - ], - [ - "Alibaba", - "Open LLM", - "Open LLM" - ], - [ - "Mistral", - "Open LLM", - "Open LLM" - ], - [ - "01 AI", - "Open LLM", - "Open LLM" - ], - [ - "Microsoft", - "Open LLM", - "Open LLM" - ], - [ - "Databricks", - "Open LLM", - "Open LLM" - ], - [ - "AllenAI/UW", - "Open LLM", - "Open LLM" - ], - [ - "OpenChat", - "Open LLM", - "Open LLM" - ], - [ - "UC Berkeley", - "Open LLM", - "Open LLM" - ], - [ - "Meta", - "Open LLM", - "Open LLM" - ], - [ - "NousResearch", - "Open LLM", - "Open LLM" - ], - [ - "Google", - "Open LLM", - "Open LLM" - ], - [ - "Nvidia", - "Open LLM", - "Open LLM" - ], - [ - "DeepSeek AI", - "Open LLM", - "Open LLM" - ], - [ - "OpenChat", - "Open LLM", - "Open LLM" - ], - [ - "NousResearch", - "Open LLM", - "Open LLM" - ], - [ - "Alibaba", - "Open LLM", - "Open LLM" - ], - [ - "Mistral", - 
"Open LLM", - "Open LLM" - ], - [ - "Cognitive Computations", - "Open LLM", - "Open LLM" - ], - [ - "Upstage AI", - "Open LLM", - "Open LLM" - ], - [ - "Microsoft", - "Open LLM", - "Open LLM" - ], - [ - "Meta", - "Open LLM", - "Open LLM" - ], - [ - "HuggingFace", - "Open LLM", - "Open LLM" - ], - [ - "Microsoft", - "Open LLM", - "Open LLM" - ], - [ - "LMSYS", - "Open LLM", - "Open LLM" - ], - [ - "Meta", - "Open LLM", - "Open LLM" - ], - [ - "MosaicML", - "Open LLM", - "Open LLM" - ], - [ - "Meta", - "Open LLM", - "Open LLM" - ], - [ - "Google", - "Open LLM", - "Open LLM" - ], - [ - "HuggingFace", - "Open LLM", - "Open LLM" - ], - [ - "Meta", - "Open LLM", - "Open LLM" - ], - [ - "Alibaba", - "Open LLM", - "Open LLM" - ], - [ - "TII", - "Open LLM", - "Open LLM" - ], - [ - "Together AI", - "Open LLM", - "Open LLM" - ], - [ - "Allen AI", - "Open LLM", - "Open LLM" - ], - [ - "Google", - "Open LLM", - "Open LLM" - ], - [ - "Mistral", - "Open LLM", - "Open LLM" - ], - [ - "LMSYS", - "Open LLM", - "Open LLM" - ], - [ - "Alibaba", - "Open LLM", - "Open LLM" - ], - [ - "Google", - "Open LLM", - "Open LLM" - ], - [ - "Tsinghua", - "Open LLM", - "Open LLM" - ], - [ - "MosaicML", - "Open LLM", - "Open LLM" - ], - [ - "Tsinghua", - "Open LLM", - "Open LLM" - ], - [ - "RWKV", - "Open LLM", - "Open LLM" - ], - [ - "OpenAssistant", - "Open LLM", - "Open LLM" - ], - [ - "LMSYS", - "Open LLM", - "Open LLM" - ], - [ - "Stability AI", - "Open LLM", - "Open LLM" - ], - [ - "Databricks", - "Open LLM", - "Open LLM" - ] - ], - "hovertemplate": "%{hovertext}

license_binary=%{customdata[2]}
Release Date=%{x}
Arena ELO=%{y}
Organization=%{customdata[0]}
License=%{customdata[1]}", - "hovertext": [ - "Llama-3-70b-Instruct", - "Command R+", - "Llama-3-8b-Instruct", - "Qwen1.5-72B-Chat", - "Command R", - "Mixtral-8x22b-Instruct-v0.1", - "Qwen1.5-32B-Chat", - "Zephyr-ORPO-141b-A35b-v0.1", - "Starling-LM-7B-beta", - "Qwen1.5-14B-Chat", - "Mixtral-8x7b-Instruct-v0.1", - "Yi-34B-Chat", - "WizardLM-70B-v1.0", - "DBRX-Instruct-Preview", - "Tulu-2-DPO-70B", - "OpenChat-3.5-0106", - "Starling-LM-7B-alpha", - "Llama-2-70b-chat", - "Nous-Hermes-2-Mixtral-8x7B-DPO", - "Gemma-1.1-7B-it", - "NV-Llama2-70B-SteerLM-Chat", - "DeepSeek-LLM-67B-Chat", - "OpenChat-3.5", - "OpenHermes-2.5-Mistral-7b", - "Qwen1.5-7B-Chat", - "Mistral-7B-Instruct-v0.2", - "Dolphin-2.2.1-Mistral-7B", - "SOLAR-10.7B-Instruct-v1.0", - "WizardLM-13b-v1.2", - "Llama-2-13b-chat", - "Zephyr-7b-beta", - "Phi-3-Mini-128k-Instruct", - "Vicuna-13B", - "CodeLlama-70B-instruct", - "MPT-30B-chat", - "CodeLlama-34B-instruct", - "Gemma-7B-it", - "Zephyr-7b-alpha", - "Llama-2-7b-chat", - "Qwen-14B-Chat", - "falcon-180b-chat", - "StripedHyena-Nous-7B", - "OLMo-7B-instruct", - "Gemma-1.1-2B-it", - "Mistral-7B-Instruct-v0.1", - "Vicuna-7B", - "Qwen1.5-4B-Chat", - "Gemma-2B-it", - "ChatGLM3-6B", - "MPT-7B-Chat", - "ChatGLM2-6B", - "RWKV-4-Raven-14B", - "OpenAssistant-Pythia-12B", - "FastChat-T5-3B", - "StableLM-Tuned-Alpha-7B", - "Dolly-V2-12B" - ], - "legendgroup": "Open LLM", - "marker": { - "color": "#EF553B", - "size": 8, - "symbol": "circle" - }, - "mode": "markers", - "name": "Open LLM", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - "2024-04-18T00:00:00", - "2024-04-04T00:00:00", - "2024-04-18T00:00:00", - "2024-02-04T00:00:00", - "2024-03-11T00:00:00", - "2024-04-17T00:00:00", - "2024-02-04T00:00:00", - "2024-04-12T00:00:00", - "2024-03-20T00:00:00", - "2024-02-04T00:00:00", - "2023-12-11T00:00:00", - "2024-01-23T00:00:00", - "2023-08-09T00:00:00", - "2024-03-27T00:00:00", - "2023-11-12T00:00:00", - "2024-01-06T00:00:00", - 
"2023-11-25T00:00:00", - "2023-07-18T00:00:00", - "2024-01-13T00:00:00", - "2024-04-09T00:00:00", - "2023-11-24T00:00:00", - "2023-11-29T00:00:00", - "2023-11-16T00:00:00", - "2023-10-29T00:00:00", - "2024-02-04T00:00:00", - "2023-12-11T00:00:00", - "2023-10-30T00:00:00", - "2023-12-13T00:00:00", - "2023-07-25T00:00:00", - "2023-07-18T00:00:00", - "2023-10-26T00:00:00", - "2024-04-23T00:00:00", - "2023-07-23T00:00:00", - "2024-01-29T00:00:00", - "2023-06-09T00:00:00", - "2023-08-24T00:00:00", - "2024-02-21T00:00:00", - "2023-10-09T00:00:00", - "2023-07-18T00:00:00", - "2023-09-24T00:00:00", - "2023-09-05T00:00:00", - "2023-12-07T00:00:00", - "2024-02-23T00:00:00", - "2024-04-09T00:00:00", - "2023-09-27T00:00:00", - "2023-07-29T00:00:00", - "2024-02-04T00:00:00", - "2024-02-21T00:00:00", - "2023-10-25T00:00:00", - "2023-05-04T00:00:00", - "2023-06-25T00:00:00", - "2023-05-22T00:00:00", - "2023-04-03T00:00:00", - "2023-04-27T00:00:00", - "2023-04-19T00:00:00", - "2023-04-12T00:00:00" - ], - "xaxis": "x", - "y": [ - 1209.6462958943152, - 1190.5291640364956, - 1152.500938092916, - 1152.485612667822, - 1147.8966494489798, - 1145.8123271934626, - 1133.8011394014864, - 1128.8163366984966, - 1118.5178781177128, - 1118.475700517794, - 1114, - 1111.1326399460543, - 1108.552744333791, - 1103.2167069462541, - 1102.79428840509, - 1098.527455141752, - 1091.5210240331344, - 1088.7078065720734, - 1087.307757938674, - 1082.9619916739105, - 1082.4713591517852, - 1079.7362777221456, - 1078.6663284631356, - 1078.6429577216027, - 1076.5321247427814, - 1074.0655548845186, - 1065.574858796917, - 1065.0611191304033, - 1061.9003873957429, - 1056.9265912995625, - 1054.4162995844372, - 1050.1481252382014, - 1047.9555279582555, - 1047.927687897156, - 1047.823066613369, - 1047.396876459045, - 1043.5443043467913, - 1043.0842673002462, - 1040.7537596503887, - 1038.586932982431, - 1037.076380506833, - 1023.112092466059, - 1020.7569311460566, - 1014.832737666584, - 1012.1048679697501, - 
1009.3834445358582, - 1002.744713564041, - 999.6431193544297, - 960.7895509564338, - 933.340871331175, - 933.3372880828122, - 928.4512512366093, - 900.2948677134343, - 876.9291083582452, - 848.9325675003323, - 826.6473317994165 - ], - "yaxis": "y" - } - ], - "layout": { - "legend": { - "title": { - "text": "license_binary" - }, - "tracegroupgap": 0 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "white", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "white", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "#C8D4E3", - "linecolor": "#C8D4E3", - "minorgridcolor": "#C8D4E3", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "#C8D4E3", - "linecolor": "#C8D4E3", - "minorgridcolor": "#C8D4E3", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { 
- "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 
0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 
0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - 
"#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "white", - "showlakes": true, - "showland": true, - "subunitcolor": "#C8D4E3" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "white", - "polar": { - "angularaxis": { - "gridcolor": "#EBF0F8", - "linecolor": "#EBF0F8", - "ticks": "" - }, - "bgcolor": "white", - "radialaxis": { - "gridcolor": "#EBF0F8", - "linecolor": "#EBF0F8", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "white", - "gridcolor": "#DFE8F3", - "gridwidth": 2, - "linecolor": "#EBF0F8", - "showbackground": true, - "ticks": "", - "zerolinecolor": "#EBF0F8" - }, - "yaxis": { - "backgroundcolor": "white", - "gridcolor": "#DFE8F3", - "gridwidth": 2, - "linecolor": "#EBF0F8", - "showbackground": true, - "ticks": "", - "zerolinecolor": "#EBF0F8" - }, - "zaxis": { - "backgroundcolor": "white", - "gridcolor": "#DFE8F3", - "gridwidth": 2, - "linecolor": "#EBF0F8", - "showbackground": true, - "ticks": "", - "zerolinecolor": "#EBF0F8" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "#DFE8F3", - "linecolor": "#A2B1C6", - "ticks": "" - }, - "baxis": { - "gridcolor": "#DFE8F3", - "linecolor": "#A2B1C6", - "ticks": "" - }, - "bgcolor": "white", - "caxis": { - "gridcolor": "#DFE8F3", - "linecolor": "#A2B1C6", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "#EBF0F8", - "linecolor": "#EBF0F8", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "#EBF0F8", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "#EBF0F8", - "linecolor": "#EBF0F8", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": 
"#EBF0F8", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)" - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Release Date" - } - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Arena ELO" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.express as px\n", - "import plotly.graph_objects as go\n", - "\n", - "# Plotting\n", - "fig = px.scatter(\n", - " df,\n", - " x=\"Release Date\",\n", - " y=\"rating\",\n", - " color=\"license_binary\",\n", - " hover_name=\"Model\",\n", - " hover_data=[\n", - " \"Release Date\",\n", - " \"Organization\",\n", - " \"License\",\n", - " \"license_binary\",\n", - " ],\n", - " title=\"Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)\",\n", - " labels={\"rating\": \"Arena ELO\", \"Release Date\": \"Release Date\"},\n", - " template=\"plotly_white\",\n", - ")\n", - "fig.update_traces(marker=dict(size=8))\n", - "\n", - "# Display the plot\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "plotly.graph_objs._figure.Figure" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(fig)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - 
}, - "nbformat": 4, - "nbformat_minor": 2 -}