diff --git "a/dev.ipynb" "b/dev.ipynb"
deleted file mode 100644--- "a/dev.ipynb"
+++ /dev/null
@@ -1,3587 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/andrewreed/Documents/success_projects/closed-vs-open-arena-elo/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "import pickle\n",
- "\n",
- "import pandas as pd\n",
- "from huggingface_hub import HfFileSystem, hf_hub_download"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Prepare data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "from typing import Literal\n",
- "\n",
- "\n",
- "def download_latest_data_from_space(\n",
- "    repo_id: str, file_type: Literal[\"pkl\", \"csv\"]\n",
- ") -> str:\n",
- "    \"\"\"\n",
- "    Download the most recent data file of the given type from a Hugging Face Space.\n",
- "\n",
- "    Candidate files are matched with the glob ``spaces/{repo_id}/*.{file_type}`` and\n",
- "    ranked by the last ``_``-separated token of their base name (the text before the\n",
- "    first ``.``). Tokens are compared as strings, so the ranking is chronological only\n",
- "    if that token is a fixed-width date stamp -- TODO confirm against the Space.\n",
- "\n",
- "    Args:\n",
- "        repo_id (str): The ID of the repository space.\n",
- "        file_type (Literal[\"pkl\", \"csv\"]): Extension of the data file to download.\n",
- "\n",
- "    Returns:\n",
- "        str: The local file path of the downloaded data file.\n",
- "\n",
- "    Raises:\n",
- "        FileNotFoundError: If the Space contains no file with that extension.\n",
- "    \"\"\"\n",
- "\n",
- "    def extract_date(filename: str) -> str:\n",
- "        # Base name -> text before the first '.' -> last '_'-separated token.\n",
- "        return filename.split(\"/\")[-1].split(\".\")[0].split(\"_\")[-1]\n",
- "\n",
- "    pattern = f\"spaces/{repo_id}/*.{file_type}\"\n",
- "    files = HfFileSystem().glob(pattern)\n",
- "    if not files:\n",
- "        # Without this guard an empty match surfaced as an opaque IndexError.\n",
- "        raise FileNotFoundError(f\"No files matching {pattern!r} were found\")\n",
- "\n",
- "    # max() picks the same element as sorted(..., reverse=True)[0] without a full sort.\n",
- "    latest_file = max(files, key=extract_date)\n",
- "    return hf_hub_download(\n",
- "        repo_id=repo_id,\n",
- "        filename=latest_file.split(\"/\")[-1],\n",
- "        repo_type=\"space\",\n",
- "    )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Fetch the newest leaderboard CSV first, then the newest ELO pickle, from the same Space.\n",
- "latest_leaderboard_file_local, latest_elo_file_local = (\n",
- "    download_latest_data_from_space(\n",
- "        repo_id=\"lmsys/chatbot-arena-leaderboard\", file_type=extension\n",
- "    )\n",
- "    for extension in (\"csv\", \"pkl\")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "# load and prepare ELO data\n",
- "# Keys looked up in the ELO results pickle -> display name of the category.\n",
- "key_to_category_name = {\n",
- "    \"full\": \"Overall\",\n",
- "    \"coding\": \"Coding\",\n",
- "    \"long_user\": \"Longer Query\",\n",
- "    \"english\": \"English\",\n",
- "    \"chinese\": \"Chinese\",\n",
- "    \"french\": \"French\",\n",
- "    \"no_tie\": \"Exclude Ties\",\n",
- "    \"no_short\": \"Exclude Short Query (< 5 tokens)\",\n",
- "    \"no_refusal\": \"Exclude Refusal\",\n",
- "}\n",
- "# Display name of the category -> longer description.\n",
- "cat_name_to_explanation = {\n",
- "    \"Overall\": \"Overall Questions\",\n",
- "    \"Coding\": \"Coding: whether conversation contains code snippets\",\n",
- "    \"Longer Query\": \"Longer Query (>= 500 tokens)\",\n",
- "    \"English\": \"English Prompts\",\n",
- "    \"Chinese\": \"Chinese Prompts\",\n",
- "    \"French\": \"French Prompts\",\n",
- "    \"Exclude Ties\": \"Exclude Ties and Bothbad\",\n",
- "    \"Exclude Short Query (< 5 tokens)\": \"Exclude Short User Query (< 5 tokens)\",\n",
- "    \"Exclude Refusal\": 'Exclude model responses with refusal (e.g., \"I cannot answer\")',\n",
- "}\n",
- "\n",
- "# SECURITY NOTE(review): pickle.load can execute arbitrary code while deserializing.\n",
- "# This file is downloaded from a remote Space, so only run this cell if that\n",
- "# source is trusted.\n",
- "with open(latest_elo_file_local, \"rb\") as fin:\n",
- "    elo_results = pickle.load(fin)\n",
- "\n",
- "# One leaderboard table per category; categories absent from the pickle are skipped.\n",
- "arena_dfs = {\n",
- "    category_name: elo_results[key][\"leaderboard_table_df\"]\n",
- "    for key, category_name in key_to_category_name.items()\n",
- "    if key in elo_results\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "arena_dfs.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " rating | \n",
- " variance | \n",
- " rating_q975 | \n",
- " rating_q025 | \n",
- " num_battles | \n",
- " final_ranking | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " RWKV-4-Raven-14B | \n",
- " 928.451251 | \n",
- " 26.146415 | \n",
- " 937.017097 | \n",
- " 919.444359 | \n",
- " 5129 | \n",
- " 82 | \n",
- "
\n",
- " \n",
- " alpaca-13b | \n",
- " 908.084359 | \n",
- " 18.598539 | \n",
- " 915.348707 | \n",
- " 900.602847 | \n",
- " 6111 | \n",
- " 86 | \n",
- "
\n",
- " \n",
- " bard-jan-24-gemini-pro | \n",
- " 1208.712877 | \n",
- " 7.975296 | \n",
- " 1213.331583 | \n",
- " 1203.004139 | \n",
- " 12387 | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " chatglm-6b | \n",
- " 886.873429 | \n",
- " 19.813751 | \n",
- " 894.785321 | \n",
- " 878.677878 | \n",
- " 5195 | \n",
- " 87 | \n",
- "
\n",
- " \n",
- " chatglm2-6b | \n",
- " 933.337288 | \n",
- " 33.939472 | \n",
- " 944.493496 | \n",
- " 921.470740 | \n",
- " 2880 | \n",
- " 82 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " wizardlm-70b | \n",
- " 1108.552744 | \n",
- " 8.988005 | \n",
- " 1114.390689 | \n",
- " 1102.745236 | \n",
- " 8867 | \n",
- " 29 | \n",
- "
\n",
- " \n",
- " yi-34b-chat | \n",
- " 1111.132640 | \n",
- " 7.801741 | \n",
- " 1115.356993 | \n",
- " 1105.658254 | \n",
- " 13177 | \n",
- " 29 | \n",
- "
\n",
- " \n",
- " zephyr-7b-alpha | \n",
- " 1043.084267 | \n",
- " 45.472021 | \n",
- " 1054.269954 | \n",
- " 1027.602171 | \n",
- " 1901 | \n",
- " 57 | \n",
- "
\n",
- " \n",
- " zephyr-7b-beta | \n",
- " 1054.416300 | \n",
- " 11.094606 | \n",
- " 1060.265072 | \n",
- " 1047.790509 | \n",
- " 11924 | \n",
- " 55 | \n",
- "
\n",
- " \n",
- " zephyr-orpo-141b-A35b-v0.1 | \n",
- " 1128.816337 | \n",
- " 16.964385 | \n",
- " 1134.862680 | \n",
- " 1119.183571 | \n",
- " 5207 | \n",
- " 22 | \n",
- "
\n",
- " \n",
- "
\n",
- "
92 rows × 6 columns
\n",
- "
"
- ],
- "text/plain": [
- " rating variance rating_q975 rating_q025 \\\n",
- "RWKV-4-Raven-14B 928.451251 26.146415 937.017097 919.444359 \n",
- "alpaca-13b 908.084359 18.598539 915.348707 900.602847 \n",
- "bard-jan-24-gemini-pro 1208.712877 7.975296 1213.331583 1203.004139 \n",
- "chatglm-6b 886.873429 19.813751 894.785321 878.677878 \n",
- "chatglm2-6b 933.337288 33.939472 944.493496 921.470740 \n",
- "... ... ... ... ... \n",
- "wizardlm-70b 1108.552744 8.988005 1114.390689 1102.745236 \n",
- "yi-34b-chat 1111.132640 7.801741 1115.356993 1105.658254 \n",
- "zephyr-7b-alpha 1043.084267 45.472021 1054.269954 1027.602171 \n",
- "zephyr-7b-beta 1054.416300 11.094606 1060.265072 1047.790509 \n",
- "zephyr-orpo-141b-A35b-v0.1 1128.816337 16.964385 1134.862680 1119.183571 \n",
- "\n",
- " num_battles final_ranking \n",
- "RWKV-4-Raven-14B 5129 82 \n",
- "alpaca-13b 6111 86 \n",
- "bard-jan-24-gemini-pro 12387 6 \n",
- "chatglm-6b 5195 87 \n",
- "chatglm2-6b 2880 82 \n",
- "... ... ... \n",
- "wizardlm-70b 8867 29 \n",
- "yi-34b-chat 13177 29 \n",
- "zephyr-7b-alpha 1901 57 \n",
- "zephyr-7b-beta 11924 55 \n",
- "zephyr-orpo-141b-A35b-v0.1 5207 22 \n",
- "\n",
- "[92 rows x 6 columns]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "arena_dfs[\"Overall\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "# load and prepare Leaderboard data\n",
- "leaderboard_df = pd.read_csv(latest_leaderboard_file_local)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " key | \n",
- " Model | \n",
- " MT-bench (score) | \n",
- " MMLU | \n",
- " Knowledge cutoff date | \n",
- " License | \n",
- " Organization | \n",
- " Link | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " wizardlm-30b | \n",
- " WizardLM-30B | \n",
- " 7.01 | \n",
- " 0.587 | \n",
- " 2023/6 | \n",
- " Non-commercial | \n",
- " Microsoft | \n",
- " https://huggingface.co/WizardLM/WizardLM-30B-V1.0 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " vicuna-13b-16k | \n",
- " Vicuna-13B-16k | \n",
- " 6.92 | \n",
- " 0.545 | \n",
- " 2023/7 | \n",
- " Llama 2 Community | \n",
- " LMSYS | \n",
- " https://huggingface.co/lmsys/vicuna-13b-v1.5-16k | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " wizardlm-13b-v1.1 | \n",
- " WizardLM-13B-v1.1 | \n",
- " 6.76 | \n",
- " 0.500 | \n",
- " 2023/7 | \n",
- " Non-commercial | \n",
- " Microsoft | \n",
- " https://huggingface.co/WizardLM/WizardLM-13B-V1.1 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " tulu-30b | \n",
- " Tulu-30B | \n",
- " 6.43 | \n",
- " 0.581 | \n",
- " 2023/6 | \n",
- " Non-commercial | \n",
- " AllenAI/UW | \n",
- " https://huggingface.co/allenai/tulu-30b | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " guanaco-65b | \n",
- " Guanaco-65B | \n",
- " 6.41 | \n",
- " 0.621 | \n",
- " 2023/5 | \n",
- " Non-commercial | \n",
- " UW | \n",
- " https://huggingface.co/timdettmers/guanaco-65b... | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 101 | \n",
- " llama-3-70b-instruct | \n",
- " Llama-3-70b-Instruct | \n",
- " - | \n",
- " 0.820 | \n",
- " 2023/12 | \n",
- " Llama 3 Community | \n",
- " Meta | \n",
- " https://llama.meta.com/llama3/ | \n",
- "
\n",
- " \n",
- " 102 | \n",
- " llama-3-8b-instruct | \n",
- " Llama-3-8b-Instruct | \n",
- " - | \n",
- " 0.684 | \n",
- " 2023/3 | \n",
- " Llama 3 Community | \n",
- " Meta | \n",
- " https://llama.meta.com/llama3/ | \n",
- "
\n",
- " \n",
- " 103 | \n",
- " gemini-1.5-pro-api-0409-preview | \n",
- " Gemini 1.5 Pro API-0409-Preview | \n",
- " - | \n",
- " 0.819 | \n",
- " 2023/11 | \n",
- " Proprietary | \n",
- " Google | \n",
- " https://blog.google/technology/ai/google-gemin... | \n",
- "
\n",
- " \n",
- " 104 | \n",
- " phi-3-mini-128k-instruct | \n",
- " Phi-3-Mini-128k-Instruct | \n",
- " - | \n",
- " 0.681 | \n",
- " 2023/10 | \n",
- " MIT | \n",
- " Microsoft | \n",
- " https://azure.microsoft.com/en-us/blog/introdu... | \n",
- "
\n",
- " \n",
- " 105 | \n",
- " snowflake-arctic-instruct | \n",
- " Snowflake Arctic Instruct | \n",
- " - | \n",
- " 0.673 | \n",
- " 2024/4 | \n",
- " Apache 2.0 | \n",
- " Snowflake | \n",
- " https://www.snowflake.com/blog/arctic-open-eff... | \n",
- "
\n",
- " \n",
- "
\n",
- "
106 rows × 8 columns
\n",
- "
"
- ],
- "text/plain": [
- " key Model \\\n",
- "0 wizardlm-30b WizardLM-30B \n",
- "1 vicuna-13b-16k Vicuna-13B-16k \n",
- "2 wizardlm-13b-v1.1 WizardLM-13B-v1.1 \n",
- "3 tulu-30b Tulu-30B \n",
- "4 guanaco-65b Guanaco-65B \n",
- ".. ... ... \n",
- "101 llama-3-70b-instruct Llama-3-70b-Instruct \n",
- "102 llama-3-8b-instruct Llama-3-8b-Instruct \n",
- "103 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n",
- "104 phi-3-mini-128k-instruct Phi-3-Mini-128k-Instruct \n",
- "105 snowflake-arctic-instruct Snowflake Arctic Instruct \n",
- "\n",
- " MT-bench (score) MMLU Knowledge cutoff date License \\\n",
- "0 7.01 0.587 2023/6 Non-commercial \n",
- "1 6.92 0.545 2023/7 Llama 2 Community \n",
- "2 6.76 0.500 2023/7 Non-commercial \n",
- "3 6.43 0.581 2023/6 Non-commercial \n",
- "4 6.41 0.621 2023/5 Non-commercial \n",
- ".. ... ... ... ... \n",
- "101 - 0.820 2023/12 Llama 3 Community \n",
- "102 - 0.684 2023/3 Llama 3 Community \n",
- "103 - 0.819 2023/11 Proprietary \n",
- "104 - 0.681 2023/10 MIT \n",
- "105 - 0.673 2024/4 Apache 2.0 \n",
- "\n",
- " Organization Link \n",
- "0 Microsoft https://huggingface.co/WizardLM/WizardLM-30B-V1.0 \n",
- "1 LMSYS https://huggingface.co/lmsys/vicuna-13b-v1.5-16k \n",
- "2 Microsoft https://huggingface.co/WizardLM/WizardLM-13B-V1.1 \n",
- "3 AllenAI/UW https://huggingface.co/allenai/tulu-30b \n",
- "4 UW https://huggingface.co/timdettmers/guanaco-65b... \n",
- ".. ... ... \n",
- "101 Meta https://llama.meta.com/llama3/ \n",
- "102 Meta https://llama.meta.com/llama3/ \n",
- "103 Google https://blog.google/technology/ai/google-gemin... \n",
- "104 Microsoft https://azure.microsoft.com/en-us/blog/introdu... \n",
- "105 Snowflake https://www.snowflake.com/blog/arctic-open-eff... \n",
- "\n",
- "[106 rows x 8 columns]"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "leaderboard_df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "arena_dfs.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "# merge ELO and Leaderboard data\n",
- "# Each category's ELO table is joined on its index to the leaderboard's \"key\"\n",
- "# column, then ordered from highest to lowest rating with a fresh 0..n-1 index.\n",
- "merged_dfs = {\n",
- "    category: (\n",
- "        arena_df.merge(leaderboard_df, left_index=True, right_on=\"key\")\n",
- "        .sort_values(\"rating\", ascending=False)\n",
- "        .reset_index(drop=True)\n",
- "    )\n",
- "    for category, arena_df in arena_dfs.items()\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " rating | \n",
- " variance | \n",
- " rating_q975 | \n",
- " rating_q025 | \n",
- " num_battles | \n",
- " final_ranking | \n",
- " key | \n",
- " Model | \n",
- " MT-bench (score) | \n",
- " MMLU | \n",
- " Knowledge cutoff date | \n",
- " License | \n",
- " Organization | \n",
- " Link | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1258.815279 | \n",
- " 3.258132 | \n",
- " 1262.796713 | \n",
- " 1256.000508 | \n",
- " 35931 | \n",
- " 1 | \n",
- " gpt-4-turbo-2024-04-09 | \n",
- " GPT-4-Turbo-2024-04-09 | \n",
- " - | \n",
- " - | \n",
- " 2023/12 | \n",
- " Proprietary | \n",
- " OpenAI | \n",
- " https://platform.openai.com/docs/models/gpt-4-... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1252.684886 | \n",
- " 1.799233 | \n",
- " 1254.748391 | \n",
- " 1249.873417 | \n",
- " 73547 | \n",
- " 2 | \n",
- " gpt-4-1106-preview | \n",
- " GPT-4-1106-preview | \n",
- " 9.32 | \n",
- " - | \n",
- " 2023/4 | \n",
- " Proprietary | \n",
- " OpenAI | \n",
- " https://openai.com/blog/new-models-and-develop... | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 1250.926206 | \n",
- " 2.018201 | \n",
- " 1253.851885 | \n",
- " 1248.166034 | \n",
- " 80997 | \n",
- " 2 | \n",
- " claude-3-opus-20240229 | \n",
- " Claude 3 Opus | \n",
- " - | \n",
- " 0.868 | \n",
- " 2023/8 | \n",
- " Proprietary | \n",
- " Anthropic | \n",
- " https://www.anthropic.com/news/claude-3-family | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 1249.618395 | \n",
- " 3.233129 | \n",
- " 1252.956497 | \n",
- " 1246.247080 | \n",
- " 39482 | \n",
- " 2 | \n",
- " gemini-1.5-pro-api-0409-preview | \n",
- " Gemini 1.5 Pro API-0409-Preview | \n",
- " - | \n",
- " 0.819 | \n",
- " 2023/11 | \n",
- " Proprietary | \n",
- " Google | \n",
- " https://blog.google/technology/ai/google-gemin... | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 1246.777591 | \n",
- " 1.942477 | \n",
- " 1249.979712 | \n",
- " 1244.305362 | \n",
- " 67354 | \n",
- " 2 | \n",
- " gpt-4-0125-preview | \n",
- " GPT-4-0125-preview | \n",
- " - | \n",
- " - | \n",
- " 2023/12 | \n",
- " Proprietary | \n",
- " OpenAI | \n",
- " https://openai.com/blog/new-models-and-develop... | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 87 | \n",
- " 886.873429 | \n",
- " 19.813751 | \n",
- " 894.785321 | \n",
- " 878.677878 | \n",
- " 5195 | \n",
- " 87 | \n",
- " chatglm-6b | \n",
- " ChatGLM-6B | \n",
- " 4.50 | \n",
- " 0.361 | \n",
- " 2023/3 | \n",
- " Non-commercial | \n",
- " Tsinghua | \n",
- " https://huggingface.co/THUDM/chatglm-6b | \n",
- "
\n",
- " \n",
- " 88 | \n",
- " 876.929108 | \n",
- " 27.115855 | \n",
- " 887.355529 | \n",
- " 866.860534 | \n",
- " 4521 | \n",
- " 88 | \n",
- " fastchat-t5-3b | \n",
- " FastChat-T5-3B | \n",
- " 3.04 | \n",
- " 0.477 | \n",
- " 2023/4 | \n",
- " Apache 2.0 | \n",
- " LMSYS | \n",
- " https://huggingface.co/lmsys/fastchat-t5-3b-v1.0 | \n",
- "
\n",
- " \n",
- " 89 | \n",
- " 848.932568 | \n",
- " 36.961459 | \n",
- " 859.103936 | \n",
- " 837.364341 | \n",
- " 3461 | \n",
- " 90 | \n",
- " stablelm-tuned-alpha-7b | \n",
- " StableLM-Tuned-Alpha-7B | \n",
- " 2.75 | \n",
- " 0.244 | \n",
- " 2023/4 | \n",
- " CC-BY-NC-SA-4.0 | \n",
- " Stability AI | \n",
- " https://huggingface.co/stabilityai/stablelm-tu... | \n",
- "
\n",
- " \n",
- " 90 | \n",
- " 826.647332 | \n",
- " 30.156414 | \n",
- " 837.335988 | \n",
- " 816.370788 | \n",
- " 3666 | \n",
- " 91 | \n",
- " dolly-v2-12b | \n",
- " Dolly-V2-12B | \n",
- " 3.28 | \n",
- " 0.257 | \n",
- " 2023/4 | \n",
- " MIT | \n",
- " Databricks | \n",
- " https://huggingface.co/databricks/dolly-v2-12b | \n",
- "
\n",
- " \n",
- " 91 | \n",
- " 804.356329 | \n",
- " 44.756983 | \n",
- " 815.161492 | \n",
- " 790.879536 | \n",
- " 2538 | \n",
- " 92 | \n",
- " llama-13b | \n",
- " LLaMA-13B | \n",
- " 2.61 | \n",
- " 0.470 | \n",
- " 2023/2 | \n",
- " Non-commercial | \n",
- " Meta | \n",
- " https://arxiv.org/abs/2302.13971 | \n",
- "
\n",
- " \n",
- "
\n",
- "
92 rows × 14 columns
\n",
- "
"
- ],
- "text/plain": [
- " rating variance rating_q975 rating_q025 num_battles \\\n",
- "0 1258.815279 3.258132 1262.796713 1256.000508 35931 \n",
- "1 1252.684886 1.799233 1254.748391 1249.873417 73547 \n",
- "2 1250.926206 2.018201 1253.851885 1248.166034 80997 \n",
- "3 1249.618395 3.233129 1252.956497 1246.247080 39482 \n",
- "4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n",
- ".. ... ... ... ... ... \n",
- "87 886.873429 19.813751 894.785321 878.677878 5195 \n",
- "88 876.929108 27.115855 887.355529 866.860534 4521 \n",
- "89 848.932568 36.961459 859.103936 837.364341 3461 \n",
- "90 826.647332 30.156414 837.335988 816.370788 3666 \n",
- "91 804.356329 44.756983 815.161492 790.879536 2538 \n",
- "\n",
- " final_ranking key \\\n",
- "0 1 gpt-4-turbo-2024-04-09 \n",
- "1 2 gpt-4-1106-preview \n",
- "2 2 claude-3-opus-20240229 \n",
- "3 2 gemini-1.5-pro-api-0409-preview \n",
- "4 2 gpt-4-0125-preview \n",
- ".. ... ... \n",
- "87 87 chatglm-6b \n",
- "88 88 fastchat-t5-3b \n",
- "89 90 stablelm-tuned-alpha-7b \n",
- "90 91 dolly-v2-12b \n",
- "91 92 llama-13b \n",
- "\n",
- " Model MT-bench (score) MMLU \\\n",
- "0 GPT-4-Turbo-2024-04-09 - - \n",
- "1 GPT-4-1106-preview 9.32 - \n",
- "2 Claude 3 Opus - 0.868 \n",
- "3 Gemini 1.5 Pro API-0409-Preview - 0.819 \n",
- "4 GPT-4-0125-preview - - \n",
- ".. ... ... ... \n",
- "87 ChatGLM-6B 4.50 0.361 \n",
- "88 FastChat-T5-3B 3.04 0.477 \n",
- "89 StableLM-Tuned-Alpha-7B 2.75 0.244 \n",
- "90 Dolly-V2-12B 3.28 0.257 \n",
- "91 LLaMA-13B 2.61 0.470 \n",
- "\n",
- " Knowledge cutoff date License Organization \\\n",
- "0 2023/12 Proprietary OpenAI \n",
- "1 2023/4 Proprietary OpenAI \n",
- "2 2023/8 Proprietary Anthropic \n",
- "3 2023/11 Proprietary Google \n",
- "4 2023/12 Proprietary OpenAI \n",
- ".. ... ... ... \n",
- "87 2023/3 Non-commercial Tsinghua \n",
- "88 2023/4 Apache 2.0 LMSYS \n",
- "89 2023/4 CC-BY-NC-SA-4.0 Stability AI \n",
- "90 2023/4 MIT Databricks \n",
- "91 2023/2 Non-commercial Meta \n",
- "\n",
- " Link \n",
- "0 https://platform.openai.com/docs/models/gpt-4-... \n",
- "1 https://openai.com/blog/new-models-and-develop... \n",
- "2 https://www.anthropic.com/news/claude-3-family \n",
- "3 https://blog.google/technology/ai/google-gemin... \n",
- "4 https://openai.com/blog/new-models-and-develop... \n",
- ".. ... \n",
- "87 https://huggingface.co/THUDM/chatglm-6b \n",
- "88 https://huggingface.co/lmsys/fastchat-t5-3b-v1.0 \n",
- "89 https://huggingface.co/stabilityai/stablelm-tu... \n",
- "90 https://huggingface.co/databricks/dolly-v2-12b \n",
- "91 https://arxiv.org/abs/2302.13971 \n",
- "\n",
- "[92 rows x 14 columns]"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "merged_dfs[\"Overall\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Manually map release dates - MEH."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Scaffold for the manual mapping: one row per model with an empty release date.\n",
- "t = merged_dfs[\"Overall\"][[\"key\", \"Model\"]].assign(**{\"Release Date\": \"\"})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "release_date_mapping = pd.read_json(\"release_date_mapping.json\", orient=\"records\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " key | \n",
- " Model | \n",
- " Release Date | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " gpt-4-turbo-2024-04-09 | \n",
- " GPT-4-Turbo-2024-04-09 | \n",
- " 2024-04-09 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " gpt-4-1106-preview | \n",
- " GPT-4-1106-preview | \n",
- " 2023-11-06 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " claude-3-opus-20240229 | \n",
- " Claude 3 Opus | \n",
- " 2024-02-29 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " gemini-1.5-pro-api-0409-preview | \n",
- " Gemini 1.5 Pro API-0409-Preview | \n",
- " 2024-04-09 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " gpt-4-0125-preview | \n",
- " GPT-4-0125-preview | \n",
- " 2024-01-25 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 86 | \n",
- " chatglm-6b | \n",
- " ChatGLM-6B | \n",
- " 2023-03-13 | \n",
- "
\n",
- " \n",
- " 87 | \n",
- " fastchat-t5-3b | \n",
- " FastChat-T5-3B | \n",
- " 2023-04-27 | \n",
- "
\n",
- " \n",
- " 88 | \n",
- " stablelm-tuned-alpha-7b | \n",
- " StableLM-Tuned-Alpha-7B | \n",
- " 2023-04-19 | \n",
- "
\n",
- " \n",
- " 89 | \n",
- " dolly-v2-12b | \n",
- " Dolly-V2-12B | \n",
- " 2023-04-12 | \n",
- "
\n",
- " \n",
- " 90 | \n",
- " llama-13b | \n",
- " LLaMA-13B | \n",
- " 2023-02-27 | \n",
- "
\n",
- " \n",
- "
\n",
- "
91 rows × 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " key Model \\\n",
- "0 gpt-4-turbo-2024-04-09 GPT-4-Turbo-2024-04-09 \n",
- "1 gpt-4-1106-preview GPT-4-1106-preview \n",
- "2 claude-3-opus-20240229 Claude 3 Opus \n",
- "3 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n",
- "4 gpt-4-0125-preview GPT-4-0125-preview \n",
- ".. ... ... \n",
- "86 chatglm-6b ChatGLM-6B \n",
- "87 fastchat-t5-3b FastChat-T5-3B \n",
- "88 stablelm-tuned-alpha-7b StableLM-Tuned-Alpha-7B \n",
- "89 dolly-v2-12b Dolly-V2-12B \n",
- "90 llama-13b LLaMA-13B \n",
- "\n",
- " Release Date \n",
- "0 2024-04-09 \n",
- "1 2023-11-06 \n",
- "2 2024-02-29 \n",
- "3 2024-04-09 \n",
- "4 2024-01-25 \n",
- ".. ... \n",
- "86 2023-03-13 \n",
- "87 2023-04-27 \n",
- "88 2023-04-19 \n",
- "89 2023-04-12 \n",
- "90 2023-02-27 \n",
- "\n",
- "[91 rows x 3 columns]"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "release_date_mapping"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " key | \n",
- " Release Date | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " gpt-4-turbo-2024-04-09 | \n",
- " 2024-04-09 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " gpt-4-1106-preview | \n",
- " 2023-11-06 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " claude-3-opus-20240229 | \n",
- " 2024-02-29 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " gemini-1.5-pro-api-0409-preview | \n",
- " 2024-04-09 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " gpt-4-0125-preview | \n",
- " 2024-01-25 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 86 | \n",
- " chatglm-6b | \n",
- " 2023-03-13 | \n",
- "
\n",
- " \n",
- " 87 | \n",
- " fastchat-t5-3b | \n",
- " 2023-04-27 | \n",
- "
\n",
- " \n",
- " 88 | \n",
- " stablelm-tuned-alpha-7b | \n",
- " 2023-04-19 | \n",
- "
\n",
- " \n",
- " 89 | \n",
- " dolly-v2-12b | \n",
- " 2023-04-12 | \n",
- "
\n",
- " \n",
- " 90 | \n",
- " llama-13b | \n",
- " 2023-02-27 | \n",
- "
\n",
- " \n",
- "
\n",
- "
91 rows × 2 columns
\n",
- "
"
- ],
- "text/plain": [
- " key Release Date\n",
- "0 gpt-4-turbo-2024-04-09 2024-04-09\n",
- "1 gpt-4-1106-preview 2023-11-06\n",
- "2 claude-3-opus-20240229 2024-02-29\n",
- "3 gemini-1.5-pro-api-0409-preview 2024-04-09\n",
- "4 gpt-4-0125-preview 2024-01-25\n",
- ".. ... ...\n",
- "86 chatglm-6b 2023-03-13\n",
- "87 fastchat-t5-3b 2023-04-27\n",
- "88 stablelm-tuned-alpha-7b 2023-04-19\n",
- "89 dolly-v2-12b 2023-04-12\n",
- "90 llama-13b 2023-02-27\n",
- "\n",
- "[91 rows x 2 columns]"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "release_date_mapping[[\"key\", \"Release Date\"]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "# add release dates into the merged data\n",
- "# NOTE: pd.merge defaults to an inner join, so models with no entry in\n",
- "# release_date_mapping are dropped from every category.\n",
- "for category, category_df in merged_dfs.items():\n",
- "    merged_dfs[category] = pd.merge(\n",
- "        # Drop a \"Release Date\" column left by an earlier run of this cell, so that\n",
- "        # re-running it does not create \"Release Date_x\" / \"Release Date_y\".\n",
- "        category_df.drop(columns=\"Release Date\", errors=\"ignore\"),\n",
- "        release_date_mapping[[\"key\", \"Release Date\"]],\n",
- "        on=\"key\",\n",
- "    )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',\n",
- " 'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',\n",
- " 'Knowledge cutoff date', 'License', 'Organization', 'Link',\n",
- " 'Release Date'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "merged_dfs[\"Overall\"].columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 77,
- "metadata": {},
- "outputs": [],
- "source": [
- "# License values labelled \"Proprietary LLM\"; every other value becomes \"Open LLM\".\n",
- "# Defined next to format_data so this cell runs on a fresh kernel.\n",
- "PROPRIETARY_LICENSES = [\n",
- "    \"Proprietary\",\n",
- "    \"Non-commercial\",\n",
- "]\n",
- "\n",
- "\n",
- "def format_data(df):\n",
- "    \"\"\"\n",
- "    Return a plot-ready copy of a merged leaderboard DataFrame.\n",
- "\n",
- "    The input frame is left untouched, so applying the function again to the same\n",
- "    data does not re-map the already binarized license labels.\n",
- "\n",
- "    Args:\n",
- "        df (pd.DataFrame): Frame with \"License\", \"Release Date\" and \"rating\" columns.\n",
- "\n",
- "    Returns:\n",
- "        pd.DataFrame: A new frame with a fresh 0..n-1 index in which\n",
- "            - \"License\" is \"Proprietary LLM\" or \"Open LLM\",\n",
- "            - \"Release Date\" is parsed with pd.to_datetime,\n",
- "            - \"Month-Year\" is the monthly period of the release date,\n",
- "            - \"rating\" is rounded to 0 decimal places.\n",
- "    \"\"\"\n",
- "    release_date = pd.to_datetime(df[\"Release Date\"])\n",
- "    license_label = df[\"License\"].apply(\n",
- "        lambda x: \"Proprietary LLM\" if x in PROPRIETARY_LICENSES else \"Open LLM\"\n",
- "    )\n",
- "    # assign() builds a new frame; existing columns keep their position and\n",
- "    # \"Month-Year\" is appended last, as in the in-place version.\n",
- "    return df.assign(\n",
- "        **{\n",
- "            \"License\": license_label,\n",
- "            \"Release Date\": release_date,\n",
- "            \"Month-Year\": release_date.dt.to_period(\"M\"),\n",
- "            \"rating\": df[\"rating\"].round(),\n",
- "        }\n",
- "    ).reset_index(drop=True)\n",
- "\n",
- "\n",
- "merged_dfs2 = {k: format_data(v) for k, v in merged_dfs.items()}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 81,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "5\n",
- "5\n",
- "5\n",
- "5\n",
- "5\n",
- "5\n",
- "5\n",
- "5\n",
- "5\n"
- ]
- }
- ],
- "source": [
- "# Per category: the largest number of models sharing one (release date, license) pair.\n",
- "# NOTE(review): this groups by the exact \"Release Date\"; if a per-month figure is\n",
- "# wanted, group by \"Month-Year\" instead -- confirm.\n",
- "for k, df in merged_dfs2.items():\n",
- "    # size() is the built-in equivalent of apply(len); the unused\n",
- "    # min-rating expression that used to follow has been removed.\n",
- "    print(int(df.groupby([\"Release Date\", \"License\"])[\"rating\"].size().max()))\n",
- "    print()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Build plot"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Hard-coded per-category settings, keyed by category display name.\n",
- "# Each row below is (category, min_elo_score, max_elo_score); every category\n",
- "# shares the same upper_models_per_month value.\n",
- "t = {\n",
- "    category: {\n",
- "        \"min_elo_score\": min_elo_score,\n",
- "        \"max_elo_score\": max_elo_score,\n",
- "        \"upper_models_per_month\": 5,\n",
- "    }\n",
- "    for category, min_elo_score, max_elo_score in [\n",
- "        (\"Overall\", 804.0, 1259.0),\n",
- "        (\"Coding\", 672.0, 1270.0),\n",
- "        (\"Longer Query\", 796.0, 1273.0),\n",
- "        (\"English\", 783.0, 1246.0),\n",
- "        (\"Chinese\", 753.0, 1325.0),\n",
- "        (\"French\", 694.0, 1268.0),\n",
- "        (\"Exclude Ties\", 654.0, 1334.0),\n",
- "        (\"Exclude Short Query (< 5 tokens)\", 796.0, 1264.0),\n",
- "        (\"Exclude Refusal\", 795.0, 1264.0),\n",
- "    ]\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Template for a single category's settings entry.\n",
- "# TODO: replace the None placeholders with real values; the original cell left the\n",
- "# values blank, which is a SyntaxError and stops a Run-All.\n",
- "o = {\n",
- "    \"min_elo_score\": None,\n",
- "    \"max_elo_score\": None,\n",
- "    \"upper_models_per_month\": None,\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [],
- "source": [
- "# License values labelled \"Proprietary LLM\"; every other value becomes \"Open LLM\".\n",
- "PROPRIETARY_LICENSES = [\n",
- "    \"Proprietary\",\n",
- "    \"Non-commercial\",\n",
- "]\n",
- "\n",
- "# Work on a copy: relabelling merged_dfs[\"Overall\"] in place would make this cell\n",
- "# non-idempotent (a second run maps every model to \"Open LLM\") and would feed\n",
- "# already-relabelled licenses to any other cell that reads merged_dfs.\n",
- "df = merged_dfs[\"Overall\"].copy()\n",
- "df[\"License\"] = df[\"License\"].apply(\n",
- "    lambda x: \"Proprietary LLM\" if x in PROPRIETARY_LICENSES else \"Open LLM\"\n",
- ")\n",
- "df[\"Release Date\"] = pd.to_datetime(df[\"Release Date\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "metadata": {},
- "outputs": [],
- "source": [
- "df[\"Month-Year\"] = df[\"Release Date\"].dt.to_period(\"M\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "8"
- ]
- },
- "execution_count": 66,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Largest number of non-null ratings within one (month, license) group.\n",
- "df.groupby([\"Month-Year\", \"License\"])[\"rating\"].count().max()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " rating | \n",
- " variance | \n",
- " rating_q975 | \n",
- " rating_q025 | \n",
- " num_battles | \n",
- " final_ranking | \n",
- " key | \n",
- " Model | \n",
- " MT-bench (score) | \n",
- " MMLU | \n",
- " Knowledge cutoff date | \n",
- " License | \n",
- " Organization | \n",
- " Link | \n",
- " Release Date | \n",
- " license_binary | \n",
- " Month-Year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 4 | \n",
- " 1246.777591 | \n",
- " 1.942477 | \n",
- " 1249.979712 | \n",
- " 1244.305362 | \n",
- " 67354 | \n",
- " 2 | \n",
- " gpt-4-0125-preview | \n",
- " GPT-4-0125-preview | \n",
- " - | \n",
- " - | \n",
- " 2023/12 | \n",
- " Proprietary LLM | \n",
- " OpenAI | \n",
- " https://openai.com/blog/new-models-and-develop... | \n",
- " 2024-01-25 | \n",
- " Proprietary LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- " 32 | \n",
- " 1111.132640 | \n",
- " 7.801741 | \n",
- " 1115.356993 | \n",
- " 1105.658254 | \n",
- " 13177 | \n",
- " 29 | \n",
- " yi-34b-chat | \n",
- " Yi-34B-Chat | \n",
- " - | \n",
- " 0.735 | \n",
- " 2023/6 | \n",
- " Open LLM | \n",
- " 01 AI | \n",
- " https://huggingface.co/01-ai/Yi-34B-Chat | \n",
- " 2024-01-23 | \n",
- " Open LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- " 36 | \n",
- " 1107.129810 | \n",
- " 2.419182 | \n",
- " 1110.056188 | \n",
- " 1104.002581 | \n",
- " 47220 | \n",
- " 32 | \n",
- " gpt-3.5-turbo-0125 | \n",
- " GPT-3.5-Turbo-0125 | \n",
- " - | \n",
- " - | \n",
- " 2021/9 | \n",
- " Proprietary LLM | \n",
- " OpenAI | \n",
- " https://platform.openai.com/docs/models/gpt-3-... | \n",
- " 2024-01-25 | \n",
- " Proprietary LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- " 39 | \n",
- " 1098.527455 | \n",
- " 6.400166 | \n",
- " 1103.343592 | \n",
- " 1093.903695 | \n",
- " 14159 | \n",
- " 36 | \n",
- " openchat-3.5-0106 | \n",
- " OpenChat-3.5-0106 | \n",
- " 7.8 | \n",
- " 0.658 | \n",
- " 2024/1 | \n",
- " Open LLM | \n",
- " OpenChat | \n",
- " https://huggingface.co/openchat/openchat-3.5-0106 | \n",
- " 2024-01-06 | \n",
- " Open LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- " 43 | \n",
- " 1087.307758 | \n",
- " 18.314258 | \n",
- " 1094.532598 | \n",
- " 1078.413814 | \n",
- " 3980 | \n",
- " 40 | \n",
- " nous-hermes-2-mixtral-8x7b-dpo | \n",
- " Nous-Hermes-2-Mixtral-8x7B-DPO | \n",
- " - | \n",
- " - | \n",
- " 2024/1 | \n",
- " Open LLM | \n",
- " NousResearch | \n",
- " https://huggingface.co/NousResearch/Nous-Herme... | \n",
- " 2024-01-13 | \n",
- " Open LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- " 60 | \n",
- " 1047.927688 | \n",
- " 60.707225 | \n",
- " 1061.952116 | \n",
- " 1034.283514 | \n",
- " 1321 | \n",
- " 55 | \n",
- " codellama-70b-instruct | \n",
- " CodeLlama-70B-instruct | \n",
- " - | \n",
- " - | \n",
- " 2024/1 | \n",
- " Open LLM | \n",
- " Meta | \n",
- " https://huggingface.co/codellama/CodeLlama-70b-hf | \n",
- " 2024-01-29 | \n",
- " Open LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " rating variance rating_q975 rating_q025 num_battles \\\n",
- "4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n",
- "32 1111.132640 7.801741 1115.356993 1105.658254 13177 \n",
- "36 1107.129810 2.419182 1110.056188 1104.002581 47220 \n",
- "39 1098.527455 6.400166 1103.343592 1093.903695 14159 \n",
- "43 1087.307758 18.314258 1094.532598 1078.413814 3980 \n",
- "60 1047.927688 60.707225 1061.952116 1034.283514 1321 \n",
- "\n",
- " final_ranking key \\\n",
- "4 2 gpt-4-0125-preview \n",
- "32 29 yi-34b-chat \n",
- "36 32 gpt-3.5-turbo-0125 \n",
- "39 36 openchat-3.5-0106 \n",
- "43 40 nous-hermes-2-mixtral-8x7b-dpo \n",
- "60 55 codellama-70b-instruct \n",
- "\n",
- " Model MT-bench (score) MMLU \\\n",
- "4 GPT-4-0125-preview - - \n",
- "32 Yi-34B-Chat - 0.735 \n",
- "36 GPT-3.5-Turbo-0125 - - \n",
- "39 OpenChat-3.5-0106 7.8 0.658 \n",
- "43 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n",
- "60 CodeLlama-70B-instruct - - \n",
- "\n",
- " Knowledge cutoff date License Organization \\\n",
- "4 2023/12 Proprietary LLM OpenAI \n",
- "32 2023/6 Open LLM 01 AI \n",
- "36 2021/9 Proprietary LLM OpenAI \n",
- "39 2024/1 Open LLM OpenChat \n",
- "43 2024/1 Open LLM NousResearch \n",
- "60 2024/1 Open LLM Meta \n",
- "\n",
- " Link Release Date \\\n",
- "4 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n",
- "32 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n",
- "36 https://platform.openai.com/docs/models/gpt-3-... 2024-01-25 \n",
- "39 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n",
- "43 https://huggingface.co/NousResearch/Nous-Herme... 2024-01-13 \n",
- "60 https://huggingface.co/codellama/CodeLlama-70b-hf 2024-01-29 \n",
- "\n",
- " license_binary Month-Year \n",
- "4 Proprietary LLM 2024-01 \n",
- "32 Open LLM 2024-01 \n",
- "36 Proprietary LLM 2024-01 \n",
- "39 Open LLM 2024-01 \n",
- "43 Open LLM 2024-01 \n",
- "60 Open LLM 2024-01 "
- ]
- },
- "execution_count": 69,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Every model whose release month is January 2024.\n",
- "df.loc[df[\"Month-Year\"] == \"2024-01\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/w0/6t9rxkj97rv47l9sc0q22yth0000gn/T/ipykernel_7726/1725500526.py:1: DeprecationWarning:\n",
- "\n",
- "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " rating | \n",
- " variance | \n",
- " rating_q975 | \n",
- " rating_q025 | \n",
- " num_battles | \n",
- " final_ranking | \n",
- " key | \n",
- " Model | \n",
- " MT-bench (score) | \n",
- " MMLU | \n",
- " Knowledge cutoff date | \n",
- " License | \n",
- " Organization | \n",
- " Link | \n",
- " Release Date | \n",
- " license_binary | \n",
- " Month-Year | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1111.132640 | \n",
- " 7.801741 | \n",
- " 1115.356993 | \n",
- " 1105.658254 | \n",
- " 13177 | \n",
- " 29 | \n",
- " yi-34b-chat | \n",
- " Yi-34B-Chat | \n",
- " - | \n",
- " 0.735 | \n",
- " 2023/6 | \n",
- " Open LLM | \n",
- " 01 AI | \n",
- " https://huggingface.co/01-ai/Yi-34B-Chat | \n",
- " 2024-01-23 | \n",
- " Open LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1098.527455 | \n",
- " 6.400166 | \n",
- " 1103.343592 | \n",
- " 1093.903695 | \n",
- " 14159 | \n",
- " 36 | \n",
- " openchat-3.5-0106 | \n",
- " OpenChat-3.5-0106 | \n",
- " 7.8 | \n",
- " 0.658 | \n",
- " 2024/1 | \n",
- " Open LLM | \n",
- " OpenChat | \n",
- " https://huggingface.co/openchat/openchat-3.5-0106 | \n",
- " 2024-01-06 | \n",
- " Open LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 1087.307758 | \n",
- " 18.314258 | \n",
- " 1094.532598 | \n",
- " 1078.413814 | \n",
- " 3980 | \n",
- " 40 | \n",
- " nous-hermes-2-mixtral-8x7b-dpo | \n",
- " Nous-Hermes-2-Mixtral-8x7B-DPO | \n",
- " - | \n",
- " - | \n",
- " 2024/1 | \n",
- " Open LLM | \n",
- " NousResearch | \n",
- " https://huggingface.co/NousResearch/Nous-Herme... | \n",
- " 2024-01-13 | \n",
- " Open LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 1246.777591 | \n",
- " 1.942477 | \n",
- " 1249.979712 | \n",
- " 1244.305362 | \n",
- " 67354 | \n",
- " 2 | \n",
- " gpt-4-0125-preview | \n",
- " GPT-4-0125-preview | \n",
- " - | \n",
- " - | \n",
- " 2023/12 | \n",
- " Proprietary LLM | \n",
- " OpenAI | \n",
- " https://openai.com/blog/new-models-and-develop... | \n",
- " 2024-01-25 | \n",
- " Proprietary LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 1107.129810 | \n",
- " 2.419182 | \n",
- " 1110.056188 | \n",
- " 1104.002581 | \n",
- " 47220 | \n",
- " 32 | \n",
- " gpt-3.5-turbo-0125 | \n",
- " GPT-3.5-Turbo-0125 | \n",
- " - | \n",
- " - | \n",
- " 2021/9 | \n",
- " Proprietary LLM | \n",
- " OpenAI | \n",
- " https://platform.openai.com/docs/models/gpt-3-... | \n",
- " 2024-01-25 | \n",
- " Proprietary LLM | \n",
- " 2024-01 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " rating variance rating_q975 rating_q025 num_battles \\\n",
- "0 1111.132640 7.801741 1115.356993 1105.658254 13177 \n",
- "1 1098.527455 6.400166 1103.343592 1093.903695 14159 \n",
- "2 1087.307758 18.314258 1094.532598 1078.413814 3980 \n",
- "3 1246.777591 1.942477 1249.979712 1244.305362 67354 \n",
- "4 1107.129810 2.419182 1110.056188 1104.002581 47220 \n",
- "\n",
- " final_ranking key \\\n",
- "0 29 yi-34b-chat \n",
- "1 36 openchat-3.5-0106 \n",
- "2 40 nous-hermes-2-mixtral-8x7b-dpo \n",
- "3 2 gpt-4-0125-preview \n",
- "4 32 gpt-3.5-turbo-0125 \n",
- "\n",
- " Model MT-bench (score) MMLU \\\n",
- "0 Yi-34B-Chat - 0.735 \n",
- "1 OpenChat-3.5-0106 7.8 0.658 \n",
- "2 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n",
- "3 GPT-4-0125-preview - - \n",
- "4 GPT-3.5-Turbo-0125 - - \n",
- "\n",
- " Knowledge cutoff date License Organization \\\n",
- "0 2023/6 Open LLM 01 AI \n",
- "1 2024/1 Open LLM OpenChat \n",
- "2 2024/1 Open LLM NousResearch \n",
- "3 2023/12 Proprietary LLM OpenAI \n",
- "4 2021/9 Proprietary LLM OpenAI \n",
- "\n",
- " Link Release Date \\\n",
- "0 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n",
- "1 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n",
- "2 https://huggingface.co/NousResearch/Nous-Herme... 2024-01-13 \n",
- "3 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n",
- "4 https://platform.openai.com/docs/models/gpt-3-... 2024-01-25 \n",
- "\n",
- " license_binary Month-Year \n",
- "0 Open LLM 2024-01 \n",
- "1 Open LLM 2024-01 \n",
- "2 Open LLM 2024-01 \n",
- "3 Proprietary LLM 2024-01 \n",
- "4 Proprietary LLM 2024-01 "
- ]
- },
- "execution_count": 70,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Top three models by rating within each licence class for January 2024.\n",
- "# Sorting and then taking groupby().head(3) avoids DataFrameGroupBy.apply, which emits a\n",
- "# DeprecationWarning when the applied function operates on the grouping columns.\n",
- "(\n",
- "    df[df[\"Month-Year\"] == \"2024-01\"]\n",
- "    .sort_values(\n",
- "        [\"Month-Year\", \"License\", \"rating\"], ascending=[True, True, False]\n",
- "    )\n",
- "    .groupby([\"Month-Year\", \"License\"])\n",
- "    .head(3)\n",
- "    .reset_index(drop=True)\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',\n",
- " 'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',\n",
- " 'Knowledge cutoff date', 'License', 'Organization', 'Link',\n",
- " 'Release Date', 'license_binary'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 51,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Column labels currently present on the frame.\n",
- "df.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.plotly.v1+json": {
- "config": {
- "plotlyServerURL": "https://plot.ly"
- },
- "data": [
- {
- "customdata": [
- [
- "OpenAI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "OpenAI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Anthropic",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Google",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "OpenAI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Google",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Anthropic",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "OpenAI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Anthropic",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "OpenAI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Mistral",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Reka AI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Anthropic",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Mistral",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Reka AI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Google",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Anthropic",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Mistral",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "OpenAI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Anthropic",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Google",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Anthropic",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "OpenAI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "OpenAI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "LMSYS",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Perplexity AI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "OpenAI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Perplexity AI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "UW",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Google",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "UC Berkeley",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Nomic AI",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Stanford",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Tsinghua",
- "Proprietary LLM",
- "Proprietary LLM"
- ],
- [
- "Meta",
- "Proprietary LLM",
- "Proprietary LLM"
- ]
- ],
- "hovertemplate": "%{hovertext}
license_binary=%{customdata[2]}
Release Date=%{x}
Arena ELO=%{y}
Organization=%{customdata[0]}
License=%{customdata[1]}",
- "hovertext": [
- "GPT-4-Turbo-2024-04-09",
- "GPT-4-1106-preview",
- "Claude 3 Opus",
- "Gemini 1.5 Pro API-0409-Preview",
- "GPT-4-0125-preview",
- "Bard (Gemini Pro)",
- "Claude 3 Sonnet",
- "GPT-4-0314",
- "Claude 3 Haiku",
- "GPT-4-0613",
- "Mistral-Large-2402",
- "Reka-Flash-21B-online",
- "Claude-1",
- "Mistral Medium",
- "Reka-Flash-21B",
- "Gemini Pro (Dev API)",
- "Claude-2.0",
- "Mistral-Next",
- "GPT-3.5-Turbo-0613",
- "Claude-2.1",
- "Gemini Pro",
- "Claude-Instant-1",
- "GPT-3.5-Turbo-0314",
- "GPT-3.5-Turbo-0125",
- "Vicuna-33B",
- "pplx-70b-online",
- "GPT-3.5-Turbo-1106",
- "pplx-7b-online",
- "Guanaco-33B",
- "PaLM-Chat-Bison-001",
- "Koala-13B",
- "GPT4All-13B-Snoozy",
- "Alpaca-13B",
- "ChatGLM-6B",
- "LLaMA-13B"
- ],
- "legendgroup": "Proprietary LLM",
- "marker": {
- "color": "#636efa",
- "size": 8,
- "symbol": "circle"
- },
- "mode": "markers",
- "name": "Proprietary LLM",
- "orientation": "v",
- "showlegend": true,
- "type": "scatter",
- "x": [
- "2024-04-09T00:00:00",
- "2023-11-06T00:00:00",
- "2024-02-29T00:00:00",
- "2024-04-09T00:00:00",
- "2024-01-25T00:00:00",
- "2024-02-01T00:00:00",
- "2024-02-29T00:00:00",
- "2024-03-14T00:00:00",
- "2024-03-07T00:00:00",
- "2023-06-13T00:00:00",
- "2024-02-24T00:00:00",
- "2024-02-26T00:00:00",
- "2023-03-14T00:00:00",
- "2023-12-11T00:00:00",
- "2024-02-26T00:00:00",
- "2023-12-13T00:00:00",
- "2023-07-11T00:00:00",
- "2024-02-17T00:00:00",
- "2023-06-13T00:00:00",
- "2023-11-21T00:00:00",
- "2023-12-13T00:00:00",
- "2023-03-14T00:00:00",
- "2024-03-14T00:00:00",
- "2024-01-25T00:00:00",
- "2023-06-21T00:00:00",
- "2023-11-29T00:00:00",
- "2023-11-06T00:00:00",
- "2023-11-29T00:00:00",
- "2023-05-22T00:00:00",
- "2023-07-10T00:00:00",
- "2023-04-03T00:00:00",
- "2023-04-24T00:00:00",
- "2023-03-13T00:00:00",
- "2023-03-13T00:00:00",
- "2023-02-27T00:00:00"
- ],
- "xaxis": "x",
- "y": [
- 1258.8152791324715,
- 1252.6848856241577,
- 1250.9262064295565,
- 1249.6183945401244,
- 1246.7775913509702,
- 1208.7128773784577,
- 1201.2654981955752,
- 1189.557977031121,
- 1180.8870022256567,
- 1165.279013874706,
- 1157.2129636222178,
- 1153.368015144387,
- 1150.6246111849628,
- 1148.003325470259,
- 1147.136619289767,
- 1135.7254379948201,
- 1132.3083987521873,
- 1126.6887059695398,
- 1119.8996424050451,
- 1119.0708879096221,
- 1115.3213731540973,
- 1110.3806845414053,
- 1108.9125926100855,
- 1107.1298100300314,
- 1093.8870113925889,
- 1075.4285458870645,
- 1072.711340370162,
- 1043.3909111518306,
- 1034.3952377983876,
- 1009.7116452193085,
- 969.48148016344,
- 938.8924300511185,
- 908.0843590844727,
- 886.8734292498528,
- 804.3563285706291
- ],
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "Meta",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Cohere",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Meta",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Alibaba",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Cohere",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Mistral",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Alibaba",
- "Open LLM",
- "Open LLM"
- ],
- [
- "HuggingFace",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Nexusflow",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Alibaba",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Mistral",
- "Open LLM",
- "Open LLM"
- ],
- [
- "01 AI",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Microsoft",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Databricks",
- "Open LLM",
- "Open LLM"
- ],
- [
- "AllenAI/UW",
- "Open LLM",
- "Open LLM"
- ],
- [
- "OpenChat",
- "Open LLM",
- "Open LLM"
- ],
- [
- "UC Berkeley",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Meta",
- "Open LLM",
- "Open LLM"
- ],
- [
- "NousResearch",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Google",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Nvidia",
- "Open LLM",
- "Open LLM"
- ],
- [
- "DeepSeek AI",
- "Open LLM",
- "Open LLM"
- ],
- [
- "OpenChat",
- "Open LLM",
- "Open LLM"
- ],
- [
- "NousResearch",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Alibaba",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Mistral",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Cognitive Computations",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Upstage AI",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Microsoft",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Meta",
- "Open LLM",
- "Open LLM"
- ],
- [
- "HuggingFace",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Microsoft",
- "Open LLM",
- "Open LLM"
- ],
- [
- "LMSYS",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Meta",
- "Open LLM",
- "Open LLM"
- ],
- [
- "MosaicML",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Meta",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Google",
- "Open LLM",
- "Open LLM"
- ],
- [
- "HuggingFace",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Meta",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Alibaba",
- "Open LLM",
- "Open LLM"
- ],
- [
- "TII",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Together AI",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Allen AI",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Google",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Mistral",
- "Open LLM",
- "Open LLM"
- ],
- [
- "LMSYS",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Alibaba",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Google",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Tsinghua",
- "Open LLM",
- "Open LLM"
- ],
- [
- "MosaicML",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Tsinghua",
- "Open LLM",
- "Open LLM"
- ],
- [
- "RWKV",
- "Open LLM",
- "Open LLM"
- ],
- [
- "OpenAssistant",
- "Open LLM",
- "Open LLM"
- ],
- [
- "LMSYS",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Stability AI",
- "Open LLM",
- "Open LLM"
- ],
- [
- "Databricks",
- "Open LLM",
- "Open LLM"
- ]
- ],
- "hovertemplate": "%{hovertext}
license_binary=%{customdata[2]}
Release Date=%{x}
Arena ELO=%{y}
Organization=%{customdata[0]}
License=%{customdata[1]}",
- "hovertext": [
- "Llama-3-70b-Instruct",
- "Command R+",
- "Llama-3-8b-Instruct",
- "Qwen1.5-72B-Chat",
- "Command R",
- "Mixtral-8x22b-Instruct-v0.1",
- "Qwen1.5-32B-Chat",
- "Zephyr-ORPO-141b-A35b-v0.1",
- "Starling-LM-7B-beta",
- "Qwen1.5-14B-Chat",
- "Mixtral-8x7b-Instruct-v0.1",
- "Yi-34B-Chat",
- "WizardLM-70B-v1.0",
- "DBRX-Instruct-Preview",
- "Tulu-2-DPO-70B",
- "OpenChat-3.5-0106",
- "Starling-LM-7B-alpha",
- "Llama-2-70b-chat",
- "Nous-Hermes-2-Mixtral-8x7B-DPO",
- "Gemma-1.1-7B-it",
- "NV-Llama2-70B-SteerLM-Chat",
- "DeepSeek-LLM-67B-Chat",
- "OpenChat-3.5",
- "OpenHermes-2.5-Mistral-7b",
- "Qwen1.5-7B-Chat",
- "Mistral-7B-Instruct-v0.2",
- "Dolphin-2.2.1-Mistral-7B",
- "SOLAR-10.7B-Instruct-v1.0",
- "WizardLM-13b-v1.2",
- "Llama-2-13b-chat",
- "Zephyr-7b-beta",
- "Phi-3-Mini-128k-Instruct",
- "Vicuna-13B",
- "CodeLlama-70B-instruct",
- "MPT-30B-chat",
- "CodeLlama-34B-instruct",
- "Gemma-7B-it",
- "Zephyr-7b-alpha",
- "Llama-2-7b-chat",
- "Qwen-14B-Chat",
- "falcon-180b-chat",
- "StripedHyena-Nous-7B",
- "OLMo-7B-instruct",
- "Gemma-1.1-2B-it",
- "Mistral-7B-Instruct-v0.1",
- "Vicuna-7B",
- "Qwen1.5-4B-Chat",
- "Gemma-2B-it",
- "ChatGLM3-6B",
- "MPT-7B-Chat",
- "ChatGLM2-6B",
- "RWKV-4-Raven-14B",
- "OpenAssistant-Pythia-12B",
- "FastChat-T5-3B",
- "StableLM-Tuned-Alpha-7B",
- "Dolly-V2-12B"
- ],
- "legendgroup": "Open LLM",
- "marker": {
- "color": "#EF553B",
- "size": 8,
- "symbol": "circle"
- },
- "mode": "markers",
- "name": "Open LLM",
- "orientation": "v",
- "showlegend": true,
- "type": "scatter",
- "x": [
- "2024-04-18T00:00:00",
- "2024-04-04T00:00:00",
- "2024-04-18T00:00:00",
- "2024-02-04T00:00:00",
- "2024-03-11T00:00:00",
- "2024-04-17T00:00:00",
- "2024-02-04T00:00:00",
- "2024-04-12T00:00:00",
- "2024-03-20T00:00:00",
- "2024-02-04T00:00:00",
- "2023-12-11T00:00:00",
- "2024-01-23T00:00:00",
- "2023-08-09T00:00:00",
- "2024-03-27T00:00:00",
- "2023-11-12T00:00:00",
- "2024-01-06T00:00:00",
- "2023-11-25T00:00:00",
- "2023-07-18T00:00:00",
- "2024-01-13T00:00:00",
- "2024-04-09T00:00:00",
- "2023-11-24T00:00:00",
- "2023-11-29T00:00:00",
- "2023-11-16T00:00:00",
- "2023-10-29T00:00:00",
- "2024-02-04T00:00:00",
- "2023-12-11T00:00:00",
- "2023-10-30T00:00:00",
- "2023-12-13T00:00:00",
- "2023-07-25T00:00:00",
- "2023-07-18T00:00:00",
- "2023-10-26T00:00:00",
- "2024-04-23T00:00:00",
- "2023-07-23T00:00:00",
- "2024-01-29T00:00:00",
- "2023-06-09T00:00:00",
- "2023-08-24T00:00:00",
- "2024-02-21T00:00:00",
- "2023-10-09T00:00:00",
- "2023-07-18T00:00:00",
- "2023-09-24T00:00:00",
- "2023-09-05T00:00:00",
- "2023-12-07T00:00:00",
- "2024-02-23T00:00:00",
- "2024-04-09T00:00:00",
- "2023-09-27T00:00:00",
- "2023-07-29T00:00:00",
- "2024-02-04T00:00:00",
- "2024-02-21T00:00:00",
- "2023-10-25T00:00:00",
- "2023-05-04T00:00:00",
- "2023-06-25T00:00:00",
- "2023-05-22T00:00:00",
- "2023-04-03T00:00:00",
- "2023-04-27T00:00:00",
- "2023-04-19T00:00:00",
- "2023-04-12T00:00:00"
- ],
- "xaxis": "x",
- "y": [
- 1209.6462958943152,
- 1190.5291640364956,
- 1152.500938092916,
- 1152.485612667822,
- 1147.8966494489798,
- 1145.8123271934626,
- 1133.8011394014864,
- 1128.8163366984966,
- 1118.5178781177128,
- 1118.475700517794,
- 1114,
- 1111.1326399460543,
- 1108.552744333791,
- 1103.2167069462541,
- 1102.79428840509,
- 1098.527455141752,
- 1091.5210240331344,
- 1088.7078065720734,
- 1087.307757938674,
- 1082.9619916739105,
- 1082.4713591517852,
- 1079.7362777221456,
- 1078.6663284631356,
- 1078.6429577216027,
- 1076.5321247427814,
- 1074.0655548845186,
- 1065.574858796917,
- 1065.0611191304033,
- 1061.9003873957429,
- 1056.9265912995625,
- 1054.4162995844372,
- 1050.1481252382014,
- 1047.9555279582555,
- 1047.927687897156,
- 1047.823066613369,
- 1047.396876459045,
- 1043.5443043467913,
- 1043.0842673002462,
- 1040.7537596503887,
- 1038.586932982431,
- 1037.076380506833,
- 1023.112092466059,
- 1020.7569311460566,
- 1014.832737666584,
- 1012.1048679697501,
- 1009.3834445358582,
- 1002.744713564041,
- 999.6431193544297,
- 960.7895509564338,
- 933.340871331175,
- 933.3372880828122,
- 928.4512512366093,
- 900.2948677134343,
- 876.9291083582452,
- 848.9325675003323,
- 826.6473317994165
- ],
- "yaxis": "y"
- }
- ],
- "layout": {
- "legend": {
- "title": {
- "text": "license_binary"
- },
- "tracegroupgap": 0
- },
- "template": {
- "data": {
- "bar": [
- {
- "error_x": {
- "color": "#2a3f5f"
- },
- "error_y": {
- "color": "#2a3f5f"
- },
- "marker": {
- "line": {
- "color": "white",
- "width": 0.5
- },
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "bar"
- }
- ],
- "barpolar": [
- {
- "marker": {
- "line": {
- "color": "white",
- "width": 0.5
- },
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "barpolar"
- }
- ],
- "carpet": [
- {
- "aaxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "#C8D4E3",
- "linecolor": "#C8D4E3",
- "minorgridcolor": "#C8D4E3",
- "startlinecolor": "#2a3f5f"
- },
- "baxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "#C8D4E3",
- "linecolor": "#C8D4E3",
- "minorgridcolor": "#C8D4E3",
- "startlinecolor": "#2a3f5f"
- },
- "type": "carpet"
- }
- ],
- "choropleth": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "choropleth"
- }
- ],
- "contour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "contour"
- }
- ],
- "contourcarpet": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "contourcarpet"
- }
- ],
- "heatmap": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmap"
- }
- ],
- "heatmapgl": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmapgl"
- }
- ],
- "histogram": [
- {
- "marker": {
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "histogram"
- }
- ],
- "histogram2d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2d"
- }
- ],
- "histogram2dcontour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2dcontour"
- }
- ],
- "mesh3d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "mesh3d"
- }
- ],
- "parcoords": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "parcoords"
- }
- ],
- "pie": [
- {
- "automargin": true,
- "type": "pie"
- }
- ],
- "scatter": [
- {
- "fillpattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- },
- "type": "scatter"
- }
- ],
- "scatter3d": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatter3d"
- }
- ],
- "scattercarpet": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattercarpet"
- }
- ],
- "scattergeo": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergeo"
- }
- ],
- "scattergl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergl"
- }
- ],
- "scattermapbox": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattermapbox"
- }
- ],
- "scatterpolar": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolar"
- }
- ],
- "scatterpolargl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolargl"
- }
- ],
- "scatterternary": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterternary"
- }
- ],
- "surface": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "surface"
- }
- ],
- "table": [
- {
- "cells": {
- "fill": {
- "color": "#EBF0F8"
- },
- "line": {
- "color": "white"
- }
- },
- "header": {
- "fill": {
- "color": "#C8D4E3"
- },
- "line": {
- "color": "white"
- }
- },
- "type": "table"
- }
- ]
- },
- "layout": {
- "annotationdefaults": {
- "arrowcolor": "#2a3f5f",
- "arrowhead": 0,
- "arrowwidth": 1
- },
- "autotypenumbers": "strict",
- "coloraxis": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "colorscale": {
- "diverging": [
- [
- 0,
- "#8e0152"
- ],
- [
- 0.1,
- "#c51b7d"
- ],
- [
- 0.2,
- "#de77ae"
- ],
- [
- 0.3,
- "#f1b6da"
- ],
- [
- 0.4,
- "#fde0ef"
- ],
- [
- 0.5,
- "#f7f7f7"
- ],
- [
- 0.6,
- "#e6f5d0"
- ],
- [
- 0.7,
- "#b8e186"
- ],
- [
- 0.8,
- "#7fbc41"
- ],
- [
- 0.9,
- "#4d9221"
- ],
- [
- 1,
- "#276419"
- ]
- ],
- "sequential": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "sequentialminus": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ]
- },
- "colorway": [
- "#636efa",
- "#EF553B",
- "#00cc96",
- "#ab63fa",
- "#FFA15A",
- "#19d3f3",
- "#FF6692",
- "#B6E880",
- "#FF97FF",
- "#FECB52"
- ],
- "font": {
- "color": "#2a3f5f"
- },
- "geo": {
- "bgcolor": "white",
- "lakecolor": "white",
- "landcolor": "white",
- "showlakes": true,
- "showland": true,
- "subunitcolor": "#C8D4E3"
- },
- "hoverlabel": {
- "align": "left"
- },
- "hovermode": "closest",
- "mapbox": {
- "style": "light"
- },
- "paper_bgcolor": "white",
- "plot_bgcolor": "white",
- "polar": {
- "angularaxis": {
- "gridcolor": "#EBF0F8",
- "linecolor": "#EBF0F8",
- "ticks": ""
- },
- "bgcolor": "white",
- "radialaxis": {
- "gridcolor": "#EBF0F8",
- "linecolor": "#EBF0F8",
- "ticks": ""
- }
- },
- "scene": {
- "xaxis": {
- "backgroundcolor": "white",
- "gridcolor": "#DFE8F3",
- "gridwidth": 2,
- "linecolor": "#EBF0F8",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "#EBF0F8"
- },
- "yaxis": {
- "backgroundcolor": "white",
- "gridcolor": "#DFE8F3",
- "gridwidth": 2,
- "linecolor": "#EBF0F8",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "#EBF0F8"
- },
- "zaxis": {
- "backgroundcolor": "white",
- "gridcolor": "#DFE8F3",
- "gridwidth": 2,
- "linecolor": "#EBF0F8",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "#EBF0F8"
- }
- },
- "shapedefaults": {
- "line": {
- "color": "#2a3f5f"
- }
- },
- "ternary": {
- "aaxis": {
- "gridcolor": "#DFE8F3",
- "linecolor": "#A2B1C6",
- "ticks": ""
- },
- "baxis": {
- "gridcolor": "#DFE8F3",
- "linecolor": "#A2B1C6",
- "ticks": ""
- },
- "bgcolor": "white",
- "caxis": {
- "gridcolor": "#DFE8F3",
- "linecolor": "#A2B1C6",
- "ticks": ""
- }
- },
- "title": {
- "x": 0.05
- },
- "xaxis": {
- "automargin": true,
- "gridcolor": "#EBF0F8",
- "linecolor": "#EBF0F8",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "#EBF0F8",
- "zerolinewidth": 2
- },
- "yaxis": {
- "automargin": true,
- "gridcolor": "#EBF0F8",
- "linecolor": "#EBF0F8",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "#EBF0F8",
- "zerolinewidth": 2
- }
- }
- },
- "title": {
- "text": "Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)"
- },
- "xaxis": {
- "anchor": "y",
- "domain": [
- 0,
- 1
- ],
- "title": {
- "text": "Release Date"
- }
- },
- "yaxis": {
- "anchor": "x",
- "domain": [
- 0,
- 1
- ],
- "title": {
- "text": "Arena ELO"
- }
- }
- }
- }
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "import plotly.express as px\n",
- "import plotly.graph_objects as go\n",
- "\n",
- "# Scatter of Arena rating against release date, one marker per row of df,\n",
- "# coloured by the two-way licence class held in license_binary.\n",
- "fig = px.scatter(\n",
- "    data_frame=df,\n",
- "    x=\"Release Date\",\n",
- "    y=\"rating\",\n",
- "    color=\"license_binary\",\n",
- "    hover_name=\"Model\",\n",
- "    # Extra fields listed in the hover tooltip, in this order.\n",
- "    hover_data=[\"Release Date\", \"Organization\", \"License\", \"license_binary\"],\n",
- "    labels={\"rating\": \"Arena ELO\", \"Release Date\": \"Release Date\"},\n",
- "    title=\"Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)\",\n",
- "    template=\"plotly_white\",\n",
- ")\n",
- "\n",
- "# Use the same marker size for every trace.\n",
- "fig.update_traces(marker_size=8)\n",
- "\n",
- "# Render the interactive figure in the notebook.\n",
- "fig.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "plotly.graph_objs._figure.Figure"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "type(fig)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}