{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "from pathlib import Path\n", "\n", "import gradio as gr\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "def get_leaderboard_df():\n", " filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n", "\n", " # Parse filepaths to get unique models\n", " models = set()\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " model_revision = \"_\".join(path_parts[1:4])\n", " models.add(model_revision)\n", "\n", " # Initialize DataFrame\n", " df = pd.DataFrame(index=list(models))\n", "\n", " # Extract data from each file and populate the DataFrame\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " date = filepath.stem.split(\"_\")[-1][:-3].split(\"T\")[0]\n", " model_revision = \"_\".join(path_parts[1:4]) + \"_\" + date\n", " task = path_parts[4].capitalize()\n", " df.loc[model_revision, \"Date\"] = date\n", "\n", " with open(filepath, \"r\") as file:\n", " data = json.load(file)\n", " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n", " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n", " if task == \"truthfulqa\":\n", " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n", " else:\n", " first_metric_key = next(iter(data[\"results\"][first_result_key])) # gets the first key in the first result\n", " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n", " df.loc[model_revision, task] = value\n", " \n", " # Drop rows where every entry is NaN\n", " df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n", " df.insert(loc=1, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n", " df = df.sort_values(by=[\"Average\"], ascending=False)\n", " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(3)\n", " # Strip off date from model name\n", " df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n", " return df" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "df = get_leaderboard_df()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Model | \n", "Date | \n", "Average | \n", "Ifeval | \n", "Truthfulqa | \n", "Winogrande | \n", "Gsm8k | \n", "Mmlu | \n", "Hellaswag | \n", "Arc | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main | \n", "2024-03-02 | \n", "0.617 | \n", "0.553 | \n", "0.477 | \n", "0.785 | \n", "0.622 | \n", "0.51 | \n", "0.677 | \n", "0.698 | \n", "
1 | \n", "NousResearch_Nous-Hermes-2-Yi-34B_main | \n", "2024-03-04 | \n", "0.604 | \n", "NaN | \n", "0.439 | \n", "0.806 | \n", "NaN | \n", "0.48 | \n", "0.640 | \n", "0.654 | \n", "
2 | \n", "mistralai_Mixtral-8x7B-Instruct-v0.1_main | \n", "2024-03-02 | \n", "0.603 | \n", "0.497 | \n", "0.554 | \n", "0.736 | \n", "0.599 | \n", "0.43 | \n", "0.709 | \n", "0.698 | \n", "
3 | \n", "deepseek-ai_deepseek-llm-67b-chat_main | \n", "2024-03-04 | \n", "0.603 | \n", "NaN | \n", "0.395 | \n", "0.792 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.622 | \n", "
4 | \n", "deepseek-ai_deepseek-llm-67b-chat_main | \n", "2024-03-05 | \n", "0.585 | \n", "0.505 | \n", "NaN | \n", "NaN | \n", "0.761 | \n", "0.42 | \n", "0.654 | \n", "NaN | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
269 | \n", "HuggingFaceH4_starcoder2-15b-ift_v18.0 | \n", "2024-03-10 | \n", "0.089 | \n", "0.170 | \n", "NaN | \n", "NaN | \n", "0.008 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
270 | \n", "HuggingFaceH4_mistral-7b-ift_v49.0 | \n", "2024-03-07 | \n", "0.086 | \n", "0.172 | \n", "NaN | \n", "NaN | \n", "0.000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
271 | \n", "HuggingFaceH4_starchat-beta_main | \n", "2024-03-12 | \n", "0.079 | \n", "0.079 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
272 | \n", "HuggingFaceH4_starcoder2-15b-ift_v7.0 | \n", "2024-03-10 | \n", "0.070 | \n", "0.107 | \n", "NaN | \n", "NaN | \n", "0.032 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
273 | \n", "HuggingFaceH4_zephyr-7b-beta-ift_v1.1 | \n", "2024-03-13 | \n", "0.043 | \n", "0.087 | \n", "NaN | \n", "NaN | \n", "0.000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
274 rows × 10 columns
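  { "cell_type": "markdown", "metadata": {}, "source": [
    "A minimal sketch of how the resulting leaderboard DataFrame could be displayed with the `gradio` import above. The title and layout here are illustrative assumptions, not the actual app."
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Sketch only: render the leaderboard in a simple Gradio Blocks demo\n",
    "with gr.Blocks() as demo:\n",
    "    gr.Markdown(\"# Evaluation leaderboard\")\n",
    "    gr.Dataframe(value=df, interactive=False)\n",
    "\n",
    "# demo.launch()  # uncomment to serve the demo locally"
  ] },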
\n", "\n", " | Model | \n", "Average | \n", "Ifeval | \n", "Truthfulqa | \n", "Winogrande | \n", "Gsm8k | \n", "Mmlu | \n", "Hellaswag | \n", "Arc | \n", "
---|---|---|---|---|---|---|---|---|---|
50 | \n", "HuggingFaceH4_mistral-7b-ift_v48.56_2024-03-08 | \n", "0.49 | \n", "0.418 | \n", "0.359 | \n", "0.672 | \n", "0.453 | \n", "0.33 | \n", "0.656 | \n", "0.545 | \n", "
532 | \n", "HuggingFaceH4_mistral-7b-ift_v48.56 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "