Spaces:

Eurolingua
/

european-llm-leaderboard

Running

ajude committed on Sep 19, 2024

Commit

db1ab48

1 Parent(s): da6c970

feat(leaderboard): Added two features

1. Added a range slider for filtering models by their number of parameters.
2. Model names now link to their respective Hugging Face model pages.

Files changed (3) hide show

app.py +70 -43
core.py +28 -8
utils.py +193 -0

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 import core as core
 from style import CSS, LANG_SYMBOLS, MT_BENCH_LANG_SYMBOLS, T_SYMBOLS, TITLE
 demo = gr.Blocks(css=CSS)
 with demo:
@@ -28,17 +29,23 @@ with demo:
                         show_label=True,
                         elem_id="search-bar",
                     )
-                    model_types = gr.CheckboxGroup(
-                        label="Select model type",
-                        choices=[
-                            (
-                                f"Pretrained {T_SYMBOLS['pretrained']}",
-                                T_SYMBOLS["pretrained"],
-                            ),
-                            (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
-                        ],
-                        value=list(T_SYMBOLS.values()),
-                    )
                 with gr.Row():
                     langs_bar = gr.CheckboxGroup(
@@ -92,7 +99,9 @@ with demo:
                     inputs=[],
                     outputs=shown_tasks,
                 )
-            leaderboard_table = gr.Dataframe()
         with gr.TabItem(
             "🏅 LLM accuracy benchmark (Zero-Shot)",
@@ -107,17 +116,24 @@ with demo:
                         show_label=True,
                         elem_id="search-bar",
                     )
-                    model_types_zero_shot = gr.CheckboxGroup(
-                        label="Select model type",
-                        choices=[
-                            (
-                                f"Pretrained {T_SYMBOLS['pretrained']}",
-                                T_SYMBOLS["pretrained"],
-                            ),
-                            (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
-                        ],
-                        value=list(T_SYMBOLS.values()),
-                    )
                 with gr.Row():
                     langs_bar_zero_shot = gr.CheckboxGroup(
@@ -171,7 +187,7 @@ with demo:
                     inputs=[],
                     outputs=shown_tasks_zero_shot,
                 )
-            leaderboard_table_zero_shot = gr.Dataframe()
         with gr.TabItem(
             "🌐 LLM translation benchmark",
@@ -187,17 +203,23 @@ with demo:
                         elem_id="search-bar",
                     )
-                    model_types_misc = gr.CheckboxGroup(
-                        label="Select model type",
-                        choices=[
-                            (
-                                f"Pretrained {T_SYMBOLS['pretrained']}",
-                                T_SYMBOLS["pretrained"],
-                            ),
-                            (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
-                        ],
-                        value=list(T_SYMBOLS.values()),
-                    )
                 with gr.Row():
                     langs_bar_misc = gr.CheckboxGroup(
@@ -252,7 +274,7 @@ with demo:
                     outputs=shown_tasks_misc,
                 )
-            leaderboard_table_misc = gr.Dataframe()
         with gr.TabItem(
             "🌐 LLM MT-Bench benchmark",
@@ -295,17 +317,19 @@ with demo:
                         outputs=langs_bar_mtbench,
                     )
-            leaderboard_table_mtbench = gr.Dataframe(scale=5)
         for comp, fn in [
             (search_bar, "submit"),
             (langs_bar, "change"),
             (shown_tasks, "change"),
             (model_types, "change"),
         ]:
             getattr(comp, fn)(
                 core.update_df,
-                [shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
                 leaderboard_table,
             )
@@ -314,10 +338,11 @@ with demo:
             (model_types_zero_shot, "change"),
             (langs_bar_zero_shot, "change"),
             (shown_tasks_zero_shot, "change"),
         ]:
             getattr(comp, fn)(
                 core.update_df,
-                [shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot, model_types_zero_shot, gr.State(value=False)],
                 leaderboard_table_zero_shot,
             )
@@ -326,10 +351,11 @@ with demo:
             (langs_bar_misc, "change"),
             (shown_tasks_misc, "change"),
             (model_types_misc, "change"),
         ]:
             getattr(comp, fn)(
                 core.update_df,
-                [shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, gr.State(value=False)],
                 leaderboard_table_misc,
             )
@@ -346,21 +372,22 @@ with demo:
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
-        inputs=[shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
         outputs=leaderboard_table,
     )
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
-        inputs=[shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot, model_types_zero_shot, gr.State(value=False)],
         outputs=leaderboard_table_zero_shot,
     )
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
-        inputs=[shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, gr.State(value=False)],
         outputs=leaderboard_table_misc,
     )

 import core as core
 from style import CSS, LANG_SYMBOLS, MT_BENCH_LANG_SYMBOLS, T_SYMBOLS, TITLE
+from gradio_rangeslider import RangeSlider
 demo = gr.Blocks(css=CSS)
 with demo:
                         show_label=True,
                         elem_id="search-bar",
                     )
+                with gr.Row():
+                    with gr.Column():
+                        model_types = gr.CheckboxGroup(
+                            label="Select model type",
+                            choices=[
+                                (
+                                    f"Pretrained {T_SYMBOLS['pretrained']}",
+                                    T_SYMBOLS["pretrained"],
+                                ),
+                                (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
+                            ],
+                            value=list(T_SYMBOLS.values()),
+                        )
+                    with gr.Column():
+                        model_sizes = RangeSlider(minimum=0,maximum=150,value=(7, 10),label="Select the number of parameters (B)")
                 with gr.Row():
                     langs_bar = gr.CheckboxGroup(
                     inputs=[],
                     outputs=shown_tasks,
                 )
+            # TODO: When markdown is used as the datatype of the model_name column, the text overflows into the next column.
+            # leaderboard_table = gr.Dataframe(datatype=['str', 'markdown'])
+            leaderboard_table = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"], wrap=False)
         with gr.TabItem(
             "🏅 LLM accuracy benchmark (Zero-Shot)",
                         show_label=True,
                         elem_id="search-bar",
                     )
+                with gr.Row():
+                    with gr.Column():
+                        model_types_zero_shot = gr.CheckboxGroup(
+                            label="Select model type",
+                            choices=[
+                                (
+                                    f"Pretrained {T_SYMBOLS['pretrained']}",
+                                    T_SYMBOLS["pretrained"],
+                                ),
+                                (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
+                            ],
+                            value=list(T_SYMBOLS.values()),
+                        )
+                    with gr.Column():
+                        model_sizes_zero_shot = RangeSlider(minimum=0, maximum=150, value=(7, 10),
+                                                  label="Select the number of parameters (B)")
                 with gr.Row():
                     langs_bar_zero_shot = gr.CheckboxGroup(
                     inputs=[],
                     outputs=shown_tasks_zero_shot,
                 )
+            leaderboard_table_zero_shot = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"], wrap=False)
         with gr.TabItem(
             "🌐 LLM translation benchmark",
                         elem_id="search-bar",
                     )
+                with gr.Row():
+                    with gr.Column():
+                        model_types_misc = gr.CheckboxGroup(
+                            label="Select model type",
+                            choices=[
+                                (
+                                    f"Pretrained {T_SYMBOLS['pretrained']}",
+                                    T_SYMBOLS["pretrained"],
+                                ),
+                                (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
+                            ],
+                            value=list(T_SYMBOLS.values()),
+                        )
+                    with gr.Column():
+                        model_sizes_misc = RangeSlider(minimum=0, maximum=150, value=(7, 10),
+                                                            label="Select the number of parameters (B)")
                 with gr.Row():
                     langs_bar_misc = gr.CheckboxGroup(
                     outputs=shown_tasks_misc,
                 )
+            leaderboard_table_misc = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "30%"], wrap=False)
         with gr.TabItem(
             "🌐 LLM MT-Bench benchmark",
                         outputs=langs_bar_mtbench,
                     )
+            leaderboard_table_mtbench = gr.Dataframe(datatype=["str", "markdown"], column_widths=[None, "60%"], wrap=False)
         for comp, fn in [
             (search_bar, "submit"),
             (langs_bar, "change"),
             (shown_tasks, "change"),
             (model_types, "change"),
+            (model_sizes, "change"),
         ]:
             getattr(comp, fn)(
                 core.update_df,
+                [shown_tasks, search_bar, langs_bar, model_types, model_sizes, gr.State(value=True)],
+                # [shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
                 leaderboard_table,
             )
             (model_types_zero_shot, "change"),
             (langs_bar_zero_shot, "change"),
             (shown_tasks_zero_shot, "change"),
+            (model_sizes_zero_shot, "change")
         ]:
             getattr(comp, fn)(
                 core.update_df,
+                [shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot, model_types_zero_shot, model_sizes_zero_shot, gr.State(value=False)],
                 leaderboard_table_zero_shot,
             )
             (langs_bar_misc, "change"),
             (shown_tasks_misc, "change"),
             (model_types_misc, "change"),
+            (model_sizes_misc, "change"),
         ]:
             getattr(comp, fn)(
                 core.update_df,
+                [shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, model_sizes_misc, gr.State(value=False)],
                 leaderboard_table_misc,
             )
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
+        inputs=[shown_tasks, search_bar, langs_bar, model_types, model_sizes, gr.State(value=True)],
+        # inputs=[shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
         outputs=leaderboard_table,
     )
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
+        inputs=[shown_tasks_zero_shot, search_bar_zero_shot, langs_bar_zero_shot, model_types_zero_shot, model_sizes_zero_shot, gr.State(value=False)],
         outputs=leaderboard_table_zero_shot,
     )
     gr.Blocks.load(
         block=demo,
         fn=core.update_df,
+        inputs=[shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, model_sizes_misc, gr.State(value=False)],
         outputs=leaderboard_table_misc,
     )

core.py CHANGED Viewed

@@ -4,6 +4,7 @@ import os
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
 import style
@@ -27,7 +28,8 @@ def init():
     task_groups_shots_df = hidden_df[hidden_df["Few_Shot"] == True][["Task_Group", "Number_Shots"]].drop_duplicates()
     task_groups_shots_dict = task_groups_shots_df.set_index("Task_Group")["Number_Shots"].to_dict()
     languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
-    mt_bench_language_list = hidden_df[hidden_df["Task_Group"] == "MTBENCH"]["Language"].drop_duplicates().str.upper().tolist()
     model_type_df = hidden_df[["Model_Name", "Model_Type"]].drop_duplicates()
     model_type_dict = model_type_df.set_index("Model_Name")["Model_Type"].to_dict()
@@ -41,8 +43,19 @@ def init():
     hidden_df["Type"] = hidden_df["Model_Name"].apply(lambda x: style.T_SYMBOLS[model_type_dict[x]])
 def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
     task_cols = get_task_columns(df)
     return df.reindex(["Type", "Model_Name", "Average"] + sorted(task_cols), axis=1)
@@ -97,12 +110,13 @@ def select_shots(df: pd.DataFrame, fewshot: bool = False):
 def update_df(
-    tasks: list[str],
-    model_query: str,
-    langs: list[str],
-    model_types: list[str],
-    fewshot: bool = False,
-    format: bool = True,
 ) -> pd.DataFrame:
     """Return a filtered dataframe according to selected models, tasks and
     languages. The format flag controls whether the output dataframe should
@@ -119,6 +133,11 @@ def update_df(
     df = search_model(df, model_query)
     df = filter_type(df, model_types)
     if format:
         return sort_cols(df, fewshot).style.format(precision=2, decimal=".", na_rep="N/A")
     else:
@@ -132,7 +151,8 @@ def get_selected_task_type(task_type_id):
 def get_available_task_groups(selected_task_type, fewshot):
-    task_groups = [task_group_name for task_group_name, task_type in task_group_type_dict.items() if task_type == selected_task_type]
     if fewshot:
         available_tasks = [c for c in task_groups if c not in ZERO_SHOT_ONLY]

 import numpy as np
 import pandas as pd
 from datasets import load_dataset
+from utils import model_hf_look_up_table_filter
 import style
     task_groups_shots_df = hidden_df[hidden_df["Few_Shot"] == True][["Task_Group", "Number_Shots"]].drop_duplicates()
     task_groups_shots_dict = task_groups_shots_df.set_index("Task_Group")["Number_Shots"].to_dict()
     languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
+    mt_bench_language_list = hidden_df[hidden_df["Task_Group"] == "MTBENCH"][
+        "Language"].drop_duplicates().str.upper().tolist()
     model_type_df = hidden_df[["Model_Name", "Model_Type"]].drop_duplicates()
     model_type_dict = model_type_df.set_index("Model_Name")["Model_Type"].to_dict()
     hidden_df["Type"] = hidden_df["Model_Name"].apply(lambda x: style.T_SYMBOLS[model_type_dict[x]])
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"> {model_name} </a>'
+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/" + model_hf_look_up_table_filter[model_name]['link']
+    return model_hyperlink(link, model_name)
 def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
     task_cols = get_task_columns(df)
+    df['Model_Name'] = df['Model_Name'].apply(
+        lambda x: make_clickable_model(x) if x in model_hf_look_up_table_filter else x)
     return df.reindex(["Type", "Model_Name", "Average"] + sorted(task_cols), axis=1)
 def update_df(
+        tasks: list[str],
+        model_query: str,
+        langs: list[str],
+        model_types: list[str],
+        model_sizes: list[str],
+        fewshot: bool = False,
+        format: bool = True,
 ) -> pd.DataFrame:
     """Return a filtered dataframe according to selected models, tasks and
     languages. The format flag controls whether the output dataframe should
     df = search_model(df, model_query)
     df = filter_type(df, model_types)
+    if model_sizes:
+        result = [key for key, value in model_hf_look_up_table_filter.items() if
+                  (value.get("model_size") >= model_sizes[0] and value.get("model_size") <= model_sizes[1])]
+        df = df[df['Model_Name'].isin(result)]
     if format:
         return sort_cols(df, fewshot).style.format(precision=2, decimal=".", na_rep="N/A")
     else:
 def get_available_task_groups(selected_task_type, fewshot):
+    task_groups = [task_group_name for task_group_name, task_type in task_group_type_dict.items() if
+                   task_type == selected_task_type]
     if fewshot:
         available_tasks = [c for c in task_groups if c not in ZERO_SHOT_ONLY]

utils.py ADDED Viewed

	@@ -0,0 +1,193 @@

+model_hf_look_up_table_filter = {
+    "Aya-23-8B": {
+        "link": "CohereForAI/aya-23-8B",
+        "model_size": 8
+    },
+    "Bloom-7b1": {
+        "link": "bigscience/bloom-7b1",
+        "model_size": 7,
+    },
+    "Bloomz-7b1": {
+        "link": "bigscience/bloomz-7b1",
+        "model_size": 7,
+    },
+    "Meta-Llama-2-7B": {
+        "link": "meta-llama/Llama-2-7b",
+        "model_size": 7,
+    },
+    "Gemma-7b": {
+        "link": "google/gemma-7b",
+        "model_size": 7,
+    },
+    "Gemma-1.1-7b-Instruct": {
+        "link": "google/gemma-1.1-7b-it",
+        "model_size": 7,
+    },
+    "Meta-Llama-3-8B": {
+        "link": "meta-llama/Meta-Llama-3-8B",
+        "model_size": 8
+    },
+    "Meta-Llama-3-8B-Instruct": {
+        "link": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "model_size": 8
+    },
+    "Mistral-7B-Instruct-v0.3": {
+        "link": "mistralai/Mistral-7B-Instruct-v0.3",
+        "model_size": 7
+    },
+    "Mistral-7B-Instruct-v0.1": {
+        "link": "mistralai/Mistral-7B-Instruct-v0.1",
+        "model_size": 7
+    },
+    "Mistral-7B-Instruct-v0.2": {
+        "link": "mistralai/Mistral-7B-Instruct-v0.2",
+        "model_size": 7
+    },
+    "Mistral-7B-v0.1": {
+        "link": "mistralai/Mistral-7B-v0.1",
+        "model_size": 7
+    },
+    "Mistral-7B-v0.3": {
+        "link": "mistralai/Mistral-7B-v0.3",
+        "model_size": 7
+    },
+    "Occiglot-7b-eu5": {
+        "link": "occiglot/occiglot-7b-eu5",
+        "model_size": 7
+    },
+    "Occiglot-7b-eu5-Instruct": {
+        "link": "occiglot/occiglot-7b-eu5-instruct",
+        "model_size": 7
+    },
+    "Phi-3-mini-4k-Instruct": {
+        "link": "microsoft/Phi-3-mini-4k-instruct",
+        "model_size": 3.8
+    },
+    "Qwen2-7B": {
+        "link": "Qwen/Qwen2-7B",
+        "model_size": 7
+    },
+    "Qwen2-7B-Instruct": {
+        "link": "Qwen/Qwen2-7B-Instruct",
+        "model_size": 7
+    },
+    "7B_24EU_2.5T_bactrianx17_bb_ckp1": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_2.5T_bactrianx5_bb_ckp1": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_2.5T_honey_ckp2701": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_2T_bactrianx17_bb_ckp2": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_2T_bactrianx5_bb_ckp2": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_2.86T_EP5_iter_0681300": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_2.86T_iter_0602100": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_1.45T_bactrianx17_ckp1": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_1.45T_bactrianx17_bb_ckp2": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_1.45T_bactrianx5_ckp1": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_1.65T_bactrianx17_ckp1": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_1.65T_bactrianx17_bb_ckp1": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_24EU_1.65T_bactrianx5_ckp1": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EN_200B_iter_0047683": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EQUAL_200B_iter_0046950": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_1.1T_iter_0236250": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_1.45T_iter_0346050": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_1.65T_iter_0393075": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_2.5T_DE_213B": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_2.5T_DE_262B": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_2.5T_iter_0602100": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_2T_iter_0477675": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_2T_iter_0477900": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_2T_iter_0478125": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_3T_oscar_iter_0715255": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_3T_fw_iter_0715255": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_fw_3T_honey_ckp1350": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_fw_3.1T_iter_0025875": {
+        "link": "",
+        "model_size": 7
+    },
+    "7B_EU24_1.1T_bactrianx_ckp2": {
+        "link": "",
+        "model_size": 7
+    },
+}