Commit 8163dc5 · Initial commit

Files changed:
- .gitignore +2 -0
- IFEval.csv +5 -0
- README.md +13 -0
- app.py +139 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
venv
.idea
IFEval.csv
ADDED
@@ -0,0 +1,5 @@
Model,Parameters (B),Repo,Quantization,Final Score,Strict Prompt Score,Strict Inst Score,Loose Prompt Score,Loose Inst Score,Link
Llama 3 8B,8,failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF,Q8_0,0.7589,0.7001,0.7818,0.7394,0.8141,https://huggingface.co/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF
Llama 3 8B,8,MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF,Q8_0,0.7366,0.6765,0.7614,0.7172,0.7914,https://huggingface.co/MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF
Mistral 7B v0.3,7,MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF,Q8_0,0.5689,0.4972,0.5983,0.5397,0.6403,https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF
Phi 3 Medium 4K,14,bartowski/Phi-3-medium-4k-instruct-GGUF,Q8_0,0.5689,0.4972,0.5983,0.5397,0.6403,https://huggingface.co/bartowski/Phi-3-medium-4k-instruct-GGUF
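
The CSV doubles as the app's data store: a header row, then one row per evaluated model, with the five score columns as floats in [0, 1]. A minimal sketch for inspecting it outside the app (assuming pandas, which app.py already requires):

import pandas as pd

# Preview the leaderboard data, ranked by Final Score.
df = pd.read_csv("IFEval.csv")
print(df.sort_values("Final Score", ascending=False)[["Model", "Quantization", "Final Score"]])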
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: IFEval Leaderboard
emoji: π
colorFrom: green
colorTo: indigo
sdk: gradio
sdk_version: 4.15.0
app_file: app.py
pinned: false
license: mit
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
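
The front matter is standard Spaces config: the Hub installs Gradio 4.15.0 and serves app.py. A hypothetical sketch for reading that config yourself (assumes PyYAML, which is not part of this commit):

import yaml

# The front matter sits between the first two "---" markers of README.md.
with open("README.md") as f:
    _, front_matter, _ = f.read().split("---", 2)
config = yaml.safe_load(front_matter)
print(config["sdk"], config["sdk_version"], config["app_file"])  # gradio 4.15.0 app.py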
app.py
ADDED
@@ -0,0 +1,139 @@
import argparse

import gradio as gr
import pandas as pd
+
def model_hyperlink(model_name, link):
|
| 10 |
+
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 11 |
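
# For example, model_hyperlink("MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF",
# "https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF") returns an
# <a> tag that the "markdown" column of the gr.Dataframe below renders as a link.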
+
def load_leaderboard_table_csv(filename, add_hyperlink=True):
|
| 12 |
+
lines = open(filename).readlines()
|
| 13 |
+
heads = [v.strip() for v in lines[0].split(",")]
|
| 14 |
+
rows = []
|
| 15 |
+
for i in range(1, len(lines)):
|
| 16 |
+
row = [v.strip() for v in lines[i].split(",")]
|
| 17 |
+
for j in range(len(heads)):
|
| 18 |
+
item = {}
|
| 19 |
+
for h, v in zip(heads, row):
|
| 20 |
+
if "Score" in h:
|
| 21 |
+
item[h] = float(v)
|
| 22 |
+
elif h != "Model" and h != "Parameters (B)" and h != "Repo" and h != "Quantization" and h != "Link":
|
| 23 |
+
item[h] = int(v)
|
| 24 |
+
else:
|
| 25 |
+
item[h] = v
|
| 26 |
+
if add_hyperlink:
|
| 27 |
+
item["Repo"] = model_hyperlink(item["Repo"], item["Link"])
|
| 28 |
+
rows.append(item)
|
| 29 |
+
return rows
|
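
# Each parsed row is a dict keyed by the CSV header, e.g. for the first data row:
# {"Model": "Llama 3 8B", "Parameters (B)": "8", "Repo": "<a ...>", "Quantization": "Q8_0",
#  "Final Score": 0.7589, ..., "Link": "https://huggingface.co/failspy/..."}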


def get_arena_table(model_table_df):
    # Sort by Final Score, best model first; rows are then read positionally.
    model_table_df = model_table_df.sort_values(by=["Final Score"], ascending=False)
    values = []
    for i in range(len(model_table_df)):
        # rank, then the leaderboard columns in display order
        row = [i + 1]
        for col in [
            "Model",
            "Parameters (B)",
            "Repo",
            "Quantization",
            "Final Score",
            "Strict Prompt Score",
            "Strict Inst Score",
            "Loose Prompt Score",
            "Loose Inst Score",
        ]:
            row.append(model_table_df[col].values[i])
        values.append(row)
    return values
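
# get_arena_table returns one list per model in rank order, e.g.
# [1, "Llama 3 8B", "8", "<a ...>", "Q8_0", 0.7589, 0.7001, 0.7818, 0.7394, 0.8141]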


def build_leaderboard_tab(leaderboard_table_file, show_plot=False):
    if leaderboard_table_file:
        data = load_leaderboard_table_csv(leaderboard_table_file)
        model_table_df = pd.DataFrame(data)
        md_head = """
# π IFEval Leaderboard
"""
        gr.Markdown(md_head, elem_id="leaderboard_markdown")
        with gr.Tabs() as tabs:
            # leaderboard table
            arena_table_vals = get_arena_table(model_table_df)
            with gr.Tab("IFEval", id=0):
                md = "A leaderboard of various Large Language Models, measured with the IFEval benchmark.\n\n[IFEval](https://github.com/google-research/google-research/tree/master/instruction_following_eval) is a straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set of \"verifiable instructions\" such as \"write in more than 400 words\" and \"mention the keyword of AI at least 3 times\". We identified 25 types of those verifiable instructions and constructed around 500 prompts, with each prompt containing one or more verifiable instructions.\n\nTests were run with `lm-evaluation-harness`. Raw results can be found in the `results` directory. Made by [Kristian Polso](https://polso.info)."
                gr.Markdown(md, elem_id="leaderboard_markdown")
                gr.Dataframe(
                    headers=[
                        "Rank",
                        "Model",
                        "Parameters (B)",
                        "Repo",
                        "Quantization",
                        "Final Score",
                        "Strict Prompt Score",
                        "Strict Inst Score",
                        "Loose Prompt Score",
                        "Loose Inst Score",
                    ],
                    datatype=[
                        "number",
                        "str",
                        "number",
                        "markdown",
                        "str",
                        "number",
                        "number",
                        "number",
                        "number",
                        "number",
                    ],
                    value=arena_table_vals,
                    elem_id="arena_leaderboard_dataframe",
                    height=700,
                    column_widths=[50, 150, 100, 150, 100, 100, 100, 100, 100, 100],
                    wrap=True,
                )


def build_demo(leaderboard_table_file):
    text_size = gr.themes.sizes.text_lg

    with gr.Blocks(
        title="IFEval Leaderboard",
        theme=gr.themes.Base(text_size=text_size),
    ) as demo:
        build_leaderboard_tab(leaderboard_table_file, show_plot=True)
    return demo


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--IFEval_file", type=str, default="./IFEval.csv")
    args = parser.parse_args()

    demo = build_demo(args.IFEval_file)
    # Pass --share through so a public Gradio link can be requested from the CLI.
    demo.launch(share=args.share)
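
A design note on the loader: `load_leaderboard_table_csv` splits rows on raw commas, which only holds while no field itself contains a comma. A hypothetical pandas-based alternative (not part of this commit; `load_leaderboard_table_pandas` is an illustrative name) that produces a near-equivalent frame directly:

import pandas as pd

def load_leaderboard_table_pandas(filename, add_hyperlink=True):
    # read_csv infers numeric dtypes for the score columns on its own.
    df = pd.read_csv(filename)
    if add_hyperlink:
        df["Repo"] = [
            model_hyperlink(repo, link)
            for repo, link in zip(df["Repo"], df["Link"])
        ]
    return df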