import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

DESCRIPTION = f"""
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""

BENCHMARKS_TO_SKIP = ["math", "mini_math"]
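
# A minimal sketch of the inputs the parser below assumes (the exact layout is not documented in
# this file): each results file is expected to live at a path like
#
#   eval_results/{org}/{model}/{revision}/{task}/..._<timestamp>.json
#
# so that path_parts[1:4] yields the org/model/revision triple, path_parts[4] the task name, and
# the file stem ends in an ISO-like timestamp from which the date is extracted. Each JSON file is
# expected to contain a "results" mapping of result keys to metric dicts, for example:
#
#   {"results": {"custom|ifeval|0": {"prompt_level_loose_acc": 0.42}}}
#
# The "custom|ifeval|0" key and the 0.42 value are illustrative placeholders, not real data.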
def get_leaderboard_df(merge_values: bool = True):
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get unique models
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        date = filepath.stem.split("_")[-1][:-3].split("T")[0]
        model_revision = "_".join(path_parts[1:4]) + "_" + date
        task = path_parts[4].capitalize()
        df.loc[model_revision, "Date"] = date

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
            # Skip benchmarks that we don't want to include in the leaderboard
            if task.lower() in BENCHMARKS_TO_SKIP:
                continue
            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
            if task.lower() == "truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            # IFEval has several metrics but we report just the prompt-loose-acc one
            elif task.lower() == "ifeval":
                value = data["results"][first_result_key]["prompt_level_loose_acc"]
            # MMLU has several metrics but we report just the average one
            elif task.lower() == "mmlu":
                value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
            # HellaSwag and ARC report acc_norm
            elif task.lower() in ["hellaswag", "arc"]:
                value = data["results"][first_result_key]["acc_norm"]
            # BBH has several metrics but we report just the average one
            elif task.lower() == "bbh":
                if "all" in data["results"]:
                    value = data["results"]["all"]["acc"]
                else:
                    value = -100
            # AGIEval reports acc_norm
            elif task.lower() == "agieval":
                value = data["results"]["all"]["acc_norm"]
            # MATH reports qem
            elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
                value = data["results"]["all"]["qem"]
            else:
                first_metric_key = next(
                    iter(data["results"][first_result_key])
                )  # gets the first key in the first result
                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric

            # For mini_math we report 5 metrics, one for each level, and store each one as a separate column in the dataframe
            if task.lower() in ["mini_math_v2"]:
                for k, v in data["results"].items():
                    if k != "all":
                        level = k.split("|")[1].split(":")[-1]
                        value = v["qem"]
                        df.loc[model_revision, f"{task}_{level}"] = value
            # For AlpacaEval we report the base win rate and the length-controlled one
            elif task.lower() == "alpaca_eval":
                value = data["results"][first_result_key]["win_rate"]
                df.loc[model_revision, "Alpaca_eval"] = value / 100.0
                value = data["results"][first_result_key]["length_controlled_winrate"]
                df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
            else:
                df.loc[model_revision, task] = value

    # Put IFEval / BBH / AGIEval / AlpacaEval in first columns
    alpaca_col = df.pop("Alpaca_eval")
    df.insert(1, "Alpaca_eval", alpaca_col)
    alpaca_col = df.pop("Alpaca_eval_lc")
    df.insert(2, "Alpaca_eval_lc", alpaca_col)
    ifeval_col = df.pop("Ifeval")
    df.insert(3, "Ifeval", ifeval_col)
    bbh_col = df.pop("Bbh")
    df.insert(4, "Bbh", bbh_col)
    agieval_col = df.pop("Agieval")
    df.insert(5, "Agieval", agieval_col)
    gsm8k_col = df.pop("Gsm8k")
    df.insert(6, "Gsm8k", gsm8k_col)
    mmlu_col = df.pop("Mmlu")
    df.insert(7, "Mmlu", mmlu_col)

    # Drop rows where every entry is NaN
    df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])

    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))

    # Convert all values to percentage
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(2)

    # Strip off date from model name
    df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])

    if merge_values:
        merged_df = df.drop(["Date", "Average"], axis=1).groupby("Model").max().reset_index()
        merged_df.insert(loc=0, column="Average", value=merged_df.mean(axis=1, numeric_only=True))
        df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
        df.drop_duplicates(subset=["Model"], inplace=True)
        df = df.sort_values(by=["Average"], ascending=False).round(2)

    # Trim mini_math column names
    df.columns = [c.replace("_level_", "_l") for c in df.columns]

    return df
def refresh(merge_values: bool = True):
    return get_leaderboard_df(merge_values)
# Function to update the table based on search query
def update_table(search_query):
    df = get_leaderboard_df()
    if search_query:
        # Multiple terms can be separated by ";" and are OR-ed together as a case-insensitive regex
        search_terms = search_query.split(";")
        search_terms = [term.strip().lower() for term in search_terms]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
    return df
leaderboard_df = get_leaderboard_df()
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
            merge_values = gr.Checkbox(
                value=True,
                label="Merge evals",
                info="Merge evals for the same model. If there are duplicates, we display the largest one.",
            )
        with gr.Group():
            leaderboard_df = get_leaderboard_df()
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                wrap=True,
                height=1000,
                column_widths=[400, 110] + [(150 + len(c)) for c in leaderboard_df.columns[2:]],
            )
        with gr.Row():
            refresh_button = gr.Button("Refresh")

    merge_values.change(refresh, inputs=[merge_values], outputs=[leaderboard_table])
    search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()