import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

DESCRIPTION = """
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""
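

# NOTE: the directory layout below is inferred from the path slicing in
# get_leaderboard_df and is an assumption, e.g.:
#   eval_results/{org}/{model}/{revision}/{task}/results_{timestamp}.json
# Each JSON file is assumed to hold a "results" mapping of roughly this
# shape (hypothetical example):
#   {"results": {"lighteval|arc:challenge|25": {"acc": 0.61, "acc_norm": 0.64}}}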
def get_leaderboard_df():
    filepaths = list(Path("eval_results").rglob("*.json"))
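
    # First pass: collect the unique model revisions so every row exists
    # before any scores are filled in.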
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    df = pd.DataFrame(index=list(models))
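
    # Second pass: read each results file and record the task score and
    # evaluation timestamp for the matching row.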
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        task = path_parts[4].capitalize()
        # The filename stem is assumed to end in a timestamp whose final
        # three characters are trimmed off.
        timestamp = filepath.stem.split("_")[-1][:-3]
        df.loc[model_revision, "Timestamp"] = timestamp

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))

            # Pick the metric that matches the task; unrecognised tasks fall
            # back to the first metric reported in the file.
            if task.lower() == "truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            elif task.lower() == "ifeval":
                value = data["results"][first_result_key]["prompt_level_loose_acc"]
            elif task.lower() == "mmlu":
                value = data["results"]["lighteval|mmlu:_average|5"]["acc"]
            elif task.lower() in ["hellaswag", "arc"]:
                value = data["results"][first_result_key]["acc_norm"]
            else:
                first_metric_key = next(iter(data["results"][first_result_key]))
                value = data["results"][first_result_key][first_metric_key]
            df.loc[model_revision, task] = value
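
    # Move the IFEval column to sit just after the timestamp, then insert
    # a per-model average over all numeric columns.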
    ifeval_col = df.pop("Ifeval")
    df.insert(1, "Ifeval", ifeval_col)
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
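
    # Express fractional scores as percentages, rank by the average, and
    # expose the model revision as a regular "Model" column.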
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(2)
    return df


def refresh():
    return get_leaderboard_df()


def update_table(search_query):
    df = get_leaderboard_df()
    if search_query:
        # Semicolons separate multiple search terms, combined as a regex OR
        search_terms = [term.strip() for term in search_query.split(";")]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.contains(pattern, regex=True)]
    return df
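

# Build the Gradio UI: title and description, a search box, the leaderboard
# table, and a refresh button that re-reads the results from disk.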


leaderboard_df = get_leaderboard_df()

demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
        with gr.Group():
            leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000)
        with gr.Row():
            refresh_button = gr.Button("Refresh")

    search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()