import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

DESCRIPTION = """
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""
def get_leaderboard_df():
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get the set of unique model revisions
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame with one row per model revision
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        task = path_parts[4].capitalize()
        # Extract timestamp from the filename
        timestamp = filepath.stem.split("_")[-1][:-3]
        df.loc[model_revision, "Timestamp"] = timestamp

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
            # TruthfulQA has two metrics, so we pick the `mc2` one that's reported on the leaderboard
            if task.lower() == "truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            # IFEval has several metrics, but we report just the prompt-level loose accuracy
            elif task.lower() == "ifeval":
                value = data["results"][first_result_key]["prompt_level_loose_acc"]
            # MMLU has several metrics, but we report just the average one
            elif task.lower() == "mmlu":
                value = data["results"]["lighteval|mmlu:_average|5"]["acc"]
            # HellaSwag and ARC report acc_norm
            elif task.lower() in ["hellaswag", "arc"]:
                value = data["results"][first_result_key]["acc_norm"]
            # Otherwise fall back to the first metric of the first result
            else:
                first_metric_key = next(iter(data["results"][first_result_key]))
                value = data["results"][first_result_key][first_metric_key]
            df.loc[model_revision, task] = value

    # Move IFEval to the front of the benchmark columns (it ends up right after
    # the "Average" column inserted below)
    ifeval_col = df.pop("Ifeval")
    df.insert(1, "Ifeval", ifeval_col)
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    # Convert all scores to percentages
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(2)
    return df


def refresh():
    return get_leaderboard_df()


# Update the table based on the search query; multiple terms can be separated with ";"
def update_table(search_query):
    df = get_leaderboard_df()
    if search_query:
        search_terms = [term.strip() for term in search_query.split(";")]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.contains(pattern, regex=True)]
    return df


leaderboard_df = get_leaderboard_df()

demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
        with gr.Group():
            leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000)
        with gr.Row():
            refresh_button = gr.Button("Refresh")

    search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()
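
# Run this script directly (e.g. `python app.py`, filename assumed) to serve the
# leaderboard locally; the table can be filtered via the search box and rebuilt
# with the Refresh button.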