import json
from pathlib import Path
import gradio as gr
import pandas as pd
# Heading rendered at the top of the page via ``gr.HTML``.
TITLE = """
LLM Leaderboard for H4 Models
"""

# Introductory Markdown shown under the title. A plain string is used because
# the text contains no interpolated values.
DESCRIPTION = """
Evaluation of H4 models across a diverse range of benchmarks from Eleuther's [LLM evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).
"""
def get_leaderboard_df(results_dir: "str | Path" = "eval_results") -> pd.DataFrame:
    """Build the leaderboard table from the JSON result files under *results_dir*.

    Every ``*.json`` file found recursively is treated as one benchmark result.
    The first three path components below *results_dir* are joined with ``_``
    to form the model label, and the text after the last ``_`` in the file
    stem (capitalised) becomes the task column. The score stored for that
    cell is the first metric of the first entry in the file's ``"results"``
    mapping.

    Args:
        results_dir: Root directory to scan. Defaults to ``"eval_results"``,
            resolved against the current working directory.

    Returns:
        A DataFrame with a ``Model`` column, an ``Average`` column (row-wise
        mean of the task scores) and one column per task, sorted by
        ``Average`` in descending order and rounded to 3 decimals.

    Raises:
        KeyError: If a result file has no ``"results"`` entry.
        StopIteration: If ``"results"`` or its first entry is empty.
    """
    root = Path(results_dir)
    # Sorted so that row and column order do not depend on the order in which
    # the filesystem happens to yield the files.
    filepaths = sorted(root.rglob("*.json"))

    # model label -> {task name -> score}, filled in a single pass.
    scores = {}
    for filepath in filepaths:
        # Components relative to the root, so the label does not depend on
        # how many directories the root path itself contains.
        model_revision = "_".join(filepath.relative_to(root).parts[:3])
        task = filepath.stem.split("_")[-1].capitalize()  # e.g. 'Mmlu' from '..._mmlu'
        with open(filepath, "r", encoding="utf-8") as file:
            data = json.load(file)
        first_result = next(iter(data["results"].values()))
        value = next(iter(first_result.values()))
        scores.setdefault(model_revision, {})[task] = value

    # Tasks missing for a model become NaN and are skipped by ``mean``.
    df = pd.DataFrame(list(scores.values()), index=list(scores))
    df.insert(loc=0, column="Average", value=df.mean(axis=1))
    # A stable sort keeps tied models in their (sorted-path) input order.
    df = df.sort_values(by=["Average"], ascending=False, kind="stable")
    df = df.reset_index().rename(columns={"index": "Model"}).round(3)
    return df
def refresh():
    """Rebuild the leaderboard table and return it (callback for the Refresh button)."""
    refreshed_df = get_leaderboard_df()
    return refreshed_df
# Function to update the table based on search query
def update_table(search_query):
    """Return the leaderboard restricted to models matching *search_query*.

    The query is split on ``;`` and each piece is stripped of surrounding
    whitespace. A row is kept when its ``Model`` value contains at least one
    of the non-empty terms. Terms are matched as literal, case-sensitive
    substrings, so characters such as ``(``, ``[`` or ``+`` in the search box
    are treated as ordinary text.

    Args:
        search_query: Text from the search box; may be empty or ``None``.

    Returns:
        The freshly built leaderboard DataFrame, filtered when the query
        contains at least one non-empty term and unfiltered otherwise.
    """
    df = get_leaderboard_df()
    if not search_query:
        return df

    # Drop blank terms (e.g. from a trailing ';'); an empty term would
    # otherwise match every row and defeat the filter.
    terms = [term.strip() for term in search_query.split(";")]
    terms = [term for term in terms if term]
    if not terms:
        return df

    # OR together one literal-substring mask per term.
    mask = pd.Series(False, index=df.index)
    for term in terms:
        mask |= df["Model"].str.contains(term, regex=False, na=False)
    return df[mask]
# Table contents shown when the page first loads.
leaderboard_df = get_leaderboard_df()

# NOTE(review): the nesting of the layout containers below is reconstructed;
# confirm it matches the intended page structure.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
        with gr.Group():
            leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000)
        with gr.Row():
            refresh_button = gr.Button("Refresh")

    # Submitting the search box filters the table; the button rebuilds it.
    search_bar.submit(fn=update_table, inputs=[search_bar], outputs=[leaderboard_table])
    refresh_button.click(fn=refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()