"""Gradio app that displays a leaderboard built from JSON evaluation results."""

import json
import re
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """

LLM Leaderboard for H4 Models

"""

# Implicit concatenation keeps the rendered Markdown text unchanged while
# avoiding one very long source line.
DESCRIPTION = (
    " Evaluation of H4 models across a diverse range of benchmarks from "
    "Eleuther's [LLM evaluation harness]"
    "(https://github.com/EleutherAI/lm-evaluation-harness). "
)


def _model_revision(filepath, results_dir):
    """Return the row label for a result file.

    The label is the first three path components below ``results_dir``
    joined with underscores. Presumably these are organisation, model and
    revision -- adjust the slice if the directory layout differs.
    """
    return "_".join(Path(filepath).relative_to(results_dir).parts[:3])


def get_leaderboard_df(results_dir="eval_results"):
    """Build the leaderboard table from the JSON files under ``results_dir``.

    Each ``*.json`` file contributes one cell: the row is derived from the
    file's location, the column from the last ``_``-separated token of the
    file stem (capitalised), and the value is the first metric of the first
    entry in the file's ``"results"`` mapping.

    Args:
        results_dir: Root directory that is searched recursively for
            ``*.json`` result files.

    Returns:
        A DataFrame with a ``Model`` column, an ``Average`` column (row-wise
        mean of all task columns) and one column per task, sorted by
        ``Average`` in descending order and rounded to 3 decimals.

    Raises:
        KeyError: If a file has no ``"results"`` key.
        StopIteration: If ``"results"`` or its first entry is empty.
    """
    root = Path(results_dir)
    # Sorted so that row/column creation order does not depend on the
    # filesystem's directory iteration order.
    filepaths = sorted(root.rglob("*.json"))

    # Create every row up front; columns are added as tasks are encountered.
    model_ids = {_model_revision(filepath, root) for filepath in filepaths}
    df = pd.DataFrame(index=sorted(model_ids))

    for filepath in filepaths:
        # e.g. a stem ending in "_mmlu" yields the column name "Mmlu".
        task = filepath.stem.split("_")[-1].capitalize()
        with open(filepath, encoding="utf-8") as file:
            data = json.load(file)
        results = data["results"]
        first_result = results[next(iter(results))]
        value = first_result[next(iter(first_result))]
        df.loc[_model_revision(filepath, root), task] = value

    df.insert(loc=0, column="Average", value=df.mean(axis=1))
    df = df.sort_values(by=["Average"], ascending=False)
    return df.reset_index().rename(columns={"index": "Model"}).round(3)


def refresh(results_dir="eval_results"):
    """Re-read the result files and return a fresh leaderboard table."""
    return get_leaderboard_df(results_dir)


def update_table(search_query):
    """Return the leaderboard filtered by a ``;``-separated search query.

    Each non-empty term is treated as a regular expression and a row is kept
    when its ``Model`` value matches any term. An empty or whitespace-only
    query returns the full table.

    Args:
        search_query: Text typed into the search box, e.g. ``"llama; 7b"``.

    Returns:
        The (possibly filtered) leaderboard DataFrame.
    """
    df = get_leaderboard_df()
    if not search_query:
        return df

    # Drop empty terms: a stray ";" would otherwise add an empty alternative
    # to the pattern, which matches every row.
    terms = [term.strip() for term in search_query.split(";")]
    terms = [term for term in terms if term]
    if not terms:
        return df

    pattern = "|".join(terms)
    try:
        re.compile(pattern)
    except re.error:
        # The input is not a valid regex (e.g. an unbalanced "("); match the
        # terms literally instead of failing the UI callback.
        pattern = "|".join(re.escape(term) for term in terms)

    return df[df["Model"].str.contains(pattern, regex=True)]


leaderboard_df = get_leaderboard_df()

demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
        with gr.Group():
            leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000)
        with gr.Row():
            refresh_button = gr.Button("Refresh")

    # Event listeners must be registered inside the Blocks context.
    search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()