import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """

LLM Leaderboard for H4 Models

""" DESCRIPTION = f""" Evaluation of H4 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval) """ def get_leaderboard_df(): filepaths = list(Path("eval_results").rglob("*.json")) # Parse filepaths to get unique models models = set() for filepath in filepaths: path_parts = Path(filepath).parts model_revision = "_".join(path_parts[1:4]) models.add(model_revision) # Initialize DataFrame df = pd.DataFrame(index=list(models)) # Extract data from each file and populate the DataFrame for filepath in filepaths: path_parts = Path(filepath).parts model_revision = "_".join(path_parts[1:4]) task = path_parts[4].capitalize() # Extract timestamp from filepath timestamp = filepath.stem.split("_")[-1][:-3] df.loc[model_revision, "Timestamp"] = timestamp with open(filepath, "r") as file: data = json.load(file) first_result_key = next(iter(data["results"])) # gets the first key in 'results' # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard if task == "truthfulqa": value = data["results"][first_result_key]["truthfulqa_mc2"] # IFEval has several metrics but we report just the prompt-loose-acc one elif task == "ifeval": value = data["results"][first_result_key]["prompt_level_loose_acc"] else: first_metric_key = next( iter(data["results"][first_result_key]) ) # gets the first key in the first result value = data["results"][first_result_key][first_metric_key] # gets the value of the first metric df.loc[model_revision, task] = value # Put IFEval in first column ifeval_col = df.pop("Ifeval") df.insert(1, "Ifeval", ifeval_col) df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True)) df = df.sort_values(by=["Average"], ascending=False) df = df.reset_index().rename(columns={"index": "Model"}).round(3) return df def refresh(): return get_leaderboard_df() # Function to update the table based on search query def update_table(search_query): df = get_leaderboard_df() if search_query: search_terms = search_query.split(";") search_terms = [term.strip() for term in search_terms] pattern = "|".join(search_terms) df = df[df["Model"].str.contains(pattern, regex=True)] return df leaderboard_df = get_leaderboard_df() demo = gr.Blocks() with demo: gr.HTML(TITLE) with gr.Column(): gr.Markdown(DESCRIPTION, elem_classes="markdown-text") with gr.Row(): search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False) with gr.Group(): leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000) with gr.Row(): refresh_button = gr.Button("Refresh") search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table]) refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table]) demo.launch()