import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""
""" DESCRIPTION = f""" Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy. """ BENCHMARKS_TO_SKIP = ["math", "mini_math"] def get_leaderboard_df(merge_values: bool = True): filepaths = list(Path("eval_results").rglob("*.json")) # Parse filepaths to get unique models models = set() for filepath in filepaths: path_parts = Path(filepath).parts model_revision = "_".join(path_parts[1:4]) models.add(model_revision) # Initialize DataFrame df = pd.DataFrame(index=list(models)) # Extract data from each file and populate the DataFrame for filepath in filepaths: path_parts = Path(filepath).parts date = filepath.stem.split("_")[-1][:-3].split("T")[0] model_revision = "_".join(path_parts[1:4]) + "_" + date task = path_parts[4].capitalize() df.loc[model_revision, "Date"] = date with open(filepath, "r") as file: data = json.load(file) first_result_key = next(iter(data["results"])) # gets the first key in 'results' # Skip benchmarks that we don't want to include in the leaderboard if task.lower() in BENCHMARKS_TO_SKIP: continue # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard if task.lower() == "truthfulqa": value = data["results"][first_result_key]["truthfulqa_mc2"] # IFEval has several metrics but we report just the prompt-loose-acc one elif task.lower() == "ifeval": value = data["results"][first_result_key]["prompt_level_loose_acc"] # MMLU has several metrics but we report just the average one elif task.lower() == "mmlu": value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0] # HellaSwag and ARC reports acc_norm elif task.lower() in ["hellaswag", "arc"]: value = data["results"][first_result_key]["acc_norm"] # BBH has several metrics but we report just the average one elif task.lower() == "bbh": if "all" in data["results"]: value = data["results"]["all"]["acc"] else: value = -100 # AGIEval reports acc_norm elif task.lower() == "agieval": value = data["results"]["all"]["acc_norm"] # MATH reports qem elif task.lower() in ["math", "math_v2", "aimo_kaggle"]: value = data["results"]["all"]["qem"] else: first_metric_key = next( iter(data["results"][first_result_key]) ) # gets the first key in the first result value = data["results"][first_result_key][first_metric_key] # gets the value of the first metric # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe if task.lower() in ["mini_math_v2"]: for k, v in data["results"].items(): if k != "all": level = k.split("|")[1].split(":")[-1] value = v["qem"] df.loc[model_revision, f"{task}_{level}"] = value # For AlpacaEval we report base winrate and lenght corrected one elif task.lower() == "alpaca_eval": value = data["results"][first_result_key]["win_rate"] df.loc[model_revision, "Alpaca_eval"] = value / 100.0 value = data["results"][first_result_key]["length_controlled_winrate"] df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0 else: df.loc[model_revision, task] = value # Put IFEval / BBH / AGIEval / AlpacaEval in first columns alpaca_col = df.pop("Alpaca_eval") df.insert(1, "Alpaca_eval", alpaca_col) alpaca_col = df.pop("Alpaca_eval_lc") df.insert(2, "Alpaca_eval_lc", alpaca_col) ifeval_col = df.pop("Ifeval") df.insert(3, "Ifeval", ifeval_col) bbh_col = df.pop("Bbh") df.insert(4, "Bbh", bbh_col) agieval_col = df.pop("Agieval") df.insert(5, "Agieval", agieval_col) gsm8k_col = df.pop("Gsm8k") df.insert(6, 
"Gsm8k", gsm8k_col) mmlu_col = df.pop("Mmlu") df.insert(7, "Mmlu", mmlu_col) # Drop rows where every entry is NaN df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"]) df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True)) # Convert all values to percentage df[df.select_dtypes(include=["number"]).columns] *= 100.0 df = df.sort_values(by=["Average"], ascending=False) df = df.reset_index().rename(columns={"index": "Model"}).round(2) # Strip off date from model name df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0]) if merge_values: merged_df = df.drop(["Date", "Average"], axis=1).groupby("Model").max().reset_index() merged_df.insert(loc=0, column="Average", value=merged_df.mean(axis=1, numeric_only=True)) df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left") df.drop_duplicates(subset=["Model"], inplace=True) df = df.sort_values(by=["Average"], ascending=False).round(2) # Trim minimath column names df.columns = [c.replace("_level_", "_l") for c in df.columns] return df def refresh(merge_values: bool = True): return get_leaderboard_df(merge_values) # Function to update the table based on search query def update_table(search_query): df = get_leaderboard_df() if search_query: search_terms = search_query.split(";") search_terms = [term.strip().lower() for term in search_terms] pattern = "|".join(search_terms) df = df[df["Model"].str.lower().str.contains(pattern, regex=True)] return df leaderboard_df = get_leaderboard_df() demo = gr.Blocks() with demo: gr.HTML(TITLE) with gr.Column(): gr.Markdown(DESCRIPTION, elem_classes="markdown-text") with gr.Row(): search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False) merge_values = gr.Checkbox( value=True, label="Merge evals", info="Merge evals for the same model. If there are duplicates, we display the largest one.", ) with gr.Group(): leaderboard_df = get_leaderboard_df() leaderboard_table = gr.Dataframe( value=leaderboard_df, wrap=True, height=1000, column_widths=[400, 110] + [(150 + len(c)) for c in leaderboard_df.columns[2:]], ) with gr.Row(): refresh_button = gr.Button("Refresh") merge_values.change(refresh, inputs=[merge_values], outputs=[leaderboard_table]) search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table]) refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table]) demo.launch()