import argparse

import gradio as gr
import pandas as pd

def model_hyperlink(model_name, link):
    # Render a model name as a dotted-underline hyperlink for the table.
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
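
# For illustration only (the model name and URL below are hypothetical), the
# helper produces markup along these lines:
#   model_hyperlink("example-model", "https://example.com")
#   -> '<a target="_blank" href="https://example.com" ...>example-model</a>'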

def load_leaderboard_table_csv(filename, add_hyperlink=True):
    # Parse the leaderboard CSV into a list of per-model dicts.
    lines = open(filename).readlines()
    heads = [v.strip() for v in lines[0].split(",")]
    rows = []
    for line in lines[1:]:
        row = [v.strip() for v in line.split(",")]
        item = {}
        for h, v in zip(heads, row):
            if "Score" in h:
                item[h] = float(v)
            elif h not in ("Model", "Params (B)", "Repo", "Quantization", "Link"):
                item[h] = int(v)
            else:
                item[h] = v
        if add_hyperlink:
            # Show the repo cell as a clickable link to the model page.
            item["Repo"] = model_hyperlink(item["Repo"], item["Link"])
        rows.append(item)
    return rows
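
# The loader above assumes a header row followed by one line per model, with
# the columns used elsewhere in this file. A rough sketch of the expected
# layout (the data row is made up purely for illustration):
#
#   Model,Params (B),Repo,Quantization,Link,Final Score,Strict Prompt Score,Strict Inst Score,Loose Prompt Score,Loose Inst Score
#   example-model,7,org/example-model,Q4_K_M,https://example.com,0.51,0.48,0.55,0.49,0.57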

def get_arena_table(model_table_df):
    # Sort by final score, best model first.
    model_table_df = model_table_df.sort_values(by=["Final Score"], ascending=False)
    values = []
    for i in range(len(model_table_df)):
        # .iloc takes the row from the sorted order, so rank i + 1 and the
        # model data always refer to the same entry.
        record = model_table_df.iloc[i]
        row = [
            # rank
            i + 1,
            # model display name
            record["Model"],
            record["Params (B)"],
            record["Repo"],
            record["Quantization"],
            record["Final Score"],
            record["Strict Prompt Score"],
            record["Strict Inst Score"],
            record["Loose Prompt Score"],
            record["Loose Inst Score"],
        ]
        values.append(row)
    return values

def build_leaderboard_tab(leaderboard_table_file, show_plot=False):
    if leaderboard_table_file:
        data = load_leaderboard_table_csv(leaderboard_table_file)
        model_table_df = pd.DataFrame(data)
        md_head = """
# IFEval Leaderboard
"""
        gr.Markdown(md_head, elem_id="leaderboard_markdown")
        with gr.Tabs() as tabs:
            # leaderboard table
            arena_table_vals = get_arena_table(model_table_df)
            with gr.Tab("IFEval", id=0):
                md = "Leaderboard for various Large Language Models measured with the IFEval benchmark.\n\n[IFEval](https://github.com/google-research/google-research/tree/master/instruction_following_eval) is a straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set of \"verifiable instructions\" such as \"write in more than 400 words\" and \"mention the keyword of AI at least 3 times\". We identified 25 types of those verifiable instructions and constructed around 500 prompts, with each prompt containing one or more verifiable instructions.\n\nTests were run with `lm-evaluation-harness`. Raw results can be found in the `results` directory. Made by [Kristian Polso](https://polso.info)."
                gr.Markdown(md, elem_id="leaderboard_markdown")
                gr.Dataframe(
                    headers=[
                        "Rank",
                        "Model",
                        "Params (B)",
                        "Repo",
                        "Quantization",
                        "Final Score",
                        "Strict Prompt Score",
                        "Strict Inst Score",
                        "Loose Prompt Score",
                        "Loose Inst Score",
                    ],
                    datatype=[
                        "number",
                        "str",
                        "number",
                        "markdown",
                        "str",
                        "number",
                        "number",
                        "number",
                        "number",
                        "number",
                    ],
                    value=arena_table_vals,
                    elem_id="arena_leaderboard_dataframe",
                    height=700,
                    column_widths=[50, 160, 60, 230, 100, 90, 90, 90, 90, 90],
                    wrap=True,
                )

def build_demo(leaderboard_table_file):
    text_size = gr.themes.sizes.text_lg
    with gr.Blocks(
        title="IFEval Leaderboard",
        theme=gr.themes.Base(text_size=text_size),
    ) as demo:
        build_leaderboard_tab(leaderboard_table_file, show_plot=True)
    return demo

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--IFEval_file", type=str, default="./IFEval.csv")
    args = parser.parse_args()
    demo = build_demo(args.IFEval_file)
    # Pass the parsed --share flag through so it actually takes effect.
    demo.launch(share=args.share)
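
# A rough sketch of how the app might be launched locally, assuming the script
# is saved as app.py (the Hugging Face Spaces convention); the CSV path is just
# the default defined above:
#
#   python app.py --IFEval_file ./IFEval.csv
#   python app.py --share   # also create a public Gradio link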