import gradio as gr
import pandas as pd
import json

from src.about import (
    REPRODUCIBILITY_TEXT,
    INTRODUCTION_TEXT,
    ABOUT_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css, custom_js

# from src.display.utils import (
#     COLS,
#     ST_BENCHMARK_COLS,
#     AGENTIC_BENCHMARK_COLS,
#     EVAL_COLS,
#     AutoEvalColumn,
#     fields,
# )
# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
# from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
# from src.submission.submit import add_new_eval
from src.display.formatting import make_clickable_field


# def restart_space():
#     API.restart_space(repo_id=REPO_ID)

# ### Space initialisation
# try:
#     print(EVAL_REQUESTS_PATH)
#     snapshot_download(
#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
#     )
# except Exception:
#     restart_space()
# try:
#     print(EVAL_RESULTS_PATH)
#     snapshot_download(
#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
#     )
# except Exception:
#     restart_space()

# ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
# AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)

# (
#     finished_eval_queue_df,
#     running_eval_queue_df,
#     pending_eval_queue_df,
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


# def bold_max(s):
#     is_max = s == s.max()  # Boolean Series: True for the max value(s)
#     return ['font-weight: bold' if v else '' for v in is_max]


# def init_leaderboard(df, benchmark_type):
#     if df is None or df.empty:
#         raise ValueError("Leaderboard DataFrame is empty or None.")
#     non_task_cols = ["Model"]
#     if benchmark_type == "agentic":
#         # Include agent column
#         non_task_cols.append("Agent")
#     elif benchmark_type == "base":
#         # Drop agent column
#         dataframe = dataframe.drop(columns=["Agent"])
#     AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name in non_task_cols) or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "") == benchmark_type))]
#     styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
#     df.style.set_table_styles([
#         {'selector': 'th', 'props': [('text-align', 'center')]},
#         {'selector': 'td', 'props': [('text-align', 'center')]}
#     ])
#     # Define a common tooltip text
#     tooltip_text = "This is the common tooltip"
#     # Create a tooltip DataFrame with the same shape as df,
#     # filled with the same tooltip text for each cell.
#     tooltips = pd.DataFrame(tooltip_text, index=df.index, columns=df.columns)
#     # Apply the tooltips to the DataFrame
#     styled_df = df.style.set_tooltips(tooltips)
#     return gr.components.Dataframe(
#         value=df,
#         datatype=[c.type for c in AutoEvalColumnSubset],
#         column_widths=["150px" if c.name != "Model" else "250px" for c in AutoEvalColumnSubset],
#         wrap=False,
#     )


def build_leaderboard(benchmark_type):
    with open('data/results.json', 'r') as f:
        results = json.load(f)
    with open('data/tasks.json', 'r') as f:
        tasks = json.load(f)

    # Keep only the tasks that belong to this leaderboard ("base" or "agentic")
    filtered_tasks = {k: v for k, v in tasks.items() if v['type'] == benchmark_type}

    data = []
    for model_data in results.values():
        # For the agentic leaderboard, skip models whose agentic results are all null
        if benchmark_type == "agentic":
            has_agentic_results = any(
                model_data['results'].get(task, {}).get(tasks[task]['metric']) is not None
                for task in filtered_tasks
            )
            if not has_agentic_results:
                continue

        model_sha = model_data["config"]["model_sha"]
        model_name = model_data["config"]["model_name"]
        row = {
            'Model': make_clickable_field(model_name, model_sha)
        }
        for dataset, metrics in model_data['results'].items():
            # Only include metrics for tasks of the specified type
            if dataset in filtered_tasks:
                # Read the task's declared metric rather than the first dict value,
                # so an optional 'log_url' key can never be mistaken for the score
                value = metrics.get(filtered_tasks[dataset]['metric'])
                log_url = metrics.get('log_url')
                # Use the display name from tasks.json instead of the raw dataset name
                display_name = filtered_tasks[dataset]['display_name']
                # Convert non-null scores to percentages, round to 2 decimal places,
                # and make them clickable when a log URL exists
                if value is not None:
                    value = round(value * 100, 2)
                    if log_url:
                        value = make_clickable_field(value, log_url)
                row[display_name] = value
        data.append(row)

    results_df = pd.DataFrame(data)
    # Round any remaining numeric columns to 2 decimal places
    numeric_cols = results_df.select_dtypes(include=['float64', 'float32']).columns
    results_df[numeric_cols] = results_df[numeric_cols].round(2)
    # Fill null values with "--"
    results_df = results_df.fillna("--")

    if benchmark_type == "agentic":
        # Include the agent column as the second column, right after Model
        results_df.insert(1, 'Agent', '[Basic Agent](https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent)')

    return gr.components.Dataframe(
        value=results_df,
        datatype=["html" for _ in results_df.columns],
        column_widths=["250px" if c == "Model" else "150px" for c in results_df.columns],
        wrap=False,
    )


black_logo_path = "src/assets/logo-icon-black.png"
white_logo_path = "src/assets/logo-icon-white.png"

demo = gr.Blocks(
    css=custom_css,
    js=custom_js,
    theme=gr.themes.Default(primary_hue=gr.themes.colors.pink),
    fill_height=True,
    fill_width=True,
)

with demo:
    gr.HTML(f"""
    """)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text", sanitize_html=False)

    with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
        with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
            build_leaderboard("base")
        with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
            build_leaderboard("agentic")
        with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)
        with gr.TabItem("Reproducibility", elem_classes="llm-benchmark-tab-table", id=3):
            gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)

assets = [black_logo_path, white_logo_path]

demo.launch()
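
# ---------------------------------------------------------------------------
# Reference sketch: the data-file shapes build_leaderboard() expects, inferred
# from its access patterns above. This is an assumption, not a schema the repo
# guarantees; the task key, model key, and metric name below are hypothetical
# placeholders, and only the field names build_leaderboard() actually reads
# ('type', 'display_name', 'metric', 'config', 'results', 'log_url') matter.
#
# data/tasks.json maps a raw dataset key to its display metadata:
#   {
#     "example_task": {
#       "type": "base",               # "base" or "agentic"; selects the tab
#       "display_name": "Example",    # column header shown in the table
#       "metric": "accuracy"          # key of the score inside results.json
#     }
#   }
#
# data/results.json maps a model key to its config and per-task metrics:
#   {
#     "example-model": {
#       "config": {
#         "model_name": "org/example-model",  # label passed to make_clickable_field
#         "model_sha": "..."                  # link target passed to make_clickable_field
#       },
#       "results": {
#         "example_task": {
#           "accuracy": 0.87,         # raw score in [0, 1]; shown as a percentage
#           "log_url": "https://..."  # optional; turns the score into a link
#         }
#       }
#     }
#   }
# ---------------------------------------------------------------------------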