xeon27 committed
Commit 8ad1a09 · Parent(s): 1d1f5e9
Files changed (2):
  1. app.py +4 -3
  2. src/populate.py +5 -2
app.py CHANGED
@@ -14,6 +14,7 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
+    COLS,
     ST_BENCHMARK_COLS,
     AGENTIC_BENCHMARK_COLS,
     EVAL_COLS,
@@ -49,8 +50,8 @@ except Exception:
     restart_space()
 
 
-ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ST_BENCHMARK_COLS)
-AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, AGENTIC_BENCHMARK_COLS)
+ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
+AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -99,7 +100,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("Single-turn Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(ST_LEADERBOARD_DF)
-
+
         with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF)
 
src/populate.py CHANGED
@@ -34,7 +34,7 @@ def get_inspect_log_url(model_name: str, benchmark_name: str) -> str:
     return f"https://storage.googleapis.com/inspect-evals/{model_name}/index.html?log_file=logs/logs/{log_file_name}"
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
@@ -42,7 +42,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, benchmark_cols: li
     df = pd.DataFrame.from_records(all_data_json)
 
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[benchmark_cols].round(decimals=2)
+    df = df[cols].round(decimals=2)
+
+    # subset for model and benchmark cols
+    df = df[[AutoEvalColumn.model.name] + benchmark_cols]
 
     # # filter out if any of the benchmarks have not been produced
     # df = df[has_no_nan_values(df, benchmark_cols)]
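
For context, a minimal sketch of what the reworked get_leaderboard_df now does with its two column lists: it rounds every display column in cols, then subsets to the model column plus the benchmark columns of the tab being rendered (single-turn or agentic). This is not the repository's code; the column names and values below are invented for illustration, and plain strings stand in for AutoEvalColumn.model.name, COLS, ST_BENCHMARK_COLS, and AGENTIC_BENCHMARK_COLS.

import pandas as pd

# Hypothetical stand-ins for the repo's column constants.
MODEL_COL = "Model"                      # ~ AutoEvalColumn.model.name
COLS = ["Model", "MMLU", "GAIA"]         # ~ COLS: every displayable column
ST_BENCHMARK_COLS = ["MMLU"]             # ~ single-turn benchmark columns
AGENTIC_BENCHMARK_COLS = ["GAIA"]        # ~ agentic benchmark columns

def get_leaderboard_df_sketch(records: list, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Mirrors the updated logic: round all display columns, then keep model + benchmark columns."""
    df = pd.DataFrame.from_records(records)
    df = df[cols].round(decimals=2)      # numeric columns are rounded, string columns pass through
    # subset for model and benchmark cols (the step added in this commit)
    return df[[MODEL_COL] + benchmark_cols]

records = [
    {"Model": "model-a", "MMLU": 0.71234, "GAIA": 0.3345},
    {"Model": "model-b", "MMLU": 0.64567, "GAIA": 0.4121},
]

print(get_leaderboard_df_sketch(records, COLS, ST_BENCHMARK_COLS))       # Model + MMLU
print(get_leaderboard_df_sketch(records, COLS, AGENTIC_BENCHMARK_COLS))  # Model + GAIA

Passing the full COLS list separately from benchmark_cols is what lets app.py build one dataframe per tab from the same raw results, which is why the two calls in app.py gained the extra COLS argument.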