xeon27 committed
Commit 8ad1a09 · Parent(s): 1d1f5e9
Fix bug

Files changed:
- app.py (+4 -3)
- src/populate.py (+5 -2)
app.py
CHANGED
@@ -14,6 +14,7 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
+    COLS,
     ST_BENCHMARK_COLS,
     AGENTIC_BENCHMARK_COLS,
     EVAL_COLS,
@@ -49,8 +50,8 @@ except Exception:
     restart_space()
 
 
-ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ST_BENCHMARK_COLS)
-AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, AGENTIC_BENCHMARK_COLS)
+ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
+AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -99,7 +100,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("Single-turn Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(ST_LEADERBOARD_DF)
-
+
         with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
            leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF)
 
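To make the call-site change concrete, here is a small hypothetical sketch of how the two column lists relate. The real COLS, ST_BENCHMARK_COLS, and AGENTIC_BENCHMARK_COLS are defined in src/display/utils.py and are not shown in this diff, so the names and values below are invented for illustration only.

# Hypothetical sketch -- the real lists live in src/display/utils.py; these values are invented.
COLS = ["Model", "GSM8K", "MMLU", "SWE-bench", "GAIA"]   # assumed full set of display columns
ST_BENCHMARK_COLS = ["GSM8K", "MMLU"]                    # assumed single-turn tab score columns
AGENTIC_BENCHMARK_COLS = ["SWE-bench", "GAIA"]           # assumed agentic tab score columns

# Each per-tab list is presumably a subset of the full column set, which is why the
# fixed call sites pass COLS first and the tab-specific benchmark list second.
assert set(ST_BENCHMARK_COLS) <= set(COLS)
assert set(AGENTIC_BENCHMARK_COLS) <= set(COLS)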
src/populate.py
CHANGED
@@ -34,7 +34,7 @@ def get_inspect_log_url(model_name: str, benchmark_name: str) -> str:
     return f"https://storage.googleapis.com/inspect-evals/{model_name}/index.html?log_file=logs/logs/{log_file_name}"
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
@@ -42,7 +42,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, benchmark_cols: li
     df = pd.DataFrame.from_records(all_data_json)
 
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[
+    df = df[cols].round(decimals=2)
+
+    # subset for model and benchmark cols
+    df = df[[AutoEvalColumn.model.name] + benchmark_cols]
 
     # # filter out if any of the benchmarks have not been produced
     # df = df[has_no_nan_values(df, benchmark_cols)]