Spaces:
Running
Running
File size: 2,910 Bytes
62c7044 ada4cd8 b27b717 1cade3b ada4cd8 1cade3b b27b717 ada4cd8 b27b717 ada4cd8 b27b717 ada4cd8 4c0cc56 ada4cd8 2bc2f6b 4c0cc56 2bc2f6b 4c0cc56 5d9a791 4c0cc56 5d9a791 4c0cc56 5d9a791 2bc2f6b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
import os
import pandas as pd
def get_leaderboard_df_crm(
    crm_results_path: str, accuracy_cols: list, cost_cols: list
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Build the accuracy, cost, and trust-&-safety leaderboard dataframes.

    Reads the per-experiment CSV result files located under
    ``crm_results_path`` and assembles three tables:

    * accuracy — restricted to ``accuracy_cols``, rounded to 2 decimals;
    * cost — restricted to ``cost_cols`` (the "LLM Provider" column is
      joined in from the accuracy file), rounded to 2 decimals;
    * trust & safety — per-model metrics with derived "Privacy",
      "Bias No CI", and "Trust & Safety" columns.

    Args:
        crm_results_path: Directory containing the leaderboard CSV files.
        accuracy_cols: Column names to keep in the accuracy dataframe.
        cost_cols: Column names to keep in the cost dataframe.

    Returns:
        ``(accuracy_df, cost_df, trust_safety_df)``.
    """
    # Salesforce-internal fine-tuned models are excluded from every table.
    sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]

    def _drop_sf_models(df: pd.DataFrame) -> pd.DataFrame:
        # Remove rows for the Salesforce fine-tuned models.
        return df[~df["Model Name"].isin(sf_finetuned_models)]

    def _pct_to_frac(cols: pd.DataFrame) -> pd.DataFrame:
        # Convert "85.00%"-style strings into fractions in [0, 1].
        return cols.apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)

    leaderboard_accuracy_df = _drop_sf_models(
        pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
    )
    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)

    # The accuracy file is the source of truth for each model's provider;
    # the cost and T&S files get "LLM Provider" joined in from here.
    ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()

    leaderboard_cost_df = _drop_sf_models(
        pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
    )
    leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
    leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)

    leaderboard_ts_df = _drop_sf_models(
        pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
    )
    leaderboard_crm_bias_df = pd.read_csv(
        os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv")
    )
    leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
    leaderboard_ts_df = leaderboard_ts_df.join(
        leaderboard_crm_bias_df.set_index("Model Name"), on="Model Name"
    )

    # Aggregate the four privacy sub-metrics into one percentage column.
    privacy_cols = _pct_to_frac(
        leaderboard_ts_df[
            [
                "Privacy Zero-Shot Match Avoidance",
                "Privacy Zero-Shot Reveal Avoidance",
                "Privacy Five-Shot Match Avoidance",
                "Privacy Five-Shot Reveal Avoidance",
            ]
        ]
    )
    leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))

    # "CRM Bias" is assumed to be "<point> <ci...>" (e.g. "12% ± 2%");
    # keep only the point estimate before the first space.
    leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Bias"].transform(lambda x: x.split(" ")[0])

    # "Trust & Safety" is the plain mean of the four component scores.
    ts_cols = _pct_to_frac(
        leaderboard_ts_df[["Safety", "Privacy", "Truthfulness", "Bias No CI"]]
    )
    leaderboard_ts_df["Trust & Safety"] = ts_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))

    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
|