File size: 2,910 Bytes
62c7044
 
 
 
 
ada4cd8
 
 
b27b717
1cade3b
ada4cd8
 
1cade3b
b27b717
 
 
ada4cd8
b27b717
ada4cd8
b27b717
ada4cd8
 
 
4c0cc56
ada4cd8
2bc2f6b
 
4c0cc56
2bc2f6b
 
4c0cc56
 
5d9a791
 
 
 
 
 
 
 
 
 
4c0cc56
5d9a791
 
 
 
 
 
4c0cc56
5d9a791
 
 
2bc2f6b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os

import pandas as pd


def _read_results(crm_results_path: str, file_name: str, excluded_models: list[str]) -> pd.DataFrame:
    """Read one results CSV and drop the rows whose "Model Name" is excluded."""
    frame = pd.read_csv(os.path.join(crm_results_path, file_name))
    return frame[~frame["Model Name"].isin(excluded_models)]


def _percent_to_fraction(frame: pd.DataFrame) -> pd.DataFrame:
    """Convert columns of percent strings (e.g. "87.5%") into float fractions (0.875)."""
    # Column-wise on purpose: every column holds the same kind of value, so
    # there is no need to build one Series per row (``axis=1``).
    return frame.apply(lambda col: col.str.rstrip("%").astype("float") / 100.0)


def _mean_as_percent(frame: pd.DataFrame) -> pd.Series:
    """Average percent-string columns per row and format the mean like "87.50%"."""
    # ``mean(axis=1)`` skips missing values, so a row with a gap is averaged
    # over the metrics that are present.
    return _percent_to_fraction(frame).mean(axis=1).map("{:,.2%}".format)


def get_leaderboard_df_crm(
    crm_results_path: str, accuracy_cols: list[str], cost_cols: list[str]
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Build the accuracy, latency/cost and trust & safety leaderboard tables.

    Reads ``hf_leaderboard_accuracy.csv``, ``hf_leaderboard_latency_cost.csv``,
    ``hf_leaderboard_ts.csv`` and ``hf_leaderboard_crm_bias.csv`` from
    ``crm_results_path``. Rows are kept in CSV order (no sorting is applied).

    Args:
        crm_results_path: Directory containing the four CSV files.
        accuracy_cols: Columns to keep in the accuracy table. Must include
            "Model Name" and "LLM Provider", which are used to attach the
            provider to the other two tables.
        cost_cols: Columns to keep in the latency/cost table (selected after
            "LLM Provider" has been joined in).

    Returns:
        A 3-tuple ``(accuracy_df, cost_df, trust_safety_df)``. Numeric columns
        of the first two are rounded to 2 decimals. The third keeps every
        column of the trust & safety CSV and adds "LLM Provider", "CRM Bias",
        "Privacy", "Bias No CI" and "Trust & Safety".

    Raises:
        KeyError: If a required column is missing from a CSV or from
            ``accuracy_cols``.
    """
    sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]

    leaderboard_accuracy_df = _read_results(crm_results_path, "hf_leaderboard_accuracy.csv", sf_finetuned_models)
    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)

    # Model -> provider lookup, built once and reused for both joins below.
    provider_by_model = (
        leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates().set_index("Model Name")
    )

    leaderboard_cost_df = _read_results(crm_results_path, "hf_leaderboard_latency_cost.csv", sf_finetuned_models)
    leaderboard_cost_df = leaderboard_cost_df.join(provider_by_model, on="Model Name")
    leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)

    leaderboard_ts_df = _read_results(crm_results_path, "hf_leaderboard_ts.csv", sf_finetuned_models)
    crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
    leaderboard_ts_df = leaderboard_ts_df.join(provider_by_model, on="Model Name")
    leaderboard_ts_df = leaderboard_ts_df.join(crm_bias_df.set_index("Model Name"), on="Model Name")

    leaderboard_ts_df["Privacy"] = _mean_as_percent(
        leaderboard_ts_df[
            [
                "Privacy Zero-Shot Match Avoidance",
                "Privacy Zero-Shot Reveal Avoidance",
                "Privacy Five-Shot Match Avoidance",
                "Privacy Five-Shot Reveal Avoidance",
            ]
        ]
    )
    # Keep only the leading token of "CRM Bias" (the text before the first
    # space). The vectorised ``.str`` accessor passes missing values through,
    # so a model without a bias row no longer raises AttributeError.
    leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Bias"].str.split(" ").str[0]

    leaderboard_ts_df["Trust & Safety"] = _mean_as_percent(
        leaderboard_ts_df[["Safety", "Privacy", "Truthfulness", "Bias No CI"]]
    )

    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df