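"""Helpers that build the leaderboard and evaluation-queue dataframes from on-disk results and request files."""
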
import json
import os

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
# Column definitions for each leaderboard subset
from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns
from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric: str, subset: str) -> tuple[list, pd.DataFrame]:
    """Creates a dataframe from all the individual experiment results and returns it alongside the raw results"""
    raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
    all_data_json = [v.to_dict(subset=subset) for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)
    # Each subset is ranked by its own headline metric (harmfulness sorts ascending: lower is better)
    if subset == "datasets":
        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    elif subset == "med_safety":
        df = df.sort_values(by=["Harmfulness Score"], ascending=True)
    elif subset == "open_ended":
        df = df.sort_values(by=["ELO"], ascending=False)
    elif subset in ("medical_summarization", "aci", "soap"):
        df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
    # Keep only the requested columns that are actually present, preserving their order
    cols = [c for c in cols if c in df.columns]
    df = df[cols].round(decimals=2)
    # Drop rows for which any of the benchmark columns is missing
    df = df[has_no_nan_values(df, benchmark_cols)]
    return raw_data, df


def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the finished, running, and pending dataframes for the evaluation queue requests"""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []
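    # Request files can sit directly in save_path or one level down inside per-user folders; .md files are skipped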
    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)
            data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
            # One status column per evaluation track
            data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
            data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
            data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
            data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
            data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder
            sub_entries = [e for e in os.listdir(os.path.join(save_path, entry)) if not e.startswith(".")]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)
                data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
                data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
                data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
                data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
                data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
                all_evals.append(data)
    pending_list = []
    running_list = []
    finished_list = []
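    # A run is running if any track is RUNNING, pending if any track is PENDING or RERUN, and finished otherwise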
    for run in all_evals:
        status_list = [
            run["status"]["closed-ended"],
            run["status"]["open-ended"],
            run["status"]["med-safety"],
            run["status"]["medical-summarization"],
            run["status"]["note-generation"],
        ]
        if "RUNNING" in status_list:
            running_list.append(run)
        elif "PENDING" in status_list or "RERUN" in status_list:
            pending_list.append(run)
        else:
            finished_list.append(run)
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
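

# Minimal usage sketch (paths, the column lists, and the metric name below are assumed
# placeholders, not taken from this repository's configuration):
#
#   raw_data, leaderboard_df = get_leaderboard_df(
#       results_path="eval-results",
#       requests_path="eval-queue",
#       cols=COLS,
#       benchmark_cols=BENCHMARK_COLS,
#       evaluation_metric="accuracy",
#       subset="datasets",
#   )
#   finished_df, running_df, pending_df = get_evaluation_queue_df("eval-queue", EVAL_COLS)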