import json
import os

import pandas as pd
import numpy as np

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results
from src.about import Tasks, SingleTableTasks, SingleColumnTasks


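# NOTE: has_no_nan_values, AutoEvalColumn and get_raw_eval_results are currently only referenced by
# the commented-out get_leaderboard_df variant below.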
# def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
#     """Creates a dataframe from all the individual experiment results"""
#     raw_data = get_raw_eval_results(results_path, requests_path)
#     all_data_json = [v.to_dict() for v in raw_data]

#     df = pd.DataFrame.from_records(all_data_json)
#     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
#     df = df[cols].round(decimals=2)

#     # filter out if any of the benchmarks have not been produced
#     df = df[has_no_nan_values(df, benchmark_cols)]
#     return df


def strip_emoji(text: str) -> str:
    """Removes emojis (and any other non-ASCII characters) from text and strips trailing whitespace"""
    return text.encode("ascii", "ignore").decode("ascii").rstrip()


def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the single-column, single-table and multi-table dataframes from the individual experiment results"""

    # iterate through all files in the results path and read them into json objects
    all_data_json = []
    res_path = os.path.join(results_path, "demo-leaderboard", "syntherela-demo")
    for entry in os.listdir(res_path):
        if entry.endswith(".json"):
            file_path = os.path.join(res_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)
                all_data_json.append(data)

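    # Metric display names (from the Tasks enums) may include emoji prefixes, while the result JSONs
    # appear to key metrics by their plain ASCII names, so strip_emoji() is used below when looking
    # values up.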
    multi_table_metrics = [task.value.col_name for task in Tasks]
    single_table_metrics = [task.value.col_name for task in SingleTableTasks]
    single_column_metrics = [task.value.col_name for task in SingleColumnTasks]

    # create empty dataframes for the multi-table, single-table and single-column metrics
    multitable_df = pd.DataFrame(columns=["Dataset", "Model"] + multi_table_metrics)
    singletable_df = pd.DataFrame(columns=["Dataset", "Model"] + single_table_metrics)
    singlecolumn_df = pd.DataFrame(columns=["Dataset", "Table", "Model"] + single_column_metrics)

    # iterate through all json files and add the data to the dataframe
    for data in all_data_json:
        model = data["method_name"]
        dataset = data["dataset_name"]
        row = {"Dataset": dataset, "Model": model}
        for metric in multi_table_metrics:
            stripped_metric = strip_emoji(metric)
            if stripped_metric in data["multi_table_metrics"]:
                metric_values = []
                for table in data["multi_table_metrics"][stripped_metric].keys():
                    if "accuracy" in data["multi_table_metrics"][stripped_metric][table]:
                        metric_values.append(data["multi_table_metrics"][stripped_metric][table]["accuracy"])
                    if "statistic" in data["multi_table_metrics"][stripped_metric][table]:
                        metric_values.append(data["multi_table_metrics"][stripped_metric][table]["statistic"])

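                # average the per-table scores (accuracy or test statistic) into a single value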
                row[metric] = np.mean(metric_values).round(decimals=2)
            else:
                row[metric] = np.nan
        multitable_df = pd.concat([multitable_df, pd.DataFrame([row])], ignore_index=True)

        singletable_row = {"Dataset": dataset, "Model": model}
        for metric in single_table_metrics:
            stripped_metric = strip_emoji(metric)
            if stripped_metric in data["single_table_metrics"]:
                metric_values = []
                for table in data["single_table_metrics"][stripped_metric].keys():
                    if "accuracy" in data["single_table_metrics"][stripped_metric][table]:
                        metric_values.append(data["single_table_metrics"][stripped_metric][table]["accuracy"])
                    if "value" in data["single_table_metrics"][stripped_metric][table]:
                        metric_values.append(data["single_table_metrics"][stripped_metric][table]["value"])

                singletable_row[metric] = np.mean(metric_values).round(decimals=2)
            else:
                singletable_row[metric] = np.nan
        singletable_df = pd.concat([singletable_df, pd.DataFrame([singletable_row])], ignore_index=True)

        singlecolumn_row = {"Dataset": dataset, "Model": model, "Table": ""}
        # insert row
        for metric in single_column_metrics:
            stripped_metric = strip_emoji(metric)
            if stripped_metric in data["single_column_metrics"]:
                for table in data["single_column_metrics"][stripped_metric].keys():
                    # check if row where dataset = dataset, model = model, table = table exists
                    if singlecolumn_df[
                        (singlecolumn_df["Dataset"] == dataset) & 
                        (singlecolumn_df["Model"] == model) & 
                        (singlecolumn_df["Table"] == table)
                    ].empty:
                        singlecolumn_row = {"Dataset": dataset, "Model": model, "Table": table}
                        singlecolumn_df = pd.concat([singlecolumn_df, pd.DataFrame([singlecolumn_row])], ignore_index=True)

                    metric_values = []
                    for column in data["single_column_metrics"][stripped_metric][table].keys():
                        if "accuracy" in data["single_column_metrics"][stripped_metric][table][column]:
                            metric_values.append(data["single_column_metrics"][stripped_metric][table][column]["accuracy"])
                        if "value" in data["single_column_metrics"][stripped_metric][table][column]:
                            metric_values.append(data["single_column_metrics"][stripped_metric][table][column]["value"])
                        if "statistic" in data["single_column_metrics"][stripped_metric][table][column]:
                            metric_values.append(data["single_column_metrics"][stripped_metric][table][column]["statistic"])

                    # save np.mean(metric_values).round(decimals=2) to singlecolumn_df where dataset = dataset, model = model, table = table
                    singlecolumn_df.loc[
                        (singlecolumn_df["Dataset"] == dataset) & 
                        (singlecolumn_df["Model"] == model) & 
                        (singlecolumn_df["Table"] == table), metric] = np.mean(metric_values).round(decimals=2)
            

    return singlecolumn_df, singletable_df, multitable_df


def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
    """Creates the different dataframes for the evaluation queues requestes"""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder
            sub_entries = [
                e
                for e in os.listdir(os.path.join(save_path, entry))
                if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")
            ]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

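    # Bucket the requests by status: pending (PENDING / RERUN), running (RUNNING), and finished
    # (any FINISHED* status; PENDING_NEW_EVAL requests are grouped with the finished ones here)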
    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
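

# Example usage (a minimal sketch; the real paths and column lists are configured elsewhere in the
# app, so the values below are placeholders):
#
#     single_column_df, single_table_df, multi_table_df = get_leaderboard_df(
#         results_path="eval-results", cols=[], benchmark_cols=[]
#     )
#     finished_df, running_df, pending_df = get_evaluation_queue_df(
#         save_path="eval-queue", cols=["model", "revision", "status"]
#     )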