Martin Jurkovic committed · 5f7fcf4
Parent(s): 3b86dfc

Add singletable metrics

Files changed:
- app.py (+33 / -5)
- src/about.py (+13 / -4)
- src/display/utils.py (+12 / -1)
- src/populate.py (+32 / -13)
app.py
CHANGED
@@ -19,6 +19,7 @@ from src.display.utils import (
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
+    singletable_AutoEvalColumn,
     ModelType,
     fields,
     # WeightType,
@@ -49,7 +50,7 @@ except Exception:
     restart_space()


-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+SINGLETABLE_LEADERBOARD_DF, MULTITABLE_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)

 (
     finished_eval_queue_df,
@@ -57,7 +58,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

-def init_leaderboard(dataframe):
+def init_multitable_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
@@ -71,7 +72,8 @@ def init_leaderboard(dataframe):
         search_columns=[AutoEvalColumn.model.name], # AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-
+            ColumnFilter(AutoEvalColumn.dataset.name, type="checkboxgroup", label="Datasets"),
+            ColumnFilter(AutoEvalColumn.model.name, type="checkboxgroup", label="Models"),
             # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             # ColumnFilter(
             #     AutoEvalColumn.params.name,
@@ -88,6 +90,27 @@ def init_leaderboard(dataframe):
         interactive=False,
     )

+def init_singletable_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(singletable_AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(singletable_AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(singletable_AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[singletable_AutoEvalColumn.model.name], # AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(singletable_AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(singletable_AutoEvalColumn.dataset.name, type="checkboxgroup", label="Datasets"),
+            ColumnFilter(singletable_AutoEvalColumn.model.name, type="checkboxgroup", label="Models"),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+

 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -95,8 +118,13 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅
-            leaderboard =
+        with gr.TabItem("🏅 MultiTable", elem_id="syntherela-benchmark-tab-table", id=0):
+            leaderboard = init_multitable_leaderboard(MULTITABLE_LEADERBOARD_DF)
+
+        with gr.TabItem("🏅 SingleTable", elem_id="syntherela-benchmark-tab-table", id=1):
+            singletable_leaderboard = init_singletable_leaderboard(SINGLETABLE_LEADERBOARD_DF)
+
+

         with gr.TabItem("📝 About", elem_id="syntherela-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
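Note (not part of the commit): a minimal sketch of the input both init_* helpers expect — a pandas DataFrame whose columns match the corresponding column class — together with the empty-frame guard they share. check_leaderboard_input, the dataset/model names, and the score are hypothetical placeholders; in the Space the frames come from get_leaderboard_df().

# Hypothetical example data; the guard is copied from the two init_* helpers above.
import pandas as pd

def check_leaderboard_input(dataframe):
    # Same check as init_multitable_leaderboard / init_singletable_leaderboard.
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return dataframe

example_singletable_df = pd.DataFrame(
    [{"Dataset": "example_dataset", "Model": "example_model", "MaximumMeanDiscrepancy ⬇️": 0.12}]
)
check_leaderboard_input(example_singletable_df)   # passes
# check_leaderboard_input(pd.DataFrame())         # would raise ValueError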
src/about.py
CHANGED
@@ -14,9 +14,18 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     # task0 = Task("anli_r1", "acc", "ANLI")
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
-    task_0 = Task("multi-table", "AggregationDetection-LogisticRegression", "AggregationDetection-LogisticRegression")
-    task_1 = Task("multi-table", "AggregationDetection-XGBClassifier", "AggregationDetection-XGBClassifier")
-    task_2 = Task("multi-table", "CardinalityShapeSimilarity", "CardinalityShapeSimilarity")
+    task_0 = Task("multi-table", "AggregationDetection-LogisticRegression", "AggregationDetection-LogisticRegression ⬇️")
+    task_1 = Task("multi-table", "AggregationDetection-XGBClassifier", "AggregationDetection-XGBClassifier ⬇️")
+    task_2 = Task("multi-table", "CardinalityShapeSimilarity", "CardinalityShapeSimilarity ⬆️")
+
+class SingleTableTasks(Enum):
+    task_0 = Task("single-table", "MaximumMeanDiscrepancy", "MaximumMeanDiscrepancy ⬇️")
+    # PairwiseCorrelationDifference
+    task_1 = Task("single-table", "PairwiseCorrelationDifference", "PairwiseCorrelationDifference ⬇️")
+    # SingleTableDetection-LogisticRegression
+    task_2 = Task("single-table", "SingleTableDetection-LogisticRegression", "SingleTableDetection-LogisticRegression ⬇️")
+    # SingleTableDetection-XGBClassifier
+    task_3 = Task("single-table", "SingleTableDetection-XGBClassifier", "SingleTableDetection-XGBClassifier ⬇️")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -24,7 +33,7 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">Syntherela leaderboard</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
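A short sketch of how these enums are consumed, assuming Task is the leaderboard template's three-field dataclass (benchmark, metric, col_name) — consistent with how src/populate.py reads task.value.col_name below. The arrows in col_name only mark whether lower (⬇️) or higher (⬆️) is better.

# Sketch only; Task's field names are an assumption based on the template.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # section key in the results json ("multi-table" / "single-table")
    metric: str     # metric key inside that section
    col_name: str   # column header shown on the leaderboard

class SingleTableTasks(Enum):
    task_0 = Task("single-table", "MaximumMeanDiscrepancy", "MaximumMeanDiscrepancy ⬇️")
    task_1 = Task("single-table", "PairwiseCorrelationDifference", "PairwiseCorrelationDifference ⬇️")

# Column headers for the single-table leaderboard, as built in src/populate.py:
single_table_metrics = [task.value.col_name for task in SingleTableTasks]
print(single_table_metrics)  # ['MaximumMeanDiscrepancy ⬇️', 'PairwiseCorrelationDifference ⬇️']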
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum

 import pandas as pd

-from src.about import Tasks
+from src.about import Tasks, SingleTableTasks

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -44,6 +44,17 @@ for task in Tasks:
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

+singletable_auto_eval_column_dict = []
+# Init
+singletable_auto_eval_column_dict.append(["dataset", ColumnContent, ColumnContent("Dataset", "str", True, never_hidden=True)])
+# singletable_auto_eval_column_dict.append(["table", ColumnContent, ColumnContent("Table", "str", True, never_hidden=True)])
+singletable_auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+for task in SingleTableTasks:
+    singletable_auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
+singletable_AutoEvalColumn = make_dataclass("AutoEvalColumn", singletable_auto_eval_column_dict, frozen=True)
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
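To illustrate the dynamic column class above, a self-contained sketch follows. ColumnContent is assumed to have the template's shape (name, type, displayed_by_default, hidden, never_hidden) and is declared frozen here so its instances are valid dataclass defaults on all Python versions; the column entries are illustrative.

# Sketch of how make_dataclass + fields() produce singletable_AutoEvalColumn.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Same helper as in src/display/utils.py: collect the ColumnContent defaults.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

cols = [
    ["dataset", ColumnContent, ColumnContent("Dataset", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["mmd", ColumnContent, ColumnContent("MaximumMeanDiscrepancy ⬇️", "number", True)],
]
singletable_AutoEvalColumn = make_dataclass("AutoEvalColumn", cols, frozen=True)

print([c.name for c in fields(singletable_AutoEvalColumn) if c.never_hidden])  # ['Dataset', 'Model']
print([c.type for c in fields(singletable_AutoEvalColumn)])                    # ['str', 'markdown', 'number']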
src/populate.py
CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.about import Tasks, SingleTableTasks


 # def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
@@ -22,6 +23,9 @@ from src.leaderboard.read_evals import get_raw_eval_results
 #     df = df[has_no_nan_values(df, benchmark_cols)]
 #     return df

+def strip_emoji(text: str) -> str:
+    """Removes emojis from text"""
+    return text.encode("ascii", "ignore").decode("ascii").rstrip()

 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
@@ -36,14 +40,13 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
             data = json.load(fp)
             all_data_json.append(data)

-    multi_table_metrics = [
-
-
-        "CardinalityShapeSimilarity",
-    ]
+    multi_table_metrics = [task.value.col_name for task in Tasks]
+
+    single_table_metrics = [task.value.col_name for task in SingleTableTasks]

     # create empty dataframe with the columns multi_table_metrics
     multitable_df = pd.DataFrame(columns=["Dataset", "Model"] + multi_table_metrics)
+    singletable_df = pd.DataFrame(columns=["Dataset", "Model"] + single_table_metrics)

     # iterate through all json files and add the data to the dataframe
     for data in all_data_json:
@@ -51,17 +54,33 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
         dataset = data["dataset"]
         row = {"Dataset": dataset, "Model": model}
         for metric in multi_table_metrics:
-
+            stripped_metric = strip_emoji(metric)
+            if stripped_metric in data["multi_table_metrics"]:
                 metric_values = []
-                for table in data["multi_table_metrics"][metric].keys():
-                    if "accuracy" in data["multi_table_metrics"][metric][table]:
-                        metric_values.append(data["multi_table_metrics"][metric][table]["accuracy"])
-                    if "statistic" in data["multi_table_metrics"][metric][table]:
-                        metric_values.append(data["multi_table_metrics"][metric][table]["statistic"])
+                for table in data["multi_table_metrics"][stripped_metric].keys():
+                    if "accuracy" in data["multi_table_metrics"][stripped_metric][table]:
+                        metric_values.append(data["multi_table_metrics"][stripped_metric][table]["accuracy"])
+                    if "statistic" in data["multi_table_metrics"][stripped_metric][table]:
+                        metric_values.append(data["multi_table_metrics"][stripped_metric][table]["statistic"])

-                row[metric] = np.mean(metric_values)
+                row[metric] = np.mean(metric_values).round(decimals=2)
         multitable_df = pd.concat([multitable_df, pd.DataFrame([row])], ignore_index=True)
-
+
+        singletable_row = {"Dataset": dataset, "Model": model}
+        for metric in single_table_metrics:
+            stripped_metric = strip_emoji(metric)
+            if stripped_metric in data["single_table_metrics"]:
+                metric_values = []
+                for table in data["single_table_metrics"][stripped_metric].keys():
+                    if "accuracy" in data["single_table_metrics"][stripped_metric][table]:
+                        metric_values.append(data["single_table_metrics"][stripped_metric][table]["accuracy"])
+                    if "value" in data["single_table_metrics"][stripped_metric][table]:
+                        metric_values.append(data["single_table_metrics"][stripped_metric][table]["value"])
+
+                singletable_row[metric] = np.mean(metric_values).round(decimals=2)
+        singletable_df = pd.concat([singletable_df, pd.DataFrame([singletable_row])], ignore_index=True)
+
+    return singletable_df, multitable_df


 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
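A quick, standalone illustration of the two new pieces of logic in get_leaderboard_df: strip_emoji recovers the json metric key from the display column name (which carries a direction arrow), and the per-table scores are averaged and rounded to two decimals. The metric values below are made up.

# strip_emoji is copied from the diff above; the numbers are illustrative.
import numpy as np

def strip_emoji(text: str) -> str:
    """Removes emojis from text"""
    return text.encode("ascii", "ignore").decode("ascii").rstrip()

print(strip_emoji("CardinalityShapeSimilarity ⬆️"))  # -> CardinalityShapeSimilarity

metric_values = [0.913, 0.884]                   # e.g. per-table "accuracy" / "statistic" entries
print(np.mean(metric_values).round(decimals=2))  # -> 0.9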