Martin Jurkovic committed
Commit 5f7fcf4 · 1 Parent(s): 3b86dfc

Add singletable metrics

Files changed (4)
  1. app.py +33 -5
  2. src/about.py +13 -4
  3. src/display/utils.py +12 -1
  4. src/populate.py +32 -13
app.py CHANGED
@@ -19,6 +19,7 @@ from src.display.utils import (
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
+    singletable_AutoEvalColumn,
     ModelType,
     fields,
     # WeightType,
@@ -49,7 +50,7 @@ except Exception:
     restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+SINGLETABLE_LEADERBOARD_DF, MULTITABLE_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -57,7 +58,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-def init_leaderboard(dataframe):
+def init_multitable_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
@@ -71,7 +72,8 @@ def init_leaderboard(dataframe):
         search_columns=[AutoEvalColumn.model.name], # AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.dataset.name, type="checkboxgroup", label="Datasets"),
+            ColumnFilter(AutoEvalColumn.model.name, type="checkboxgroup", label="Models"),
             # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             # ColumnFilter(
             #     AutoEvalColumn.params.name,
@@ -88,6 +90,27 @@ def init_leaderboard(dataframe):
         interactive=False,
     )
 
+def init_singletable_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(singletable_AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(singletable_AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(singletable_AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[singletable_AutoEvalColumn.model.name], # AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(singletable_AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(singletable_AutoEvalColumn.dataset.name, type="checkboxgroup", label="Datasets"),
+            ColumnFilter(singletable_AutoEvalColumn.model.name, type="checkboxgroup", label="Models"),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -95,8 +118,13 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Syntherela Benchmark", elem_id="syntherela-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+        with gr.TabItem("🏅 MultiTable", elem_id="syntherela-benchmark-tab-table", id=0):
+            leaderboard = init_multitable_leaderboard(MULTITABLE_LEADERBOARD_DF)
+
+        with gr.TabItem("🏅 SingleTable", elem_id="syntherela-benchmark-tab-table", id=1):
+            singletable_leaderboard = init_singletable_leaderboard(SINGLETABLE_LEADERBOARD_DF)
+
+
 
         with gr.TabItem("📝 About", elem_id="syntherela-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
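The two init_*_leaderboard functions above are identical except for the column dataclass they read. A minimal de-duplication sketch, not part of this commit; it assumes the gradio_leaderboard classes and the fields() helper app.py already imports, and init_any_leaderboard is a hypothetical name:

    from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
    from src.display.utils import fields

    def init_any_leaderboard(dataframe, columns):
        # columns is either AutoEvalColumn or singletable_AutoEvalColumn
        if dataframe is None or dataframe.empty:
            raise ValueError("Leaderboard DataFrame is empty or None.")
        cols = fields(columns)
        return Leaderboard(
            value=dataframe,
            datatype=[c.type for c in cols],
            select_columns=SelectColumns(
                default_selection=[c.name for c in cols if c.displayed_by_default],
                cant_deselect=[c.name for c in cols if c.never_hidden],
                label="Select Columns to Display:",
            ),
            search_columns=[columns.model.name],
            hide_columns=[c.name for c in cols if c.hidden],
            filter_columns=[
                ColumnFilter(columns.dataset.name, type="checkboxgroup", label="Datasets"),
                ColumnFilter(columns.model.name, type="checkboxgroup", label="Models"),
            ],
            bool_checkboxgroup_label="Hide models",
            interactive=False,
        )

Keeping init_multitable_leaderboard and init_singletable_leaderboard as one-line wrappers over such a helper would stop the two copies drifting apart.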
src/about.py CHANGED
@@ -14,9 +14,18 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     # task0 = Task("anli_r1", "acc", "ANLI")
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
-    task_0 = Task("multi-table", "AggregationDetection-LogisticRegression", "AggregationDetection-LogisticRegression")
-    task_1 = Task("multi-table", "AggregationDetection-XGBClassifier", "AggregationDetection-XGBClassifier")
-    task_2 = Task("multi-table", "CardinalityShapeSimilarity", "CardinalityShapeSimilarity")
+    task_0 = Task("multi-table", "AggregationDetection-LogisticRegression", "AggregationDetection-LogisticRegression ⬇️")
+    task_1 = Task("multi-table", "AggregationDetection-XGBClassifier", "AggregationDetection-XGBClassifier ⬇️")
+    task_2 = Task("multi-table", "CardinalityShapeSimilarity", "CardinalityShapeSimilarity ⬆️")
+
+class SingleTableTasks(Enum):
+    task_0 = Task("single-table", "MaximumMeanDiscrepancy", "MaximumMeanDiscrepancy ⬇️")
+    # PairwiseCorrelationDifference
+    task_1 = Task("single-table", "PairwiseCorrelationDifference", "PairwiseCorrelationDifference ⬇️")
+    # SingleTableDetection-LogisticRegression
+    task_2 = Task("single-table", "SingleTableDetection-LogisticRegression", "SingleTableDetection-LogisticRegression ⬇️")
+    # SingleTableDetection-XGBClassifier
+    task_3 = Task("single-table", "SingleTableDetection-XGBClassifier", "SingleTableDetection-XGBClassifier ⬇️")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -24,7 +33,7 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Syntherela leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
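Each enum member wraps a Task record of (benchmark key, metric key, display name); the arrows in the display names mark score direction (⬇️ lower is better, ⬆️ higher is better). In the stock leaderboard template this Space is built on, Task is a small dataclass; a self-contained sketch (field names assumed from that template) of how col_name feeds the rest of the code:

    from dataclasses import dataclass
    from enum import Enum

    @dataclass
    class Task:
        benchmark: str  # top-level key in the results JSON, e.g. "single-table"
        metric: str     # metric key inside that section
        col_name: str   # leaderboard column header; the emoji marks direction

    class SingleTableTasks(Enum):
        task_0 = Task("single-table", "MaximumMeanDiscrepancy", "MaximumMeanDiscrepancy ⬇️")

    # col_name is what src/display/utils.py and src/populate.py consume:
    print([t.value.col_name for t in SingleTableTasks])
    # ['MaximumMeanDiscrepancy ⬇️']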
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
+from src.about import Tasks, SingleTableTasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -44,6 +44,17 @@ for task in Tasks:
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+singletable_auto_eval_column_dict = []
+# Init
+singletable_auto_eval_column_dict.append(["dataset", ColumnContent, ColumnContent("Dataset", "str", True, never_hidden=True)])
+# singletable_auto_eval_column_dict.append(["table", ColumnContent, ColumnContent("Table", "str", True, never_hidden=True)])
+singletable_auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# Scores
+for task in SingleTableTasks:
+    singletable_auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
+singletable_AutoEvalColumn = make_dataclass("AutoEvalColumn", singletable_auto_eval_column_dict, frozen=True)
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
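make_dataclass builds the column class at import time from [field name, type, default] triples, so every SingleTableTasks member becomes a column whose ColumnContent default carries display metadata. Note the generated class's __name__ is still "AutoEvalColumn" (the first argument) even though it is bound to singletable_AutoEvalColumn. A runnable sketch of the pattern, with ColumnContent's shape assumed from the stock template:

    from dataclasses import dataclass, make_dataclass

    @dataclass(frozen=True)
    class ColumnContent:
        name: str                   # header shown in the UI
        type: str                   # gradio datatype ("str", "number", "markdown")
        displayed_by_default: bool
        hidden: bool = False
        never_hidden: bool = False

    rows = [
        ["dataset", ColumnContent, ColumnContent("Dataset", "str", True, never_hidden=True)],
        ["mmd", ColumnContent, ColumnContent("MaximumMeanDiscrepancy ⬇️", "number", True)],
    ]
    Columns = make_dataclass("Columns", rows, frozen=True)

    # The defaults are readable as class attributes, which is how app.py can
    # use singletable_AutoEvalColumn.model.name without instantiating anything:
    print(Columns.dataset.name)  # Dataset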
src/populate.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.about import Tasks, SingleTableTasks
 
 
 # def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
@@ -22,6 +23,9 @@ from src.leaderboard.read_evals import get_raw_eval_results
 #     df = df[has_no_nan_values(df, benchmark_cols)]
 #     return df
 
+def strip_emoji(text: str) -> str:
+    """Removes emojis from text"""
+    return text.encode("ascii", "ignore").decode("ascii").rstrip()
 
 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
@@ -36,14 +40,13 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
         data = json.load(fp)
         all_data_json.append(data)
 
-    multi_table_metrics = [
-        "AggregationDetection-LogisticRegression",
-        "AggregationDetection-XGBClassifier",
-        "CardinalityShapeSimilarity",
-    ]
+    multi_table_metrics = [task.value.col_name for task in Tasks]
+
+    single_table_metrics = [task.value.col_name for task in SingleTableTasks]
 
     # create empty dataframe with the columns multi_table_metrics
     multitable_df = pd.DataFrame(columns=["Dataset", "Model"] + multi_table_metrics)
+    singletable_df = pd.DataFrame(columns=["Dataset", "Model"] + single_table_metrics)
 
     # iterate through all json files and add the data to the dataframe
     for data in all_data_json:
@@ -51,17 +54,33 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
         dataset = data["dataset"]
         row = {"Dataset": dataset, "Model": model}
         for metric in multi_table_metrics:
-            if metric in data["multi_table_metrics"]:
+            stripped_metric = strip_emoji(metric)
+            if stripped_metric in data["multi_table_metrics"]:
                 metric_values = []
-                for table in data["multi_table_metrics"][metric].keys():
-                    if "accuracy" in data["multi_table_metrics"][metric][table]:
-                        metric_values.append(data["multi_table_metrics"][metric][table]["accuracy"])
-                    if "statistic" in data["multi_table_metrics"][metric][table]:
-                        metric_values.append(data["multi_table_metrics"][metric][table]["statistic"])
+                for table in data["multi_table_metrics"][stripped_metric].keys():
+                    if "accuracy" in data["multi_table_metrics"][stripped_metric][table]:
+                        metric_values.append(data["multi_table_metrics"][stripped_metric][table]["accuracy"])
+                    if "statistic" in data["multi_table_metrics"][stripped_metric][table]:
+                        metric_values.append(data["multi_table_metrics"][stripped_metric][table]["statistic"])
 
-                row[metric] = np.mean(metric_values)
+                row[metric] = np.mean(metric_values).round(decimals=2)
         multitable_df = pd.concat([multitable_df, pd.DataFrame([row])], ignore_index=True)
-    return multitable_df
+
+        singletable_row = {"Dataset": dataset, "Model": model}
+        for metric in single_table_metrics:
+            stripped_metric = strip_emoji(metric)
+            if stripped_metric in data["single_table_metrics"]:
+                metric_values = []
+                for table in data["single_table_metrics"][stripped_metric].keys():
+                    if "accuracy" in data["single_table_metrics"][stripped_metric][table]:
+                        metric_values.append(data["single_table_metrics"][stripped_metric][table]["accuracy"])
+                    if "value" in data["single_table_metrics"][stripped_metric][table]:
+                        metric_values.append(data["single_table_metrics"][stripped_metric][table]["value"])
+
+                singletable_row[metric] = np.mean(metric_values).round(decimals=2)
+        singletable_df = pd.concat([singletable_df, pd.DataFrame([singletable_row])], ignore_index=True)
+
+    return singletable_df, multitable_df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
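strip_emoji is the glue between display names and results keys: the column headers are "<JSON key> <emoji>", ASCII-encoding with errors="ignore" drops the emoji (and any other non-ASCII character, so purely ASCII keys are a prerequisite), and rstrip() removes the space left behind. The per-metric loops then average one score per table and round to two decimals. A small self-contained sketch; the "customers"/"orders" tables are hypothetical sample data:

    import numpy as np

    def strip_emoji(text: str) -> str:
        # drops every non-ASCII character, then trailing whitespace
        return text.encode("ascii", "ignore").decode("ascii").rstrip()

    assert strip_emoji("CardinalityShapeSimilarity ⬆️") == "CardinalityShapeSimilarity"

    # One score per table, averaged and rounded, mirroring the loops above:
    tables = {"customers": {"accuracy": 0.6}, "orders": {"accuracy": 0.8}}
    values = [t["accuracy"] for t in tables.values()]
    print(np.mean(values).round(decimals=2))  # 0.7

Note the return order, (singletable_df, multitable_df), matches the unpacking into SINGLETABLE_LEADERBOARD_DF, MULTITABLE_LEADERBOARD_DF in app.py.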