xeon27 committed
Commit aa87c61 · Parent: 37ebe4e

Remove commented code
Files changed (4):
  1. app.py +1 -125
  2. src/display/utils.py +1 -13
  3. src/leaderboard/read_evals.py +0 -13
  4. src/populate.py +0 -6
app.py CHANGED
@@ -62,36 +62,8 @@ AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PAT
 def init_leaderboard(dataframe, benchmark_type):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+
     AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name=="Model") or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
-    # return Leaderboard(
-    #     value=dataframe,
-    #     datatype=[c.type for c in AutoEvalColumnSubset],
-    #     select_columns=SelectColumns(
-    #         default_selection=[c.name for c in AutoEvalColumnSubset if c.displayed_by_default],
-    #         cant_deselect=[c.name for c in AutoEvalColumnSubset if c.never_hidden],
-    #         label="Select Columns to Display:",
-    #     ),
-    #     # # search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-    #     search_columns=[AutoEvalColumn.model.name,],
-    #     hide_columns=[c.name for c in AutoEvalColumnSubset if c.hidden],
-    #     # filter_columns=[
-    #     #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-    #     #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-    #     #     ColumnFilter(
-    #     #         AutoEvalColumn.params.name,
-    #     #         type="slider",
-    #     #         min=0.01,
-    #     #         max=150,
-    #     #         label="Select the number of parameters (B)",
-    #     #     ),
-    #     #     ColumnFilter(
-    #     #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
-    #     #     ),
-    #     # ],
-    #     filter_columns=[],
-    #     bool_checkboxgroup_label="Hide models",
-    #     interactive=False,
-    # )
 
     return gr.components.Dataframe(
         value=dataframe,
@@ -115,102 +87,6 @@ with demo:
     with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
         gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-    # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-    #     with gr.Column():
-    #         with gr.Row():
-    #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-    #     with gr.Column():
-    #         with gr.Accordion(
-    #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-    #             open=False,
-    #         ):
-    #             with gr.Row():
-    #                 finished_eval_table = gr.components.Dataframe(
-    #                     value=finished_eval_queue_df,
-    #                     headers=EVAL_COLS,
-    #                     datatype=EVAL_TYPES,
-    #                     row_count=5,
-    #                 )
-    #         with gr.Accordion(
-    #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-    #             open=False,
-    #         ):
-    #             with gr.Row():
-    #                 running_eval_table = gr.components.Dataframe(
-    #                     value=running_eval_queue_df,
-    #                     headers=EVAL_COLS,
-    #                     datatype=EVAL_TYPES,
-    #                     row_count=5,
-    #                 )
-
-    #         with gr.Accordion(
-    #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-    #             open=False,
-    #         ):
-    #             with gr.Row():
-    #                 pending_eval_table = gr.components.Dataframe(
-    #                     value=pending_eval_queue_df,
-    #                     headers=EVAL_COLS,
-    #                     datatype=EVAL_TYPES,
-    #                     row_count=5,
-    #                 )
-    #     with gr.Row():
-    #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-    #     with gr.Row():
-    #         with gr.Column():
-    #             model_name_textbox = gr.Textbox(label="Model name")
-    #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-    #             model_type = gr.Dropdown(
-    #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-    #                 label="Model type",
-    #                 multiselect=False,
-    #                 value=None,
-    #                 interactive=True,
-    #             )
-
-    #         with gr.Column():
-    #             precision = gr.Dropdown(
-    #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
-    #                 label="Precision",
-    #                 multiselect=False,
-    #                 value="float16",
-    #                 interactive=True,
-    #             )
-    #             weight_type = gr.Dropdown(
-    #                 choices=[i.value.name for i in WeightType],
-    #                 label="Weights type",
-    #                 multiselect=False,
-    #                 value="Original",
-    #                 interactive=True,
-    #             )
-    #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-    #     submit_button = gr.Button("Submit Eval")
-    #     submission_result = gr.Markdown()
-    #     submit_button.click(
-    #         add_new_eval,
-    #         [
-    #             model_name_textbox,
-    #             base_model_name_textbox,
-    #             revision_name_textbox,
-    #             precision,
-    #             weight_type,
-    #             model_type,
-    #         ],
-    #         submission_result,
-    #     )
-
-    # with gr.Row():
-    #     with gr.Accordion("📙 Citation", open=False):
-    #         citation_button = gr.Textbox(
-    #             value=CITATION_BUTTON_TEXT,
-    #             label=CITATION_BUTTON_LABEL,
-    #             lines=20,
-    #             elem_id="citation-button",
-    #             show_copy_button=True,
-    #         )
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
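
For context, the path that remains after this change renders each leaderboard tab as a plain, read-only gr.components.Dataframe rather than the commented-out Leaderboard widget. A minimal runnable sketch of that shape, assuming gradio and pandas are installed; the sample data and the keyword arguments beyond value are illustrative, not the Space's exact code:

# Sketch of the surviving init_leaderboard() pattern; sample data is made up.
import gradio as gr
import pandas as pd

def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    # A static table: the old Leaderboard column selection/filtering is gone.
    return gr.components.Dataframe(value=dataframe, interactive=False)

with gr.Blocks() as demo:
    toy_df = pd.DataFrame({"Model": ["model-a", "model-b"], "Task X": [0.71, 0.64]})
    init_leaderboard(toy_df)

# demo.launch()  # uncomment to serve locally
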
src/display/utils.py CHANGED
@@ -23,22 +23,10 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "markdown", True)])
+# Scores
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
-# # Model information
-# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
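
The column registry above feeds dataclasses.make_dataclass, which turns each [attribute_name, ColumnContent, default_instance] triple into a field of a frozen AutoEvalColumn class whose class-level defaults are the ColumnContent descriptors. A small self-contained illustration of that standard-library pattern, with a toy ColumnContent and made-up task columns standing in for the Space's real ones:

# Toy illustration of the make_dataclass pattern used above; the task columns are invented.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Scores (stand-ins for the real Tasks enum)
for attr, col in [("task_a", "Task A"), ("task_b", "Task B")]:
    auto_eval_column_dict.append([attr, ColumnContent, ColumnContent(col, "markdown", True)])

# Each 3-item spec becomes a field named attr, annotated as ColumnContent,
# whose default is the ColumnContent instance describing that display column.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.model.name)   # Model
print(AutoEvalColumn.task_a.name)  # Task A
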
src/leaderboard/read_evals.py CHANGED
@@ -113,21 +113,9 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            # AutoEvalColumn.precision.name: self.precision.value.name,
-            # AutoEvalColumn.model_type.name: self.model_type.value.name,
-            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            # AutoEvalColumn.revision.name: self.revision,
-            # AutoEvalColumn.average.name: average,
-            # AutoEvalColumn.license.name: self.license,
-            # AutoEvalColumn.likes.name: self.likes,
-            # AutoEvalColumn.params.name: self.num_params,
-            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
         for task in Tasks:
@@ -185,7 +173,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
-            # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
             eval_results[eval_name].results.update(eval_result.results)
         else:
             eval_results[eval_name] = eval_result
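
With the commented code gone, to_dict() emits only the eval name, the clickable model cell, and the per-task scores, and get_raw_eval_results() folds result files that share an eval_name into one record by updating its results dict; the kept line updates unconditionally, unlike the commented-out variant that skipped None values. A toy sketch of that merge step, with plain dicts standing in for EvalResult objects:

# Plain-dict stand-ins for EvalResult; shows how results for the same eval_name are merged.
eval_results = {}
incoming = [
    {"eval_name": "model-a", "results": {"task_x": 0.71}},
    {"eval_name": "model-a", "results": {"task_y": 0.64}},  # same eval, another task
    {"eval_name": "model-b", "results": {"task_x": 0.58}},
]

for eval_result in incoming:
    eval_name = eval_result["eval_name"]
    if eval_name in eval_results:
        # Later files extend (and may overwrite) the stored per-task scores.
        eval_results[eval_name]["results"].update(eval_result["results"])
    else:
        eval_results[eval_name] = eval_result

print(eval_results["model-a"]["results"])  # {'task_x': 0.71, 'task_y': 0.64}
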
src/populate.py CHANGED
@@ -41,23 +41,17 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     df = pd.DataFrame.from_records(all_data_json)
 
-    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # subset for model and benchmark cols
     df = df[[AutoEvalColumn.model.name] + benchmark_cols]
 
-    # # filter out if any of the benchmarks have not been produced
-    # df = df[has_no_nan_values(df, benchmark_cols)]
     df = df.fillna(EMPTY_SYMBOL)
 
     # make values clickable and link to log files
     for col in benchmark_cols:
         df[col] = df[[AutoEvalColumn.model.name, col]].apply(lambda x: f"[{x[col]}]({get_inspect_log_url(model_name=x[AutoEvalColumn.model.name].split('>')[1].split('<')[0], benchmark_name=TASK_NAME_INVERSE_MAP[col]['name'])})" if x[col] != EMPTY_SYMBOL else x[col], axis=1)
 
-    # # make task names clickable and link to inspect-evals repository - this creates issues later
-    # df = df.rename(columns={col: f"[{col}]({TASK_NAME_INVERSE_MAP[col]['source']})" for col in benchmark_cols})
-
     return df
 
 
57